diff options
author | hkuang <hkuang@google.com> | 2013-11-07 15:50:31 -0800 |
---|---|---|
committer | hkuang <hkuang@google.com> | 2013-11-08 11:40:06 -0800 |
commit | 5ae7ac49f08a179e4f054d99fcfc9dce78d26e58 (patch) | |
tree | 0d891d2cbbac4c3da6fd15a25bf8797b29b31994 /libvpx/vp9/common/arm/neon | |
parent | e6eeaaa14ccef4c0938fcce21c54979204041a30 (diff) | |
download | android_external_libvpx-5ae7ac49f08a179e4f054d99fcfc9dce78d26e58.tar.gz android_external_libvpx-5ae7ac49f08a179e4f054d99fcfc9dce78d26e58.tar.bz2 android_external_libvpx-5ae7ac49f08a179e4f054d99fcfc9dce78d26e58.zip |
Roll latest libvpx into Android.
The lastest libvpx just added multithread tile decoding support.
Checkout is from master: abdefeaa89a0908327518e5ca75c935c66b2e1aa
Bug:11576718
Change-Id: Icbe5430633e179b8dc6d419e280ad7ebd3cad4a0
Diffstat (limited to 'libvpx/vp9/common/arm/neon')
12 files changed, 565 insertions, 299 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c index 3e3e400..0b9fc09 100644 --- a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c @@ -11,45 +11,47 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" -extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input, - int16_t *output, - int output_stride); -extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); -extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input, - int16_t *output, - int output_stride); -extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, - int16_t *output, - int16_t *pass1Output, - int16_t skip_adding, - uint8_t *dest, - int dest_stride); -extern void save_neon_registers(); -extern void restore_neon_registers(); - - -void vp9_short_idct16x16_add_neon(int16_t *input, - uint8_t *dest, int dest_stride) { +void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); +void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, + int16_t *output, + int output_stride); +void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, + int16_t *output, + int16_t *pass1Output, + int16_t skip_adding, + uint8_t *dest, + int dest_stride); + +/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ +extern void vp9_push_neon(int64_t *store); +extern void vp9_pop_neon(int64_t *store); + +void vp9_idct16x16_256_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. - save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct16x16_add_neon_pass2(input+1, + vp9_idct16x16_256_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, @@ -59,12 +61,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the lower 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct16x16_add_neon_pass2(input+8*16+1, + vp9_idct16x16_256_add_neon_pass2(input+8*16+1, row_idct_output+8, pass1_output, 0, @@ -74,12 +76,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, row_idct_output, pass1_output, 1, @@ -89,12 +91,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input, /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, row_idct_output+8, pass1_output, 1, @@ -102,28 +104,29 @@ void vp9_short_idct16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } -void vp9_short_idct10_16x16_add_neon(int16_t *input, - uint8_t *dest, int dest_stride) { +void vp9_idct16x16_10_add_neon(const int16_t *input, + uint8_t *dest, int dest_stride) { + int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. - save_neon_registers(); + vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8); + vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vp9_short_idct10_16x16_add_neon_pass2(input+1, + vp9_idct16x16_10_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, @@ -135,12 +138,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input, /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, row_idct_output, pass1_output, 1, @@ -150,12 +153,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input, /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); + vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. - vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1, + vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, row_idct_output+8, pass1_output, 1, @@ -163,7 +166,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input, dest_stride); // restore d8-d15 register values. - restore_neon_registers(); + vp9_pop_neon(store_reg); return; } diff --git a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c deleted file mode 100644 index ceecd6f..0000000 --- a/libvpx/vp9/common/arm/neon/vp9_idct32x32_neon.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp9/common/vp9_common.h" - -// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm -extern void idct32_transpose_and_transform(int16_t *transpose_buffer, - int16_t *output, int16_t *input); -extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); - - -// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm -extern void save_neon_registers(); -extern void restore_neon_registers(); - -void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest, - int dest_stride) { - // TODO(cd): move the creation of these buffers within the ASM file - // internal buffer used to transpose 8 lines into before transforming them - int16_t transpose_buffer[32 * 8]; - // results of the first pass (transpose and transform rows) - int16_t pass1[32 * 32]; - // results of the second pass (transpose and transform columns) - int16_t pass2[32 * 32]; - - // save register we need to preserve - save_neon_registers(); - // process rows - idct32_transpose_and_transform(transpose_buffer, pass1, input); - // process columns - // TODO(cd): do these two steps/passes within the ASM file - idct32_transpose_and_transform(transpose_buffer, pass2, pass1); - // combine and add to dest - // TODO(cd): integrate this within the last storage step of the second pass - idct32_combine_add(dest, pass2, dest_stride); - // restore register we need to preserve - restore_neon_registers(); -} - -// TODO(cd): Eliminate this file altogether when everything is in ASM file diff --git a/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm new file mode 100644 index 0000000..71c3e70 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_save_reg_neon.asm @@ -0,0 +1,36 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_push_neon| + EXPORT |vp9_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + +|vp9_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + + END + diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm index cf5c8f7..b1fd21b 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_1_add_neon.asm @@ -8,21 +8,21 @@ ; - EXPORT |vp9_short_idct16x16_1_add_neon| + EXPORT |vp9_idct16x16_1_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp9_short_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, +;void vp9_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct16x16_1_add_neon| PROC +|vp9_idct16x16_1_add_neon| PROC ldrsh r0, [r0] ; generate cospi_16_64 = 11585 @@ -193,6 +193,6 @@ vst1.64 {d31}, [r12], r2 bx lr - ENDP ; |vp9_short_idct16x16_1_add_neon| + ENDP ; |vp9_idct16x16_1_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm index 7464e80..a13c0d0 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm @@ -8,12 +8,10 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_idct16x16_add_neon_pass1| - EXPORT |vp9_short_idct16x16_add_neon_pass2| - EXPORT |vp9_short_idct10_16x16_add_neon_pass1| - EXPORT |vp9_short_idct10_16x16_add_neon_pass2| - EXPORT |save_neon_registers| - EXPORT |restore_neon_registers| + EXPORT |vp9_idct16x16_256_add_neon_pass1| + EXPORT |vp9_idct16x16_256_add_neon_pass2| + EXPORT |vp9_idct16x16_10_add_neon_pass1| + EXPORT |vp9_idct16x16_10_add_neon_pass2| ARM REQUIRE8 PRESERVE8 @@ -38,7 +36,7 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void |vp9_short_idct16x16_add_neon_pass1|(int16_t *input, +;void |vp9_idct16x16_256_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input @@ -48,7 +46,7 @@ ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct16x16_add_neon_pass1| PROC +|vp9_idct16x16_256_add_neon_pass1| PROC ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 @@ -275,9 +273,9 @@ vst1.64 {d31}, [r1], r2 bx lr - ENDP ; |vp9_short_idct16x16_add_neon_pass1| + ENDP ; |vp9_idct16x16_256_add_neon_pass1| -;void vp9_short_idct16x16_add_neon_pass2(int16_t *src, +;void vp9_idct16x16_256_add_neon_pass2(int16_t *src, ; int16_t *output, ; int16_t *pass1Output, ; int16_t skip_adding, @@ -294,7 +292,7 @@ ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct16x16_add_neon_pass2| PROC +|vp9_idct16x16_256_add_neon_pass2| PROC push {r3-r9} ; TODO(hkuang): Find a better way to load the elements. @@ -786,9 +784,9 @@ skip_adding_dest end_idct16x16_pass2 pop {r3-r9} bx lr - ENDP ; |vp9_short_idct16x16_add_neon_pass2| + ENDP ; |vp9_idct16x16_256_add_neon_pass2| -;void |vp9_short_idct10_16x16_add_neon_pass1|(int16_t *input, +;void |vp9_idct16x16_10_add_neon_pass1|(int16_t *input, ; int16_t *output, int output_stride) ; ; r0 int16_t input @@ -798,7 +796,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct10_16x16_add_neon_pass1| PROC +|vp9_idct16x16_10_add_neon_pass1| PROC ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 @@ -907,9 +905,9 @@ end_idct16x16_pass2 vst1.64 {d31}, [r1], r2 bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass1| + ENDP ; |vp9_idct16x16_10_add_neon_pass1| -;void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src, +;void vp9_idct16x16_10_add_neon_pass2(int16_t *src, ; int16_t *output, ; int16_t *pass1Output, ; int16_t skip_adding, @@ -926,7 +924,7 @@ end_idct16x16_pass2 ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. -|vp9_short_idct10_16x16_add_neon_pass2| PROC +|vp9_idct16x16_10_add_neon_pass2| PROC push {r3-r9} ; TODO(hkuang): Find a better way to load the elements. @@ -1177,15 +1175,5 @@ end_idct16x16_pass2 end_idct10_16x16_pass2 pop {r3-r9} bx lr - ENDP ; |vp9_short_idct10_16x16_add_neon_pass2| -;void |save_neon_registers|() -|save_neon_registers| PROC - vpush {d8-d15} - bx lr - ENDP ; |save_registers| -;void |restore_neon_registers|() -|restore_neon_registers| PROC - vpop {d8-d15} - bx lr - ENDP ; |restore_registers| + ENDP ; |vp9_idct16x16_10_add_neon_pass2| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm index 5c097cc..f00d027 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm @@ -43,8 +43,7 @@ cospi_30_64 EQU 1606 cospi_31_64 EQU 804 - EXPORT |idct32_transpose_and_transform| - EXPORT |idct32_combine_add| + EXPORT |vp9_idct32x32_1024_add_neon| ARM REQUIRE8 PRESERVE8 @@ -100,6 +99,142 @@ cospi_31_64 EQU 804 vst1.16 {$reg2}, [r1] MEND ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10] + vst1.16 {d11}, [r9] + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10]! + vst1.16 {d11}, [r9]! + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6] + vst1.16 {d4}, [r7] + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6]! + vst1.16 {d4}, [r7]! + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- ; Touches q8-q12, q15 (q13-q14 are preserved) ; valid output registers are anything but q8-q11 MACRO @@ -110,12 +245,12 @@ cospi_31_64 EQU 804 ; additions/substractions before the multiplies. ; generate the constants ; generate scalar constants - mov r3, #$first_constant & 0xFF00 - add r3, #$first_constant & 0x00FF + mov r8, #$first_constant & 0xFF00 mov r12, #$second_constant & 0xFF00 + add r8, #$first_constant & 0x00FF add r12, #$second_constant & 0x00FF ; generate vector constants - vdup.16 d30, r3 + vdup.16 d30, r8 vdup.16 d31, r12 ; (used) two for inputs (regA-regD), one for constants (q15) ; do some multiplications (ordered for maximum latency hiding) @@ -153,15 +288,22 @@ cospi_31_64 EQU 804 MEND ; -------------------------------------------------------------------------- -;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input); +;void vp9_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); ; -; r0 int16_t *transpose_buffer -; r1 int16_t *output -; r2 int16_t *input) -; TODO(cd): have more logical parameter ordering but this issue will disappear -; when functions are combined. +; r0 int16_t *input, +; r1 uint8_t *dest, +; r2 int dest_stride) +; loop counters +; r4 bands loop counter +; r5 pass loop counter +; r8 transpose loop counter +; combine-add pointers +; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) +; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) +; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) +; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) -|idct32_transpose_and_transform| PROC +|vp9_idct32x32_1024_add_neon| PROC ; This function does one pass of idct32x32 transform. ; ; This is done by transposing the input and then doing a 1d transform on @@ -171,43 +313,73 @@ cospi_31_64 EQU 804 ; The 1d transform is done by looping over bands of eight columns (the ; idct32_bands loop). For each band, the transform input transposition ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are trsnposed by pairs (the idct32_transpose_pair loop). - push {r4} - mov r4, #0 ; initialize bands loop counter + ; matrices are transposed by pairs (the idct32_transpose_pair loop). + push {r4-r11} + vpush {d8-d15} + ; stack operation + ; internal buffer used to transpose 8 lines into before transforming them + ; int16_t transpose_buffer[32 * 8]; + ; at sp + [4096, 4607] + ; results of the first pass (transpose and transform rows) + ; int16_t pass1[32 * 32]; + ; at sp + [0, 2047] + ; results of the second pass (transpose and transform columns) + ; int16_t pass2[32 * 32]; + ; at sp + [2048, 4095] + sub sp, sp, #512+2048+2048 + + ; r6 = dest + 31 * dest_stride + ; r7 = dest + 0 * dest_stride + ; r9 = dest + 15 * dest_stride + ; r10 = dest + 16 * dest_stride + rsb r6, r2, r2, lsl #5 + rsb r9, r2, r2, lsl #4 + add r10, r1, r2, lsl #4 + mov r7, r1 + add r6, r6, r1 + add r9, r9, r1 + ; r11 = -dest_stride + neg r11, r2 + ; r3 = input + mov r3, r0 + ; parameters for first pass + ; r0 = transpose_buffer[32 * 8] + add r0, sp, #4096 + ; r1 = pass1[32 * 32] + mov r1, sp + + mov r5, #0 ; initialize pass loop counter +idct32_pass_loop + mov r4, #4 ; initialize bands loop counter idct32_bands_loop - ; TODO(cd) get rid of these push/pop by properly adjusting register - ; content at end of loop - push {r0} - push {r1} - push {r2} - mov r3, #0 ; initialize transpose loop counter + mov r8, #2 ; initialize transpose loop counter idct32_transpose_pair_loop ; Load two horizontally consecutive 8x8 16bit data matrices. The first one ; into q0-q7 and the second one into q8-q15. There is a stride of 64, ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r2]! - vld1.s16 {q0}, [r2]! - add r2, #32 - vld1.s16 {q9}, [r2]! - vld1.s16 {q1}, [r2]! - add r2, #32 - vld1.s16 {q10}, [r2]! - vld1.s16 {q2}, [r2]! - add r2, #32 - vld1.s16 {q11}, [r2]! - vld1.s16 {q3}, [r2]! - add r2, #32 - vld1.s16 {q12}, [r2]! - vld1.s16 {q4}, [r2]! - add r2, #32 - vld1.s16 {q13}, [r2]! - vld1.s16 {q5}, [r2]! - add r2, #32 - vld1.s16 {q14}, [r2]! - vld1.s16 {q6}, [r2]! - add r2, #32 - vld1.s16 {q15}, [r2]! - vld1.s16 {q7}, [r2]! + vld1.s16 {q8}, [r3]! + vld1.s16 {q0}, [r3]! + add r3, #32 + vld1.s16 {q9}, [r3]! + vld1.s16 {q1}, [r3]! + add r3, #32 + vld1.s16 {q10}, [r3]! + vld1.s16 {q2}, [r3]! + add r3, #32 + vld1.s16 {q11}, [r3]! + vld1.s16 {q3}, [r3]! + add r3, #32 + vld1.s16 {q12}, [r3]! + vld1.s16 {q4}, [r3]! + add r3, #32 + vld1.s16 {q13}, [r3]! + vld1.s16 {q5}, [r3]! + add r3, #32 + vld1.s16 {q14}, [r3]! + vld1.s16 {q6}, [r3]! + add r3, #32 + vld1.s16 {q15}, [r3]! + vld1.s16 {q7}, [r3]! ; Transpose the two 8x8 16bit data matrices. vswp d17, d24 @@ -255,11 +427,13 @@ idct32_transpose_pair_loop vst1.16 {q7}, [r0]! ; increment pointers by adjusted stride (not necessary for r0/out) - sub r2, r2, #8*32*2-32-16*2 + ; go back by 7*32 for the seven lines moved fully by read and add + ; go back by 32 for the eigth line only read + ; advance by 16*2 to go the next pair + sub r3, r3, #7*32*2 + 32 - 16*2 ; transpose pair loop processing - add r3, r3, #1 - cmp r3, #1 - BLE idct32_transpose_pair_loop + subs r8, r8, #1 + bne idct32_transpose_pair_loop ; restore r0/input to its original value sub r0, r0, #32*8*2 @@ -815,21 +989,26 @@ idct32_transpose_pair_loop vadd.s16 q9, q5, q0 vsub.s16 q6, q5, q0 vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 17, 17, 16, q7, q6 - STORE_IN_OUTPUT 16, 15, 14, q9, q8 + + cmp r5, #0 + bgt idct32_bands_end_2nd_pass + +idct32_bands_end_1st_pass + STORE_IN_OUTPUT 17, 16, 17, q6, q7 + STORE_IN_OUTPUT 17, 14, 15, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; ;output[30 * 32] = step1b[1][i] - step1b[30][i]; ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 14, 30, 31, q0, q1 + LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 31, 30, q7, q6 - STORE_IN_OUTPUT 30, 0, 1, q4, q5 + STORE_IN_OUTPUT 31, 30, 31, q6, q7 + STORE_IN_OUTPUT 31, 0, 1, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[2] = step1b[2][i] + step1b[13][i]; @@ -848,25 +1027,25 @@ idct32_transpose_pair_loop ;output[18 * 32] = step1b[13][i] - step1b[18][i]; ;output[19 * 32] = step1b[12][i] - step1b[19][i]; LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 19, 19, 18, q9, q8 - STORE_IN_OUTPUT 18, 13, 12, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 19, 18, 19, q6, q7 + STORE_IN_OUTPUT 19, 12, 13, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; ;output[28 * 32] = step1b[3][i] - step1b[28][i]; ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 12, 28, 29, q0, q1 + LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 29, 28, q7, q6 - STORE_IN_OUTPUT 28, 2, 3, q4, q5 + STORE_IN_OUTPUT 29, 28, 29, q6, q7 + STORE_IN_OUTPUT 29, 2, 3, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[4] = step1b[4][i] + step1b[11][i]; @@ -885,25 +1064,25 @@ idct32_transpose_pair_loop ;output[20 * 32] = step1b[11][i] - step1b[20][i]; ;output[21 * 32] = step1b[10][i] - step1b[21][i]; LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 21, 21, 20, q9, q8 - STORE_IN_OUTPUT 20, 11, 10, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 21, 20, 21, q6, q7 + STORE_IN_OUTPUT 21, 10, 11, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; ;output[26 * 32] = step1b[5][i] - step1b[26][i]; ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 10, 26, 27, q0, q1 + LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 27, 26, q7, q6 - STORE_IN_OUTPUT 26, 4, 5, q4, q5 + STORE_IN_OUTPUT 27, 26, 27, q6, q7 + STORE_IN_OUTPUT 27, 4, 5, q4, q5 ; -------------------------------------------------------------------------- ; part of stage 7 ;step1[6] = step1b[6][i] + step1b[9][i]; @@ -922,92 +1101,199 @@ idct32_transpose_pair_loop ;output[22 * 32] = step1b[9][i] - step1b[22][i]; ;output[23 * 32] = step1b[8][i] - step1b[23][i]; LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q6, q4, q1 - vadd.s16 q7, q5, q0 - vsub.s16 q8, q5, q0 - vsub.s16 q9, q4, q1 - STORE_IN_OUTPUT 23, 23, 22, q9, q8 - STORE_IN_OUTPUT 22, 9, 8, q7, q6 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 23, 22, 23, q6, q7 + STORE_IN_OUTPUT 23, 8, 9, q8, q9 ; -------------------------------------------------------------------------- ; part of final stage ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; ;output[24 * 32] = step1b[7][i] - step1b[24][i]; ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 8, 24, 25, q0, q1 + LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 vadd.s16 q4, q2, q1 vadd.s16 q5, q3, q0 vsub.s16 q6, q3, q0 vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 25, 24, q7, q6 - STORE_IN_OUTPUT 24, 6, 7, q4, q5 - ; -------------------------------------------------------------------------- + STORE_IN_OUTPUT 25, 24, 25, q6, q7 + STORE_IN_OUTPUT 25, 6, 7, q4, q5 - ; TODO(cd) get rid of these push/pop by properly adjusting register - ; content at end of loop - pop {r2} - pop {r1} - pop {r0} - add r1, r1, #8*2 - add r2, r2, #8*32*2 + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #7*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 ; bands loop processing - add r4, r4, #1 - cmp r4, #3 - BLE idct32_bands_loop + subs r4, r4, #1 + bne idct32_bands_loop - pop {r4} - bx lr - ENDP ; |idct32_transpose_and_transform| + ; parameters for second pass + ; the input of pass2 is the result of pass1. we have to remove the offset + ; of 32 columns induced by the above idct32_bands_loop + sub r3, r1, #32*2 + ; r1 = pass2[32 * 32] + add r1, sp, #2048 -;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride); -; -; r0 uint8_t *dest -; r1 int16_t *out -; r2 int dest_stride) - -|idct32_combine_add| PROC - - mov r12, r0 ; dest pointer used for stores - sub r2, r2, #32 ; adjust the stride (remove the post-increments) - mov r3, #0 ; initialize loop counter - -idct32_combine_add_loop - ; load out[j * 32 + 0-31] - vld1.s16 {q12}, [r1]! - vld1.s16 {q13}, [r1]! - vld1.s16 {q14}, [r1]! - vld1.s16 {q15}, [r1]! - ; load dest[j * dest_stride + 0-31] - vld1.s16 {q6}, [r0]! - vld1.s16 {q7}, [r0]! - ; ROUND_POWER_OF_TWO - vrshr.s16 q12, q12, #6 - vrshr.s16 q13, q13, #6 - vrshr.s16 q14, q14, #6 - vrshr.s16 q15, q15, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q12, q12, d12 - vaddw.u8 q13, q13, d13 - vaddw.u8 q14, q14, d14 - vaddw.u8 q15, q15, d15 - ; clip pixel - vqmovun.s16 d12, q12 - vqmovun.s16 d13, q13 - vqmovun.s16 d14, q14 - vqmovun.s16 d15, q15 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {q6}, [r12]! - vst1.16 {q7}, [r12]! - ; increment pointers by adjusted stride (not necessary for r1/out) - add r0, r0, r2 - add r12, r12, r2 - ; loop processing - add r3, r3, #1 - cmp r3, #31 - BLE idct32_combine_add_loop + ; pass loop processing + add r5, r5, #1 + B idct32_pass_loop - bx lr - ENDP ; |idct32_transpose| +idct32_bands_end_2nd_pass + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; restore pointers to their initial indices for next band pass by + ; removing/adding dest_stride * 8. The actual increment by eight + ; is taken care of within the _LAST macros. + add r6, r6, r2, lsl #3 + add r9, r9, r2, lsl #3 + sub r7, r7, r2, lsl #3 + sub r10, r10, r2, lsl #3 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #25*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; stack operation + add sp, sp, #512+2048+2048 + vpop {d8-d15} + pop {r4-r11} + bx lr + ENDP ; |vp9_idct32x32_1024_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm index 869ee5f..0d4a721 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm @@ -8,21 +8,21 @@ ; - EXPORT |vp9_short_idct4x4_1_add_neon| + EXPORT |vp9_idct4x4_1_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, +;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct4x4_1_add_neon| PROC +|vp9_idct4x4_1_add_neon| PROC ldrsh r0, [r0] ; generate cospi_16_64 = 11585 @@ -63,6 +63,6 @@ vst1.32 {d7[1]}, [r12] bx lr - ENDP ; |vp9_short_idct4x4_1_add_neon| + ENDP ; |vp9_idct4x4_1_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm index 640fb93..00283fc 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm @@ -8,7 +8,7 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_idct4x4_add_neon| + EXPORT |vp9_idct4x4_16_add_neon| ARM REQUIRE8 PRESERVE8 @@ -16,13 +16,13 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 AREA Block, CODE, READONLY ; name this block of code -;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct4x4_add_neon| PROC +|vp9_idct4x4_16_add_neon| PROC ; The 2D transform is done with two passes which are actually pretty ; similar. We first transform the rows. This is done by transposing @@ -185,6 +185,6 @@ vst1.32 {d26[1]}, [r1], r2 vst1.32 {d26[0]}, [r1] ; no post-increment bx lr - ENDP ; |vp9_short_idct4x4_add_neon| + ENDP ; |vp9_idct4x4_16_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm index 923804f..421d202 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_1_add_neon.asm @@ -8,21 +8,21 @@ ; - EXPORT |vp9_short_idct8x8_1_add_neon| + EXPORT |vp9_idct8x8_1_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp9_short_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, +;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct8x8_1_add_neon| PROC +|vp9_idct8x8_1_add_neon| PROC ldrsh r0, [r0] ; generate cospi_16_64 = 11585 @@ -83,6 +83,6 @@ vst1.64 {d31}, [r12], r2 bx lr - ENDP ; |vp9_short_idct8x8_1_add_neon| + ENDP ; |vp9_idct8x8_1_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm index a744f59..5476400 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -8,8 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_idct8x8_add_neon| - EXPORT |vp9_short_idct10_8x8_add_neon| + EXPORT |vp9_idct8x8_64_add_neon| + EXPORT |vp9_idct8x8_10_add_neon| ARM REQUIRE8 PRESERVE8 @@ -198,13 +198,13 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct8x8_add_neon| PROC +|vp9_idct8x8_64_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! @@ -308,15 +308,15 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_short_idct8x8_add_neon| + ENDP ; |vp9_idct8x8_64_add_neon| -;void vp9_short_idct10_8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vp9_idct8x8_10_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; ; r0 int16_t input ; r1 uint8_t *dest ; r2 int dest_stride) -|vp9_short_idct10_8x8_add_neon| PROC +|vp9_idct8x8_10_add_neon| PROC push {r4-r9} vpush {d8-d15} vld1.s16 {q8,q9}, [r0]! @@ -514,6 +514,6 @@ vpop {d8-d15} pop {r4-r9} bx lr - ENDP ; |vp9_short_idct10_8x8_add_neon| + ENDP ; |vp9_idct8x8_10_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm index 963ef35..2f326e2 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_iht4x4_add_neon.asm @@ -8,7 +8,7 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_iht4x4_add_neon| + EXPORT |vp9_iht4x4_16_add_neon| ARM REQUIRE8 PRESERVE8 @@ -139,7 +139,7 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void vp9_short_iht4x4_add_neon(int16_t *input, uint8_t *dest, +;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride, int tx_type) ; ; r0 int16_t input @@ -147,7 +147,7 @@ ; r2 int dest_stride ; r3 int tx_type) ; This function will only handle tx_type of 1,2,3. -|vp9_short_iht4x4_add_neon| PROC +|vp9_iht4x4_16_add_neon| PROC ; load the inputs into d16-d19 vld1.s16 {q8,q9}, [r0]! @@ -175,7 +175,7 @@ iadst_idct ; then transform columns IADST4x4_1D - b end_vp9_short_iht4x4_add_neon + b end_vp9_iht4x4_16_add_neon idct_iadst ; generate constants @@ -191,7 +191,7 @@ idct_iadst ; then transform columns IDCT4x4_1D - b end_vp9_short_iht4x4_add_neon + b end_vp9_iht4x4_16_add_neon iadst_iadst ; generate constants @@ -206,7 +206,7 @@ iadst_iadst ; then transform columns IADST4x4_1D -end_vp9_short_iht4x4_add_neon +end_vp9_iht4x4_16_add_neon ; ROUND_POWER_OF_TWO(temp_out[j], 4) vrshr.s16 q8, q8, #4 vrshr.s16 q9, q9, #4 @@ -232,6 +232,6 @@ end_vp9_short_iht4x4_add_neon vst1.32 {d26[1]}, [r1], r2 vst1.32 {d26[0]}, [r1] ; no post-increment bx lr - ENDP ; |vp9_short_iht4x4_add_neon| + ENDP ; |vp9_iht4x4_16_add_neon| END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm index bab9cb4..93d3af3 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm @@ -8,7 +8,7 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_short_iht8x8_add_neon| + EXPORT |vp9_iht8x8_64_add_neon| ARM REQUIRE8 PRESERVE8 @@ -559,7 +559,7 @@ AREA Block, CODE, READONLY ; name this block of code -;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest, +;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest, ; int dest_stride, int tx_type) ; ; r0 int16_t input @@ -567,7 +567,7 @@ ; r2 int dest_stride ; r3 int tx_type) ; This function will only handle tx_type of 1,2,3. -|vp9_short_iht8x8_add_neon| PROC +|vp9_iht8x8_64_add_neon| PROC ; load the inputs into d16-d19 vld1.s16 {q8,q9}, [r0]! @@ -602,7 +602,7 @@ iadst_idct ; then transform columns IADST8X8_1D - b end_vp9_short_iht8x8_add_neon + b end_vp9_iht8x8_64_add_neon idct_iadst ; generate IADST constants @@ -620,7 +620,7 @@ idct_iadst ; then transform columns IDCT8x8_1D - b end_vp9_short_iht8x8_add_neon + b end_vp9_iht8x8_64_add_neon iadst_iadst ; generate IADST constants @@ -635,7 +635,7 @@ iadst_iadst ; then transform columns IADST8X8_1D -end_vp9_short_iht8x8_add_neon +end_vp9_iht8x8_64_add_neon pop {r0-r10} ; ROUND_POWER_OF_TWO(temp_out[j], 5) @@ -691,6 +691,6 @@ end_vp9_short_iht8x8_add_neon vst1.64 {d6}, [r0], r2 vst1.64 {d7}, [r0], r2 bx lr - ENDP ; |vp9_short_iht8x8_add_neon| + ENDP ; |vp9_iht8x8_64_add_neon| END |