1 files changed, 47 insertions, 44 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
index 3e3e400..0b9fc09 100644
--- a/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -11,45 +11,47 @@
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
 
-extern void vp9_short_idct16x16_add_neon_pass1(int16_t *input,
-                                               int16_t *output,
-                                               int output_stride);
-extern void vp9_short_idct16x16_add_neon_pass2(int16_t *src,
-                                               int16_t *output,
-                                               int16_t *pass1Output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest,
-                                               int dest_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass1(int16_t *input,
-                                               int16_t *output,
-                                               int output_stride);
-extern void vp9_short_idct10_16x16_add_neon_pass2(int16_t *src,
-                                               int16_t *output,
-                                               int16_t *pass1Output,
-                                               int16_t skip_adding,
-                                               uint8_t *dest,
-                                               int dest_stride);
-extern void save_neon_registers();
-extern void restore_neon_registers();
-
-
-void vp9_short_idct16x16_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
+                                      int16_t *output,
+                                      int output_stride);
+void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
+                                      int16_t *output,
+                                      int16_t *pass1Output,
+                                      int16_t skip_adding,
+                                      uint8_t *dest,
+                                      int dest_stride);
+void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
+                                     int16_t *output,
+                                     int output_stride);
+void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
+                                     int16_t *output,
+                                     int16_t *pass1Output,
+                                     int16_t skip_adding,
+                                     uint8_t *dest,
+                                     int dest_stride);
+
+/* ARM NEON registers d8-d15 are callee-saved and must be preserved. */
+extern void vp9_push_neon(int64_t *store);
+extern void vp9_pop_neon(int64_t *store);
+
+void vp9_idct16x16_256_add_neon(const int16_t *input,
+                                uint8_t *dest, int dest_stride) {
+  int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
 
   // save d8-d15 register values.
-  save_neon_registers();
+  vp9_push_neon(store_reg);
 
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(input, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_short_idct16x16_add_neon_pass2(input+1,
+  vp9_idct16x16_256_add_neon_pass2(input+1,
                                      row_idct_output,
                                      pass1_output,
                                      0,
@@ -59,12 +61,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
   /* Parallel idct on the lower 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(input+8*16, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_short_idct16x16_add_neon_pass2(input+8*16+1,
+  vp9_idct16x16_256_add_neon_pass2(input+8*16+1,
                                      row_idct_output+8,
                                      pass1_output,
                                      0,
@@ -74,12 +76,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
   /* Parallel idct on the left 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                      row_idct_output,
                                      pass1_output,
                                      1,
@@ -89,12 +91,12 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                      row_idct_output+8,
                                      pass1_output,
                                      1,
@@ -102,28 +104,29 @@ void vp9_short_idct16x16_add_neon(int16_t *input,
                                      dest_stride);
 
   // restore d8-d15 register values.
-  restore_neon_registers();
+  vp9_pop_neon(store_reg);
 
   return;
 }
 
-void vp9_short_idct10_16x16_add_neon(int16_t *input,
-                                  uint8_t *dest, int dest_stride) {
+void vp9_idct16x16_10_add_neon(const int16_t *input,
+                               uint8_t *dest, int dest_stride) {
+  int64_t store_reg[8];
   int16_t pass1_output[16*16] = {0};
   int16_t row_idct_output[16*16] = {0};
 
   // save d8-d15 register values.
-  save_neon_registers();
+  vp9_push_neon(store_reg);
 
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct10_16x16_add_neon_pass1(input, pass1_output, 8);
+  vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7
   // which will be saved into row_idct_output.
-  vp9_short_idct10_16x16_add_neon_pass2(input+1,
+  vp9_idct16x16_10_add_neon_pass2(input+1,
                                         row_idct_output,
                                         pass1_output,
                                         0,
@@ -135,12 +138,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
   /* Parallel idct on the left 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+1,
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+1,
                                      row_idct_output,
                                      pass1_output,
                                      1,
@@ -150,12 +153,12 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
   /* Parallel idct on the right 8 columns */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
   // stage 6 result in pass1_output.
-  vp9_short_idct16x16_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
+  vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8);
 
   // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
   // with result in pass1(pass1_output) to calculate final result in stage 7.
   // Then add the result to the destination data.
-  vp9_short_idct16x16_add_neon_pass2(row_idct_output+8*16+1,
+  vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1,
                                      row_idct_output+8,
                                      pass1_output,
                                      1,
@@ -163,7 +166,7 @@ void vp9_short_idct10_16x16_add_neon(int16_t *input,
                                      dest_stride);
 
   // restore d8-d15 register values.
-  restore_neon_registers();
+  vp9_pop_neon(store_reg);
 
   return;
 }