1 files changed, 235 insertions, 244 deletions
diff --git a/libvpx/vp9/encoder/vp9_dct.c b/libvpx/vp9/encoder/vp9_dct.c
index 065992a..d523239 100644
--- a/libvpx/vp9/encoder/vp9_dct.c
+++ b/libvpx/vp9/encoder/vp9_dct.c
@@ -18,7 +18,11 @@
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_systemdependent.h"
 
-#include "vp9/encoder/vp9_dct.h"
+static INLINE int fdct_round_shift(int input) {
+  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  assert(INT16_MIN <= rv && rv <= INT16_MAX);
+  return rv;
+}
 
 static void fdct4(const int16_t *input, int16_t *output) {
   int16_t step[4];
@@ -31,19 +35,19 @@ static void fdct4(const int16_t *input, int16_t *output) {
 
   temp1 = (step[0] + step[1]) * cospi_16_64;
   temp2 = (step[0] - step[1]) * cospi_16_64;
-  output[0] = dct_const_round_shift(temp1);
-  output[2] = dct_const_round_shift(temp2);
+  output[0] = fdct_round_shift(temp1);
+  output[2] = fdct_round_shift(temp2);
   temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
   temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-  output[1] = dct_const_round_shift(temp1);
-  output[3] = dct_const_round_shift(temp2);
+  output[1] = fdct_round_shift(temp1);
+  output[3] = fdct_round_shift(temp2);
 }
 
 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -80,12 +84,12 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
       step[3] = input[0] - input[3];
       temp1 = (step[0] + step[1]) * cospi_16_64;
       temp2 = (step[0] - step[1]) * cospi_16_64;
-      out[0] = dct_const_round_shift(temp1);
-      out[2] = dct_const_round_shift(temp2);
+      out[0] = fdct_round_shift(temp1);
+      out[2] = fdct_round_shift(temp2);
       temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
       temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
-      out[1] = dct_const_round_shift(temp1);
-      out[3] = dct_const_round_shift(temp2);
+      out[1] = fdct_round_shift(temp1);
+      out[3] = fdct_round_shift(temp2);
       // Do next column (which is a transposed row in second/horizontal pass)
       in++;
       out += 4;
@@ -138,10 +142,10 @@ static void fadst4(const int16_t *input, int16_t *output) {
   s3 = x2 - x0 + x3;
 
   // 1-D transform scaling factor is sqrt(2).
-  output[0] = dct_const_round_shift(s0);
-  output[1] = dct_const_round_shift(s1);
-  output[2] = dct_const_round_shift(s2);
-  output[3] = dct_const_round_shift(s3);
+  output[0] = fdct_round_shift(s0);
+  output[1] = fdct_round_shift(s1);
+  output[2] = fdct_round_shift(s2);
+  output[3] = fdct_round_shift(s3);
 }
 
 static const transform_2d FHT_4[] = {
@@ -151,32 +155,36 @@ static const transform_2d FHT_4[] = {
   { fadst4, fadst4 }   // ADST_ADST = 3
 };
 
-void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
-                        int stride, int tx_type) {
-  int16_t out[4 * 4];
-  int16_t *outptr = &out[0];
-  int i, j;
-  int16_t temp_in[4], temp_out[4];
-  const transform_2d ht = FHT_4[tx_type];
+void vp9_fht4x4_c(const int16_t *input, int16_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct4x4_c(input, output, stride);
+  } else {
+    int16_t out[4 * 4];
+    int16_t *outptr = &out[0];
+    int i, j;
+    int16_t temp_in[4], temp_out[4];
+    const transform_2d ht = FHT_4[tx_type];
 
-  // Columns
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = input[j * stride + i] * 16;
-    if (i == 0 && temp_in[0])
-      temp_in[0] += 1;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      outptr[j * 4 + i] = temp_out[j];
-  }
+    // Columns
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = input[j * stride + i] * 16;
+      if (i == 0 && temp_in[0])
+        temp_in[0] += 1;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        outptr[j * 4 + i] = temp_out[j];
+    }
 
-  // Rows
-  for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j + i * 4];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 4; ++j)
-      output[j + i * 4] = (temp_out[j] + 1) >> 2;
+    // Rows
+    for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j)
+        temp_in[j] = out[j + i * 4];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 4; ++j)
+        output[j + i * 4] = (temp_out[j] + 1) >> 2;
+    }
   }
 }
 
@@ -204,16 +212,16 @@ static void fdct8(const int16_t *input, int16_t *output) {
   t1 = (x0 - x1) * cospi_16_64;
   t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
   t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-  output[0] = dct_const_round_shift(t0);
-  output[2] = dct_const_round_shift(t2);
-  output[4] = dct_const_round_shift(t1);
-  output[6] = dct_const_round_shift(t3);
+  output[0] = fdct_round_shift(t0);
+  output[2] = fdct_round_shift(t2);
+  output[4] = fdct_round_shift(t1);
+  output[6] = fdct_round_shift(t3);
 
   // Stage 2
   t0 = (s6 - s5) * cospi_16_64;
   t1 = (s6 + s5) * cospi_16_64;
-  t2 = dct_const_round_shift(t0);
-  t3 = dct_const_round_shift(t1);
+  t2 = fdct_round_shift(t0);
+  t3 = fdct_round_shift(t1);
 
   // Stage 3
   x0 = s4 + t2;
@@ -226,10 +234,10 @@ static void fdct8(const int16_t *input, int16_t *output) {
   t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
   t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
   t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-  output[1] = dct_const_round_shift(t0);
-  output[3] = dct_const_round_shift(t2);
-  output[5] = dct_const_round_shift(t1);
-  output[7] = dct_const_round_shift(t3);
+  output[1] = fdct_round_shift(t0);
+  output[3] = fdct_round_shift(t2);
+  output[5] = fdct_round_shift(t1);
+  output[7] = fdct_round_shift(t3);
 }
 
 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
@@ -264,16 +272,16 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
       t1 = (x0 - x1) * cospi_16_64;
       t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;
       t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;
-      output[0 * 8] = dct_const_round_shift(t0);
-      output[2 * 8] = dct_const_round_shift(t2);
-      output[4 * 8] = dct_const_round_shift(t1);
-      output[6 * 8] = dct_const_round_shift(t3);
+      output[0 * 8] = fdct_round_shift(t0);
+      output[2 * 8] = fdct_round_shift(t2);
+      output[4 * 8] = fdct_round_shift(t1);
+      output[6 * 8] = fdct_round_shift(t3);
 
       // Stage 2
       t0 = (s6 - s5) * cospi_16_64;
       t1 = (s6 + s5) * cospi_16_64;
-      t2 = dct_const_round_shift(t0);
-      t3 = dct_const_round_shift(t1);
+      t2 = fdct_round_shift(t0);
+      t3 = fdct_round_shift(t1);
 
       // Stage 3
       x0 = s4 + t2;
@@ -286,10 +294,10 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
       t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
       t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
       t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-      output[1 * 8] = dct_const_round_shift(t0);
-      output[3 * 8] = dct_const_round_shift(t2);
-      output[5 * 8] = dct_const_round_shift(t1);
-      output[7 * 8] = dct_const_round_shift(t3);
+      output[1 * 8] = fdct_round_shift(t0);
+      output[3 * 8] = fdct_round_shift(t2);
+      output[5 * 8] = fdct_round_shift(t1);
+      output[7 * 8] = fdct_round_shift(t3);
       input++;
       output++;
     }
@@ -307,7 +315,7 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
   // The 2D transform is done with two passes which are actually pretty
   // similar. In the first one, we transform the columns and transpose
   // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we tranpose the columns (that
+  // as the first pass results are transposed, we transpose the columns (that
   // is the transposed rows) and transpose the results (so that it goes back
   // in normal/row positions).
   int pass;
@@ -388,16 +396,16 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
         t1 = (x0 - x1) * cospi_16_64;
         t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
         t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-        out[0] = dct_const_round_shift(t0);
-        out[4] = dct_const_round_shift(t2);
-        out[8] = dct_const_round_shift(t1);
-        out[12] = dct_const_round_shift(t3);
+        out[0] = fdct_round_shift(t0);
+        out[4] = fdct_round_shift(t2);
+        out[8] = fdct_round_shift(t1);
+        out[12] = fdct_round_shift(t3);
 
         // Stage 2
         t0 = (s6 - s5) * cospi_16_64;
         t1 = (s6 + s5) * cospi_16_64;
-        t2 = dct_const_round_shift(t0);
-        t3 = dct_const_round_shift(t1);
+        t2 = fdct_round_shift(t0);
+        t3 = fdct_round_shift(t1);
 
         // Stage 3
         x0 = s4 + t2;
@@ -410,22 +418,22 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
         t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
         t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
         t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-        out[2] = dct_const_round_shift(t0);
-        out[6] = dct_const_round_shift(t2);
-        out[10] = dct_const_round_shift(t1);
-        out[14] = dct_const_round_shift(t3);
+        out[2] = fdct_round_shift(t0);
+        out[6] = fdct_round_shift(t2);
+        out[10] = fdct_round_shift(t1);
+        out[14] = fdct_round_shift(t3);
       }
       // Work on the next eight values; step1 -> odd_results
       {
         // step 2
         temp1 = (step1[5] - step1[2]) * cospi_16_64;
         temp2 = (step1[4] - step1[3]) * cospi_16_64;
-        step2[2] = dct_const_round_shift(temp1);
-        step2[3] = dct_const_round_shift(temp2);
+        step2[2] = fdct_round_shift(temp1);
+        step2[3] = fdct_round_shift(temp2);
         temp1 = (step1[4] + step1[3]) * cospi_16_64;
         temp2 = (step1[5] + step1[2]) * cospi_16_64;
-        step2[4] = dct_const_round_shift(temp1);
-        step2[5] = dct_const_round_shift(temp2);
+        step2[4] = fdct_round_shift(temp1);
+        step2[5] = fdct_round_shift(temp2);
         // step 3
         step3[0] = step1[0] + step2[3];
         step3[1] = step1[1] + step2[2];
@@ -438,12 +446,12 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
         // step 4
         temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
         temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
-        step2[1] = dct_const_round_shift(temp1);
-        step2[2] = dct_const_round_shift(temp2);
+        step2[1] = fdct_round_shift(temp1);
+        step2[2] = fdct_round_shift(temp2);
         temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
         temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
-        step2[5] = dct_const_round_shift(temp1);
-        step2[6] = dct_const_round_shift(temp2);
+        step2[5] = fdct_round_shift(temp1);
+        step2[6] = fdct_round_shift(temp2);
         // step 5
         step1[0] = step3[0] + step2[1];
         step1[1] = step3[0] - step2[1];
@@ -456,20 +464,20 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
         // step 6
         temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
         temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-        out[1] = dct_const_round_shift(temp1);
-        out[9] = dct_const_round_shift(temp2);
+        out[1] = fdct_round_shift(temp1);
+        out[9] = fdct_round_shift(temp2);
         temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
         temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
-        out[5] = dct_const_round_shift(temp1);
-        out[13] = dct_const_round_shift(temp2);
+        out[5] = fdct_round_shift(temp1);
+        out[13] = fdct_round_shift(temp2);
         temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
         temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-        out[3] = dct_const_round_shift(temp1);
-        out[11] = dct_const_round_shift(temp2);
+        out[3] = fdct_round_shift(temp1);
+        out[11] = fdct_round_shift(temp2);
         temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
         temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
-        out[7] = dct_const_round_shift(temp1);
-        out[15] = dct_const_round_shift(temp2);
+        out[7] = fdct_round_shift(temp1);
+        out[15] = fdct_round_shift(temp2);
       }
       // Do next column (which is a transposed row in second/horizontal pass)
       in++;
@@ -503,14 +511,14 @@ static void fadst8(const int16_t *input, int16_t *output) {
   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
 
-  x0 = dct_const_round_shift(s0 + s4);
-  x1 = dct_const_round_shift(s1 + s5);
-  x2 = dct_const_round_shift(s2 + s6);
-  x3 = dct_const_round_shift(s3 + s7);
-  x4 = dct_const_round_shift(s0 - s4);
-  x5 = dct_const_round_shift(s1 - s5);
-  x6 = dct_const_round_shift(s2 - s6);
-  x7 = dct_const_round_shift(s3 - s7);
+  x0 = fdct_round_shift(s0 + s4);
+  x1 = fdct_round_shift(s1 + s5);
+  x2 = fdct_round_shift(s2 + s6);
+  x3 = fdct_round_shift(s3 + s7);
+  x4 = fdct_round_shift(s0 - s4);
+  x5 = fdct_round_shift(s1 - s5);
+  x6 = fdct_round_shift(s2 - s6);
+  x7 = fdct_round_shift(s3 - s7);
 
   // stage 2
   s0 = x0;
@@ -526,10 +534,10 @@ static void fadst8(const int16_t *input, int16_t *output) {
   x1 = s1 + s3;
   x2 = s0 - s2;
   x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
 
   // stage 3
   s2 = cospi_16_64 * (x2 + x3);
@@ -537,10 +545,10 @@ static void fadst8(const int16_t *input, int16_t *output) {
   s6 = cospi_16_64 * (x6 + x7);
   s7 = cospi_16_64 * (x6 - x7);
 
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
 
   output[0] =   x0;
   output[1] = - x4;
@@ -559,30 +567,34 @@ static const transform_2d FHT_8[] = {
   { fadst8, fadst8 }   // ADST_ADST = 3
 };
 
-void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
-                        int stride, int tx_type) {
-  int16_t out[64];
-  int16_t *outptr = &out[0];
-  int i, j;
-  int16_t temp_in[8], temp_out[8];
-  const transform_2d ht = FHT_8[tx_type];
-
-  // Columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = input[j * stride + i] * 4;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      outptr[j * 8 + i] = temp_out[j];
-  }
+void vp9_fht8x8_c(const int16_t *input, int16_t *output,
+                  int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct8x8_c(input, output, stride);
+  } else {
+    int16_t out[64];
+    int16_t *outptr = &out[0];
+    int i, j;
+    int16_t temp_in[8], temp_out[8];
+    const transform_2d ht = FHT_8[tx_type];
+
+    // Columns
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        outptr[j * 8 + i] = temp_out[j];
+    }
 
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j + i * 8];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 8; ++j)
-      output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+    // Rows
+    for (i = 0; i < 8; ++i) {
+      for (j = 0; j < 8; ++j)
+        temp_in[j] = out[j + i * 8];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 8; ++j)
+        output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+    }
   }
 }
 
@@ -693,16 +705,16 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
     t1 = (x0 - x1) * cospi_16_64;
     t2 = x3 * cospi_8_64  + x2 * cospi_24_64;
     t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-    out[0] = dct_const_round_shift(t0);
-    out[4] = dct_const_round_shift(t2);
-    out[8] = dct_const_round_shift(t1);
-    out[12] = dct_const_round_shift(t3);
+    out[0] = fdct_round_shift(t0);
+    out[4] = fdct_round_shift(t2);
+    out[8] = fdct_round_shift(t1);
+    out[12] = fdct_round_shift(t3);
 
     // Stage 2
     t0 = (s6 - s5) * cospi_16_64;
     t1 = (s6 + s5) * cospi_16_64;
-    t2 = dct_const_round_shift(t0);
-    t3 = dct_const_round_shift(t1);
+    t2 = fdct_round_shift(t0);
+    t3 = fdct_round_shift(t1);
 
     // Stage 3
     x0 = s4 + t2;
@@ -715,21 +727,21 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
     t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;
     t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
     t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;
-    out[2] = dct_const_round_shift(t0);
-    out[6] = dct_const_round_shift(t2);
-    out[10] = dct_const_round_shift(t1);
-    out[14] = dct_const_round_shift(t3);
+    out[2] = fdct_round_shift(t0);
+    out[6] = fdct_round_shift(t2);
+    out[10] = fdct_round_shift(t1);
+    out[14] = fdct_round_shift(t3);
   }
 
   // step 2
   temp1 = (step1[5] - step1[2]) * cospi_16_64;
   temp2 = (step1[4] - step1[3]) * cospi_16_64;
-  step2[2] = dct_const_round_shift(temp1);
-  step2[3] = dct_const_round_shift(temp2);
+  step2[2] = fdct_round_shift(temp1);
+  step2[3] = fdct_round_shift(temp2);
   temp1 = (step1[4] + step1[3]) * cospi_16_64;
   temp2 = (step1[5] + step1[2]) * cospi_16_64;
-  step2[4] = dct_const_round_shift(temp1);
-  step2[5] = dct_const_round_shift(temp2);
+  step2[4] = fdct_round_shift(temp1);
+  step2[5] = fdct_round_shift(temp2);
 
   // step 3
   step3[0] = step1[0] + step2[3];
@@ -744,12 +756,12 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
   // step 4
   temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;
   temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;
-  step2[1] = dct_const_round_shift(temp1);
-  step2[2] = dct_const_round_shift(temp2);
+  step2[1] = fdct_round_shift(temp1);
+  step2[2] = fdct_round_shift(temp2);
   temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
   temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;
-  step2[5] = dct_const_round_shift(temp1);
-  step2[6] = dct_const_round_shift(temp2);
+  step2[5] = fdct_round_shift(temp1);
+  step2[6] = fdct_round_shift(temp2);
 
   // step 5
   step1[0] = step3[0] + step2[1];
@@ -764,23 +776,23 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
   // step 6
   temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;
   temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-  out[1] = dct_const_round_shift(temp1);
-  out[9] = dct_const_round_shift(temp2);
+  out[1] = fdct_round_shift(temp1);
+  out[9] = fdct_round_shift(temp2);
 
   temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
   temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;
-  out[5] = dct_const_round_shift(temp1);
-  out[13] = dct_const_round_shift(temp2);
+  out[5] = fdct_round_shift(temp1);
+  out[13] = fdct_round_shift(temp2);
 
   temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;
   temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-  out[3] = dct_const_round_shift(temp1);
-  out[11] = dct_const_round_shift(temp2);
+  out[3] = fdct_round_shift(temp1);
+  out[11] = fdct_round_shift(temp2);
 
   temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
   temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;
-  out[7] = dct_const_round_shift(temp1);
-  out[15] = dct_const_round_shift(temp2);
+  out[7] = fdct_round_shift(temp1);
+  out[15] = fdct_round_shift(temp2);
 }
 
 static void fadst16(const int16_t *input, int16_t *output) {
@@ -821,22 +833,22 @@ static void fadst16(const int16_t *input, int16_t *output) {
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
 
-  x0 = dct_const_round_shift(s0 + s8);
-  x1 = dct_const_round_shift(s1 + s9);
-  x2 = dct_const_round_shift(s2 + s10);
-  x3 = dct_const_round_shift(s3 + s11);
-  x4 = dct_const_round_shift(s4 + s12);
-  x5 = dct_const_round_shift(s5 + s13);
-  x6 = dct_const_round_shift(s6 + s14);
-  x7 = dct_const_round_shift(s7 + s15);
-  x8  = dct_const_round_shift(s0 - s8);
-  x9  = dct_const_round_shift(s1 - s9);
-  x10 = dct_const_round_shift(s2 - s10);
-  x11 = dct_const_round_shift(s3 - s11);
-  x12 = dct_const_round_shift(s4 - s12);
-  x13 = dct_const_round_shift(s5 - s13);
-  x14 = dct_const_round_shift(s6 - s14);
-  x15 = dct_const_round_shift(s7 - s15);
+  x0 = fdct_round_shift(s0 + s8);
+  x1 = fdct_round_shift(s1 + s9);
+  x2 = fdct_round_shift(s2 + s10);
+  x3 = fdct_round_shift(s3 + s11);
+  x4 = fdct_round_shift(s4 + s12);
+  x5 = fdct_round_shift(s5 + s13);
+  x6 = fdct_round_shift(s6 + s14);
+  x7 = fdct_round_shift(s7 + s15);
+  x8  = fdct_round_shift(s0 - s8);
+  x9  = fdct_round_shift(s1 - s9);
+  x10 = fdct_round_shift(s2 - s10);
+  x11 = fdct_round_shift(s3 - s11);
+  x12 = fdct_round_shift(s4 - s12);
+  x13 = fdct_round_shift(s5 - s13);
+  x14 = fdct_round_shift(s6 - s14);
+  x15 = fdct_round_shift(s7 - s15);
 
   // stage 2
   s0 = x0;
@@ -864,14 +876,14 @@ static void fadst16(const int16_t *input, int16_t *output) {
   x5 = s1 - s5;
   x6 = s2 - s6;
   x7 = s3 - s7;
-  x8 = dct_const_round_shift(s8 + s12);
-  x9 = dct_const_round_shift(s9 + s13);
-  x10 = dct_const_round_shift(s10 + s14);
-  x11 = dct_const_round_shift(s11 + s15);
-  x12 = dct_const_round_shift(s8 - s12);
-  x13 = dct_const_round_shift(s9 - s13);
-  x14 = dct_const_round_shift(s10 - s14);
-  x15 = dct_const_round_shift(s11 - s15);
+  x8 = fdct_round_shift(s8 + s12);
+  x9 = fdct_round_shift(s9 + s13);
+  x10 = fdct_round_shift(s10 + s14);
+  x11 = fdct_round_shift(s11 + s15);
+  x12 = fdct_round_shift(s8 - s12);
+  x13 = fdct_round_shift(s9 - s13);
+  x14 = fdct_round_shift(s10 - s14);
+  x15 = fdct_round_shift(s11 - s15);
 
   // stage 3
   s0 = x0;
@@ -895,18 +907,18 @@ static void fadst16(const int16_t *input, int16_t *output) {
   x1 = s1 + s3;
   x2 = s0 - s2;
   x3 = s1 - s3;
-  x4 = dct_const_round_shift(s4 + s6);
-  x5 = dct_const_round_shift(s5 + s7);
-  x6 = dct_const_round_shift(s4 - s6);
-  x7 = dct_const_round_shift(s5 - s7);
+  x4 = fdct_round_shift(s4 + s6);
+  x5 = fdct_round_shift(s5 + s7);
+  x6 = fdct_round_shift(s4 - s6);
+  x7 = fdct_round_shift(s5 - s7);
   x8 = s8 + s10;
   x9 = s9 + s11;
   x10 = s8 - s10;
   x11 = s9 - s11;
-  x12 = dct_const_round_shift(s12 + s14);
-  x13 = dct_const_round_shift(s13 + s15);
-  x14 = dct_const_round_shift(s12 - s14);
-  x15 = dct_const_round_shift(s13 - s15);
+  x12 = fdct_round_shift(s12 + s14);
+  x13 = fdct_round_shift(s13 + s15);
+  x14 = fdct_round_shift(s12 - s14);
+  x15 = fdct_round_shift(s13 - s15);
 
   // stage 4
   s2 = (- cospi_16_64) * (x2 + x3);
@@ -918,14 +930,14 @@ static void fadst16(const int16_t *input, int16_t *output) {
   s14 = (- cospi_16_64) * (x14 + x15);
   s15 = cospi_16_64 * (x14 - x15);
 
-  x2 = dct_const_round_shift(s2);
-  x3 = dct_const_round_shift(s3);
-  x6 = dct_const_round_shift(s6);
-  x7 = dct_const_round_shift(s7);
-  x10 = dct_const_round_shift(s10);
-  x11 = dct_const_round_shift(s11);
-  x14 = dct_const_round_shift(s14);
-  x15 = dct_const_round_shift(s15);
+  x2 = fdct_round_shift(s2);
+  x3 = fdct_round_shift(s3);
+  x6 = fdct_round_shift(s6);
+  x7 = fdct_round_shift(s7);
+  x10 = fdct_round_shift(s10);
+  x11 = fdct_round_shift(s11);
+  x14 = fdct_round_shift(s14);
+  x15 = fdct_round_shift(s15);
 
   output[0] = x0;
   output[1] = - x8;
@@ -952,31 +964,34 @@ static const transform_2d FHT_16[] = {
   { fadst16, fadst16 }   // ADST_ADST = 3
 };
 
-void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
-                          int stride, int tx_type) {
-  int16_t out[256];
-  int16_t *outptr = &out[0];
-  int i, j;
-  int16_t temp_in[16], temp_out[16];
-  const transform_2d ht = FHT_16[tx_type];
-
-  // Columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = input[j * stride + i] * 4;
-    ht.cols(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
-//      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
+void vp9_fht16x16_c(const int16_t *input, int16_t *output,
+                    int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vp9_fdct16x16_c(input, output, stride);
+  } else {
+    int16_t out[256];
+    int16_t *outptr = &out[0];
+    int i, j;
+    int16_t temp_in[16], temp_out[16];
+    const transform_2d ht = FHT_16[tx_type];
+
+    // Columns
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+    }
 
-  // Rows
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j + i * 16];
-    ht.rows(temp_in, temp_out);
-    for (j = 0; j < 16; ++j)
-      output[j + i * 16] = temp_out[j];
+    // Rows
+    for (i = 0; i < 16; ++i) {
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j + i * 16];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        output[j + i * 16] = temp_out[j];
+    }
   }
 }
 
@@ -991,7 +1006,7 @@ static INLINE int half_round_shift(int input) {
   return rv;
 }
 
-static void dct32_1d(const int *input, int *output, int round) {
+static void fdct32(const int *input, int *output, int round) {
   int step[32];
   // Stage 1
   step[0] = input[0] + input[(32 - 1)];
@@ -1323,7 +1338,7 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
   }
@@ -1333,13 +1348,13 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
 }
 
-// Note that although we use dct_32_round in dct32_1d computation flow,
+// Note that although we use dct_32_round in dct32 computation flow,
 // this 2d fdct32x32 for rate-distortion optimization loop is operating
 // within 16 bits precision.
 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
@@ -1351,7 +1366,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = input[j * stride + i] * 4;
-    dct32_1d(temp_in, temp_out, 0);
+    fdct32(temp_in, temp_out, 0);
     for (j = 0; j < 32; ++j)
       // TODO(cd): see quality impact of only doing
       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1364,32 +1379,8 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
     int temp_in[32], temp_out[32];
     for (j = 0; j < 32; ++j)
       temp_in[j] = output[j + i * 32];
-    dct32_1d(temp_in, temp_out, 1);
+    fdct32(temp_in, temp_out, 1);
     for (j = 0; j < 32; ++j)
       out[j + i * 32] = temp_out[j];
   }
 }
-
-void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride) {
-  if (tx_type == DCT_DCT)
-    vp9_fdct4x4(input, output, stride);
-  else
-    vp9_short_fht4x4(input, output, stride, tx_type);
-}
-
-void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                int stride) {
-  if (tx_type == DCT_DCT)
-    vp9_fdct8x8(input, output, stride);
-  else
-    vp9_short_fht8x8(input, output, stride, tx_type);
-}
-
-void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
-                  int stride) {
-  if (tx_type == DCT_DCT)
-    vp9_fdct16x16(input, output, stride);
-  else
-    vp9_short_fht16x16(input, output, stride, tx_type);
-}