summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJyri Sarha <jsarha@ti.com>2011-08-04 15:12:43 -0700
committerEric Laurent <elaurent@google.com>2011-08-04 15:12:43 -0700
commitecb2da017f5b5101d9414b398aced34de623b9eb (patch)
treeddb8b7387ed454b8cea2aae8fb96cd21d126b287
parent35318dd943257760780f28b95b6ca99a79886c3d (diff)
downloadandroid_external_speex-ecb2da017f5b5101d9414b398aced34de623b9eb.tar.gz
android_external_speex-ecb2da017f5b5101d9414b398aced34de623b9eb.tar.bz2
android_external_speex-ecb2da017f5b5101d9414b398aced34de623b9eb.zip
resampler: add floating point neon optimization
Change-Id: I0684ea002b2b02be5dae01a6c00ffbdf28b0e265
-rw-r--r--libspeex/resample_neon.h90
1 files changed, 89 insertions, 1 deletions
diff --git a/libspeex/resample_neon.h b/libspeex/resample_neon.h
index 784ee2f..8a97429 100644
--- a/libspeex/resample_neon.h
+++ b/libspeex/resample_neon.h
@@ -38,7 +38,18 @@
#include <arm_neon.h>
#ifdef FIXED_POINT
-
+static inline int32_t saturate_32bit_to_16bit(int32_t a) {
+ int32_t ret;
+ asm volatile ("vmov.s32 d24[0], %[a]\n"
+ "vqmovn.s32 d24, q12\n"
+ "vmov.s16 %[ret], d24[0]\n"
+ : [ret] "=&r" (ret)
+ : [a] "r" (a)
+ : "q12", "d24", "d25" );
+ return ret;
+}
+#undef WORD2INT
+#define WORD2INT(x) (saturate_32bit_to_16bit(x))
#define OVERRIDE_INNER_PRODUCT_SINGLE
static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
@@ -97,4 +108,81 @@ static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, u
return ret;
}
+#elif defined(FLOATING_POINT)
+
+static inline int32_t saturate_float_to_16bit(float a) {
+ int32_t ret;
+ asm ("vmov.f32 d24[0], %[a]\n"
+ "vcvt.s32.f32 d24, d24, #15\n"
+ "vqrshrn.s32 d24, q12, #15\n"
+ "vmov.s16 %[ret], d24[0]\n"
+ : [ret] "=&r" (ret)
+ : [a] "r" (a)
+ : "q12", "d24", "d25" );
+ return ret;
+}
+#undef WORD2INT
+#define WORD2INT(x) (saturate_float_to_16bit(x))
+
+#define OVERRIDE_INNER_PRODUCT_SINGLE
+static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+{
+ float ret;
+ uint32_t remainder = len % 16;
+ len = len - remainder;
+
+ asm volatile (" cmp %[len], #0\n"
+ " bne 1f\n"
+ " vld1.32 {q4}, [%[a]]!\n"
+ " vld1.32 {q8}, [%[b]]!\n"
+ " subs %[remainder], %[remainder], #4\n"
+ " vmul.f32 q0, q4, q8\n"
+ " beq 5f\n"
+ " b 4f\n"
+ "1:"
+ " vld1.32 {q4, q5}, [%[a]]!\n"
+ " vld1.32 {q8, q9}, [%[b]]!\n"
+ " vld1.32 {q6, q7}, [%[a]]!\n"
+ " vld1.32 {q10, q11}, [%[b]]!\n"
+ " subs %[len], %[len], #16\n"
+ " vmul.f32 q0, q4, q8\n"
+ " vmul.f32 q1, q5, q9\n"
+ " vmul.f32 q2, q6, q10\n"
+ " vmul.f32 q3, q7, q11\n"
+ " beq 3f\n"
+ "2:"
+ " vld1.32 {q4, q5}, [%[a]]!\n"
+ " vld1.32 {q8, q9}, [%[b]]!\n"
+ " vld1.32 {q6, q7}, [%[a]]!\n"
+ " vld1.32 {q10, q11}, [%[b]]!\n"
+ " subs %[len], %[len], #16\n"
+ " vmla.f32 q0, q4, q8\n"
+ " vmla.f32 q1, q5, q9\n"
+ " vmla.f32 q2, q6, q10\n"
+ " vmla.f32 q3, q7, q11\n"
+ " bne 2b\n"
+ "3:"
+ " vadd.f32 q4, q0, q1\n"
+ " vadd.f32 q5, q2, q3\n"
+ " vadd.f32 q0, q4, q5\n"
+ " cmp %[remainder], #0\n"
+ " beq 5f\n"
+ "4:"
+ " vld1.32 {q6}, [%[a]]!\n"
+ " vld1.32 {q10}, [%[b]]!\n"
+ " subs %[remainder], %[remainder], #4\n"
+ " vmla.f32 q0, q6, q10\n"
+ " bne 4b\n"
+ "5:"
+ " vadd.f32 d0, d0, d1\n"
+ " vpadd.f32 d0, d0, d0\n"
+ " vmov.f32 %[ret], d0[0]\n"
+ : [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b),
+ [len] "+l" (len), [remainder] "+l" (remainder)
+ :
+ : "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q10", "q11");
+ return ret;
+}
+
#endif