diff options
-rw-r--r-- | libspeex/resample_neon.h | 90 |
1 files changed, 89 insertions, 1 deletions
diff --git a/libspeex/resample_neon.h b/libspeex/resample_neon.h
index 784ee2f..8a97429 100644
--- a/libspeex/resample_neon.h
+++ b/libspeex/resample_neon.h
@@ -38,7 +38,18 @@
 #include <arm_neon.h>
 
 #ifdef FIXED_POINT
-
+/* Saturate a Q15-scaled 32-bit accumulator to int16 range via NEON VQMOVN. */
+static inline int32_t saturate_32bit_to_16bit(int32_t a) {
+    int32_t ret;
+    asm volatile ("vmov.s32 d24[0], %[a]\n"
+                  "vqmovn.s32 d24, q12\n"
+                  "vmov.s16 %[ret], d24[0]\n"
+                  : [ret] "=&r" (ret)
+                  : [a] "r" (a)
+                  : "q12", "d24", "d25" );  /* d24/d25 alias q12; listed for older gcc */
+    return ret;
+}
+#undef WORD2INT
+#define WORD2INT(x) (saturate_32bit_to_16bit(x))
 #define OVERRIDE_INNER_PRODUCT_SINGLE
 static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
 {
@@ -97,4 +108,81 @@
    return ret;
 }
+#elif defined(FLOATING_POINT)
+
+/* Round a float to the nearest int16 with saturation: VCVT #15 scales into
+ * Q15 fixed point, VQRSHRN #15 rounds/narrows/saturates back to 16 bits. */
+static inline int32_t saturate_float_to_16bit(float a) {
+    int32_t ret;
+    asm ("vmov.f32 d24[0], %[a]\n"
+         "vcvt.s32.f32 d24, d24, #15\n"
+         "vqrshrn.s32 d24, q12, #15\n"
+         "vmov.s16 %[ret], d24[0]\n"
+         : [ret] "=&r" (ret)
+         : [a] "r" (a)
+         : "q12", "d24", "d25" );
+    return ret;
+}
+#undef WORD2INT
+#define WORD2INT(x) (saturate_float_to_16bit(x))
+
+#define OVERRIDE_INNER_PRODUCT_SINGLE
+/* Dot product of two float vectors; len is truncated to a multiple of 16,
+ * the tail is handled 4 floats at a time (assumes len is a multiple of 4). */
+static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+{
+    float ret;
+    uint32_t remainder = len % 16;
+    len = len - remainder;
+
+    asm volatile ("	 cmp %[len], #0\n"
+		  "	 bne 1f\n"
+		  "	 vld1.32 {q4}, [%[a]]!\n"
+		  "	 vld1.32 {q8}, [%[b]]!\n"
+		  "	 subs %[remainder], %[remainder], #4\n"
+		  "	 vmul.f32 q0, q4, q8\n"
+		  "	 beq 5f\n"
+		  "	 b 4f\n"
+		  "1:"
+		  "	 vld1.32 {q4, q5}, [%[a]]!\n"
+		  "	 vld1.32 {q8, q9}, [%[b]]!\n"
+		  "	 vld1.32 {q6, q7}, [%[a]]!\n"
+		  "	 vld1.32 {q10, q11}, [%[b]]!\n"
+		  "	 subs %[len], %[len], #16\n"
+		  "	 vmul.f32 q0, q4, q8\n"
+		  "	 vmul.f32 q1, q5, q9\n"
+		  "	 vmul.f32 q2, q6, q10\n"
+		  "	 vmul.f32 q3, q7, q11\n"
+		  "	 beq 3f\n"
+		  "2:"
+		  "	 vld1.32 {q4, q5}, [%[a]]!\n"
+		  "	 vld1.32 {q8, q9}, [%[b]]!\n"
+		  "	 vld1.32 {q6, q7}, [%[a]]!\n"
+		  "	 vld1.32 {q10, q11}, [%[b]]!\n"
+		  "	 subs %[len], %[len], #16\n"
+		  "	 vmla.f32 q0, q4, q8\n"
+		  "	 vmla.f32 q1, q5, q9\n"
+		  "	 vmla.f32 q2, q6, q10\n"
+		  "	 vmla.f32 q3, q7, q11\n"
+		  "	 bne 2b\n"
+		  "3:"
+		  "	 vadd.f32 q4, q0, q1\n"
+		  "	 vadd.f32 q5, q2, q3\n"
+		  "	 vadd.f32 q0, q4, q5\n"
+		  "	 cmp %[remainder], #0\n"
+		  "	 beq 5f\n"
+		  "4:"
+		  "	 vld1.32 {q6}, [%[a]]!\n"
+		  "	 vld1.32 {q10}, [%[b]]!\n"
+		  "	 subs %[remainder], %[remainder], #4\n"
+		  "	 vmla.f32 q0, q6, q10\n"
+		  "	 bne 4b\n"
+		  "5:"
+		  "	 vadd.f32 d0, d0, d1\n"
+		  "	 vpadd.f32 d0, d0, d0\n"
+		  "	 vmov.f32 %[ret], d0[0]\n"
+		  : [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b),
+		    [len] "+l" (len), [remainder] "+l" (remainder)
+		  :
+		  : "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+		    "q8", "q9", "q10", "q11");  /* q9 is written by vld1/vmul above — must be clobbered */
+    return ret;
+}
+
 
 #endif