diff options
author | Andreas Huber <andih@google.com> | 2010-09-15 15:12:42 -0700 |
---|---|---|
committer | Andreas Huber <andih@google.com> | 2010-09-15 15:12:42 -0700 |
commit | f71323e297a928af368937089d3ed71239786f86 (patch) | |
tree | dad338caad00af21fd81f0975335c343a91be085 /vp8/decoder/arm | |
parent | 8aa17fc40a750935d80b115d89ca9403f42ab211 (diff) | |
download | android_external_libvpx-f71323e297a928af368937089d3ed71239786f86.tar.gz android_external_libvpx-f71323e297a928af368937089d3ed71239786f86.tar.bz2 android_external_libvpx-f71323e297a928af368937089d3ed71239786f86.zip |
Upgrade to the latest .webm project code.
Change-Id: I33907e0c9ded667e54d31e2f9226c77501731c6c
Diffstat (limited to 'vp8/decoder/arm')
21 files changed, 1225 insertions, 1621 deletions
diff --git a/vp8/decoder/arm/armv5/dequantize_v5.asm b/vp8/decoder/arm/armv5/dequantize_v5.asm index eb3f030..de3648a 100644 --- a/vp8/decoder/arm/armv5/dequantize_v5.asm +++ b/vp8/decoder/arm/armv5/dequantize_v5.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; diff --git a/vp8/decoder/arm/armv6/dboolhuff_v6.asm b/vp8/decoder/arm/armv6/dboolhuff_v6.asm index 143e33e..6515804 100644 --- a/vp8/decoder/arm/armv6/dboolhuff_v6.asm +++ b/vp8/decoder/arm/armv6/dboolhuff_v6.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm new file mode 100644 index 0000000..6bebda2 --- /dev/null +++ b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm @@ -0,0 +1,218 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_dequant_dc_idct_add_v6| + + AREA |.text|, CODE, READONLY + +;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred, +; unsigned char *dest, int pitch, int stride, int Dc) +; r0 = input +; r1 = dq +; r2 = pred +; r3 = dest +; sp + 36 = pitch ; +4 = 40 +; sp + 40 = stride ; +4 = 44 +; sp + 44 = Dc ; +4 = 48 + + +|vp8_dequant_dc_idct_add_v6| PROC + stmdb sp!, {r4-r11, lr} + + ldr r6, [sp, #44] + + ldr r4, [r0] ;input + ldr r5, [r1], #4 ;dq + + sub sp, sp, #4 + str r3, [sp] + + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + mov r12, #3 + +vp8_dequant_dc_add_loop + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + subs r12, r12, #1 + + ldrne r4, [r0, #4] + ldrne r5, [r1], #4 + + strh r6, [r0], #2 + strh r7, [r0], #2 + + bne vp8_dequant_dc_add_loop + + sub r0, r0, #32 + mov r1, r0 + +; short_idct4x4llm_v6_dual + ldr r3, cospi8sqrt2minus1 + ldr r4, sinpi8sqrt2 + ldr r6, [r0, #8] + mov r5, #2 +vp8_dequant_dc_idct_loop1_v6 + ldr r12, [r0, #24] + ldr r14, [r0, #16] + smulwt r9, r3, r6 + smulwb r7, r3, r6 + smulwt r10, r4, r6 + smulwb r8, r4, r6 + pkhbt r7, r7, r9, lsl #16 + smulwt r11, r3, r12 + pkhbt r8, r8, r10, lsl #16 + uadd16 r6, r6, r7 + smulwt r7, r4, r12 + smulwb r9, r3, r12 + smulwb r10, r4, r12 + subs r5, r5, #1 + pkhbt r9, r9, r11, lsl #16 + ldr r11, [r0], #4 + pkhbt r10, r10, r7, lsl #16 + uadd16 r7, r12, r9 + usub16 r7, r8, r7 + uadd16 r6, r6, r10 + uadd16 r10, r11, r14 + usub16 r8, r11, r14 + uadd16 r9, r10, r6 + usub16 r10, r10, r6 + uadd16 r6, r8, r7 + usub16 r7, r8, r7 + str r6, [r1, #8] + ldrne r6, [r0, #8] + str r7, [r1, #16] + str r10, [r1, #24] + str r9, [r1], #4 + bne vp8_dequant_dc_idct_loop1_v6 + + mov r5, #2 + sub r0, r1, #8 +vp8_dequant_dc_idct_loop2_v6 + ldr r6, [r0], #4 + ldr r7, [r0], #4 + ldr r8, [r0], #4 + ldr r9, [r0], #4 + smulwt r1, r3, r6 + smulwt r12, r4, r6 + smulwt lr, r3, r8 + smulwt r10, r4, r8 + pkhbt r11, r8, r6, lsl #16 + pkhbt r1, lr, r1, lsl #16 + pkhbt r12, r10, r12, lsl #16 + pkhtb r6, r6, r8, asr #16 + uadd16 r6, r1, r6 + pkhbt lr, r9, r7, lsl #16 + uadd16 r10, r11, lr + usub16 lr, r11, lr + pkhtb r8, r7, r9, asr #16 + subs r5, r5, #1 + smulwt r1, r3, r8 + smulwb r7, r3, r8 + smulwt r11, r4, r8 + smulwb r9, r4, r8 + pkhbt r1, r7, r1, lsl #16 + uadd16 r8, r1, r8 + pkhbt r11, r9, r11, lsl #16 + usub16 r1, r12, r8 + uadd16 r8, r11, r6 + ldr r9, c0x00040004 + ldr r12, [sp, #40] + uadd16 r6, r10, r8 + usub16 r7, r10, r8 + uadd16 r7, r7, r9 + uadd16 r6, r6, r9 + uadd16 r10, r14, r1 + usub16 r1, r14, r1 + uadd16 r10, r10, r9 + uadd16 r1, r1, r9 + ldr r11, [r2], r12 + mov r8, r7, asr #3 + pkhtb r9, r8, r10, asr #19 + mov r8, r1, asr #3 + pkhtb r8, r8, r6, asr #19 + uxtb16 lr, r11, ror #8 + qadd16 r9, r9, lr + uxtb16 lr, r11 + qadd16 r8, r8, lr + usat16 r9, #8, r9 + usat16 r8, #8, r8 + orr r9, r8, r9, lsl #8 + ldr r11, [r2], r12 + ldr lr, [sp] + ldr r12, [sp, #44] + mov r7, r7, lsl #16 + mov r1, r1, lsl #16 + mov r10, r10, lsl #16 + mov r6, r6, lsl #16 + mov r7, r7, asr #3 + pkhtb r7, r7, r10, asr #19 + mov r1, r1, asr #3 + pkhtb r1, r1, r6, asr #19 + uxtb16 r8, r11, ror #8 + qadd16 r7, r7, r8 + uxtb16 r8, r11 + qadd16 r1, r1, r8 + usat16 r7, #8, r7 + usat16 r1, #8, r1 + orr r1, r1, r7, lsl #8 + str r9, [lr], r12 + str r1, [lr], r12 + str lr, [sp] + bne vp8_dequant_dc_idct_loop2_v6 + +; vpx_memset + sub r0, r0, #32 + add sp, sp, #4 + + mov r12, #0 + str r12, [r0] + str r12, [r0, #4] + str r12, [r0, #8] + str r12, [r0, #12] + str r12, [r0, #16] + str r12, [r0, #20] + str r12, [r0, #24] + str r12, [r0, #28] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_dequant_dc_idct_add_v6| + +; Constant Pool +cospi8sqrt2minus1 DCD 0x00004E7B +sinpi8sqrt2 DCD 0x00008A8C +c0x00040004 DCD 0x00040004 + + END diff --git a/vp8/decoder/arm/armv6/dequant_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_idct_v6.asm new file mode 100644 index 0000000..47b671c --- /dev/null +++ b/vp8/decoder/arm/armv6/dequant_idct_v6.asm @@ -0,0 +1,196 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + EXPORT |vp8_dequant_idct_add_v6| + + AREA |.text|, CODE, READONLY +;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred, +; unsigned char *dest, int pitch, int stride) +; r0 = input +; r1 = dq +; r2 = pred +; r3 = dest +; sp + 36 = pitch ; +4 = 40 +; sp + 40 = stride ; +4 = 44 + + +|vp8_dequant_idct_add_v6| PROC + stmdb sp!, {r4-r11, lr} + + ldr r4, [r0] ;input + ldr r5, [r1], #4 ;dq + + sub sp, sp, #4 + str r3, [sp] + + mov r12, #4 + +vp8_dequant_add_loop + smulbb r6, r4, r5 + smultt r7, r4, r5 + + ldr r4, [r0, #4] ;input + ldr r5, [r1], #4 ;dq + + strh r6, [r0], #2 + strh r7, [r0], #2 + + smulbb r6, r4, r5 + smultt r7, r4, r5 + + subs r12, r12, #1 + + ldrne r4, [r0, #4] + ldrne r5, [r1], #4 + + strh r6, [r0], #2 + strh r7, [r0], #2 + + bne vp8_dequant_add_loop + + sub r0, r0, #32 + mov r1, r0 + +; short_idct4x4llm_v6_dual + ldr r3, cospi8sqrt2minus1 + ldr r4, sinpi8sqrt2 + ldr r6, [r0, #8] + mov r5, #2 +vp8_dequant_idct_loop1_v6 + ldr r12, [r0, #24] + ldr r14, [r0, #16] + smulwt r9, r3, r6 + smulwb r7, r3, r6 + smulwt r10, r4, r6 + smulwb r8, r4, r6 + pkhbt r7, r7, r9, lsl #16 + smulwt r11, r3, r12 + pkhbt r8, r8, r10, lsl #16 + uadd16 r6, r6, r7 + smulwt r7, r4, r12 + smulwb r9, r3, r12 + smulwb r10, r4, r12 + subs r5, r5, #1 + pkhbt r9, r9, r11, lsl #16 + ldr r11, [r0], #4 + pkhbt r10, r10, r7, lsl #16 + uadd16 r7, r12, r9 + usub16 r7, r8, r7 + uadd16 r6, r6, r10 + uadd16 r10, r11, r14 + usub16 r8, r11, r14 + uadd16 r9, r10, r6 + usub16 r10, r10, r6 + uadd16 r6, r8, r7 + usub16 r7, r8, r7 + str r6, [r1, #8] + ldrne r6, [r0, #8] + str r7, [r1, #16] + str r10, [r1, #24] + str r9, [r1], #4 + bne vp8_dequant_idct_loop1_v6 + + mov r5, #2 + sub r0, r1, #8 +vp8_dequant_idct_loop2_v6 + ldr r6, [r0], #4 + ldr r7, [r0], #4 + ldr r8, [r0], #4 + ldr r9, [r0], #4 + smulwt r1, r3, r6 + smulwt r12, r4, r6 + smulwt lr, r3, r8 + smulwt r10, r4, r8 + pkhbt r11, r8, r6, lsl #16 + pkhbt r1, lr, r1, lsl #16 + pkhbt r12, r10, r12, lsl #16 + pkhtb r6, r6, r8, asr #16 + uadd16 r6, r1, r6 + pkhbt lr, r9, r7, lsl #16 + uadd16 r10, r11, lr + usub16 lr, r11, lr + pkhtb r8, r7, r9, asr #16 + subs r5, r5, #1 + smulwt r1, r3, r8 + smulwb r7, r3, r8 + smulwt r11, r4, r8 + smulwb r9, r4, r8 + pkhbt r1, r7, r1, lsl #16 + uadd16 r8, r1, r8 + pkhbt r11, r9, r11, lsl #16 + usub16 r1, r12, r8 + uadd16 r8, r11, r6 + ldr r9, c0x00040004 + ldr r12, [sp, #40] + uadd16 r6, r10, r8 + usub16 r7, r10, r8 + uadd16 r7, r7, r9 + uadd16 r6, r6, r9 + uadd16 r10, r14, r1 + usub16 r1, r14, r1 + uadd16 r10, r10, r9 + uadd16 r1, r1, r9 + ldr r11, [r2], r12 + mov r8, r7, asr #3 + pkhtb r9, r8, r10, asr #19 + mov r8, r1, asr #3 + pkhtb r8, r8, r6, asr #19 + uxtb16 lr, r11, ror #8 + qadd16 r9, r9, lr + uxtb16 lr, r11 + qadd16 r8, r8, lr + usat16 r9, #8, r9 + usat16 r8, #8, r8 + orr r9, r8, r9, lsl #8 + ldr r11, [r2], r12 + ldr lr, [sp] + ldr r12, [sp, #44] + mov r7, r7, lsl #16 + mov r1, r1, lsl #16 + mov r10, r10, lsl #16 + mov r6, r6, lsl #16 + mov r7, r7, asr #3 + pkhtb r7, r7, r10, asr #19 + mov r1, r1, asr #3 + pkhtb r1, r1, r6, asr #19 + uxtb16 r8, r11, ror #8 + qadd16 r7, r7, r8 + uxtb16 r8, r11 + qadd16 r1, r1, r8 + usat16 r7, #8, r7 + usat16 r1, #8, r1 + orr r1, r1, r7, lsl #8 + str r9, [lr], r12 + str r1, [lr], r12 + str lr, [sp] + bne vp8_dequant_idct_loop2_v6 + +; vpx_memset + sub r0, r0, #32 + add sp, sp, #4 + + mov r12, #0 + str r12, [r0] + str r12, [r0, #4] + str r12, [r0, #8] + str r12, [r0, #12] + str r12, [r0, #16] + str r12, [r0, #20] + str r12, [r0, #24] + str r12, [r0, #28] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_dequant_idct_add_v6| + +; Constant Pool +cospi8sqrt2minus1 DCD 0x00004E7B +sinpi8sqrt2 DCD 0x00008A8C +c0x00040004 DCD 0x00040004 + + END diff --git a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm b/vp8/decoder/arm/armv6/dequantdcidct_v6.asm deleted file mode 100644 index 3daa9b3..0000000 --- a/vp8/decoder/arm/armv6/dequantdcidct_v6.asm +++ /dev/null @@ -1,202 +0,0 @@ -; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_dequant_dc_idct_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code -;void vp8_dequant_dc_idct_v6(short *input, short *dq, short *output, int pitch,int Dc) -|vp8_dequant_dc_idct_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r6, [sp, #36] ;load Dc - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r0, [sp] - - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - mov r12, #3 - -dequant_dc_idct_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne dequant_dc_idct_loop - - sub r0, r0, #32 - mov r1, r2 - mov r2, r3 - -; short_idct4x4llm_v6_dual - - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - mov r5, #0x2 ; i=2 i -loop1_dual_11 - ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 - ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 - ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 - - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s - pkhbt r7, r7, r9, lsl #16 ; 5c | 4c - smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c - pkhbt r8, r8, r10, lsl #16 ; 5s | 4s - uadd16 r6, r6, r7 ; 5c+5 | 4c+4 - smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s - smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c - smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s - subs r5, r5, #0x1 ; i-- -- - pkhbt r9, r9, r11, lsl #16 ; 13c | 12c - ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 - pkhbt r10, r10, r7, lsl #16 ; 13s | 12s - uadd16 r7, r12, r9 ; 13c+13 | 12c+12 - usub16 r7, r8, r7 ; c c - uadd16 r6, r6, r10 ; d d - uadd16 r10, r11, r14 ; a a - usub16 r8, r11, r14 ; b b - uadd16 r9, r10, r6 ; a+d a+d - usub16 r10, r10, r6 ; a-d a-d - uadd16 r6, r8, r7 ; b+c b+c - usub16 r7, r8, r7 ; b-c b-c - str r6, [r1, r2] ; o5 | o4 - add r6, r2, r2 ; pitch * 2 p2 - str r7, [r1, r6] ; o9 | o8 - add r6, r6, r2 ; pitch * 3 p3 - str r10, [r1, r6] ; o13 | o12 - str r9, [r1], #0x4 ; o1 | o0 ++ - bne loop1_dual_11 ; - mov r5, #0x2 ; i=2 i - sub r0, r1, #8 ; reset input/output i/o -loop2_dual_22 - ldr r6, [r0, r2] ; i5 | i4 5|4 - ldr r1, [r0] ; i1 | i0 1|0 - ldr r12, [r0, #0x4] ; i3 | i2 3|2 - add r14, r2, #0x4 ; pitch + 2 p+2 - ldr r14, [r0, r14] ; i7 | i6 7|6 - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s - pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 - pkhbt r7, r9, r7, lsl #16 ; 1c | 5c - pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1 - pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 - uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 - pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 - uadd16 r10, r11, r9 ; a a - usub16 r9, r11, r9 ; b b - pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 - subs r5, r5, #0x1 ; i-- -- - smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c - smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s - smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c - smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s - - pkhbt r7, r12, r7, lsl #16 ; 3c | 7c - pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 - uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 - usub16 r12, r8, r6 ; c (o1 | o5) c - uadd16 r6, r11, r1 ; d (o3 | o7) d - uadd16 r7, r10, r6 ; a+d a+d - mov r8, #0x4 ; set up 4's 4 - orr r8, r8, #0x40000 ; 4|4 - usub16 r6, r10, r6 ; a-d a-d - uadd16 r6, r6, r8 ; a-d+4 3|7 - uadd16 r7, r7, r8 ; a+d+4 0|4 - uadd16 r10, r9, r12 ; b+c b+c - usub16 r1, r9, r12 ; b-c b-c - uadd16 r10, r10, r8 ; b+c+4 1|5 - uadd16 r1, r1, r8 ; b-c+4 2|6 - mov r8, r10, asr #19 ; o1 >> 3 - strh r8, [r0, #2] ; o1 - mov r8, r1, asr #19 ; o2 >> 3 - strh r8, [r0, #4] ; o2 - mov r8, r6, asr #19 ; o3 >> 3 - strh r8, [r0, #6] ; o3 - mov r8, r7, asr #19 ; o0 >> 3 - strh r8, [r0], r2 ; o0 +p - sxth r10, r10 ; - mov r8, r10, asr #3 ; o5 >> 3 - strh r8, [r0, #2] ; o5 - sxth r1, r1 ; - mov r8, r1, asr #3 ; o6 >> 3 - strh r8, [r0, #4] ; o6 - sxth r6, r6 ; - mov r8, r6, asr #3 ; o7 >> 3 - strh r8, [r0, #6] ; o7 - sxth r7, r7 ; - mov r8, r7, asr #3 ; o4 >> 3 - strh r8, [r0], r2 ; o4 +p -;;;;; subs r5, r5, #0x1 ; i-- -- - bne loop2_dual_22 ; - - -;vpx_memset - ldr r0, [sp] - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, #28] - - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore - - ENDP ;|vp8_dequant_dc_idct_v68| - - END diff --git a/vp8/decoder/arm/armv6/dequantidct_v6.asm b/vp8/decoder/arm/armv6/dequantidct_v6.asm deleted file mode 100644 index 61bb48d..0000000 --- a/vp8/decoder/arm/armv6/dequantidct_v6.asm +++ /dev/null @@ -1,183 +0,0 @@ -; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_dequant_idct_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code -;void vp8_dequant_idct_v6(short *input, short *dq, short *output, int pitch) -|vp8_dequant_idct_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r0, [sp] - - mov r12, #4 - -dequant_idct_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne dequant_idct_loop - - sub r0, r0, #32 - mov r1, r2 - mov r2, r3 - -; short_idct4x4llm_v6_dual - - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - mov r5, #0x2 ; i=2 i -loop1_dual_1 - ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 - ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 - ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 - - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s - pkhbt r7, r7, r9, lsl #16 ; 5c | 4c - smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c - pkhbt r8, r8, r10, lsl #16 ; 5s | 4s - uadd16 r6, r6, r7 ; 5c+5 | 4c+4 - smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s - smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c - smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s - subs r5, r5, #0x1 ; i-- -- - pkhbt r9, r9, r11, lsl #16 ; 13c | 12c - ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 - pkhbt r10, r10, r7, lsl #16 ; 13s | 12s - uadd16 r7, r12, r9 ; 13c+13 | 12c+12 - usub16 r7, r8, r7 ; c c - uadd16 r6, r6, r10 ; d d - uadd16 r10, r11, r14 ; a a - usub16 r8, r11, r14 ; b b - uadd16 r9, r10, r6 ; a+d a+d - usub16 r10, r10, r6 ; a-d a-d - uadd16 r6, r8, r7 ; b+c b+c - usub16 r7, r8, r7 ; b-c b-c - str r6, [r1, r2] ; o5 | o4 - add r6, r2, r2 ; pitch * 2 p2 - str r7, [r1, r6] ; o9 | o8 - add r6, r6, r2 ; pitch * 3 p3 - str r10, [r1, r6] ; o13 | o12 - str r9, [r1], #0x4 ; o1 | o0 ++ - bne loop1_dual_1 ; - mov r5, #0x2 ; i=2 i - sub r0, r1, #8 ; reset input/output i/o -loop2_dual_2 - ldr r6, [r0, r2] ; i5 | i4 5|4 - ldr r1, [r0] ; i1 | i0 1|0 - ldr r12, [r0, #0x4] ; i3 | i2 3|2 - add r14, r2, #0x4 ; pitch + 2 p+2 - ldr r14, [r0, r14] ; i7 | i6 7|6 - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s - pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 - pkhbt r7, r9, r7, lsl #16 ; 1c | 5c - pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1 - pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 - uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 - pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 - uadd16 r10, r11, r9 ; a a - usub16 r9, r11, r9 ; b b - pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 - subs r5, r5, #0x1 ; i-- -- - smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c - smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s - smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c - smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s - - pkhbt r7, r12, r7, lsl #16 ; 3c | 7c - pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 - uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 - usub16 r12, r8, r6 ; c (o1 | o5) c - uadd16 r6, r11, r1 ; d (o3 | o7) d - uadd16 r7, r10, r6 ; a+d a+d - mov r8, #0x4 ; set up 4's 4 - orr r8, r8, #0x40000 ; 4|4 - usub16 r6, r10, r6 ; a-d a-d - uadd16 r6, r6, r8 ; a-d+4 3|7 - uadd16 r7, r7, r8 ; a+d+4 0|4 - uadd16 r10, r9, r12 ; b+c b+c - usub16 r1, r9, r12 ; b-c b-c - uadd16 r10, r10, r8 ; b+c+4 1|5 - uadd16 r1, r1, r8 ; b-c+4 2|6 - mov r8, r10, asr #19 ; o1 >> 3 - strh r8, [r0, #2] ; o1 - mov r8, r1, asr #19 ; o2 >> 3 - strh r8, [r0, #4] ; o2 - mov r8, r6, asr #19 ; o3 >> 3 - strh r8, [r0, #6] ; o3 - mov r8, r7, asr #19 ; o0 >> 3 - strh r8, [r0], r2 ; o0 +p - sxth r10, r10 ; - mov r8, r10, asr #3 ; o5 >> 3 - strh r8, [r0, #2] ; o5 - sxth r1, r1 ; - mov r8, r1, asr #3 ; o6 >> 3 - strh r8, [r0, #4] ; o6 - sxth r6, r6 ; - mov r8, r6, asr #3 ; o7 >> 3 - strh r8, [r0, #6] ; o7 - sxth r7, r7 ; - mov r8, r7, asr #3 ; o4 >> 3 - strh r8, [r0], r2 ; o4 +p -;;;;; subs r5, r5, #0x1 ; i-- -- - bne loop2_dual_2 ; - ; - -;vpx_memset - ldr r0, [sp] - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, #28] - - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore - - ENDP ;|vp8_dequant_idct_v6| - - END diff --git a/vp8/decoder/arm/armv6/dequantize_v6.asm b/vp8/decoder/arm/armv6/dequantize_v6.asm index 95e3859..72f7e0e 100644 --- a/vp8/decoder/arm/armv6/dequantize_v6.asm +++ b/vp8/decoder/arm/armv6/dequantize_v6.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/decoder/arm/armv6/idct_blk_v6.c new file mode 100644 index 0000000..3c7bc50 --- /dev/null +++ b/vp8/decoder/arm/armv6/idct_blk_v6.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_ports/config.h" +#include "idct.h" +#include "dequantize.h" + +void vp8_dequant_dc_idct_add_y_block_v6 + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs, short *dc) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_dc_idct_add_v6 (q, dq, pre, dst, 16, stride, dc[0]); + else + vp8_dc_only_idct_add_v6 (dc[0], pre, dst, 16, stride); + + if (eobs[1] > 1) + vp8_dequant_dc_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride, dc[1]); + else + vp8_dc_only_idct_add_v6 (dc[1], pre+4, dst+4, 16, stride); + + if (eobs[2] > 1) + vp8_dequant_dc_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride, dc[2]); + else + vp8_dc_only_idct_add_v6 (dc[2], pre+8, dst+8, 16, stride); + + if (eobs[3] > 1) + vp8_dequant_dc_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride, dc[3]); + else + vp8_dc_only_idct_add_v6 (dc[3], pre+12, dst+12, 16, stride); + + q += 64; + dc += 4; + pre += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_y_block_v6 + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, pre, dst, 16, stride); + else + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dst, 16, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride); + else + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dst+4, 16, stride); + ((int *)(q+16))[0] = 0; + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride); + else + { + vp8_dc_only_idct_add_v6 (q[32]*dq[0], pre+8, dst+8, 16, stride); + ((int *)(q+32))[0] = 0; + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride); + else + { + vp8_dc_only_idct_add_v6 (q[48]*dq[0], pre+12, dst+12, 16, stride); + ((int *)(q+48))[0] = 0; + } + + q += 64; + pre += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_v6 + (short *q, short *dq, unsigned char *pre, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, pre, dstu, 8, stride); + else + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstu, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstu+4, 8, stride); + else + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstu+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + pre += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_v6 (q, dq, pre, dstv, 8, stride); + else + { + vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstv, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstv+4, 8, stride); + else + { + vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstv+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + pre += 32; + dstv += 4*stride; + eobs += 2; + } +} diff --git a/vp8/decoder/arm/dboolhuff_arm.h b/vp8/decoder/arm/dboolhuff_arm.h index 495004f..d2ebc71 100644 --- a/vp8/decoder/arm/dboolhuff_arm.h +++ b/vp8/decoder/arm/dboolhuff_arm.h @@ -16,9 +16,6 @@ #undef vp8_dbool_start #define vp8_dbool_start vp8dx_start_decode_v6 -#undef vp8_dbool_stop -#define vp8_dbool_stop vp8dx_stop_decode_v6 - #undef vp8_dbool_fill #define vp8_dbool_fill vp8_bool_decoder_fill_v6 @@ -33,9 +30,6 @@ #undef vp8_dbool_start #define vp8_dbool_start vp8dx_start_decode_neon -#undef vp8_dbool_stop -#define vp8_dbool_stop vp8dx_stop_decode_neon - #undef vp8_dbool_fill #define vp8_dbool_fill vp8_bool_decoder_fill_neon diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/decoder/arm/dequantize_arm.c index 54006a9..3926587 100644 --- a/vp8/decoder/arm/dequantize_arm.c +++ b/vp8/decoder/arm/dequantize_arm.c @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h index c8a61a4..40151e0 100644 --- a/vp8/decoder/arm/dequantize_arm.h +++ b/vp8/decoder/arm/dequantize_arm.h @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ @@ -13,32 +14,56 @@ #if HAVE_ARMV6 extern prototype_dequant_block(vp8_dequantize_b_v6); -extern prototype_dequant_idct(vp8_dequant_idct_v6); -extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_v6); +extern prototype_dequant_idct_add(vp8_dequant_idct_add_v6); +extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_v6); +extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_v6); +extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_v6); +extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_v6); #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_v6 -#undef vp8_dequant_idct -#define vp8_dequant_idct vp8_dequant_idct_v6 +#undef vp8_dequant_idct_add +#define vp8_dequant_idct_add vp8_dequant_idct_add_v6 -#undef vp8_dequant_idct_dc -#define vp8_dequant_idct_dc vp8_dequant_dc_idct_v6 +#undef vp8_dequant_dc_idct_add +#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_v6 + +#undef vp8_dequant_dc_idct_add_y_block +#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_v6 + +#undef vp8_dequant_idct_add_y_block +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_v6 + +#undef vp8_dequant_idct_add_uv_block +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_v6 #endif #if HAVE_ARMV7 extern prototype_dequant_block(vp8_dequantize_b_neon); -extern prototype_dequant_idct(vp8_dequant_idct_neon); -extern prototype_dequant_idct_dc(vp8_dequant_dc_idct_neon); +extern prototype_dequant_idct_add(vp8_dequant_idct_add_neon); +extern prototype_dequant_dc_idct_add(vp8_dequant_dc_idct_add_neon); +extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neon); +extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon); +extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_neon -#undef vp8_dequant_idct -#define vp8_dequant_idct vp8_dequant_idct_neon +#undef vp8_dequant_idct_add +#define vp8_dequant_idct_add vp8_dequant_idct_add_neon + +#undef vp8_dequant_dc_idct_add +#define vp8_dequant_dc_idct_add vp8_dequant_dc_idct_add_neon + +#undef vp8_dequant_dc_idct_add_y_block +#define vp8_dequant_dc_idct_add_y_block vp8_dequant_dc_idct_add_y_block_neon + +#undef vp8_dequant_idct_add_y_block +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon -#undef vp8_dequant_idct_dc -#define vp8_dequant_idct_dc vp8_dequant_dc_idct_neon +#undef vp8_dequant_idct_add_uv_block +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon #endif #endif diff --git a/vp8/decoder/arm/detokenize.asm b/vp8/decoder/arm/detokenize.asm new file mode 100644 index 0000000..45e068a --- /dev/null +++ b/vp8/decoder/arm/detokenize.asm @@ -0,0 +1,320 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_decode_mb_tokens_v6| + + AREA |.text|, CODE, READONLY ; name this block of code + + INCLUDE vpx_asm_offsets.asm + +l_qcoeff EQU 0 +l_i EQU 4 +l_type EQU 8 +l_stop EQU 12 +l_c EQU 16 +l_l_ptr EQU 20 +l_a_ptr EQU 24 +l_bc EQU 28 +l_coef_ptr EQU 32 +l_stacksize EQU 64 + + +;; constant offsets -- these should be created at build time +c_block2above_offset EQU 25 +c_entropy_nodes EQU 11 +c_dct_eob_token EQU 11 + +|vp8_decode_mb_tokens_v6| PROC + stmdb sp!, {r4 - r11, lr} + sub sp, sp, #l_stacksize + mov r7, r1 ; type + mov r9, r0 ; detoken + + ldr r1, [r9, #detok_current_bc] + ldr r0, [r9, #detok_qcoeff_start_ptr] + mov r11, #0 ; i + mov r3, #16 ; stop + + cmp r7, #1 ; type ?= 1 + addeq r11, r11, #24 ; i = 24 + addeq r3, r3, #8 ; stop = 24 + addeq r0, r0, #3, 24 ; qcoefptr += 24*16 + + str r0, [sp, #l_qcoeff] + str r11, [sp, #l_i] + str r7, [sp, #l_type] + str r3, [sp, #l_stop] + str r1, [sp, #l_bc] + + add lr, r9, r7, lsl #2 ; detoken + type*4 + + ldr r8, [r1, #bool_decoder_user_buffer] + + ldr r10, [lr, #detok_coef_probs] + ldr r5, [r1, #bool_decoder_count] + ldr r6, [r1, #bool_decoder_range] + ldr r4, [r1, #bool_decoder_value] + + str r10, [sp, #l_coef_ptr] + +BLOCK_LOOP + ldr r3, [r9, #detok_ptr_block2leftabove] + ldr r1, [r9, #detok_L] + ldr r2, [r9, #detok_A] + ldrb r12, [r3, r11]! ; block2left[i] + ldrb r3, [r3, #c_block2above_offset]; block2above[i] + + cmp r7, #0 ; c = !type + moveq r7, #1 + movne r7, #0 + + ldrb r0, [r1, r12]! ; *(L += block2left[i]) + ldrb r3, [r2, r3]! ; *(A += block2above[i]) + mov lr, #c_entropy_nodes ; ENTROPY_NODES = 11 + +; VP8_COMBINEENTROPYCONTETEXTS(t, *a, *l) => t = ((*a) != 0) + ((*l) !=0) + cmp r0, #0 ; *l ?= 0 + movne r0, #1 + cmp r3, #0 ; *a ?= 0 + addne r0, r0, #1 ; t + + str r1, [sp, #l_l_ptr] ; save &l + str r2, [sp, #l_a_ptr] ; save &a + smlabb r0, r0, lr, r10 ; Prob = coef_probs + (t * ENTROPY_NODES) + mov r1, #0 ; t = 0 + str r7, [sp, #l_c] + + ;align 4 +COEFF_LOOP + ldr r3, [r9, #detok_ptr_coef_bands_x] + ldr lr, [r9, #detok_coef_tree_ptr] + ;STALL + ldrb r3, [r3, r7] ; coef_bands_x[c] + ;STALL + ;STALL + add r0, r0, r3 ; Prob += coef_bands_x[c] + +get_token_loop + ldrb r2, [r0, +r1, asr #1] ; Prob[t >> 1] + mov r3, r6, lsl #8 ; range << 8 + sub r3, r3, #256 ; (range << 8) - (1 << 8) + mov r10, #1 ; 1 + + smlawb r2, r3, r2, r10 ; split = 1 + (((range-1) * probability) >> 8) + + ldrb r12, [r8] ; load cx data byte in stall slot : r8 = bufptr + ;++ + + subs r3, r4, r2, lsl #24 ; value-(split<<24): used later to calculate shift for NORMALIZE + addhs r1, r1, #1 ; t += 1 + movhs r4, r3 ; value -= bigsplit (split << 24) + subhs r2, r6, r2 ; range -= split + ; movlo r6, r2 ; range = split + + ldrsb r1, [lr, r1] ; t = onyx_coef_tree_ptr[t] + +; NORMALIZE + clz r3, r2 ; vp8dx_bitreader_norm[range] + 24 + sub r3, r3, #24 ; vp8dx_bitreader_norm[range] + subs r5, r5, r3 ; count -= shift + mov r6, r2, lsl r3 ; range <<= shift + mov r4, r4, lsl r3 ; value <<= shift + +; if count <= 0, += BR_COUNT; value |= *bufptr++ << (BR_COUNT-count); BR_COUNT = 8, but need to upshift values by +16 + addle r5, r5, #8 ; count += 8 + rsble r3, r5, #24 ; 24 - count + addle r8, r8, #1 ; bufptr++ + orrle r4, r4, r12, lsl r3 ; value |= *bufptr << shift + 16 + + cmp r1, #0 ; t ?= 0 + bgt get_token_loop ; while (t > 0) + + cmn r1, #c_dct_eob_token ; if(t == -DCT_EOB_TOKEN) + beq END_OF_BLOCK ; break + + rsb lr, r1, #0 ; v = -t; + + cmp lr, #4 ; if(v > FOUR_TOKEN) + ble SKIP_EXTRABITS + + ldr r3, [r9, #detok_teb_base_ptr] + mov r11, #1 ; 1 in split = 1 + ... nope, v+= 1 << bits_count + add r7, r3, lr, lsl #4 ; detok_teb_base_ptr + (v << 4) + + ldrsh lr, [r7, #tokenextrabits_min_val] ; v = teb_ptr->min_val + ldrsh r0, [r7, #tokenextrabits_length] ; bits_count = teb_ptr->Length + +extrabits_loop + add r3, r0, r7 ; &teb_ptr->Probs[bits_count] + + ldrb r2, [r3, #4] ; probability. why +4? + mov r3, r6, lsl #8 ; range << 8 + sub r3, r3, #256 ; range << 8 + 1 << 8 + + smlawb r2, r3, r2, r11 ; split = 1 + (((range-1) * probability) >> 8) + + ldrb r12, [r8] ; *bufptr + ;++ + + subs r10, r4, r2, lsl #24 ; value - (split<<24) + movhs r4, r10 ; value = value - (split << 24) + subhs r2, r6, r2 ; range = range - split + addhs lr, lr, r11, lsl r0 ; v += ((UINT16)1<<bits_count) + +; NORMALIZE + clz r3, r2 ; shift - leading zeros in split + sub r3, r3, #24 ; don't count first 3 bytes + subs r5, r5, r3 ; count -= shift + mov r6, r2, lsl r3 ; range = range << shift + mov r4, r4, lsl r3 ; value <<= shift + + addle r5, r5, #8 ; count += BR_COUNT + addle r8, r8, #1 ; bufptr++ + rsble r3, r5, #24 ; BR_COUNT - count + orrle r4, r4, r12, lsl r3 ; value |= *bufptr << (BR_COUNT - count) + + subs r0, r0, #1 ; bits_count -- + bpl extrabits_loop + + +SKIP_EXTRABITS + ldr r11, [sp, #l_qcoeff] + ldr r0, [sp, #l_coef_ptr] ; Prob = coef_probs + + cmp r1, #0 ; check for nonzero token - if (t) + beq SKIP_EOB_CHECK ; if t is zero, we will skip the eob table chec + + add r3, r6, #1 ; range + 1 + mov r2, r3, lsr #1 ; split = (range + 1) >> 1 + + subs r3, r4, r2, lsl #24 ; value - (split<<24) + movhs r4, r3 ; value -= (split << 24) + subhs r2, r6, r2 ; range -= split + mvnhs r3, lr ; -v + addhs lr, r3, #1 ; v = (v ^ -1) + 1 + +; NORMALIZE + clz r3, r2 ; leading 0s in split + sub r3, r3, #24 ; shift + subs r5, r5, r3 ; count -= shift + mov r6, r2, lsl r3 ; range <<= shift + mov r4, r4, lsl r3 ; value <<= shift + ldrleb r2, [r8], #1 ; *(bufptr++) + addle r5, r5, #8 ; count += 8 + rsble r3, r5, #24 ; BR_COUNT - count + orrle r4, r4, r2, lsl r3 ; value |= *bufptr << (BR_COUNT - count) + + add r0, r0, #11 ; Prob += ENTROPY_NODES (11) + + cmn r1, #1 ; t < -ONE_TOKEN + + addlt r0, r0, #11 ; Prob += ENTROPY_NODES (11) + + mvn r1, #1 ; t = -1 ???? C is -2 + +SKIP_EOB_CHECK + ldr r7, [sp, #l_c] ; c + ldr r3, [r9, #detok_scan] + add r1, r1, #2 ; t+= 2 + cmp r7, #15 ; c should will be one higher + + ldr r3, [r3, +r7, lsl #2] ; scan[c] this needs pre-inc c value + add r7, r7, #1 ; c++ + add r3, r11, r3, lsl #1 ; qcoeff + scan[c] + + str r7, [sp, #l_c] ; store c + strh lr, [r3] ; qcoef_ptr[scan[c]] = v + + blt COEFF_LOOP + + sub r7, r7, #1 ; if(t != -DCT_EOB_TOKEN) --c + +END_OF_BLOCK + ldr r3, [sp, #l_type] ; type + ldr r10, [sp, #l_coef_ptr] ; coef_ptr + ldr r0, [sp, #l_qcoeff] ; qcoeff + ldr r11, [sp, #l_i] ; i + ldr r12, [sp, #l_stop] ; stop + + cmp r3, #0 ; type ?= 0 + moveq r1, #1 + movne r1, #0 + add r3, r11, r9 ; detok + i + + cmp r7, r1 ; c ?= !type + strb r7, [r3, #detok_eob] ; eob[i] = c + + ldr r7, [sp, #l_l_ptr] ; l + ldr r2, [sp, #l_a_ptr] ; a + movne r3, #1 ; t + moveq r3, #0 + + add r0, r0, #32 ; qcoeff += 32 (16 * 2?) + add r11, r11, #1 ; i++ + strb r3, [r7] ; *l = t + strb r3, [r2] ; *a = t + str r0, [sp, #l_qcoeff] ; qcoeff + str r11, [sp, #l_i] ; i + + cmp r11, r12 ; i < stop + ldr r7, [sp, #l_type] ; type + + blt BLOCK_LOOP + + cmp r11, #25 ; i ?= 25 + bne ln2_decode_mb_to + + ldr r12, [r9, #detok_qcoeff_start_ptr] + ldr r10, [r9, #detok_coef_probs] + mov r7, #0 ; type/i = 0 + mov r3, #16 ; stop = 16 + str r12, [sp, #l_qcoeff] ; qcoeff_ptr = qcoeff_start_ptr + str r7, [sp, #l_i] + str r7, [sp, #l_type] + str r3, [sp, #l_stop] + + str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type=0] + + b BLOCK_LOOP + +ln2_decode_mb_to + cmp r11, #16 ; i ?= 16 + bne ln1_decode_mb_to + + mov r10, #detok_coef_probs + add r10, r10, #2*4 ; coef_probs[type] + ldr r10, [r9, r10] ; detok + detok_coef_probs[type] + + mov r7, #2 ; type = 2 + mov r3, #24 ; stop = 24 + + str r7, [sp, #l_type] + str r3, [sp, #l_stop] + + str r10, [sp, #l_coef_ptr] ; coef_probs = coef_probs[type] + b BLOCK_LOOP + +ln1_decode_mb_to + ldr r2, [sp, #l_bc] + mov r0, #0 + nop + + str r8, [r2, #bool_decoder_user_buffer] + str r5, [r2, #bool_decoder_count] + str r4, [r2, #bool_decoder_value] + str r6, [r2, #bool_decoder_range] + + add sp, sp, #l_stacksize + ldmia sp!, {r4 - r11, pc} + + ENDP ; |vp8_decode_mb_tokens_v6| + + END diff --git a/vp8/decoder/arm/detokenize_arm.h b/vp8/decoder/arm/detokenize_arm.h new file mode 100644 index 0000000..9bb19b6 --- /dev/null +++ b/vp8/decoder/arm/detokenize_arm.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef DETOKENIZE_ARM_H +#define DETOKENIZE_ARM_H + +#if HAVE_ARMV6 +#if CONFIG_ARM_ASM_DETOK +void vp8_init_detokenizer(VP8D_COMP *dx); +void vp8_decode_mb_tokens_v6(DETOK *detoken, int type); +#endif +#endif + +#endif diff --git a/vp8/decoder/arm/detokenizearm_sjl.c b/vp8/decoder/arm/detokenizearm_sjl.c deleted file mode 100644 index c714452..0000000 --- a/vp8/decoder/arm/detokenizearm_sjl.c +++ /dev/null @@ -1,730 +0,0 @@ -/* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. - */ - - -#include "type_aliases.h" -#include "blockd.h" -#include "onyxd_int.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_ports/mem.h" - -#define BR_COUNT 8 -#define BOOL_DATA UINT8 - -#define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES -//ALIGN16 UINT16 onyx_coef_bands_x[16] = { 0, 1*OCB_X, 2*OCB_X, 3*OCB_X, 6*OCB_X, 4*OCB_X, 5*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 6*OCB_X, 7*OCB_X}; -DECLARE_ALIGNED(16, UINT8, vp8_coef_bands_x[16]) = { 0, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X}; - -#define EOB_CONTEXT_NODE 0 -#define ZERO_CONTEXT_NODE 1 -#define ONE_CONTEXT_NODE 2 -#define LOW_VAL_CONTEXT_NODE 3 -#define TWO_CONTEXT_NODE 4 -#define THREE_CONTEXT_NODE 5 -#define HIGH_LOW_CONTEXT_NODE 6 -#define CAT_ONE_CONTEXT_NODE 7 -#define CAT_THREEFOUR_CONTEXT_NODE 8 -#define CAT_THREE_CONTEXT_NODE 9 -#define CAT_FIVE_CONTEXT_NODE 10 - - - - -DECLARE_ALIGNED(16, static const TOKENEXTRABITS, vp8d_token_extra_bits2[MAX_ENTROPY_TOKENS]) = -{ - { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //ZERO_TOKEN - { 1, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //ONE_TOKEN - { 2, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //TWO_TOKEN - { 3, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //THREE_TOKEN - { 4, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //FOUR_TOKEN - { 5, 0, { 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY1 - { 7, 1, { 145, 165, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY2 - { 11, 2, { 140, 148, 173, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY3 - { 19, 3, { 135, 140, 155, 176, 0, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY4 - { 35, 4, { 130, 134, 141, 157, 180, 0, 0, 0, 0, 0, 0, 0 } }, //DCT_VAL_CATEGORY5 - { 67, 10, { 129, 130, 133, 140, 153, 177, 196, 230, 243, 254, 254, 0 } }, //DCT_VAL_CATEGORY6 - { 0, -1, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, // EOB TOKEN -}; - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ -DECLARE_ALIGNED(16, const UINT8, vp8_block2context_leftabove[25*3]) = -{ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, //end of vp8_block2context - 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 1, 1, 0, 0, 1, 1, 0, //end of vp8_block2left - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0 //end of vp8_block2above -}; - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -void vp8_reset_mb_tokens_context(MACROBLOCKD *x) -{ - ENTROPY_CONTEXT **const A = x->above_context; - ENTROPY_CONTEXT(* const L)[4] = x->left_context; - - ENTROPY_CONTEXT *a; - ENTROPY_CONTEXT *l; - int i; - - for (i = 0; i < 24; i++) - { - - a = A[ vp8_block2context[i] ] + vp8_block2above[i]; - l = L[ vp8_block2context[i] ] + vp8_block2left[i]; - - *a = *l = 0; - } - - if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) - { - a = A[Y2CONTEXT] + vp8_block2above[24]; - l = L[Y2CONTEXT] + vp8_block2left[24]; - *a = *l = 0; - } - - -} - -#define ONYXBLOCK2CONTEXT_OFFSET 0 -#define ONYXBLOCK2LEFT_OFFSET 25 -#define ONYXBLOCK2ABOVE_OFFSET 50 - -DECLARE_ALIGNED(16, const static unsigned char, norm[128]) = -{ - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -}; - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ -void init_detokenizer(VP8D_COMP *dx) -{ - const VP8_COMMON *const oc = & dx->common; - MACROBLOCKD *x = & dx->mb; - - dx->detoken.norm_ptr = (unsigned char *)norm; - dx->detoken.vp8_coef_tree_ptr = (vp8_tree_index *)vp8_coef_tree; - dx->detoken.ptr_onyxblock2context_leftabove = (UINT8 *)vp8_block2context_leftabove; - dx->detoken.ptr_onyx_coef_bands_x = vp8_coef_bands_x; - dx->detoken.scan = (int *)vp8_default_zig_zag1d; - dx->detoken.teb_base_ptr = (TOKENEXTRABITS *)vp8d_token_extra_bits2; - - dx->detoken.qcoeff_start_ptr = &x->qcoeff[0]; - - - dx->detoken.coef_probs[0] = (unsigned char *)(oc->fc.coef_probs [0] [ 0 ] [0]); - dx->detoken.coef_probs[1] = (unsigned char *)(oc->fc.coef_probs [1] [ 0 ] [0]); - dx->detoken.coef_probs[2] = (unsigned char *)(oc->fc.coef_probs [2] [ 0 ] [0]); - dx->detoken.coef_probs[3] = (unsigned char *)(oc->fc.coef_probs [3] [ 0 ] [0]); - -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - - -//shift = norm[range]; \ -// shift = norm_ptr[range]; \ - -#define NORMALIZE \ - /*if(range < 0x80)*/ \ - { \ - shift = detoken->norm_ptr[range]; \ - range <<= shift; \ - value <<= shift; \ - count -= shift; \ - if(count <= 0) \ - { \ - count += BR_COUNT ; \ - value |= (*bufptr) << (BR_COUNT-count); \ - bufptr++; \ - } \ - } -#if 1 -#define DECODE_AND_APPLYSIGN(value_to_sign) \ - split = (range + 1) >> 1; \ - if ( (value >> 24) < split ) \ - { \ - range = split; \ - v= value_to_sign; \ - } \ - else \ - { \ - range = range-split; \ - value = value-(split<<24); \ - v = -value_to_sign; \ - } \ - range +=range; \ - value +=value; \ - if (!--count) \ - { \ - count = BR_COUNT; \ - value |= *bufptr; \ - bufptr++; \ - } - -#define DECODE_AND_BRANCH_IF_ZERO(probability,branch) \ - { \ - split = 1 + ((( probability*(range-1) ) )>> 8); \ - if ( (value >> 24) < split ) \ - { \ - range = split; \ - NORMALIZE \ - goto branch; \ - } \ - value -= (split<<24); \ - range = range - split; \ - NORMALIZE \ - } - -#define DECODE_AND_LOOP_IF_ZERO(probability,branch) \ - { \ - split = 1 + ((( probability*(range-1) ) ) >> 8); \ - if ( (value >> 24) < split ) \ - { \ - range = split; \ - NORMALIZE \ - Prob = coef_probs; \ - ++c; \ - Prob += vp8_coef_bands_x[c]; \ - goto branch; \ - } \ - value -= (split<<24); \ - range = range - split; \ - NORMALIZE \ - } - -#define DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val) \ - DECODE_AND_APPLYSIGN(val) \ - Prob = coef_probs + (ENTROPY_NODES*2); \ - if(c < 15){\ - qcoeff_ptr [ scan[c] ] = (INT16) v; \ - ++c; \ - goto DO_WHILE; }\ - qcoeff_ptr [ scan[15] ] = (INT16) v; \ - goto BLOCK_FINISHED; - - -#define DECODE_EXTRABIT_AND_ADJUST_VAL(t,bits_count)\ - split = 1 + (((range-1) * vp8d_token_extra_bits2[t].Probs[bits_count]) >> 8); \ - if(value >= (split<<24))\ - {\ - range = range-split;\ - value = value-(split<<24);\ - val += ((UINT16)1<<bits_count);\ - }\ - else\ - {\ - range = split;\ - }\ - NORMALIZE -#endif - -#if 0 -int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) -{ - ENTROPY_CONTEXT **const A = x->above_context; - ENTROPY_CONTEXT(* const L)[4] = x->left_context; - const VP8_COMMON *const oc = & dx->common; - - BOOL_DECODER *bc = x->current_bc; - - ENTROPY_CONTEXT *a; - ENTROPY_CONTEXT *l; - int i; - - int eobtotal = 0; - - register int count; - - BOOL_DATA *bufptr; - register unsigned int range; - register unsigned int value; - const int *scan; - register unsigned int shift; - UINT32 split; - INT16 *qcoeff_ptr; - - UINT8 *coef_probs; - int type; - int stop; - INT16 val, bits_count; - INT16 c; - INT16 t; - INT16 v; - vp8_prob *Prob; - - //int *scan; - type = 3; - i = 0; - stop = 16; - - if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) - { - i = 24; - stop = 24; - type = 1; - qcoeff_ptr = &x->qcoeff[24*16]; - scan = vp8_default_zig_zag1d; - eobtotal -= 16; - } - else - { - scan = vp8_default_zig_zag1d; - qcoeff_ptr = &x->qcoeff[0]; - } - - count = bc->count; - range = bc->range; - value = bc->value; - bufptr = &bc->buffer[bc->pos]; - - - coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]); - -BLOCK_LOOP: - a = A[ vp8_block2context[i] ] + vp8_block2above[i]; - l = L[ vp8_block2context[i] ] + vp8_block2left[i]; - c = (INT16)(!type); - - VP8_COMBINEENTROPYCONTEXTS(t, *a, *l); - Prob = coef_probs; - Prob += t * ENTROPY_NODES; - -DO_WHILE: - Prob += vp8_coef_bands_x[c]; - DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED); - -CHECK_0_: - DECODE_AND_LOOP_IF_ZERO(Prob[ZERO_CONTEXT_NODE], CHECK_0_); - DECODE_AND_BRANCH_IF_ZERO(Prob[ONE_CONTEXT_NODE], ONE_CONTEXT_NODE_0_); - DECODE_AND_BRANCH_IF_ZERO(Prob[LOW_VAL_CONTEXT_NODE], LOW_VAL_CONTEXT_NODE_0_); - DECODE_AND_BRANCH_IF_ZERO(Prob[HIGH_LOW_CONTEXT_NODE], HIGH_LOW_CONTEXT_NODE_0_); - DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREEFOUR_CONTEXT_NODE], CAT_THREEFOUR_CONTEXT_NODE_0_); - DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_FIVE_CONTEXT_NODE], CAT_FIVE_CONTEXT_NODE_0_); - val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].min_val; - bits_count = vp8d_token_extra_bits2[DCT_VAL_CATEGORY6].Length; - - do - { - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY6, bits_count); - bits_count -- ; - } - while (bits_count >= 0); - - DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); - -CAT_FIVE_CONTEXT_NODE_0_: - val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY5].min_val; - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 4); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 3); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 2); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 1); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY5, 0); - DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); - -CAT_THREEFOUR_CONTEXT_NODE_0_: - DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_THREE_CONTEXT_NODE], CAT_THREE_CONTEXT_NODE_0_); - val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY4].min_val; - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 3); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 2); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 1); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY4, 0); - DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); - -CAT_THREE_CONTEXT_NODE_0_: - val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY3].min_val; - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 2); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 1); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY3, 0); - DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); - -HIGH_LOW_CONTEXT_NODE_0_: - DECODE_AND_BRANCH_IF_ZERO(Prob[CAT_ONE_CONTEXT_NODE], CAT_ONE_CONTEXT_NODE_0_); - - val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY2].min_val; - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 1); - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY2, 0); - DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); - -CAT_ONE_CONTEXT_NODE_0_: - val = vp8d_token_extra_bits2[DCT_VAL_CATEGORY1].min_val; - DECODE_EXTRABIT_AND_ADJUST_VAL(DCT_VAL_CATEGORY1, 0); - DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(val); - -LOW_VAL_CONTEXT_NODE_0_: - DECODE_AND_BRANCH_IF_ZERO(Prob[TWO_CONTEXT_NODE], TWO_CONTEXT_NODE_0_); - DECODE_AND_BRANCH_IF_ZERO(Prob[THREE_CONTEXT_NODE], THREE_CONTEXT_NODE_0_); - DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(4); - -THREE_CONTEXT_NODE_0_: - DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(3); - -TWO_CONTEXT_NODE_0_: - DECODE_SIGN_WRITE_COEFF_AND_CHECK_EXIT(2); - -ONE_CONTEXT_NODE_0_: - DECODE_AND_APPLYSIGN(1); - Prob = coef_probs + ENTROPY_NODES; - - if (c < 15) - { - qcoeff_ptr [ scan[c] ] = (INT16) v; - ++c; - goto DO_WHILE; - } - - qcoeff_ptr [ scan[15] ] = (INT16) v; -BLOCK_FINISHED: - t = ((x->Block[i].eob = c) != !type); // any nonzero data? - eobtotal += x->Block[i].eob; - *a = *l = t; - qcoeff_ptr += 16; - - i++; - - if (i < stop) - goto BLOCK_LOOP; - - if (i == 25) - { - scan = vp8_default_zig_zag1d;//x->scan_order1d; - type = 0; - i = 0; - stop = 16; - coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]); - qcoeff_ptr = &x->qcoeff[0]; - goto BLOCK_LOOP; - } - - if (i == 16) - { - type = 2; - coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]); - stop = 24; - goto BLOCK_LOOP; - } - - bc->count = count; - bc->value = value; - bc->range = range; - bc->pos = bufptr - bc->buffer; - return eobtotal; - -} -//#endif -#else -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -#if 0 -//uses relative offsets - -const vp8_tree_index vp8_coef_tree_x[ 22] = /* corresponding _CONTEXT_NODEs */ -{ - -DCT_EOB_TOKEN, 1, /* 0 = EOB */ - -ZERO_TOKEN, 1, /* 1 = ZERO */ - -ONE_TOKEN, 1, /* 2 = ONE */ - 2, 5, /* 3 = LOW_VAL */ - -TWO_TOKEN, 1, /* 4 = TWO */ - -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ - 2, 3, /* 6 = HIGH_LOW */ - -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ - 2, 3, /* 8 = CAT_THREEFOUR */ - -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ - -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ -}; -#endif - -#define _SCALEDOWN 8 //16 //8 - -int vp8_decode_mb_tokens_v5(DETOK *detoken, int type); - -int vp8_decode_mb_tokens_v5_c(DETOK *detoken, int type) -{ - BOOL_DECODER *bc = detoken->current_bc; - - ENTROPY_CONTEXT *a; - ENTROPY_CONTEXT *l; - int i; - - register int count; - - BOOL_DATA *bufptr; - register unsigned int range; - register unsigned int value; - register unsigned int shift; - UINT32 split; - INT16 *qcoeff_ptr; - - UINT8 *coef_probs; -// int type; - int stop; - INT16 c; - INT16 t; - INT16 v; - vp8_prob *Prob; - - - -// type = 3; - i = 0; - stop = 16; - qcoeff_ptr = detoken->qcoeff_start_ptr; - -// if( detoken->mode != B_PRED && detoken->mode != SPLITMV) - if (type == 1) - { - i += 24; - stop += 8; //24; -// type = 1; - qcoeff_ptr += 24 * 16; -// eobtotal-=16; - } - - count = bc->count; - range = bc->range; - value = bc->value; - bufptr = &bc->buffer[bc->pos]; - - - coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]); - -BLOCK_LOOP: - a = detoken->A[ detoken->ptr_onyxblock2context_leftabove[i] ]; - l = detoken->L[ detoken->ptr_onyxblock2context_leftabove[i] ]; - c = !type; - a += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2ABOVE_OFFSET]; - l += detoken->ptr_onyxblock2context_leftabove[i + ONYXBLOCK2LEFT_OFFSET]; - - //#define ONYX_COMBINEENTROPYCONTEXTS( Dest, A, B) \ - //Dest = ((A)!=0) + ((B)!=0); - - VP8_COMBINEENTROPYCONTEXTS(t, *a, *l); - - Prob = coef_probs; - Prob += t * ENTROPY_NODES; - t = 0; - - do - { - - { -// onyx_tree_index * onyx_coef_tree_ptr = onyx_coef_tree_x; - - Prob += detoken->ptr_onyx_coef_bands_x[c]; - - GET_TOKEN_START: - - do - { - split = 1 + (((range - 1) * (Prob[t>>1])) >> 8); - - if (value >> 24 >= split) - { - range = range - split; - value = value - (split << 24); - t += 1; - - //used to eliminate else branch - split = range; - } - - range = split; - - t = detoken->vp8_coef_tree_ptr[ t ]; - - NORMALIZE - - } - while (t > 0) ; - } - GET_TOKEN_STOP: - - if (t == -DCT_EOB_TOKEN) - { - break; - } - - v = -t; - - if (v > FOUR_TOKEN) - { - INT16 bits_count; - TOKENEXTRABITS *teb_ptr; - -// teb_ptr = &onyxd_token_extra_bits2[t]; -// teb_ptr = &onyxd_token_extra_bits2[v]; - teb_ptr = &detoken->teb_base_ptr[v]; - - - v = teb_ptr->min_val; - bits_count = teb_ptr->Length; - - do - { - split = 1 + (((range - 1) * teb_ptr->Probs[bits_count]) >> _SCALEDOWN); - - if ((value >> 24) >= split) - { - range = range - split; - value = value - (split << 24); - v += ((UINT16)1 << bits_count); - - //used to eliminate else branch - split = range; - } - - range = split; - - NORMALIZE - - bits_count -- ; - } - while (bits_count >= 0); - } - - Prob = coef_probs; - - if (t) - { - split = 1 + (((range - 1) * vp8_prob_half) >> 8); - - if ((value >> 24) >= split) - { - range = range - split; - value = value - (split << 24); - v = (v ^ -1) + 1; /* negate w/out conditionals */ - - //used to eliminate else branch - split = range; - } - - range = split; - - NORMALIZE - Prob += ENTROPY_NODES; - - if (t < -ONE_TOKEN) - Prob += ENTROPY_NODES; - - t = -2; - } - - //if t is zero, we will skip the eob table check - t += 2; - qcoeff_ptr [detoken->scan [c] ] = (INT16) v; - - } - while (++c < 16); - - if (t != -DCT_EOB_TOKEN) - { - --c; - } - - t = ((detoken->eob[i] = c) != !type); // any nonzero data? -// eobtotal += detoken->eob[i]; - *a = *l = t; - qcoeff_ptr += 16; - - i++; - - if (i < stop) - goto BLOCK_LOOP; - - if (i == 25) - { - type = 0; - i = 0; - stop = 16; -// coef_probs = (unsigned char *)(oc->fc.coef_probs [type] [ 0 ] [0]); - coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]); - qcoeff_ptr = detoken->qcoeff_start_ptr; - goto BLOCK_LOOP; - } - - if (i == 16) - { - type = 2; -// coef_probs =(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]); - coef_probs = detoken->coef_probs[type]; //(unsigned char *)( oc->fc.coef_probs [type] [ 0 ] [0]); - stop = 24; - goto BLOCK_LOOP; - } - - bc->count = count; - bc->value = value; - bc->range = range; - bc->pos = bufptr - bc->buffer; - return 0; -} -//#if 0 -int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x) -{ -// const ONYX_COMMON * const oc = & dx->common; - int eobtotal = 0; - int i, type; - /* - dx->detoken.norm_ptr = norm; - dx->detoken.onyx_coef_tree_ptr = onyx_coef_tree; - dx->detoken.ptr_onyxblock2context_leftabove = ONYXBLOCK2CONTEXT_LEFTABOVE; - dx->detoken.ptr_onyx_coef_bands_x = onyx_coef_bands_x; - dx->detoken.scan = default_zig_zag1d; - dx->detoken.teb_base_ptr = onyxd_token_extra_bits2; - - dx->detoken.qcoeff_start_ptr = &x->qcoeff[0]; - - dx->detoken.A = x->above_context; - dx->detoken.L = x->left_context; - - dx->detoken.coef_probs[0] = (unsigned char *)( oc->fc.coef_probs [0] [ 0 ] [0]); - dx->detoken.coef_probs[1] = (unsigned char *)( oc->fc.coef_probs [1] [ 0 ] [0]); - dx->detoken.coef_probs[2] = (unsigned char *)( oc->fc.coef_probs [2] [ 0 ] [0]); - dx->detoken.coef_probs[3] = (unsigned char *)( oc->fc.coef_probs [3] [ 0 ] [0]); - */ - - dx->detoken.current_bc = x->current_bc; - dx->detoken.A = x->above_context; - dx->detoken.L = x->left_context; - - type = 3; - - if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV) - { - type = 1; - eobtotal -= 16; - } - - vp8_decode_mb_tokens_v5(&dx->detoken, type); - - for (i = 0; i < 25; i++) - { - x->Block[i].eob = dx->detoken.eob[i]; - eobtotal += dx->detoken.eob[i]; - } - - return eobtotal; -} -#endif diff --git a/vp8/decoder/arm/detokenizearm_v6.asm b/vp8/decoder/arm/detokenizearm_v6.asm deleted file mode 100644 index 4d87ee5..0000000 --- a/vp8/decoder/arm/detokenizearm_v6.asm +++ /dev/null @@ -1,364 +0,0 @@ -; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_decode_mb_tokens_v5| - - AREA |.text|, CODE, READONLY ; name this block of code - - INCLUDE vpx_asm_offsets.asm - -l_qcoeff EQU 0 -l_i EQU 4 -l_type EQU 8 -l_stop EQU 12 -l_c EQU 16 -l_l_ptr EQU 20 -l_a_ptr EQU 24 -l_bc EQU 28 -l_coef_ptr EQU 32 -l_stacksize EQU 64 - - -;; constant offsets -- these should be created at build time -c_onyxblock2left_offset EQU 25 -c_onyxblock2above_offset EQU 50 -c_entropy_nodes EQU 11 -c_dct_eob_token EQU 11 - -|vp8_decode_mb_tokens_v5| PROC - stmdb sp!, {r4 - r11, lr} - sub sp, sp, #l_stacksize - mov r7, r1 - mov r9, r0 ;DETOK *detoken - - ldr r1, [r9, #detok_current_bc] - ldr r0, [r9, #detok_qcoeff_start_ptr] - mov r11, #0 - mov r3, #0x10 - - cmp r7, #1 - addeq r11, r11, #24 - addeq r3, r3, #8 - addeq r0, r0, #3, 24 - - str r0, [sp, #l_qcoeff] - str r11, [sp, #l_i] - str r7, [sp, #l_type] - str r3, [sp, #l_stop] - str r1, [sp, #l_bc] - - add lr, r9, r7, lsl #2 - - ldr r2, [r1, #bool_decoder_buffer] - ldr r3, [r1, #bool_decoder_pos] - - ldr r10, [lr, #detok_coef_probs] - ldr r5, [r1, #bool_decoder_count] - ldr r6, [r1, #bool_decoder_range] - ldr r4, [r1, #bool_decoder_value] - add r8, r2, r3 - - str r10, [sp, #l_coef_ptr] - - - ;align 4 -BLOCK_LOOP - ldr r3, [r9, #detok_ptr_onyxblock2context_leftabove] - ldr r2, [r9, #DETOK_A] - ldr r1, [r9, #DETOK_L] - ldrb r12, [r3, +r11] ; detoken->ptr_onyxblock2context_leftabove[i] - - cmp r7, #0 ; check type - moveq r7, #1 - movne r7, #0 - - ldr r0, [r2, +r12, lsl #2] ; a - add r1, r1, r12, lsl #4 - add r3, r3, r11 - - ldrb r2, [r3, #c_onyxblock2above_offset] - ldrb r3, [r3, #c_onyxblock2left_offset] - mov lr, #c_entropy_nodes -;; ;++ - - ldr r2, [r0, +r2, lsl #2]! - add r3, r1, r3, lsl #2 - str r3, [sp, #l_l_ptr] - ldr r3, [r3] - - cmp r2, #0 - movne r2, #1 - cmp r3, #0 - addne r2, r2, #1 - - str r0, [sp, #l_a_ptr] - smlabb r0, r2, lr, r10 - mov r1, #0 ; t = 0 - str r7, [sp, #l_c] - - ;align 4 -COEFF_LOOP - ldr r3, [r9, #detok_ptr_onyx_coef_bands_x] - ldr lr, [r9, #detok_onyx_coef_tree_ptr] - -;;the following two lines are used if onyx_coef_bands_x is UINT16 -;; add r3, r3, r7, lsl #1 -;; ldrh r3, [r3] - -;;the following line is used if onyx_coef_bands_x is UINT8 - ldrb r3, [r7, +r3] - - -;; ;++ -;; pld [r8] - ;++ - add r0, r0, r3 - - ;align 4 -get_token_loop - ldrb r2, [r0, +r1, asr #1] - mov r3, r6, lsl #8 - sub r3, r3, #256 ;split = 1 + (((range-1) * probability) >> 8) - mov r10, #1 - - smlawb r2, r3, r2, r10 - ldrb r12, [r8] ;load cx data byte in stall slot - ;++ - - subs r3, r4, r2, lsl #24 ;x = value-(split<<24) - addhs r1, r1, #1 ;t += 1 - movhs r4, r3 ;update value - subhs r2, r6, r2 ;range = range - split - movlo r6, r2 - -;;; ldrsbhs r1, [r1, +lr] - ldrsb r1, [r1, +lr] - - -;; use branch for short pipelines ??? -;; cmp r2, #0x80 -;; bcs |$LN22@decode_mb_to| - - clz r3, r2 - sub r3, r3, #24 - subs r5, r5, r3 - mov r6, r2, lsl r3 - mov r4, r4, lsl r3 - -;; use branch for short pipelines ??? -;; bgt |$LN22@decode_mb_to| - - addle r5, r5, #8 - rsble r3, r5, #8 - addle r8, r8, #1 - orrle r4, r4, r12, lsl r3 - -;;|$LN22@decode_mb_to| - - cmp r1, #0 - bgt get_token_loop - - cmn r1, #c_dct_eob_token ;if(t == -DCT_EOB_TOKEN) - beq END_OF_BLOCK - - rsb lr, r1, #0 ;v = -t; - - cmp lr, #4 ;if(v > FOUR_TOKEN) - ble SKIP_EXTRABITS - - ldr r3, [r9, #detok_teb_base_ptr] - mov r11, #1 - add r7, r3, lr, lsl #4 - - ldrsh lr, [r7, #tokenextrabits_min_val];v = teb_ptr->min_val - ldrsh r0, [r7, #tokenextrabits_length];bits_count = teb_ptr->Length - -extrabits_loop - add r3, r0, r7 - - ldrb r2, [r3, #4] - mov r3, r6, lsl #8 - sub r3, r3, #256 ;split = 1 + (((range-1) * probability) >> 8) - mov r10, #1 - - smlawb r2, r3, r2, r10 - ldrb r12, [r8] - ;++ - - subs r10, r4, r2, lsl #24 ;x = value-(split<<24) - movhs r4, r10 ;update value - subhs r2, r6, r2 ;range = range - split - addhs lr, lr, r11, lsl r0 ;v += ((UINT16)1<<bits_count) - movlo r6, r2 ;range = split - - -;; use branch for short pipelines ??? -;; cmp r2, #0x80 -;; bcs |$LN10@decode_mb_to| - - clz r3, r2 - sub r3, r3, #24 - subs r5, r5, r3 - mov r6, r2, lsl r3 ;range - mov r4, r4, lsl r3 ;value - - addle r5, r5, #8 - addle r8, r8, #1 - rsble r3, r5, #8 - orrle r4, r4, r12, lsl r3 - -;;|$LN10@decode_mb_to| - subs r0, r0, #1 - bpl extrabits_loop - - -SKIP_EXTRABITS - ldr r11, [sp, #l_qcoeff] - ldr r0, [sp, #l_coef_ptr] - - cmp r1, #0 ;check for nonzero token - beq SKIP_EOB_CHECK ;if t is zero, we will skip the eob table chec - - sub r3, r6, #1 ;range - 1 - ;++ - mov r3, r3, lsl #7 ; *= onyx_prob_half (128) - ;++ - mov r3, r3, lsr #8 - add r2, r3, #1 ;split - - subs r3, r4, r2, lsl #24 ;x = value-(split<<24) - movhs r4, r3 ;update value - subhs r2, r6, r2 ;range = range - split - mvnhs r3, lr - addhs lr, r3, #1 ;v = (v ^ -1) + 1 - movlo r6, r2 ;range = split - -;; use branch for short pipelines ??? -;; cmp r2, #0x80 -;; bcs |$LN6@decode_mb_to| - - clz r3, r2 - sub r3, r3, #24 - subs r5, r5, r3 - mov r6, r2, lsl r3 - mov r4, r4, lsl r3 - ldrleb r2, [r8], #1 - addle r5, r5, #8 - rsble r3, r5, #8 - orrle r4, r4, r2, lsl r3 - -;;|$LN6@decode_mb_to| - add r0, r0, #0xB - - cmn r1, #1 - - addlt r0, r0, #0xB - - mvn r1, #1 - -SKIP_EOB_CHECK - ldr r7, [sp, #l_c] - ldr r3, [r9, #detok_scan] - add r1, r1, #2 - cmp r7, #(0x10 - 1) ;assume one less for now.... increment below - - ldr r3, [r3, +r7, lsl #2] - add r7, r7, #1 - add r3, r11, r3, lsl #1 - - str r7, [sp, #l_c] - strh lr, [r3] - - blt COEFF_LOOP - - sub r7, r7, #1 ;if(t != -DCT_EOB_TOKEN) --c - -END_OF_BLOCK - ldr r3, [sp, #l_type] - ldr r10, [sp, #l_coef_ptr] - ldr r0, [sp, #l_qcoeff] - ldr r11, [sp, #l_i] - ldr r12, [sp, #l_stop] - - cmp r3, #0 - moveq r1, #1 - movne r1, #0 - add r3, r11, r9 - - cmp r7, r1 - strb r7, [r3, #detok_eob] - - ldr r7, [sp, #l_l_ptr] - ldr r2, [sp, #l_a_ptr] - movne r3, #1 - moveq r3, #0 - - add r0, r0, #0x20 - add r11, r11, #1 - str r3, [r7] - str r3, [r2] - str r0, [sp, #l_qcoeff] - str r11, [sp, #l_i] - - cmp r11, r12 ;i >= stop ? - ldr r7, [sp, #l_type] - mov lr, #0xB - - blt BLOCK_LOOP - - cmp r11, #0x19 - bne ln2_decode_mb_to - - ldr r12, [r9, #detok_qcoeff_start_ptr] - ldr r10, [r9, #detok_coef_probs] - mov r7, #0 - mov r3, #0x10 - str r12, [sp, #l_qcoeff] - str r7, [sp, #l_i] - str r7, [sp, #l_type] - str r3, [sp, #l_stop] - - str r10, [sp, #l_coef_ptr] - - b BLOCK_LOOP - -ln2_decode_mb_to - cmp r11, #0x10 - bne ln1_decode_mb_to - - ldr r10, [r9, #0x30] - - mov r7, #2 - mov r3, #0x18 - - str r7, [sp, #l_type] - str r3, [sp, #l_stop] - - str r10, [sp, #l_coef_ptr] - b BLOCK_LOOP - -ln1_decode_mb_to - ldr r2, [sp, #l_bc] - mov r0, #0 - nop - - ldr r3, [r2, #bool_decoder_buffer] - str r5, [r2, #bool_decoder_count] - str r4, [r2, #bool_decoder_value] - sub r3, r8, r3 - str r3, [r2, #bool_decoder_pos] - str r6, [r2, #bool_decoder_range] - - add sp, sp, #l_stacksize - ldmia sp!, {r4 - r11, pc} - - ENDP ; |vp8_decode_mb_tokens_v5| - - END diff --git a/vp8/decoder/arm/dsystemdependent.c b/vp8/decoder/arm/dsystemdependent.c index 455c83a..9dcf7b6 100644 --- a/vp8/decoder/arm/dsystemdependent.c +++ b/vp8/decoder/arm/dsystemdependent.c @@ -1,10 +1,11 @@ /* - * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. * - * Use of this source code is governed by a BSD-style license and patent - * grant that can be found in the LICENSE file in the root of the source - * tree. All contributing project authors may be found in the AUTHORS - * file in the root of the source tree. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. */ @@ -22,20 +23,14 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi) pbi->mb.rtcd = &pbi->common.rtcd; #if HAVE_ARMV7 pbi->dequant.block = vp8_dequantize_b_neon; - pbi->dequant.idct = vp8_dequant_idct_neon; - pbi->dequant.idct_dc = vp8_dequant_dc_idct_neon; pbi->dboolhuff.start = vp8dx_start_decode_c; - pbi->dboolhuff.stop = vp8dx_stop_decode_c; pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c; pbi->dboolhuff.debool = vp8dx_decode_bool_c; pbi->dboolhuff.devalue = vp8dx_decode_value_c; #elif HAVE_ARMV6 pbi->dequant.block = vp8_dequantize_b_v6; - pbi->dequant.idct = vp8_dequant_idct_v6; - pbi->dequant.idct_dc = vp8_dequant_dc_idct_v6; pbi->dboolhuff.start = vp8dx_start_decode_c; - pbi->dboolhuff.stop = vp8dx_stop_decode_c; pbi->dboolhuff.fill = vp8dx_bool_decoder_fill_c; pbi->dboolhuff.debool = vp8dx_decode_bool_c; pbi->dboolhuff.devalue = vp8dx_decode_value_c; diff --git a/vp8/decoder/arm/neon/dboolhuff_neon.asm b/vp8/decoder/arm/neon/dboolhuff_neon.asm index 7ec62a3..ff3ffda 100644 --- a/vp8/decoder/arm/neon/dboolhuff_neon.asm +++ b/vp8/decoder/arm/neon/dboolhuff_neon.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; diff --git a/vp8/decoder/arm/neon/dequantdcidct_neon.asm b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm index 3392f2c..f68a780 100644 --- a/vp8/decoder/arm/neon/dequantdcidct_neon.asm +++ b/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm @@ -1,38 +1,51 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp8_dequant_dc_idct_neon| + EXPORT |vp8_dequant_dc_idct_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dequant_dc_idct_c(short *input, short *dq, short *output, int pitch, int Dc); +;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred, +; unsigned char *dest, int pitch, int stride, +; int Dc); ; r0 short *input, ; r1 short *dq, -; r2 short *output, -; r3 int pitch, -; (stack) int Dc -|vp8_dequant_dc_idct_neon| PROC +; r2 unsigned char *pred +; r3 unsigned char *dest +; sp int pitch +; sp+4 int stride +; sp+8 int Dc +|vp8_dequant_dc_idct_add_neon| PROC vld1.16 {q3, q4}, [r0] vld1.16 {q5, q6}, [r1] - ldr r1, [sp] ;load Dc from stack + ldr r1, [sp, #8] ;load Dc from stack - ldr r12, _dcidct_coeff_ + ldr r12, _CONSTANTS_ vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon vmul.i16 q2, q4, q6 vmov.16 d2[0], r1 + ldr r1, [sp] ; pitch + vld1.32 {d14[0]}, [r2], r1 + vld1.32 {d14[1]}, [r2], r1 + vld1.32 {d15[0]}, [r2], r1 + vld1.32 {d15[1]}, [r2] + + ldr r1, [sp, #4] ; stride + ;|short_idct4x4llm_neon| PROC vld1.16 {d0}, [r12] vswp d3, d4 ;q2(vp[4] vp[12]) @@ -46,14 +59,9 @@ vshr.s16 q3, q3, #1 vshr.s16 q4, q4, #1 - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) + vqadd.s16 q3, q3, q2 vqadd.s16 q4, q4, q2 - ;d6 - c1:temp1 - ;d7 - d1:temp2 - ;d8 - d1:temp1 - ;d9 - c1:temp2 - vqsub.s16 d10, d6, d9 ;c1 vqadd.s16 d11, d7, d8 ;d1 @@ -82,7 +90,7 @@ vshr.s16 q3, q3, #1 vshr.s16 q4, q4, #1 - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) + vqadd.s16 q3, q3, q2 vqadd.s16 q4, q4, q2 vqsub.s16 d10, d6, d9 ;c1 @@ -100,34 +108,29 @@ vrshr.s16 d4, d4, #3 vrshr.s16 d5, d5, #3 - add r1, r2, r3 - add r12, r1, r3 - add r0, r12, r3 - vtrn.32 d2, d4 vtrn.32 d3, d5 vtrn.16 d2, d3 vtrn.16 d4, d5 - vst1.16 {d2}, [r2] - vst1.16 {d3}, [r1] - vst1.16 {d4}, [r12] - vst1.16 {d5}, [r0] + vaddw.u8 q1, q1, d14 + vaddw.u8 q2, q2, d15 - bx lr + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + + vst1.32 {d0[0]}, [r3], r1 + vst1.32 {d0[1]}, [r3], r1 + vst1.32 {d1[0]}, [r3], r1 + vst1.32 {d1[1]}, [r3] - ENDP + bx lr -;----------------- - AREA dcidct4x4_dat, DATA, READWRITE ;read/write by default -;Data section with name data_area is specified. DCD reserves space in memory for 48 data. -;One word each is reserved. Label filter_coeff can be used to access the data. -;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... -_dcidct_coeff_ - DCD dcidct_coeff -dcidct_coeff - DCD 0x4e7b4e7b, 0x8a8c8a8c + ENDP ; |vp8_dequant_dc_idct_add_neon| -;20091, 20091, 35468, 35468 +; Constant Pool +_CONSTANTS_ DCD cospi8sqrt2minus1 +cospi8sqrt2minus1 DCD 0x4e7b4e7b +sinpi8sqrt2 DCD 0x8a8c8a8c END diff --git a/vp8/decoder/arm/neon/dequantidct_neon.asm b/vp8/decoder/arm/neon/dequant_idct_neon.asm index bba4d5d..1923be4 100644 --- a/vp8/decoder/arm/neon/dequantidct_neon.asm +++ b/vp8/decoder/arm/neon/dequant_idct_neon.asm @@ -1,29 +1,41 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp8_dequant_idct_neon| + EXPORT |vp8_dequant_idct_add_neon| ARM REQUIRE8 PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dequant_idct_c(short *input, short *dq, short *output, int pitch); +;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred, +; unsigned char *dest, int pitch, int stride) ; r0 short *input, ; r1 short *dq, -; r2 short *output, -; r3 int pitch, -|vp8_dequant_idct_neon| PROC +; r2 unsigned char *pred +; r3 unsigned char *dest +; sp int pitch +; sp+4 int stride + +|vp8_dequant_idct_add_neon| PROC vld1.16 {q3, q4}, [r0] vld1.16 {q5, q6}, [r1] + ldr r1, [sp] ; pitch + vld1.32 {d14[0]}, [r2], r1 + vld1.32 {d14[1]}, [r2], r1 + vld1.32 {d15[0]}, [r2], r1 + vld1.32 {d15[1]}, [r2] + + ldr r1, [sp, #4] ; stride - ldr r12, _didct_coeff_ + ldr r12, _CONSTANTS_ vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon vmul.i16 q2, q4, q6 @@ -41,14 +53,9 @@ vshr.s16 q3, q3, #1 vshr.s16 q4, q4, #1 - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) + vqadd.s16 q3, q3, q2 vqadd.s16 q4, q4, q2 - ;d6 - c1:temp1 - ;d7 - d1:temp2 - ;d8 - d1:temp1 - ;d9 - c1:temp2 - vqsub.s16 d10, d6, d9 ;c1 vqadd.s16 d11, d7, d8 ;d1 @@ -77,7 +84,7 @@ vshr.s16 q3, q3, #1 vshr.s16 q4, q4, #1 - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) + vqadd.s16 q3, q3, q2 vqadd.s16 q4, q4, q2 vqsub.s16 d10, d6, d9 ;c1 @@ -95,34 +102,29 @@ vrshr.s16 d4, d4, #3 vrshr.s16 d5, d5, #3 - add r1, r2, r3 - add r12, r1, r3 - add r0, r12, r3 - vtrn.32 d2, d4 vtrn.32 d3, d5 vtrn.16 d2, d3 vtrn.16 d4, d5 - vst1.16 {d2}, [r2] - vst1.16 {d3}, [r1] - vst1.16 {d4}, [r12] - vst1.16 {d5}, [r0] + vaddw.u8 q1, q1, d14 + vaddw.u8 q2, q2, d15 - bx lr + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + + vst1.32 {d0[0]}, [r3], r1 + vst1.32 {d0[1]}, [r3], r1 + vst1.32 {d1[0]}, [r3], r1 + vst1.32 {d1[1]}, [r3] - ENDP + bx lr -;----------------- - AREA didct4x4_dat, DATA, READWRITE ;read/write by default -;Data section with name data_area is specified. DCD reserves space in memory for 48 data. -;One word each is reserved. Label filter_coeff can be used to access the data. -;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... -_didct_coeff_ - DCD didct_coeff -didct_coeff - DCD 0x4e7b4e7b, 0x8a8c8a8c + ENDP ; |vp8_dequant_idct_add_neon| -;20091, 20091, 35468, 35468 +; Constant Pool +_CONSTANTS_ DCD cospi8sqrt2minus1 +cospi8sqrt2minus1 DCD 0x4e7b4e7b +sinpi8sqrt2 DCD 0x8a8c8a8c END diff --git a/vp8/decoder/arm/neon/dequantizeb_neon.asm b/vp8/decoder/arm/neon/dequantizeb_neon.asm index 1bde946..c8e0c31 100644 --- a/vp8/decoder/arm/neon/dequantizeb_neon.asm +++ b/vp8/decoder/arm/neon/dequantizeb_neon.asm @@ -1,10 +1,11 @@ ; -; Copyright (c) 2010 The VP8 project authors. All Rights Reserved. +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. ; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. ; diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c new file mode 100644 index 0000000..4725e62 --- /dev/null +++ b/vp8/decoder/arm/neon/idct_blk_neon.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_ports/config.h" +#include "idct.h" +#include "dequantize.h" + +void vp8_dequant_dc_idct_add_y_block_neon + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs, short *dc) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_dc_idct_add_neon (q, dq, pre, dst, 16, stride, dc[0]); + else + vp8_dc_only_idct_add_neon (dc[0], pre, dst, 16, stride); + + if (eobs[1] > 1) + vp8_dequant_dc_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride, dc[1]); + else + vp8_dc_only_idct_add_neon (dc[1], pre+4, dst+4, 16, stride); + + if (eobs[2] > 1) + vp8_dequant_dc_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride, dc[2]); + else + vp8_dc_only_idct_add_neon (dc[2], pre+8, dst+8, 16, stride); + + if (eobs[3] > 1) + vp8_dequant_dc_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride, dc[3]); + else + vp8_dc_only_idct_add_neon (dc[3], pre+12, dst+12, 16, stride); + + q += 64; + dc += 4; + pre += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_y_block_neon + (short *q, short *dq, unsigned char *pre, + unsigned char *dst, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 4; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_neon (q, dq, pre, dst, 16, stride); + else + { + vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dst, 16, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride); + else + { + vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dst+4, 16, stride); + ((int *)(q+16))[0] = 0; + } + + if (eobs[2] > 1) + vp8_dequant_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride); + else + { + vp8_dc_only_idct_add_neon (q[32]*dq[0], pre+8, dst+8, 16, stride); + ((int *)(q+32))[0] = 0; + } + + if (eobs[3] > 1) + vp8_dequant_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride); + else + { + vp8_dc_only_idct_add_neon (q[48]*dq[0], pre+12, dst+12, 16, stride); + ((int *)(q+48))[0] = 0; + } + + q += 64; + pre += 64; + dst += 4*stride; + eobs += 4; + } +} + +void vp8_dequant_idct_add_uv_block_neon + (short *q, short *dq, unsigned char *pre, + unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +{ + int i; + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_neon (q, dq, pre, dstu, 8, stride); + else + { + vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstu, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstu+4, 8, stride); + else + { + vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstu+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + pre += 32; + dstu += 4*stride; + eobs += 2; + } + + for (i = 0; i < 2; i++) + { + if (eobs[0] > 1) + vp8_dequant_idct_add_neon (q, dq, pre, dstv, 8, stride); + else + { + vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstv, 8, stride); + ((int *)q)[0] = 0; + } + + if (eobs[1] > 1) + vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstv+4, 8, stride); + else + { + vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstv+4, 8, stride); + ((int *)(q+16))[0] = 0; + } + + q += 32; + pre += 32; + dstv += 4*stride; + eobs += 2; + } +} |