diff options
Diffstat (limited to 'libvpx/vp8')
107 files changed, 2464 insertions, 2003 deletions
diff --git a/libvpx/vp8/common/alloccommon.h b/libvpx/vp8/common/alloccommon.h index ea93c25..93e99d7 100644 --- a/libvpx/vp8/common/alloccommon.h +++ b/libvpx/vp8/common/alloccommon.h @@ -9,15 +9,23 @@ */ -#ifndef __INC_ALLOCCOMMON_H -#define __INC_ALLOCCOMMON_H +#ifndef VP8_COMMON_ALLOCCOMMON_H_ +#define VP8_COMMON_ALLOCCOMMON_H_ #include "onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_create_common(VP8_COMMON *oci); void vp8_remove_common(VP8_COMMON *oci); void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); void vp8_setup_version(VP8_COMMON *oci); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ALLOCCOMMON_H_ diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm index dc84c30..3991957 100644 --- a/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm +++ b/libvpx/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm @@ -53,7 +53,7 @@ loop orr r6, r6, r7 ; differences of all 4 pixels ; calculate total sum adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum + subs r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -77,7 +77,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -101,7 +101,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -127,7 +127,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; 
substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm index adc353d..915ee49 100644 --- a/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm +++ b/libvpx/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm @@ -51,7 +51,7 @@ loop orr r8, r8, r10 ; differences of all 4 pixels ; calculate total sum add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; substract negative differences from sum + sub r4, r4, r7 ; subtract negative differences from sum ; calculate sse uxtb16 r7, r8 ; byte (two pixels) to halfwords @@ -77,7 +77,7 @@ loop ; calculate total sum add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; substract negative differences from sum + sub r4, r4, r7 ; subtract negative differences from sum ; calculate sse uxtb16 r7, r8 ; byte (two pixels) to halfwords diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm index dd2ce68..3668dc5 100644 --- a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm +++ b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm @@ -58,7 +58,7 @@ loop orr r6, r6, r7 ; differences of all 4 pixels ; calculate total sum adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum + subs r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -89,7 +89,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -120,7 +120,7 @@ loop ; calculate total 
sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -153,7 +153,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm index f972d9b..b4e0959 100644 --- a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm +++ b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm @@ -69,7 +69,7 @@ loop orr r6, r6, r7 ; differences of all 4 pixels ; calculate total sum adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum + subs r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -111,7 +111,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -153,7 +153,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -195,7 +195,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords 
diff --git a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm index f5da9c0..10863e2 100644 --- a/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm +++ b/libvpx/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm @@ -59,7 +59,7 @@ loop orr r6, r6, r7 ; differences of all 4 pixels ; calculate total sum adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum + subs r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -90,7 +90,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -121,7 +121,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords @@ -154,7 +154,7 @@ loop ; calculate total sum add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum + sub r8, r8, r5 ; subtract negative differences from sum ; calculate sse uxtb16 r5, r6 ; byte (two pixels) to halfwords diff --git a/libvpx/vp8/common/arm/bilinearfilter_arm.h b/libvpx/vp8/common/arm/bilinearfilter_arm.h index b7155d3..6b84e6f 100644 --- a/libvpx/vp8/common/arm/bilinearfilter_arm.h +++ b/libvpx/vp8/common/arm/bilinearfilter_arm.h @@ -9,8 +9,12 @@ */ -#ifndef BILINEARFILTER_ARM_H -#define BILINEARFILTER_ARM_H +#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ +#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_filter_block2d_bil_first_pass_armv6 ( @@ -32,4 +36,8 @@ 
extern void vp8_filter_block2d_bil_second_pass_armv6 const short *vp8_filter ); -#endif /* BILINEARFILTER_ARM_H */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm deleted file mode 100644 index e392786..0000000 --- a/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm +++ /dev/null @@ -1,357 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict16x16_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_bilinear_predict16x16_neon| PROC - push {r4-r5, lr} - - adr r12, bifilter16_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16_only - - add r2, r12, r2, lsl #3 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {d31}, [r2] ;load first_pass filter - - beq firstpass_bfilter16x16_only - - sub sp, sp, #272 ;reserve space on stack for temporary storage - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - mov lr, sp - vld1.u8 {d5, d6, d7}, [r0], r1 - - mov r2, #3 ;loop counter - vld1.u8 {d8, d9, d10}, [r0], r1 - - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {d11, d12, d13}, [r0], r1 - - vdup.8 d1, d31[4] - -;First Pass: output_height lines x 
output_width columns (17x16) -filt_blk2d_fp16x16_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vqrshrn.u16 d21, q14, #7 - vld1.u8 {d5, d6, d7}, [r0], r1 - - vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result - vld1.u8 {d8, d9, d10}, [r0], r1 - vst1.u8 {d18, d19, d20, d21}, [lr]! 
- vld1.u8 {d11, d12, d13}, [r0], r1 - - bne filt_blk2d_fp16x16_loop_neon - -;First-pass filtering for rest 5 lines - vld1.u8 {d14, d15, d16}, [r0], r1 - - vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q10, d3, d0 - vmull.u8 q11, d5, d0 - vmull.u8 q12, d6, d0 - vmull.u8 q13, d8, d0 - vmull.u8 q14, d9, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - - vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q11, d5, d1 - vmlal.u8 q13, d8, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - - vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q12, d6, d1 - vmlal.u8 q14, d9, d1 - - vmull.u8 q1, d11, d0 - vmull.u8 q2, d12, d0 - vmull.u8 q3, d14, d0 - vmull.u8 q4, d15, d0 - - vext.8 d11, d11, d12, #1 ;construct src_ptr[1] - vext.8 d14, d14, d15, #1 - - vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q3, d14, d1 - - vext.8 d12, d12, d13, #1 - vext.8 d15, d15, d16, #1 - - vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q4, d15, d1 - - vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d11, q10, #7 - vqrshrn.u16 d12, q11, #7 - vqrshrn.u16 d13, q12, #7 - vqrshrn.u16 d14, q13, #7 - vqrshrn.u16 d15, q14, #7 - vqrshrn.u16 d16, q1, #7 - vqrshrn.u16 d17, q2, #7 - vqrshrn.u16 d18, q3, #7 - vqrshrn.u16 d19, q4, #7 - - vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result - vst1.u8 {d14, d15, d16, d17}, [lr]! - vst1.u8 {d18, d19}, [lr]! - -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - sub lr, lr, #272 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - vld1.u8 {d22, d23}, [lr]! ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - mov r12, #4 ;loop counter - -filt_blk2d_sp16x16_loop_neon - vld1.u8 {d24, d25}, [lr]! - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vld1.u8 {d26, d27}, [lr]! - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [lr]! 
- vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [lr]! - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - subs r12, r12, #1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - vst1.u8 {d4, d5}, [r4], r5 - vst1.u8 {d6, d7}, [r4], r5 - vmov q11, q15 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_sp16x16_loop_neon - - add sp, sp, #272 - - pop {r4-r5,pc} - -;-------------------- -firstpass_bfilter16x16_only - mov r2, #4 ;loop counter - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (16x16) -filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vld1.u8 {d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10}, [r0], r1 - vld1.u8 {d11, d12, d13}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp8_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, 
d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - vst1.u8 {d14, d15}, [r4], r5 ;store result - vqrshrn.u16 d21, q14, #7 - - vst1.u8 {d16, d17}, [r4], r5 - vst1.u8 {d18, d19}, [r4], r5 - vst1.u8 {d20, d21}, [r4], r5 - - bne filt_blk2d_fpo16x16_loop_neon - pop {r4-r5,pc} - -;--------------------- -secondpass_bfilter16x16_only -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - mov r12, #4 ;loop counter - vld1.u32 {d31}, [r3] ;load second_pass filter - vld1.u8 {d22, d23}, [r0], r1 ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - -filt_blk2d_spo16x16_loop_neon - vld1.u8 {d24, d25}, [r0], r1 - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vld1.u8 {d26, d27}, [r0], r1 - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [r0], r1 - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [r0], r1 - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - subs r12, r12, #1 - vst1.u8 {d4, d5}, [r4], r5 - vmov q11, q15 - vst1.u8 {d6, d7}, [r4], r5 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_spo16x16_loop_neon - pop {r4-r5,pc} - - ENDP - -;----------------- - -bifilter16_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git 
a/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm deleted file mode 100644 index 0ac6243..0000000 --- a/libvpx/vp8/common/arm/neon/bilinearpredict4x4_neon.asm +++ /dev/null @@ -1,130 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict4x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict4x4_neon| PROC - push {r4, lr} - - adr r12, bifilter4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x4) - vld1.u8 {d2}, [r0], r1 ;load src data - add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes) - - vld1.u8 {d3}, [r0], r1 - vld1.u32 {d31}, [r2] ;first_pass filter - - vld1.u8 {d4}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0-d1) - vld1.u8 {d5}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {d6}, [r0], r1 - - vshr.u64 q4, q1, #8 ;construct src_ptr[1] - vshr.u64 q5, q2, #8 - vshr.u64 d12, d6, #8 - - vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d4, d5 - vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q8, d4, d0 - vmull.u8 q9, d6, d0 - - vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 
q8, d10, d1 - vmlal.u8 q9, d12, d1 - - vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d29, q8, #7 - vqrshrn.u16 d30, q9, #7 - -;Second pass: 4x4 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 ;calculate Vfilter location - vld1.u32 {d31}, [r3] ;load second_pass filter - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d31[4] - - vmull.u8 q1, d28, d0 - vmull.u8 q2, d29, d0 - - vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step] - vext.8 d27, d29, d30, #4 - - vmlal.u8 q1, d26, d1 - vmlal.u8 q2, d27, d1 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - - vst1.32 {d2[0]}, [r4] ;store result - vst1.32 {d2[1]}, [r0] - vst1.32 {d3[0]}, [r1] - vst1.32 {d3[1]}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - - vld1.32 {d28[0]}, [r0], r1 ;load src data - vld1.32 {d28[1]}, [r0], r1 - vld1.32 {d29[0]}, [r0], r1 - vld1.32 {d29[1]}, [r0], r1 - vld1.32 {d30[0]}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.32 {d28[0]}, [r4], lr ;store result - vst1.32 {d28[1]}, [r4], lr - vst1.32 {d29[0]}, [r4], lr - vst1.32 {d29[1]}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm deleted file mode 100644 index 41f5c45..0000000 --- a/libvpx/vp8/common/arm/neon/bilinearpredict8x4_neon.asm +++ /dev/null @@ -1,135 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. 
An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict8x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x4_neon| PROC - push {r4, lr} - - adr r12, bifilter8x4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vld1.u8 {q5}, [r0], r1 - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d23, q7, #7 - vqrshrn.u16 d24, q8, #7 - vqrshrn.u16 d25, q9, #7 - vqrshrn.u16 d26, q10, #7 - -;Second pass: 4x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 
d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1] - vst1.u8 {d5}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter8x4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm b/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm deleted file mode 100644 index c4711bc..0000000 --- a/libvpx/vp8/common/arm/neon/bilinearpredict8x8_neon.asm +++ /dev/null @@ -1,183 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_bilinear_predict8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x8_neon| PROC - push {r4, lr} - - adr r12, bifilter8_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (9x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - - vld1.u8 {q1}, [r0], r1 ;load src data - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vld1.u8 {q2}, [r0], r1 - vqrshrn.u16 d23, q7, #7 - vld1.u8 {q3}, [r0], r1 - vqrshrn.u16 d24, q8, #7 - vld1.u8 {q4}, [r0], r1 - vqrshrn.u16 d25, q9, #7 - - ;first_pass filtering on the rest 5-line data - vld1.u8 {q5}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp8_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, 
d11, d1 - - vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d27, q7, #7 - vqrshrn.u16 d28, q8, #7 - vqrshrn.u16 d29, q9, #7 - vqrshrn.u16 d30, q10, #7 - -;Second pass: 8x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp8_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp8_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - vmlal.u8 q5, d27, d1 - vmlal.u8 q6, d28, d1 - vmlal.u8 q7, d29, d1 - vmlal.u8 q8, d30, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1], lr - vst1.u8 {d5}, [r1], lr - vst1.u8 {d6}, [r1], lr - vst1.u8 {d7}, [r1], lr - vst1.u8 {d8}, [r1], lr - vst1.u8 {d9}, [r1], lr - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - vld1.u8 {d27}, [r0], r1 - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - vst1.u8 {d26}, [r4], lr - vst1.u8 {d27}, [r4], lr - vst1.u8 {d28}, [r4], lr - vst1.u8 {d29}, [r4], lr - - pop {r4, pc} - - ENDP - 
-;----------------- - -bifilter8_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c new file mode 100644 index 0000000..e1c3c2b --- /dev/null +++ b/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static const uint16_t bifilter4_coeff[8][2] = { + {128, 0}, + {112, 16}, + { 96, 32}, + { 80, 48}, + { 64, 64}, + { 48, 80}, + { 32, 96}, + { 16, 112} +}; + +void vp8_bilinear_predict4x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8; + uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8; + uint32x2_t d28u32, d29u32, d30u32; + uint8x16_t q1u8, q2u8; + uint16x8_t q1u16, q2u16; + uint16x8_t q7u16, q8u16, q9u16; + uint64x2_t q4u64, q5u64; + uint64x1_t d12u64; + uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2; + + if (xoffset == 0) { // skip_1stpass_filter + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0); + src_ptr += src_pixels_per_line; + d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1); + src_ptr += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0); + src_ptr += src_pixels_per_line; + d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1); + src_ptr += src_pixels_per_line; + d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0); + d28u8 = vreinterpret_u8_u32(d28u32); + d29u8 
= vreinterpret_u8_u32(d29u32); + d30u8 = vreinterpret_u8_u32(d30u32); + } else { + d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d6u8 = vld1_u8(src_ptr); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + + d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); + + q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8); + q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8); + d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8); + + d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)), + vreinterpret_u32_u8(vget_high_u8(q1u8))); + d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)), + vreinterpret_u32_u8(vget_high_u8(q2u8))); + d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)), + vreinterpret_u32_u64(vget_high_u64(q4u64))); + d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)), + vreinterpret_u32_u64(vget_high_u64(q5u64))); + + q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8); + q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8); + q9u16 = vmull_u8(d6u8, d0u8); + + q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8); + q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8); + q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8); + + d28u8 = vqrshrn_n_u16(q7u16, 7); + d29u8 = vqrshrn_n_u16(q8u16, 7); + d30u8 = vqrshrn_n_u16(q9u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, 
vreinterpret_u32_u8(d29u8), 1); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d28u8, d0u8); + q2u16 = vmull_u8(d29u8, d0u8); + + d26u8 = vext_u8(d28u8, d29u8, 4); + d27u8 = vext_u8(d29u8, d30u8, 4); + + q1u16 = vmlal_u8(q1u16, d26u8, d1u8); + q2u16 = vmlal_u8(q2u16, d27u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0); + dst_ptr += dst_pitch; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1); + } + return; +} + +void vp8_bilinear_predict8x4_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8; + uint8x8_t d7u8, d9u8, d11u8, d22u8, d23u8, d24u8, d25u8, d26u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = 
vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + d26u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); + } + return; +} + +void vp8_bilinear_predict8x8_neon( + unsigned char *src_ptr, + int 
src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8, d11u8; + uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8, d30u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16; + uint16x8_t q6u16, q7u16, q8u16, q9u16, q10u16; + + if (xoffset == 0) { // skip_1stpass_filter + d22u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d23u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d24u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d25u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d26u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d27u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d28u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d29u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line; + d30u8 = vld1_u8(src_ptr); + } else { + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + + d22u8 = vqrshrn_n_u16(q6u16, 7); + d23u8 = vqrshrn_n_u16(q7u16, 7); + d24u8 = vqrshrn_n_u16(q8u16, 
7); + d25u8 = vqrshrn_n_u16(q9u16, 7); + + // first_pass filtering on the rest 5-line data + q1u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q2u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q5u8 = vld1q_u8(src_ptr); + + q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8); + q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8); + q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8); + q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8); + + d3u8 = vext_u8(vget_low_u8(q1u8), vget_high_u8(q1u8), 1); + d5u8 = vext_u8(vget_low_u8(q2u8), vget_high_u8(q2u8), 1); + d7u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1); + d9u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1); + d11u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1); + + q6u16 = vmlal_u8(q6u16, d3u8, d1u8); + q7u16 = vmlal_u8(q7u16, d5u8, d1u8); + q8u16 = vmlal_u8(q8u16, d7u8, d1u8); + q9u16 = vmlal_u8(q9u16, d9u8, d1u8); + q10u16 = vmlal_u8(q10u16, d11u8, d1u8); + + d26u8 = vqrshrn_n_u16(q6u16, 7); + d27u8 = vqrshrn_n_u16(q7u16, 7); + d28u8 = vqrshrn_n_u16(q8u16, 7); + d29u8 = vqrshrn_n_u16(q9u16, 7); + d30u8 = vqrshrn_n_u16(q10u16, 7); + } + + // secondpass_filter + if (yoffset == 0) { // skip_2ndpass_filter + vst1_u8((uint8_t *)dst_ptr, d22u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d23u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d24u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d25u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d26u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d27u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d28u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d29u8); + } else { + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q1u16 = vmull_u8(d22u8, d0u8); + q2u16 = vmull_u8(d23u8, d0u8); + q3u16 = 
vmull_u8(d24u8, d0u8); + q4u16 = vmull_u8(d25u8, d0u8); + q5u16 = vmull_u8(d26u8, d0u8); + q6u16 = vmull_u8(d27u8, d0u8); + q7u16 = vmull_u8(d28u8, d0u8); + q8u16 = vmull_u8(d29u8, d0u8); + + q1u16 = vmlal_u8(q1u16, d23u8, d1u8); + q2u16 = vmlal_u8(q2u16, d24u8, d1u8); + q3u16 = vmlal_u8(q3u16, d25u8, d1u8); + q4u16 = vmlal_u8(q4u16, d26u8, d1u8); + q5u16 = vmlal_u8(q5u16, d27u8, d1u8); + q6u16 = vmlal_u8(q6u16, d28u8, d1u8); + q7u16 = vmlal_u8(q7u16, d29u8, d1u8); + q8u16 = vmlal_u8(q8u16, d30u8, d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + vst1_u8((uint8_t *)dst_ptr, d2u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d3u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d4u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d5u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d6u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d7u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d8u8); dst_ptr += dst_pitch; + vst1_u8((uint8_t *)dst_ptr, d9u8); + } + return; +} + +void vp8_bilinear_predict16x16_neon( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + int i; + unsigned char tmp[272]; + unsigned char *tmpp; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; + uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; + uint8x8_t d19u8, d20u8, d21u8; + uint8x16_t q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + uint8x16_t q11u8, q12u8, q13u8, q14u8, q15u8; + uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; + uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; + + if (xoffset == 0) { // secondpass_bfilter16x16_only + d0u8 = 
vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + q11u8 = vld1q_u8(src_ptr); + src_ptr += src_pixels_per_line; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q13u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q14u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + q15u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; + } + + if (yoffset == 0) { // firstpass_bfilter16x16_only + d0u8 = 
vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + for (i = 4; i > 0 ; i--) { + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 =vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)dst_ptr, q7u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t 
*)dst_ptr, q8u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q9u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q10u8); dst_ptr += dst_pitch; + } + return; + } + + d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + // First Pass: output_height lines x output_width columns (17x16) + tmpp = tmp; + for (i = 3; i > 0; i--) { + q7u16 = vmull_u8(d2u8, d0u8); + q8u16 = vmull_u8(d3u8, d0u8); + q9u16 = vmull_u8(d5u8, d0u8); + q10u16 = vmull_u8(d6u8, d0u8); + q11u16 = vmull_u8(d8u8, d0u8); + q12u16 = vmull_u8(d9u8, d0u8); + q13u16 = vmull_u8(d11u8, d0u8); + q14u16 = vmull_u8(d12u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + d11u8 = vext_u8(d11u8, d12u8, 1); + + q7u16 = vmlal_u8(q7u16, d2u8, d1u8); + q9u16 = vmlal_u8(q9u16, d5u8, d1u8); + q11u16 = vmlal_u8(q11u16, d8u8, d1u8); + q13u16 = vmlal_u8(q13u16, d11u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + d12u8 = vext_u8(d12u8, d13u8, 1); + + q8u16 = vmlal_u8(q8u16, d3u8, d1u8); + q10u16 = vmlal_u8(q10u16, d6u8, d1u8); + q12u16 = vmlal_u8(q12u16, d9u8, d1u8); + q14u16 = vmlal_u8(q14u16, d12u8, d1u8); + + d14u8 = vqrshrn_n_u16(q7u16, 7); + d15u8 = vqrshrn_n_u16(q8u16, 7); + d16u8 = vqrshrn_n_u16(q9u16, 7); + d17u8 = vqrshrn_n_u16(q10u16, 7); + d18u8 = vqrshrn_n_u16(q11u16, 7); + d19u8 = vqrshrn_n_u16(q12u16, 7); + d20u8 = vqrshrn_n_u16(q13u16, 
7); + d21u8 = vqrshrn_n_u16(q14u16, 7); + + d2u8 = vld1_u8(src_ptr); + d3u8 = vld1_u8(src_ptr + 8); + d4u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d5u8 = vld1_u8(src_ptr); + d6u8 = vld1_u8(src_ptr + 8); + d7u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d8u8 = vld1_u8(src_ptr); + d9u8 = vld1_u8(src_ptr + 8); + d10u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + d11u8 = vld1_u8(src_ptr); + d12u8 = vld1_u8(src_ptr + 8); + d13u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + q10u8 = vcombine_u8(d20u8, d21u8); + + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q10u8); tmpp += 16; + } + + // First-pass filtering for rest 5 lines + d14u8 = vld1_u8(src_ptr); + d15u8 = vld1_u8(src_ptr + 8); + d16u8 = vld1_u8(src_ptr + 16); src_ptr += src_pixels_per_line; + + q9u16 = vmull_u8(d2u8, d0u8); + q10u16 = vmull_u8(d3u8, d0u8); + q11u16 = vmull_u8(d5u8, d0u8); + q12u16 = vmull_u8(d6u8, d0u8); + q13u16 = vmull_u8(d8u8, d0u8); + q14u16 = vmull_u8(d9u8, d0u8); + + d2u8 = vext_u8(d2u8, d3u8, 1); + d5u8 = vext_u8(d5u8, d6u8, 1); + d8u8 = vext_u8(d8u8, d9u8, 1); + + q9u16 = vmlal_u8(q9u16, d2u8, d1u8); + q11u16 = vmlal_u8(q11u16, d5u8, d1u8); + q13u16 = vmlal_u8(q13u16, d8u8, d1u8); + + d3u8 = vext_u8(d3u8, d4u8, 1); + d6u8 = vext_u8(d6u8, d7u8, 1); + d9u8 = vext_u8(d9u8, d10u8, 1); + + q10u16 = vmlal_u8(q10u16, d3u8, d1u8); + q12u16 = vmlal_u8(q12u16, d6u8, d1u8); + q14u16 = vmlal_u8(q14u16, d9u8, d1u8); + + q1u16 = vmull_u8(d11u8, d0u8); + q2u16 = vmull_u8(d12u8, d0u8); + q3u16 = vmull_u8(d14u8, d0u8); + q4u16 = vmull_u8(d15u8, d0u8); + + d11u8 = vext_u8(d11u8, d12u8, 1); + d14u8 = vext_u8(d14u8, d15u8, 1); + + q1u16 = vmlal_u8(q1u16, d11u8, d1u8); + q3u16 = vmlal_u8(q3u16, d14u8, d1u8); + + d12u8 = 
vext_u8(d12u8, d13u8, 1); + d15u8 = vext_u8(d15u8, d16u8, 1); + + q2u16 = vmlal_u8(q2u16, d12u8, d1u8); + q4u16 = vmlal_u8(q4u16, d15u8, d1u8); + + d10u8 = vqrshrn_n_u16(q9u16, 7); + d11u8 = vqrshrn_n_u16(q10u16, 7); + d12u8 = vqrshrn_n_u16(q11u16, 7); + d13u8 = vqrshrn_n_u16(q12u16, 7); + d14u8 = vqrshrn_n_u16(q13u16, 7); + d15u8 = vqrshrn_n_u16(q14u16, 7); + d16u8 = vqrshrn_n_u16(q1u16, 7); + d17u8 = vqrshrn_n_u16(q2u16, 7); + d18u8 = vqrshrn_n_u16(q3u16, 7); + d19u8 = vqrshrn_n_u16(q4u16, 7); + + q5u8 = vcombine_u8(d10u8, d11u8); + q6u8 = vcombine_u8(d12u8, d13u8); + q7u8 = vcombine_u8(d14u8, d15u8); + q8u8 = vcombine_u8(d16u8, d17u8); + q9u8 = vcombine_u8(d18u8, d19u8); + + vst1q_u8((uint8_t *)tmpp, q5u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q6u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q7u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q8u8); tmpp += 16; + vst1q_u8((uint8_t *)tmpp, q9u8); + + // secondpass_filter + d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]); + d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]); + + tmpp = tmp; + q11u8 = vld1q_u8(tmpp); + tmpp += 16; + for (i = 4; i > 0; i--) { + q12u8 = vld1q_u8(tmpp); tmpp += 16; + q13u8 = vld1q_u8(tmpp); tmpp += 16; + q14u8 = vld1q_u8(tmpp); tmpp += 16; + q15u8 = vld1q_u8(tmpp); tmpp += 16; + + q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); + q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); + q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); + q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); + q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); + q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); + q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); + q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); + + q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); + q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); + q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); + q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); + q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); + q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); + q7u16 = vmlal_u8(q7u16, 
vget_low_u8(q15u8), d1u8); + q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); + + d2u8 = vqrshrn_n_u16(q1u16, 7); + d3u8 = vqrshrn_n_u16(q2u16, 7); + d4u8 = vqrshrn_n_u16(q3u16, 7); + d5u8 = vqrshrn_n_u16(q4u16, 7); + d6u8 = vqrshrn_n_u16(q5u16, 7); + d7u8 = vqrshrn_n_u16(q6u16, 7); + d8u8 = vqrshrn_n_u16(q7u16, 7); + d9u8 = vqrshrn_n_u16(q8u16, 7); + + q1u8 = vcombine_u8(d2u8, d3u8); + q2u8 = vcombine_u8(d4u8, d5u8); + q3u8 = vcombine_u8(d6u8, d7u8); + q4u8 = vcombine_u8(d8u8, d9u8); + + q11u8 = q15u8; + + vst1q_u8((uint8_t *)dst_ptr, q1u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q2u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q3u8); dst_ptr += dst_pitch; + vst1q_u8((uint8_t *)dst_ptr, q4u8); dst_ptr += dst_pitch; + } + return; +} diff --git a/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm b/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm deleted file mode 100644 index bda4b96..0000000 --- a/libvpx/vp8/common/arm/neon/copymem16x16_neon.asm +++ /dev/null @@ -1,59 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_copy_mem16x16_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem16x16_neon| PROC - - vld1.u8 {q0}, [r0], r1 - vld1.u8 {q1}, [r0], r1 - vld1.u8 {q2}, [r0], r1 - vst1.u8 {q0}, [r2], r3 - vld1.u8 {q3}, [r0], r1 - vst1.u8 {q1}, [r2], r3 - vld1.u8 {q4}, [r0], r1 - vst1.u8 {q2}, [r2], r3 - vld1.u8 {q5}, [r0], r1 - vst1.u8 {q3}, [r2], r3 - vld1.u8 {q6}, [r0], r1 - vst1.u8 {q4}, [r2], r3 - vld1.u8 {q7}, [r0], r1 - vst1.u8 {q5}, [r2], r3 - vld1.u8 {q8}, [r0], r1 - vst1.u8 {q6}, [r2], r3 - vld1.u8 {q9}, [r0], r1 - vst1.u8 {q7}, [r2], r3 - vld1.u8 {q10}, [r0], r1 - vst1.u8 {q8}, [r2], r3 - vld1.u8 {q11}, [r0], r1 - vst1.u8 {q9}, [r2], r3 - vld1.u8 {q12}, [r0], r1 - vst1.u8 {q10}, [r2], r3 - vld1.u8 {q13}, [r0], r1 - vst1.u8 {q11}, [r2], r3 - vld1.u8 {q14}, [r0], r1 - vst1.u8 {q12}, [r2], r3 - vld1.u8 {q15}, [r0], r1 - vst1.u8 {q13}, [r2], r3 - vst1.u8 {q14}, [r2], r3 - vst1.u8 {q15}, [r2], r3 - - mov pc, lr - - ENDP ; |vp8_copy_mem16x16_neon| - - END diff --git a/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm b/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm deleted file mode 100644 index 35c0f67..0000000 --- a/libvpx/vp8/common/arm/neon/copymem8x4_neon.asm +++ /dev/null @@ -1,34 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_copy_mem8x4_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem8x4_neon| PROC - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vst1.u8 {d3}, [r2], r3 - - mov pc, lr - - ENDP ; |vp8_copy_mem8x4_neon| - - END diff --git a/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm b/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm deleted file mode 100644 index 1f5b941..0000000 --- a/libvpx/vp8/common/arm/neon/copymem8x8_neon.asm +++ /dev/null @@ -1,43 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_copy_mem8x8_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem8x8_neon| PROC - - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vld1.u8 {d4}, [r0], r1 - vst1.u8 {d3}, [r2], r3 - vld1.u8 {d5}, [r0], r1 - vst1.u8 {d4}, [r2], r3 - vld1.u8 {d6}, [r0], r1 - vst1.u8 {d5}, [r2], r3 - vld1.u8 {d7}, [r0], r1 - vst1.u8 {d6}, [r2], r3 - vst1.u8 {d7}, [r2], r3 - - mov pc, lr - - ENDP ; |vp8_copy_mem8x8_neon| - - END diff --git a/libvpx/vp8/common/arm/neon/copymem_neon.c b/libvpx/vp8/common/arm/neon/copymem_neon.c new file mode 100644 index 0000000..deced11 --- /dev/null +++ b/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +void vp8_copy_mem8x4_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 4; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem8x8_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + uint8x8_t vtmp; + int r; + + for (r = 0; r < 8; r++) { + vtmp = vld1_u8(src); + vst1_u8(dst, vtmp); + src += src_stride; + dst += dst_stride; + } +} + +void vp8_copy_mem16x16_neon( + unsigned char *src, + int src_stride, + unsigned char *dst, + int dst_stride) { + int r; + uint8x16_t qtmp; + + for (r = 0; r < 16; r++) { + qtmp = vld1q_u8(src); + vst1q_u8(dst, qtmp); + src += src_stride; + dst += dst_stride; + } +} diff --git a/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm deleted file mode 100644 index 79ff02c..0000000 --- a/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.asm +++ /dev/null @@ -1,54 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
-; - - - EXPORT |vp8_dc_only_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, -; int pred_stride, unsigned char *dst_ptr, -; int dst_stride) - -; r0 input_dc -; r1 pred_ptr -; r2 pred_stride -; r3 dst_ptr -; sp dst_stride - -|vp8_dc_only_idct_add_neon| PROC - add r0, r0, #4 - asr r0, r0, #3 - ldr r12, [sp] - vdup.16 q0, r0 - - vld1.32 {d2[0]}, [r1], r2 - vld1.32 {d2[1]}, [r1], r2 - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q1, q0, d2 - vaddw.u8 q2, q0, d4 - - vqmovun.s16 d2, q1 - vqmovun.s16 d4, q2 - - vst1.32 {d2[0]}, [r3], r12 - vst1.32 {d2[1]}, [r3], r12 - vst1.32 {d4[0]}, [r3], r12 - vst1.32 {d4[1]}, [r3] - - bx lr - - ENDP - - END diff --git a/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c b/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c new file mode 100644 index 0000000..ad5f41d --- /dev/null +++ b/libvpx/vp8/common/arm/neon/dc_only_idct_add_neon.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +void vp8_dc_only_idct_add_neon( + int16_t input_dc, + unsigned char *pred_ptr, + int pred_stride, + unsigned char *dst_ptr, + int dst_stride) { + int i; + uint16_t a1 = ((input_dc + 4) >> 3); + uint32x2_t d2u32 = vdup_n_u32(0); + uint8x8_t d2u8; + uint16x8_t q1u16; + uint16x8_t qAdd; + + qAdd = vdupq_n_u16(a1); + + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0); + pred_ptr += pred_stride; + d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1); + pred_ptr += pred_stride; + + q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0); + dst_ptr += dst_stride; + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1); + dst_ptr += dst_stride; + } +} diff --git a/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm b/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm deleted file mode 100644 index 602cce6..0000000 --- a/libvpx/vp8/common/arm/neon/dequant_idct_neon.asm +++ /dev/null @@ -1,131 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_dequant_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dequant_idct_add_neon(short *input, short *dq, -; unsigned char *dest, int stride) -; r0 short *input, -; r1 short *dq, -; r2 unsigned char *dest -; r3 int stride - -|vp8_dequant_idct_add_neon| PROC - vld1.16 {q3, q4}, [r0] - vld1.16 {q5, q6}, [r1] - - add r1, r2, r3 ; r1 = dest + stride - lsl r3, #1 ; 2x stride - - vld1.32 {d14[0]}, [r2], r3 - vld1.32 {d14[1]}, [r1], r3 - vld1.32 {d15[0]}, [r2] - vld1.32 {d15[1]}, [r1] - - adr r12, cospi8sqrt2minus1 ; pointer to the first constant - - vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon - vmul.i16 q2, q4, q6 - -;|short_idct4x4llm_neon| PROC - vld1.16 {d0}, [r12] - vswp d3, d4 ;q2(vp[4] vp[12]) - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - -; memset(input, 0, 32) -- 32bytes - vmov.i16 q14, #0 - - vswp d3, d4 - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vmov q15, q14 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vst1.16 {q14, q15}, [r0] - - vrshr.s16 d2, d2, #3 - vrshr.s16 d3, d3, #3 - vrshr.s16 d4, d4, #3 - vrshr.s16 d5, d5, #3 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vaddw.u8 q1, q1, d14 - vaddw.u8 q2, q2, d15 - - sub r2, r2, r3 - sub r1, r1, r3 - - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 
- - vst1.32 {d0[0]}, [r2], r3 - vst1.32 {d0[1]}, [r1], r3 - vst1.32 {d1[0]}, [r2] - vst1.32 {d1[1]}, [r1] - - bx lr - - ENDP ; |vp8_dequant_idct_add_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b4e7b -sinpi8sqrt2 DCD 0x8a8c8a8c - - END diff --git a/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/libvpx/vp8/common/arm/neon/dequant_idct_neon.c new file mode 100644 index 0000000..58e1192 --- /dev/null +++ b/libvpx/vp8/common/arm/neon/dequant_idct_neon.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 35468; /* NOTE(review): 35468 (0x8a8c, the same bits as the removed asm constant) is above INT16_MAX, so the stored value relies on the compiler's out-of-range conversion - presumably -30068; the vshr-by-1 plus vqadd of q2 after each vqdmulh looks like the compensation - confirm */ +/* Multiplies the 16 coefficients at input by the 16 factors at dq, zeroes input, runs two transform passes (each followed by a 4x4 transpose), and adds the rounded result to the four 4-byte rows at dst, dst + stride, dst + 2 * stride and dst + 3 * stride, saturating each byte. */ +void vp8_dequant_idct_add_neon( + int16_t *input, + int16_t *dq, + unsigned char *dst, + int stride) { + unsigned char *dst0; + int32x2_t d14, d15; + int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; + int16x8_t q1, q2, q3, q4, q5, q6; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x2x2_t d2tmp0, d2tmp1; + int16x4x2_t d2tmp2, d2tmp3; + + d14 = d15 = vdup_n_s32(0); + + // load input + q3 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + input += 8; + q4 = vld1q_s16(input); + vst1q_s16(input, qEmpty); + + // load dq + q5 = vld1q_s16(dq); + dq += 8; + q6 = vld1q_s16(dq); + + // load src from dst + dst0 = dst; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0); + dst0 += stride; + d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0); + dst0 += stride; + d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1); + + q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3), + vreinterpretq_u16_s16(q5))); + q2 = 
vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4), + vreinterpretq_u16_s16(q6))); + + d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2)); + d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2)); + + q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2)); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + /* transpose the 4x4 result so the second pass works along the other dimension */ + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + // loop 2 + q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]); + + q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); + q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); + + d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); + d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); + + q3 = vshrq_n_s16(q3, 1); + q4 = vshrq_n_s16(q4, 1); + + q3 = vqaddq_s16(q3, q2); + q4 = vqaddq_s16(q4, q2); + + d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); + d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); + + d2 = vqadd_s16(d12, d11); + d3 = vqadd_s16(d13, d10); + d4 = vqsub_s16(d13, d10); + d5 = vqsub_s16(d12, d11); + /* rounding shift: (x + 4) >> 3 per lane */ + d2 = vrshr_n_s16(d2, 3); + d3 = vrshr_n_s16(d3, 3); + d4 = vrshr_n_s16(d4, 3); + d5 = vrshr_n_s16(d5, 3); + + d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); + d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); + d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), + vreinterpret_s16_s32(d2tmp1.val[0])); + d2tmp3 = 
vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), + vreinterpret_s16_s32(d2tmp1.val[1])); + + q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]); + q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]); + /* widen the 8 destination bytes held in d14 / d15 and add them to the residual */ + q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), + vreinterpret_u8_s32(d14))); + q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), + vreinterpret_u8_s32(d15))); + /* saturate each signed 16-bit sum to 0..255 and narrow to bytes */ + d14 = vreinterpret_s32_u8(vqmovun_s16(q1)); + d15 = vreinterpret_s32_u8(vqmovun_s16(q2)); + /* write back one 32-bit lane (4 pixels) per row */ + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d14, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d14, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d15, 1); + return; +} diff --git a/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm b/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm deleted file mode 100644 index c8e0c31..0000000 --- a/libvpx/vp8/common/arm/neon/dequantizeb_neon.asm +++ /dev/null @@ -1,34 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_dequantize_b_loop_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 short *Q, -; r1 short *DQC -; r2 short *DQ -|vp8_dequantize_b_loop_neon| PROC - vld1.16 {q0, q1}, [r0] - vld1.16 {q2, q3}, [r1] - - vmul.i16 q4, q0, q2 - vmul.i16 q5, q1, q3 - - vst1.16 {q4, q5}, [r2] - - bx lr - - ENDP - - END diff --git a/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/libvpx/vp8/common/arm/neon/dequantizeb_neon.c new file mode 100644 index 0000000..60f69c8 --- /dev/null +++ b/libvpx/vp8/common/arm/neon/dequantizeb_neon.c @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +/* Multiplies the 16 int16 values at Q lane-wise by the 16 values at DQC and stores the 16 products (low 16 bits of each) at DQ. The vld2q loads de-interleave and the vst2q store re-interleaves, so DQ[i] = Q[i] * DQC[i] in the original element order. */ +void vp8_dequantize_b_loop_neon( + int16_t *Q, + int16_t *DQC, + int16_t *DQ) { + int16x8x2_t qQ, qDQC, qDQ; + + qQ = vld2q_s16(Q); + qDQC = vld2q_s16(DQC); + /* val[0] holds the even-indexed elements and val[1] the odd-indexed ones, identically for Q and DQC */ + qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]); + qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]); + + vst2q_s16(DQ, qDQ); + return; +} diff --git a/libvpx/vp8/common/blockd.h b/libvpx/vp8/common/blockd.h index f7ff577..ea1a6a4 100644 --- a/libvpx/vp8/common/blockd.h +++ b/libvpx/vp8/common/blockd.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_BLOCKD_H -#define __INC_BLOCKD_H +#ifndef VP8_COMMON_BLOCKD_H_ +#define VP8_COMMON_BLOCKD_H_ void vpx_log(const char *format, ...); @@ -20,6 +20,10 @@ void vpx_log(const char *format, ...); #include "treecoder.h" #include "vpx_ports/mem.h" +#ifdef __cplusplus +extern "C" { +#endif + /*#define DCPRED 1*/ #define DCPREDSIMTHRESH 0 #define DCPREDCNTTHRESH 3 @@ -297,4 +301,8 @@ typedef struct macroblockd extern void vp8_build_block_doffsets(MACROBLOCKD *x); extern void 
vp8_setup_block_dptrs(MACROBLOCKD *x); -#endif /* __INC_BLOCKD_H */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_BLOCKD_H_ diff --git a/libvpx/vp8/common/coefupdateprobs.h b/libvpx/vp8/common/coefupdateprobs.h index 9e194dc..d96a19e 100644 --- a/libvpx/vp8/common/coefupdateprobs.h +++ b/libvpx/vp8/common/coefupdateprobs.h @@ -8,6 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_COMMON_COEFUPDATEPROBS_H_ +#define VP8_COMMON_COEFUPDATEPROBS_H_ + +#ifdef __cplusplus +extern "C" { +#endif /* Update probabilities for the nodes in the token entropy tree. Generated file included by entropy.c */ @@ -183,3 +189,9 @@ const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTE }, }, }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_COEFUPDATEPROBS_H_ diff --git a/libvpx/vp8/common/common.h b/libvpx/vp8/common/common.h index 2cc1c54..ee5b58c 100644 --- a/libvpx/vp8/common/common.h +++ b/libvpx/vp8/common/common.h @@ -9,8 +9,8 @@ */ -#ifndef common_h -#define common_h 1 +#ifndef VP8_COMMON_COMMON_H_ +#define VP8_COMMON_COMMON_H_ #include <assert.h> @@ -18,6 +18,10 @@ #include "vpx_mem/vpx_mem.h" +#ifdef __cplusplus +extern "C" { +#endif + /* Only need this for fixed-size arrays, for structs just assign. */ #define vp8_copy( Dest, Src) { \ @@ -37,4 +41,8 @@ #define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest)); -#endif /* common_h */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_COMMON_H_ diff --git a/libvpx/vp8/common/default_coef_probs.h b/libvpx/vp8/common/default_coef_probs.h index 0d19563..4d69e4b 100644 --- a/libvpx/vp8/common/default_coef_probs.h +++ b/libvpx/vp8/common/default_coef_probs.h @@ -8,6 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#define VP8_COMMON_DEFAULT_COEF_PROBS_H_ + +#ifdef __cplusplus +extern "C" { +#endif /*Generated file, included by entropy.c*/ @@ -186,3 +192,9 @@ static const vp8_prob default_coef_probs [BLOCK_TYPES] } } }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/libvpx/vp8/common/entropy.h b/libvpx/vp8/common/entropy.h index 5389bc1..a90bab4 100644 --- a/libvpx/vp8/common/entropy.h +++ b/libvpx/vp8/common/entropy.h @@ -9,12 +9,16 @@ */ -#ifndef __INC_ENTROPY_H -#define __INC_ENTROPY_H +#ifndef VP8_COMMON_ENTROPY_H_ +#define VP8_COMMON_ENTROPY_H_ #include "treecoder.h" #include "blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + /* Coefficient token alphabet */ #define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ @@ -98,4 +102,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_default_zig_zag_mask[16]); extern const int vp8_mb_feature_data_bits[MB_LVL_MAX]; void vp8_coef_tree_initialize(void); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ENTROPY_H_ diff --git a/libvpx/vp8/common/entropymode.h b/libvpx/vp8/common/entropymode.h index 1df0f64..81bdfc4 100644 --- a/libvpx/vp8/common/entropymode.h +++ b/libvpx/vp8/common/entropymode.h @@ -9,12 +9,16 @@ */ -#ifndef __INC_ENTROPYMODE_H -#define __INC_ENTROPYMODE_H +#ifndef VP8_COMMON_ENTROPYMODE_H_ +#define VP8_COMMON_ENTROPYMODE_H_ #include "onyxc_int.h" #include "treecoder.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef enum { SUBMVREF_NORMAL, @@ -77,4 +81,8 @@ void vp8_init_mbmode_probs(VP8_COMMON *x); void vp8_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES-1]); void vp8_kf_default_bmode_probs(vp8_prob dest [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES-1]); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ENTROPYMODE_H_ diff --git a/libvpx/vp8/common/entropymv.h b/libvpx/vp8/common/entropymv.h index 2db1e38..42840d5 100644 --- a/libvpx/vp8/common/entropymv.h +++ 
b/libvpx/vp8/common/entropymv.h @@ -9,11 +9,15 @@ */ -#ifndef __INC_ENTROPYMV_H -#define __INC_ENTROPYMV_H +#ifndef VP8_COMMON_ENTROPYMV_H_ +#define VP8_COMMON_ENTROPYMV_H_ #include "treecoder.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { mv_max = 1023, /* max absolute value of a MV component */ @@ -41,4 +45,8 @@ typedef struct mv_context extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ENTROPYMV_H_ diff --git a/libvpx/vp8/common/extend.h b/libvpx/vp8/common/extend.h index 74a0b17..068f4ac 100644 --- a/libvpx/vp8/common/extend.h +++ b/libvpx/vp8/common/extend.h @@ -9,11 +9,15 @@ */ -#ifndef __INC_EXTEND_H -#define __INC_EXTEND_H +#ifndef VP8_COMMON_EXTEND_H_ +#define VP8_COMMON_EXTEND_H_ #include "vpx_scale/yv12config.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr); void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst); @@ -22,4 +26,8 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, int srcy, int srcx, int srch, int srcw); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_EXTEND_H_ diff --git a/libvpx/vp8/common/filter.h b/libvpx/vp8/common/filter.h index ccda7c8..cfba775 100644 --- a/libvpx/vp8/common/filter.h +++ b/libvpx/vp8/common/filter.h @@ -9,11 +9,15 @@ */ -#ifndef FILTER_H -#define FILTER_H +#ifndef VP8_COMMON_FILTER_H_ +#define VP8_COMMON_FILTER_H_ #include "vpx_ports/mem.h" +#ifdef __cplusplus +extern "C" { +#endif + #define BLOCK_HEIGHT_WIDTH 4 #define VP8_FILTER_WEIGHT 128 #define VP8_FILTER_SHIFT 7 @@ -21,4 +25,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]); extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_FILTER_H_ diff --git a/libvpx/vp8/common/findnearmv.h 
b/libvpx/vp8/common/findnearmv.h index c60e463..3c8c050 100644 --- a/libvpx/vp8/common/findnearmv.h +++ b/libvpx/vp8/common/findnearmv.h @@ -9,14 +9,18 @@ */ -#ifndef __INC_FINDNEARMV_H -#define __INC_FINDNEARMV_H +#ifndef VP8_COMMON_FINDNEARMV_H_ +#define VP8_COMMON_FINDNEARMV_H_ #include "mv.h" #include "blockd.h" #include "modecont.h" #include "treecoder.h" +#ifdef __cplusplus +extern "C" { +#endif + static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias) @@ -179,4 +183,8 @@ static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi return (cur_mb->bmi + b - 4)->as_mode; } +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_FINDNEARMV_H_ diff --git a/libvpx/vp8/common/header.h b/libvpx/vp8/common/header.h index 3e98eeb..e27bca1 100644 --- a/libvpx/vp8/common/header.h +++ b/libvpx/vp8/common/header.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_HEADER_H -#define __INC_HEADER_H +#ifndef VP8_COMMON_HEADER_H_ +#define VP8_COMMON_HEADER_H_ + +#ifdef __cplusplus +extern "C" { +#endif /* 24 bits total */ typedef struct @@ -40,4 +44,8 @@ typedef struct #endif +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_HEADER_H_ diff --git a/libvpx/vp8/common/invtrans.h b/libvpx/vp8/common/invtrans.h index 9262640..affe57e 100644 --- a/libvpx/vp8/common/invtrans.h +++ b/libvpx/vp8/common/invtrans.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_INVTRANS_H -#define __INC_INVTRANS_H +#ifndef VP8_COMMON_INVTRANS_H_ +#define VP8_COMMON_INVTRANS_H_ #include "vpx_config.h" #include "vp8_rtcd.h" @@ -21,6 +21,10 @@ #include "vpx_mem/vpx_mem.h" #endif +#ifdef __cplusplus +extern "C" { +#endif + static void eob_adjust(char *eobs, short *diff) { /* eob adjust.... 
the idct can only skip if both the dc and eob are zero */ @@ -59,4 +63,8 @@ static void vp8_inverse_transform_mby(MACROBLOCKD *xd) xd->dst.y_buffer, xd->dst.y_stride, xd->eobs); } +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_INVTRANS_H_ diff --git a/libvpx/vp8/common/loopfilter.h b/libvpx/vp8/common/loopfilter.h index 1e47f34..20a6bd3 100644 --- a/libvpx/vp8/common/loopfilter.h +++ b/libvpx/vp8/common/loopfilter.h @@ -9,13 +9,17 @@ */ -#ifndef loopfilter_h -#define loopfilter_h +#ifndef VP8_COMMON_LOOPFILTER_H_ +#define VP8_COMMON_LOOPFILTER_H_ #include "vpx_ports/mem.h" #include "vpx_config.h" #include "vp8_rtcd.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MAX_LOOP_FILTER 63 /* fraction of total macroblock rows to be used in fast filter level picking */ /* has to be > 2 */ @@ -102,4 +106,8 @@ void vp8_loop_filter_row_simple(struct VP8Common *cm, int mb_row, int post_ystride, int post_uvstride, unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_LOOPFILTER_H_ diff --git a/libvpx/vp8/common/modecont.h b/libvpx/vp8/common/modecont.h index 24db882..ff34c33 100644 --- a/libvpx/vp8/common/modecont.h +++ b/libvpx/vp8/common/modecont.h @@ -9,9 +9,17 @@ */ -#ifndef __INC_MODECONT_H -#define __INC_MODECONT_H +#ifndef VP8_COMMON_MODECONT_H_ +#define VP8_COMMON_MODECONT_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern const int vp8_mode_contexts[6][4]; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_MODECONT_H_ diff --git a/libvpx/vp8/common/mv.h b/libvpx/vp8/common/mv.h index b3f919d..111ccd6 100644 --- a/libvpx/vp8/common/mv.h +++ b/libvpx/vp8/common/mv.h @@ -9,10 +9,14 @@ */ -#ifndef __INC_MV_H -#define __INC_MV_H +#ifndef VP8_COMMON_MV_H_ +#define VP8_COMMON_MV_H_ #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct { short row; @@ -25,4 +29,8 @@ typedef union int_mv MV as_mv; } int_mv; /* 
facilitates faster equality tests and copies */ +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_MV_H_ diff --git a/libvpx/vp8/common/onyx.h b/libvpx/vp8/common/onyx.h index 30c4cbb..119e40c 100644 --- a/libvpx/vp8/common/onyx.h +++ b/libvpx/vp8/common/onyx.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_VP8_H -#define __INC_VP8_H +#ifndef VP8_COMMON_ONYX_H_ +#define VP8_COMMON_ONYX_H_ #ifdef __cplusplus extern "C" @@ -39,8 +39,8 @@ extern "C" typedef enum { - USAGE_STREAM_FROM_SERVER = 0x0, - USAGE_LOCAL_FILE_PLAYBACK = 0x1, + USAGE_LOCAL_FILE_PLAYBACK = 0x0, + USAGE_STREAM_FROM_SERVER = 0x1, USAGE_CONSTRAINED_QUALITY = 0x2, USAGE_CONSTANT_QUALITY = 0x3 } END_USAGE; @@ -267,4 +267,4 @@ extern "C" } #endif -#endif +#endif // VP8_COMMON_ONYX_H_ diff --git a/libvpx/vp8/common/onyxc_int.h b/libvpx/vp8/common/onyxc_int.h index e9bb7af..6d89865 100644 --- a/libvpx/vp8/common/onyxc_int.h +++ b/libvpx/vp8/common/onyxc_int.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_VP8C_INT_H -#define __INC_VP8C_INT_H +#ifndef VP8_COMMON_ONYXC_INT_H_ +#define VP8_COMMON_ONYXC_INT_H_ #include "vpx_config.h" #include "vp8_rtcd.h" @@ -26,6 +26,10 @@ #include "header.h" /*#endif*/ +#ifdef __cplusplus +extern "C" { +#endif + #define MINQ 0 #define MAXQ 127 #define QINDEX_RANGE (MAXQ + 1) @@ -174,4 +178,8 @@ typedef struct VP8Common int cpu_caps; } VP8_COMMON; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_ONYXC_INT_H_ diff --git a/libvpx/vp8/common/onyxd.h b/libvpx/vp8/common/onyxd.h index 97c81c1..e37b29f 100644 --- a/libvpx/vp8/common/onyxd.h +++ b/libvpx/vp8/common/onyxd.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_VP8D_H -#define __INC_VP8D_H +#ifndef VP8_COMMON_ONYXD_H_ +#define VP8_COMMON_ONYXD_H_ /* Create/destroy static data structures. 
*/ @@ -60,4 +60,4 @@ extern "C" #endif -#endif +#endif // VP8_COMMON_ONYXD_H_ diff --git a/libvpx/vp8/common/postproc.c b/libvpx/vp8/common/postproc.c index dd998f1..e3bee32 100644 --- a/libvpx/vp8/common/postproc.c +++ b/libvpx/vp8/common/postproc.c @@ -71,11 +71,6 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = }; #endif -static const short kernel5[] = -{ - 1, 1, 4, 1, 1 -}; - const short vp8_rv[] = { 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, diff --git a/libvpx/vp8/common/postproc.h b/libvpx/vp8/common/postproc.h index 495a2c9..33d0a7f 100644 --- a/libvpx/vp8/common/postproc.h +++ b/libvpx/vp8/common/postproc.h @@ -9,8 +9,8 @@ */ -#ifndef POSTPROC_H -#define POSTPROC_H +#ifndef VP8_COMMON_POSTPROC_H_ +#define VP8_COMMON_POSTPROC_H_ #include "vpx_ports/mem.h" struct postproc_state @@ -26,6 +26,10 @@ struct postproc_state }; #include "onyxc_int.h" #include "ppflags.h" + +#ifdef __cplusplus +extern "C" { +#endif int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); @@ -47,4 +51,8 @@ void vp8_deblock(struct VP8Common *oci, #define MFQE_PRECISION 4 void vp8_multiframe_quality_enhance(struct VP8Common *cm); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_POSTPROC_H_ diff --git a/libvpx/vp8/common/ppflags.h b/libvpx/vp8/common/ppflags.h index 665e21f..768224a 100644 --- a/libvpx/vp8/common/ppflags.h +++ b/libvpx/vp8/common/ppflags.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_PPFLAGS_H -#define __INC_PPFLAGS_H +#ifndef VP8_COMMON_PPFLAGS_H_ +#define VP8_COMMON_PPFLAGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif enum { VP8D_NOFILTERING = 0, @@ -38,4 +42,8 @@ typedef struct int display_mv_flag; } vp8_ppflags_t; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_PPFLAGS_H_ diff --git a/libvpx/vp8/common/pragmas.h b/libvpx/vp8/common/pragmas.h index 99fee5a..329cc82 100644 --- a/libvpx/vp8/common/pragmas.h +++ b/libvpx/vp8/common/pragmas.h @@ -8,8 +8,12 @@ * be found in the AUTHORS 
file in the root of the source tree. */ +#ifndef VP8_COMMON_PRAGMAS_H_ +#define VP8_COMMON_PRAGMAS_H_ - +#ifdef __cplusplus +extern "C" { +#endif #ifdef __INTEL_COMPILER #pragma warning(disable:997 1011 170) @@ -17,3 +21,9 @@ #ifdef _MSC_VER #pragma warning(disable:4799) #endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_PRAGMAS_H_ diff --git a/libvpx/vp8/common/quant_common.h b/libvpx/vp8/common/quant_common.h index cb64d8e..700b5e6 100644 --- a/libvpx/vp8/common/quant_common.h +++ b/libvpx/vp8/common/quant_common.h @@ -8,14 +8,27 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_COMMON_QUANT_COMMON_H_ +#define VP8_COMMON_QUANT_COMMON_H_ + #include "string.h" #include "blockd.h" #include "onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + extern int vp8_ac_yquant(int QIndex); extern int vp8_dc_quant(int QIndex, int Delta); extern int vp8_dc2quant(int QIndex, int Delta); extern int vp8_ac2quant(int QIndex, int Delta); extern int vp8_dc_uv_quant(int QIndex, int Delta); extern int vp8_ac_uv_quant(int QIndex, int Delta); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_QUANT_COMMON_H_ diff --git a/libvpx/vp8/common/reconinter.h b/libvpx/vp8/common/reconinter.h index 233c02e..ba979b9 100644 --- a/libvpx/vp8/common/reconinter.h +++ b/libvpx/vp8/common/reconinter.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_RECONINTER_H -#define __INC_RECONINTER_H +#ifndef VP8_COMMON_RECONINTER_H_ +#define VP8_COMMON_RECONINTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x); extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, @@ -32,4 +36,8 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_RECONINTER_H_ diff --git 
a/libvpx/vp8/common/reconintra4x4.h b/libvpx/vp8/common/reconintra4x4.h index d2b0d43..ed59c9e 100644 --- a/libvpx/vp8/common/reconintra4x4.h +++ b/libvpx/vp8/common/reconintra4x4.h @@ -9,10 +9,14 @@ */ -#ifndef __INC_RECONINTRA4x4_H -#define __INC_RECONINTRA4x4_H +#ifndef VP8_COMMON_RECONINTRA4X4_H_ +#define VP8_COMMON_RECONINTRA4X4_H_ #include "vp8/common/blockd.h" +#ifdef __cplusplus +extern "C" { +#endif + static void intra_prediction_down_copy(MACROBLOCKD *xd, unsigned char *above_right_src) { @@ -29,4 +33,8 @@ static void intra_prediction_down_copy(MACROBLOCKD *xd, *dst_ptr2 = *src_ptr; } +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_RECONINTRA4X4_H_ diff --git a/libvpx/vp8/common/rtcd_defs.pl b/libvpx/vp8/common/rtcd_defs.pl new file mode 100644 index 0000000..130d965 --- /dev/null +++ b/libvpx/vp8/common/rtcd_defs.pl @@ -0,0 +1,541 @@ +sub vp8_common_forward_decls() { +print <<EOF +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; +EOF +} +forward_decls qw/vp8_common_forward_decls/; + +# +# system state +# +add_proto qw/void vp8_clear_system_state/, ""; +specialize qw/vp8_clear_system_state mmx/; +$vp8_clear_system_state_mmx=vpx_reset_mmx_state; + +# +# Dequant +# +add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc"; +specialize qw/vp8_dequantize_b mmx media neon/; +$vp8_dequantize_b_media=vp8_dequantize_b_v6; + +add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride"; +specialize qw/vp8_dequant_idct_add mmx media neon dspr2/; +$vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6; +$vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2; + +add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; +specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon 
dspr2/; +$vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6; +$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2; + +add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; +specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2/; +$vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6; +$vp8_dequant_idct_add_uv_block_dspr2=vp8_dequant_idct_add_uv_block_dspr2; + +# +# Loopfilter +# +add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2/; +$vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6; +$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2; + +add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2/; +$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6; +$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2; + +add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2/; +$vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6; +$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2; + +add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2/; +$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6; +$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2; + + +add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit"; +specialize qw/vp8_loop_filter_simple_mbv 
mmx sse2 media neon/; +$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c; +$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx; +$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2; +$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6; +$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon; + +add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit"; +specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon/; +$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c; +$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx; +$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2; +$vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6; +$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon; + +add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit"; +specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon/; +$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c; +$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx; +$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2; +$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6; +$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon; + +add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit"; +specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon/; +$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c; +$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx; +$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2; +$vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6; +$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon; + +# +# IDCT +# +#idct16 +add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int 
dst_stride"; +specialize qw/vp8_short_idct4x4llm mmx media neon dspr2/; +$vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual; +$vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2; + +#iwalsh1 +add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output"; +specialize qw/vp8_short_inv_walsh4x4_1 dspr2/; +$vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2; +# no asm yet + +#iwalsh16 +add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output"; +specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2/; +$vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6; +$vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2; + +#idct1_scalar_add +add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"; +specialize qw/vp8_dc_only_idct_add mmx media neon dspr2/; +$vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6; +$vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2; + +# +# RECON +# +add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2/; +$vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6; +$vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2; + +add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_copy_mem8x8 mmx media neon dspr2/; +$vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6; +$vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2; + +add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_copy_mem8x4 mmx media neon dspr2/; +$vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6; +$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2; + +add_proto qw/void vp8_build_intra_predictors_mby_s/, "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride"; 
+specialize qw/vp8_build_intra_predictors_mby_s sse2 ssse3/; +#TODO: fix assembly for neon + +add_proto qw/void vp8_build_intra_predictors_mbuv_s/, "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride"; +specialize qw/vp8_build_intra_predictors_mbuv_s sse2 ssse3/; + +add_proto qw/void vp8_intra4x4_predict/, "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left"; +specialize qw/vp8_intra4x4_predict media/; +$vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6; + +# +# Postproc +# +if (vpx_config("CONFIG_POSTPROC") eq "yes") { + add_proto qw/void vp8_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; + specialize qw/vp8_mbpost_proc_down mmx sse2/; + $vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm; + + add_proto qw/void vp8_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; + specialize qw/vp8_mbpost_proc_across_ip sse2/; + $vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm; + + add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; + specialize qw/vp8_post_proc_down_and_across_mb_row sse2/; + + add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"; + specialize qw/vp8_plane_add_noise mmx sse2/; + $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt; + + add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; + # no asm yet + + add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int 
u1, int v1, int alpha, int stride"; + # no asm yet + + add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; + # no asm yet + + add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; + specialize qw/vp8_filter_by_weight16x16 sse2/; + + add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; + specialize qw/vp8_filter_by_weight8x8 sse2/; + + add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; + # no asm yet +} + +# +# Subpixel +# +add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2/; +$vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6; +$vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2; + +add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2/; +$vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6; +$vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2; + +add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2/; +$vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6; +$vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2; + +add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2/; 
+$vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6; +$vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2; + +add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon/; +$vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6; + +add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon/; +$vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6; + +add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_bilinear_predict8x4 mmx media neon/; +$vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6; + +add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +specialize qw/vp8_bilinear_predict4x4 mmx media neon/; +$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6; + +# +# Whole-pixel Variance +# +add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp8_variance4x4 mmx sse2/; +$vp8_variance4x4_sse2=vp8_variance4x4_wmt; + +add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp8_variance8x8 mmx sse2 media neon/; +$vp8_variance8x8_sse2=vp8_variance8x8_wmt; +$vp8_variance8x8_media=vp8_variance8x8_armv6; + +add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp8_variance8x16 mmx sse2 neon/; 
+$vp8_variance8x16_sse2=vp8_variance8x16_wmt; + +add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp8_variance16x8 mmx sse2 neon/; +$vp8_variance16x8_sse2=vp8_variance16x8_wmt; + +add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp8_variance16x16 mmx sse2 media neon/; +$vp8_variance16x16_sse2=vp8_variance16x16_wmt; +$vp8_variance16x16_media=vp8_variance16x16_armv6; + +# +# Sub-pixel Variance +# +add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; +specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/; +$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt; + +add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; +specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media neon/; +$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt; +$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6; + +add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; +specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/; +$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt; + +add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; +specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/; +$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt; 
+ +add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; +specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon/; +$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt; +$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6; + +add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/; +$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt; +$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6; + +add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/; +$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt; +$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6; + +add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/; +$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt; +$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6; + +# +# Single block SAD +# +add_proto qw/unsigned int vp8_sad4x4/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp8_sad4x4 mmx sse2 neon/; +$vp8_sad4x4_sse2=vp8_sad4x4_wmt; + +add_proto qw/unsigned int vp8_sad8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int 
ref_stride, unsigned int max_sad"; +specialize qw/vp8_sad8x8 mmx sse2 neon/; +$vp8_sad8x8_sse2=vp8_sad8x8_wmt; + +add_proto qw/unsigned int vp8_sad8x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp8_sad8x16 mmx sse2 neon/; +$vp8_sad8x16_sse2=vp8_sad8x16_wmt; + +add_proto qw/unsigned int vp8_sad16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp8_sad16x8 mmx sse2 neon/; +$vp8_sad16x8_sse2=vp8_sad16x8_wmt; + +add_proto qw/unsigned int vp8_sad16x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad"; +specialize qw/vp8_sad16x16 mmx sse2 sse3 media neon/; +$vp8_sad16x16_sse2=vp8_sad16x16_wmt; +$vp8_sad16x16_media=vp8_sad16x16_armv6; + +# +# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally +# +add_proto qw/void vp8_sad4x4x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad4x4x3 sse3/; + +add_proto qw/void vp8_sad8x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad8x8x3 sse3/; + +add_proto qw/void vp8_sad8x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad8x16x3 sse3/; + +add_proto qw/void vp8_sad16x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad16x8x3 sse3 ssse3/; + +add_proto qw/void vp8_sad16x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad16x16x3 sse3 ssse3/; + +# Note the only difference in the following prototypes is 
that they return into +# an array of short +add_proto qw/void vp8_sad4x4x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; +specialize qw/vp8_sad4x4x8 sse4_1/; +$vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4; + +add_proto qw/void vp8_sad8x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; +specialize qw/vp8_sad8x8x8 sse4_1/; +$vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4; + +add_proto qw/void vp8_sad8x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; +specialize qw/vp8_sad8x16x8 sse4_1/; +$vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4; + +add_proto qw/void vp8_sad16x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; +specialize qw/vp8_sad16x8x8 sse4_1/; +$vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4; + +add_proto qw/void vp8_sad16x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array"; +specialize qw/vp8_sad16x16x8 sse4_1/; +$vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4; + +# +# Multi-block SAD, comparing a reference to N independent blocks +# +add_proto qw/void vp8_sad4x4x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad4x4x4d sse3/; + +add_proto qw/void vp8_sad8x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad8x8x4d sse3/; + +add_proto qw/void vp8_sad8x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad8x16x4d sse3/; + +add_proto qw/void vp8_sad16x8x4d/, "const unsigned char *src_ptr, int src_stride, const 
unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad16x8x4d sse3/; + +add_proto qw/void vp8_sad16x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array"; +specialize qw/vp8_sad16x16x4d sse3/; + +# +# Encoder functions below this point. +# +if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") { + +# +# Sum of squares (vector) +# +add_proto qw/unsigned int vp8_get_mb_ss/, "const short *"; +specialize qw/vp8_get_mb_ss mmx sse2/; + +# +# SSE (Sum Squared Error) +# +add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; +specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/; +$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt; + +add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; +specialize qw/vp8_mse16x16 mmx sse2 media neon/; +$vp8_mse16x16_sse2=vp8_mse16x16_wmt; +$vp8_mse16x16_media=vp8_mse16x16_armv6; + +add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; +specialize qw/vp8_get4x4sse_cs mmx neon/; + +# +# Block copy +# +if ($opts{arch} =~ /x86/) { + add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"; + specialize qw/vp8_copy32xn sse2 sse3/; +} + +# +# Structured Similarity (SSIM) +# +if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { + $opts{arch} eq "x86_64" and $sse2_on_x86_64 = "sse2"; + + add_proto qw/void vp8_ssim_parms_8x8/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"; + specialize qw/vp8_ssim_parms_8x8/, 
"$sse2_on_x86_64"; + + add_proto qw/void vp8_ssim_parms_16x16/, "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"; + specialize qw/vp8_ssim_parms_16x16/, "$sse2_on_x86_64"; +} + +# +# Forward DCT +# +add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; +specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/; +$vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6; + +add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; +specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/; +$vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6; + +add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; +specialize qw/vp8_short_walsh4x4 sse2 media neon/; +$vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6; + +# +# Quantizer +# +add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *"; +specialize qw/vp8_regular_quantize_b sse2/; +# TODO(johann) Update sse4 implementation and re-enable +#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4; + +add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *"; +specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon/; +$vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6; + +add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"; +# no asm yet + +add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"; +specialize qw/vp8_fast_quantize_b_pair neon/; + +add_proto qw/void vp8_quantize_mb/, "struct macroblock *"; +specialize qw/vp8_quantize_mb neon/; + +add_proto qw/void vp8_quantize_mby/, "struct macroblock *"; +specialize qw/vp8_quantize_mby neon/; + +add_proto qw/void vp8_quantize_mbuv/, "struct macroblock *"; +specialize qw/vp8_quantize_mbuv neon/; + +# +# Block subtraction +# +add_proto 
qw/int vp8_block_error/, "short *coeff, short *dqcoeff"; +specialize qw/vp8_block_error mmx sse2/; +$vp8_block_error_sse2=vp8_block_error_xmm; + +add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc"; +specialize qw/vp8_mbblock_error mmx sse2/; +$vp8_mbblock_error_sse2=vp8_mbblock_error_xmm; + +add_proto qw/int vp8_mbuverror/, "struct macroblock *mb"; +specialize qw/vp8_mbuverror mmx sse2/; +$vp8_mbuverror_sse2=vp8_mbuverror_xmm; + +add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch"; +specialize qw/vp8_subtract_b mmx sse2 media neon/; +$vp8_subtract_b_media=vp8_subtract_b_armv6; + +add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride"; +specialize qw/vp8_subtract_mby mmx sse2 media neon/; +$vp8_subtract_mby_media=vp8_subtract_mby_armv6; + +add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride"; +specialize qw/vp8_subtract_mbuv mmx sse2 media neon/; +$vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6; + +# +# Motion search +# +add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; +specialize qw/vp8_full_search_sad sse3 sse4_1/; +$vp8_full_search_sad_sse3=vp8_full_search_sadx3; +$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8; + +add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; +specialize qw/vp8_refining_search_sad sse3/; +$vp8_refining_search_sad_sse3=vp8_refining_search_sadx4; + +add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv 
*ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; +$vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4; + +# +# Alt-ref Noise Reduction (ARNR) +# +if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { + add_proto qw/void vp8_temporal_filter_apply/, "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count"; + specialize qw/vp8_temporal_filter_apply sse2/; +} + +# +# Pick Loopfilter +# +add_proto qw/void vp8_yv12_copy_partial_frame/, "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; +specialize qw/vp8_yv12_copy_partial_frame neon/; + +# +# Denoiser filter +# +if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") { + add_proto qw/int vp8_denoiser_filter/, "struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset"; + specialize qw/vp8_denoiser_filter sse2 neon/; +} + +# End of encoder only functions +} +1; diff --git a/libvpx/vp8/common/rtcd_defs.sh b/libvpx/vp8/common/rtcd_defs.sh deleted file mode 100644 index 9ebf389..0000000 --- a/libvpx/vp8/common/rtcd_defs.sh +++ /dev/null @@ -1,542 +0,0 @@ -vp8_common_forward_decls() { -cat <<EOF -/* - * VP8 - */ - -struct blockd; -struct macroblockd; -struct loop_filter_info; - -/* Encoder forward decls */ -struct block; -struct macroblock; -struct variance_vtable; -union int_mv; -struct yv12_buffer_config; -EOF -} -forward_decls vp8_common_forward_decls - -# -# system state -# -prototype void vp8_clear_system_state "" -specialize vp8_clear_system_state mmx -vp8_clear_system_state_mmx=vpx_reset_mmx_state - -# -# Dequant -# -prototype void vp8_dequantize_b "struct blockd*, short *dqc" -specialize vp8_dequantize_b mmx media neon -vp8_dequantize_b_media=vp8_dequantize_b_v6 - 
-prototype void vp8_dequant_idct_add "short *input, short *dq, unsigned char *output, int stride" -specialize vp8_dequant_idct_add mmx media neon dspr2 -vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6 -vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2 - -prototype void vp8_dequant_idct_add_y_block "short *q, short *dq, unsigned char *dst, int stride, char *eobs" -specialize vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2 -vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6 -vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2 - -prototype void vp8_dequant_idct_add_uv_block "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs" -specialize vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2 -vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6 -vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2 - -# -# Loopfilter -# -prototype void vp8_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_mbv mmx sse2 media neon dspr2 -vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6 -vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2 - -prototype void vp8_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_bv mmx sse2 media neon dspr2 -vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6 -vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2 - -prototype void vp8_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_mbh mmx sse2 media neon dspr2 -vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6 -vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2 - -prototype void vp8_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int 
uv_stride, struct loop_filter_info *lfi" -specialize vp8_loop_filter_bh mmx sse2 media neon dspr2 -vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6 -vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2 - - -prototype void vp8_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit" -specialize vp8_loop_filter_simple_mbv mmx sse2 media neon -vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c -vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx -vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2 -vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6 -vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon - -prototype void vp8_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit" -specialize vp8_loop_filter_simple_mbh mmx sse2 media neon -vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c -vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx -vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2 -vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6 -vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon - -prototype void vp8_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit" -specialize vp8_loop_filter_simple_bv mmx sse2 media neon -vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c -vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx -vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2 -vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6 -vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon - -prototype void vp8_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit" -specialize vp8_loop_filter_simple_bh mmx sse2 media neon -vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c -vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx 
-vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2 -vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6 -vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon - -# -# IDCT -# -#idct16 -prototype void vp8_short_idct4x4llm "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride" -specialize vp8_short_idct4x4llm mmx media neon dspr2 -vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual -vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2 - -#iwalsh1 -prototype void vp8_short_inv_walsh4x4_1 "short *input, short *output" -specialize vp8_short_inv_walsh4x4_1 dspr2 -vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2 -# no asm yet - -#iwalsh16 -prototype void vp8_short_inv_walsh4x4 "short *input, short *output" -specialize vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2 -vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6 -vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2 - -#idct1_scalar_add -prototype void vp8_dc_only_idct_add "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride" -specialize vp8_dc_only_idct_add mmx media neon dspr2 -vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6 -vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2 - -# -# RECON -# -prototype void vp8_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch" -specialize vp8_copy_mem16x16 mmx sse2 media neon dspr2 -vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6 -vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2 - -prototype void vp8_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch" -specialize vp8_copy_mem8x8 mmx media neon dspr2 -vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6 -vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2 - -prototype void vp8_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch" -specialize vp8_copy_mem8x4 mmx media neon dspr2 -vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6 
-vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2 - -prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride" -specialize vp8_build_intra_predictors_mby_s sse2 ssse3 -#TODO: fix assembly for neon - -prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride" -specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3 - -prototype void vp8_intra4x4_predict "unsigned char *Above, unsigned char *yleft, int left_stride, int b_mode, unsigned char *dst, int dst_stride, unsigned char top_left" -specialize vp8_intra4x4_predict media -vp8_intra4x4_predict_media=vp8_intra4x4_predict_armv6 - -# -# Postproc -# -if [ "$CONFIG_POSTPROC" = "yes" ]; then - prototype void vp8_mbpost_proc_down "unsigned char *dst, int pitch, int rows, int cols,int flimit" - specialize vp8_mbpost_proc_down mmx sse2 - vp8_mbpost_proc_down_sse2=vp8_mbpost_proc_down_xmm - - prototype void vp8_mbpost_proc_across_ip "unsigned char *dst, int pitch, int rows, int cols,int flimit" - specialize vp8_mbpost_proc_across_ip sse2 - vp8_mbpost_proc_across_ip_sse2=vp8_mbpost_proc_across_ip_xmm - - prototype void vp8_post_proc_down_and_across_mb_row "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size" - specialize vp8_post_proc_down_and_across_mb_row sse2 - - prototype void vp8_plane_add_noise "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch" - specialize vp8_plane_add_noise mmx sse2 - vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt - - prototype void vp8_blend_mb_inner "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, 
int stride" - # no asm yet - - prototype void vp8_blend_mb_outer "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride" - # no asm yet - - prototype void vp8_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride" - # no asm yet - - prototype void vp8_filter_by_weight16x16 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight" - specialize vp8_filter_by_weight16x16 sse2 - - prototype void vp8_filter_by_weight8x8 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight" - specialize vp8_filter_by_weight8x8 sse2 - - prototype void vp8_filter_by_weight4x4 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight" - # no asm yet -fi - -# -# Subpixel -# -prototype void vp8_sixtap_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch" -specialize vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2 -vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6 -vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2 - -prototype void vp8_sixtap_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch" -specialize vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2 -vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6 -vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2 - -prototype void vp8_sixtap_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch" -specialize vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2 -vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6 -vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2 - -prototype void vp8_sixtap_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch" -specialize vp8_sixtap_predict4x4 mmx ssse3 media neon dspr2 
-vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6 -vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2 - -prototype void vp8_bilinear_predict16x16 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch" -specialize vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon -vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6 - -prototype void vp8_bilinear_predict8x8 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch" -specialize vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon -vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6 - -prototype void vp8_bilinear_predict8x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch" -specialize vp8_bilinear_predict8x4 mmx media neon -vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6 - -prototype void vp8_bilinear_predict4x4 "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch" -specialize vp8_bilinear_predict4x4 mmx media neon -vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6 - -# -# Whole-pixel Variance -# -prototype unsigned int vp8_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp8_variance4x4 mmx sse2 -vp8_variance4x4_sse2=vp8_variance4x4_wmt - -prototype unsigned int vp8_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp8_variance8x8 mmx sse2 media neon -vp8_variance8x8_sse2=vp8_variance8x8_wmt -vp8_variance8x8_media=vp8_variance8x8_armv6 - -prototype unsigned int vp8_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp8_variance8x16 mmx sse2 neon -vp8_variance8x16_sse2=vp8_variance8x16_wmt - -prototype unsigned int vp8_variance16x8 "const unsigned char 
*src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp8_variance16x8 mmx sse2 neon -vp8_variance16x8_sse2=vp8_variance16x8_wmt - -prototype unsigned int vp8_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp8_variance16x16 mmx sse2 media neon -vp8_variance16x16_sse2=vp8_variance16x16_wmt -vp8_variance16x16_media=vp8_variance16x16_armv6 - -# -# Sub-pixel Variance -# -prototype unsigned int vp8_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" -specialize vp8_sub_pixel_variance4x4 mmx sse2 -vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt - -prototype unsigned int vp8_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" -specialize vp8_sub_pixel_variance8x8 mmx sse2 media neon -vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt -vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6 - -prototype unsigned int vp8_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" -specialize vp8_sub_pixel_variance8x16 mmx sse2 -vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt - -prototype unsigned int vp8_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" -specialize vp8_sub_pixel_variance16x8 mmx sse2 ssse3 -vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt - -prototype unsigned int vp8_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int 
*sse" -specialize vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon -vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt -vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6 - -prototype unsigned int vp8_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp8_variance_halfpixvar16x16_h mmx sse2 media neon -vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt -vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6 - -prototype unsigned int vp8_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp8_variance_halfpixvar16x16_v mmx sse2 media neon -vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt -vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6 - -prototype unsigned int vp8_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp8_variance_halfpixvar16x16_hv mmx sse2 media neon -vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt -vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6 - -# -# Single block SAD -# -prototype unsigned int vp8_sad4x4 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp8_sad4x4 mmx sse2 neon -vp8_sad4x4_sse2=vp8_sad4x4_wmt - -prototype unsigned int vp8_sad8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp8_sad8x8 mmx sse2 neon -vp8_sad8x8_sse2=vp8_sad8x8_wmt - -prototype unsigned int vp8_sad8x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" -specialize 
vp8_sad8x16 mmx sse2 neon -vp8_sad8x16_sse2=vp8_sad8x16_wmt - -prototype unsigned int vp8_sad16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp8_sad16x8 mmx sse2 neon -vp8_sad16x8_sse2=vp8_sad16x8_wmt - -prototype unsigned int vp8_sad16x16 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad" -specialize vp8_sad16x16 mmx sse2 sse3 media neon -vp8_sad16x16_sse2=vp8_sad16x16_wmt -vp8_sad16x16_media=vp8_sad16x16_armv6 - -# -# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally -# -prototype void vp8_sad4x4x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp8_sad4x4x3 sse3 - -prototype void vp8_sad8x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp8_sad8x8x3 sse3 - -prototype void vp8_sad8x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp8_sad8x16x3 sse3 - -prototype void vp8_sad16x8x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp8_sad16x8x3 sse3 ssse3 - -prototype void vp8_sad16x16x3 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array" -specialize vp8_sad16x16x3 sse3 ssse3 - -# Note the only difference in the following prototypes is that they return into -# an array of short -prototype void vp8_sad4x4x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" -specialize vp8_sad4x4x8 sse4_1 -vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4 - -prototype void vp8_sad8x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, 
int ref_stride, unsigned short *sad_array" -specialize vp8_sad8x8x8 sse4_1 -vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4 - -prototype void vp8_sad8x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" -specialize vp8_sad8x16x8 sse4_1 -vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4 - -prototype void vp8_sad16x8x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" -specialize vp8_sad16x8x8 sse4_1 -vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4 - -prototype void vp8_sad16x16x8 "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array" -specialize vp8_sad16x16x8 sse4_1 -vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4 - -# -# Multi-block SAD, comparing a reference to N independent blocks -# -prototype void vp8_sad4x4x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp8_sad4x4x4d sse3 - -prototype void vp8_sad8x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp8_sad8x8x4d sse3 - -prototype void vp8_sad8x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp8_sad8x16x4d sse3 - -prototype void vp8_sad16x8x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp8_sad16x8x4d sse3 - -prototype void vp8_sad16x16x4d "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array" -specialize vp8_sad16x16x4d sse3 - -# -# Encoder functions below this point. 
-# -if [ "$CONFIG_VP8_ENCODER" = "yes" ]; then - -# -# Sum of squares (vector) -# -prototype unsigned int vp8_get_mb_ss "const short *" -specialize vp8_get_mb_ss mmx sse2 - -# -# SSE (Sum Squared Error) -# -prototype unsigned int vp8_sub_pixel_mse16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse" -specialize vp8_sub_pixel_mse16x16 mmx sse2 -vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt - -prototype unsigned int vp8_mse16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp8_mse16x16 mmx sse2 media neon -vp8_mse16x16_sse2=vp8_mse16x16_wmt -vp8_mse16x16_media=vp8_mse16x16_armv6 - -prototype unsigned int vp8_get4x4sse_cs "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride" -specialize vp8_get4x4sse_cs mmx neon - -# -# Block copy -# -case $arch in - x86*) - prototype void vp8_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n" - specialize vp8_copy32xn sse2 sse3 - ;; -esac - -# -# Structured Similarity (SSIM) -# -if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then - [ $arch = "x86_64" ] && sse2_on_x86_64=sse2 - - prototype void vp8_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp8_ssim_parms_8x8 $sse2_on_x86_64 - - prototype void vp8_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp8_ssim_parms_16x16 $sse2_on_x86_64 -fi - -# -# Forward DCT -# -prototype void vp8_short_fdct4x4 "short *input, short *output, int pitch" -specialize vp8_short_fdct4x4 mmx sse2 media neon 
-vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6 - -prototype void vp8_short_fdct8x4 "short *input, short *output, int pitch" -specialize vp8_short_fdct8x4 mmx sse2 media neon -vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6 - -prototype void vp8_short_walsh4x4 "short *input, short *output, int pitch" -specialize vp8_short_walsh4x4 sse2 media neon -vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6 - -# -# Quantizer -# -prototype void vp8_regular_quantize_b "struct block *, struct blockd *" -specialize vp8_regular_quantize_b sse2 #sse4_1 -# TODO(johann) Update sse4 implementation and re-enable -#vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4 - -prototype void vp8_fast_quantize_b "struct block *, struct blockd *" -specialize vp8_fast_quantize_b sse2 ssse3 media neon -vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6 - -prototype void vp8_regular_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2" -# no asm yet - -prototype void vp8_fast_quantize_b_pair "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2" -specialize vp8_fast_quantize_b_pair neon - -prototype void vp8_quantize_mb "struct macroblock *" -specialize vp8_quantize_mb neon - -prototype void vp8_quantize_mby "struct macroblock *" -specialize vp8_quantize_mby neon - -prototype void vp8_quantize_mbuv "struct macroblock *" -specialize vp8_quantize_mbuv neon - -# -# Block subtraction -# -prototype int vp8_block_error "short *coeff, short *dqcoeff" -specialize vp8_block_error mmx sse2 -vp8_block_error_sse2=vp8_block_error_xmm - -prototype int vp8_mbblock_error "struct macroblock *mb, int dc" -specialize vp8_mbblock_error mmx sse2 -vp8_mbblock_error_sse2=vp8_mbblock_error_xmm - -prototype int vp8_mbuverror "struct macroblock *mb" -specialize vp8_mbuverror mmx sse2 -vp8_mbuverror_sse2=vp8_mbuverror_xmm - -prototype void vp8_subtract_b "struct block *be, struct blockd *bd, int pitch" -specialize vp8_subtract_b mmx sse2 media neon 
-vp8_subtract_b_media=vp8_subtract_b_armv6 - -prototype void vp8_subtract_mby "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride" -specialize vp8_subtract_mby mmx sse2 media neon -vp8_subtract_mby_media=vp8_subtract_mby_armv6 - -prototype void vp8_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride" -specialize vp8_subtract_mbuv mmx sse2 media neon -vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6 - -# -# Motion search -# -prototype int vp8_full_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv" -specialize vp8_full_search_sad sse3 sse4_1 -vp8_full_search_sad_sse3=vp8_full_search_sadx3 -vp8_full_search_sad_sse4_1=vp8_full_search_sadx8 - -prototype int vp8_refining_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv" -specialize vp8_refining_search_sad sse3 -vp8_refining_search_sad_sse3=vp8_refining_search_sadx4 - -prototype int vp8_diamond_search_sad "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv" -vp8_diamond_search_sad_sse3=vp8_diamond_search_sadx4 - -# -# Alt-ref Noise Reduction (ARNR) -# -if [ "$CONFIG_REALTIME_ONLY" != "yes" ]; then - prototype void vp8_temporal_filter_apply "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count" - specialize vp8_temporal_filter_apply sse2 -fi - -# -# Pick Loopfilter -# -prototype void vp8_yv12_copy_partial_frame "struct 
yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" -specialize vp8_yv12_copy_partial_frame neon - -# -# Denoiser filter -# -if [ "$CONFIG_TEMPORAL_DENOISING" = "yes" ]; then - prototype int vp8_denoiser_filter "struct yv12_buffer_config* mc_running_avg, struct yv12_buffer_config* running_avg, struct macroblock* signal, unsigned int motion_magnitude2, int y_offset, int uv_offset" - specialize vp8_denoiser_filter sse2 -fi - -# End of encoder only functions -fi diff --git a/libvpx/vp8/common/setupintrarecon.h b/libvpx/vp8/common/setupintrarecon.h index e515c3a..608f4a9 100644 --- a/libvpx/vp8/common/setupintrarecon.h +++ b/libvpx/vp8/common/setupintrarecon.h @@ -8,8 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_COMMON_SETUPINTRARECON_H_ +#define VP8_COMMON_SETUPINTRARECON_H_ #include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf); extern void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf); @@ -31,3 +37,9 @@ void setup_intra_recon_left(unsigned char *y_buffer, for (i = 0; i < 8; i++) v_buffer[uv_stride *i] = (unsigned char) 129; } + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SETUPINTRARECON_H_ diff --git a/libvpx/vp8/common/swapyv12buffer.h b/libvpx/vp8/common/swapyv12buffer.h index a6473ed..1d66cd3 100644 --- a/libvpx/vp8/common/swapyv12buffer.h +++ b/libvpx/vp8/common/swapyv12buffer.h @@ -9,11 +9,19 @@ */ -#ifndef SWAPYV12_BUFFER_H -#define SWAPYV12_BUFFER_H +#ifndef VP8_COMMON_SWAPYV12BUFFER_H_ +#define VP8_COMMON_SWAPYV12BUFFER_H_ #include "vpx_scale/yv12config.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_SWAPYV12BUFFER_H_ diff --git a/libvpx/vp8/common/systemdependent.h b/libvpx/vp8/common/systemdependent.h index e6b0456..3d44e37 100644 
--- a/libvpx/vp8/common/systemdependent.h +++ b/libvpx/vp8/common/systemdependent.h @@ -8,8 +8,20 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_ +#define VP8_COMMON_SYSTEMDEPENDENT_H_ #include "vpx_config.h" +#ifdef __cplusplus +extern "C" { +#endif + struct VP8Common; void vp8_machine_specific_config(struct VP8Common *); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SYSTEMDEPENDENT_H_ diff --git a/libvpx/vp8/common/threading.h b/libvpx/vp8/common/threading.h index ed9e3e6..01c82db 100644 --- a/libvpx/vp8/common/threading.h +++ b/libvpx/vp8/common/threading.h @@ -9,8 +9,12 @@ */ -#ifndef _PTHREAD_EMULATION -#define _PTHREAD_EMULATION +#ifndef VP8_COMMON_THREADING_H_ +#define VP8_COMMON_THREADING_H_ + +#ifdef __cplusplus +extern "C" { +#endif #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD @@ -183,4 +187,8 @@ static inline int sem_destroy(sem_t * sem) #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_THREADING_H_ diff --git a/libvpx/vp8/common/treecoder.h b/libvpx/vp8/common/treecoder.h index ebf51c5..d22b7c5 100644 --- a/libvpx/vp8/common/treecoder.h +++ b/libvpx/vp8/common/treecoder.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_TREECODER_H -#define __INC_TREECODER_H +#ifndef VP8_COMMON_TREECODER_H_ +#define VP8_COMMON_TREECODER_H_ + +#ifdef __cplusplus +extern "C" { +#endif typedef unsigned char vp8bc_index_t; /* probability index */ @@ -87,4 +91,8 @@ void vp8bc_tree_probs_from_distribution( ); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_TREECODER_H_ diff --git a/libvpx/vp8/common/variance.h b/libvpx/vp8/common/variance.h index 01193b8..89a32a7 100644 --- a/libvpx/vp8/common/variance.h +++ b/libvpx/vp8/common/variance.h @@ -9,11 +9,15 @@ */ -#ifndef VARIANCE_H -#define VARIANCE_H +#ifndef VP8_COMMON_VARIANCE_H_ +#define VP8_COMMON_VARIANCE_H_ #include "vpx_config.h" +#ifdef __cplusplus +extern "C" { 
+#endif + typedef unsigned int(*vp8_sad_fn_t)( const unsigned char *src_ptr, int source_stride, @@ -112,4 +116,8 @@ typedef struct variance_vtable #endif } vp8_variance_fn_ptr_t; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_COMMON_VARIANCE_H_ diff --git a/libvpx/vp8/common/vp8_entropymodedata.h b/libvpx/vp8/common/vp8_entropymodedata.h index 13e9a92..c4aed49 100644 --- a/libvpx/vp8/common/vp8_entropymodedata.h +++ b/libvpx/vp8/common/vp8_entropymodedata.h @@ -8,6 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_ + +#ifdef __cplusplus +extern "C" { +#endif /*Generated file, included by entropymode.c*/ @@ -240,3 +246,9 @@ const vp8_prob vp8_kf_bmode_prob { 112, 19, 12, 61, 195, 128, 48, 4, 24 } } }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_ diff --git a/libvpx/vp8/common/x86/filter_x86.h b/libvpx/vp8/common/x86/filter_x86.h index cfadaee..d282841 100644 --- a/libvpx/vp8/common/x86/filter_x86.h +++ b/libvpx/vp8/common/x86/filter_x86.h @@ -8,11 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef FILTER_X86_H -#define FILTER_X86_H +#ifndef VP8_COMMON_X86_FILTER_X86_H_ +#define VP8_COMMON_X86_FILTER_X86_H_ #include "vpx_ports/mem.h" +#ifdef __cplusplus +extern "C" { +#endif + /* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with * duplicated values */ @@ -22,4 +26,8 @@ extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); /* duplicated 8x */ extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); -#endif /* FILTER_X86_H */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_X86_FILTER_X86_H_ diff --git a/libvpx/vp8/common/x86/loopfilter_mmx.asm b/libvpx/vp8/common/x86/loopfilter_mmx.asm index f388d24..88a07b9 100644 --- a/libvpx/vp8/common/x86/loopfilter_mmx.asm +++ b/libvpx/vp8/common/x86/loopfilter_mmx.asm @@ -527,7 +527,7 @@ sym(vp8_loop_filter_vertical_edge_mmx): pxor mm7, [GLOBAL(t80)] ; unoffset ; mm7 = q1 - ; tranpose and write back + ; transpose and write back ; mm1 = 72 62 52 42 32 22 12 02 ; mm6 = 73 63 53 43 33 23 13 03 ; mm3 = 74 64 54 44 34 24 14 04 @@ -1289,7 +1289,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx): pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 - ; tranpose and write back + ; transpose and write back movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 diff --git a/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libvpx/vp8/common/x86/loopfilter_sse2.asm index a66753b..1913abc 100644 --- a/libvpx/vp8/common/x86/loopfilter_sse2.asm +++ b/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -958,7 +958,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): ; start work on filters B_FILTER 2 - ; tranpose and write back - only work on q1, q0, p0, p1 + ; transpose and write back - only work on q1, q0, p0, p1 BV_TRANSPOSE ; store 16-line result @@ -1023,7 +1023,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): ; start work on filters B_FILTER 2 - ; tranpose and 
write back - only work on q1, q0, p0, p1 + ; transpose and write back - only work on q1, q0, p0, p1 BV_TRANSPOSE lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing diff --git a/libvpx/vp8/decoder/dboolhuff.h b/libvpx/vp8/decoder/dboolhuff.h index 4c0ca1c..36af7ee 100644 --- a/libvpx/vp8/decoder/dboolhuff.h +++ b/libvpx/vp8/decoder/dboolhuff.h @@ -9,8 +9,8 @@ */ -#ifndef DBOOLHUFF_H_ -#define DBOOLHUFF_H_ +#ifndef VP8_DECODER_DBOOLHUFF_H_ +#define VP8_DECODER_DBOOLHUFF_H_ #include <stddef.h> #include <limits.h> @@ -19,6 +19,10 @@ #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef size_t VP8_BD_VALUE; #define VP8_BD_VALUE_SIZE ((int)sizeof(VP8_BD_VALUE)*CHAR_BIT) @@ -135,4 +139,8 @@ static int vp8dx_bool_error(BOOL_DECODER *br) return 0; } -#endif // DBOOLHUFF_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DBOOLHUFF_H_ diff --git a/libvpx/vp8/decoder/decodframe.c b/libvpx/vp8/decoder/decodeframe.c index 16da78a..bfde599 100644 --- a/libvpx/vp8/decoder/decodframe.c +++ b/libvpx/vp8/decoder/decodeframe.c @@ -680,7 +680,6 @@ static void decode_mb_rows(VP8D_COMP *pbi) vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1, recon_y_stride, recon_uv_stride, lf_dst[0], lf_dst[1], lf_dst[2]); - if(mb_row > 1) { yv12_extend_frame_left_right_c(yv12_fb_new, @@ -691,10 +690,6 @@ static void decode_mb_rows(VP8D_COMP *pbi) eb_dst[0] += recon_y_stride * 16; eb_dst[1] += recon_uv_stride * 8; eb_dst[2] += recon_uv_stride * 8; - - if(mb_row == 2) - yv12_extend_frame_top_c(yv12_fb_new); - } lf_dst[0] += recon_y_stride * 16; @@ -713,13 +708,9 @@ static void decode_mb_rows(VP8D_COMP *pbi) eb_dst[0], eb_dst[1], eb_dst[2]); - eb_dst[0] += recon_y_stride * 16; eb_dst[1] += recon_uv_stride * 8; eb_dst[2] += recon_uv_stride * 8; - - if(mb_row == 1) - yv12_extend_frame_top_c(yv12_fb_new); } } } @@ -747,7 +738,7 @@ static void decode_mb_rows(VP8D_COMP *pbi) eb_dst[0], eb_dst[1], eb_dst[2]); - + 
yv12_extend_frame_top_c(yv12_fb_new); yv12_extend_frame_bottom_c(yv12_fb_new); } diff --git a/libvpx/vp8/decoder/decodemv.h b/libvpx/vp8/decoder/decodemv.h index 05a33d2..f33b073 100644 --- a/libvpx/vp8/decoder/decodemv.h +++ b/libvpx/vp8/decoder/decodemv.h @@ -8,11 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef DECODEMV_H_ -#define DECODEMV_H_ +#ifndef VP8_DECODER_DECODEMV_H_ +#define VP8_DECODER_DECODEMV_H_ #include "onyxd_int.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_decode_mode_mvs(VP8D_COMP *); -#endif // DECODEMV_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DECODEMV_H_ diff --git a/libvpx/vp8/decoder/decoderthreading.h b/libvpx/vp8/decoder/decoderthreading.h index bc716e4..c563cf6 100644 --- a/libvpx/vp8/decoder/decoderthreading.h +++ b/libvpx/vp8/decoder/decoderthreading.h @@ -8,8 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef DECODERTHREADING_H_ -#define DECODERTHREADING_H_ +#ifndef VP8_DECODER_DECODERTHREADING_H_ +#define VP8_DECODER_DECODERTHREADING_H_ + +#ifdef __cplusplus +extern "C" { +#endif #if CONFIG_MULTITHREAD void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); @@ -19,4 +23,8 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows); void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows); #endif -#endif // DECODERTHREADING_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DECODERTHREADING_H_ diff --git a/libvpx/vp8/decoder/detokenize.h b/libvpx/vp8/decoder/detokenize.h index f2130b3..f0b1254 100644 --- a/libvpx/vp8/decoder/detokenize.h +++ b/libvpx/vp8/decoder/detokenize.h @@ -8,12 +8,20 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef DETOKENIZE_H_ -#define DETOKENIZE_H_ +#ifndef VP8_DECODER_DETOKENIZE_H_ +#define VP8_DECODER_DETOKENIZE_H_ #include "onyxd_int.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_reset_mb_tokens_context(MACROBLOCKD *x); int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); -#endif // DETOKENIZE_H +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_DETOKENIZE_H_ diff --git a/libvpx/vp8/decoder/ec_types.h b/libvpx/vp8/decoder/ec_types.h index b24bfd9..3af5ca8 100644 --- a/libvpx/vp8/decoder/ec_types.h +++ b/libvpx/vp8/decoder/ec_types.h @@ -8,8 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DEC_EC_TYPES_H -#define VP8_DEC_EC_TYPES_H +#ifndef VP8_DECODER_EC_TYPES_H_ +#define VP8_DECODER_EC_TYPES_H_ + +#ifdef __cplusplus +extern "C" { +#endif #define MAX_OVERLAPS 16 @@ -47,4 +51,8 @@ typedef struct MV_REFERENCE_FRAME ref_frame; } EC_BLOCK; -#endif // VP8_DEC_EC_TYPES_H +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_EC_TYPES_H_ diff --git a/libvpx/vp8/decoder/error_concealment.h b/libvpx/vp8/decoder/error_concealment.h index fb96b36..9a1e024 100644 --- a/libvpx/vp8/decoder/error_concealment.h +++ b/libvpx/vp8/decoder/error_concealment.h @@ -9,12 +9,16 @@ */ -#ifndef ERROR_CONCEALMENT_H_ -#define ERROR_CONCEALMENT_H_ +#ifndef VP8_DECODER_ERROR_CONCEALMENT_H_ +#define VP8_DECODER_ERROR_CONCEALMENT_H_ #include "onyxd_int.h" #include "ec_types.h" +#ifdef __cplusplus +extern "C" { +#endif + /* Allocate memory for the overlap lists */ int vp8_alloc_overlap_lists(VP8D_COMP *pbi); @@ -38,4 +42,8 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, */ void vp8_conceal_corrupt_mb(MACROBLOCKD *xd); -#endif // ERROR_CONCEALMENT_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_ERROR_CONCEALMENT_H_ diff --git a/libvpx/vp8/decoder/onyxd_int.h b/libvpx/vp8/decoder/onyxd_int.h index 54a98f7..8ef4894 100644 --- a/libvpx/vp8/decoder/onyxd_int.h +++ 
b/libvpx/vp8/decoder/onyxd_int.h @@ -9,8 +9,8 @@ */ -#ifndef ONYXD_INT_H_ -#define ONYXD_INT_H_ +#ifndef VP8_DECODER_ONYXD_INT_H_ +#define VP8_DECODER_ONYXD_INT_H_ #include "vpx_config.h" #include "vp8/common/onyxd.h" @@ -22,6 +22,10 @@ #include "ec_types.h" #endif +#ifdef __cplusplus +extern "C" { +#endif + typedef struct { int ithread; @@ -148,4 +152,8 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); } while(0) #endif -#endif // ONYXD_INT_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_ONYXD_INT_H_ diff --git a/libvpx/vp8/decoder/treereader.h b/libvpx/vp8/decoder/treereader.h index 9393bb4..35ee696 100644 --- a/libvpx/vp8/decoder/treereader.h +++ b/libvpx/vp8/decoder/treereader.h @@ -9,12 +9,16 @@ */ -#ifndef TREEREADER_H_ -#define TREEREADER_H_ +#ifndef VP8_DECODER_TREEREADER_H_ +#define VP8_DECODER_TREEREADER_H_ #include "vp8/common/treecoder.h" #include "dboolhuff.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef BOOL_DECODER vp8_reader; #define vp8_read vp8dx_decode_bool @@ -37,4 +41,8 @@ static int vp8_treed_read( return -i; } -#endif // TREEREADER_H_ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_DECODER_TREEREADER_H_ diff --git a/libvpx/vp8/encoder/arm/neon/denoising_neon.c b/libvpx/vp8/encoder/arm/neon/denoising_neon.c new file mode 100644 index 0000000..3f85397 --- /dev/null +++ b/libvpx/vp8/encoder/arm/neon/denoising_neon.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include <arm_neon.h> + +#include "vp8/encoder/denoising.h" +#include "vpx_mem/vpx_mem.h" +#include "./vp8_rtcd.h" + +/* + * The filter function was modified to reduce the computational complexity. + * + * Step 1: + * Instead of applying tap coefficients for each pixel, we calculated the + * pixel adjustments vs. pixel diff value ahead of time. + * adjustment = filtered_value - current_raw + * = (filter_coefficient * diff + 128) >> 8 + * where + * filter_coefficient = (255 << 8) / (256 + ((abs_diff * 330) >> 3)); + * filter_coefficient += filter_coefficient / + * (3 + motion_magnitude_adjustment); + * filter_coefficient is clamped to 0 ~ 255. + * + * Step 2: + * The adjustment vs. diff curve becomes flat very quickly when diff increases. + * This allowed us to use only several levels to approximate the curve without + * changing the filtering algorithm too much. + * The adjustments were further corrected by checking the motion magnitude. + * The levels used are: + * diff level adjustment w/o adjustment w/ + * motion correction motion correction + * [-255, -16] 3 -6 -7 + * [-15, -8] 2 -4 -5 + * [-7, -4] 1 -3 -4 + * [-3, 3] 0 diff diff + * [4, 7] 1 3 4 + * [8, 15] 2 4 5 + * [16, 255] 3 6 7 + */ + +int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, + YV12_BUFFER_CONFIG *running_avg, + MACROBLOCK *signal, unsigned int motion_magnitude, + int y_offset, int uv_offset) { + /* If motion_magnitude is small, make the denoiser more aggressive by + * increasing the adjustment for each level: the level1 adjustment is + * increased, the deltas stay the same. + */ + const uint8x16_t v_level1_adjustment = vdupq_n_u8( + (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 
4 : 3); + const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1); + const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2); + const uint8x16_t v_level1_threshold = vdupq_n_u8(4); + const uint8x16_t v_level2_threshold = vdupq_n_u8(8); + const uint8x16_t v_level3_threshold = vdupq_n_u8(16); + + /* Local variables for array pointers and strides. */ + unsigned char *sig = signal->thismb; + int sig_stride = 16; + unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset; + int mc_running_avg_y_stride = mc_running_avg->y_stride; + unsigned char *running_avg_y = running_avg->y_buffer + y_offset; + int running_avg_y_stride = running_avg->y_stride; + + /* Go over lines. */ + int i; + int sum_diff = 0; + for (i = 0; i < 16; ++i) { + int8x16_t v_sum_diff = vdupq_n_s8(0); + uint8x16_t v_running_avg_y; + + /* Load inputs. */ + const uint8x16_t v_sig = vld1q_u8(sig); + const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); + + /* Calculate absolute difference and sign masks. */ + const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y); + const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y); + + /* Figure out which level that put us in. */ + const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, + v_abs_diff); + const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, + v_abs_diff); + const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, + v_abs_diff); + + /* Calculate absolute adjustments for level 1, 2 and 3. 
*/ + const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask, + v_delta_level_1_and_2); + const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask, + v_delta_level_2_and_3); + const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment, + v_level2_adjustment); + const uint8x16_t v_level1and2and3_adjustment = vaddq_u8( + v_level1and2_adjustment, v_level3_adjustment); + + /* Figure adjustment absolute value by selecting between the absolute + * difference if in level0 or the value for level 1, 2 and 3. + */ + const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask, + v_level1and2and3_adjustment, v_abs_diff); + + /* Calculate positive and negative adjustments. Apply them to the signal + * and accumulate them. Adjustments are less than eight and the maximum + * sum of them (7 * 16) can fit in a signed char. + */ + const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask, + v_abs_adjustment); + const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, + v_abs_adjustment); + v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); + v_sum_diff = vqaddq_s8(v_sum_diff, + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff = vqsubq_s8(v_sum_diff, + vreinterpretq_s8_u8(v_neg_adjustment)); + + /* Store results. */ + vst1q_u8(running_avg_y, v_running_avg_y); + + /* Sum all the accumulators to have the sum of all pixel differences + * for this macroblock. 
+ */ + { + int s0 = vgetq_lane_s8(v_sum_diff, 0) + + vgetq_lane_s8(v_sum_diff, 1) + + vgetq_lane_s8(v_sum_diff, 2) + + vgetq_lane_s8(v_sum_diff, 3); + int s1 = vgetq_lane_s8(v_sum_diff, 4) + + vgetq_lane_s8(v_sum_diff, 5) + + vgetq_lane_s8(v_sum_diff, 6) + + vgetq_lane_s8(v_sum_diff, 7); + int s2 = vgetq_lane_s8(v_sum_diff, 8) + + vgetq_lane_s8(v_sum_diff, 9) + + vgetq_lane_s8(v_sum_diff, 10) + + vgetq_lane_s8(v_sum_diff, 11); + int s3 = vgetq_lane_s8(v_sum_diff, 12) + + vgetq_lane_s8(v_sum_diff, 13) + + vgetq_lane_s8(v_sum_diff, 14) + + vgetq_lane_s8(v_sum_diff, 15); + sum_diff += s0 + s1+ s2 + s3; + } + + /* Update pointers for next iteration. */ + sig += sig_stride; + mc_running_avg_y += mc_running_avg_y_stride; + running_avg_y += running_avg_y_stride; + } + + /* Too many adjustments => copy block. */ + if (abs(sum_diff) > SUM_DIFF_THRESHOLD) + return COPY_BLOCK; + + /* Tell above level that block was filtered. */ + vp8_copy_mem16x16(running_avg->y_buffer + y_offset, running_avg_y_stride, + signal->thismb, sig_stride); + return FILTER_BLOCK; +} diff --git a/libvpx/vp8/encoder/bitstream.h b/libvpx/vp8/encoder/bitstream.h index 455a94f..eef2d79 100644 --- a/libvpx/vp8/encoder/bitstream.h +++ b/libvpx/vp8/encoder/bitstream.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_BITSTREAM_H -#define __INC_BITSTREAM_H +#ifndef VP8_ENCODER_BITSTREAM_H_ +#define VP8_ENCODER_BITSTREAM_H_ + +#ifdef __cplusplus +extern "C" { +#endif #if HAVE_EDSP void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount, @@ -43,4 +47,8 @@ void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount); # define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b) #endif +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_BITSTREAM_H_ diff --git a/libvpx/vp8/encoder/block.h b/libvpx/vp8/encoder/block.h index cf74c7a..dd733e5 100644 --- a/libvpx/vp8/encoder/block.h +++ b/libvpx/vp8/encoder/block.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_BLOCK_H -#define __INC_BLOCK_H +#ifndef 
VP8_ENCODER_BLOCK_H_ +#define VP8_ENCODER_BLOCK_H_ #include "vp8/common/onyx.h" #include "vp8/common/blockd.h" @@ -18,6 +18,10 @@ #include "vp8/common/entropy.h" #include "vpx_ports/mem.h" +#ifdef __cplusplus +extern "C" { +#endif + #define MAX_MODES 20 #define MAX_ERROR_BINS 1024 @@ -160,4 +164,8 @@ typedef struct macroblock } MACROBLOCK; +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_BLOCK_H_ diff --git a/libvpx/vp8/encoder/boolhuff.h b/libvpx/vp8/encoder/boolhuff.h index 39ab586..6114215 100644 --- a/libvpx/vp8/encoder/boolhuff.h +++ b/libvpx/vp8/encoder/boolhuff.h @@ -16,12 +16,16 @@ * Description : Bool Coder header file. * ****************************************************************************/ -#ifndef __INC_BOOLHUFF_H -#define __INC_BOOLHUFF_H +#ifndef VP8_ENCODER_BOOLHUFF_H_ +#define VP8_ENCODER_BOOLHUFF_H_ #include "vpx_ports/mem.h" #include "vpx/internal/vpx_codec_internal.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct { unsigned int lowvalue; @@ -125,4 +129,8 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) br->range = range; } +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_BOOLHUFF_H_ diff --git a/libvpx/vp8/encoder/dct_value_cost.h b/libvpx/vp8/encoder/dct_value_cost.h index e892765..1cd3eec 100644 --- a/libvpx/vp8/encoder/dct_value_cost.h +++ b/libvpx/vp8/encoder/dct_value_cost.h @@ -8,6 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#ifndef VP8_ENCODER_DCT_VALUE_COST_H_ +#define VP8_ENCODER_DCT_VALUE_COST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + /* Generated file, included by tokenize.c */ /* Values generated by fill_value_tokens() */ @@ -356,3 +363,9 @@ static const short dct_value_cost[2048*2] = 8134, 8140, 8148, 8170, 8178, 8184, 8192, 8202, 8210, 8216, 8224, 8243, 8251, 8257, 8265, 8275 }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_DCT_VALUE_COST_H_ diff --git a/libvpx/vp8/encoder/dct_value_tokens.h b/libvpx/vp8/encoder/dct_value_tokens.h index ef08eed..c2aadef 100644 --- a/libvpx/vp8/encoder/dct_value_tokens.h +++ b/libvpx/vp8/encoder/dct_value_tokens.h @@ -8,6 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#define VP8_ENCODER_DCT_VALUE_TOKENS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + /* Generated file, included by tokenize.c */ /* Values generated by fill_value_tokens() */ @@ -697,3 +704,9 @@ static const TOKENVALUE dct_value_tokens[2048*2] = {10, 3942}, {10, 3944}, {10, 3946}, {10, 3948}, {10, 3950}, {10, 3952}, {10, 3954}, {10, 3956}, {10, 3958}, {10, 3960} }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_DCT_VALUE_TOKENS_H_ diff --git a/libvpx/vp8/encoder/defaultcoefcounts.h b/libvpx/vp8/encoder/defaultcoefcounts.h index 2c0f3dd..1e8e804 100644 --- a/libvpx/vp8/encoder/defaultcoefcounts.h +++ b/libvpx/vp8/encoder/defaultcoefcounts.h @@ -8,6 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#ifndef VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#define VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + /* Generated file, included by entropy.c */ static const unsigned int default_coef_counts[BLOCK_TYPES] @@ -221,3 +228,9 @@ static const unsigned int default_coef_counts[BLOCK_TYPES] }, }, }; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ diff --git a/libvpx/vp8/encoder/denoising.h b/libvpx/vp8/encoder/denoising.h index b025f5c..cc9913a 100644 --- a/libvpx/vp8/encoder/denoising.h +++ b/libvpx/vp8/encoder/denoising.h @@ -13,6 +13,10 @@ #include "block.h" +#ifdef __cplusplus +extern "C" { +#endif + #define SUM_DIFF_THRESHOLD (16 * 16 * 2) #define MOTION_MAGNITUDE_THRESHOLD (8*3) @@ -39,4 +43,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, int recon_yoffset, int recon_uvoffset); -#endif /* VP8_ENCODER_DENOISING_H_ */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_DENOISING_H_ diff --git a/libvpx/vp8/encoder/encodeframe.h b/libvpx/vp8/encoder/encodeframe.h index 4dd6ba0..e185c10 100644 --- a/libvpx/vp8/encoder/encodeframe.h +++ b/libvpx/vp8/encoder/encodeframe.h @@ -7,8 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef ENCODEFRAME_H -#define ENCODEFRAME_H +#ifndef VP8_ENCODER_ENCODEFRAME_H_ +#define VP8_ENCODER_ENCODEFRAME_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x); extern void vp8_build_block_offsets(MACROBLOCK *x); @@ -24,4 +28,8 @@ extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_ENCODEFRAME_H_ diff --git a/libvpx/vp8/encoder/encodeintra.h b/libvpx/vp8/encoder/encodeintra.h index be2141f..a8d0284 100644 --- a/libvpx/vp8/encoder/encodeintra.h +++ b/libvpx/vp8/encoder/encodeintra.h @@ -9,13 +9,21 @@ */ -#ifndef _ENCODEINTRA_H_ -#define _ENCODEINTRA_H_ +#ifndef VP8_ENCODER_ENCODEINTRA_H_ +#define VP8_ENCODER_ENCODEINTRA_H_ #include "onyx_int.h" +#ifdef __cplusplus +extern "C" { +#endif + int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred); void vp8_encode_intra16x16mby(MACROBLOCK *x); void vp8_encode_intra16x16mbuv(MACROBLOCK *x); void vp8_encode_intra4x4mby(MACROBLOCK *mb); void vp8_encode_intra4x4block(MACROBLOCK *x, int ib); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_ENCODEINTRA_H_ diff --git a/libvpx/vp8/encoder/encodemb.h b/libvpx/vp8/encoder/encodemb.h index 6badf7d..0b3ec87 100644 --- a/libvpx/vp8/encoder/encodemb.h +++ b/libvpx/vp8/encoder/encodemb.h @@ -9,10 +9,14 @@ */ -#ifndef __INC_ENCODEMB_H -#define __INC_ENCODEMB_H +#ifndef VP8_ENCODER_ENCODEMB_H_ +#define VP8_ENCODER_ENCODEMB_H_ #include "onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif void vp8_encode_inter16x16(MACROBLOCK *x); void vp8_build_dcblock(MACROBLOCK *b); @@ -23,4 +27,8 @@ void vp8_transform_intra_mby(MACROBLOCK *x); void vp8_optimize_mby(MACROBLOCK *x); void vp8_optimize_mbuv(MACROBLOCK *x); void vp8_encode_inter16x16y(MACROBLOCK *x); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // 
VP8_ENCODER_ENCODEMB_H_ diff --git a/libvpx/vp8/encoder/encodemv.h b/libvpx/vp8/encoder/encodemv.h index a6116c1..722162b 100644 --- a/libvpx/vp8/encoder/encodemv.h +++ b/libvpx/vp8/encoder/encodemv.h @@ -9,13 +9,21 @@ */ -#ifndef __INC_ENCODEMV_H -#define __INC_ENCODEMV_H +#ifndef VP8_ENCODER_ENCODEMV_H_ +#define VP8_ENCODER_ENCODEMV_H_ #include "onyx_int.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_write_mvprobs(VP8_COMP *); void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *); void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_ENCODEMV_H_ diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c index 968c7f3..98e5a71 100644 --- a/libvpx/vp8/encoder/firstpass.c +++ b/libvpx/vp8/encoder/firstpass.c @@ -940,9 +940,9 @@ static int64_t estimate_modemvcost(VP8_COMP *cpi, /* Crude estimate of overhead cost from modes * << 9 is the normalization to (bits * 512) used in vp8_bits_per_mb */ - mode_cost =((((av_pct_inter - av_pct_motion) * zz_cost) + - (av_pct_motion * motion_cost) + - (av_intra * intra_cost)) * cpi->common.MBs) * 512; + mode_cost = (int64_t)((((av_pct_inter - av_pct_motion) * zz_cost) + + (av_pct_motion * motion_cost) + + (av_intra * intra_cost)) * cpi->common.MBs) * 512; return mv_cost + mode_cost; } @@ -2310,7 +2310,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) pct_extra = (pct_extra > 20) ? 
20 : pct_extra; cpi->twopass.alt_extra_bits = - (cpi->twopass.gf_group_bits * pct_extra) / 100; + (int)(cpi->twopass.gf_group_bits * pct_extra) / 100; cpi->twopass.gf_group_bits -= cpi->twopass.alt_extra_bits; cpi->twopass.alt_extra_bits /= ((cpi->baseline_gf_interval-1)>>1); @@ -2386,7 +2386,7 @@ static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) target_frame_size = max_bits; if (target_frame_size > cpi->twopass.gf_group_bits) - target_frame_size = cpi->twopass.gf_group_bits; + target_frame_size = (int)cpi->twopass.gf_group_bits; } /* Adjust error and bits remaining */ @@ -2444,10 +2444,10 @@ void vp8_second_pass(VP8_COMP *cpi) find_next_key_frame(cpi, &this_frame_copy); /* Special case: Error error_resilient_mode mode does not make much - * sense for two pass but with its current meaning but this code is + * sense for two pass but with its current meaning this code is * designed to stop outlandish behaviour if someone does set it when * using two pass. It effectively disables GF groups. This is - * temporary code till we decide what should really happen in this + * temporary code until we decide what should really happen in this * case. */ if (cpi->oxcf.error_resilient_mode) @@ -2773,7 +2773,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) kf_group_intra_err += this_frame->intra_error; kf_group_coded_err += this_frame->coded_error; - /* load a the next frame's stats */ + /* Load the next frame's stats. 
*/ vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame)); input_stats(cpi, this_frame); diff --git a/libvpx/vp8/encoder/firstpass.h b/libvpx/vp8/encoder/firstpass.h index 95e1e54..c409ebc 100644 --- a/libvpx/vp8/encoder/firstpass.h +++ b/libvpx/vp8/encoder/firstpass.h @@ -9,8 +9,12 @@ */ -#if !defined __INC_FIRSTPASS_H -#define __INC_FIRSTPASS_H +#ifndef VP8_ENCODER_FIRSTPASS_H_ +#define VP8_ENCODER_FIRSTPASS_H_ + +#ifdef __cplusplus +extern "C" { +#endif extern void vp8_init_first_pass(VP8_COMP *cpi); extern void vp8_first_pass(VP8_COMP *cpi); @@ -21,4 +25,8 @@ extern void vp8_second_pass(VP8_COMP *cpi); extern void vp8_end_second_pass(VP8_COMP *cpi); extern size_t vp8_firstpass_stats_sz(unsigned int mb_count); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_FIRSTPASS_H_ diff --git a/libvpx/vp8/encoder/lookahead.h b/libvpx/vp8/encoder/lookahead.h index cf56b75..cad68e6 100644 --- a/libvpx/vp8/encoder/lookahead.h +++ b/libvpx/vp8/encoder/lookahead.h @@ -7,11 +7,15 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef LOOKAHEAD_H -#define LOOKAHEAD_H +#ifndef VP8_ENCODER_LOOKAHEAD_H_ +#define VP8_ENCODER_LOOKAHEAD_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" +#ifdef __cplusplus +extern "C" { +#endif + struct lookahead_entry { YV12_BUFFER_CONFIG img; @@ -106,4 +110,8 @@ unsigned int vp8_lookahead_depth(struct lookahead_ctx *ctx); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_LOOKAHEAD_H_ diff --git a/libvpx/vp8/encoder/mcomp.h b/libvpx/vp8/encoder/mcomp.h index e36c515..f284f7c 100644 --- a/libvpx/vp8/encoder/mcomp.h +++ b/libvpx/vp8/encoder/mcomp.h @@ -9,12 +9,16 @@ */ -#ifndef __INC_MCOMP_H -#define __INC_MCOMP_H +#ifndef VP8_ENCODER_MCOMP_H_ +#define VP8_ENCODER_MCOMP_H_ #include "block.h" #include "vp8/common/variance.h" +#ifdef __cplusplus +extern "C" { +#endif + #ifdef VP8_ENTROPY_STATS extern void init_mv_ref_counts(); extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); @@ -104,4 +108,8 @@ typedef int (*vp8_diamond_search_fn_t) int_mv *center_mv ); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_MCOMP_H_ diff --git a/libvpx/vp8/encoder/modecosts.h b/libvpx/vp8/encoder/modecosts.h index 99ef119..9281551 100644 --- a/libvpx/vp8/encoder/modecosts.h +++ b/libvpx/vp8/encoder/modecosts.h @@ -9,9 +9,17 @@ */ -#ifndef __INC_MODECOSTS_H -#define __INC_MODECOSTS_H +#ifndef VP8_ENCODER_MODECOSTS_H_ +#define VP8_ENCODER_MODECOSTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif void vp8_init_mode_costs(VP8_COMP *x); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_MODECOSTS_H_ diff --git a/libvpx/vp8/encoder/mr_dissim.h b/libvpx/vp8/encoder/mr_dissim.h index f8cb135..5a59ce6 100644 --- a/libvpx/vp8/encoder/mr_dissim.h +++ b/libvpx/vp8/encoder/mr_dissim.h @@ -9,12 +9,20 @@ */ -#ifndef __INC_MR_DISSIM_H -#define __INC_MR_DISSIM_H +#ifndef VP8_ENCODER_MR_DISSIM_H_ +#define VP8_ENCODER_MR_DISSIM_H_ #include "vpx_config.h" +#ifdef __cplusplus +extern "C" { +#endif + 
extern void vp8_cal_low_res_mb_cols(VP8_COMP *cpi); extern void vp8_cal_dissimilarity(VP8_COMP *cpi); extern void vp8_store_drop_frame_info(VP8_COMP *cpi); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_MR_DISSIM_H_ diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c index 4b60cfd..ef37c0e 100644 --- a/libvpx/vp8/encoder/onyx_if.c +++ b/libvpx/vp8/encoder/onyx_if.c @@ -19,7 +19,7 @@ #include "vp8/common/alloccommon.h" #include "mcomp.h" #include "firstpass.h" -#include "psnr.h" +#include "vpx/internal/vpx_psnr.h" #include "vpx_scale/vpx_scale.h" #include "vp8/common/extend.h" #include "ratectrl.h" @@ -1401,6 +1401,7 @@ static void update_layer_contexts (VP8_COMP *cpi) unsigned int i; double prev_layer_framerate=0; + assert(oxcf->number_of_layers <= VPX_TS_MAX_LAYERS); for (i=0; i<oxcf->number_of_layers; i++) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; @@ -1623,6 +1624,12 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) cpi->oxcf.maximum_buffer_size = rescale((int)cpi->oxcf.maximum_buffer_size, cpi->oxcf.target_bandwidth, 1000); + // Under a configuration change, where maximum_buffer_size may change, + // keep buffer level clipped to the maximum allowed buffer size. + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + cpi->buffer_level = cpi->bits_off_target; + } /* Set up frame rate and related parameters rate control values. 
*/ vp8_new_framerate(cpi, cpi->framerate); @@ -2164,10 +2171,12 @@ void vp8_remove_compressor(VP8_COMP **ptr) 8.0 / 1000.0 / time_encoded; double samples = 3.0 / 2 * cpi->frames_in_layer[i] * lst_yv12->y_width * lst_yv12->y_height; - double total_psnr = vp8_mse2psnr(samples, 255.0, - cpi->total_error2[i]); - double total_psnr2 = vp8_mse2psnr(samples, 255.0, - cpi->total_error2_p[i]); + double total_psnr = + vpx_sse_to_psnr(samples, 255.0, + cpi->total_error2[i]); + double total_psnr2 = + vpx_sse_to_psnr(samples, 255.0, + cpi->total_error2_p[i]); double total_ssim = 100 * pow(cpi->sum_ssim[i] / cpi->sum_weights[i], 8.0); @@ -2184,9 +2193,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) { double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height; - double total_psnr = vp8_mse2psnr(samples, 255.0, - cpi->total_sq_error); - double total_psnr2 = vp8_mse2psnr(samples, 255.0, + double total_psnr = vpx_sse_to_psnr(samples, 255.0, + cpi->total_sq_error); + double total_psnr2 = vpx_sse_to_psnr(samples, 255.0, cpi->total_sq_error2); double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); @@ -2516,8 +2525,8 @@ static void generate_psnr_packet(VP8_COMP *cpi) pkt.data.psnr.samples[3] = width * height; for (i = 0; i < 4; i++) - pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0, - (double)(pkt.data.psnr.sse[i])); + pkt.data.psnr.psnr[i] = vpx_sse_to_psnr(pkt.data.psnr.samples[i], 255.0, + (double)(pkt.data.psnr.sse[i])); vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt); } @@ -2675,8 +2684,8 @@ static int resize_key_frame(VP8_COMP *cpi) VP8_COMMON *cm = &cpi->common; /* Do we need to apply resampling for one pass cbr. - * In one pass this is more limited than in two pass cbr - * The test and any change is only made one per key frame sequence + * In one pass this is more limited than in two pass cbr. + * The test and any change is only made once per key frame sequence. 
*/ if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) { @@ -2699,7 +2708,7 @@ static int resize_key_frame(VP8_COMP *cpi) cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL; } - /* Get the new hieght and width */ + /* Get the new height and width */ Scale2Ratio(cm->horiz_scale, &hr, &hs); Scale2Ratio(cm->vert_scale, &vr, &vs); new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs; @@ -3808,7 +3817,7 @@ static void encode_frame_to_data_rate /* Setup background Q adjustment for error resilient mode. * For multi-layer encodes only enable this for the base layer. - */ + */ if (cpi->cyclic_refresh_mode_enabled) { if (cpi->current_layer==0) @@ -4621,45 +4630,43 @@ static void encode_frame_to_data_rate vp8_clear_system_state(); if (cpi->twopass.total_left_stats.coded_error != 0.0) - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d" - "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f" - "%10.3f %8d\n", + fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64 + "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d " + "%8.2lf %"PRId64" %10.3lf %10"PRId64" %8d\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), - (int)cpi->total_target_vs_actual, + cpi->total_target_vs_actual, cpi->buffer_level, (cpi->oxcf.starting_buffer_level-cpi->bits_off_target), - (int)cpi->total_actual_bits, cm->base_qindex, + cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->ni_av_qi, cpi->cq_target_quality, - cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, - (int)cpi->twopass.bits_left, + cpi->twopass.bits_left, cpi->twopass.total_left_stats.coded_error, (double)cpi->twopass.bits_left / cpi->twopass.total_left_stats.coded_error, cpi->tot_recode_hits); else - fprintf(f, "%10d %10d %10d %10d 
%10d %10d %10d %10d %10d %6d %6d" - "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f" - "%8d\n", - cpi->common.current_video_frame, - cpi->this_frame_target, cpi->projected_frame_size, + fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64 + "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d " + "%8.2lf %"PRId64" %10.3lf %8d\n", + cpi->common.current_video_frame, cpi->this_frame_target, + cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), - (int)cpi->total_target_vs_actual, + cpi->total_target_vs_actual, cpi->buffer_level, (cpi->oxcf.starting_buffer_level-cpi->bits_off_target), - (int)cpi->total_actual_bits, cm->base_qindex, + cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality, cpi->ni_av_qi, cpi->cq_target_quality, - cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, - (int)cpi->twopass.bits_left, + cpi->twopass.bits_left, cpi->twopass.total_left_stats.coded_error, cpi->tot_recode_hits); @@ -4667,7 +4674,6 @@ static void encode_frame_to_data_rate { FILE *fmodes = fopen("Modes.stt", "a"); - int i; fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, @@ -5066,6 +5072,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l unsigned int i; /* Update frame rates for each layer */ + assert(cpi->oxcf.number_of_layers <= VPX_TS_MAX_LAYERS); for (i=0; i<cpi->oxcf.number_of_layers; i++) { LAYER_CONTEXT *lc = &cpi->layer_context[i]; @@ -5281,11 +5288,11 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l sq_error = (double)(ye + ue + ve); - frame_psnr = vp8_mse2psnr(t_samples, 255.0, sq_error); + frame_psnr = vpx_sse_to_psnr(t_samples, 255.0, sq_error); - cpi->total_y += vp8_mse2psnr(y_samples, 255.0, (double)ye); - cpi->total_u += vp8_mse2psnr(uv_samples, 255.0, (double)ue); - cpi->total_v += vp8_mse2psnr(uv_samples, 255.0, 
(double)ve); + cpi->total_y += vpx_sse_to_psnr(y_samples, 255.0, (double)ye); + cpi->total_u += vpx_sse_to_psnr(uv_samples, 255.0, (double)ue); + cpi->total_v += vpx_sse_to_psnr(uv_samples, 255.0, (double)ve); cpi->total_sq_error += sq_error; cpi->total += frame_psnr; #if CONFIG_POSTPROC @@ -5308,14 +5315,14 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l sq_error2 = (double)(ye + ue + ve); - frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2); + frame_psnr2 = vpx_sse_to_psnr(t_samples, 255.0, sq_error2); - cpi->totalp_y += vp8_mse2psnr(y_samples, - 255.0, (double)ye); - cpi->totalp_u += vp8_mse2psnr(uv_samples, - 255.0, (double)ue); - cpi->totalp_v += vp8_mse2psnr(uv_samples, - 255.0, (double)ve); + cpi->totalp_y += vpx_sse_to_psnr(y_samples, + 255.0, (double)ye); + cpi->totalp_u += vpx_sse_to_psnr(uv_samples, + 255.0, (double)ue); + cpi->totalp_v += vpx_sse_to_psnr(uv_samples, + 255.0, (double)ve); cpi->total_sq_error2 += sq_error2; cpi->totalp += frame_psnr2; diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h index 3ab0fe8..6b37167 100644 --- a/libvpx/vp8/encoder/onyx_int.h +++ b/libvpx/vp8/encoder/onyx_int.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_VP8_INT_H -#define __INC_VP8_INT_H +#ifndef VP8_ENCODER_ONYX_INT_H_ +#define VP8_ENCODER_ONYX_INT_H_ #include <stdio.h> #include "vpx_config.h" @@ -33,6 +33,10 @@ #include "vp8/encoder/denoising.h" #endif +#ifdef __cplusplus +extern "C" { +#endif + #define MIN_GF_INTERVAL 4 #define DEFAULT_GF_INTERVAL 7 @@ -721,4 +725,8 @@ void vp8_set_speed_features(VP8_COMP *cpi); "Failed to allocate "#lval);\ } while(0) #endif +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_ONYX_INT_H_ diff --git a/libvpx/vp8/encoder/pickinter.h b/libvpx/vp8/encoder/pickinter.h index 35011ca..cf3b1f8 100644 --- a/libvpx/vp8/encoder/pickinter.h +++ b/libvpx/vp8/encoder/pickinter.h @@ -9,11 +9,15 @@ */ -#ifndef __INC_PICKINTER_H -#define __INC_PICKINTER_H +#ifndef 
VP8_ENCODER_PICKINTER_H_ +#define VP8_ENCODER_PICKINTER_H_ #include "vpx_config.h" #include "vp8/common/onyxc_int.h" +#ifdef __cplusplus +extern "C" { +#endif + extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra, @@ -24,4 +28,8 @@ extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb, const vp8_variance_fn_ptr_t *vfp, unsigned int *sse, int_mv this_mv); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_PICKINTER_H_ diff --git a/libvpx/vp8/encoder/psnr.c b/libvpx/vp8/encoder/psnr.c deleted file mode 100644 index b3a3d95..0000000 --- a/libvpx/vp8/encoder/psnr.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_scale/yv12config.h" -#include "math.h" -#include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */ - -#define MAX_PSNR 100 - -double vp8_mse2psnr(double Samples, double Peak, double Mse) -{ - double psnr; - - if ((double)Mse > 0.0) - psnr = 10.0 * log10(Peak * Peak * Samples / Mse); - else - psnr = MAX_PSNR; /* Limit to prevent / 0 */ - - if (psnr > MAX_PSNR) - psnr = MAX_PSNR; - - return psnr; -} diff --git a/libvpx/vp8/encoder/psnr.h b/libvpx/vp8/encoder/psnr.h deleted file mode 100644 index 7f6269a..0000000 --- a/libvpx/vp8/encoder/psnr.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_PSNR_H -#define __INC_PSNR_H - -extern double vp8_mse2psnr(double Samples, double Peak, double Mse); - -#endif diff --git a/libvpx/vp8/encoder/quantize.h b/libvpx/vp8/encoder/quantize.h index d55496c..c739b26 100644 --- a/libvpx/vp8/encoder/quantize.h +++ b/libvpx/vp8/encoder/quantize.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_QUANTIZE_H -#define __INC_QUANTIZE_H +#ifndef VP8_ENCODER_QUANTIZE_H_ +#define VP8_ENCODER_QUANTIZE_H_ + +#ifdef __cplusplus +extern "C" { +#endif struct VP8_COMP; struct macroblock; @@ -20,4 +24,8 @@ extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x); extern void vp8cx_mb_init_quantizer(struct VP8_COMP *cpi, struct macroblock *x, int ok_to_skip); extern void vp8cx_init_quantizer(struct VP8_COMP *cpi); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_QUANTIZE_H_ diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c index fe4db13..c51650c 100644 --- a/libvpx/vp8/encoder/ratectrl.c +++ b/libvpx/vp8/encoder/ratectrl.c @@ -174,14 +174,6 @@ static const int kf_gf_boost_qlimits[QINDEX_RANGE] = 600, 600, 600, 600, 600, 600, 600, 600, }; -/* % adjustment to target kf size based on seperation from previous frame */ -static const int kf_boost_seperation_adjustment[16] = -{ - 30, 40, 50, 55, 60, 65, 70, 75, - 80, 85, 90, 95, 100, 100, 100, 100, -}; - - static const int gf_adjust_table[101] = { 100, @@ -1238,7 +1230,6 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { Q = cpi->oxcf.gold_q; } - } else { diff --git a/libvpx/vp8/encoder/ratectrl.h b/libvpx/vp8/encoder/ratectrl.h index c43f08d..829697f 100644 --- a/libvpx/vp8/encoder/ratectrl.h +++ b/libvpx/vp8/encoder/ratectrl.h @@ -9,10 +9,15 @@ */ -#if !defined __INC_RATECTRL_H +#ifndef VP8_ENCODER_RATECTRL_H_ +#define 
VP8_ENCODER_RATECTRL_H_ #include "onyx_int.h" +#ifdef __cplusplus +extern "C" { +#endif + extern void vp8_save_coding_context(VP8_COMP *cpi); extern void vp8_restore_coding_context(VP8_COMP *cpi); @@ -25,4 +30,8 @@ extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_ /* return of 0 means drop frame */ extern int vp8_pick_frame_size(VP8_COMP *cpi); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_RATECTRL_H_ diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c index 5016cc4..387701c 100644 --- a/libvpx/vp8/encoder/rdopt.c +++ b/libvpx/vp8/encoder/rdopt.c @@ -528,19 +528,16 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); -# define QC( I) ( qcoeff_ptr [vp8_default_zig_zag1d[I]] ) - + assert(eob <= 16); for (; c < eob; c++) { - int v = QC(c); - int t = vp8_dct_value_tokens_ptr[v].Token; + const int v = qcoeff_ptr[vp8_default_zig_zag1d[c]]; + const int t = vp8_dct_value_tokens_ptr[v].Token; cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t]; cost += vp8_dct_value_cost_ptr[v]; pt = vp8_prev_token_class[t]; } -# undef QC - if (c < 16) cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN]; diff --git a/libvpx/vp8/encoder/rdopt.h b/libvpx/vp8/encoder/rdopt.h index 1e11fa7..fe21b8e 100644 --- a/libvpx/vp8/encoder/rdopt.h +++ b/libvpx/vp8/encoder/rdopt.h @@ -9,8 +9,12 @@ */ -#ifndef __INC_RDOPT_H -#define __INC_RDOPT_H +#ifndef VP8_ENCODER_RDOPT_H_ +#define VP8_ENCODER_RDOPT_H_ + +#ifdef __cplusplus +extern "C" { +#endif #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) @@ -130,4 +134,8 @@ extern void vp8_mv_pred ); void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_RDOPT_H_ diff --git a/libvpx/vp8/encoder/segmentation.h b/libvpx/vp8/encoder/segmentation.h index 12815b0..6b55005 
100644 --- a/libvpx/vp8/encoder/segmentation.h +++ b/libvpx/vp8/encoder/segmentation.h @@ -8,9 +8,21 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VP8_ENCODER_SEGMENTATION_H_ +#define VP8_ENCODER_SEGMENTATION_H_ #include "string.h" #include "vp8/common/blockd.h" #include "onyx_int.h" +#ifdef __cplusplus +extern "C" { +#endif + extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_SEGMENTATION_H_ diff --git a/libvpx/vp8/encoder/temporal_filter.c b/libvpx/vp8/encoder/temporal_filter.c index 7e3af71..513b2bf 100644 --- a/libvpx/vp8/encoder/temporal_filter.c +++ b/libvpx/vp8/encoder/temporal_filter.c @@ -16,7 +16,6 @@ #include "vp8/common/alloccommon.h" #include "mcomp.h" #include "firstpass.h" -#include "psnr.h" #include "vpx_scale/vpx_scale.h" #include "vp8/common/extend.h" #include "ratectrl.h" diff --git a/libvpx/vp8/encoder/tokenize.c b/libvpx/vp8/encoder/tokenize.c index 11559a7..2dc8205 100644 --- a/libvpx/vp8/encoder/tokenize.c +++ b/libvpx/vp8/encoder/tokenize.c @@ -213,6 +213,7 @@ static void tokenize1st_order_b /* Luma */ for (block = 0; block < 16; block++, b++) { + const int eob = *b->eob; tmp1 = vp8_block2above[block]; tmp2 = vp8_block2left[block]; qcoeff_ptr = b->qcoeff; @@ -223,7 +224,7 @@ static void tokenize1st_order_b c = type ? 
0 : 1; - if(c >= *b->eob) + if(c >= eob) { /* c = band for this case */ t->Token = DCT_EOB_TOKEN; @@ -250,7 +251,8 @@ static void tokenize1st_order_b t++; c++; - for (; c < *b->eob; c++) + assert(eob <= 16); + for (; c < eob; c++) { rc = vp8_default_zig_zag1d[c]; band = vp8_coef_bands[c]; @@ -286,6 +288,7 @@ static void tokenize1st_order_b /* Chroma */ for (block = 16; block < 24; block++, b++) { + const int eob = *b->eob; tmp1 = vp8_block2above[block]; tmp2 = vp8_block2left[block]; qcoeff_ptr = b->qcoeff; @@ -294,7 +297,7 @@ static void tokenize1st_order_b VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); - if(!(*b->eob)) + if(!eob) { /* c = band for this case */ t->Token = DCT_EOB_TOKEN; @@ -321,7 +324,8 @@ static void tokenize1st_order_b t++; c = 1; - for (; c < *b->eob; c++) + assert(eob <= 16); + for (; c < eob; c++) { rc = vp8_default_zig_zag1d[c]; band = vp8_coef_bands[c]; diff --git a/libvpx/vp8/encoder/tokenize.h b/libvpx/vp8/encoder/tokenize.h index 1e6cea1..b73a9ee 100644 --- a/libvpx/vp8/encoder/tokenize.h +++ b/libvpx/vp8/encoder/tokenize.h @@ -9,12 +9,16 @@ */ -#ifndef tokenize_h -#define tokenize_h +#ifndef VP8_ENCODER_TOKENIZE_H_ +#define VP8_ENCODER_TOKENIZE_H_ #include "vp8/common/entropy.h" #include "block.h" +#ifdef __cplusplus +extern "C" { +#endif + void vp8_tokenize_initialize(); typedef struct @@ -47,4 +51,8 @@ extern const short *const vp8_dct_value_cost_ptr; */ extern const TOKENVALUE *const vp8_dct_value_tokens_ptr; -#endif /* tokenize_h */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_ENCODER_TOKENIZE_H_ diff --git a/libvpx/vp8/encoder/treewriter.h b/libvpx/vp8/encoder/treewriter.h index 48574f3..cfb2730 100644 --- a/libvpx/vp8/encoder/treewriter.h +++ b/libvpx/vp8/encoder/treewriter.h @@ -9,8 +9,8 @@ */ -#ifndef __INC_TREEWRITER_H -#define __INC_TREEWRITER_H +#ifndef VP8_ENCODER_TREEWRITER_H_ +#define VP8_ENCODER_TREEWRITER_H_ /* Trees map alphabets into huffman-like codes suitable for an arithmetic bit coder. 
Timothy S Murphy 11 October 2004 */ @@ -19,6 +19,10 @@ #include "boolhuff.h" /* for now */ +#ifdef __cplusplus +extern "C" { +#endif + typedef BOOL_CODER vp8_writer; #define vp8_write vp8_encode_bool @@ -123,4 +127,8 @@ void vp8_cost_tokens2( int *Costs, const vp8_prob *, vp8_tree, int ); +#ifdef __cplusplus +} // extern "C" #endif + +#endif // VP8_ENCODER_TREEWRITER_H_ diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk index f98eb31..dfb54a5 100644 --- a/libvpx/vp8/vp8_common.mk +++ b/libvpx/vp8/vp8_common.mk @@ -47,7 +47,7 @@ VP8_COMMON_SRCS-yes += common/quant_common.h VP8_COMMON_SRCS-yes += common/reconinter.h VP8_COMMON_SRCS-yes += common/reconintra4x4.h VP8_COMMON_SRCS-yes += common/rtcd.c -VP8_COMMON_SRCS-yes += common/rtcd_defs.sh +VP8_COMMON_SRCS-yes += common/rtcd_defs.pl VP8_COMMON_SRCS-yes += common/setupintrarecon.h VP8_COMMON_SRCS-yes += common/swapyv12buffer.h VP8_COMMON_SRCS-yes += common/systemdependent.h @@ -159,14 +159,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) # common (neon) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict4x4_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x4_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict8x8_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict16x16_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x4_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem8x8_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem16x16_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM) @@ 
-181,14 +173,20 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x8_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict16x16_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/save_reg_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM) -$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh)) +# common (neon intrinsics) +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c + + +$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c index 19e9d27..4c896b1 100644 --- a/libvpx/vp8/vp8_cx_iface.c +++ b/libvpx/vp8/vp8_cx_iface.c @@ -414,7 +414,6 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, printf("Sharpness: %d\n", oxcf->Sharpness); printf("cpu_used: %d\n", oxcf->cpu_used); printf("Mode: %d\n", oxcf->Mode); - printf("delete_first_pass_file: %d\n", oxcf->delete_first_pass_file); printf("auto_key: %d\n", oxcf->auto_key); 
printf("key_freq: %d\n", oxcf->key_freq); printf("end_usage: %d\n", oxcf->end_usage); @@ -751,9 +750,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (!ctx->cfg.rc_target_bitrate) return res; - if (!ctx->cfg.rc_target_bitrate) - return res; - if (img) res = validate_img(ctx, img); @@ -1266,10 +1262,10 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 128, /* kf_max_dist */ #if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION) - 1, /* g_delete_first_pass_file */ "vp8.fpf" /* first pass filename */ #endif VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */ + {0}, /* ss_target_bitrate */ 1, /* ts_number_layers */ {0}, /* ts_target_bitrate */ {0}, /* ts_rate_decimator */ diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c index 871b8d3..0b4c4cb 100644 --- a/libvpx/vp8/vp8_dx_iface.c +++ b/libvpx/vp8/vp8_dx_iface.c @@ -929,6 +929,7 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */ vp8_decode, /* vpx_codec_decode_fn_t decode; */ vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */ + NOT_IMPLEMENTED, }, { /* encoder functions */ NOT_IMPLEMENTED, diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk index cd091f3..d7c6dd1 100644 --- a/libvpx/vp8/vp8cx.mk +++ b/libvpx/vp8/vp8cx.mk @@ -50,7 +50,6 @@ VP8_CX_SRCS-yes += encoder/mcomp.h VP8_CX_SRCS-yes += encoder/modecosts.h VP8_CX_SRCS-yes += encoder/onyx_int.h VP8_CX_SRCS-yes += encoder/pickinter.h -VP8_CX_SRCS-yes += encoder/psnr.h VP8_CX_SRCS-yes += encoder/quantize.h VP8_CX_SRCS-yes += encoder/ratectrl.h VP8_CX_SRCS-yes += encoder/rdopt.h @@ -61,7 +60,6 @@ VP8_CX_SRCS-yes += encoder/modecosts.c VP8_CX_SRCS-yes += encoder/onyx_if.c VP8_CX_SRCS-yes += encoder/pickinter.c VP8_CX_SRCS-yes += encoder/picklpf.c -VP8_CX_SRCS-yes += encoder/psnr.c VP8_CX_SRCS-yes += encoder/quantize.c VP8_CX_SRCS-yes += encoder/ratectrl.c VP8_CX_SRCS-yes += encoder/rdopt.c diff --git a/libvpx/vp8/vp8cx_arm.mk b/libvpx/vp8/vp8cx_arm.mk index b030ee5..398172a 100644 
--- a/libvpx/vp8/vp8cx_arm.mk +++ b/libvpx/vp8/vp8cx_arm.mk @@ -37,6 +37,7 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM) # encoder VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon$(ASM) VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/picklpf_arm.c +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon$(ASM) VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon$(ASM) VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon$(ASM) diff --git a/libvpx/vp8/vp8dx.mk b/libvpx/vp8/vp8dx.mk index 4a8f467..892ed70 100644 --- a/libvpx/vp8/vp8dx.mk +++ b/libvpx/vp8/vp8dx.mk @@ -22,7 +22,7 @@ VP8_DX_SRCS-yes += vp8_dx_iface.c VP8_DX_SRCS-yes += decoder/dboolhuff.c VP8_DX_SRCS-yes += decoder/decodemv.c -VP8_DX_SRCS-yes += decoder/decodframe.c +VP8_DX_SRCS-yes += decoder/decodeframe.c VP8_DX_SRCS-yes += decoder/detokenize.c VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h |