diff options
54 files changed, 1051 insertions, 1625 deletions
diff --git a/common/arm/ih264_arm_memory_barrier.s b/common/arm/ih264_arm_memory_barrier.s index 523218f..3816409 100644 --- a/common/arm/ih264_arm_memory_barrier.s +++ b/common/arm/ih264_arm_memory_barrier.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @******************************************************************************* @* @file @* ih264_arm_memory_barrier.s @@ -39,7 +39,6 @@ .text .p2align 2 - @***************************************************************************** @* @* Function Name : ih264_arm_dsb diff --git a/common/arm/ih264_deblk_chroma_a9.s b/common/arm/ih264_deblk_chroma_a9.s index 66102a7..8c9960a 100644 --- a/common/arm/ih264_deblk_chroma_a9.s +++ b/common/arm/ih264_deblk_chroma_a9.s @@ -54,7 +54,7 @@ .text .p2align 2 -@/** +@** @******************************************************************************* @* @* @brief @@ -84,7 +84,7 @@ @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_horz_bs4_bp_a9 @@ -130,7 +130,7 @@ ih264_deblk_chroma_horz_bs4_bp_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -160,7 +160,7 @@ ih264_deblk_chroma_horz_bs4_bp_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_vert_bs4_bp_a9 @@ -224,7 +224,7 @@ ih264_deblk_chroma_vert_bs4_bp_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -260,7 +260,7 @@ ih264_deblk_chroma_vert_bs4_bp_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_horz_bslt4_bp_a9 @@ -326,7 +326,7 @@ ih264_deblk_chroma_horz_bslt4_bp_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -362,7 +362,7 @@ ih264_deblk_chroma_horz_bslt4_bp_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_vert_bslt4_bp_a9 @@ -465,7 +465,7 @@ ih264_deblk_chroma_vert_bslt4_bp_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -495,7 +495,7 @@ ih264_deblk_chroma_vert_bslt4_bp_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9 @@ -543,7 +543,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_bp_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -579,7 +579,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_bp_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9 @@ -656,7 +656,7 @@ ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -692,7 +692,7 @@ ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_horz_bs4_a9 @@ -743,7 +743,7 @@ ih264_deblk_chroma_horz_bs4_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -779,7 +779,7 @@ ih264_deblk_chroma_horz_bs4_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_vert_bs4_a9 @@ -848,7 +848,7 @@ ih264_deblk_chroma_vert_bs4_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -893,7 +893,7 @@ ih264_deblk_chroma_vert_bs4_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_horz_bslt4_a9 @@ -968,7 +968,7 @@ ih264_deblk_chroma_horz_bslt4_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -1013,7 +1013,7 @@ ih264_deblk_chroma_horz_bslt4_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_vert_bslt4_a9 @@ -1119,7 +1119,7 @@ ih264_deblk_chroma_vert_bslt4_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -1155,7 +1155,7 @@ ih264_deblk_chroma_vert_bslt4_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_vert_bs4_mbaff_a9 @@ -1206,7 +1206,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -1251,7 +1251,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_chroma_vert_bslt4_mbaff_a9 diff --git a/common/arm/ih264_deblk_luma_a9.s b/common/arm/ih264_deblk_luma_a9.s index 3e6a4d9..9217ed2 100644 --- a/common/arm/ih264_deblk_luma_a9.s +++ b/common/arm/ih264_deblk_luma_a9.s @@ -47,7 +47,7 @@ .text .p2align 2 -@/** +@** @******************************************************************************* @* @* @brief @@ -83,7 +83,7 @@ @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_luma_horz_bslt4_a9 @@ -187,7 +187,7 @@ ih264_deblk_luma_horz_bslt4_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -217,7 +217,7 @@ ih264_deblk_luma_horz_bslt4_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_luma_horz_bs4_a9 @@ -353,7 +353,7 @@ ih264_deblk_luma_horz_bs4_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -389,7 +389,7 @@ ih264_deblk_luma_horz_bs4_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_luma_vert_bslt4_a9 @@ -574,7 +574,7 @@ ih264_deblk_luma_vert_bslt4_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -604,7 +604,7 @@ ih264_deblk_luma_vert_bslt4_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_luma_vert_bs4_a9 @@ -800,7 +800,7 @@ ih264_deblk_luma_vert_bs4_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -830,7 +830,7 @@ ih264_deblk_luma_vert_bs4_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_luma_vert_bs4_mbaff_a9 @@ -942,7 +942,7 @@ ih264_deblk_luma_vert_bs4_mbaff_a9: -@/** +@** @******************************************************************************* @* @* @brief @@ -978,7 +978,7 @@ ih264_deblk_luma_vert_bs4_mbaff_a9: @* None @* @******************************************************************************* -@*/ +@* .global ih264_deblk_luma_vert_bslt4_mbaff_a9 diff --git a/common/arm/ih264_default_weighted_pred_a9q.s b/common/arm/ih264_default_weighted_pred_a9q.s index 94cda46..a4688f2 100644 --- a/common/arm/ih264_default_weighted_pred_a9q.s +++ b/common/arm/ih264_default_weighted_pred_a9q.s @@ -17,14 +17,13 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_default_weighted_pred_a9q.s @* @* @brief @* Contains function definitions for default weighted prediction. -@* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT @* @* @author @* Kaushik Senthoor R @@ -38,7 +37,7 @@ @* None @* @******************************************************************************* -@*/ +@* @******************************************************************************* @* @function @* ih264_default_weighted_pred_luma_a9q() @@ -82,7 +81,7 @@ @* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16). @* @******************************************************************************* -@*/ +@* @void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1, @ UWORD8 *pu1_src2, @ UWORD8 *pu1_dst, @@ -256,7 +255,7 @@ end_loops: @* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8). @* @******************************************************************************* -@*/ +@* @void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1, @ UWORD8 *pu1_src2, @ UWORD8 *pu1_dst, diff --git a/common/arm/ih264_ihadamard_scaling_a9.s b/common/arm/ih264_ihadamard_scaling_a9.s index 687099a..c7feddd 100644 --- a/common/arm/ih264_ihadamard_scaling_a9.s +++ b/common/arm/ih264_ihadamard_scaling_a9.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @ ******************************************************************************* @ * @file @ * ih264_ihadamard_scaling_a9.s @@ -37,7 +37,7 @@ @ * None @ * @ ******************************************************************************* -@ */ +@ * @ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients @ * of a 16x16 intra prediction macroblock, and then performs scaling. @ * prediction buffer @@ -69,10 +69,10 @@ @ * @remarks none @ * @ ******************************************************************************* -@ */ +@ * @ * @ ******************************************************************************* -@ */ +@ * @ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src, @ WORD16* pi2_out, @ const UWORD16 *pu2_iscal_mat, @@ -161,7 +161,7 @@ ih264_ihadamard_scaling_4x4_a9: @ ******************************************************************************* -@ */ +@ * @ * @brief This function performs a 2x2 inverse hadamard transform for chroma block @ * @ * @par Description: @@ -189,10 +189,10 @@ ih264_ihadamard_scaling_4x4_a9: @ * @remarks none @ * @ ******************************************************************************* -@ */ +@ * @ * @ ******************************************************************************* -@ */ +@ * @ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src, @ WORD16* pi2_out, @ const UWORD16 *pu2_iscal_mat, diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s index afd2860..6681a7c 100644 --- a/common/arm/ih264_inter_pred_chroma_a9q.s +++ b/common/arm/ih264_inter_pred_chroma_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_chroma_a9q.s @@ -36,16 +36,16 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** -@/** +@** +@** +@** @ -@/** +@** @******************************************************************************* @* @* @brief @@ -85,7 +85,7 @@ @* None @* @******************************************************************************* -@*/ +@* @void ih264_inter_pred_chroma(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @@ -112,8 +112,6 @@ ih264_inter_pred_chroma_a9q: - - stmfd sp!, {r4-r12, r14} @store register values to stack vstmdb sp!, {d8-d15} @push neon registers to stack ldr r4, [sp, #104] diff --git a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s index ea6bba0..62b4b94 100644 --- a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s +++ b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_luma_horz_a9q.s @@ -36,13 +36,13 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** +@** +@** @******************************************************************************* @* @* @brief @@ -76,7 +76,7 @@ @* None @* @******************************************************************************* -@*/ +@* @void ih264_inter_pred_luma_horz ( @ UWORD8 *pu1_src, @@ -102,6 +102,9 @@ ih264_inter_pred_luma_horz_a9q: + + + stmfd sp!, {r4-r12, r14} @store register values to stack vstmdb sp!, {d8-d15} @push neon registers to stack ldr r5, [sp, #104] @Loads ht @@ -116,7 +119,7 @@ ih264_inter_pred_luma_horz_a9q: beq loop_4 loop_16: @when wd=16 - @// Processing row0 and row1 + @ Processing row0 and row1 vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 ;for checking loop vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1 @@ -173,7 +176,7 @@ loop_16: @when wd=16 b loop_16 @ loop if height == 8 or 16 loop_8: -@// Processing row0 and row1 +@ Processing row0 and row1 vld1.8 {d5, d6}, [r0], r2 @// Load row1 vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) vld1.8 {d2, d3}, [r0], r2 @// Load row0 @@ -204,7 +207,7 @@ loop_8: beq end_func @ Branch if height==4 - b loop_8 @looping if height =8 or 16 + b loop_8 @looping if height =8 or 16 loop_4: vld1.8 {d5, d6}, [r0], r2 @// Load row1 diff --git a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s index 5b29e02..65c40a6 100644 --- a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s +++ b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_luma_vert_a9q.s @@ -36,14 +36,14 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** -@/** +@** +@** +@** @ ******************************************************************************* @ * @ * @brief @@ -195,10 +195,10 @@ loop_16: @when wd=16 subne r0, r0, r2 beq end_func @ Branch if height==4 - b loop_16 @ looping if height = 8 or 16 + b loop_16 @ looping if height = 8 or 16 loop_8: -@// Processing row0 and row1 +@ Processing row0 and row1 vld1.u32 d0, [r0], r2 @ Vector load from src[0_0] vld1.u32 d1, [r0], r2 @ Vector load from src[1_0] @@ -248,7 +248,7 @@ loop_8: loop_4: -@// Processing row0 and row1 +@ Processing row0 and row1 vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0] vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0] diff --git a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s index 6a3c83d..8f049f8 100644 --- a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s +++ b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_luma_bilinear_a9q.s @@ -36,14 +36,14 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** -@/** +@** +@** +@** @ ******************************************************************************* @ * function:ih264_inter_pred_luma_bilinear @ * @@ -89,7 +89,7 @@ @* None @* @******************************************************************************* -@*/ +@* @void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1, @ UWORD8 *pu1_src2, @@ -192,7 +192,7 @@ loop_16: @when wd=16 subs r12, r6, #8 vst1.8 {q15}, [r2], r5 @//Store dest row7 - beq end_func @ end function if ht=8 + beq end_func @ end function if ht=8 vld1.8 {q0}, [r0], r3 @// Load row8 ;src1 vaddl.u8 q10, d0, d4 @@ -275,7 +275,7 @@ loop_8: @wd=8; vqrshrun.s16 d31, q13, #1 subs r12, r6, #4 vst1.8 {d31}, [r2], r5 @//Store dest row3 - beq end_func @ end function if ht=4 + beq end_func @ end function if ht=4 vld1.8 {d12}, [r1], r4 @// Load row4 ;src2 vld1.8 {d8}, [r0], r3 @// Load row4 ;src1 @@ -298,7 +298,7 @@ loop_8: @wd=8; vqrshrun.s16 d31, q11, #1 subs r12, r6, #8 vst1.8 {d31}, [r2], r5 @//Store dest row7 - beq end_func @ end function if ht=8 + beq end_func @ end function if ht=8 vld1.8 {d0}, [r0], r3 @// Load row8 ;src1 vld1.8 {d4}, [r1], r4 @// Load row8 ;src2 @@ -367,7 +367,7 @@ loop_4: vqrshrun.s16 d31, q13, #1 subs r12, r6, #4 vst1.32 d31[0], [r2], r5 @//Store dest row3 - beq end_func @ end function if ht=4 + beq end_func @ end function if ht=4 vld1.32 d12[0], [r1], r4 @// Load row4 ;src2 vld1.32 d8[0], [r0], r3 @// Load row4 ;src1 diff --git a/common/arm/ih264_inter_pred_luma_copy_a9q.s b/common/arm/ih264_inter_pred_luma_copy_a9q.s index 8ba2fbf..c0b0568 100644 --- a/common/arm/ih264_inter_pred_luma_copy_a9q.s +++ b/common/arm/ih264_inter_pred_luma_copy_a9q.s @@ -17,8 +17,8 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** -@/** +@** +@** @******************************************************************************* @* @* @brief @@ -53,7 +53,7 @@ @* None @* @******************************************************************************* -@*/ +@* @void ih264_inter_pred_luma_copy ( @ UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @@ -182,7 +182,7 @@ end_inner_loop_wd_16: ldmfd sp!, {r4-r12, r15} @Reload the registers from SP -@ /* +@ * @ ******************************************************************************** @ * @ * @brief This function copies a 4x4 block to destination @@ -208,7 +208,7 @@ end_inner_loop_wd_16: @ * Currently wd and height is not used, ie a 4x4 block is always copied @ * @ ******************************************************************************* -@ */ +@ * @ void ih264_interleave_copy(WORD16 *pi2_src, @ UWORD8 *pu1_out, @ WORD32 pred_strd, diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s index 43321a8..54183f0 100644 --- a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s +++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s @@ -36,14 +36,14 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** -@/** +@** +@** +@** @******************************************************************************* @* @* @brief @@ -88,7 +88,7 @@ @* None @* @******************************************************************************* -@*/; +@*; @void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s index 65a6de7..c8edf38 100644 --- a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s +++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s @@ -36,14 +36,14 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** -@/** +@** +@** +@** @******************************************************************************* @* @* @brief @@ -91,7 +91,7 @@ @* None @* @******************************************************************************* -@*/; +@*; @void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @@ -835,7 +835,7 @@ loop_8: vmov q7, q14 vst1.32 d30, [r1], r3 @ store row 3 - bgt loop_8 @if height =8 or 16 loop + bgt loop_8 @if height =8 or 16 loop b end_func loop_4_start: diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s index c39ae01..ab1d1d1 100644 --- a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s +++ b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_luma_horz_qpel_a9q.s @@ -30,19 +30,19 @@ @* @* @par List of Functions: @* -@* - ih264_inter_pred_luma_horz_qpe_a9ql() +@* - ih264_inter_pred_luma_horz_qpel_a9q() @* @* @remarks @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** +@** +@** @******************************************************************************* @* @* @brief @@ -79,7 +79,7 @@ @* None @* @******************************************************************************* -@*/ +@* @void ih264_inter_pred_luma_horz ( @ UWORD8 *pu1_src, @@ -126,7 +126,7 @@ ih264_inter_pred_luma_horz_qpel_a9q: beq loop_4 loop_16: @when wd=16 - @// Processing row0 and row1 + @ Processing row0 and row1 vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1 @@ -187,7 +187,7 @@ loop_16: @when wd=16 b loop_16 loop_8: -@// Processing row0 and row1 +@ Processing row0 and row1 vld1.8 {d5, d6}, [r0], r2 @// Load row1 vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1) @@ -221,7 +221,7 @@ loop_8: subs r5, r5, #2 @ 2 rows done, decrement by 2 beq end_func @ Branch if height==4 - b loop_8 @looping if height == 8 or 16 + b loop_8 @looping if height == 8 or 16 loop_4: vld1.8 {d5, d6}, [r0], r2 @// Load row1 diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s index 565cc80..3c63ca3 100644 --- a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s +++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s @@ -36,14 +36,14 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** -@/** +@** +@** +@** @******************************************************************************* @* @* @brief @@ -91,7 +91,7 @@ @* None @* @******************************************************************************* -@*/; +@*; @void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s index 3c8b60a..cfe03a0 100644 --- a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s +++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s @@ -36,14 +36,11 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** -@/** @******************************************************************************* @* @* @brief @@ -90,7 +87,7 @@ @* None @* @******************************************************************************* -@*/; +@*; @void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, diff --git a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s index d45055e..e2c68ef 100644 --- a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s +++ b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_inter_pred_luma_vert_qpel_a9q.s @@ -36,13 +36,11 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_inter_pred_filters.c +@* All the functions here are replicated from ih264_inter_pred_filters.c @ -@/** -@/** @******************************************************************************* @* @* @brief @@ -79,7 +77,7 @@ @* None @* @******************************************************************************* -@*/ +@* @void ih264_inter_pred_luma_vert ( @ UWORD8 *pu1_src, @@ -211,12 +209,12 @@ loop_16: @when wd=16 subne r0, r0, r2 beq end_func @ Branch if height==4 - b loop_16 @ looping if height = 8 or 16 + b loop_16 @ looping if height = 8 or 16 loop_8: - @// Processing row0 and row1 + @ Processing row0 and row1 vld1.u32 d0, [r0], r2 @ Vector load from src[0_0] vld1.u32 d1, [r0], r2 @ Vector load from src[1_0] vld1.u32 d2, [r0], r2 @ Vector load from src[2_0] @@ -270,7 +268,7 @@ loop_8: b loop_8 @looping if height == 8 or 16 loop_4: -@// Processing row0 and row1 +@ Processing row0 and row1 vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0] vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0] diff --git a/common/arm/ih264_intra_pred_chroma_a9q.s b/common/arm/ih264_intra_pred_chroma_a9q.s index d03fc55..ccd5c0d 100644 --- a/common/arm/ih264_intra_pred_chroma_a9q.s +++ b/common/arm/ih264_intra_pred_chroma_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_intra_pred_chroma_a9q.s @@ -39,15 +39,11 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_chroma_intra_pred_filters.c +@* All the functions here are replicated from ih264_chroma_intra_pred_filters.c @ -@/** -@/** -@/** -@ .text .p2align 2 @@ -60,7 +56,7 @@ scratch_chroma_intrapred_addr1: scratch_intrapred_chroma_plane_addr1: .long ih264_gai1_intrapred_chroma_plane_coeffs2 - scrlblc2 - 8 -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_chroma_8x8_mode_dc @@ -91,7 +87,7 @@ scratch_intrapred_chroma_plane_addr1: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -105,8 +101,6 @@ scratch_intrapred_chroma_plane_addr1: @ r3 => dst_strd @ r4 => ui_neighboravailability - - .global ih264_intra_pred_chroma_8x8_mode_dc_a9q ih264_intra_pred_chroma_8x8_mode_dc_a9q: @@ -191,10 +185,10 @@ str_pred: -@/****************************************************************************** +@****************************************************************************** -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_chroma_8x8_mode_horz @@ -226,7 +220,7 @@ str_pred: @* None @* @******************************************************************************* -@*/ +@* @void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -270,7 +264,7 @@ loop_8x8_horz: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_chroma_8x8_mode_vert @@ -339,10 +333,10 @@ ih264_intra_pred_chroma_8x8_mode_vert_a9q: -@/****************************************************************************** +@****************************************************************************** -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_chroma_8x8_mode_plane @@ -373,7 +367,7 @@ ih264_intra_pred_chroma_8x8_mode_vert_a9q: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -393,7 +387,6 @@ ih264_intra_pred_chroma_8x8_mode_plane_a9q: stmfd sp!, {r4-r10, r12, lr} vpush {d8-d15} - vld1.32 d0, [r0] add r10, r0, #10 vld1.32 d1, [r10] @@ -542,7 +535,6 @@ scrlblc2: end_func_plane: - vpop {d8-d15} ldmfd sp!, {r4-r10, r12, pc} diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s index e38e203..0dd82f3 100644 --- a/common/arm/ih264_intra_pred_luma_16x16_a9q.s +++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_intra_pred_luma_16x16_a9q.s @@ -39,14 +39,14 @@ @* None @* @******************************************************************************* -@*/ +@* -@/* All the functions here are replicated from ih264_intra_pred_filters.c +@* All the functions here are replicated from ih264_intra_pred_filters.c @ -@/** -@/** -@/** +@** +@** +@** @ .text @@ -57,10 +57,10 @@ .hidden ih264_gai1_intrapred_luma_plane_coeffs scratch_intrapred_addr1: .long ih264_gai1_intrapred_luma_plane_coeffs - scrlbl1 - 8 -@/** +@** @******************************************************************************* @* -@*ih264_intra_pred_luma_16x16_mode_vert_a9q +@*ih264_intra_pred_luma_16x16_mode_vert @* @* @brief @* Perform Intra prediction for luma_16x16 mode:vertical @@ -135,13 +135,13 @@ ih264_intra_pred_luma_16x16_mode_vert_a9q: -@/****************************************************************************** +@****************************************************************************** -@/** +@** @******************************************************************************* @* -@*ih264_intra_pred_luma_16x16_mode_horz_a9q +@*ih264_intra_pred_luma_16x16_mode_horz @* @* @brief @* Perform Intra prediction for luma_16x16 mode:horizontal @@ -170,7 +170,7 @@ ih264_intra_pred_luma_16x16_mode_vert_a9q: @* None @* @******************************************************************************* -@*/ +@* @void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -213,13 +213,13 @@ loop_16x16_horz: -@/****************************************************************************** +@****************************************************************************** -@/** +@** @******************************************************************************* @* -@*ih264_intra_pred_luma_16x16_mode_dc_a9q +@*ih264_intra_pred_luma_16x16_mode_dc @* @* @brief @* Perform Intra prediction for luma_16x16 mode:DC @@ -247,7 +247,7 @@ loop_16x16_horz: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -300,7 +300,7 @@ top_available: @ONLY TOP AVAILABLE vdup.u8 q0, d0[0] b str_pred -left_available: @ONLY LEFT AVAILABLE +left_available: @ONLY LEFT AVAILABLE vld1.u8 {q0}, [r0] vpaddl.u8 q0, q0 vadd.u16 d0, d0, d1 @@ -337,13 +337,13 @@ str_pred: -@/****************************************************************************** +@****************************************************************************** -@/** +@** @******************************************************************************* @* -@*ih264_intra_pred_luma_16x16_mode_plane_a9q +@*ih264_intra_pred_luma_16x16_mode_plane @* @* @brief @* Perform Intra prediction for luma_16x16 mode:PLANE @@ -371,7 +371,7 @@ str_pred: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, diff --git a/common/arm/ih264_intra_pred_luma_4x4_a9q.s b/common/arm/ih264_intra_pred_luma_4x4_a9q.s index cb386ea..5cc7e23 100644 --- a/common/arm/ih264_intra_pred_luma_4x4_a9q.s +++ b/common/arm/ih264_intra_pred_luma_4x4_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_intra_pred_luma_4x4_a9q.s @@ -44,21 +44,16 @@ @* None @* @******************************************************************************* -@*/ - -@/* All the functions here are replicated from ih264_intra_pred_filters.c -@ +@* -@/** -@/** -@/** +@* All the functions here are replicated from ih264_intra_pred_filters.c @ .text .p2align 2 -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_4x4_mode_vert @@ -128,10 +123,10 @@ ih264_intra_pred_luma_4x4_mode_vert_a9q: -@/****************************************************************************** +@****************************************************************************** -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_4x4_mode_horz @@ -163,7 +158,7 @@ ih264_intra_pred_luma_4x4_mode_vert_a9q: @* None @* @******************************************************************************* -@*/ +@* @void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -210,10 +205,10 @@ ih264_intra_pred_luma_4x4_mode_horz_a9q: -@/****************************************************************************** +@****************************************************************************** -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_4x4_mode_dc @@ -244,7 +239,7 @@ ih264_intra_pred_luma_4x4_mode_horz_a9q: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -352,7 +347,7 @@ end_func: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_4x4_mode_diag_dl @@ -383,7 +378,7 @@ end_func: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -434,7 +429,7 @@ end_func_diag_dl: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_4x4_mode_diag_dr @@ -465,7 +460,7 @@ end_func_diag_dl: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -514,7 +509,7 @@ end_func_diag_dr: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_4x4_mode_vert_r @@ -545,7 +540,7 @@ end_func_diag_dr: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -596,7 +591,7 @@ end_func_vert_r: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_4x4_mode_horz_d @@ -627,7 +622,7 @@ end_func_vert_r: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -659,7 +654,7 @@ ih264_intra_pred_luma_4x4_mode_horz_d_a9q: vqrshrun.s16 d5, q12, #2 sub r5, r3, #2 vmov.8 d6, d5 - vtrn.8 d4, d5 @ + vtrn.8 d4, d5 @ vst1.u16 {d5[1]}, [r1]! vst1.16 {d6[2]}, [r1], r5 vst1.u16 {d4[1]}, [r1]! @@ -678,7 +673,7 @@ end_func_horz_d: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_4x4_mode_vert_l @@ -709,7 +704,7 @@ end_func_horz_d: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -759,7 +754,7 @@ end_func_vert_l: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_4x4_mode_horz_u @@ -790,7 +785,7 @@ end_func_vert_l: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -825,9 +820,9 @@ ih264_intra_pred_luma_4x4_mode_horz_u_a9q: vext.8 d6, d5, d4, #1 vst1.8 {d4[2]}, [r1]! vst1.8 {d6[0]}, [r1]! - vtrn.8 d6, d5 @ + vtrn.8 d6, d5 @ sub r5, r3, #2 - vtrn.8 d4, d6 @ + vtrn.8 d4, d6 @ vdup.8 d7, r9 vst1.16 {d6[0]}, [r1], r5 vst1.16 {d6[0]}, [r1]! diff --git a/common/arm/ih264_intra_pred_luma_8x8_a9q.s b/common/arm/ih264_intra_pred_luma_8x8_a9q.s index 6da1c95..352d29d 100644 --- a/common/arm/ih264_intra_pred_luma_8x8_a9q.s +++ b/common/arm/ih264_intra_pred_luma_8x8_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_intra_pred_luma_8x8_a9q.s @@ -45,17 +45,11 @@ @* None @* @******************************************************************************* -@*/ - -@/* All the functions here are replicated from ih264_intra_pred_filters.c -@ +@* -@/** -@/** -@/** +@* All the functions here are replicated from ih264_intra_pred_filters.c @ - .text .p2align 2 @@ -64,7 +58,7 @@ scratch_intrapred_addr_8x8: .long ih264_gai1_intrapred_luma_8x8_horz_u - scrlb8x8l2 - 8 -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_ref_filtering @@ -95,7 +89,7 @@ scratch_intrapred_addr_8x8: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_src, @ UWORD8 *pu1_dst) @@ -111,7 +105,6 @@ ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q: stmfd sp!, {r4-r12, r14} @store register values to stack vpush {d8-d15} - vld1.u8 {q0}, [r0]! @ vld1.u8 {q1}, [r0] add r0, r0, #8 @ @@ -141,6 +134,7 @@ ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q: end_func_ref_filt: + vpop {d8-d15} ldmfd sp!, {r4-r12, pc} @Restoring registers from stack @@ -149,7 +143,7 @@ end_func_ref_filt: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_vert @@ -219,10 +213,10 @@ ih264_intra_pred_luma_8x8_mode_vert_a9q: -@/****************************************************************************** +@****************************************************************************** -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_horz @@ -254,7 +248,7 @@ ih264_intra_pred_luma_8x8_mode_vert_a9q: @* None @* @******************************************************************************* -@*/ +@* @void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -299,10 +293,10 @@ loop_8x8_horz: -@/****************************************************************************** +@****************************************************************************** -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_dc @@ -333,7 +327,7 @@ loop_8x8_horz: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -413,7 +407,7 @@ str_pred: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_diag_dl @@ -444,7 +438,7 @@ str_pred: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -506,7 +500,7 @@ end_func_diag_dl: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_diag_dr @@ -537,7 +531,7 @@ end_func_diag_dl: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -597,7 +591,7 @@ end_func_diag_dr: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_vert_r @@ -628,7 +622,7 @@ end_func_diag_dr: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -717,7 +711,7 @@ end_func_vert_r: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_horz_d @@ -748,7 +742,7 @@ end_func_vert_r: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -791,7 +785,7 @@ ih264_intra_pred_luma_8x8_mode_horz_d_a9q: vmov.8 q4, q2 vmov.8 q5, q3 sub r6, r3, #6 - vtrn.8 q4, q5 @ + vtrn.8 q4, q5 @ vmov.8 q6, q4 vmov.8 q7, q5 sub r5, r3, #4 @@ -835,7 +829,7 @@ end_func_horz_d: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_vert_l @@ -866,7 +860,7 @@ end_func_horz_d: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -887,6 +881,7 @@ ih264_intra_pred_luma_8x8_mode_vert_l_a9q: stmfd sp!, {r4-r12, r14} @Restoring registers from stack vpush {d8-d15} + add r0, r0, #9 vld1.u8 {q0}, [r0] add r0, r0, #1 @@ -935,7 +930,7 @@ end_func_vert_l: -@/** +@** @******************************************************************************* @* @*ih264_intra_pred_luma_8x8_mode_horz_u @@ -966,7 +961,7 @@ end_func_vert_l: @* @remarks @* None @* -@*******************************************************************************/ +@******************************************************************************* @void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, diff --git a/common/arm/ih264_iquant_itrans_recon_a9.s b/common/arm/ih264_iquant_itrans_recon_a9.s index f71ca69..4e49f6a 100644 --- a/common/arm/ih264_iquant_itrans_recon_a9.s +++ b/common/arm/ih264_iquant_itrans_recon_a9.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @ ******************************************************************************* @ * @file @ * ih264_iquant_itrans_recon_a9.s @@ -38,8 +38,8 @@ @ * None @ * @ ******************************************************************************* -@*/ -@/** +@* +@** @ ******************************************************************************* @ * @ * @brief @@ -82,7 +82,7 @@ @ * None @ * @ ******************************************************************************* -@ */ +@ * @void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src, @ UWORD8 *pu1_pred, @ UWORD8 *pu1_out, @@ -225,7 +225,7 @@ ih264_iquant_itrans_recon_4x4_a9: ldmfd sp!, {r4-r12, r15} @Reload the registers from SP - @/** +@** @ ******************************************************************************* @ * @ * @brief @@ -268,7 +268,7 @@ ih264_iquant_itrans_recon_4x4_a9: @ * None @ * @ ******************************************************************************* -@ */ +@ * @void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src, @ UWORD8 *pu1_pred, @ UWORD8 *pu1_out, @@ -416,7 +416,7 @@ ih264_iquant_itrans_recon_chroma_4x4_a9: ldmfd sp!, {r4-r12, r15} @Reload the registers from SP -@/* +@* @ ******************************************************************************* @ * @ * @brief @@ -459,7 +459,7 @@ ih264_iquant_itrans_recon_chroma_4x4_a9: @ * None @ * @ ******************************************************************************* -@ */ +@ * @void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src, @ UWORD8 *pu1_pred, @ UWORD8 *pu1_out, diff --git a/common/arm/ih264_iquant_itrans_recon_dc_a9.s b/common/arm/ih264_iquant_itrans_recon_dc_a9.s index 8d71bdb..97c4724 100644 --- a/common/arm/ih264_iquant_itrans_recon_dc_a9.s +++ b/common/arm/ih264_iquant_itrans_recon_dc_a9.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @ ******************************************************************************* @ * @file @ * ih264_iquant_itrans_recon_dc_a9.s @@ -37,8 +37,8 @@ @ * None @ * @ ******************************************************************************* -@*/ -@/** +@* +@** @ ******************************************************************************* @ * @ * @brief @@ -83,7 +83,7 @@ @ * None @ * @ ******************************************************************************* -@ */ +@ * @void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src, @ UWORD8 *pu1_pred, @ UWORD8 *pu1_out, @@ -167,7 +167,7 @@ ih264_iquant_itrans_recon_4x4_dc_a9: -@/* +@* @ ******************************************************************************* @ * @ * @brief @@ -212,7 +212,7 @@ ih264_iquant_itrans_recon_4x4_dc_a9: @ * None @ * @ ******************************************************************************* -@ */ +@ * @void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src, @ UWORD8 *pu1_pred, @ UWORD8 *pu1_out, @@ -300,7 +300,7 @@ ih264_iquant_itrans_recon_8x8_dc_a9: ldmfd sp!, {r4-r8, r15} -@ /* +@ * @ ******************************************************************************** @ * @ * @brief This function reconstructs a 4x4 sub block from quantized resiude and @@ -328,7 +328,7 @@ ih264_iquant_itrans_recon_8x8_dc_a9: @ * @remarks none @ * @ ******************************************************************************* -@ */ +@ * @ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src, @ UWORD8 *pu1_pred, @ UWORD8 *pu1_out, @@ -368,6 +368,7 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_a9: vmov.u16 q15, #0x00ff + vld1.u8 d18, [r2], r0 @load out [8 bit size) -8 coeffs vaddw.u8 q1, q0, d2 @Add pred vld1.u8 d19, [r2], r0 diff --git a/common/arm/ih264_itrans_recon_a9.s b/common/arm/ih264_itrans_recon_a9.s index 1d74da5..769d5d7 100644 --- a/common/arm/ih264_itrans_recon_a9.s +++ b/common/arm/ih264_itrans_recon_a9.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @ ******************************************************************************* @ * @file @ * ih264_itrans_recon_neon_a9.s @@ -33,8 +33,8 @@ @ * None @ * @ ******************************************************************************* -@*/ -@/** +@* +@** @ ******************************************************************************* @ * @ * @brief @@ -72,7 +72,7 @@ @ * @ * @ ******************************************************************************* -@ */ +@ * @void ih264_itrans_recon_4x4( @ WORD16 *pi2_src, @ UWORD8 *pu1_pred, diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s index 2808897..39ad9b3 100644 --- a/common/arm/ih264_mem_fns_neon.s +++ b/common/arm/ih264_mem_fns_neon.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @ ******************************************************************************* @ * @file @ * ih264_mem_fns_neon.s @@ -40,9 +40,9 @@ @ * None @ * @ ******************************************************************************* -@*/ +@* -@/** +@** @******************************************************************************* @* @* @brief @@ -65,7 +65,7 @@ @* None @* @******************************************************************************* -@*/ +@* @void ih264_memcpy_mul_8(UWORD8 *pu1_dst, @ UWORD8 *pu1_src, @ UWORD8 num_bytes) @@ -94,7 +94,7 @@ loop_neon_memcpy_mul_8: @******************************************************************************* -@*/ +@* @void ih264_memcpy(UWORD8 *pu1_dst, @ UWORD8 *pu1_src, @ UWORD8 num_bytes) @@ -143,6 +143,8 @@ loop_memcpy: + + .global ih264_memset_mul_8_a9q ih264_memset_mul_8_a9q: @@ -208,6 +210,8 @@ loop_memset: + + .global ih264_memset_16bit_mul_8_a9q ih264_memset_16bit_mul_8_a9q: diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s index 9bab268..e7a1f91 100644 --- a/common/arm/ih264_padding_neon.s +++ b/common/arm/ih264_padding_neon.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@* @ ******************************************************************************* @ * @file @ * ih264_padding_neon.s @@ -39,10 +39,10 @@ @ * None @ * @ ******************************************************************************* -@*/ +@* -@/** +@** @******************************************************************************* @* @* @brief pad at the top of a 2d array @@ -67,7 +67,7 @@ @* @remarks none @* @******************************************************************************* -@*/ +@* @void ih264_pad_top(UWORD8 *pu1_src, @ WORD32 src_strd, @ WORD32 wd, @@ -110,7 +110,7 @@ loop_neon_pad_top: -@/** +@** @******************************************************************************* @* @* @brief @@ -147,7 +147,7 @@ loop_neon_pad_top: @* None @* @******************************************************************************* -@*/ +@* @#if PAD_LEFT_LUMA == C @void ih264_pad_left_luma(UWORD8 *pu1_src, @ WORD32 src_strd, @@ -160,6 +160,7 @@ loop_neon_pad_top: @ r3 => pad_size + .global ih264_pad_left_luma_a9q ih264_pad_left_luma_a9q: @@ -245,7 +246,7 @@ end_func: -@/** +@** @******************************************************************************* @* @* @brief @@ -282,7 +283,7 @@ end_func: @* None @* @******************************************************************************* -@*/ +@* @#if PAD_LEFT_CHROMA == C @void ih264_pad_left_chroma(UWORD8 *pu1_src, @ WORD32 src_strd, @@ -373,7 +374,7 @@ end_func_l_c: -@/** +@** @******************************************************************************* @* @* @brief @@ -410,7 +411,7 @@ end_func_l_c: @* None @* @******************************************************************************* -@*/ +@* @#if PAD_RIGHT_LUMA == C @void ih264_pad_right_luma(UWORD8 *pu1_src, @ WORD32 src_strd, @@ -519,7 +520,7 @@ end_func_r: -@/** +@** @******************************************************************************* @* @* @brief @@ -556,7 +557,7 @@ end_func_r: @* None @* @******************************************************************************* -@*/ +@* @#if PAD_RIGHT_CHROMA == C @void ih264_pad_right_chroma(UWORD8 *pu1_src, @ WORD32 src_strd, diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s deleted file mode 100644 index 08821f5..0000000 --- a/common/arm/ih264_resi_trans_a9.s +++ /dev/null @@ -1,604 +0,0 @@ -@/****************************************************************************** -@ * -@ * Copyright (C) 2015 The Android Open Source Project -@ * -@ * Licensed under the Apache License, Version 2.0 (the "License"); -@ * you may not use this file except in compliance with the License. -@ * You may obtain a copy of the License at: -@ * -@ * http://www.apache.org/licenses/LICENSE-2.0 -@ * -@ * Unless required by applicable law or agreed to in writing, software -@ * distributed under the License is distributed on an "AS IS" BASIS, -@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -@ * See the License for the specific language governing permissions and -@ * limitations under the License. -@ * -@ ***************************************************************************** -@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore -@*/ -@/** -@******************************************************************************* -@* @file -@* ih264_resi_trans_a9.s -@* -@* @brief -@* Contains function definitions for residual and forward trans -@* -@* @author -@* Ittiam -@* -@* @par List of Functions: -@* ih264_resi_trans_4x4_a9 -@* ih264_resi_trans_8x8_a9 -@* @remarks -@* None -@* -@******************************************************************************* - - -.text -.p2align 2 -@***************************************************************************** -@* -@* Function Name : ih264_resi_trans_4x4_a9 -@* Description : This function does cf4 of H264 followed by and approximate scaling -@* -@* Arguments : -@ R0 :pointer to src buffer -@ R1 :pointer to pred buffer -@ R2 :pointer to dst buffer -@ R3 :src_stride -@ STACk :pred_stride,dst_stride - -@* Values Returned : NONE -@* -@* Register Usage : -@* Stack Usage : -@* Cycles : Around -@* Interruptiaility : Interruptable -@* -@* Known Limitations -@* \Assumptions : -@* -@* Revision History : -@* DD MM YYYY Author(s) Changes -@* 30 12 2009 100633 First version -@* -@***************************************************************************** - - - .global ih264_resi_trans_4x4_a9 - .extern g_scal_coff_h264_4x4 -g_scal_coff_h264_4x4_addr: - .long g_scal_coff_h264_4x4 - 4x4lbl - 8 - -ih264_resi_trans_4x4_a9: - - @R0 :pointer to src buffer - @R1 :pointer to pred buffer - @R2 :pointer to dst buffer - @R3 :src_stride - @STACk :pred_stride,dst_stride - - push {r4-r12, lr} @push all the variables first - - mov r6, sp - add r6, r6, #40 @decrement stack pointer,to accomodate two variables - ldmfd r6, {r4-r5} @load the strides into registers - @R4 pred_stride - @R5 dst_stride - - - @we have to give the stride as post inrement in VLDR1 - @but since thr stride is from end of row 1 to start of row 2, - @we need to add the size of the curent row to strides ie we need to add 4 to it (4 bytes) - @ADD R3,#4 - @ADD R4,#4 - @ADD R5,#4 - @in case of dst the stride represnts 16 bit ie 2*8bits - @hence we need to add #4 to it and thenm multiply by 2 - @--------------------function loading done------------------------ - - @lets find residual - @data is like 1a -> d0[1:31] d0[32:64] - @ a b c d # # # # - vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer - vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer - @ data is like 1a -> q4[1:63] q4[64:148] - @ d8[1:63] d9[1:63] - @ a b c d # # # # - - vld1.u8 d28, [r0], r3 @load row 2 of src to d28[0] - vld1.u8 d29, [r1], r4 @load row2 of pred to d29[0] - - vld1.u8 d26, [r0], r3 @load row 3 of src to d26[0] - vsubl.u8 q0, d30, d31 @curr - pred for row one - - vld1.u8 d27, [r1], r4 @load row 3of pred t0 d27[0] - vsubl.u8 q1, d28, d29 @find row 2 of src -pred to d0 - - vld1.u8 d24, [r0], r3 @load row 4 of src to d24[0] - - vld1.u8 d25, [r1], r4 @load row 4 of src tp d25[0] - vsubl.u8 q2, d26, d27 @load src-pred row 3 to d[2] - - lsl r5, r5, #2 @ multiply dst stride by since we are storing 32 bit values - ldr r6, g_scal_coff_h264_4x4_addr -4x4lbl: - add r6, r6, pc @ load the address of global array - - vsubl.u8 q3, d24, d25 @load row 4 of src - pred to q6 - - @after this - @D0 -> 1a - @D2 -> 2a - @D4 -> 3a - @D6 -> 4a - - @transpose the matrix so that we can do the horizontal transform first - @#1 #2 #3 #4 - @a b c d ---- D0 - @e f g h -----D2 - @i j k l -----D4 - @m n o p -----D6 - @transpose the inner 2x2 blocks - vtrn.16 d0, d2 - vld1.s16 {q10}, [r6]! @ load the scaling values 0-7; - vtrn.16 d4, d6 - @a e c g - @b f d h - @i m k o - @j n l p - vtrn.32 d0, d4 - vtrn.32 d2, d6 - @a e i m #1 -- D0 --- x4 - @b f j n #2 -- D2 --- x5 - @c g k o #3 -- D4 ----x6 - @d h l p #4 -- D6 ----x7 - - @we have loaded the residuals into the registers , now we need to add and subtract them - @let us do the horiz transform first - - vsub.s16 d5, d2, d4 @x2 = x5-x6 - vsub.s16 d7, d0, d6 @x3 = x4-x7; - - vadd.s16 d3, d2, d4 @x1 = x5+x6 - vadd.s16 d1, d0, d6 @x0 = x4+x7 - - - vshl.s16 d31, d7, #1 @ - vshl.s16 d30, d5, #1 @ - - vadd.s16 d0, d1, d3 @x0 + x1; - vsub.s16 d4, d1, d3 @x0 - x1; - - vadd.s16 d2, d31, d5 @U_SHIFT(x3,1,shft) + x2; - vsub.s16 d6, d7, d30 @x3 - U_SHIFT(x2,1,shft); - - @taking transform again so as to make do vert transform - vtrn.16 d0, d2 - vtrn.16 d4, d6 - - vtrn.32 d0, d4 - vtrn.32 d2, d6 - - @let us do vertical transform - @same code as horiz - - vadd.s16 d1, d0, d6 @x0 = x4+x7 - vadd.s16 d3, d2, d4 @x1 = x5+x6 - vsub.s16 d7, d0, d6 @x3 = x4-x7; - vsub.s16 d5, d2, d4 @x2 = x5-x6 - - -@Since we are going to do scal / quant or whatever, we are going to divide by -@a 32 bit number. So we have to expand the values - - @VADDL.S16 Q12,D1,D3;x0 + x1 - @VSUBL.S16 Q14,D1,D3;x0 - x1 - - @VSHL.S16 D8,D5,#1; - @VSHL.S16 D9,D7,#1; - - @VADDL.S16 Q13,D9,D5 ; + x2 - @VSUBL.S16 Q15,D7,D8 ;x3 - U_SHIFT(x2,1,shft) - -@scaling follows - -@now we need to do the scaling,so load the scaling matrix -@mutliplying by the scaling coeffient; store the results from q5-q8 ; - - vadd.s16 d24, d3, d1 @x4 = x0 + x1 - vsub.s16 d28, d1, d3 @x6 = x0 - x1 - - vshl.s16 d0, d7, #1 @ U_SHIFT(x3,1,shft) - vmull.s16 q4, d24, d20 @x4*s0 - - vshl.s16 d2, d5, #1 @ U_SHIFT(x2,1,shft) - - vadd.s16 d26, d0, d5 @x5 = U_SHIFT(x3,1,shft) + x2 - vmull.s16 q5, d26, d21 @x5*s1 - - vst1.s32 {q4}, [r2], r5 @save 4 pixels of row1 current buffer and increment pointer by stride - - vld1.s16 {q10}, [r6] @load 8-16 scaling coeffcients - - vsub.s16 d30, d7, d2 @x7 = x3 - U_SHIFT(x2,1,shft) - - vmull.s16 q6, d28, d20 @x6*s2 - vst1.s32 {q5}, [r2], r5 - - vmull.s16 q7, d30, d21 @x7*s3 - - - vst1.s32 {q6}, [r2], r5 - vst1.s32 {q7}, [r2] - - pop {r4-r12, pc} @pop back all variables - - - - -@***************************************************************************** -@* Function Name : ih264_resi_trans_8x8_a9 -@* Description : This function does cf8 followd by an approximate normalization of H264 -@* -@* Arguments : -@* R0 :pointer to src buffer -@ R1 :pointer to pred buffer -@ R2 :pointer to dst buffer -@ R3 :src_stride -@ STACk :pred_stride,dst_st -@* -@* -@* Values Returned : NONE -@* -@* Register Usage : -@* Stack Usage : -@* Cycles : Around -@* Interruptiaility : Interruptable -@* -@* Known Limitations -@* \Assumptions : -@* -@* Revision History : -@* DD MM YYYY Author(s) Changes -@* 30 12 2009 100633 First version -@* -@***************************************************************************** - - - .global ih264_resi_trans_8x8_a9 - .extern g_scal_coff_h264_8x8 -g_scal_coff_h264_8x8_addr: - .long g_scal_coff_h264_8x8 - 8x8lbl - 8 - - -ih264_resi_trans_8x8_a9: - - @R0 :pointer to src buffer - @R1 :pointer to pred buffer - @R2 :pointer to dst buffer - @R3 :src_stride - @STACk :pred_stride,dst_stride - - push {r4-r12, lr} @push all the variables first - - mov r6, sp - add r6, r6, #40 @decrement stack pointer,to accomodate two variables - ldmfd r6, {r4-r5} @load the strides into registers - @R4 pred_stride - @R5 dst_stride - - @we have to give the stride as post inrement in vst1 - @in case of dst the stride represnts 16 bit ie 2*8bits - @hence we need to add #4 to it and thenm multiply by 2 - @--------------------function loading done------------------------ - - @lets find residual - @data is like 1a -> d0[1:31] d0[32:64] - @ a b c d # # # # - vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer - vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer - - vld1.u8 d28, [r0], r3 @src rw2 - vld1.u8 d29, [r1], r4 @pred rw2 - vsubl.u8 q0, d30, d31 @src-pred rw1 - - vld1.u8 d26, [r0], r3 - vld1.u8 d27, [r1], r4 - vsubl.u8 q1, d28, d29 - - vld1.u8 d24, [r0], r3 - vld1.u8 d25, [r1], r4 - vsubl.u8 q2, d26, d27 - - vld1.u8 d22, [r0], r3 - vld1.u8 d23, [r1], r4 - vsubl.u8 q3, d24, d25 - - vld1.u8 d20, [r0], r3 - vld1.u8 d21, [r1], r4 - vsubl.u8 q4, d22, d23 - - vld1.u8 d18, [r0], r3 - vld1.u8 d19, [r1], r4 - vsubl.u8 q5, d20, d21 - - vld1.u8 d16, [r0], r3 - vld1.u8 d17, [r1], r4 - vsubl.u8 q6, d18, d19 - - lsl r5, r5, #2 - - - vsubl.u8 q7, d16, d17 - - @after this - @Q0 -> 1a - @Q1 -> 2a - @Q2 -> 3a - @Q3 -> 4a - @Q4 -> 5a - @Q5 -> 6a - @Q6 -> 7a - @Q7 -> 8a - - @transpose the matrix so that we can do the horizontal transform first - - @transpose the inner 2x2 blocks - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - @transpose the inner 4x4 blocks - vtrn.32 q0, q2 - vtrn.32 q1, q3 - - vtrn.32 q4, q6 - vtrn.32 q5, q7 - - @transpose the outer 8x8 blocks - vswp d1, d8 - vswp d7, d14 - vswp d3, d10 - vswp d5, d12 - @transpose done - -@@this point we will have data in Q0-Q7 -@Q7 will be populated within 2 clock cycle -@all others are availabe @ this clock cycle - - @we have loaded the residuals into the registers , now we need to add and subtract them - @let us do the horiz transform first - - vadd.s16 q8, q0, q7 @ a0 = r0 + r7; - vadd.s16 q9, q1, q6 @ a1 = r1 + r6; - vadd.s16 q10, q2, q5 @ a2 = r2 + r5; - vadd.s16 q11, q3, q4 @ a3 = r3 + r4; - - vsub.s16 q12, q0, q7 @ b0 = r0 - r7; - vsub.s16 q13, q1, q6 @ b1 = r1 - r6; - vsub.s16 q15, q3, q4 @ b3 = r3 - r4; - vsub.s16 q14, q2, q5 @ b2 = r2 - r5; - - vadd.s16 q1, q8, q11 @ a4 = a0 + a3; - vadd.s16 q3, q9, q10 @ a5 = a1 + a2; - vsub.s16 q7, q9, q10 @ a7 = a1 - a2; - vsub.s16 q5, q8, q11 @ a6 = a0 - a3; - - ldr r6, g_scal_coff_h264_8x8_addr -8x8lbl: - add r6, r6, pc @ load the address of global array - - vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5; - vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft); - - vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5; - - vadd.s16 q2, q5, q8 @ - - - vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7; - vsub.s16 q6, q9, q7 @ - -@do not change Q0,Q2.Q4,Q6 they contain results -@Q1,Q3,Q5,Q7 TO STORE RESULTS -@Q8 Q9 Q10 Q11 USE @WILL - - vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft) - vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft) - vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft) - vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft) - - vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0); - vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1); - vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2); - vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3); - - vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0); - vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1); - vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2); - vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3); - - vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0); - vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2); - vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1); - vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3); - - vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft) - vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft); - vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft); - vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft); - - - vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft); - vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft); - vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft); - vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7; - - @------------horiz transform done------------------------- - @results are in Q0-Q7 - @all other neon registes can be used at will - -@doing vertical transform -@code exact copy of horiz transform above - - @transpose the inner 2x2 blocks - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - @transpose the inner 4x4 blocks - vtrn.32 q0, q2 - vtrn.32 q1, q3 - - vtrn.32 q4, q6 - vtrn.32 q5, q7 - - @transpose the outer 8x8 blocks - vswp d1, d8 - vswp d3, d10 - vswp d5, d12 - vswp d7, d14 - - @transpose done - - vadd.s16 q8, q0, q7 @ a0 = r0 + r7; - vadd.s16 q9, q1, q6 @ a1 = r1 + r6; - vadd.s16 q10, q2, q5 @ a2 = r2 + r5; - vadd.s16 q11, q3, q4 @ a3 = r3 + r4; - - vsub.s16 q12, q0, q7 @ b0 = r0 - r7; - vsub.s16 q13, q1, q6 @ b1 = r1 - r6; - vsub.s16 q14, q2, q5 @ b2 = r2 - r5; - vsub.s16 q15, q3, q4 @ b3 = r3 - r4; - - vadd.s16 q1, q8, q11 @ a4 = a0 + a3; - vadd.s16 q3, q9, q10 @ a5 = a1 + a2; - vsub.s16 q5, q8, q11 @ a6 = a0 - a3; - vsub.s16 q7, q9, q10 @ a7 = a1 - a2; - - - vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5; - - vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft); - @DSHIFT_TO_0 Q8,Q7,#1,#0 - vadd.s16 q2, q5, q8 @ - - vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5; - - vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7; - vsub.s16 q6, q9, q7 @ - -@do not change Q0,Q2.Q4,Q6 they contain results -@Q1,Q3,Q5,Q7 TO STORE RESULTS -@Q8 Q9 Q10 Q11 USE @WILL - - vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft) - vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft) - vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft) - vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft) - - - vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0); - vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1); - vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2); - vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3); - - vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0); - vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2); - vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1); - vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3); - - vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0); - vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2); - vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1); - vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3); - - vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft) - vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft); - vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft); - vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft); - - -@since we are going to scal by small values, we need not expand the guys to 32 bit bit values - vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft); - vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7; - vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft); - vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft); - - @------------vert transform done------------------------- - @results are in Q0-Q7 - @all other neon registes can be used at will - - @scaling - @since the 8x8 scaling matrix repeats in 1x4,1x4 block , - @we need only load 4 values for each row and in total 4 rows - vld1.s16 {q14-q15}, [r6] @ - - @since we need to get a 32 bit o/p for two 16 bit multiplications - @we need a VMULL instruction -@-----------------------------first and second row - - vmull.s16 q8, d0, d28 @scale the first row first 4 elem - vmull.s16 q9, d28, d1 @scale the second row last 4 elemts - - vmull.s16 q10, d2, d29 @ scale second row first 4 elem - vmull.s16 q11, d29, d3 @scale the second row last 4 elem - vmull.s16 q12, d4, d30 @scale third row first 4 elem - - vst1.s32 {q8, q9}, [r2], r5 @ write the first row complete - - vmull.s16 q13, d30, d5 @scale the third row last 4 elem - vmull.s16 q8, d6, d31 @scale the fourth row first 4 elem - - - vst1.s32 {q10, q11}, [r2], r5 @store the second row complete - -@------------------------------- 3rd and 4th row - - vmull.s16 q9, d31, d7 @scale the fourth row second column - - vst1.s32 {q12, q13}, [r2], r5 @store the third row complete - - vmull.s16 q10, d8, d28 @scale the 5th row fisrst 4 elms - vmull.s16 q11, d28, d9 @scale the 5th row second 4 elems - - vmull.s16 q12, d10, d29 @scale the 6th row first4 elements - - - vst1.s32 {q8, q9}, [r2], r5 @store fifth row - -@--------------------------------5th and 6th row - - vmull.s16 q13, d29, d11 @scale 6th row sendond 4 elems - - vmull.s16 q8, d12, d30 @scale 7th rw first 4 elms - - vst1.s32 {q10, q11}, [r2], r5 @store 6th row second 4 elements - - vmull.s16 q9, d30, d13 @scale 7th rw second 4 elms - vmull.s16 q10, d14, d31 @scale 8th rw forst 4 elms - - - vst1.s32 {q12, q13}, [r2], r5 @store 6th row - -@----------------------------------7th and 8th row - vmull.s16 q11, d31, d15 @scale 8th row second 4 elms - - vst1.s32 {q8, q9}, [r2], r5 @store 7th row - vst1.s32 {q10, q11}, [r2], r5 @store 8th row - -@----------------------------------done writing - - pop {r4-r12, pc} @pop back all variables - - - - - - diff --git a/common/arm/ih264_resi_trans_quant_a9.s b/common/arm/ih264_resi_trans_quant_a9.s index caf362e..bb836bd 100644 --- a/common/arm/ih264_resi_trans_quant_a9.s +++ b/common/arm/ih264_resi_trans_quant_a9.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @******************************************************************************* @* @file @* ih264_resi_trans_quant_a9.s diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s index ccae779..33859e6 100644 --- a/common/arm/ih264_weighted_bi_pred_a9q.s +++ b/common/arm/ih264_weighted_bi_pred_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_weighted_bi_pred_a9q.s @@ -37,7 +37,7 @@ @* None @* @******************************************************************************* -@*/ +@* @******************************************************************************* @* @function @* ih264_weighted_bi_pred_luma_a9q() @@ -96,7 +96,7 @@ @* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16). @* @******************************************************************************* -@*/ +@* @void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1, @ UWORD8 *pu1_src2, @ UWORD8 *pu1_dst, @@ -411,7 +411,7 @@ end_loops: @* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8). @* @******************************************************************************* -@*/ +@* @void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1, @ UWORD8 *pu1_src2, @ UWORD8 *pu1_dst, diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s index 1ce94d0..81d26d4 100644 --- a/common/arm/ih264_weighted_pred_a9q.s +++ b/common/arm/ih264_weighted_pred_a9q.s @@ -17,7 +17,7 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** @****************************************************************************** @* @file @* ih264_weighted_pred_a9q.s @@ -37,7 +37,7 @@ @* None @* @******************************************************************************* -@*/ +@* @******************************************************************************* @* @function @* ih264_weighted_pred_luma_a9q() @@ -84,7 +84,7 @@ @* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16). @* @******************************************************************************* -@*/ +@* @void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, @@ -314,7 +314,7 @@ end_loops: @* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8). @* @******************************************************************************* -@*/ +@* @void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src, @ UWORD8 *pu1_dst, @ WORD32 src_strd, diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s index aefb902..6823015 100644 --- a/common/armv8/ih264_default_weighted_pred_av8.s +++ b/common/armv8/ih264_default_weighted_pred_av8.s @@ -24,7 +24,6 @@ //* //* @brief //* Contains function definitions for default weighted prediction. -//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT //* //* @author //* Kaushik Senthoor R diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s index 38934c9..9564f99 100644 --- a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s +++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s @@ -247,8 +247,8 @@ loop_16: //when wd=16 st1 {v30.2s, v31.2s}, [x1], x3 // store row 6 sqrshrun v30.8b, v28.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5) - swp v0.8b v4.8b - swp v1.8b v5.8b + swp v0.8b, v4.8b + swp v1.8b, v5.8b @@ -257,8 +257,8 @@ loop_16: //when wd=16 mov v7.8b, v11.8b subs x12, x14, #1 // if height==16 - looping - swp v4.8b v8.8b - swp v5.8b v9.8b + swp v4.8b, v8.8b + swp v5.8b, v9.8b sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5) diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s index ea7645e..202c516 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s @@ -68,7 +68,7 @@ ih264_inter_pred_luma_horz_hpel_vert_hpel_av8: - //store register values to stack + //store register values to stack push_v_regs stp x19, x20, [sp, #-16]! @@ -811,7 +811,7 @@ loop_4: bgt loop_4 end_func: - //Restoring registers from stack + //Restoring registers from stack ldp x19, x20, [sp], #16 pop_v_regs ret diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s index 3737e3f..38f971b 100644 --- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s +++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s @@ -1111,7 +1111,7 @@ loop_4: bgt loop_4 end_func: - //Restoring registers from stack + //Restoring registers from stack ldp x19, x20, [sp], #16 pop_v_regs ret diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s index 62edfdc..2c5efb3 100644 --- a/common/armv8/ih264_intra_pred_chroma_av8.s +++ b/common/armv8/ih264_intra_pred_chroma_av8.s @@ -262,7 +262,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8: - push_v_regs + push_v_regs ld1 {v0.8h}, [x0] dup v10.8h, v0.h[7] diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s index f7d0846..96ef50a 100644 --- a/common/armv8/ih264_weighted_bi_pred_av8.s +++ b/common/armv8/ih264_weighted_bi_pred_av8.s @@ -24,7 +24,6 @@ //* //* @brief //* Contains function definitions for weighted biprediction. -//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT //* //* @author //* Kaushik Senthoor R diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s index 6a03875..ec5bb7a 100644 --- a/common/armv8/ih264_weighted_pred_av8.s +++ b/common/armv8/ih264_weighted_pred_av8.s @@ -24,7 +24,6 @@ //* //* @brief //* Contains function definitions for weighted prediction. -//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT //* //* @author //* Kaushik Senthoor R diff --git a/common/ih264_dpb_mgr.c b/common/ih264_dpb_mgr.c index 8e087d3..9380b7e 100644 --- a/common/ih264_dpb_mgr.c +++ b/common/ih264_dpb_mgr.c @@ -536,7 +536,7 @@ WORD32 ih264_dpb_mgr_alternate_ref_fields(dpb_mgr_t *ps_dpb_mgr, BOTTOM_FIELD:TOP_FIELD; } - if((reference_type == SHORT_TERM_REF)) + if(reference_type == SHORT_TERM_REF) { ps_dpb_mgr->ps_dpb_short_term_head = ps_dpb_head->ps_prev_dpb; } diff --git a/common/ithread.c b/common/ithread.c index 4ffb98a..25a8cd0 100644 --- a/common/ithread.c +++ b/common/ithread.c @@ -327,6 +327,11 @@ WORD32 ithread_set_affinity(WORD32 core_id) return 1; } +void ithread_set_name(CHAR *pc_thread_name) +{ + return; +} + #else UWORD32 ithread_get_handle_size(void) diff --git a/common/x86/ih264_deblk_luma_ssse3.c b/common/x86/ih264_deblk_luma_ssse3.c index 440d5f0..e29bebb 100644 --- a/common/x86/ih264_deblk_luma_ssse3.c +++ b/common/x86/ih264_deblk_luma_ssse3.c @@ -856,7 +856,7 @@ void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src, { UWORD8 u1_Bs, u1_Bs1; - UWORD32 j = 0; + WORD32 j = 0; __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; __m128i int1, int2, int3, int4, high1, high2; diff --git a/common/x86/ih264_ihadamard_scaling_sse42.c b/common/x86/ih264_ihadamard_scaling_sse42.c index 895291b..d68d105 100644 --- a/common/x86/ih264_ihadamard_scaling_sse42.c +++ b/common/x86/ih264_ihadamard_scaling_sse42.c @@ -86,14 +86,19 @@ * ******************************************************************************* */ -void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out, - const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, - UWORD32 u4_qp_div_6, WORD32* pi4_tmp) { +void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, + WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD32* pi4_tmp) +{ __m128i src_r0_r1, src_r2_r3; __m128i src_r0, src_r1, src_r2, src_r3; __m128i temp0, temp1, temp2, temp3; __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6))); __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); + UNUSED (pi4_tmp); src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row @@ -171,12 +176,15 @@ void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out, src_r3 = _mm_mullo_epi32(src_r3, mult_val); //Scaling - if (u4_qp_div_6 >= 6) { + if(u4_qp_div_6 >= 6) + { src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6); src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6); src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6); src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6); - } else { + } + else + { temp0 = _mm_add_epi32(src_r0, add_rshift); temp1 = _mm_add_epi32(src_r1, add_rshift); temp2 = _mm_add_epi32(src_r2, add_rshift); @@ -194,16 +202,17 @@ void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out, } void ih264_ihadamard_scaling_2x2_uv_sse42(WORD16* pi2_src, - WORD16* pi2_out, - const UWORD16 *pu2_iscal_mat, - const UWORD16 *pu2_weigh_mat, - UWORD32 u4_qp_div_6, - WORD32* pi4_tmp) + WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD32* pi4_tmp) { - UNUSED(pi4_tmp); __m128i src, plane_0, plane_1, temp0, temp1, sign_reg; __m128i zero_8x16b = _mm_setzero_si128(); __m128i scale_val = _mm_set1_epi32((WORD32)(pu2_iscal_mat[0] * pu2_weigh_mat[0])); + UNUSED(pi4_tmp); + src = _mm_loadu_si128((__m128i *) pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3 sign_reg = _mm_cmpgt_epi16(zero_8x16b, src); plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits diff --git a/common/x86/ih264_ihadamard_scaling_ssse3.c b/common/x86/ih264_ihadamard_scaling_ssse3.c index 232d9fa..1b940ea 100644 --- a/common/x86/ih264_ihadamard_scaling_ssse3.c +++ b/common/x86/ih264_ihadamard_scaling_ssse3.c @@ -85,9 +85,13 @@ * ******************************************************************************* */ -void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out, - const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat, - UWORD32 u4_qp_div_6, WORD32* pi4_tmp) { +void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, + WORD16* pi2_out, + const UWORD16 *pu2_iscal_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 u4_qp_div_6, + WORD32* pi4_tmp) +{ int val = 0xFFFF; __m128i src_r0_r1, src_r2_r3, sign_reg, zero_8x16b = _mm_setzero_si128(); __m128i src_r0, src_r1, src_r2, src_r3; @@ -96,6 +100,8 @@ void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out, __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); __m128i mask = _mm_set1_epi32(val); + UNUSED (pi4_tmp); + mult_val = _mm_and_si128(mult_val, mask); src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row @@ -177,12 +183,15 @@ void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out, src_r3 = _mm_madd_epi16(src_r3, mult_val); //Scaling - if (u4_qp_div_6 >= 6) { + if(u4_qp_div_6 >= 6) + { src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6); src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6); src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6); src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6); - } else { + } + else + { temp0 = _mm_add_epi32(src_r0, add_rshift); temp1 = _mm_add_epi32(src_r1, add_rshift); temp2 = _mm_add_epi32(src_r2, add_rshift); diff --git a/common/x86/ih264_inter_pred_filters_ssse3.c b/common/x86/ih264_inter_pred_filters_ssse3.c index 64e364e..6d318c9 100644 --- a/common/x86/ih264_inter_pred_filters_ssse3.c +++ b/common/x86/ih264_inter_pred_filters_ssse3.c @@ -98,11 +98,10 @@ void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src, { __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b; + WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4; UNUSED(pu1_tmp); UNUSED(dydx); - WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4; - src_strd2 = src_strd << 1; dst_strd2 = dst_strd << 1; src_strd4 = src_strd << 2; @@ -1825,7 +1824,6 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src, WORD32 y_offset; UWORD8 *pu1_pred1; - UNUSED(pu1_tmp); __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b; __m128i src_r5_16x8b, src_r6_16x8b; @@ -1835,6 +1833,7 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src, __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; __m128i const_val16_8x16b; + UNUSED(pu1_tmp); y_offset = dydx & 0xf; coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 diff --git a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c index d43c8e2..565cc75 100644 --- a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c +++ b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c @@ -113,6 +113,8 @@ void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src, UWORD32 *pu4_out = (UWORD32 *)pu1_out; WORD32 q0 = pi2_src[0]; WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0; + UNUSED (pi2_tmp); + INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4); if (iq_start_idx != 0 ) @@ -233,6 +235,10 @@ void ih264_iquant_itrans_recon_8x8_dc_ssse3 (WORD16 *pi2_src, { WORD32 q0 = pi2_src[0]; WORD16 i_macro, rnd_fact = (qp_div < 6) ? 1 << (5 - qp_div) : 0; + UNUSED (pi2_tmp); + UNUSED (iq_start_idx); + UNUSED (pi2_dc_ld_addr); + INV_QUANT(q0, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6); i_macro = ((q0 + 32) >> 6); @@ -392,6 +398,12 @@ void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src, __m128i chroma_mask = _mm_set1_epi16 (0xFF); __m128i value_add = _mm_set1_epi16(i_macro); + UNUSED (pi2_src); + UNUSED (pu2_iscal_mat); + UNUSED (pu2_weigh_mat); + UNUSED (u4_qp_div_6); + UNUSED (pi2_tmp); + //Load pred buffer pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits diff --git a/common/x86/ih264_iquant_itrans_recon_sse42.c b/common/x86/ih264_iquant_itrans_recon_sse42.c index 2a4ea3f..6399b65 100644 --- a/common/x86/ih264_iquant_itrans_recon_sse42.c +++ b/common/x86/ih264_iquant_itrans_recon_sse42.c @@ -120,6 +120,7 @@ void ih264_iquant_itrans_recon_4x4_sse42(WORD16 *pi2_src, __m128i resq_r0, resq_r1, resq_r2, resq_r3; __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6))); __m128i value_32 = _mm_set1_epi32(32); + UNUSED (pi2_tmp); /*************************************************************/ /* Dequantization of coefficients. Will be replaced by SIMD */ @@ -369,6 +370,8 @@ void ih264_iquant_itrans_recon_chroma_4x4_sse42(WORD16 *pi2_src, __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6))); __m128i value_32 = _mm_set1_epi32(32); __m128i chroma_mask = _mm_set1_epi16 (0xFF); + UNUSED (pi2_tmp); + /*************************************************************/ /* Dequantization of coefficients. Will be replaced by SIMD */ /* operations on platform */ diff --git a/common/x86/ih264_iquant_itrans_recon_ssse3.c b/common/x86/ih264_iquant_itrans_recon_ssse3.c index ca1397e..388cafe 100644 --- a/common/x86/ih264_iquant_itrans_recon_ssse3.c +++ b/common/x86/ih264_iquant_itrans_recon_ssse3.c @@ -120,6 +120,8 @@ void ih264_iquant_itrans_recon_4x4_ssse3(WORD16 *pi2_src, __m128i resq_r0, resq_r1, resq_r2, resq_r3; __m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6))); __m128i value_32 = _mm_set1_epi32(32); + UNUSED (pi2_tmp); + UNUSED (pi2_dc_ld_addr); /*************************************************************/ /* Dequantization of coefficients. Will be replaced by SIMD */ @@ -397,6 +399,9 @@ void ih264_iquant_itrans_recon_8x8_ssse3(WORD16 *pi2_src, __m128i resq_r0_1, resq_r0_2, resq_r1_1, resq_r1_2, resq_r2_1, resq_r2_2, resq_r3_1, resq_r3_2, resq_r4_1, resq_r4_2, resq_r5_1, resq_r5_2, resq_r6_1, resq_r6_2, resq_r7_1, resq_r7_2; + UNUSED (pi2_tmp); + UNUSED (iq_start_idx); + UNUSED (pi2_dc_ld_addr); /*************************************************************/ /* Dequantization of coefficients. Will be replaced by SIMD */ diff --git a/common/x86/ih264_resi_trans_quant_sse42.c b/common/x86/ih264_resi_trans_quant_sse42.c index c267651..eca43ed 100644 --- a/common/x86/ih264_resi_trans_quant_sse42.c +++ b/common/x86/ih264_resi_trans_quant_sse42.c @@ -121,6 +121,9 @@ void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred, __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero __m128i sign_reg0, sign_reg2; __m128i scalemat_r0_r1, scalemat_r2_r3; + + UNUSED (pu2_threshold_matrix); + scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits @@ -394,6 +397,8 @@ void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WO __m128i scalemat_r0_r1, scalemat_r2_r3; __m128i chroma_mask = _mm_set1_epi16 (0xFF); + UNUSED (pu2_threshold_matrix); + scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits @@ -676,6 +681,8 @@ void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst, __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]); + UNUSED (pu2_threshold_matrix); + src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); @@ -902,6 +909,8 @@ void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst, __m128i temp_1 = _mm_set1_epi16(1); __m128i rnd_fact = _mm_set1_epi32(u4_round_factor); + UNUSED (pu2_threshold_matrix); + src = _mm_loadu_si128((__m128i *)pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3 sign_reg = _mm_cmpgt_epi16(zero_8x16b, src); plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits diff --git a/decoder/ih264d_thread_parse_decode.c b/decoder/ih264d_thread_parse_decode.c index be3cb01..1c9eb68 100644 --- a/decoder/ih264d_thread_parse_decode.c +++ b/decoder/ih264d_thread_parse_decode.c @@ -582,13 +582,9 @@ WORD32 ih264d_decode_slice_thread(dec_struct_t *ps_dec /* Decoder parameters */ void ih264d_decode_picture_thread(dec_struct_t *ps_dec ) { - volatile WORD32 i4_err_status; - ithread_set_name("ih264d_decode_picture_thread"); - - // run the loop till all slices are decoded while(1) @@ -644,13 +640,6 @@ void ih264d_decode_picture_thread(dec_struct_t *ps_dec ) DEBUG_THREADS_PRINTF("Waiting for next slice or end of frame\n"); NOP(32); - if(i4_err_status != 0) - { - /*In the case of error set decode Mb number ,so that the - parse thread does not wait because of mb difference being - greated the 32*/ - ps_dec->cur_dec_mb_num = ps_dec->u2_cur_mb_addr - 1; - } } DEBUG_THREADS_PRINTF("Got next slice/end of frame signal \n "); diff --git a/encoder/arm/ime_distortion_metrics_a9q.s b/encoder/arm/ime_distortion_metrics_a9q.s index b58911e..27fbe3d 100644 --- a/encoder/arm/ime_distortion_metrics_a9q.s +++ b/encoder/arm/ime_distortion_metrics_a9q.s @@ -17,9 +17,9 @@ @ ***************************************************************************** @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore @*/ -@/** +@** -@/** +@** @****************************************************************************** @* @* @@ -48,7 +48,7 @@ @ -@/** +@** @****************************************************************************** @* @* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode) @@ -79,59 +79,62 @@ @* @remarks @* @****************************************************************************** -@*/ +@* .text .p2align 2 + .global ime_compute_sad_16x16_fast_a9q + ime_compute_sad_16x16_fast_a9q: - stmfd sp!, {r12, lr} - lsl r2, r2, #1 - lsl r3, r3, #1 + stmfd sp!, {r12, lr} + vpush {d8-d15} + lsl r2, r2, #1 + lsl r3, r3, #1 @for bringing buffer2 into cache..., dummy load instructions - @ LDR r12,[r1] + @LDR r12,[r1] - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - mov r12, #6 - vld1.8 {d8, d9}, [r0], r2 - vabdl.u8 q0, d6, d4 - vabdl.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + mov r12, #6 + vld1.8 {d8, d9}, [r0], r2 + vabdl.u8 q0, d6, d4 + vabdl.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 loop_sad_16x16_fast: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 - vld1.8 {d6, d7}, [r1], r3 - subs r12, #2 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 - - bne loop_sad_16x16_fast + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 + vld1.8 {d6, d7}, [r1], r3 + subs r12, #2 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 + bne loop_sad_16x16_fast - vadd.i16 q0, q0, q1 - vadd.i16 d0, d1, d0 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 - ldr r12, [sp, #12] - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 - vshl.u32 d0, d0, #1 - vst1.32 {d0[0]}, [r12] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d1, d0 + vpop {d8-d15} + ldr r12, [sp, #12] + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vshl.u32 d0, d0, #1 + vst1.32 {d0[0]}, [r12] - ldmfd sp!, {r12, pc} + ldmfd sp!, {r12, pc} -@/** +@** @****************************************************************************** @* @* @brief computes distortion (SAD) between 2 16x8 blocks @@ -163,56 +166,57 @@ loop_sad_16x16_fast: @* @remarks @* @****************************************************************************** -@*/ +@* @ .global ime_compute_sad_16x8_a9q + ime_compute_sad_16x8_a9q: - stmfd sp!, {r12, lr} + stmfd sp!, {r12, lr} @for bringing buffer2 into cache..., dummy load instructions @LDR r12,[r1] - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - mov r12, #6 - vld1.8 {d8, d9}, [r0], r2 - vabdl.u8 q0, d6, d4 - vabdl.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + mov r12, #6 + vpush {d8-d15} + vld1.8 {d8, d9}, [r0], r2 + vabdl.u8 q0, d6, d4 + vabdl.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 loop_sad_16x8: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 - vld1.8 {d6, d7}, [r1], r3 - subs r12, #2 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 - - bne loop_sad_16x8 - - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 + vld1.8 {d6, d7}, [r1], r3 + subs r12, #2 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 - vadd.i16 q0, q0, q1 - vadd.i16 d0, d1, d0 + bne loop_sad_16x8 - ldr r12, [sp, #12] - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 - vst1.32 {d0[0]}, [r12] + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 - ldmfd sp!, {r12, pc} + vadd.i16 q0, q0, q1 + vadd.i16 d0, d1, d0 + vpop {d8-d15} + ldr r12, [sp, #12] + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vst1.32 {d0[0]}, [r12] + ldmfd sp!, {r12, pc} -@/** +@** @****************************************************************************** @* @* @brief computes distortion (SAD) between 2 16x16 blocks with early exit @@ -243,100 +247,103 @@ loop_sad_16x8: @* @remarks @* @****************************************************************************** -@*/ +@* + .global ime_compute_sad_16x16_ea8_a9q ime_compute_sad_16x16_ea8_a9q: - stmfd sp!, {r5-r7, lr} - lsl r2, r2, #1 - lsl r3, r3, #1 + stmfd sp!, {r5-r7, lr} + lsl r2, r2, #1 + lsl r3, r3, #1 @for bringing buffer2 into cache..., dummy load instructions @LDR r12,[r1] - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - mov r5, #6 - vld1.8 {d8, d9}, [r0], r2 - vabdl.u8 q0, d6, d4 - vabdl.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 - ldrd r6, r7, [sp, #16] + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + mov r5, #6 + ldrd r6, r7, [sp, #16] + vpush {d8-d15} + vld1.8 {d8, d9}, [r0], r2 + vabdl.u8 q0, d6, d4 + vabdl.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 + @r6 = i4_max_sad, r7 = pi4_mb_distortion loop_sad_16x16_ea8_1: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 - vld1.8 {d6, d7}, [r1], r3 - subs r5, #2 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 - - bne loop_sad_16x16_ea8_1 - - vabal.u8 q0, d10, d8 - sub r0, r0, r2, lsl #3 - vabal.u8 q1, d11, d9 - sub r1, r1, r3, lsl #3 - - vadd.i16 q6, q0, q1 - add r0, r0, r2, asr #1 - vadd.i16 d12, d12, d13 - add r1, r1, r3, asr #1 - - vpaddl.u16 d12, d12 - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - vpaddl.u32 d12, d12 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - - vst1.32 {d12[0]}, [r7] - ldr r5, [r7] - cmp r5, r6 - bgt end_func_16x16_ea8 - - vld1.8 {d10, d11}, [r1], r3 - mov r5, #6 + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 + vld1.8 {d6, d7}, [r1], r3 + subs r5, #2 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 + + bne loop_sad_16x16_ea8_1 + + vabal.u8 q0, d10, d8 + sub r0, r0, r2, lsl #3 + vabal.u8 q1, d11, d9 + sub r1, r1, r3, lsl #3 + + vadd.i16 q6, q0, q1 + add r0, r0, r2, asr #1 + vadd.i16 d12, d12, d13 + add r1, r1, r3, asr #1 + + vpaddl.u16 d12, d12 + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + vpaddl.u32 d12, d12 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + + vst1.32 {d12[0]}, [r7] + ldr r5, [r7] + cmp r5, r6 + bgt end_func_16x16_ea8 + + vld1.8 {d10, d11}, [r1], r3 + mov r5, #6 loop_sad_16x16_ea8_2: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 - vld1.8 {d6, d7}, [r1], r3 - subs r5, #2 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d6, d4 - vabal.u8 q1, d7, d5 - vld1.8 {d10, d11}, [r1], r3 + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 + vld1.8 {d6, d7}, [r1], r3 + subs r5, #2 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d6, d4 + vabal.u8 q1, d7, d5 + vld1.8 {d10, d11}, [r1], r3 - bne loop_sad_16x16_ea8_2 + bne loop_sad_16x16_ea8_2 - vabal.u8 q0, d10, d8 - vabal.u8 q1, d11, d9 + vabal.u8 q0, d10, d8 + vabal.u8 q1, d11, d9 - vadd.i16 q0, q0, q1 - vadd.i16 d0, d1, d0 + vadd.i16 q0, q0, q1 + vadd.i16 d0, d1, d0 - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 - vst1.32 {d0[0]}, [r7] + vst1.32 {d0[0]}, [r7] end_func_16x16_ea8: - - ldmfd sp!, {r5-r7, pc} + vpop {d8-d15} + ldmfd sp!, {r5-r7, pc} -@/* +@* @//--------------------------------------------------------------------------- @// Function Name : Calculate_Mad2_prog() @// @@ -346,7 +353,7 @@ end_func_16x16_ea8: @// Platform : CortexA8/NEON . @// @//----------------------------------------------------------------------------- -@*/ +@* .global ime_calculate_sad2_prog_a9q @@ -358,72 +365,72 @@ ime_calculate_sad2_prog_a9q: @ r3 = RefBufferWidth <UWORD32> @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *> - stmfd sp!, {r4-r5, lr} - - ldr r4, [sp, #8] @ load src stride to r4 - mov r5, #14 + stmfd sp!, {r4-r5, lr} + ldr r4, [sp, #8] @ load src stride to r4 + mov r5, #14 + vpush {d8-d15} @Row 1 - vld1.8 {d0, d1}, [r2], r4 @ load src Row 1 - vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1 - vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1 + vld1.8 {d0, d1}, [r2], r4 @ load src Row 1 + vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1 + vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1 @Row 2 - vld1.8 {d6, d7}, [r2], r4 @ load src Row 2 - vabdl.u8 q6, d2, d0 - vabdl.u8 q7, d3, d1 - vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2 - vabdl.u8 q8, d4, d0 - vabdl.u8 q9, d5, d1 - vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2 + vld1.8 {d6, d7}, [r2], r4 @ load src Row 2 + vabdl.u8 q6, d2, d0 + vabdl.u8 q7, d3, d1 + vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2 + vabdl.u8 q8, d4, d0 + vabdl.u8 q9, d5, d1 + vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2 loop_sad2_prog: - subs r5, #2 + subs r5, #2 @Row 1 - vld1.8 {d0, d1}, [r2], r4 @ load src Row 1 - vabal.u8 q6, d8, d6 - vabal.u8 q7, d9, d7 - vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1 - vabal.u8 q8, d10, d6 - vabal.u8 q9, d11, d7 - vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1 + vld1.8 {d0, d1}, [r2], r4 @ load src Row 1 + vabal.u8 q6, d8, d6 + vabal.u8 q7, d9, d7 + vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1 + vabal.u8 q8, d10, d6 + vabal.u8 q9, d11, d7 + vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1 @Row 2 - vld1.8 {d6, d7}, [r2], r4 @ load src Row 2 - vabal.u8 q6, d2, d0 - vabal.u8 q7, d3, d1 - vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2 - vabal.u8 q8, d4, d0 - vabal.u8 q9, d5, d1 - vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2 + vld1.8 {d6, d7}, [r2], r4 @ load src Row 2 + vabal.u8 q6, d2, d0 + vabal.u8 q7, d3, d1 + vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2 + vabal.u8 q8, d4, d0 + vabal.u8 q9, d5, d1 + vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2 - bne loop_sad2_prog + bne loop_sad2_prog - vabal.u8 q6, d8, d6 - vabal.u8 q7, d9, d7 - vabal.u8 q8, d10, d6 - vabal.u8 q9, d11, d7 + vabal.u8 q6, d8, d6 + vabal.u8 q7, d9, d7 + vabal.u8 q8, d10, d6 + vabal.u8 q9, d11, d7 @ Compute SAD - vadd.u16 q6, q6, q7 @ Q6 : sad_ref1 - vadd.u16 q8, q8, q9 @ Q8 : sad_ref2 + vadd.u16 q6, q6, q7 @ Q6 : sad_ref1 + vadd.u16 q8, q8, q9 @ Q8 : sad_ref2 - vadd.u16 d12, d12, d13 - ldr r5, [sp, #16] @ loading pi4_sad to r5 - vadd.u16 d16, d16, d17 + vadd.u16 d12, d12, d13 + ldr r5, [sp, #16] @ loading pi4_sad to r5 + vadd.u16 d16, d16, d17 - vpadd.u16 d12, d12, d16 - vpaddl.u16 d12, d12 + vpadd.u16 d12, d12, d16 + vpaddl.u16 d12, d12 - vst1.64 {d12}, [r5]! + vst1.64 {d12}, [r5]! + vpop {d8-d15} + ldmfd sp!, {r4-r5, pc} - ldmfd sp!, {r4-r5, pc} - -@/* +@* @//--------------------------------------------------------------------------- @// Function Name : Calculate_Mad3_prog() @// @@ -433,7 +440,7 @@ loop_sad2_prog: @// Platform : CortexA8/NEON . @// @//----------------------------------------------------------------------------- -@*/ +@* .global ime_calculate_sad3_prog_a9q @@ -446,90 +453,90 @@ ime_calculate_sad3_prog_a9q: @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *> - stmfd sp!, {r4-r6, lr} - - ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5 - mov r6, #14 - - @ Row 1 - vld1.8 {d0, d1}, [r3], r5 @ load src Row 1 - vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1 - vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1 - vabdl.u8 q8, d2, d0 - vabdl.u8 q9, d3, d1 - vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1 - vabdl.u8 q10, d4, d0 - vabdl.u8 q11, d5, d1 - - @ Row 2 - vld1.8 {d8, d9}, [r3], r5 @ load src Row 1 - vabdl.u8 q12, d6, d0 - vabdl.u8 q13, d7, d1 - vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1 - vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1 - vabal.u8 q8, d10, d8 - vabal.u8 q9, d11, d9 - vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1 - vabal.u8 q10, d12, d8 - vabal.u8 q11, d13, d9 + stmfd sp!, {r4-r6, lr} + + ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5 + mov r6, #14 + vpush {d8-d15} + @Row 1 + vld1.8 {d0, d1}, [r3], r5 @ load src Row 1 + vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1 + vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1 + vabdl.u8 q8, d2, d0 + vabdl.u8 q9, d3, d1 + vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1 + vabdl.u8 q10, d4, d0 + vabdl.u8 q11, d5, d1 + + @Row 2 + vld1.8 {d8, d9}, [r3], r5 @ load src Row 1 + vabdl.u8 q12, d6, d0 + vabdl.u8 q13, d7, d1 + vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1 + vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1 + vabal.u8 q8, d10, d8 + vabal.u8 q9, d11, d9 + vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1 + vabal.u8 q10, d12, d8 + vabal.u8 q11, d13, d9 loop_sad3_prog: @Row 1 - vld1.8 {d0, d1}, [r3], r5 @ load src Row 1 - vabal.u8 q12, d14, d8 - vabal.u8 q13, d15, d9 - vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1 - vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1 - vabal.u8 q8, d2, d0 - vabal.u8 q9, d3, d1 - vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1 - vabal.u8 q10, d4, d0 - vabal.u8 q11, d5, d1 + vld1.8 {d0, d1}, [r3], r5 @ load src Row 1 + vabal.u8 q12, d14, d8 + vabal.u8 q13, d15, d9 + vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1 + vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1 + vabal.u8 q8, d2, d0 + vabal.u8 q9, d3, d1 + vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1 + vabal.u8 q10, d4, d0 + vabal.u8 q11, d5, d1 @Row 2 - vld1.8 {d8, d9}, [r3], r5 @ load src Row 1 - vabal.u8 q12, d6, d0 - vabal.u8 q13, d7, d1 - vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1 - subs r6, #2 - vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1 - vabal.u8 q8, d10, d8 - vabal.u8 q9, d11, d9 - vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1 - vabal.u8 q10, d12, d8 - vabal.u8 q11, d13, d9 - - bne loop_sad3_prog - - vabal.u8 q12, d14, d8 - vabal.u8 q13, d15, d9 + vld1.8 {d8, d9}, [r3], r5 @ load src Row 1 + vabal.u8 q12, d6, d0 + vabal.u8 q13, d7, d1 + vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1 + subs r6, #2 + vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1 + vabal.u8 q8, d10, d8 + vabal.u8 q9, d11, d9 + vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1 + vabal.u8 q10, d12, d8 + vabal.u8 q11, d13, d9 + + bne loop_sad3_prog + + vabal.u8 q12, d14, d8 + vabal.u8 q13, d15, d9 @ Compute SAD - vadd.u16 q8, q8, q9 @ Q8 : sad_ref1 - vadd.u16 q10, q10, q11 @ Q10 : sad_ref2 - vadd.u16 q12, q12, q13 @ Q12 : sad_ref3 + vadd.u16 q8, q8, q9 @ Q8 : sad_ref1 + vadd.u16 q10, q10, q11 @ Q10 : sad_ref2 + vadd.u16 q12, q12, q13 @ Q12 : sad_ref3 - vadd.u16 d16, d16, d17 - vadd.u16 d20, d20, d21 - vadd.u16 d24, d24, d25 + vadd.u16 d16, d16, d17 + vadd.u16 d20, d20, d21 + vadd.u16 d24, d24, d25 - vpadd.u16 d16, d16, d20 - vpadd.u16 d24, d24, d24 + vpadd.u16 d16, d16, d20 + vpadd.u16 d24, d24, d24 - ldr r6, [sp, #24] @ loading pi4_sad to r6 - vpaddl.u16 d16, d16 - vpaddl.u16 d24, d24 + ldr r6, [sp, #24] @ loading pi4_sad to r6 + vpaddl.u16 d16, d16 + vpaddl.u16 d24, d24 - vst1.64 {d16}, [r6]! - vst1.32 {d24[0]}, [r6] + vst1.64 {d16}, [r6]! + vst1.32 {d24[0]}, [r6] + vpop {d8-d15} + ldmfd sp!, {r4-r6, pc} - ldmfd sp!, {r4-r6, pc} - -@/** +@** @****************************************************************************** @* @* @brief computes distortion (SAD) for sub-pel motion estimation @@ -551,7 +558,7 @@ loop_sad3_prog: @* @remarks @* @****************************************************************************** -@*/ +@* .text .p2align 2 @@ -560,115 +567,116 @@ loop_sad3_prog: ime_sub_pel_compute_sad_16x16_a9q: - stmfd sp!, {r4-r11, lr} @store register values to stack + stmfd sp!, {r4-r11, lr} @store register values to stack - ldr r9, [sp, #36] - ldr r10, [sp, #40] + ldr r9, [sp, #36] + ldr r10, [sp, #40] + vpush {d8-d15} + sub r4, r1, #1 @ x left + sub r5, r2, r10 @ y top - sub r4, r1, #1 @ x left - sub r5, r2, r10 @ y top + sub r6, r3, #1 @ xy left + sub r7, r3, r10 @ xy top - sub r6, r3, #1 @ xy left - sub r7, r3, r10 @ xy top - - sub r8, r7, #1 @ xy top-left - mov r11, #15 + sub r8, r7, #1 @ xy top-left + mov r11, #15 @for bringing buffer2 into cache..., dummy load instructions @ LDR r12,[r1] @ LDR r12,[sp,#12] - vld1.8 {d0, d1}, [r0], r9 @ src - vld1.8 {d2, d3}, [r5], r10 @ y top LOAD - vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD - vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD - - vabdl.u8 q6, d2, d0 @ y top ABS1 - vabdl.u8 q7, d4, d0 @ xy top ABS1 - vld1.8 {d8, d9}, [r1], r10 @ x LOAD - vabdl.u8 q8, d6, d0 @ xy top-left ABS1 - vabdl.u8 q9, d8, d0 @ x ABS1 - vld1.8 {d10, d11}, [r4], r10 @ x left LOAD - - vabal.u8 q6, d3, d1 @ y top ABS2 - vabal.u8 q7, d5, d1 @ xy top ABS2 - vld1.8 {d2, d3}, [r2], r10 @ y LOAD - vabal.u8 q8, d7, d1 @ xy top-left ABS2 - vabal.u8 q9, d9, d1 @ x ABS2 - vld1.8 {d4, d5}, [r3], r10 @ xy LOAD - - vabdl.u8 q10, d10, d0 @ x left ABS1 - vabdl.u8 q11, d2, d0 @ y ABS1 - vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD - vabdl.u8 q12, d4, d0 @ xy ABS1 - vabdl.u8 q13, d6, d0 @ xy left ABS1 + vld1.8 {d0, d1}, [r0], r9 @ src + vld1.8 {d2, d3}, [r5], r10 @ y top LOAD + vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD + vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD + + vabdl.u8 q6, d2, d0 @ y top ABS1 + vabdl.u8 q7, d4, d0 @ xy top ABS1 + vld1.8 {d8, d9}, [r1], r10 @ x LOAD + vabdl.u8 q8, d6, d0 @ xy top-left ABS1 + vabdl.u8 q9, d8, d0 @ x ABS1 + vld1.8 {d10, d11}, [r4], r10 @ x left LOAD + + vabal.u8 q6, d3, d1 @ y top ABS2 + vabal.u8 q7, d5, d1 @ xy top ABS2 + vld1.8 {d2, d3}, [r2], r10 @ y LOAD + vabal.u8 q8, d7, d1 @ xy top-left ABS2 + vabal.u8 q9, d9, d1 @ x ABS2 + vld1.8 {d4, d5}, [r3], r10 @ xy LOAD + + vabdl.u8 q10, d10, d0 @ x left ABS1 + vabdl.u8 q11, d2, d0 @ y ABS1 + vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD + vabdl.u8 q12, d4, d0 @ xy ABS1 + vabdl.u8 q13, d6, d0 @ xy left ABS1 loop_sub_pel_16x16: - vabal.u8 q10, d11, d1 @ x left ABS2 - vabal.u8 q11, d3, d1 @ y ABS2 - subs r11, #1 - vabal.u8 q12, d5, d1 @ xy ABS2 - vabal.u8 q13, d7, d1 @ xy left ABS2 - - vld1.8 {d0, d1}, [r0], r9 @ src - vabal.u8 q6, d2, d0 @ y top ABS1 - vabal.u8 q7, d4, d0 @ xy top ABS1 - vld1.8 {d8, d9}, [r1], r10 @ x LOAD - vabal.u8 q8, d6, d0 @ xy top-left ABS1 - vabal.u8 q9, d8, d0 @ x ABS1 - vld1.8 {d10, d11}, [r4], r10 @ x left LOAD - - vabal.u8 q6, d3, d1 @ y top ABS2 - vabal.u8 q7, d5, d1 @ xy top ABS2 - vld1.8 {d2, d3}, [r2], r10 @ y LOAD - vabal.u8 q8, d7, d1 @ xy top-left ABS2 - vabal.u8 q9, d9, d1 @ x ABS2 - vld1.8 {d4, d5}, [r3], r10 @ xy LOAD - - vabal.u8 q10, d10, d0 @ x left ABS1 - vabal.u8 q11, d2, d0 @ y ABS1 - vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD - vabal.u8 q12, d4, d0 @ xy ABS1 - vabal.u8 q13, d6, d0 @ xy left ABS1 - - bne loop_sub_pel_16x16 - - vabal.u8 q10, d11, d1 @ x left ABS2 - vabal.u8 q11, d3, d1 @ y ABS2 - vabal.u8 q12, d5, d1 @ xy ABS2 - vabal.u8 q13, d7, d1 @ xy left ABS2 - - vadd.i16 d0, d18, d19 @ x - vadd.i16 d3, d12, d13 @ y top - vadd.i16 d6, d14, d15 @ xy top - vadd.i16 d5, d26, d27 @ xy left - vadd.i16 d1, d20, d21 @ x left - vadd.i16 d2, d22, d23 @ y - vadd.i16 d4, d24, d25 @ xy - vadd.i16 d7, d16, d17 @ xy top left - - vpadd.i16 d0, d0, d1 - vpadd.i16 d2, d2, d3 - vpadd.i16 d4, d4, d5 - vpadd.i16 d6, d6, d7 - - vpaddl.u16 d0, d0 - vpaddl.u16 d2, d2 - ldr r11, [sp, #44] - vpaddl.u16 d4, d4 - vpaddl.u16 d6, d6 - - vst1.32 {d0}, [r11]! - vst1.32 {d2}, [r11]! - vst1.32 {d4}, [r11]! - vst1.32 {d6}, [r11]! - - ldmfd sp!, {r4-r11, pc} @Restoring registers from stack - - - -@/** + vabal.u8 q10, d11, d1 @ x left ABS2 + vabal.u8 q11, d3, d1 @ y ABS2 + subs r11, #1 + vabal.u8 q12, d5, d1 @ xy ABS2 + vabal.u8 q13, d7, d1 @ xy left ABS2 + + vld1.8 {d0, d1}, [r0], r9 @ src + vabal.u8 q6, d2, d0 @ y top ABS1 + vabal.u8 q7, d4, d0 @ xy top ABS1 + vld1.8 {d8, d9}, [r1], r10 @ x LOAD + vabal.u8 q8, d6, d0 @ xy top-left ABS1 + vabal.u8 q9, d8, d0 @ x ABS1 + vld1.8 {d10, d11}, [r4], r10 @ x left LOAD + + vabal.u8 q6, d3, d1 @ y top ABS2 + vabal.u8 q7, d5, d1 @ xy top ABS2 + vld1.8 {d2, d3}, [r2], r10 @ y LOAD + vabal.u8 q8, d7, d1 @ xy top-left ABS2 + vabal.u8 q9, d9, d1 @ x ABS2 + vld1.8 {d4, d5}, [r3], r10 @ xy LOAD + + vabal.u8 q10, d10, d0 @ x left ABS1 + vabal.u8 q11, d2, d0 @ y ABS1 + vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD + vabal.u8 q12, d4, d0 @ xy ABS1 + vabal.u8 q13, d6, d0 @ xy left ABS1 + + bne loop_sub_pel_16x16 + + vabal.u8 q10, d11, d1 @ x left ABS2 + vabal.u8 q11, d3, d1 @ y ABS2 + vabal.u8 q12, d5, d1 @ xy ABS2 + vabal.u8 q13, d7, d1 @ xy left ABS2 + + vadd.i16 d0, d18, d19 @ x + vadd.i16 d3, d12, d13 @ y top + vadd.i16 d6, d14, d15 @ xy top + vadd.i16 d5, d26, d27 @ xy left + vadd.i16 d1, d20, d21 @ x left + vadd.i16 d2, d22, d23 @ y + vadd.i16 d4, d24, d25 @ xy + vadd.i16 d7, d16, d17 @ xy top left + + vpadd.i16 d0, d0, d1 + vpadd.i16 d2, d2, d3 + vpadd.i16 d4, d4, d5 + vpadd.i16 d6, d6, d7 + + vpaddl.u16 d0, d0 + vpaddl.u16 d2, d2 + vpop {d8-d15} + ldr r11, [sp, #44] + vpaddl.u16 d4, d4 + vpaddl.u16 d6, d6 + + vst1.32 {d0}, [r11]! + vst1.32 {d2}, [r11]! + vst1.32 {d4}, [r11]! + vst1.32 {d6}, [r11]! + + ldmfd sp!, {r4-r11, pc} @Restoring registers from stack + + + +@** @****************************************************************************** @* @* @brief computes distortion (SAD) between 2 16x16 blocks @@ -699,7 +707,7 @@ loop_sub_pel_16x16: @* @remarks @* @****************************************************************************** -@*/ +@* .text .p2align 2 @@ -710,51 +718,52 @@ ime_compute_sad_16x16_a9q: @STMFD sp!,{r12,lr} - stmfd sp!, {r12, r14} @store register values to stack + stmfd sp!, {r12, r14} @store register values to stack @for bringing buffer2 into cache..., dummy load instructions @ LDR r12,[r1] @ LDR r12,[sp,#12] - vld1.8 {d4, d5}, [r0], r2 - vld1.8 {d6, d7}, [r1], r3 - - mov r12, #14 - vld1.8 {d8, d9}, [r0], r2 - vabdl.u8 q0, d4, d6 - vld1.8 {d10, d11}, [r1], r3 - vabdl.u8 q1, d5, d7 + vld1.8 {d4, d5}, [r0], r2 + vld1.8 {d6, d7}, [r1], r3 + vpush {d8-d15} + mov r12, #14 + vld1.8 {d8, d9}, [r0], r2 + vabdl.u8 q0, d4, d6 + vld1.8 {d10, d11}, [r1], r3 + vabdl.u8 q1, d5, d7 loop_sad_16x16: - vld1.8 {d4, d5}, [r0], r2 - vabal.u8 q0, d8, d10 - vld1.8 {d6, d7}, [r1], r3 - vabal.u8 q1, d9, d11 + vld1.8 {d4, d5}, [r0], r2 + vabal.u8 q0, d8, d10 + vld1.8 {d6, d7}, [r1], r3 + vabal.u8 q1, d9, d11 - vld1.8 {d8, d9}, [r0], r2 - vabal.u8 q0, d4, d6 - subs r12, #2 - vld1.8 {d10, d11}, [r1], r3 - vabal.u8 q1, d5, d7 + vld1.8 {d8, d9}, [r0], r2 + vabal.u8 q0, d4, d6 + subs r12, #2 + vld1.8 {d10, d11}, [r1], r3 + vabal.u8 q1, d5, d7 - bne loop_sad_16x16 + bne loop_sad_16x16 - vabal.u8 q0, d8, d10 - vabal.u8 q1, d9, d11 + vabal.u8 q0, d8, d10 + vabal.u8 q1, d9, d11 - vadd.i16 q0, q0, q1 - vadd.i16 d0, d1, d0 - ldr r12, [sp, #12] + vadd.i16 q0, q0, q1 + vadd.i16 d0, d1, d0 + vpop {d8-d15} + ldr r12, [sp, #12] - vpaddl.u16 d0, d0 - vpaddl.u32 d0, d0 - vst1.32 {d0[0]}, [r12] + vpaddl.u16 d0, d0 + vpaddl.u32 d0, d0 + vst1.32 {d0[0]}, [r12] - ldmfd sp!, {r12, pc} @Restoring registers from stack + ldmfd sp!, {r12, pc} @Restoring registers from stack -@/* +@* @//--------------------------------------------------------------------------- @// Function Name : Calculate_Mad4_prog() @// @@ -764,7 +773,7 @@ loop_sad_16x16: @// Platform : CortexA8/NEON . @// @//----------------------------------------------------------------------------- -@*/ +@* .global ime_calculate_sad4_prog_a9q @@ -775,20 +784,20 @@ ime_calculate_sad4_prog_a9q: @ r3 = CurBufferWidth <UWORD32> @ stack = psad <UWORD32 *> {at 0x34} - stmfd sp!, {r4-r7, lr} + stmfd sp!, {r4-r7, lr} @UWORD8 *left_ptr = temp_frame - 1; @UWORD8 *right_ptr = temp_frame + 1; @UWORD8 *top_ptr = temp_frame - RefBufferWidth; @UWORD8 *bot_ptr = temp_frame + RefBufferWidth; - mov r7, #14 - sub r4, r0, #0x01 @r4 = left_ptr - add r5, r0, #0x1 @r5 = right_ptr - sub r6, r0, r2 @r6 = top_ptr - add r0, r0, r2 @r0 = bot_ptr + mov r7, #14 + sub r4, r0, #0x01 @r4 = left_ptr + add r5, r0, #0x1 @r5 = right_ptr + sub r6, r0, r2 @r6 = top_ptr + add r0, r0, r2 @r0 = bot_ptr @r1 = buffer_ptr - + vpush {d8-d15} @D0:D1 : buffer @D2:D3 : top @D4:D5 : left @@ -796,94 +805,93 @@ ime_calculate_sad4_prog_a9q: @D8:D9 : bottom @Row 1 - vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 - vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 - vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 + vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 - vabdl.u8 q5, d2, d0 - vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 - vabdl.u8 q6, d3, d1 + vabdl.u8 q5, d2, d0 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 + vabdl.u8 q6, d3, d1 - vabdl.u8 q7, d0, d4 - vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 - vabdl.u8 q8, d1, d5 + vabdl.u8 q7, d0, d4 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 + vabdl.u8 q8, d1, d5 @Row 2 - vabdl.u8 q9, d0, d6 - vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 - vabdl.u8 q10, d1, d7 + vabdl.u8 q9, d0, d6 + vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 + vabdl.u8 q10, d1, d7 - vabdl.u8 q11, d0, d8 - vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 - vabdl.u8 q12, d1, d9 + vabdl.u8 q11, d0, d8 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 + vabdl.u8 q12, d1, d9 loop_sad4_prog: - vabal.u8 q5, d26, d2 - vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 - vabal.u8 q6, d27, d3 + vabal.u8 q5, d26, d2 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 + vabal.u8 q6, d27, d3 - vabal.u8 q7, d26, d4 - vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 - vabal.u8 q8, d27, d5 + vabal.u8 q7, d26, d4 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 + vabal.u8 q8, d27, d5 - vabal.u8 q9, d26, d6 - vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 - vabal.u8 q10, d27, d7 + vabal.u8 q9, d26, d6 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 + vabal.u8 q10, d27, d7 @Row 1 - vabal.u8 q11, d26, d8 - vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 - vabal.u8 q12, d27, d9 - - vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 - subs r7, #2 - vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 + vabal.u8 q11, d26, d8 + vld1.8 {d0, d1}, [r1], r3 @ load src Row 1 + vabal.u8 q12, d27, d9 - vabal.u8 q5, d0, d2 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 1 + subs r7, #2 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 1 - vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 - vabal.u8 q6, d1, d3 + vabal.u8 q5, d0, d2 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 1 + vabal.u8 q6, d1, d3 - vabal.u8 q7, d0, d4 - vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 - vabal.u8 q8, d1, d5 + vabal.u8 q7, d0, d4 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1 + vabal.u8 q8, d1, d5 @Row 2 - vabal.u8 q9, d0, d6 - vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 - vabal.u8 q10, d1, d7 + vabal.u8 q9, d0, d6 + vld1.8 {d26, d27}, [r1], r3 @ load src Row 2 + vabal.u8 q10, d1, d7 - vabal.u8 q11, d0, d8 - vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 - vabal.u8 q12, d1, d9 + vabal.u8 q11, d0, d8 + vld1.8 {d2, d3}, [r6], r2 @ load top Row 2 + vabal.u8 q12, d1, d9 - bne loop_sad4_prog + bne loop_sad4_prog - vabal.u8 q5, d26, d2 - vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 - vabal.u8 q6, d27, d3 + vabal.u8 q5, d26, d2 + vld1.8 {d4, d5}, [r4], r2 @ load left Row 2 + vabal.u8 q6, d27, d3 - vabal.u8 q7, d26, d4 - vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 - vabal.u8 q8, d27, d5 + vabal.u8 q7, d26, d4 + vld1.8 {d6, d7}, [r5], r2 @ load right Row 2 + vabal.u8 q8, d27, d5 - vabal.u8 q9, d26, d6 - vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 - vabal.u8 q10, d27, d7 + vabal.u8 q9, d26, d6 + vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2 + vabal.u8 q10, d27, d7 - vabal.u8 q11, d26, d8 - vabal.u8 q12, d27, d9 + vabal.u8 q11, d26, d8 + vabal.u8 q12, d27, d9 @;Q5:Q6 : sad_top @;Q7:Q8 : sad_left @;Q9:Q10 : sad_right @;Q11:Q12 : sad_bot - vadd.u16 q5, q5, q6 - vadd.u16 q7, q7, q8 - vadd.u16 q9, q9, q10 - vadd.u16 q11, q11, q12 + vadd.u16 q5, q5, q6 + vadd.u16 q7, q7, q8 + vadd.u16 q9, q9, q10 + vadd.u16 q11, q11, q12 @; Free :- @; Q6,Q8,Q10,Q12 @@ -893,10 +901,10 @@ loop_sad4_prog: @;Q9 -> D18:D19 @;Q11 -> D22:D23 - vadd.u16 d10, d10, d11 - vadd.u16 d14, d14, d15 - vadd.u16 d18, d18, d19 - vadd.u16 d22, d22, d23 + vadd.u16 d10, d10, d11 + vadd.u16 d14, d14, d15 + vadd.u16 d18, d18, d19 + vadd.u16 d22, d22, d23 @;D10 : sad_top @;D14 : sad_left @@ -904,35 +912,35 @@ loop_sad4_prog: @;D22 : sad_bot - vpaddl.u16 d11, d10 - vpaddl.u16 d15, d14 - vpaddl.u16 d19, d18 - vpaddl.u16 d23, d22 + vpaddl.u16 d11, d10 + vpaddl.u16 d15, d14 + vpaddl.u16 d19, d18 + vpaddl.u16 d23, d22 @;D11 : sad_top @;D15 : sad_left @;D19 : sad_right @;D23 : sad_bot - vpaddl.u32 d10, d11 - vpaddl.u32 d22, d23 - vpaddl.u32 d14, d15 - vpaddl.u32 d18, d19 + vpaddl.u32 d10, d11 + vpaddl.u32 d22, d23 + vpaddl.u32 d14, d15 + vpaddl.u32 d18, d19 @;D10 : sad_top @;D14 : sad_left @;D18 : sad_right @;D22 : sad_bot - ldr r4, [sp, #20] @;Can be rearranged - - vsli.64 d10, d22, #32 - vsli.64 d14, d18, #32 + ldr r4, [sp, #84] @;Can be rearranged - vst1.64 {d14}, [r4]! - vst1.64 {d10}, [r4]! + vsli.64 d10, d22, #32 + vsli.64 d14, d18, #32 - ldmfd sp!, {r4-r7, pc} + vst1.64 {d14}, [r4]! + vst1.64 {d10}, [r4]! + vpop {d8-d15} + ldmfd sp!, {r4-r7, pc} @@ -974,37 +982,37 @@ ime_compute_satqd_16x16_lumainter_a9q: @R5 :Distortion,ie SAD @R6 :is nonzero - push {r4-r12, lr} @push all the variables first + push {r4-r12, lr} @push all the variables first @ADD SP,SP,#40 ;decrement stack pointer,to accomodate two variables - ldr r4, [sp, #40] @load the threshold address - - mov r8, #8 @Number of 4x8 blocks to be processed - mov r10, #0 @Sad - mov r7, #0 @Nonzero info + ldr r4, [sp, #40] @load the threshold address + vpush {d8-d15} + mov r8, #8 @Number of 4x8 blocks to be processed + mov r10, #0 @Sad + mov r7, #0 @Nonzero info @---------------------------------------------------- - vld1.u8 d30, [r0], r2 @I load 8 pix src row 1 + vld1.u8 d30, [r0], r2 @I load 8 pix src row 1 - vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1 + vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1 - vld1.u8 d28, [r0], r2 @I load 8 pix src row 2 + vld1.u8 d28, [r0], r2 @I load 8 pix src row 2 - vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2 + vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2 - vld1.u8 d26, [r0], r2 @I load 8 pix src row 3 - vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12 + vld1.u8 d26, [r0], r2 @I load 8 pix src row 3 + vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12 - vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3 + vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3 - vld1.u8 d24, [r0], r2 @I load 8 pix src row 4 + vld1.u8 d24, [r0], r2 @I load 8 pix src row 4 - vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4 - vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12 + vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4 + vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12 - vld1.u16 {q11}, [r4] @I load the threhold - vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12 + vld1.u16 {q11}, [r4] @I load the threhold + vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12 - vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12 + vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12 @@ -1013,128 +1021,128 @@ core_loop: @S5 S6 S7 S8 A5 A6 A7 A8 @S9 S10 S11 S12 A9 A10 A11 A12 @S13 S14 S15 S16 A13 A14 A15 A16 - ands r11, r8, #1 @II See if we are at even or odd block - vadd.u16 q4 , q0, q3 @I Add r1 r4 - lsl r11, r2, #2 @II Move back src 4 rows + ands r11, r8, #1 @II See if we are at even or odd block + vadd.u16 q4 , q0, q3 @I Add r1 r4 + lsl r11, r2, #2 @II Move back src 4 rows - subeq r0, r0, r11 @II Move back src 4 rows if we are at even block - vadd.u16 q5 , q1, q2 @I Add r2 r3 - addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block + subeq r0, r0, r11 @II Move back src 4 rows if we are at even block + vadd.u16 q5 , q1, q2 @I Add r2 r3 + addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block - lsl r11, r3, #2 @II Move back pred 4 rows - vtrn.16 d8 , d10 @I trnspse 1 - subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block + lsl r11, r3, #2 @II Move back pred 4 rows + vtrn.16 d8 , d10 @I trnspse 1 + subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block - addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block - vtrn.16 d9 , d11 @I trnspse 2 - subne r0, r0, #8 @II Src 8clos back for odd rows + addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block + vtrn.16 d9 , d11 @I trnspse 2 + subne r0, r0, #8 @II Src 8clos back for odd rows - subne r1, r1, #8 @II Pred 8 cols back for odd rows - vtrn.32 d10, d11 @I trnspse 4 + subne r1, r1, #8 @II Pred 8 cols back for odd rows + vtrn.32 d10, d11 @I trnspse 4 - vtrn.32 d8 , d9 @I trnspse 3 - vswp d10, d11 @I rearrange so that the q4 and q5 add properly + vtrn.32 d8 , d9 @I trnspse 3 + vswp d10, d11 @I rearrange so that the q4 and q5 add properly @D8 S1 S4 A1 A4 @D9 S2 S3 A2 A3 @D11 S1 S4 A1 A4 @D10 S2 S3 A2 A3 - vadd.s16 q6, q4, q5 @I Get s1 s4 - vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1 + vadd.s16 q6, q4, q5 @I Get s1 s4 + vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1 - vtrn.s16 d12, d13 @I Get s2 s3 + vtrn.s16 d12, d13 @I Get s2 s3 @D12 S1 S4 A1 A4 @D13 S2 S3 A2 A3 - vshl.s16 q7, q6 , #1 @I si = si<<1 - vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1 + vshl.s16 q7, q6 , #1 @I si = si<<1 + vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1 - vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3) - vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2 + vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3) + vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2 @ D16 S14 A14 S23 A23 - vrev32.16 d0, d16 @I - vuzp.s16 d16, d0 @I + vrev32.16 d0, d16 @I + vuzp.s16 d16, d0 @I @D16 S14 S23 A14 A23 - vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4) - vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2 + vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4) + vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2 @D17 S12 S34 A12 A34 - vrev32.16 q9, q7 @I Rearrange si's + vrev32.16 q9, q7 @I Rearrange si's @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2 @D12 S1 S4 A1 A4 @D19 Z3 Z2 Y3 Y2 - vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1)) - vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3 + vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1)) + vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3 @D13 S2 S3 A2 A3 @D18 Z4 Z1 Y4 Y1 - vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1)) - vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3 + vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1)) + vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3 @Q10 S8 S5 A8 A5 S7 S4 A7 A4 @D16 S14 S23 A14 A23 - vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4 - vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4 + vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4 + vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4 @D22 SAD1 SAD2 junk junk @Q8 S2 S1 A2 A1 S6 S3 A6 A3 @Q10 S8 S5 A8 A5 S7 S4 A7 A4 - vtrn.32 q8, q4 @I Rearrange to make ls of each block togather + vtrn.32 q8, q4 @I Rearrange to make ls of each block togather @Q8 S2 S1 S8 S5 S6 S3 S7 S4 @Q10 A2 A1 A8 A5 A6 A3 A7 A4 - ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1 - vdup.s16 q6, d10[0] @I Get the sad blk 1 - vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12 + ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1 + vdup.s16 q6, d10[0] @I Get the sad blk 1 + vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12 - vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1 - vmov.s16 r9, d10[0] @I Get the sad for block 1 + vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1 + vmov.s16 r9, d10[0] @I Get the sad for block 1 - vsub.s16 q9, q7, q8 @I Add to the lss - vmov.s16 r5, d10[1] @I Get the sad for block 2 + vsub.s16 q9, q7, q8 @I Add to the lss + vmov.s16 r5, d10[1] @I Get the sad for block 2 - vcle.s16 q7, q11, q9 @I Add to the lss - vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4 + vcle.s16 q7, q11, q9 @I Add to the lss + vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4 - vdup.s16 q15, d10[1] @I Get the sad blk 1 - vabdl.u8 q1, d28, d29 @II Abs diff r1 blk 12 + vdup.s16 q15, d10[1] @I Get the sad blk 1 + vabdl.u8 q1, d28, d29 @II Abs diff r1 blk 12 - vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1 - vsub.s16 q3, q14, q4 @I Add to the lss - vcle.s16 q15, q11, q3 @I Add to the lss + vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1 + vsub.s16 q3, q14, q4 @I Add to the lss + vcle.s16 q15, q11, q3 @I Add to the lss - ADD R10, R10, R9 @I Add to the global sad blk 1 - vtrn.u8 q15, q7 @I get all comparison bits to one reg - vabdl.u8 q2, d26, d27 @II Abs diff r1 blk 12 + ADD R10, R10, R9 @I Add to the global sad blk 1 + vtrn.u8 q15, q7 @I get all comparison bits to one reg + vabdl.u8 q2, d26, d27 @II Abs diff r1 blk 12 - ADD R10, R10, R5 @I Add to the global sad blk 2 - vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs - cmp r11, r9 + ADD R10, R10, R5 @I Add to the global sad blk 2 + vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs + cmp r11, r9 - movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 1 ;I Compare with threshold blk 1 - vadd.u8 d28, d28, d29 @I Add the bits - cmp r11, r5 @I Compare with threshold blk 2 + movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 1 ;I Compare with threshold blk 1 + vadd.u8 d28, d28, d29 @I Add the bits + cmp r11, r5 @I Compare with threshold blk 2 - movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 2 - vpadd.u8 d28, d28, d29 @I Add the bits + movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 2 + vpadd.u8 d28, d28, d29 @I Add the bits - vmov.u32 r11, d28[0] @I Since a set bit now represents a unstatisofrd contifon store it in r11 - vabdl.u8 q3, d24, d25 @II Abs diff r1 blk 12 + vmov.u32 r11, d28[0] @I Since a set bit now represents a unstatisofrd contifon store it in r11 + vabdl.u8 q3, d24, d25 @II Abs diff r1 blk 12 - orr r7, r7, r11 @I get the guy to r11 + orr r7, r7, r11 @I get the guy to r11 - sub r8, r8, #1 @I Decremrnt block count + sub r8, r8, #1 @I Decremrnt block count - cmp r7, #0 @I If we have atlest one non zero block - bne compute_sad_only @I if a non zero block is der,From now on compute sad only + cmp r7, #0 @I If we have atlest one non zero block + bne compute_sad_only @I if a non zero block is der,From now on compute sad only - cmp r8, #1 @I See if we are at the last block - bne core_loop @I If the blocks are zero, lets continue the satdq + cmp r8, #1 @I See if we are at the last block + bne core_loop @I If the blocks are zero, lets continue the satdq @EPILOUGE for core loop @@ -1142,94 +1150,94 @@ core_loop: @S5 S6 S7 S8 A5 A6 A7 A8 @S9 S10 S11 S12 A9 A10 A11 A12 @S13 S14 S15 S16 A13 A14 A15 A16 - vadd.u16 q4 , q0, q3 @Add r1 r4 - vadd.u16 q5 , q1, q2 @Add r2 r3 + vadd.u16 q4 , q0, q3 @Add r1 r4 + vadd.u16 q5 , q1, q2 @Add r2 r3 @D8 S1 S2 S2 S1 @D10 S4 S3 S3 S4 @D9 A1 A2 A2 A1 @D11 A4 A3 A3 A4 - vtrn.16 d8 , d10 @I trnspse 1 - vtrn.16 d9 , d11 @I trnspse 2 - vtrn.32 d8 , d9 @I trnspse 3 - vtrn.32 d10, d11 @I trnspse 4 + vtrn.16 d8 , d10 @I trnspse 1 + vtrn.16 d9 , d11 @I trnspse 2 + vtrn.32 d8 , d9 @I trnspse 3 + vtrn.32 d10, d11 @I trnspse 4 - vswp d10, d11 @I rearrange so that the q4 and q5 add properly + vswp d10, d11 @I rearrange so that the q4 and q5 add properly @D8 S1 S4 A1 A4 @D9 S2 S3 A2 A3 @D11 S1 S4 A1 A4 @D10 S2 S3 A2 A3 - vadd.s16 q6, q4, q5 @Get s1 s4 - vtrn.s16 d12, d13 @Get s2 s3 + vadd.s16 q6, q4, q5 @Get s1 s4 + vtrn.s16 d12, d13 @Get s2 s3 @D12 S1 S4 A1 A4 @D13 S2 S3 A2 A3 - vshl.s16 q7, q6 , #1 @si = si<<1 - vmov.s16 r9, d10[0] @Get the sad for block 1 + vshl.s16 q7, q6 , #1 @si = si<<1 + vmov.s16 r9, d10[0] @Get the sad for block 1 - vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3) - vmov.s16 r5, d10[1] @Get the sad for block 2 + vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3) + vmov.s16 r5, d10[1] @Get the sad for block 2 @D16 S14 A14 S23 A23 - vrev32.16 d30, d16 @ - vuzp.s16 d16, d30 @ + vrev32.16 d30, d16 @ + vuzp.s16 d16, d30 @ @D16 S14 S23 A14 A23 - vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4) + vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4) @D17 S12 S34 A12 A34 - vrev32.16 q9, q7 @Rearrange si's + vrev32.16 q9, q7 @Rearrange si's @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2 @D12 S1 S4 A1 A4 @D19 Z3 Z2 Y3 Y2 - vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1)) + vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1)) @D13 S2 S3 A2 A3 @D18 Z4 Z1 Y4 Y1 - vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1)) + vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1)) @Q10 S8 S5 A8 A5 S7 S4 A7 A4 @D16 S14 S23 A14 A23 - vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4 + vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4 @D22 SAD1 SAD2 junk junk - vmov.u16 r9, d10[0] @Get the sad for block 1 - vmov.u16 r5, d10[1] @Get the sad for block 2 + vmov.u16 r9, d10[0] @Get the sad for block 1 + vmov.u16 r5, d10[1] @Get the sad for block 2 @Q8 S2 S1 A2 A1 S6 S3 A6 A3 @Q10 S8 S5 A8 A5 S7 S4 A7 A4 - ldrh r11, [r4, #16] @Load the threshold for DC val blk 1 - vtrn.32 q8, q4 @Rearrange to make ls of each block togather - ADD R10, R10, R9 @Add to the global sad blk 1 + ldrh r11, [r4, #16] @Load the threshold for DC val blk 1 + vtrn.32 q8, q4 @Rearrange to make ls of each block togather + ADD R10, R10, R9 @Add to the global sad blk 1 @Q8 S2 S1 S8 S5 S6 S3 S7 S4 @Q10 A2 A1 A8 A5 A6 A3 A7 A4 - vld1.u16 {q11}, [r4] @load the threhold - ADD R10, R10, R5 @Add to the global sad blk 2 + vld1.u16 {q11}, [r4] @load the threhold + ADD R10, R10, R5 @Add to the global sad blk 2 - vdup.u16 q6, d10[0] @Get the sad blk 1 + vdup.u16 q6, d10[0] @Get the sad blk 1 - cmp r11, r9 @Compare with threshold blk 1 - vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1 + cmp r11, r9 @Compare with threshold blk 1 + vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1 - vsub.s16 q9, q7, q8 @Add to the lss + vsub.s16 q9, q7, q8 @Add to the lss - vcle.s16 q15, q11, q9 @Add to the lss - movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 1 + vcle.s16 q15, q11, q9 @Add to the lss + movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 1 - cmp r11, r5 @Compare with threshold blk 2 - vdup.u16 q14, d10[1] @Get the sad blk 1 + cmp r11, r5 @Compare with threshold blk 2 + vdup.u16 q14, d10[1] @Get the sad blk 1 - vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1 - vsub.s16 q12, q13, q4 @Add to the lss - vcle.s16 q14, q11, q12 @Add to the lss - movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 2 + vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1 + vsub.s16 q12, q13, q4 @Add to the lss + vcle.s16 q14, q11, q12 @Add to the lss + movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 2 - vtrn.u8 q14, q15 @get all comparison bits to one reg - vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs - vadd.u8 d28, d28, d29 @Add the bits - vpadd.u8 d28, d28, d29 @Add the bits - vmov.u32 r11, d28[0] @Since a set bit now represents a unstatisofrd contifon store it in r11 - orr r7, r7, r11 @get the guy to r11 + vtrn.u8 q14, q15 @get all comparison bits to one reg + vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs + vadd.u8 d28, d28, d29 @Add the bits + vpadd.u8 d28, d28, d29 @Add the bits + vmov.u32 r11, d28[0] @Since a set bit now represents a unstatisofrd contifon store it in r11 + orr r7, r7, r11 @get the guy to r11 - b funcend_sad_16x16 @Since all blocks ar processed nw, got to end + b funcend_sad_16x16 @Since all blocks ar processed nw, got to end compute_sad_only: @This block computes SAD only, so will be lighter @IT will start processign at n odd block @@ -1237,117 +1245,119 @@ compute_sad_only: @This block computes SAD only, so will b @and then for two blocks at a time @The counter is r7, hence r7 blocks will be processed - and r11, r8, #1 @Get the last bit of counter - cmp r11, #0 @See if we are at even or odd block + and r11, r8, #1 @Get the last bit of counter + cmp r11, #0 @See if we are at even or odd block @iif the blk is even we just have to set the pointer to the @start of current row - lsleq r11, r2, #2 @I Move back src 4 rows - subeq r0, r0, r11 @I Move back src 4 rows if we are at even block + lsleq r11, r2, #2 @I Move back src 4 rows + subeq r0, r0, r11 @I Move back src 4 rows if we are at even block - lsleq r11, r3, #2 @I Move back pred 4 rows - subeq r1, r1, r11 @I Move back pred 4 rows if we are at even block + lsleq r11, r3, #2 @I Move back pred 4 rows + subeq r1, r1, r11 @I Move back pred 4 rows if we are at even block @ADDEQ R8,R8,#2 ;Inc counter - beq skip_odd_blk @If the blk is odd we have to compute sad + beq skip_odd_blk @If the blk is odd we have to compute sad - vadd.u16 q4, q0, q1 @Add SAD of row1 and row2 - vadd.u16 q5, q2, q3 @Add SAD of row3 and row4 - vadd.u16 q6, q4, q5 @Add SAD of row 1-4 - vadd.u16 d14, d12, d13 @Add Blk1 and blk2 - vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4 - vpadd.u16 d18, d16, d17 @Add col 12-34 + vadd.u16 q4, q0, q1 @Add SAD of row1 and row2 + vadd.u16 q5, q2, q3 @Add SAD of row3 and row4 + vadd.u16 q6, q4, q5 @Add SAD of row 1-4 + vadd.u16 d14, d12, d13 @Add Blk1 and blk2 + vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4 + vpadd.u16 d18, d16, d17 @Add col 12-34 - vmov.u16 r9, d18[0] @Move sad to arm - ADD R10, R10, R9 @Add to the global sad + vmov.u16 r9, d18[0] @Move sad to arm + ADD R10, R10, R9 @Add to the global sad - sub r8, r8, #1 @Dec counter - cmp r8, #0 @See if we processed last block - beq funcend_sad_16x16 @if lprocessed last block goto end of func + sub r8, r8, #1 @Dec counter + cmp r8, #0 @See if we processed last block + beq funcend_sad_16x16 @if lprocessed last block goto end of func - sub r0, r0, #8 @Since we processed od block move back src by 8 cols - sub r1, r1, #8 @Since we processed od block move back pred by 8 cols + sub r0, r0, #8 @Since we processed od block move back src by 8 cols + sub r1, r1, #8 @Since we processed od block move back pred by 8 cols skip_odd_blk: - vmov.s16 q0, #0 @Initialize the accumulator - vmov.s16 q1, #0 @Initialize the accumulator + vmov.s16 q0, #0 @Initialize the accumulator + vmov.s16 q1, #0 @Initialize the accumulator - vld1.u8 {q15}, [r0], r2 @load src r1 - vld1.u8 {q14}, [r1], r3 @load pred r1 + vld1.u8 {q15}, [r0], r2 @load src r1 + vld1.u8 {q14}, [r1], r3 @load pred r1 - vld1.u8 {q13}, [r0], r2 @load src r2 - vld1.u8 {q12}, [r1], r3 @load pred r2 + vld1.u8 {q13}, [r0], r2 @load src r2 + vld1.u8 {q12}, [r1], r3 @load pred r2 - vld1.u8 {q11}, [r0], r2 @load src r3 - vld1.u8 {q10}, [r1], r3 @load pred r2 + vld1.u8 {q11}, [r0], r2 @load src r3 + vld1.u8 {q10}, [r1], r3 @load pred r2 - vld1.u8 {q9}, [r0], r2 @load src r4 - vld1.u8 {q8}, [r1], r3 @load pred r4 + vld1.u8 {q9}, [r0], r2 @load src r4 + vld1.u8 {q8}, [r1], r3 @load pred r4 - cmp r8, #2 - beq sad_epilouge + cmp r8, #2 + beq sad_epilouge sad_loop: - vabal.u8 q0, d30, d28 @I accumulate Abs diff R1 - vabal.u8 q1, d31, d29 @I accumulate Abs diff R1 + vabal.u8 q0, d30, d28 @I accumulate Abs diff R1 + vabal.u8 q1, d31, d29 @I accumulate Abs diff R1 - vld1.u8 {q15}, [r0], r2 @II load r1 src - vabal.u8 q0, d26, d24 @I accumulate Abs diff R2 + vld1.u8 {q15}, [r0], r2 @II load r1 src + vabal.u8 q0, d26, d24 @I accumulate Abs diff R2 - vld1.u8 {q14}, [r1], r3 @II load r1 pred - vabal.u8 q1, d27, d25 @I accumulate Abs diff R2 + vld1.u8 {q14}, [r1], r3 @II load r1 pred + vabal.u8 q1, d27, d25 @I accumulate Abs diff R2 - vld1.u8 {q13}, [r0], r2 @II load r3 src - vabal.u8 q0, d22, d20 @I accumulate Abs diff R3 + vld1.u8 {q13}, [r0], r2 @II load r3 src + vabal.u8 q0, d22, d20 @I accumulate Abs diff R3 - vld1.u8 {q12}, [r1], r3 @II load r2 pred - vabal.u8 q1, d23, d21 @I accumulate Abs diff R3 + vld1.u8 {q12}, [r1], r3 @II load r2 pred + vabal.u8 q1, d23, d21 @I accumulate Abs diff R3 - vld1.u8 {q11}, [r0], r2 @II load r3 src - vabal.u8 q0, d18, d16 @I accumulate Abs diff R4 + vld1.u8 {q11}, [r0], r2 @II load r3 src + vabal.u8 q0, d18, d16 @I accumulate Abs diff R4 - sub r8, r8, #2 @Since we processe 16 pix @a time, dec by 2 - vld1.u8 {q10}, [r1], r3 @II load r3 pred - vabal.u8 q1, d19, d17 @I accumulate Abs diff R4 + sub r8, r8, #2 @Since we processe 16 pix @a time, dec by 2 + vld1.u8 {q10}, [r1], r3 @II load r3 pred + vabal.u8 q1, d19, d17 @I accumulate Abs diff R4 - cmp r8, #2 @Check if last loop - vld1.u8 {q9}, [r0], r2 @II load r4 src - vld1.u8 {q8}, [r1], r3 @II load r4 pred + cmp r8, #2 @Check if last loop + vld1.u8 {q9}, [r0], r2 @II load r4 src + vld1.u8 {q8}, [r1], r3 @II load r4 pred - bne sad_loop @Go back to SAD computation + bne sad_loop @Go back to SAD computation sad_epilouge: - vabal.u8 q0, d30, d28 @Accumulate Abs diff R1 - vabal.u8 q1, d31, d29 @Accumulate Abs diff R1 + vabal.u8 q0, d30, d28 @Accumulate Abs diff R1 + vabal.u8 q1, d31, d29 @Accumulate Abs diff R1 - vabal.u8 q0, d26, d24 @Accumulate Abs diff R2 - vabal.u8 q1, d27, d25 @Accumulate Abs diff R2 + vabal.u8 q0, d26, d24 @Accumulate Abs diff R2 + vabal.u8 q1, d27, d25 @Accumulate Abs diff R2 - vabal.u8 q0, d22, d20 @Accumulate Abs diff R3 - vabal.u8 q1, d23, d21 @Aaccumulate Abs diff R3 + vabal.u8 q0, d22, d20 @Accumulate Abs diff R3 + vabal.u8 q1, d23, d21 @Aaccumulate Abs diff R3 - vabal.u8 q0, d18, d16 @Accumulate Abs diff R4 - vabal.u8 q1, d19, d17 @Accumulate Abs diff R4 + vabal.u8 q0, d18, d16 @Accumulate Abs diff R4 + vabal.u8 q1, d19, d17 @Accumulate Abs diff R4 - vadd.u16 q2, q0, q1 @ADD two accumulators - vadd.u16 d6, d4, d5 @Add two blk sad - vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad - vpadd.u16 d10, d8, d9 @Add col 12-34 sad + vadd.u16 q2, q0, q1 @ADD two accumulators + vadd.u16 d6, d4, d5 @Add two blk sad + vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad + vpadd.u16 d10, d8, d9 @Add col 12-34 sad - vmov.u16 r9, d10[0] @move SAD to ARM - ADD R10, R10, R9 @Add to the global sad + vmov.u16 r9, d10[0] @move SAD to ARM + ADD R10, R10, R9 @Add to the global sad funcend_sad_16x16: @End of fucntion process - ldr r5, [sp, #44] - ldr r6, [sp, #48] - str r7, [r6] @Store the is zero reg - str r10, [r5] @Store sad + vpop {d8-d15} + ldr r5, [sp, #44] + ldr r6, [sp, #48] + + str r7, [r6] @Store the is zero reg + str r10, [r5] @Store sad @SUB SP,SP,#40 - pop {r4-r12, pc} + pop {r4-r12, pc} diff --git a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s index c442077..e768c21 100644 --- a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s +++ b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s @@ -76,7 +76,7 @@ .p2align 2 .include "ih264_neon_macros.s" -.globl ih264e_evaluate_intra16x16_modes_av8 +.global ih264e_evaluate_intra16x16_modes_av8 ih264e_evaluate_intra16x16_modes_av8: diff --git a/encoder/armv8/ih264e_half_pel_av8.s b/encoder/armv8/ih264e_half_pel_av8.s index 6dbd8f8..817faa6 100644 --- a/encoder/armv8/ih264e_half_pel_av8.s +++ b/encoder/armv8/ih264e_half_pel_av8.s @@ -1015,10 +1015,3 @@ filter_2dvh_skip_row: ///***************************************** - - - - - - - .section .note.gnu-stack,"",%progbits diff --git a/encoder/armv8/ime_distortion_metrics_av8.s b/encoder/armv8/ime_distortion_metrics_av8.s index 99ebc8a..47c3425 100644 --- a/encoder/armv8/ime_distortion_metrics_av8.s +++ b/encoder/armv8/ime_distortion_metrics_av8.s @@ -975,4 +975,3 @@ satdq_end_func: ldp d8, d9, [sp], #16 pop_v_regs ret - .section .note.gnu-stack,"",%progbits diff --git a/encoder/x86/ih264e_intra_modes_eval_ssse3.c b/encoder/x86/ih264e_intra_modes_eval_ssse3.c index 657921f..0f4a9ad 100644 --- a/encoder/x86/ih264e_intra_modes_eval_ssse3.c +++ b/encoder/x86/ih264e_intra_modes_eval_ssse3.c @@ -487,7 +487,7 @@ void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; WORD32 min_cost; - WORD32 lambda4 = u4_lambda << 2; + UWORD32 lambda4 = u4_lambda << 2; WORD32 dst_strd2, dst_strd3; __m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b; diff --git a/encoder/x86/ime_distortion_metrics_sse42.c b/encoder/x86/ime_distortion_metrics_sse42.c index 0876788..baf18a4 100644 --- a/encoder/x86/ime_distortion_metrics_sse42.c +++ b/encoder/x86/ime_distortion_metrics_sse42.c @@ -110,6 +110,7 @@ void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src, __m128i res_r0, res_r1, res_r2, res_r3; __m128i sad_val; int val1, val2; + UNUSED (i4_max_sad); // Row 0-3 sad calculation src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); @@ -248,6 +249,7 @@ void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src, WORD32 i4_max_sad, WORD32 *pi4_mb_distortion) { + UNUSED (i4_max_sad); __m128i src_r0, src_r1, src_r2, src_r3; __m128i est_r0, est_r1, est_r2, est_r3; __m128i res_r0, res_r1, res_r2, res_r3; @@ -498,6 +500,7 @@ void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src, WORD32 i4_max_sad, WORD32 *pi4_mb_distortion) { + UNUSED (i4_max_sad); __m128i src_r0, src_r1, src_r2, src_r3; __m128i est_r0, est_r1, est_r2, est_r3; __m128i res_r0, res_r1, res_r2, res_r3; diff --git a/test/encoder/main.c b/test/encoder/main.c index 2a9635d..bb9cabf 100644 --- a/test/encoder/main.c +++ b/test/encoder/main.c @@ -29,7 +29,10 @@ #include <assert.h> #include <string.h> #include <sys/time.h> + +#ifndef IOS #include <malloc.h> +#endif #ifdef WINDOWS_TIMER #include "windows.h" @@ -1989,7 +1992,7 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt) /* 20 11 2013 100189 Initial Version */ /*****************************************************************************/ #ifdef IOS -int h264enc_main(char * homedir) +int h264enc_main(char * homedir,char *documentdir, int screen_wd, int screen_ht) #else int main(int argc, char *argv[]) #endif @@ -2036,6 +2039,9 @@ int main(int argc, char *argv[]) strcpy(ac_cfg_fname, argv[1]); } +#else + strcpy(ac_cfg_fname, "test.cfg"); + #endif /*************************************************************************/ @@ -2406,22 +2412,22 @@ int main(int argc, char *argv[]) #ifdef IOS /* Correct file paths */ - sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_ip_fname); + sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_ip_fname); strcpy (s_app_ctxt.ac_ip_fname, filename_with_path); - sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_op_fname); + sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_op_fname); strcpy (s_app_ctxt.ac_op_fname, filename_with_path); - sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_recon_fname); + sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_recon_fname); strcpy (s_app_ctxt.ac_recon_fname, filename_with_path); - sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_chksum_fname); + sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_chksum_fname); strcpy (s_app_ctxt.ac_chksum_fname, filename_with_path); - sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_mb_info_fname); + sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_mb_info_fname); strcpy (s_app_ctxt.ac_mb_info_fname, filename_with_path); - sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_pic_info_fname); + sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_pic_info_fname); strcpy (s_app_ctxt.ac_pic_info_fname, filename_with_path); #endif |