diff options
author | Hamsalekha S <hamsalekha.s@ittiam.com> | 2015-03-13 21:24:58 +0530 |
---|---|---|
committer | Hamsalekha S <hamsalekha.s@ittiam.com> | 2015-04-02 15:59:02 +0530 |
commit | 8d3d303c7942ced6a987a52db8977d768dc3605f (patch) | |
tree | cc806c96794356996b13ba9970941d0aed74a97e /encoder | |
parent | 3956d913d37327dcb340f836e604b04bd478b158 (diff) | |
download | android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.gz android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.tar.bz2 android_external_libavc-8d3d303c7942ced6a987a52db8977d768dc3605f.zip |
Initial version
Change-Id: I7efe9a589cd24edf86e8d086b40c27cbbf8b4017
Diffstat (limited to 'encoder')
119 files changed, 59630 insertions, 0 deletions
@ Origin: diff --git a/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s
@         b/encoder/arm/ih264e_evaluate_intra16x16_modes_a9q.s
@         (new file mode 100755, index 0000000..fe0ce17, hunk @@ -0,0 +1,313 @@)
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@/**
@******************************************************************************
@*
@* @brief  Evaluate the VERT, HORZ and DC intra 16x16 modes, select the one
@*         with the smallest SAD and write its prediction to pu1_dst.
@*
@* @par Description
@*  Syntax: GNU as, ARM (A32) + NEON.  ABI: AAPCS (r0-r3 + stack arguments).
@*  The SAD of the 16x16 source block against each of the three predictions is
@*  accumulated in one pass over the source rows.  Modes whose bit is clear in
@*  u4_valid_intra_modes get the sentinel SAD (1 << 30).  The winning SAD and
@*  mode are stored through pu4_sadmin / u4_intra_mode and 16 rows of 16 bytes
@*  of prediction are stored to pu1_dst.
@*
@*  Mode selection (signed compares):
@*      VERT if vert <= horz and vert <= dc,
@*      else HORZ if horz < vert and horz <= dc,
@*      else DC.
@*
@* @param[in]  pu1_src              (r0) UWORD8 pointer to the source block
@* @param[in]  pu1_ngbr_pels_i16    (r1) UWORD8 pointer to neighbouring pels.
@*                                  As read here: bytes 0..15 are the left
@*                                  column (byte 15 is used for row 0, byte 0
@*                                  for row 15), byte 16 is skipped, bytes
@*                                  17..32 are the top row.
@* @param[out] pu1_dst              (r2) UWORD8 pointer to the destination
@* @param[in]  src_strd             (r3) source stride in bytes
@* @param[in]  dst_strd             (stack) destination stride in bytes
@* @param[in]  u4_n_avblty          (stack) neighbour availability;
@*                                  bit 0 (1) = left, bit 2 (4) = top
@* @param[out] u4_intra_mode        (stack) pointer; receives 0 = VERT,
@*                                  1 = HORZ, 2 = DC
@* @param[out] pu4_sadmin           (stack) pointer; receives the minimum SAD
@* @param[in]  u4_valid_intra_modes (stack) bit 0 (1) = VERT, bit 1 (2) = HORZ,
@*                                  bit 2 (4) = DC
@*
@* @return none
@*
@* @remarks
@*  NOTE(review): the DC value sums all 32 left+top neighbour bytes regardless
@*  of availability, so it appears to rely on the caller zero-filling the
@*  unavailable neighbours - confirm against the caller.
@*
@******************************************************************************
@*/
@
@void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
@                                      UWORD8 *pu1_ngbr_pels_i16,
@                                      UWORD8 *pu1_dst,
@                                      UWORD32 src_strd,
@                                      UWORD32 dst_strd,
@                                      WORD32 u4_n_avblty,
@                                      UWORD32 *u4_intra_mode,
@                                      WORD32 *pu4_sadmin,
@                                      UWORD32 u4_valid_intra_modes)
@
.text
.p2align 2

    .global ih264e_evaluate_intra16x16_modes_a9q

ih264e_evaluate_intra16x16_modes_a9q:

@ Register roles
@   r0 = pu1_src (later: u4_valid_intra_modes)
@   r1 = pu1_ngbr_pels_i16 (later: row counter)
@   r2 = pu1_dst
@   r3 = src_strd
@   r4 = DC bias (128 when no neighbour is available), later dst_strd
@   r5 = u4_n_avblty
@   r6 = pointer to the current left pel, later u4_intra_mode
@   r7 = scratch, later pu4_sadmin
@   r8 / r9 / r10 = SAD of VERT / HORZ / DC
@   r11 = sentinel SAD (1 << 30)
@
@ Stack layout: 40 bytes of core registers (stmfd) + 64 bytes of d8-d15
@ (vpush), so the 5th C argument sits at [sp, #40] before the vpush and at
@ [sp, #104] after it.

    stmfd         sp!, {r4-r12, r14}    @ save callee-saved core registers
    ldr           r5, [sp, #44]         @ r5 = u4_n_avblty (before vpush)

    vpush         {d8-d15}              @ q4-q7 are callee-saved under AAPCS
    vld1.32       {q4}, [r1]!           @ q4 = left column, ngbr[0..15]
    sub           r6, r1, #1            @ r6 -> ngbr[15], left pel of row 0
    add           r1, r1, #1            @ skip ngbr[16]
    mov           r10, #0               @ r10 = left-available flag
    vld1.32       {q5}, [r1]!           @ q5 = top row, ngbr[17..32]
    mov           r11, #0               @ r11 = top-available flag
    mov           r4, #0                @ r4 = DC bias
    @ left available?
    ands          r7, r5, #01
    movne         r10, #1

    @ top available?
    ands          r8, r5, #04
    lsl           r9, r10, #3           @ rounding term 8 per available edge
    movne         r11, #1               @ still uses the flags of the ands above
    lsl           r12, r11, #3
    adds          r8, r9, r12           @ r8 = rounding; Z set when none available

    @ none available: DC value becomes 128
    moveq         r4, #128

    @ ---------------- DC value ----------------
    vaddl.u8      q15, d8, d9
    vaddl.u8      q14, d10, d11
    vadd.u16      q15, q14, q15
    vadd.u16      d30, d31, d30
    vpadd.u16     d30, d30
    vpadd.u16     d30, d30              @ d30[0] = sum of the 32 neighbour bytes

    vmov.u16      r7, d30[0]
    add           r7, r7, r8            @ + rounding
    add           r11, r11, #3
    add           r8, r10, r11          @ shift = 3 + left + top

    lsr           r7, r8
    add           r7, r4, r7            @ + 128 when nothing is available
    vld1.32       {q0}, [r0], r3        @ source row 0
    vdup.8        q15, r7               @ q15 = DC prediction row

    @ ---------------- SAD, row 0 ----------------
    ldrb          r7, [r6]
    vdup.8        q10, r7               @ HORZ prediction for row 0
    vabdl.u8      q8, d0, d10           @ VERT row 0
    vabdl.u8      q9, d1, d11
    sub           r6, r6, #1            @ next left pel (walks downward in memory)
    vabdl.u8      q13, d0, d20          @ HORZ row 0
    vabdl.u8      q14, d1, d21
    mov           r1, #15               @ 15 rows remaining
    vabdl.u8      q11, d0, d30          @ DC row 0
    vabdl.u8      q12, d1, d31

    @ ---------------- SAD, rows 1..15 ----------------
.Lintra16x16_sad_loop:
    vld1.32       {q1}, [r0], r3        @ source row i
    vabal.u8      q11, d2, d30          @ DC row i
    ldrb          r7, [r6]
    vabal.u8      q12, d3, d31

    vabal.u8      q8, d2, d10           @ VERT row i
    vdup.8        q10, r7               @ HORZ prediction for row i
    sub           r6, r6, #1
    vabal.u8      q9, d3, d11

    subs          r1, r1, #1
    vabal.u8      q13, d2, d20          @ HORZ row i
    vabal.u8      q14, d3, d21
    bne           .Lintra16x16_sad_loop

    @ ---------------- horizontal reduction of the three SADs ----------------
    vadd.i16      q9, q9, q8            @ VERT
    vadd.i16      d18, d19, d18         @ VERT
    vpaddl.u16    d18, d18              @ VERT
    vadd.i16      q14, q13, q14         @ HORZ
    vadd.i16      d28, d29, d28         @ HORZ
    vpaddl.u32    d18, d18              @ VERT
    vpaddl.u16    d28, d28              @ HORZ

    vpaddl.u32    d28, d28              @ HORZ
    vmov.u32      r8, d18[0]            @ r8 = SAD(VERT)
    vadd.i16      q12, q11, q12         @ DC
    vmov.u32      r9, d28[0]            @ r9 = SAD(HORZ)
    mov           r11, #1
    vadd.i16      d24, d24, d25         @ DC
    lsl           r11, #30              @ r11 = sentinel for invalid modes

    ldr           r0, [sp, #120]        @ r0 = u4_valid_intra_modes
    ands          r7, r0, #01           @ VERT valid?
    moveq         r8, r11
    vpaddl.u16    d24, d24              @ DC

    ands          r6, r0, #02           @ HORZ valid?
    moveq         r9, r11
    vpaddl.u32    d24, d24              @ DC

    vmov.u32      r10, d24[0]           @ r10 = SAD(DC)
    ldr           r4, [sp, #104]        @ r4 = dst_strd
    ldr           r7, [sp, #116]        @ r7 = pu4_sadmin
    ands          r6, r0, #04           @ DC valid?
    moveq         r10, r11

    ldr           r6, [sp, #112]        @ r6 = u4_intra_mode

    @ ---------------- mode decision ----------------
    cmp           r8, r9
    bgt           not_vert
    cmp           r8, r10
    bgt           do_dc

    @ VERT wins: every row is the top row
    str           r8, [r7]              @ *pu4_sadmin = SAD(VERT)
    mov           r8, #0
    str           r8, [r6]              @ *u4_intra_mode = 0 (VERT)
    vmov          q15, q5               @ shared store path writes q15

    b             .Lintra16x16_store_dc_vert

not_vert:
    cmp           r9, r10
    bgt           do_dc

    @ HORZ wins: row i is the left pel of row i replicated 16 times.
    @ Broadcasts and stores are interleaved; q4 (the left column) must stay
    @ intact until the last vdup below.
    vdup.8        q5, d9[7]             @ row 0
    str           r9, [r7]              @ *pu4_sadmin = SAD(HORZ)
    vdup.8        q6, d9[6]             @ row 1
    mov           r9, #1
    vdup.8        q7, d9[5]             @ row 2
    vst1.32       {d10, d11}, [r2], r4  @ store row 0
    vdup.8        q8, d9[4]             @ row 3
    str           r9, [r6]              @ *u4_intra_mode = 1 (HORZ)
    vdup.8        q9, d9[3]             @ row 4
    vst1.32       {d12, d13}, [r2], r4  @ store row 1
    vdup.8        q10, d9[2]            @ row 5
    vst1.32       {d14, d15}, [r2], r4  @ store row 2
    vdup.8        q11, d9[1]            @ row 6
    vst1.32       {d16, d17}, [r2], r4  @ store row 3
    vdup.8        q12, d9[0]            @ row 7
    vst1.32       {d18, d19}, [r2], r4  @ store row 4
    vdup.8        q13, d8[7]            @ row 8
    vst1.32       {d20, d21}, [r2], r4  @ store row 5
    vdup.8        q14, d8[6]            @ row 9
    vst1.32       {d22, d23}, [r2], r4  @ store row 6
    vdup.8        q15, d8[5]            @ row 10
    vst1.32       {d24, d25}, [r2], r4  @ store row 7
    vdup.8        q1, d8[4]             @ row 11
    vst1.32       {d26, d27}, [r2], r4  @ store row 8
    vdup.8        q2, d8[3]             @ row 12
    vst1.32       {d28, d29}, [r2], r4  @ store row 9
    vdup.8        q3, d8[2]             @ row 13
    vst1.32       {d30, d31}, [r2], r4  @ store row 10
    vdup.8        q5, d8[1]             @ row 14
    vst1.32       {d2, d3}, [r2], r4    @ store row 11
    vdup.8        q6, d8[0]             @ row 15
    vst1.32       {d4, d5}, [r2], r4    @ store row 12

    vst1.32       {d6, d7}, [r2], r4    @ store row 13

    vst1.32       {d10, d11}, [r2], r4  @ store row 14

    vst1.32       {d12, d13}, [r2], r4  @ store row 15
    b             .Lintra16x16_end

do_dc:
    @ DC wins: q15 already holds the DC row
    str           r10, [r7]             @ *pu4_sadmin = SAD(DC)
    mov           r10, #2
    str           r10, [r6]             @ *u4_intra_mode = 2 (DC)
.Lintra16x16_store_dc_vert:
    @ VERT and DC both store the same 16-byte row (q15) sixteen times
    vst1.32       {d30, d31}, [r2], r4  @ row 0
    vst1.32       {d30, d31}, [r2], r4  @ row 1
    vst1.32       {d30, d31}, [r2], r4  @ row 2
    vst1.32       {d30, d31}, [r2], r4  @ row 3
    vst1.32       {d30, d31}, [r2], r4  @ row 4
    vst1.32       {d30, d31}, [r2], r4  @ row 5
    vst1.32       {d30, d31}, [r2], r4  @ row 6
    vst1.32       {d30, d31}, [r2], r4  @ row 7
    vst1.32       {d30, d31}, [r2], r4  @ row 8
    vst1.32       {d30, d31}, [r2], r4  @ row 9
    vst1.32       {d30, d31}, [r2], r4  @ row 10
    vst1.32       {d30, d31}, [r2], r4  @ row 11
    vst1.32       {d30, d31}, [r2], r4  @ row 12
    vst1.32       {d30, d31}, [r2], r4  @ row 13
    vst1.32       {d30, d31}, [r2], r4  @ row 14
    vst1.32       {d30, d31}, [r2], r4  @ row 15

.Lintra16x16_end:
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, pc}     @ restore registers and return


@ Origin of the section that follows:
@         diff --git a/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s
@         b/encoder/arm/ih264e_evaluate_intra4x4_modes_a9q.s
@         (new file mode 100755, index 0000000..568e623, hunk @@ -0,0 +1,529 @@)
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt.
@ * Ltd, Bangalore
@*/

@ Jump table used by the final prediction step: entry k is the address of the
@ code that moves the prediction of intra 4x4 mode k into q15.
.data
.p2align 2

scratch_intrapred_luma_4x4_prediction:
    .long ver, hor, d_c, dia_dl
    .long dia_dr, ver_r, hor_d, ver_l
    .long hor_u


.text
.p2align 2

@ PC-relative offset of the table above.  The "- 8" compensates for the ARM
@ state pipeline: pc reads as the address of the current instruction + 8 at
@ the "add r3, r3, pc" located at scrintra_4x4.
scratch_intrapred_luma_4x4_prediction_addr1:
    .long scratch_intrapred_luma_4x4_prediction - scrintra_4x4 - 8



@/**
@******************************************************************************
@*
@* @brief  Evaluate the intra 4x4 modes, select the one with the smallest cost
@*         and write its prediction to pu1_dst.
@*
@* @par Description
@*  Syntax: GNU as, ARM (A32) + NEON.  ABI: AAPCS (r0-r3 + stack arguments).
@*  For every mode whose bit is set in u4_valid_intra_modes the 4x4 prediction
@*  is built, its SAD against the source is computed and
@*      cost = SAD + (mode == u4_predictd_mode ? u4_lambda : 4 * u4_lambda).
@*  The minimum cost and its mode are stored through pu4_sadmin and
@*  u4_intra_mode, and the 4x4 prediction of that mode is stored to pu1_dst.
@*  The running minimum starts at (1 << 30) and is replaced only by a strictly
@*  smaller cost, so the lowest-numbered mode wins ties.
@*
@* @param[in]  pu1_src              (r0) UWORD8 pointer to the source block
@* @param[in]  pu1_ngbr_pels        (r1) UWORD8 pointer to neighbouring pels;
@*                                  16 bytes are loaded from it.  The DC code
@*                                  sums bytes 0..3 and 5..8 (presumably left
@*                                  and top, with byte 4 the top-left pel -
@*                                  confirm against the caller).
@* @param[out] pu1_dst              (r2) UWORD8 pointer to the destination
@* @param[in]  src_strd             (r3) source stride in bytes
@* @param[in]  dst_strd             (stack) destination stride in bytes
@* @param[in]  u4_n_avblty          (stack) neighbour availability;
@*                                  bit 0 (1) and bit 2 (4) are tested for DC
@* @param[out] u4_intra_mode        (stack) pointer; receives the best mode
@*                                  0 = VERT, 1 = HORZ, 2 = DC, 3 = DIAG_DL,
@*                                  4 = DIAG_DR, 5 = VERT_R, 6 = HORZ_D,
@*                                  7 = VERT_L, 8 = HORZ_U
@* @param[out] pu4_sadmin           (stack) pointer; receives the minimum cost
@* @param[in]  u4_valid_intra_modes (stack) bit k set => mode k is evaluated
@* @param[in]  u4_lambda            (stack) lambda used to turn SAD into cost
@* @param[in]  u4_predictd_mode     (stack) predicted mode for the cost term
@*
@* @return none
@*
@* @remarks
@*  If no mode is evaluated, mode 0 (VERT) is reported with cost (1 << 30) and
@*  the VERT prediction is written; see the initialisation of r12 below.
@*
@******************************************************************************
@*/
@void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
@                                     UWORD8 *pu1_ngbr_pels,
@                                     UWORD8 *pu1_dst,
@                                     UWORD32 src_strd,
@                                     UWORD32 dst_strd,
@                                     WORD32 u4_n_avblty,
@                                     UWORD32 *u4_intra_mode,
@                                     WORD32 *pu4_sadmin,
@                                     UWORD32 u4_valid_intra_modes,
@                                     UWORD32 u4_lambda,
@                                     UWORD32 u4_predictd_mode)



    .global ih264e_evaluate_intra_4x4_modes_a9q

ih264e_evaluate_intra_4x4_modes_a9q:

@ Register roles
@   r0  = pu1_src, later u4_lambda
@   r1  = pu1_ngbr_pels, later u4_predictd_mode
@   r2  = pu1_dst
@   r3  = src_strd, later jump-table address
@   r4  = DC value, later dst_strd
@   r5  = u4_n_avblty
@   r6  = mode-cost term / scratch, later u4_intra_mode
@   r7  = pu4_sadmin
@   r8  = u4_valid_intra_modes
@   r9  = cost of the mode being evaluated
@   r11 = minimum cost so far
@   r12 = best mode so far
@   q10 (d20, d21) = the 4x4 source block, one row per 32-bit lane
@
@ Stack layout: 40 bytes of core registers (stmfd) + 64 bytes of d8-d15
@ (vpush), so the 5th C argument sits at [sp, #40] before the vpush and at
@ [sp, #104] after it.

    stmfd         sp!, {r4-r12, r14}    @ save callee-saved core registers

    ldr           r5, [sp, #44]         @ r5 = u4_n_avblty (before vpush)
    vpush         {d8-d15}              @ q4-q7 are callee-saved under AAPCS

    @ load the neighbours
    vld1.32       {q0}, [r1]
    add           r4, r1, #12
    vld1.8        d1[5], [r4]           @ q0 byte 13 = ngbr[12]
    vld1.8        d1[7], [r1]           @ q0 byte 15 = ngbr[0]
    ldr           r8, [sp, #120]        @ r8 = u4_valid_intra_modes

    @ load the 4x4 source block into q10
    vld1.32       {d20[0]}, [r0], r3
    vext.8        q1, q0, q0, #1
    vld1.32       {d20[1]}, [r0], r3
    mov           r11, #1
    vld1.32       {d21[0]}, [r0], r3
    lsl           r11, r11, #30         @ r11 = initial minimum cost (1 << 30)
    vld1.32       {d21[1]}, [r0], r3

    ldr           r0, [sp, #124]        @ r0 = u4_lambda
    ldr           r1, [sp, #128]        @ r1 = u4_predictd_mode

    @ Default best mode.  Without this, r12 holds whatever the caller left in
    @ it when no mode bit is set, and that value is stored to *u4_intra_mode
    @ and used to index the jump table at "pred".  VERT is the default because
    @ its prediction path ("ver") needs only q0, which is always loaded.
    mov           r12, #0

vert:
    ands          r10, r8, #01          @ VERT requested?
    beq           horz
    vdup.32       q2, d2[1]
    vabdl.u8      q14, d4, d20
    vabal.u8      q14, d4, d21
    vadd.i16      d28, d29, d28
    subs          r6, r1, #0            @ Z set when predicted mode == VERT
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2            @ not predicted: 4 * lambda
    vpaddl.u32    d28, d28
    moveq         r6, r0                @ predicted: lambda
    vmov.u32      r9, d28[0]            @ r9 = SAD(VERT)
    add           r9, r6, r9            @ r9 = cost

    subs          r6, r11, r9
    movgt         r11, r9               @ new minimum
    movgt         r12, #0

horz:
    ands          r10, r8, #02          @ HORZ requested?
    beq           dc
    vdup.32       q3, d0[0]
    vmov.32       q4, q3
    vtrn.8        q3, q4
    vtrn.16       d7, d6
    vtrn.16       d9, d8
    vtrn.32       d9, d7
    vtrn.32       d8, d6
    vabdl.u8      q14, d6, d20
    subs          r6, r1, #1            @ Z set when predicted mode == HORZ
    vabal.u8      q14, d7, d21
    vadd.i16      d28, d29, d28
    lslne         r6, r0, #2
    vpaddl.u16    d28, d28
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]
    moveq         r6, r0
    add           r9, r6, r9

    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #1

dc:
    ands          r10, r8, #04          @ DC requested?
    beq           diags
    vext.8        q4, q0, q0, #5
    vaddl.u8      q4, d0, d8            @ ngbr[i] + ngbr[i + 5]
    vpaddl.u16    d8, d8
    vpaddl.u32    d8, d8
    vmov.u32      r4, d8[0]             @ r4 = ngbr[0..3] + ngbr[5..8]
    mov           r14, #1               @ r14 = shift, r4 also gets the rounding
    ands          r10, r5, #1
    addne         r4, r4, #2
    addne         r14, r14, #1
    ands          r10, r5, #4
    addne         r4, r4, #2
    addne         r14, r14, #1
    ands          r10, r5, #5
    moveq         r4, #128              @ neither edge available: DC = 128
    moveq         r14, #0
    subs          r6, r1, #2            @ Z set when predicted mode == DC
    lsr           r4, r4, r14           @ does not touch the flags
    vdup.8        q4, r4
    lslne         r6, r0, #2
    vabdl.u8      q14, d8, d20
    vabal.u8      q14, d9, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]

    moveq         r6, r0
    add           r9, r6, r9

    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #2

diags:
    ands          r10, r8, #504         @ any mode other than VERT, HORZ, DC requested?
    beq           pred
    @ FILT11 ((a + b + 1) >> 1) -> q5 and FILT121 ((a + 2b + c + 2) >> 2) -> q6
    @ over all neighbour values.
    @ (A dead "vadd.u16 q12, q10, q11" that read the never-written q11 and was
    @ overwritten before use has been removed here.)
    vext.8        q5, q0, q0, #2
    vaddl.u8      q6, d0, d2
    vaddl.u8      q7, d1, d3
    vaddl.u8      q8, d10, d2
    vaddl.u8      q9, d11, d3
    vqrshrun.s16  d10, q6, #1
    vqrshrun.s16  d11, q7, #1
    vadd.u16      q11, q6, q8
    vadd.u16      q12, q7, q9
    vqrshrun.s16  d12, q11, #2
    vqrshrun.s16  d13, q12, #2
    mov           r14, #0
    vdup.32       q13, r14
    mov           r14, #-1
    vmov.i32      d26[0], r14           @ d26 = mask selecting the low 32-bit lane

diag_dl:
    ands          r10, r8, #0x08        @ DIAG_DL requested?
    beq           diag_dr

    vext.8        q15, q6, q6, #5
    vbit.32       d14, d30, d26
    vext.8        q15, q6, q6, #15
    vbit.32       d15, d31, d26
    vext.8        q15, q6, q6, #2
    vext.32       q14, q13, q13, #3     @ d28 = mask selecting the high 32-bit lane
    vbit.32       d14, d30, d28
    vext.8        q15, q6, q6, #4
    vbit.32       d15, d30, d28
    vabdl.u8      q14, d14, d20
    subs          r6, r1, #3
    vabal.u8      q14, d15, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]

    moveq         r6, r0
    add           r9, r6, r9

    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #3

diag_dr:
    ands          r10, r8, #16          @ DIAG_DR requested?
    beq           vert_r

    vext.8        q15, q6, q6, #3
    vbit.32       d16, d30, d26
    vext.8        q15, q6, q6, #1
    vbit.32       d17, d30, d26
    vext.8        q15, q6, q6, #4
    vext.32       q14, q13, q13, #3
    vbit.32       d17, d31, d28
    vext.8        q15, q6, q6, #6
    vbit.32       d16, d31, d28
    vabdl.u8      q14, d16, d20
    subs          r6, r1, #4
    vabal.u8      q14, d17, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]

    moveq         r6, r0
    add           r9, r6, r9

    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #4

vert_r:
    ands          r10, r8, #32          @ VERT_R requested?
    beq           horz_d
    vext.8        q15, q5, q5, #4
    vbit.32       d18, d30, d26
    vext.8        q15, q5, q5, #3
    vbit.32       d19, d30, d26
    vext.32       q14, q13, q13, #3
    vext.8        q15, q6, q6, #15
    vbit.32       d18, d30, d28
    vext.8        q15, q6, q6, #14
    vbit.32       d19, d30, d28
    mov           r14, #0
    vdup.32       q14, r14
    mov           r14, #0xff
    vmov.i8       d28[0], r14           @ d28 = mask selecting byte 0
    vext.8        q15, q6, q6, #2
    vbit.32       d19, d30, d28
    vext.32       q14, q14, q14, #3
    subs          r6, r1, #5
    vext.8        q15, q6, q6, #13
    vbit.32       d19, d30, d28
    lslne         r6, r0, #2
    vabdl.u8      q14, d18, d20
    vabal.u8      q14, d19, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]


    moveq         r6, r0
    add           r9, r6, r9

    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #5

horz_d:
    @ These three instructions run even when HORZ_D is not requested: the
    @ HORZ_U code below reads the q1 they produce.
    vmov.8        q1, q5
    vmov.8        q15, q6
    vzip.8        q1, q15

    ands          r10, r8, #64          @ HORZ_D requested?
    beq           vert_l
    vext.8        q15, q6, q6, #2
    vbit.32       d8, d30, d26
    mov           r14, #0
    vdup.32       q14, r14
    mov           r14, #0xff
    vmov.i8       d28[0], r14
    vext.8        q15, q5, q5, #3
    vbit.32       d8, d30, d28
    vext.8        q15, q1, q1, #2
    vbit.32       d9, d30, d26
    vext.32       q14, q13, q13, #3
    vbit.32       d8, d2, d28
    subs          r6, r1, #6
    vext.8        q15, q1, q1, #12
    vbit.32       d9, d30, d28
    vabdl.u8      q14, d8, d20
    vabal.u8      q14, d9, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]


    moveq         r6, r0
    add           r9, r6, r9

    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #6
vert_l:
    ands          r10, r8, #128         @ VERT_L requested?
    beq           horz_u
    vext.8        q15, q5, q5, #5
    vbit.32       d24, d30, d26
    vext.8        q15, q15, q15, #1
    vbit.32       d25, d30, d26
    vext.8        q15, q6, q6, #1
    vext.32       q14, q13, q13, #3
    vbit.32       d24, d30, d28
    vext.8        q15, q15, q15, #1
    subs          r6, r1, #7
    vbit.32       d25, d30, d28
    vabdl.u8      q14, d24, d20
    vabal.u8      q14, d25, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]

    moveq         r6, r0
    add           r9, r6, r9

    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #7

horz_u:
    ands          r10, r8, #256         @ HORZ_U requested?
    beq           pred
    vrev64.8      q5, q1
    vdup.8        q1, d0[0]
    vext.8        q6, q6, #7
    mov           r14, #0
    vdup.32       q14, r14
    mov           r14, #0xff
    vmov.i8       d28[0], r14
    vbit.32       d11, d13, d28
    movw          r14, #0xffff
    vmov.i16      d28[0], r14
    vext.8        q6, q5, q5, #7
    subs          r6, r1, #8
    vbit.32       d3, d12, d28
    vext.8        q6, q5, q5, #3
    vbit.32       d2, d12, d26
    vext.32       q14, q13, q13, #3
    vext.8        q6, q5, q5, #1
    vbit.32       d2, d12, d28
    vabdl.u8      q14, d2, d20
    vabal.u8      q14, d3, d21
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2
    vpaddl.u32    d28, d28
    vmov.u32      r9, d28[0]


    moveq         r6, r0
    add           r9, r6, r9

    subs          r6, r11, r9
    movgt         r11, r9
    movgt         r12, #8

pred:
    @ final prediction: report the result, then jump to the code of the
    @ winning mode through the jump table
    ldr           r7, [sp, #116]        @ r7 = pu4_sadmin
    ldr           r6, [sp, #112]        @ r6 = u4_intra_mode
    str           r11, [r7]             @ *pu4_sadmin = minimum cost
    str           r12, [r6]             @ *u4_intra_mode = best mode


    ldr           r3, scratch_intrapred_luma_4x4_prediction_addr1
scrintra_4x4:
    add           r3, r3, pc            @ r3 = address of the jump table
    lsl           r12, r12, #2          @ 4 bytes per entry
    add           r3, r3, r12

    ldr           r5, [r3]
    and           r5, r5, #0xfffffffe   @ clear bit 0 so bx stays in ARM state

    bx            r5


@ Each entry below leaves the selected 4x4 prediction in q15, one row per
@ 32-bit lane.  Apart from "ver" and "d_c" they reuse the vector computed
@ while that mode was being evaluated.
ver:
    vext.8        q0, q0, q0, #1
    vdup.32       q15, d0[1]
    b             store

hor:
    vmov.32       q15, q3
    b             store

d_c:
    vdup.8        q15, r4
    b             store

dia_dl:
    vmov.32       q15, q7
    b             store

dia_dr:
    vmov.32       q15, q8
    b             store

ver_r:
    vmov.32       q15, q9
    b             store

hor_d:
    vmov.32       q15, q4
    b             store

ver_l:
    vmov.32       q15, q12
    b             store

hor_u:
    vmov.32       q15, q1

store:
    @ store the four 4-byte rows to pu1_dst
    ldr           r4, [sp, #104]        @ r4 = dst_strd

    vst1.32       {d30[0]}, [r2], r4
    vst1.32       {d30[1]}, [r2], r4
    vst1.32       {d31[0]}, [r2], r4
    vst1.32       {d31[1]}, [r2], r4


.Lintra4x4_end_func:
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, pc}     @ restore registers and return


@ Origin of the section that follows:
@         diff --git a/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s
@         b/encoder/arm/ih264e_evaluate_intra_chroma_modes_a9q.s
@         (new file mode 100755, index 0000000..e4dfca8, hunk @@ -0,0 +1,346 @@)
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@/**
@******************************************************************************
@*
@* @brief  Evaluate the DC, HORZ and VERT intra chroma modes, select the one
@*         with the smallest SAD and write its prediction to pu1_dst.
@*
@* @par Description
@*  Syntax: GNU as, ARM (A32) + NEON.  ABI: AAPCS (r0-r3 + stack arguments).
@*  The block handled here is 8 rows of 16 bytes, processed as 8 two-byte
@*  units per row (presumably interleaved U/V samples - confirm against the
@*  caller).  The SAD of the source against each of the three predictions is
@*  accumulated in one pass; modes whose bit is clear in u4_valid_intra_modes
@*  get the sentinel SAD (1 << 30).
@*
@*  Mode selection (signed compares):
@*      DC if dc <= horz and dc <= vert,
@*      else HORZ if horz < dc and horz <= vert,
@*      else VERT.
@*
@* @param[in]  pu1_src              (r0) UWORD8 pointer to the source block
@* @param[in]  pu1_ngbr_pels_i16    (r1) UWORD8 pointer to neighbouring pels.
@*                                  As read here: bytes 0..15 are the left
@*                                  column (the pair at byte 14 is used for
@*                                  row 0, the pair at byte 0 for row 7),
@*                                  bytes 16..17 are skipped, bytes 18..33
@*                                  are the top row.
@* @param[out] pu1_dst              (r2) UWORD8 pointer to the destination
@* @param[in]  src_strd             (r3) source stride in bytes
@* @param[in]  dst_strd             (stack) destination stride in bytes
@* @param[in]  u4_n_avblty          (stack) neighbour availability;
@*                                  bit 0 (1) = left, bit 2 (4) = top
@* @param[out] u4_intra_mode        (stack) pointer; receives 0 = DC,
@*                                  1 = HORZ, 2 = VERT
@* @param[out] pu4_sadmin           (stack) pointer; receives the minimum SAD
@* @param[in]  u4_valid_intra_modes (stack) bit 0 (1) = DC, bit 1 (2) = HORZ,
@*                                  bit 2 (4) = VERT
@*
@* @return none
@*
@******************************************************************************
@*/
@
@void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
@                                        UWORD8 *pu1_ngbr_pels_i16,
@                                        UWORD8 *pu1_dst,
@                                        UWORD32 src_strd,
@                                        UWORD32 dst_strd,
@                                        WORD32 u4_n_avblty,
@                                        UWORD32 *u4_intra_mode,
@                                        WORD32 *pu4_sadmin,
@                                        UWORD32 u4_valid_intra_modes)
@
.text
.p2align 2

    .global ih264e_evaluate_intra_chroma_modes_a9q

ih264e_evaluate_intra_chroma_modes_a9q:

@ Register roles
@   r0  = pu1_src (rows 0..3), later u4_valid_intra_modes
@   r1  = pu1_ngbr_pels_i16
@   r2  = pu1_dst
@   r3  = src_strd
@   r4  = dst_strd (loaded late)
@   r5  = u4_n_avblty
@   r6  = u4_intra_mode (loaded late)
@   r7  = availability bits, later pu4_sadmin
@   r8  = left-pel pointer for rows 0..3, later SAD(VERT)
@   r9  = current left pel pair, later SAD(HORZ)
@   r10 = left-pel pointer for rows 4..7, later SAD(DC)
@   r11 = loop counter, later sentinel SAD (1 << 30)
@   r12 = copy of pu1_ngbr_pels_i16, later pu1_src + 4 * src_strd (rows 4..7)
@   q14 = DC prediction for rows 0..3, q15 = DC prediction for rows 4..7
@
@ Stack layout: 40 bytes of core registers (stmfd) + 64 bytes of d8-d15
@ (vpush), so the 5th C argument sits at [sp, #40] before the vpush and at
@ [sp, #104] after it.

    stmfd         sp!, {r4-r12, r14}    @ save callee-saved core registers
    ldr           r5, [sp, #44]         @ r5 = u4_n_avblty (before vpush)
    mov           r12, r1               @ keep the neighbour pointer for the reload at the SAD stage
    vpush         {d8-d15}              @ q4-q7 are callee-saved under AAPCS
    vld1.32       {q4}, [r1]!           @ q4 = left column, ngbr[0..15]
    add           r1, r1, #2            @ skip ngbr[16..17]
    vld1.32       {q5}, [r1]!           @ q5 = top row, ngbr[18..33]

    vuzp.u8       q4, q5                @ split even bytes from odd bytes

    @ partial sums used by the DC predictions
    vpaddl.u8     d8, d8
    vpadd.u16     d8, d8

    vpaddl.u8     d9, d9
    vpadd.u16     d9, d9

    vpaddl.u8     d10, d10
    vpadd.u16     d10, d10

    vpaddl.u8     d11, d11

    and           r7, r5, #5            @ keep only the left (1) and top (4) bits
    vpadd.u16     d11, d11
    subs          r8, r7, #5
    beq           all_available
    subs          r8, r7, #4
    beq           top_available
    subs          r8, r7, #1
    beq           left_available
    mov           r10, #128             @ nothing available: DC = 128 everywhere
    vdup.8        q14, r10
    vdup.8        q15, r10
    b             sad

all_available:
    vzip.u16      q4, q5
    vext.16       q6, q4, q4, #2
    vadd.u16      q7, q5, q6
    vqrshrn.u16   d14, q7, #3
    vqrshrn.u16   d15, q4, #2
    vqrshrn.u16   d16, q5, #2
    vdup.16       d28, d14[0]
    vdup.16       d29, d16[1]
    vdup.16       d30, d15[0]
    vdup.16       d31, d14[1]
    b             sad
top_available:
    vzip.u16      q4, q5
    vqrshrn.u16   d16, q5, #2
    vdup.16       d28, d16[0]
    vdup.16       d29, d16[1]
    vdup.16       d30, d16[0]
    vdup.16       d31, d16[1]
    b             sad
left_available:
    vzip.u16      q4, q5
    vqrshrn.u16   d16, q4, #2
    vdup.16       d28, d16[3]
    vdup.16       d29, d16[3]
    vdup.16       d30, d16[2]
    vdup.16       d31, d16[2]


sad:
    @ reload the neighbours (q4/q5 were consumed by the DC computation)
    vld1.32       {q4}, [r12]!          @ q4 = left column
    sub           r8, r12, #2           @ r8 -> ngbr[14], left pair of row 0
    add           r12, r12, #2          @ skip ngbr[16..17]
    vld1.32       {q5}, [r12]!          @ q5 = top row
    add           r12, r0, r3, lsl #2   @ r12 -> source row 4
    sub           r10, r8, #8           @ r10 -> ngbr[6], left pair of row 4
    vld1.32       {q0}, [r0], r3        @ source row 0
    ldrh          r9, [r8]
    vdup.16       q10, r9               @ HORZ prediction for row 0

    vabdl.u8      q8, d0, d10           @ VERT row 0
    vabdl.u8      q9, d1, d11
    sub           r8, r8, #2
    vld1.32       {q1}, [r12], r3       @ source row 4

    vabdl.u8      q13, d0, d20          @ HORZ row 0
    vabdl.u8      q7, d1, d21
    ldrh          r9, [r10]
    vabdl.u8      q11, d0, d28          @ DC row 0
    vabdl.u8      q12, d1, d29


    vdup.16       q10, r9               @ HORZ prediction for row 4
    vabal.u8      q8, d2, d10           @ VERT row 4
    vabal.u8      q9, d3, d11
    sub           r10, r10, #2

    vabal.u8      q13, d2, d20          @ HORZ row 4
    vabal.u8      q7, d3, d21
    vabal.u8      q11, d2, d30          @ DC row 4
    vabal.u8      q12, d3, d31

    mov           r11, #3               @ three more row pairs: (1,5) (2,6) (3,7)

.Lchroma_sad_loop:
    vld1.32       {q0}, [r0], r3        @ source row i
    ldrh          r9, [r8]


    vabal.u8      q8, d0, d10           @ VERT row i
    vabal.u8      q9, d1, d11

    vdup.16       q10, r9               @ HORZ prediction for row i
    vld1.32       {q1}, [r12], r3       @ source row i + 4
    sub           r8, r8, #2
    vabal.u8      q13, d0, d20          @ HORZ row i
    vabal.u8      q7, d1, d21
    ldrh          r9, [r10]
    vabal.u8      q11, d0, d28          @ DC row i
    vabal.u8      q12, d1, d29
    sub           r10, r10, #2

    vdup.16       q10, r9               @ HORZ prediction for row i + 4
    vabal.u8      q8, d2, d10           @ VERT row i + 4
    vabal.u8      q9, d3, d11
    subs          r11, r11, #1

    vabal.u8      q13, d2, d20          @ HORZ row i + 4
    vabal.u8      q7, d3, d21
    vabal.u8      q11, d2, d30          @ DC row i + 4
    vabal.u8      q12, d3, d31
    bne           .Lchroma_sad_loop



    @ ---------------- horizontal reduction of the three SADs ----------------
    vadd.i16      q9, q9, q8            @ VERT
    vadd.i16      q7, q13, q7           @ HORZ
    vadd.i16      q12, q11, q12         @ DC
    vadd.i16      d18, d19, d18         @ VERT
    vadd.i16      d14, d15, d14         @ HORZ
    vadd.i16      d24, d24, d25         @ DC
    vpaddl.u16    d18, d18              @ VERT
    vpaddl.u16    d14, d14              @ HORZ
    vpaddl.u16    d24, d24              @ DC
    vpaddl.u32    d18, d18              @ VERT
    vpaddl.u32    d14, d14              @ HORZ
    vpaddl.u32    d24, d24              @ DC



    vmov.u32      r8, d18[0]            @ r8 = SAD(VERT)
    vmov.u32      r9, d14[0]            @ r9 = SAD(HORZ)
    vmov.u32      r10, d24[0]           @ r10 = SAD(DC)

    mov           r11, #1
    ldr           r0, [sp, #120]        @ r0 = u4_valid_intra_modes


    lsl           r11, #30              @ r11 = sentinel for invalid modes

    ands          r7, r0, #04           @ VERT valid?
    moveq         r8, r11

    ands          r6, r0, #02           @ HORZ valid?
    moveq         r9, r11

    ands          r6, r0, #01           @ DC valid?
    moveq         r10, r11


    ldr           r4, [sp, #104]        @ r4 = dst_strd
    ldr           r6, [sp, #112]        @ r6 = u4_intra_mode
    ldr           r7, [sp, #116]        @ r7 = pu4_sadmin

    @ ---------------- mode decision ----------------
    cmp           r10, r9
    bgt           not_dc
    cmp           r10, r8
    bgt           do_vert

    @ DC wins: q14 / q15 already hold the prediction
    str           r10, [r7]             @ *pu4_sadmin = SAD(DC)
    mov           r10, #0
    str           r10, [r6]             @ *u4_intra_mode = 0 (DC)
    b             .Lchroma_store_dc_vert

not_dc:
    cmp           r9, r8
    bgt           do_vert

    @ HORZ wins: row i is the left pair of row i replicated 8 times.
    @ Broadcasts and stores are interleaved; q4 (the left column) must stay
    @ intact until the last vdup below.
    vdup.16       q10, d9[3]            @ row 0
    str           r9, [r7]              @ *pu4_sadmin = SAD(HORZ)
    mov           r9, #1
    vdup.16       q11, d9[2]            @ row 1
    str           r9, [r6]              @ *u4_intra_mode = 1 (HORZ)
    vdup.16       q12, d9[1]            @ row 2
    vst1.32       {d20, d21}, [r2], r4  @ store row 0
    vdup.16       q13, d9[0]            @ row 3
    vst1.32       {d22, d23}, [r2], r4  @ store row 1
    vdup.16       q14, d8[3]            @ row 4
    vst1.32       {d24, d25}, [r2], r4  @ store row 2
    vdup.16       q15, d8[2]            @ row 5
    vst1.32       {d26, d27}, [r2], r4  @ store row 3
    vdup.16       q1, d8[1]             @ row 6
    vst1.32       {d28, d29}, [r2], r4  @ store row 4
    vdup.16       q2, d8[0]             @ row 7
    vst1.32       {d30, d31}, [r2], r4  @ store row 5
    vst1.32       {d2, d3}, [r2], r4    @ store row 6
    vst1.32       {d4, d5}, [r2], r4    @ store row 7
    b             .Lchroma_end

do_vert:
    @ VERT wins: every row is the top row
    str           r8, [r7]              @ *pu4_sadmin = SAD(VERT)
    mov           r8, #2
    str           r8, [r6]              @ *u4_intra_mode = 2 (VERT)
    vmov          q15, q5               @ shared store path writes q14 then q15
    vmov          q14, q5

.Lchroma_store_dc_vert:
    @ rows 0..3 come from q14, rows 4..7 from q15
    vst1.32       {d28, d29}, [r2], r4  @ row 0
    vst1.32       {d28, d29}, [r2], r4  @ row 1
    vst1.32       {d28, d29}, [r2], r4  @ row 2
    vst1.32       {d28, d29}, [r2], r4  @ row 3
    vst1.32       {d30, d31}, [r2], r4  @ row 4
    vst1.32       {d30, d31}, [r2], r4  @ row 5
    vst1.32       {d30, d31}, [r2], r4  @ row 6
    vst1.32       {d30, d31}, [r2], r4  @ row 7


.Lchroma_end:
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, pc}     @ restore registers and return



@ Origin of the section that follows:
@         diff --git a/encoder/arm/ih264e_fmt_conv.s b/encoder/arm/ih264e_fmt_conv.s
@         (new file mode 100755, index 0000000..2bf1479)
@ Origin: encoder/arm/ih264e_fmt_conv.s
@         (--- /dev/null +++ b/encoder/arm/ih264e_fmt_conv.s, hunk @@ -0,0 +1,329 @@)
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

.text
.p2align 2

@/*****************************************************************************
@*
@*  Function Name    : ih264e_fmt_conv_420p_to_420sp_a9q
@*                     (the original header named it
@*                      IH264D_CXA8_YUV420toYUV420SP_UV())
@*
@*  Description      : Converts an image from the YUV420P colour space to the
@*                     420SP colour space (UV interleaved).  The Y plane is
@*                     copied unless convert_uv_only == 1; the U and V planes
@*                     are interleaved byte by byte into pu1_dest_uv.
@*
@*  Syntax / ABI     : GNU as, ARM (A32) + NEON; AAPCS.
@*
@*  Arguments        : R0           pu1_y
@*                     R1           pu1_u
@*                     R2           pu1_v
@*                     R3           pu1_dest_y
@*                     [R13 #40]    pu1_dest_uv
@*                     [R13 #44]    u2_height
@*                     [R13 #48]    u2_width
@*                     [R13 #52]    u2_stridey
@*                     [R13 #56]    u2_strideu
@*                     [R13 #60]    u2_stridev        (not read; the U stride
@*                                                     is applied to both the
@*                                                     U and the V pointer)
@*                     [R13 #64]    u2_dest_stride_y
@*                     [R13 #68]    u2_dest_stride_uv
@*                     [R13 #72]    convert_uv_only
@*                     (offsets are relative to sp after the register push)
@*
@*  Values Returned  : None
@*
@*  Register Usage   : R0 - R8 and d0, d1 are written; R4 - R12 and LR are
@*                     saved and restored.
@*
@*  Stack Usage      : 40 Bytes
@*
@*  Interruptibility : Interruptible
@*
@*  Known Limitations
@*       Assumptions: Image Width:  Assumed to be multiple of 16 and
@*                                  greater than or equal to 16
@*                    Image Height: Assumed to be even.
@*
@*  Fix note         : the original pushed d8-d15 once on the luma path and a
@*                     second time on the chroma path but popped them only
@*                     once.  With convert_uv_only != 1 the chroma stack loads
@*                     then read the wrong slots and the final ldmfd popped
@*                     NEON data into r4-r12/pc.  Only d0/d1 are used, which
@*                     the AAPCS does not require a callee to preserve, so the
@*                     pushes and the pop have been removed and sp stays fixed
@*                     for the whole function.
@*
@*  Revision History :
@*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
@*         07 06 2010   Varshita        Draft
@*         07 06 2010   Naveen Kr T     Completed
@*
@*****************************************************************************/
    .global ih264e_fmt_conv_420p_to_420sp_a9q

ih264e_fmt_conv_420p_to_420sp_a9q:

    @ save the core registers (10 registers = 40 bytes)
    stmfd         sp!, {r4-r12, lr}

    ldr           r4, [sp, #72]         @ r4 = convert_uv_only

    cmp           r4, #1
    beq           yuv420sp_uv_chroma    @ skip the luma copy
    @ luma set-up
    ldr           r4, [sp, #44]         @ r4 = u2_height
    ldr           r5, [sp, #48]         @ r5 = u2_width
    ldr           r7, [sp, #52]         @ r7 = u2_stridey
    ldr           r8, [sp, #64]         @ r8 = u2_dest_stride_y
    sub           r7, r7, r5            @ source increment at the end of a row
    sub           r8, r8, r5            @ destination increment at the end of a row

yuv420sp_uv_row_loop_y:
    mov           r6, r5                @ r6 = columns left in this row

yuv420sp_uv_col_loop_y:
    pld           [r0, #128]
    vld1.8        {d0, d1}, [r0]!       @ copy 16 luma bytes
    vst1.8        {d0, d1}, [r3]!
    sub           r6, r6, #16
    cmp           r6, #15
    bgt           yuv420sp_uv_col_loop_y

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_y
    @ Width is not a multiple of 16: step both pointers back so that the last
    @ 16-byte load/store ends exactly at the end of the row.  Example: for a
    @ width of 162 the loop above handles 160 pixels, then both pointers are
    @ moved back to pixel 146 and 16 more bytes are copied.
    rsb           r6, r6, #16
    sub           r0, r0, r6
    sub           r3, r3, r6

    vld1.8        {d0, d1}, [r0]!
    vst1.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_y:
    add           r0, r0, r7
    add           r3, r3, r8
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:

    ldr           r3, [sp, #40]         @ r3 = pu1_dest_uv

    ldr           r4, [sp, #44]         @ r4 = u2_height

    ldr           r5, [sp, #48]         @ r5 = u2_width


    ldr           r7, [sp, #56]         @ r7 = u2_strideu

    ldr           r8, [sp, #68]         @ r8 = u2_dest_stride_uv

    sub           r7, r7, r5, lsr #1    @ source increment at the end of a row

    sub           r8, r8, r5            @ destination increment at the end of a row

    mov           r5, r5, lsr #1        @ r5 = chroma width  = u2_width / 2
    mov           r4, r4, lsr #1        @ r4 = chroma height = u2_height / 2

yuv420sp_uv_row_loop_uv:
    mov           r6, r5                @ r6 = chroma columns left in this row


yuv420sp_uv_col_loop_uv:
    pld           [r1, #128]
    pld           [r2, #128]
    vld1.8        d0, [r1]!             @ 8 U bytes
    vld1.8        d1, [r2]!             @ 8 V bytes
    vst2.8        {d0, d1}, [r3]!       @ 16 interleaved bytes: U0 V0 U1 V1 ...
    sub           r6, r6, #8
    cmp           r6, #7
    bgt           yuv420sp_uv_col_loop_uv

    cmp           r6, #0
    beq           yuv420sp_uv_row_loop_end_uv
    @ Chroma width is not a multiple of 8: step the U and V pointers back so
    @ that the last 8-byte loads end exactly at the end of the row; the
    @ destination moves back twice as far because it holds both components.
    rsb           r6, r6, #8
    sub           r1, r1, r6
    sub           r2, r2, r6
    sub           r3, r3, r6, lsl #1

    vld1.8        d0, [r1]!
    vld1.8        d1, [r2]!
    vst2.8        {d0, d1}, [r3]!

yuv420sp_uv_row_loop_end_uv:
    add           r1, r1, r7
    add           r2, r2, r7            @ the U stride is used for V as well
    add           r3, r3, r8
    subs          r4, r4, #1
    bgt           yuv420sp_uv_row_loop_uv

    @ restore the core registers and return
    ldmfd         sp!, {r4-r12, pc}





@ /**
@ *******************************************************************************
@ *
@ * @brief ih264e_fmt_conv_422i_to_420sp_a9q
@ * Function used from format conversion or frame copy
@ *
@ *
@ *
@ *Inputs : r0 - pu1_y - UWORD8 pointer to y plane.
@ * r1 - pu1_u - UWORD8 pointer to u plane.
@ * r2 - pu1_v - UWORD8 pointer to u plane.
@ * r3 - pu2_yuv422i - UWORD16 pointer to yuv422iimage.
@ * stack + 40 - u4_width - Width of the Y plane.
+@ * 44 - u4_height - Height of the Y plane. +@ * 48 - u4_stride_y - Stride in pixels of Y plane. +@ * 52 - u4_stride_u - Stride in pixels of U plane. +@ * 56 - u4_stride_v - Stride in pixels of V plane. +@ * 60 - u4_stride_yuv422i- Stride in pixels of yuv422i image. +@ * +@ * @par Description +@ * Function used from copying or converting a reference frame to display buffer +@ * in non shared mode +@ * +@ * @param[in] pu1_y_dst +@ * Output Y pointer +@ * +@ * @param[in] pu1_u_dst +@ * Output U/UV pointer ( UV is interleaved in the same format as that of input) +@ * +@ * @param[in] pu1_v_dst +@ * Output V pointer ( used in 420P output case) +@ * +@ * @param[in] u4_dst_y_strd +@ * Stride of destination Y buffer +@ * +@ * @param[in] u4_dst_u_strd +@ * Stride of destination U/V buffer +@ * +@ * +@ * @param[in] blocking +@ * To indicate whether format conversion should wait till frame is reconstructed +@ * and then return after complete copy is done. To be set to 1 when called at the +@ * end of frame processing and set to 0 when called between frame processing modules +@ * in order to utilize available MCPS +@ * +@ * @returns Error from IH264E_ERROR_T +@ * +@ * @remarks +@ * Assumes that the stride of U and V buffers are same. +@ * This is correct in most cases +@ * If a case comes where this is not true we need to modify the fmt conversion functions called inside also +@ * Since we read 4 pixels at a time the width should be aligned to 4 +@ * In assembly width should be aligned to 16 and height to 2. 
+@ * +@ * +@ * Revision History : +@ * DD MM YYYY Author(s) Changes (Describe the changes made) +@ * 07 06 2010 Harinarayanan K K Adapted to 422p +@ * +@ ******************************************************************************* +@ */ + +@//` +@*/ +
@//---------------------------------------------------------------------------
@// ih264e_fmt_conv_422i_to_420sp_a9q
@//
@// Converts two rows of the interleaved 4:2:2 source at a time into two
@// luma rows plus one interleaved chroma row.
@//
@// Per inner iteration 32 bytes are read from each of the two source rows
@// with vld4.8, which splits every 4-byte group into four registers:
@//   bytes 1 and 3 of each group -> written to the luma rows (r0 / r6)
@//   bytes 0 and 2 of each group -> rounded average of the two rows,
@//                                  written interleaved through r1
@// r2 (pu1_v) and the u4_stride_v stack argument (#56) are not used here.
@//
@// Stack offsets are relative to sp after the 40-byte stmfd. Only d0-d7
@// are used; they are caller-saved under the AAPCS, so no NEON registers
@// are spilled.
@//
@// NOTE(review): both loops are do-while shaped, so u4_width >= 16 and
@// u4_height >= 2 are presumably guaranteed by the caller - confirm.
@//---------------------------------------------------------------------------
    .global ih264e_fmt_conv_422i_to_420sp_a9q
ih264e_fmt_conv_422i_to_420sp_a9q:
    stmfd         sp!, {r4-r12, lr}            @// Back up the registers which are used

    @/* Do the preprocessing before the main loops start */
    @// Load the parameters from stack
    ldr           r4, [sp, #48]                @// Load u4_stride_y from stack

    ldr           r5, [sp, #60]                @// Load u4_stride_yuv422i from stack
    add           r6, r0, r4                   @// pu1_y_nxt_row = pu1_y + u4_stride_y

    ldr           r7, [sp, #40]                @// Load u4_width from stack
    add           r8, r3, r5, lsl #1           @// pu2_yuv422i_nxt_row = pu2_yuv422i + u4_stride_yuv422i (2 bytes for each pixel)

    ldr           r9, [sp, #52]                @// Load u4_stride_u from stack
    sub           r12, r4, r7                  @// u2_offset1 = u4_stride_y - u4_width

    sub           r14, r5, r7                  @// u2_offset_yuv422i = u4_stride_yuv422i - u4_width (pixels)

    ldr           r11, [sp, #44]               @// Load u4_height from stack
    sub           r9, r9, r7                   @// u2_offset2 = u4_stride_u - u4_width (16 chroma bytes are stored per 16 pixels)

    mov           r14, r14, lsl #1             @// u2_offset_yuv422i = u2_offset_yuv422i * 2 (pixels -> bytes)

    mov           r7, r7, asr #4               @// u4_width = u4_width / 16 (u4_width >> 4)
    mov           r11, r11, asr #1             @// u4_height = u4_height / 2 (u4_height >> 1)

    add           r4, r12, r4                  @// u2_offset1 = u2_offset1 + u4_stride_y (skip the row already written through r6)
    add           r5, r14, r5, lsl #1          @// u2_offset_yuv422i = u2_offset_yuv422i + 2 * u4_stride_yuv422i (bytes; skip one full source row)

@// Register Assignment
@// pu1_y                  - r0
@// pu1_y_nxt_row          - r6
@// pu1_u                  - r1
@// pu1_v                  - r2  (unused)
@// pu2_yuv422i            - r3
@// pu2_yuv422i_nxt_row    - r8
@// u2_offset1             - r4
@// u2_offset2             - r9
@// u2_offset_yuv422i      - r5
@// u4_width / 16          - r7
@// u4_height / 2          - r11
@// inner loop count       - r12
yuv420_to_yuv422i_hight_loop:

    mov           r12, r7                      @// Inner loop count = u4_width / 16

yuv420_to_yuv422i_width_loop:
    vld4.8        {d0, d1, d2, d3}, [r3]!      @// Load the 16 elements of row 1
    vld4.8        {d4, d5, d6, d7}, [r8]!      @// Load the 16 elements of row 2
    subs          r12, r12, #1                 @// flags consumed by the bgt below (NEON ops leave them alone)

    vrhadd.u8     d0, d0, d4                   @// rounded average of byte 0 of each group over the two rows
    vrhadd.u8     d2, d2, d6                   @// rounded average of byte 2 of each group over the two rows

    vst2.8        {d1, d3}, [r0]!              @// Store the 16 elements of row1 Y
    vst2.8        {d5, d7}, [r6]!              @// Store the 16 elements of row2 Y

    vst2.8        {d0, d2}, [r1]!              @// Store the 16 interleaved chroma bytes shared by row1/row2

    bgt           yuv420_to_yuv422i_width_loop

    @// Update the buffer pointers so that they refer to the next pair of rows
    add           r0, r0, r4                   @// pu1_y = pu1_y + u2_offset1
    add           r6, r6, r4                   @// pu1_y_nxt_row = pu1_y_nxt_row + u2_offset1

    add           r1, r1, r9                   @// pu1_u = pu1_u + u2_offset2
    subs          r11, r11, #1

    add           r3, r3, r5                   @// pu2_yuv422i = pu2_yuv422i + u2_offset_yuv422i

    add           r8, r8, r5                   @// pu2_yuv422i_nxt_row = pu2_yuv422i_nxt_row + u2_offset_yuv422i
    bgt           yuv420_to_yuv422i_hight_loop

    ldmfd         sp!, {r4-r12, pc}            @// Restore the registers which are used and return

+ + + diff --git a/encoder/arm/ih264e_function_selector.c b/encoder/arm/ih264e_function_selector.c new file mode 100755 index 0000000..bb181c1 --- /dev/null +++ b/encoder/arm/ih264e_function_selector.c @@ -0,0 +1,170 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_function_selector.c +* +* @brief +* Contains functions to initialize function pointers used in h264 +* +* @author +* Ittiam +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include Files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include Files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264e_defs.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture 
in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +#ifdef ARMV8 +void ih264e_init_function_ptr(void *pv_codec) +{ + codec_t *ps_codec = (codec_t *)pv_codec; + ih264e_init_function_ptr_generic(ps_codec); + switch(ps_codec->s_cfg.e_arch) + { + case ARCH_ARM_NONEON: + break; + case ARCH_ARM_A53: + case ARCH_ARM_A57: + case ARCH_ARM_V8_NEON: + ih264e_init_function_ptr_neon_av8(ps_codec); + break; + default: + ih264e_init_function_ptr_neon_av8(ps_codec); + break; + } +} + +/** +******************************************************************************* +* +* @brief Determine the architecture of the encoder executing environment +* +* @par Description: This routine returns the architecture of the enviro- +* ment in which the current encoder is being tested +* +* @param[in] void +* +* @returns IV_ARCH_T +* architecture +* +* @remarks none +* +******************************************************************************* +*/ +IV_ARCH_T ih264e_default_arch(void) +{ + return ARCH_ARM_V8_NEON; +} + +#else + +void ih264e_init_function_ptr(void *pv_codec) +{ + codec_t *ps_codec = (codec_t *)pv_codec; + ih264e_init_function_ptr_generic(ps_codec); + switch(ps_codec->s_cfg.e_arch) + { + case ARCH_ARM_NONEON: + break; + case ARCH_ARM_A9Q: + case ARCH_ARM_A9A: + case ARCH_ARM_A9: + case ARCH_ARM_A7: + case ARCH_ARM_A5: + case ARCH_ARM_A15: + ih264e_init_function_ptr_neon_a9q(ps_codec); + break; + default: + ih264e_init_function_ptr_neon_a9q(ps_codec); + break; + } +} + +IV_ARCH_T ih264e_default_arch(void) +{ + return ARCH_ARM_A9Q; +} + +#endif diff --git a/encoder/arm/ih264e_function_selector_a9q.c b/encoder/arm/ih264e_function_selector_a9q.c new file mode 100755 index 0000000..8b2879b --- /dev/null +++ b/encoder/arm/ih264e_function_selector_a9q.c @@ -0,0 +1,252 @@ +/****************************************************************************** 
+ * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_function_selector_a9q.c +* +* @brief +* Contains functions to initialize function pointers of codec context +* +* @author +* Ittiam +* +* @par List of Functions: +* - ih264e_init_function_ptr_neon_a9q +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include 
"ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264e_defs.h" +#include "ih264e_structs.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_core_coding.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264_padding.h" +#include "ih264e_intra_modes_eval.h" +#include "ih264_mem_fns.h" +#include "ih264e_fmt_conv.h" +#include "ih264e_half_pel.h" + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +
/*
 * Installs the a9q/a9 entry points into the function-pointer fields of
 * ps_codec and into the motion-estimation context of every process context.
 * Fields not assigned below keep whatever value they had on entry.
 */
void ih264e_init_function_ptr_neon_a9q(codec_t *ps_codec)
{
    WORD32 ctxt_idx;

    /* luma intra prediction, 16x16 */
    ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert_a9q;
    ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz_a9q;
    ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc_a9q;
    ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane_a9q;

    /* luma intra prediction, 4x4 */
    ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert_a9q;
    ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz_a9q;
    ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc_a9q;
    ps_codec->apf_intra_pred_4_l[3] = ih264_intra_pred_luma_4x4_mode_diag_dl_a9q;
    ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr_a9q;
    ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r_a9q;
    ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d_a9q;
    ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l_a9q;
    ps_codec->apf_intra_pred_4_l[8] = ih264_intra_pred_luma_4x4_mode_horz_u_a9q;

    /* luma intra prediction, 8x8 (index 1 is not assigned here) */
    ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert_a9q;
    ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc_a9q;
    ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl_a9q;
    ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr_a9q;
    ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r_a9q;
    ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d_a9q;
    ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l_a9q;
    ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u_a9q;

    /* chroma intra prediction, 8x8 */
    ps_codec->apf_intra_pred_c[0] = ih264_intra_pred_chroma_8x8_mode_dc_a9q;
    ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz_a9q;
    ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert_a9q;
    ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane_a9q;

    /* forward transform and quantisation (the 8x8 entry has no suffix) */
    ps_codec->pf_resi_trans_quant_8x8 = ih264_resi_trans_quant_8x8;
    ps_codec->pf_resi_trans_quant_4x4 = ih264_resi_trans_quant_4x4_a9;
    ps_codec->pf_resi_trans_quant_chroma_4x4 = ih264_resi_trans_quant_chroma_4x4_a9;
    ps_codec->pf_hadamard_quant_4x4 = ih264_hadamard_quant_4x4_a9;
    ps_codec->pf_hadamard_quant_2x2_uv = ih264_hadamard_quant_2x2_uv_a9;

    /* inverse quantisation, inverse transform and reconstruction
     * (the 8x8 entry has no suffix) */
    ps_codec->pf_iquant_itrans_recon_8x8 = ih264_iquant_itrans_recon_8x8;
    ps_codec->pf_iquant_itrans_recon_4x4 = ih264_iquant_itrans_recon_4x4_a9;
    ps_codec->pf_iquant_itrans_recon_4x4_dc = ih264_iquant_itrans_recon_4x4_dc_a9;
    ps_codec->pf_iquant_itrans_recon_chroma_4x4 = ih264_iquant_itrans_recon_chroma_4x4_a9;
    ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc = ih264_iquant_itrans_recon_chroma_4x4_dc_a9;
    ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_a9;
    ps_codec->pf_ihadamard_scaling_2x2_uv = ih264_ihadamard_scaling_2x2_uv_a9;
    ps_codec->pf_interleave_copy = ih264_interleave_copy_a9;

    /* luma core coding (index 2 is not assigned here) */
    ps_codec->luma_energy_compaction[0] = ih264e_code_luma_intra_macroblock_16x16;
    ps_codec->luma_energy_compaction[1] = ih264e_code_luma_intra_macroblock_4x4;
    ps_codec->luma_energy_compaction[3] = ih264e_code_luma_inter_macroblock_16x16;

    /* chroma core coding */
    ps_codec->chroma_energy_compaction[0] = ih264e_code_chroma_intra_macroblock_8x8;
    ps_codec->chroma_energy_compaction[1] = ih264e_code_chroma_inter_macroblock_8x8;

    /* luma deblocking */
    ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4_a9;
    ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_a9;
    ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4_a9;
    ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_a9;

    /* chroma deblocking */
    ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4_a9;
    ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_a9;
    ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4_a9;
    ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_a9;

    /* macroblock syntax writers, per slice type */
    ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb;
    ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb;

    /* padding (the bottom entry has no suffix) */
    ps_codec->pf_pad_top = ih264_pad_top_a9q;
    ps_codec->pf_pad_bottom = ih264_pad_bottom;
    ps_codec->pf_pad_left_luma = ih264_pad_left_luma_a9q;
    ps_codec->pf_pad_left_chroma = ih264_pad_left_chroma_a9q;
    ps_codec->pf_pad_right_luma = ih264_pad_right_luma_a9q;
    ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma_a9q;

    /* inter prediction */
    ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy_a9q;
    ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz_a9q;
    ps_codec->pf_inter_pred_luma_vert = ih264_inter_pred_luma_vert_a9q;
    ps_codec->pf_inter_pred_luma_bilinear = ih264_inter_pred_luma_bilinear_a9q;
    ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_a9q;

    /* SAD, codec level */
    ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q;
    ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q;
    ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_a9q;

    /* memory handling */
    ps_codec->pf_mem_cpy = ih264_memcpy_a9q;
    ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_a9q;
    ps_codec->pf_mem_set = ih264_memset_a9q;
    ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8_a9q;

    /* SAD, one motion-estimation context per process context */
    for(ctxt_idx = 0; ctxt_idx < (MAX_PROCESS_CTXT); ctxt_idx++)
    {
        me_ctxt_t *ps_me = &ps_codec->as_process[ctxt_idx].s_me_ctxt;

        ps_me->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16_a9q;
        ps_me->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_a9q;
        ps_me->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8_a9q;
        ps_me->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog_a9q;
        ps_me->pf_ime_compute_sad3_diamond = ime_calculate_sad3_prog_a9q;
        ps_me->pf_ime_compute_sad2_diamond = ime_calculate_sad2_prog_a9q;
        ps_me->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16_a9q;
        ps_me->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter_a9q;
    }

    /* intra mode evaluation, encoder level */
    ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes_a9q;
    ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes_a9q;
    ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes_a9q;

    /* colour space conversion */
    ps_codec->pf_ih264e_conv_420p_to_420sp = ih264e_fmt_conv_420p_to_420sp_a9q;
    ps_codec->pf_ih264e_fmt_conv_422i_to_420sp = ih264e_fmt_conv_422i_to_420sp_a9q;

    /* half-pel generation, encoder level */
    ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz_a9q;
    ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert_a9q;
}

+ diff --git a/encoder/arm/ih264e_function_selector_av8.c b/encoder/arm/ih264e_function_selector_av8.c new file mode 100755 index 0000000..173c2d5 --- /dev/null +++ b/encoder/arm/ih264e_function_selector_av8.c @@ -0,0 +1,259 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_function_selector_av8.c +* +* @brief +* Contains functions to initialize function pointers of codec context +* +* @author +* Ittiam +* +* @par List of Functions: +* - ih264e_init_function_ptr_neon_av8 +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264e_defs.h" +#include "ih264e_structs.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_core_coding.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264_padding.h" +#include 
"ih264e_intra_modes_eval.h" +#include "ih264_mem_fns.h" +#include "ih264e_fmt_conv.h" +#include "ih264e_half_pel.h" + + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +
/*
 * Installs the av8 entry points into the function-pointer fields of
 * ps_codec and into the motion-estimation context of every process context.
 * A few fields are pointed at entry points without the _av8 suffix; they
 * are called out below. Fields not assigned keep their value on entry.
 */
void ih264e_init_function_ptr_neon_av8(codec_t *ps_codec)
{
    WORD32 ctxt_idx;

    /* luma intra prediction, 16x16 */
    ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert_av8;
    ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz_av8;
    ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc_av8;
    ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane_av8;

    /* luma intra prediction, 4x4 */
    ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert_av8;
    ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz_av8;
    ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc_av8;
    ps_codec->apf_intra_pred_4_l[3] = ih264_intra_pred_luma_4x4_mode_diag_dl_av8;
    ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr_av8;
    ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r_av8;
    ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d_av8;
    ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l_av8;
    ps_codec->apf_intra_pred_4_l[8] = ih264_intra_pred_luma_4x4_mode_horz_u_av8;

    /* luma intra prediction, 8x8 (index 1 is not assigned here) */
    ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert_av8;
    ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc_av8;
    ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl_av8;
    ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr_av8;
    ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r_av8;
    ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d_av8;
    ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l_av8;
    ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u_av8;

    /* chroma intra prediction, 8x8 */
    ps_codec->apf_intra_pred_c[0] = ih264_intra_pred_chroma_8x8_mode_dc_av8;
    ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz_av8;
    ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert_av8;
    ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane_av8;

    /* forward transform and quantisation (the 8x8 entry has no suffix) */
    ps_codec->pf_resi_trans_quant_8x8 = ih264_resi_trans_quant_8x8;
    ps_codec->pf_resi_trans_quant_4x4 = ih264_resi_trans_quant_4x4_av8;
    ps_codec->pf_resi_trans_quant_chroma_4x4 = ih264_resi_trans_quant_chroma_4x4_av8;
    ps_codec->pf_hadamard_quant_4x4 = ih264_hadamard_quant_4x4_av8;
    ps_codec->pf_hadamard_quant_2x2_uv = ih264_hadamard_quant_2x2_uv_av8;

    /* inverse quantisation, inverse transform and reconstruction */
    ps_codec->pf_iquant_itrans_recon_8x8 = ih264_iquant_itrans_recon_8x8_av8;
    ps_codec->pf_iquant_itrans_recon_4x4 = ih264_iquant_itrans_recon_4x4_av8;
    ps_codec->pf_iquant_itrans_recon_4x4_dc = ih264_iquant_itrans_recon_4x4_dc_av8;
    ps_codec->pf_iquant_itrans_recon_chroma_4x4 = ih264_iquant_itrans_recon_chroma_4x4_av8;
    ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc = ih264_iquant_itrans_recon_chroma_4x4_dc_av8;
    ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_av8;
    ps_codec->pf_ihadamard_scaling_2x2_uv = ih264_ihadamard_scaling_2x2_uv_av8;
    ps_codec->pf_interleave_copy = ih264_interleave_copy_av8;

    /* luma core coding (index 2 is not assigned here) */
    ps_codec->luma_energy_compaction[0] = ih264e_code_luma_intra_macroblock_16x16;
    ps_codec->luma_energy_compaction[1] = ih264e_code_luma_intra_macroblock_4x4;
    ps_codec->luma_energy_compaction[3] = ih264e_code_luma_inter_macroblock_16x16;

    /* chroma core coding */
    ps_codec->chroma_energy_compaction[0] = ih264e_code_chroma_intra_macroblock_8x8;
    ps_codec->chroma_energy_compaction[1] = ih264e_code_chroma_inter_macroblock_8x8;

    /* luma deblocking */
    ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4_av8;
    ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_av8;
    ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4_av8;
    ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_av8;

    /* chroma deblocking */
    ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4_av8;
    ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_av8;
    ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4_av8;
    ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_av8;

    /* macroblock syntax writers, per slice type */
    ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb;
    ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb;

    /* padding (the bottom entry has no suffix) */
    ps_codec->pf_pad_top = ih264_pad_top_av8;
    ps_codec->pf_pad_bottom = ih264_pad_bottom;
    ps_codec->pf_pad_left_luma = ih264_pad_left_luma_av8;
    ps_codec->pf_pad_left_chroma = ih264_pad_left_chroma_av8;
    ps_codec->pf_pad_right_luma = ih264_pad_right_luma_av8;
    ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma_av8;

    /* inter prediction (the bilinear entry has no suffix) */
    ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy_av8;
    ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz_av8;
    ps_codec->pf_inter_pred_luma_vert = ih264_inter_pred_luma_vert_av8;
    ps_codec->pf_inter_pred_luma_bilinear = ih264_inter_pred_luma_bilinear;
    ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_av8;

    /* SAD, codec level */
    ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16_av8;
    ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_av8;
    ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_av8;

    /* memory handling */
    ps_codec->pf_mem_cpy = ih264_memcpy_av8;
    ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_av8;
    ps_codec->pf_mem_set = ih264_memset_av8;
    ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8_av8;

    /* SAD, one motion-estimation context per process context */
    for(ctxt_idx = 0; ctxt_idx < (MAX_PROCESS_CTXT); ctxt_idx++)
    {
        me_ctxt_t *ps_me = &ps_codec->as_process[ctxt_idx].s_me_ctxt;

        ps_me->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16_av8;
        ps_me->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_av8;
        ps_me->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8_av8;
        ps_me->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog_av8;
        ps_me->pf_ime_compute_sad3_diamond = ime_calculate_sad3_prog_av8;
        ps_me->pf_ime_compute_sad2_diamond = ime_calculate_sad2_prog_av8;
        ps_me->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16_av8;
        ps_me->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter_av8;
    }

    /* intra mode evaluation, encoder level (the 4x4 entry has no suffix) */
    ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes_av8;
    ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes_av8;
    ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes;

    /* colour space conversion (both entries have no suffix) */
    ps_codec->pf_ih264e_conv_420p_to_420sp = ih264e_fmt_conv_420p_to_420sp;
    ps_codec->pf_ih264e_fmt_conv_422i_to_420sp = ih264e_fmt_conv_422i_to_420sp;

    /* half-pel generation, encoder level */
    ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz_av8;
    ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert_av8;
}

+ diff --git a/encoder/arm/ih264e_half_pel.s b/encoder/arm/ih264e_half_pel.s new file mode 100755 index 0000000..1b9a87a --- /dev/null +++ b/encoder/arm/ih264e_half_pel.s @@ -0,0 +1,951 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ +@/** +@ ******************************************************************************* +@ * @file +@ * ih264e_half_pel.s +@ * +@ * @brief +@ * Six tap half pel luma interpolation routines written with ARM NEON instructions +@ * +@ * @author +@ * Ittiam +@ * +@ * @par List of Functions: +@ * ih264e_sixtapfilter_horz +@ * ih264e_sixtap_filter_2dvh_vert +@ +@ * +@ * @remarks +@ * None +@ * +@ ******************************************************************************* +@ */ + + +.text +.p2align 2 + +@ /** +@/******************************************************************************* +@* +@* @brief +@* Interprediction luma filter for horizontal input (filter run for width = 17 and height = 16) +@* +@* @par Description: +@* Applies a 6 tap horizontal filter, (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5. The output is clipped to 8 bits +@* sec 8.4.2.2.1 titled "Luma sample interpolation process" +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source (r0 in the code, stepped back by 2 pixels before the first load) +@* +@* @param[out] pu1_dst +@* UWORD8 pointer to the destination (r1 in the code) +@* +@* @param[in] src_strd +@* integer source stride (r2, post-increment of every row load) +@* +@* @param[in] dst_strd +@* integer destination stride (r3, post-increment of every row store) +@* +@* +@* @returns +@* None +@* @remarks +@* The loop counter starts at HALFPEL_WIDTH (18) and drops by 2 per pass, so 18 rows are filtered; each row is loaded and stored as 24 bytes. For the third 8 byte column the vext source is d4 (row0) or d7 (row1) paired with itself, so only its first 3 outputs are built from consecutive source pixels. NOTE(review): callers presumably provide buffers that cover 18 rows of 24 bytes - confirm. +@* +@******************************************************************************* +@*/ +@void ih264e_sixtapfilter_horz(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 dst_strd); + + +.equ HALFPEL_WIDTH , 17 + 1 @ despite the name this is the row count of the loop: 17 made even because two rows are processed at a time + + + .global ih264e_sixtapfilter_horz_a9q +ih264e_sixtapfilter_horz_a9q: + stmfd sp!, {lr} @ save lr, r14 is reused below as the row counter + + vmov.i8 d0, #5 @ d0 = 5 in every byte lane (magnitude of the a1 and a4 taps) + sub r0, r0, #2 @ start two pixels to the left of pu1_src + + vmov.i8 d1, #20 @ d1 = 20 in every byte lane (a2 and a3 taps) + mov r14, #HALFPEL_WIDTH @ r14 = rows still to be filtered + vpush {d8-d15} @ q4-q7 are used as accumulators below; d8-d15 are restored by the vpop at exit + +filter_horz_loop: + + + vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 (24 bytes, then advance by src_strd) + vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1 (24 bytes, then advance by src_strd) + + @// Processing row0 and row1, interleaved: three 8 pixel columns per row, 16 bit accumulators q4-q6 (row0) and q7-q9 (row1) + + vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0) + vext.8 d30, d3, d4, #5 @//extract a[5] (column2,row0) + + vaddl.u8 q4, d31, d2 @// a0 + a5 (column1,row0) + vext.8 d29, d4, d4, #5 @//extract a[5] (column3,row0) + vaddl.u8 q5, d30, d3 @// a0 + a5 (column2,row0) + vext.8 
d28, d5, d6, #5 @//extract a[5] (column1,row1) + vaddl.u8 q6, d29, d4 @// a0 + a5 (column3,row0) + vext.8 d27, d6, d7, #5 @//extract a[5] (column2,row1) + vaddl.u8 q7, d28, d5 @// a0 + a5 (column1,row1) + vext.8 d26, d7, d7, #5 @//extract a[5] (column3,row1) + + vaddl.u8 q8, d27, d6 @// a0 + a5 (column2,row1) + vext.8 d31, d2, d3, #2 @//extract a[2] (column1,row0) + vaddl.u8 q9, d26, d7 @// a0 + a5 (column3,row1) + vext.8 d30, d3, d4, #2 @//extract a[2] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 (column1,row0) + vext.8 d29, d4, d4, #2 @//extract a[2] (column3,row0) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 (column2,row0) + vext.8 d28, d5, d6, #2 @//extract a[2] (column1,row1) + vmlal.u8 q6, d29, d1 @// a0 + a5 + 20a2 (column3,row0) + vext.8 d27, d6, d7, #2 @//extract a[2] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 (column1,row1) + vext.8 d26, d7, d7, #2 @//extract a[2] (column3,row1) + + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 (column2,row1) + vext.8 d31, d2, d3, #3 @//extract a[3] (column1,row0) + vmlal.u8 q9, d26, d1 @// a0 + a5 + 20a2 (column3,row1) + vext.8 d30, d3, d4, #3 @//extract a[3] (column2,row0) + vmlal.u8 q4, d31, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vext.8 d29, d4, d4, #3 @//extract a[3] (column3,row0) + vmlal.u8 q5, d30, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vext.8 d28, d5, d6, #3 @//extract a[3] (column1,row1) + vmlal.u8 q6, d29, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vext.8 d27, d6, d7, #3 @//extract a[3] (column2,row1) + vmlal.u8 q7, d28, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row1) + vext.8 d26, d7, d7, #3 @//extract a[3] (column3,row1) + + vmlal.u8 q8, d27, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row1) + vext.8 d31, d2, d3, #1 @//extract a[1] (column1,row0) + vmlal.u8 q9, d26, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row1) + vext.8 d30, d3, d4, #1 @//extract a[1] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vext.8 d29, d4, d4, #1 @//extract a[1] 
(column3,row0) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vext.8 d28, d5, d6, #1 @//extract a[1] (column1,row1) + vmlsl.u8 q6, d29, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vext.8 d27, d6, d7, #1 @//extract a[1] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1) + vext.8 d26, d7, d7, #1 @//extract a[1] (column3,row1) + + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1) + vext.8 d31, d2, d3, #4 @//extract a[4] (column1,row0) + vmlsl.u8 q9, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row1) + vext.8 d30, d3, d4, #4 @//extract a[4] (column2,row0) + vmlsl.u8 q4, d31, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + vext.8 d29, d4, d4, #4 @//extract a[4] (column3,row0) + vmlsl.u8 q5, d30, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + vext.8 d28, d5, d6, #4 @//extract a[4] (column1,row1) + vmlsl.u8 q6, d29, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + vext.8 d27, d6, d7, #4 @//extract a[4] (column2,row1) + vmlsl.u8 q7, d28, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1) + vext.8 d26, d7, d7, #4 @//extract a[4] (column3,row1) + + vmlsl.u8 q8, d27, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1) + vmlsl.u8 q9, d26, d0 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row1) + + vqrshrun.s16 d20, q4, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0), saturated to unsigned 8 bit + vqrshrun.s16 d21, q5, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vqrshrun.s16 d22, q6, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vqrshrun.s16 d23, q7, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1) + vqrshrun.s16 d24, q8, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1) + vqrshrun.s16 d25, q9, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row1) + + vst1.8 {d20, d21, d22}, [r1], r3 @//Store dest row0 (24 bytes, then advance by dst_strd) + vst1.8 {d23, d24, d25}, [r1], r3 @//Store dest row1 (24 bytes, then advance by dst_strd) + + 
subs r14, r14, #2 @ two rows done, decrement the row counter and set the flags + + bne filter_horz_loop @ repeat until the counter reaches zero + + vpop {d8-d15} @ restore the registers saved at entry + ldmfd sp!, {pc} @ return by popping the saved lr into pc + + + + + + + + + +@/** +@******************************************************************************* +@* +@* @brief +@* This function implements a two stage cascaded six tap filter. It +@* applies the six tap filter in the vertical direction on the +@* predictor values, followed by applying the same filter in the +@* horizontal direction on the output of the first stage. The six tap +@* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample +@* interpolation process" +@* (Filter run for width = 17 and height =17) +@* @par Description: +@* The function interpolates +@* the predictors first in the vertical direction and then in the +@* horizontal direction to output the (1/2,1/2). The output of the first +@* stage of the filter is stored in the buffer pointed to by pi16_pred1(only in C) +@* in 16 bit precision. +@* +@* +@* @param[in] pu1_src +@* UWORD8 pointer to the source +@* +@* @param[out] pu1_dst1 +@* UWORD8 pointer to the destination(vertical filtered output) +@* +@* @param[out] pu1_dst2 +@* UWORD8 pointer to the destination(output after applying horizontal filter to the intermediate vertical output) +@* +@* @param[in] src_strd +@* integer source stride +@* +@* @param[in] dst_strd +@* integer destination stride of pu1_dst +@* +@* @param[in]pi16_pred1 +@* Pointer to 16bit intermediate buffer(used only in c) +@* +@* @param[in] pi16_pred1_strd +@* integer destination stride of pi16_pred1 +@* +@* +@* @returns +@* +@* @remarks +@* None +@* +@******************************************************************************* +@*/ +@void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src, +@ UWORD8 *pu1_dst1, +@ UWORD8 *pu1_dst2, +@ WORD32 src_strd, +@ WORD32 dst_strd, +@ WORD32 *pi16_pred1,/* Pointer to 16bit intermediate buffer (used only in c)*/ +@ WORD32 pi16_pred1_strd) + + + + + .global ih264e_sixtap_filter_2dvh_vert_a9q + 
+ih264e_sixtap_filter_2dvh_vert_a9q: + stmfd sp!, {r10, r11, r12, lr} + +@//r0 - pu1_ref +@//r3 - u4_ref_width + vpush {d8-d15} + @// Load six rows for vertical interpolation + lsl r12, r3, #1 + sub r0, r0, r12 + sub r0, r0, #2 + vld1.8 {d2, d3, d4}, [r0], r3 + vld1.8 {d5, d6, d7}, [r0], r3 + vld1.8 {d8, d9, d10}, [r0], r3 + mov r12, #5 + vld1.8 {d11, d12, d13}, [r0], r3 + mov r14, #20 + vld1.8 {d14, d15, d16}, [r0], r3 + vmov.16 d0[0], r12 + vmov.16 d0[1], r14 + vld1.8 {d17, d18, d19}, [r0], r3 + vmov.i8 d1, #20 + +@// r12 - u2_buff1_width +@// r14 - u2_buff2_width + ldr r12, [sp, #80] + add r11, r1, #6 + + mov r14, r12 + + mov r10, #3 @loop counter + + +filter_2dvh_loop: + + @// ////////////// ROW 1 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + vaddl.u8 q10, d2, d17 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d8, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d11, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d5, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d14, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + + + vaddl.u8 q11, d3, d18 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d9, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d12, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d6, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d15, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d4, d19 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d10, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d13, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d7, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d16, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + + vqrshrun.s16 d2, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 
d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d3, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d4, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d2, d2, d3, #2 + vst1.8 {d3, d4}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d2}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q1, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q1, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q1, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q1, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q1, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more 
with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q1, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d2, d3, d4}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 2 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + vaddl.u8 q10, d5, d2 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d11, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d14, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d8, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d17, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 
(column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d6, d3 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d12, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d15, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d9, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d18, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d7, d4 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d13, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d16, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d10, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d19, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d5, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d6, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d7, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] 
@// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d5, d5, d6, #2 + vst1.8 {d6, d7}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d5}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q3, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q3, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q3, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q3, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q3, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q3, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d5, d6, d7}, [r0], r3 @// 
Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 3 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + vaddl.u8 q10, d8, d5 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d14, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d17, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d11, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d2, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d9, d6 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d15, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d18, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d12, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d3, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d10, d7 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d16, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d19, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d13, d31 @// a0 + 
a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d4, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d8, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d9, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d10, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d8, d8, d9, #2 + vst1.8 {d9, d10}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d8}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q4, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q4, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q4, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q4, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q4, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 
5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q4, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d8, d9, d10}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 4 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + vaddl.u8 q10, d11, d8 @// a0 + 
a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d17, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d2, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d14, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d5, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d12, d9 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d18, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d3, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d15, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d6, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d13, d10 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d19, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d4, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d16, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d7, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d11, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d12, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, 
#2 @//extract a[2] (set2) + + vqrshrun.s16 d13, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d11, d11, d12, #2 + vst1.8 {d12, d13}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d11}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q6, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q6, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q6, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q6, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q6, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 
@//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q6, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d11, d12, d13}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 5 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + vaddl.u8 q10, d14, d11 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d2, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d5, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d17, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d8, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d15, d12 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d3, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d6, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d18, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d9, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + 
vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d16, d13 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d4, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d7, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d19, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d10, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d14, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d15, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 @//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d16, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d14, d14, d15, #2 + vst1.8 {d15, d16}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d14}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 
q7, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q7, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q7, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q7, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q7, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 @//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q7, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d14, d15, d16}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + @//VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + @//VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later 
we will shift by 2 more with rounding (set5) + + @//VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + @//VST1.8 {D26,D27,D28},[r2],r14 ;// store 1/2,1,2 grif values + @// ////////////// ROW 6 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + + cmp r10, #1 @// if it 17 rows are complete skip + beq filter_2dvh_skip_row + vaddl.u8 q10, d17, d14 @// a0 + a5 (column1,row0) + vmov.i8 d31, #5 + vmlal.u8 q10, d5, d1 @// a0 + a5 + 20a2 (column1,row0) + vmlal.u8 q10, d8, d1 @// a0 + a5 + 20a2 + 20a3 (column1,row0) + vmlsl.u8 q10, d2, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + vmlsl.u8 q10, d11, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vaddl.u8 q11, d18, d15 @// a0 + a5 (column2,row0) + vmlal.u8 q11, d6, d1 @// a0 + a5 + 20a2 (column2,row0) + vmlal.u8 q11, d9, d1 @// a0 + a5 + 20a2 + 20a3 (column2,row0) + vmlsl.u8 q11, d3, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + vmlsl.u8 q11, d12, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + vext.16 d30, d20, d21, #2 @//extract a[2] (set1) + + vaddl.u8 q12, d19, d16 @// a0 + a5 (column3,row0) + vext.16 d29, d20, d21, #3 @//extract a[3] (set1) + vmlal.u8 q12, d7, d1 @// a0 + a5 + 20a2 (column3,row0) + vmlal.u8 q12, d10, d1 @// a0 + a5 + 20a2 + 20a3 (column3,row0) + vmlsl.u8 q12, d4, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + vmlsl.u8 q12, d13, d31 @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + vqrshrun.s16 d17, q10, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + vext.16 d31, d21, d22, #1 @//extract a[5] (set1) + vqrshrun.s16 d18, q11, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + vext.16 d28, d20, d21, #1 
@//extract a[1] (set1) + + vaddl.s16 q13, d31, d20 @// a0 + a5 (set1) + vext.16 d31, d22, d23, #1 @//extract a[5] (set2) + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set1) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set1) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + vmlsl.s16 q13, d21, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + vext.16 d30, d21, d22, #2 @//extract a[2] (set2) + + vqrshrun.s16 d19, q12, #5 @// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + vext.16 d29, d21, d22, #3 @//extract a[3] (set2) + + vext.16 d28, d21, d22, #1 @//extract a[1] (set2) + vaddl.s16 q10, d31, d21 @// a0 + a5 (set2) + vmlal.s16 q10, d30, d0[1] @// a0 + a5 + 20a2 (set2) + vmlal.s16 q10, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set2) + vmlsl.s16 q10, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + vmlsl.s16 q10, d22, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + vext.16 d31, d23, d24, #1 @//extract a[5] (set3) + + vext.8 d17, d17, d18, #2 + vst1.8 {d18, d19}, [r11], r12 @// store row1 - 1,1/2 grid + vst1.8 {d17}, [r1], r12 @// store row1 - 1,1/2 grid + + vext.16 d30, d22, d23, #2 @//extract a[2] (set3) + vext.16 d29, d22, d23, #3 @//extract a[3] (set3) + + vaddl.s16 q9, d31, d22 @// a0 + a5 (set3) + vext.16 d28, d22, d23, #1 @//extract a[1] (set3) + vmlal.s16 q9, d30, d0[1] @// a0 + a5 + 20a2 (set3) + vmlal.s16 q9, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set3) + vmlsl.s16 q9, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + vmlsl.s16 q9, d23, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + vext.16 d31, d24, d25, #1 @//extract a[5] (set4) + + vshrn.s32 d21, q10, #8 @// shift by 8 and later we will shift by 2 more with rounding (set2) + vext.16 d30, d23, d24, #2 @//extract a[2] (set4) + vshrn.s32 d20, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set1) + vext.16 d29, d23, d24, #3 @//extract a[3] (set4) + + vaddl.s16 q13, d31, d23 @// a0 + a5 (set4) + vext.16 d28, d23, d24, #1 
@//extract a[1] (set4) + vext.16 d31, d25, d25, #1 @//extract a[5] (set5) ;//here only first element in the row is valid + vmlal.s16 q13, d30, d0[1] @// a0 + a5 + 20a2 (set4) + vmlal.s16 q13, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set4) + vmlsl.s16 q13, d28, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + vmlsl.s16 q13, d24, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + vext.16 d30, d24, d25, #2 @//extract a[2] (set5) + + vaddl.s16 q11, d31, d24 @// a0 + a5 (set5) + vext.16 d29, d24, d25, #3 @//extract a[3] (set5) + + vext.16 d31, d24, d25, #1 @//extract a[1] (set5) + vshrn.s32 d28, q9, #8 @// shift by 8 and later we will shift by 2 more with rounding (set3) + + vld1.8 {d17, d18, d19}, [r0], r3 @// Load next Row data + vmlal.s16 q11, d30, d0[1] @// a0 + a5 + 20a2 (set5) + vmlal.s16 q11, d29, d0[1] @// a0 + a5 + 20a2 + 20a3 (set5) + vmlsl.s16 q11, d31, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + vmlsl.s16 q11, d25, d0[0] @// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + vshrn.s32 d29, q13, #8 @// shift by 8 and later we will shift by 2 more with rounding (set4) + vqrshrun.s16 d26, q10, #2 @// half,half gird set1,2 + + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 1/2,1,2 grif values + + subs r10, r10, #1 @//decrement loop counter + + bne filter_2dvh_loop + + +@// Process first vertical interpolated row +@// each column is + @// ////////////// ROW 13 /////////////////////// + +@// Process first vertical interpolated row +@// each column is + vpop {d8-d15} + ldmfd sp!, {r10, r11, r12, pc} + +filter_2dvh_skip_row: + + vqrshrun.s16 d27, q14, #2 @// half,half gird set3,4 + vshrn.s32 d28, q11, #8 @// shift by 8 and later we will shift by 2 more with rounding (set5) + + vqrshrun.s16 d28, q14, #2 @// half,half gird set5 + + vst1.8 {d26, d27, d28}, [r2], r14 @// store 
1/2,1,2 grif values + vpop {d8-d15} + ldmfd sp!, {r10, r11, r12, pc} + + + + diff --git a/encoder/arm/ih264e_platform_macros.h b/encoder/arm/ih264e_platform_macros.h new file mode 100755 index 0000000..39cac96 --- /dev/null +++ b/encoder/arm/ih264e_platform_macros.h @@ -0,0 +1,143 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_platform_macros.h +* +* @brief +* Contains platform specific routines used for codec context intialization +* +* @author +* ittiam +* +* @remarks +* none +* +******************************************************************************* +*/ + +#ifndef IH264E_PLATFORM_MACROS_H_ +#define IH264E_PLATFORM_MACROS_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_neon_a9q(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_neon_av8(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function 
pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_generic(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr(void *pv_codec); + +/** +******************************************************************************* +* +* @brief Determine the architecture of the encoder executing environment +* +* @par Description: This routine returns the architecture of the enviro- +* ment in which the current encoder is being tested +* +* @param[in] void +* +* @returns IV_ARCH_T +* architecture +* +* @remarks none +* +******************************************************************************* +*/ +IV_ARCH_T ih264e_default_arch(void); + +#endif /* IH264E_PLATFORM_MACROS_H_ */ diff --git a/encoder/arm/ime_distortion_metrics_a9q.s b/encoder/arm/ime_distortion_metrics_a9q.s new file mode 100755 index 0000000..b58911e --- /dev/null +++ b/encoder/arm/ime_distortion_metrics_a9q.s @@ -0,0 +1,1353 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. 
+@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ +@/** + +@/** +@****************************************************************************** +@* +@* +@* @brief +@* This file contains definitions of routines that compute distortion +@* between two macro/sub blocks of identical dimensions +@* +@* @author +@* Ittiam +@* +@* @par List of Functions: +@* - ime_compute_sad_16x16_a9q() +@* - ime_compute_sad_16x16_fast_a9q() +@* - ime_compute_sad_16x8_a9q() +@* - ime_compute_sad_16x16_ea8_a9q() +@* - ime_calculate_sad2_prog_a9q() +@* - ime_calculate_sad3_prog_a9q() +@* - ime_calculate_sad4_prog_a9q() +@* - ime_sub_pel_compute_sad_16x16_a9q() +@* - ime_compute_satqd_16x16_lumainter_a9q() +@* - +@* @remarks +@* None +@* +@******************************************************************************* +@ + + +@/** +@****************************************************************************** +@* +@* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode) +@* +@* @par Description +@* This functions computes SAD between 2 16x16 blocks. There is a provision +@* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +@* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (r0)
@*
@* @param[in] pu1_dst
@*  UWORD8 pointer to the second block (r1); it is only read by this routine
@*
@* @param[in] src_strd
@*  integer source stride (r2)
@*
@* @param[in] dst_strd
@*  integer destination stride (r3)
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion (first stack argument; never read by
@*  this routine, so no early exit takes place)
@*
@* @param[out] pi4_mb_distortion
@*  integer evaluated sad (second stack argument)
@*
@* @remarks
@*  Both strides are doubled, so only 8 alternate rows are compared; the
@*  accumulated sum is then doubled before it is stored.
@*  Only caller-saved NEON registers (d0-d7, d16-d19) are used, so the
@*  callee-saved d8-d15 of the caller are left untouched.
@*
@******************************************************************************
@*/
.text
.p2align 2
    .global ime_compute_sad_16x16_fast_a9q
ime_compute_sad_16x16_fast_a9q:

    stmfd         sp!, {r12, lr}
    lsl           r2, r2, #1            @ step two source rows at a time
    lsl           r3, r3, #1            @ step two rows of the second block at a time

    vld1.8        {d4, d5}, [r0], r2    @ src row A
    vld1.8        {d6, d7}, [r1], r3    @ dst row A
    mov           r12, #6               @ rows still to be loaded inside the loop
    vld1.8        {d16, d17}, [r0], r2  @ src row B
    vabdl.u8      q0, d6, d4            @ start accumulators (low 8 columns)
    vabdl.u8      q1, d7, d5            @ start accumulators (high 8 columns)
    vld1.8        {d18, d19}, [r1], r3  @ dst row B

loop_sad_16x16_fast:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d18, d16          @ accumulate row B
    vabal.u8      q1, d19, d17
    vld1.8        {d6, d7}, [r1], r3
    subs          r12, #2               @ two rows consumed per iteration
    vld1.8        {d16, d17}, [r0], r2
    vabal.u8      q0, d6, d4            @ accumulate row A
    vabal.u8      q1, d7, d5
    vld1.8        {d18, d19}, [r1], r3

    bne           loop_sad_16x16_fast

    vabal.u8      q0, d18, d16          @ last pending row
    vabal.u8      q1, d19, d17

    vadd.i16      q0, q0, q1            @ reduce 16 lanes -> 8
    vadd.i16      d0, d1, d0            @ 8 lanes -> 4

    ldr           r12, [sp, #12]        @ pi4_mb_distortion (8 bytes pushed + 4)
    vpaddl.u16    d0, d0                @ 4 lanes -> 2 (32 bit)
    vpaddl.u32    d0, d0                @ 2 lanes -> 1
    vshl.u32      d0, d0, #1            @ scale the 8-row sum to a 16-row estimate
    vst1.32       {d0[0]}, [r12]

    ldmfd         sp!, {r12, pc}




@/**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x8 blocks
@*
@*
@* @par Description
@*   This function computes SAD between 2 16x8 blocks. The interface carries
@*   a maximum-SAD argument, but this implementation never reads it and
@*   always evaluates the distortion of the entire block.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (r0)
@*
@* @param[in] pu1_dst
@*  UWORD8 pointer to the second block (r1); it is only read by this routine
@*
@* @param[in] src_strd
@*  integer source stride (r2)
@*
@* @param[in] dst_strd
@*  integer destination stride (r3)
@*
@* @param[in] u4_max_sad
@*  integer maximum allowed distortion (first stack argument; never read by
@*  this routine)
@*
@* @param[out] pi4_mb_distortion
@*  integer evaluated sad (second stack argument)
@*
@* @remarks
@*  Only caller-saved NEON registers (d0-d7, d16-d19) are used, so the
@*  callee-saved d8-d15 of the caller are left untouched.
@*
@******************************************************************************
@*/
@
    .global ime_compute_sad_16x8_a9q
ime_compute_sad_16x8_a9q:

    stmfd         sp!, {r12, lr}

    vld1.8        {d4, d5}, [r0], r2    @ src row 0
    vld1.8        {d6, d7}, [r1], r3    @ dst row 0
    mov           r12, #6               @ rows still to be loaded inside the loop
    vld1.8        {d16, d17}, [r0], r2  @ src row 1
    vabdl.u8      q0, d6, d4            @ start accumulators (low 8 columns)
    vabdl.u8      q1, d7, d5            @ start accumulators (high 8 columns)
    vld1.8        {d18, d19}, [r1], r3  @ dst row 1

loop_sad_16x8:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d18, d16          @ accumulate odd row
    vabal.u8      q1, d19, d17
    vld1.8        {d6, d7}, [r1], r3
    subs          r12, #2               @ two rows consumed per iteration
    vld1.8        {d16, d17}, [r0], r2
    vabal.u8      q0, d6, d4            @ accumulate even row
    vabal.u8      q1, d7, d5
    vld1.8        {d18, d19}, [r1], r3

    bne           loop_sad_16x8

    vabal.u8      q0, d18, d16          @ last pending row
    vabal.u8      q1, d19, d17

    vadd.i16      q0, q0, q1            @ reduce 16 lanes -> 8
    vadd.i16      d0, d1, d0            @ 8 lanes -> 4

    ldr           r12, [sp, #12]        @ pi4_mb_distortion (8 bytes pushed + 4)
    vpaddl.u16    d0, d0                @ 4 lanes -> 2 (32 bit)
    vpaddl.u32    d0, d0                @ 2 lanes -> 1
    vst1.32       {d0[0]}, [r12]

    ldmfd         sp!, {r12, pc}





@/**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
@*
@* @par Description
@*   This function computes SAD between 2 16x16 blocks. There is a provision
@*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
@*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (r0)
@*
@* @param[in] pu1_dst
@*  UWORD8 pointer to the second block (r1); it is only read by this routine
@*
@* @param[in] src_strd
@*  integer source stride (r2)
@*
@* @param[in] dst_strd
@*  integer destination stride (r3)
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion (first stack argument)
@*
@* @param[out] pi4_mb_distortion
@*  integer evaluated sad (second stack argument)
@*
@* @remarks
@*  The 8 even rows are accumulated first and their SAD is written to
@*  pi4_mb_distortion. If that partial value is greater than i4_max_sad
@*  (signed compare) the routine returns, leaving the partial value in place.
@*  Otherwise the 8 odd rows are added and the full SAD is written.
@*  Only caller-saved NEON registers (d0-d7, d16-d21) are used.
@*
@******************************************************************************
@*/
    .global ime_compute_sad_16x16_ea8_a9q

ime_compute_sad_16x16_ea8_a9q:

    stmfd         sp!, {r5-r7, lr}
    lsl           r2, r2, #1            @ step two source rows at a time
    lsl           r3, r3, #1            @ step two rows of the second block at a time

    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    mov           r5, #6                @ rows still to be loaded inside loop 1
    vld1.8        {d16, d17}, [r0], r2
    vabdl.u8      q0, d6, d4
    vabdl.u8      q1, d7, d5
    vld1.8        {d18, d19}, [r1], r3
    ldrd          r6, r7, [sp, #16]     @ r6 = i4_max_sad, r7 = pi4_mb_distortion

loop_sad_16x16_ea8_1:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d18, d16
    vabal.u8      q1, d19, d17
    vld1.8        {d6, d7}, [r1], r3
    subs          r5, #2
    vld1.8        {d16, d17}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d18, d19}, [r1], r3

    bne           loop_sad_16x16_ea8_1

    vabal.u8      q0, d18, d16          @ last even row
    sub           r0, r0, r2, lsl #3    @ rewind src by the 8 doubled strides consumed
    vabal.u8      q1, d19, d17
    sub           r1, r1, r3, lsl #3    @ rewind second block likewise

    vadd.i16      q10, q0, q1           @ q10 = partial (even-row) sums, q0/q1 keep running
    add           r0, r0, r2, asr #1    @ advance one original row: first odd row of src
    vadd.i16      d20, d20, d21
    add           r1, r1, r3, asr #1    @ first odd row of second block

    vpaddl.u16    d20, d20
    vld1.8        {d4, d5}, [r0], r2
    vld1.8        {d6, d7}, [r1], r3
    vpaddl.u32    d20, d20              @ d20[0] = SAD of the even rows
    vld1.8        {d16, d17}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5

    vst1.32       {d20[0]}, [r7]        @ publish the partial SAD
    ldr           r5, [r7]
    cmp           r5, r6
    bgt           end_func_16x16_ea8    @ partial SAD already above the limit

    vld1.8        {d18, d19}, [r1], r3
    mov           r5, #6                @ rows still to be loaded inside loop 2

loop_sad_16x16_ea8_2:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d18, d16
    vabal.u8      q1, d19, d17
    vld1.8        {d6, d7}, [r1], r3
    subs          r5, #2
    vld1.8        {d16, d17}, [r0], r2
    vabal.u8      q0, d6, d4
    vabal.u8      q1, d7, d5
    vld1.8        {d18, d19}, [r1], r3

    bne           loop_sad_16x16_ea8_2

    vabal.u8      q0, d18, d16          @ last odd row
    vabal.u8      q1, d19, d17

    vadd.i16      q0, q0, q1
    vadd.i16      d0, d1, d0

    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0

    vst1.32       {d0[0]}, [r7]         @ full 16-row SAD

end_func_16x16_ea8:

    ldmfd         sp!, {r5-r7, pc}



@/*
@//---------------------------------------------------------------------------
@// Function Name      : Calculate_Mad2_prog()
@//
@// Detail Description : This function finds the SAD of one 16x16 source block
@//                      against 2 reference blocks in one pass and stores the
@//                      two 32-bit results at psad[0] (ref1) and psad[1] (ref2)
@//
@// Platform           : CortexA8/NEON .
@//
@// Registers          : only caller-saved NEON registers (d0-d7, d16-d27)
@//
@//-----------------------------------------------------------------------------
@*/

    .global ime_calculate_sad2_prog_a9q

ime_calculate_sad2_prog_a9q:

    @ r0    = ref1     <UWORD8 *>
    @ r1    = ref2     <UWORD8 *>
    @ r2    = src      <UWORD8 *>
    @ r3    = RefBufferWidth <UWORD32>
    @ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>

    stmfd         sp!, {r4-r5, lr}

    @ Three words were pushed, so the first stack argument sits at sp + 12
    @ (sp + 8 holds the saved lr).
    ldr           r4, [sp, #12]         @ load src stride to r4
    mov           r5, #14               @ rows still to be loaded inside the loop

    @Row 1
    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1

    @Row 2
    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
    vabdl.u8      q12, d2, d0           @ q12:q13 accumulate |ref1 - src|
    vabdl.u8      q13, d3, d1
    vld1.8        {d20, d21}, [r0], r3  @ load ref1 Row 2
    vabdl.u8      q8, d4, d0            @ q8:q9 accumulate |ref2 - src|
    vabdl.u8      q9, d5, d1
    vld1.8        {d22, d23}, [r1], r3  @ load ref2 Row 2

loop_sad2_prog:

    subs          r5, #2                @ flags survive: NEON ops below do not touch them
    @Row 1
    vld1.8        {d0, d1}, [r2], r4    @ load src Row 1
    vabal.u8      q12, d20, d6
    vabal.u8      q13, d21, d7
    vld1.8        {d2, d3}, [r0], r3    @ load ref1 Row 1
    vabal.u8      q8, d22, d6
    vabal.u8      q9, d23, d7
    vld1.8        {d4, d5}, [r1], r3    @ load ref2 Row 1

    @Row 2
    vld1.8        {d6, d7}, [r2], r4    @ load src Row 2
    vabal.u8      q12, d2, d0
    vabal.u8      q13, d3, d1
    vld1.8        {d20, d21}, [r0], r3  @ load ref1 Row 2
    vabal.u8      q8, d4, d0
    vabal.u8      q9, d5, d1
    vld1.8        {d22, d23}, [r1], r3  @ load ref2 Row 2

    bne           loop_sad2_prog

    vabal.u8      q12, d20, d6          @ last pending row
    vabal.u8      q13, d21, d7
    vabal.u8      q8, d22, d6
    vabal.u8      q9, d23, d7

    @ Compute SAD

    vadd.u16      q12, q12, q13         @ Q12 : sad_ref1
    vadd.u16      q8, q8, q9            @ Q8  : sad_ref2

    vadd.u16      d24, d24, d25
    ldr           r5, [sp, #16]         @ loading pi4_sad to r5
    vadd.u16      d16, d16, d17

    vpadd.u16     d24, d24, d16         @ low half: ref1 pairs, high half: ref2 pairs
    vpaddl.u16    d24, d24              @ d24 = {sad_ref1, sad_ref2}

    vst1.64       {d24}, [r5]!

    ldmfd         sp!, {r4-r5, pc}



@/*
@//---------------------------------------------------------------------------
@// Function Name      : Calculate_Mad3_prog()
@//
@// Detail Description : This function finds the SAD of one 16x16 source block
@//                      against 3 reference blocks in one pass and stores the
@//                      three 32-bit results at psad[0..2] (ref1, ref2, ref3)
@//
@// Platform           : CortexA8/NEON .
@//
@// Registers          : d8-d15 are callee-saved and are preserved with
@//                      vpush/vpop around their use
@//
@//-----------------------------------------------------------------------------
@*/

    .global ime_calculate_sad3_prog_a9q

ime_calculate_sad3_prog_a9q:

    @ r0    = ref1     <UWORD8 *>
    @ r1    = ref2     <UWORD8 *>
    @ r2    = ref3     <UWORD8 *>
    @ r3    = src      <UWORD8 *>
    @ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>


    stmfd         sp!, {r4-r6, lr}

    ldrd          r4, r5, [sp, #16]     @ load ref stride to r4, src stride to r5
    mov           r6, #14               @ rows still to be loaded inside the loop
    vpush         {d8-d15}              @ preserve callee-saved NEON registers

    @ Row 1
    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
    vabdl.u8      q8, d2, d0            @ q8:q9   accumulate |ref1 - src|
    vabdl.u8      q9, d3, d1
    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
    vabdl.u8      q10, d4, d0           @ q10:q11 accumulate |ref2 - src|
    vabdl.u8      q11, d5, d1

    @ Row 2
    vld1.8        {d8, d9}, [r3], r5    @ load src Row 2
    vabdl.u8      q12, d6, d0           @ q12:q13 accumulate |ref3 - src|
    vabdl.u8      q13, d7, d1
    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 2
    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 2
    vabal.u8      q8, d10, d8
    vabal.u8      q9, d11, d9
    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 2
    vabal.u8      q10, d12, d8
    vabal.u8      q11, d13, d9

loop_sad3_prog:

    @Row 1
    vld1.8        {d0, d1}, [r3], r5    @ load src Row 1
    vabal.u8      q12, d14, d8
    vabal.u8      q13, d15, d9
    vld1.8        {d2, d3}, [r0], r4    @ load ref1 Row 1
    vld1.8        {d4, d5}, [r1], r4    @ load ref2 Row 1
    vabal.u8      q8, d2, d0
    vabal.u8      q9, d3, d1
    vld1.8        {d6, d7}, [r2], r4    @ load ref3 Row 1
    vabal.u8      q10, d4, d0
    vabal.u8      q11, d5, d1

    @Row 2
    vld1.8        {d8, d9}, [r3], r5    @ load src Row 2
    vabal.u8      q12, d6, d0
    vabal.u8      q13, d7, d1
    vld1.8        {d10, d11}, [r0], r4  @ load ref1 Row 2
    subs          r6, #2
    vld1.8        {d12, d13}, [r1], r4  @ load ref2 Row 2
    vabal.u8      q8, d10, d8
    vabal.u8      q9, d11, d9
    vld1.8        {d14, d15}, [r2], r4  @ load ref3 Row 2
    vabal.u8      q10, d12, d8
    vabal.u8      q11, d13, d9

    bne           loop_sad3_prog

    vabal.u8      q12, d14, d8          @ last pending ref3 row
    vabal.u8      q13, d15, d9

    vpop          {d8-d15}              @ d8-d15 are not needed past this point

    @ Compute SAD

    vadd.u16      q8, q8, q9            @ Q8  : sad_ref1
    vadd.u16      q10, q10, q11         @ Q10 : sad_ref2
    vadd.u16      q12, q12, q13         @ Q12 : sad_ref3

    vadd.u16      d16, d16, d17
    vadd.u16      d20, d20, d21
    vadd.u16      d24, d24, d25

    vpadd.u16     d16, d16, d20
    vpadd.u16     d24, d24, d24

    ldr           r6, [sp, #24]         @ loading pi4_sad to r6 (stack is back to 16 pushed bytes)
    vpaddl.u16    d16, d16              @ d16 = {sad_ref1, sad_ref2}
    vpaddl.u16    d24, d24              @ d24[0] = sad_ref3

    vst1.64       {d16}, [r6]!
    vst1.32       {d24[0]}, [r6]

    ldmfd         sp!, {r4-r6, pc}



@/**
@******************************************************************************
@*
@* @brief computes distortion (SAD) for sub-pel motion estimation
@*
@* @par Description
@*   This function computes SAD for all the 8 half pel points
@*
@* @param[out] pi4_sad
@*  integer evaluated sad
@*  pi4_sad[0] - half x
@*  pi4_sad[1] - half x - 1
@*  pi4_sad[2] - half y
@*  pi4_sad[3] - half y - strd
@*  pi4_sad[4] - half xy
@*  pi4_sad[5] - half xy - 1
@*  pi4_sad[6] - half xy - strd
@*  pi4_sad[7] - half xy - 1 - strd
@*
@* @remarks
@*  Register use as read from the code: r0 = source block, r1 = "x" plane,
@*  r2 = "y" plane, r3 = "xy" plane; stack = source stride, plane stride,
@*  pi4_sad. d8-d15 are callee-saved and are preserved with vpush/vpop.
@*
@******************************************************************************
@*/

.text
.p2align 2

    .global ime_sub_pel_compute_sad_16x16_a9q

ime_sub_pel_compute_sad_16x16_a9q:

    stmfd         sp!, {r4-r11, lr}     @store register values to stack

    ldr           r9, [sp, #36]         @ source stride (36 bytes pushed)
    ldr           r10, [sp, #40]        @ stride of the interpolated planes
    vpush         {d8-d15}              @ preserve callee-saved NEON registers

    sub           r4, r1, #1            @ x left
    sub           r5, r2, r10           @ y top

    sub           r6, r3, #1            @ xy left
    sub           r7, r3, r10           @ xy top

    sub           r8, r7, #1            @ xy top-left
    mov           r11, #15              @ rows handled inside the loop

    vld1.8        {d0, d1}, [r0], r9    @ src
    vld1.8        {d2, d3}, [r5], r10   @ y top LOAD
    vld1.8        {d4, d5}, [r7], r10   @ xy top LOAD
    vld1.8        {d6, d7}, [r8], r10   @ xy top-left LOAD

    vabdl.u8      q6, d2, d0            @ y top ABS1
    vabdl.u8      q7, d4, d0            @ xy top ABS1
    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
    vabdl.u8      q8, d6, d0            @ xy top-left ABS1
    vabdl.u8      q9, d8, d0            @ x ABS1
    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD

    vabal.u8      q6, d3, d1            @ y top ABS2
    vabal.u8      q7, d5, d1            @ xy top ABS2
    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
    vabal.u8      q8, d7, d1            @ xy top-left ABS2
    vabal.u8      q9, d9, d1            @ x ABS2
    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD

    vabdl.u8      q10, d10, d0          @ x left ABS1
    vabdl.u8      q11, d2, d0           @ y ABS1
    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
    vabdl.u8      q12, d4, d0           @ xy ABS1
    vabdl.u8      q13, d6, d0           @ xy left ABS1

loop_sub_pel_16x16:

    vabal.u8      q10, d11, d1          @ x left ABS2
    vabal.u8      q11, d3, d1           @ y ABS2
    subs          r11, #1
    vabal.u8      q12, d5, d1           @ xy ABS2
    vabal.u8      q13, d7, d1           @ xy left ABS2

    @ The rows just used as y / xy / xy-left (still in d2-d7) serve as the
    @ "top" rows of the next source row, so no separate top loads are needed.
    vld1.8        {d0, d1}, [r0], r9    @ src
    vabal.u8      q6, d2, d0            @ y top ABS1
    vabal.u8      q7, d4, d0            @ xy top ABS1
    vld1.8        {d8, d9}, [r1], r10   @ x LOAD
    vabal.u8      q8, d6, d0            @ xy top-left ABS1
    vabal.u8      q9, d8, d0            @ x ABS1
    vld1.8        {d10, d11}, [r4], r10 @ x left LOAD

    vabal.u8      q6, d3, d1            @ y top ABS2
    vabal.u8      q7, d5, d1            @ xy top ABS2
    vld1.8        {d2, d3}, [r2], r10   @ y LOAD
    vabal.u8      q8, d7, d1            @ xy top-left ABS2
    vabal.u8      q9, d9, d1            @ x ABS2
    vld1.8        {d4, d5}, [r3], r10   @ xy LOAD

    vabal.u8      q10, d10, d0          @ x left ABS1
    vabal.u8      q11, d2, d0           @ y ABS1
    vld1.8        {d6, d7}, [r6], r10   @ xy left LOAD
    vabal.u8      q12, d4, d0           @ xy ABS1
    vabal.u8      q13, d6, d0           @ xy left ABS1

    bne           loop_sub_pel_16x16

    vabal.u8      q10, d11, d1          @ x left ABS2
    vabal.u8      q11, d3, d1           @ y ABS2
    vabal.u8      q12, d5, d1           @ xy ABS2
    vabal.u8      q13, d7, d1           @ xy left ABS2

    vadd.i16      d0, d18, d19          @ x
    vadd.i16      d3, d12, d13          @ y top
    vadd.i16      d6, d14, d15          @ xy top
    vadd.i16      d5, d26, d27          @ xy left
    vadd.i16      d1, d20, d21          @ x left
    vadd.i16      d2, d22, d23          @ y
    vadd.i16      d4, d24, d25          @ xy
    vadd.i16      d7, d16, d17          @ xy top left

    vpop          {d8-d15}              @ all sums now live in d0-d7

    vpadd.i16     d0, d0, d1
    vpadd.i16     d2, d2, d3
    vpadd.i16     d4, d4, d5
    vpadd.i16     d6, d6, d7

    vpaddl.u16    d0, d0                @ {x, x left}
    vpaddl.u16    d2, d2                @ {y, y top}
    ldr           r11, [sp, #44]        @ pi4_sad (stack is back to 36 pushed bytes)
    vpaddl.u16    d4, d4                @ {xy, xy left}
    vpaddl.u16    d6, d6                @ {xy top, xy top-left}

    vst1.32       {d0}, [r11]!
    vst1.32       {d2}, [r11]!
    vst1.32       {d4}, [r11]!
    vst1.32       {d6}, [r11]!

    ldmfd         sp!, {r4-r11, pc}     @Restoring registers from stack



@/**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks
@*
@* @par Description
@*   This function computes SAD between 2 16x16 blocks. The interface carries
@*   a maximum-SAD argument, but this implementation never reads it and
@*   always evaluates the distortion of the entire block.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (r0)
@*
@* @param[in] pu1_dst
@*  UWORD8 pointer to the second block (r1); it is only read by this routine
@*
@* @param[in] src_strd
@*  integer source stride (r2)
@*
@* @param[in] dst_strd
@*  integer destination stride (r3)
@*
@* @param[in] i4_max_sad
@*  integer maximum allowed distortion (first stack argument; not read)
@*
@* @param[out] pi4_mb_distortion
@*  integer evaluated sad (second stack argument)
@*
@* @remarks
@*  Only caller-saved NEON registers (d0-d7, d16-d19) are used.
@*
@******************************************************************************
@*/

.text
.p2align 2

    .global ime_compute_sad_16x16_a9q

ime_compute_sad_16x16_a9q:

    stmfd         sp!, {r12, r14}       @store register values to stack

    vld1.8        {d4, d5}, [r0], r2    @ src row 0
    vld1.8        {d6, d7}, [r1], r3    @ dst row 0

    mov           r12, #14              @ rows still to be loaded inside the loop
    vld1.8        {d16, d17}, [r0], r2  @ src row 1
    vabdl.u8      q0, d4, d6
    vld1.8        {d18, d19}, [r1], r3  @ dst row 1
    vabdl.u8      q1, d5, d7

loop_sad_16x16:

    vld1.8        {d4, d5}, [r0], r2
    vabal.u8      q0, d16, d18
    vld1.8        {d6, d7}, [r1], r3
    vabal.u8      q1, d17, d19

    vld1.8        {d16, d17}, [r0], r2
    vabal.u8      q0, d4, d6
    subs          r12, #2
    vld1.8        {d18, d19}, [r1], r3
    vabal.u8      q1, d5, d7

    bne           loop_sad_16x16

    vabal.u8      q0, d16, d18          @ last pending row
    vabal.u8      q1, d17, d19

    vadd.i16      q0, q0, q1
    vadd.i16      d0, d1, d0
    ldr           r12, [sp, #12]        @ pi4_mb_distortion (8 bytes pushed + 4)

    vpaddl.u16    d0, d0
    vpaddl.u32    d0, d0
    vst1.32       {d0[0]}, [r12]

    ldmfd         sp!, {r12, pc}        @Restoring registers from stack


@/*
@//---------------------------------------------------------------------------
@// Function Name      : Calculate_Mad4_prog()
@//
@// Detail Description : This function finds the SAD values of one 16x16 block
@//                      against the 4 neighbouring positions (left, right,
@//                      top, bottom) of temp_frame in one pass. Results are
@//                      stored as psad[0..3] = left, right, top, bottom.
@//
@// Platform           : CortexA8/NEON .
@//
@// Registers          : d8-d15 are callee-saved and are preserved with
@//                      vpush/vpop around their use
@//
@//-----------------------------------------------------------------------------
@*/

    .global ime_calculate_sad4_prog_a9q

ime_calculate_sad4_prog_a9q:
    @ r0    = temp_frame     <UWORD8 *>
    @ r1    = buffer_ptr     <UWORD8 *>
    @ r2    = RefBufferWidth <UWORD32>
    @ r3    = CurBufferWidth <UWORD32>
    @ stack = psad           <UWORD32 *>

    stmfd         sp!, {r4-r7, lr}
    vpush         {d8-d15}              @ preserve callee-saved NEON registers

    @UWORD8 *left_ptr       = temp_frame - 1;
    @UWORD8 *right_ptr      = temp_frame + 1;
    @UWORD8 *top_ptr        = temp_frame - RefBufferWidth;
    @UWORD8 *bot_ptr        = temp_frame + RefBufferWidth;

    mov           r7, #14               @ rows still to be loaded inside the loop
    sub           r4, r0, #0x01         @r4 = left_ptr
    add           r5, r0, #0x1          @r5 = right_ptr
    sub           r6, r0, r2            @r6 = top_ptr
    add           r0, r0, r2            @r0 = bot_ptr
                                        @r1 = buffer_ptr

    @D0:D1  : buffer
    @D2:D3  : top
    @D4:D5  : left
    @D6:D7  : right
    @D8:D9  : bottom

    @Row 1
    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1

    vabdl.u8      q5, d2, d0
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
    vabdl.u8      q6, d3, d1

    vabdl.u8      q7, d0, d4
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
    vabdl.u8      q8, d1, d5

    @Row 2
    vabdl.u8      q9, d0, d6
    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
    vabdl.u8      q10, d1, d7

    vabdl.u8      q11, d0, d8
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
    vabdl.u8      q12, d1, d9

loop_sad4_prog:

    vabal.u8      q5, d26, d2
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
    vabal.u8      q6, d27, d3

    vabal.u8      q7, d26, d4
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
    vabal.u8      q8, d27, d5

    vabal.u8      q9, d26, d6
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
    vabal.u8      q10, d27, d7

    @Row 1
    vabal.u8      q11, d26, d8
    vld1.8        {d0, d1}, [r1], r3    @ load src Row 1
    vabal.u8      q12, d27, d9

    vld1.8        {d2, d3}, [r6], r2    @ load top Row 1
    subs          r7, #2
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 1

    vabal.u8      q5, d0, d2

    vld1.8        {d6, d7}, [r5], r2    @ load right Row 1
    vabal.u8      q6, d1, d3

    vabal.u8      q7, d0, d4
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 1
    vabal.u8      q8, d1, d5

    @Row 2
    vabal.u8      q9, d0, d6
    vld1.8        {d26, d27}, [r1], r3  @ load src Row 2
    vabal.u8      q10, d1, d7

    vabal.u8      q11, d0, d8
    vld1.8        {d2, d3}, [r6], r2    @ load top Row 2
    vabal.u8      q12, d1, d9

    bne           loop_sad4_prog

    vabal.u8      q5, d26, d2
    vld1.8        {d4, d5}, [r4], r2    @ load left Row 2
    vabal.u8      q6, d27, d3

    vabal.u8      q7, d26, d4
    vld1.8        {d6, d7}, [r5], r2    @ load right Row 2
    vabal.u8      q8, d27, d5

    vabal.u8      q9, d26, d6
    vld1.8        {d8, d9}, [r0], r2    @ load bottom Row 2
    vabal.u8      q10, d27, d7

    vabal.u8      q11, d26, d8
    vabal.u8      q12, d27, d9

    @;Q5:Q6   : sad_top
    @;Q7:Q8   : sad_left
    @;Q9:Q10  : sad_right
    @;Q11:Q12 : sad_bot

    vadd.u16      q5, q5, q6
    vadd.u16      q7, q7, q8
    vadd.u16      q9, q9, q10
    vadd.u16      q11, q11, q12

    @; Free :-
    @; Q6,Q8,Q10,Q12

    @;Q5  -> D10:D11
    @;Q7  -> D14:D15
    @;Q9  -> D18:D19
    @;Q11 -> D22:D23

    vadd.u16      d10, d10, d11
    vadd.u16      d14, d14, d15
    vadd.u16      d18, d18, d19
    vadd.u16      d22, d22, d23

    @;D10 : sad_top
    @;D14 : sad_left
    @;D18 : sad_right
    @;D22 : sad_bot


    vpaddl.u16    d11, d10
    vpaddl.u16    d15, d14
    vpaddl.u16    d19, d18
    vpaddl.u16    d23, d22

    @;D11 : sad_top
    @;D15 : sad_left
    @;D19 : sad_right
    @;D23 : sad_bot

    vpaddl.u32    d10, d11
    vpaddl.u32    d22, d23
    vpaddl.u32    d14, d15
    vpaddl.u32    d18, d19

    @;D10 : sad_top
    @;D14 : sad_left
    @;D18 : sad_right
    @;D22 : sad_bot

    @ psad is the first stack argument: 20 bytes of core registers plus
    @ 64 bytes of NEON registers are on the stack at this point.
    ldr           r4, [sp, #84]

    vsli.64       d10, d22, #32         @ d10 = {sad_top, sad_bot}
    vsli.64       d14, d18, #32         @ d14 = {sad_left, sad_right}

    vst1.64       {d14}, [r4]!
    vst1.64       {d10}, [r4]!

    vpop          {d8-d15}              @ restore callee-saved NEON registers
    @ The core-register restore and return (ldmfd sp!, {r4-r7, pc}) follows
    @ immediately after this point.
+
+    ldmfd         sp!, {r4-r7, pc}
+
+
+
+
+@*****************************************************************************
+@*
+@* Function Name     : ime_compute_satqd_16x16_lumainter_a9q
+@* Description       : This function computes the SAD for a 16x16 block.
+@*                   : It also computes whether any 4x4 block will have a nonzero coefficient after transform and quant
+@*
+@* Arguments         : R0 :pointer to src buffer
+@*                     R1 :pointer to est buffer
+@*                     R2 :source stride
+@*                     R3 :est stride
+@*                     STACK :threshold pointer, distortion (SAD) pointer, is_nonzero pointer
+@*
+@* Values Returned   : NONE (SAD and the nonzero flag are stored through the stack pointers)
+@*
+@* Register Usage    : R0-R11, Q0-Q15 (callee-saved D8-D15 are saved and restored)
+@* Stack Usage       : 104 bytes (40 for r4-r12,lr + 64 for d8-d15)
+@* Cycles            : Around
+@* Interruptibility  : Interruptible
+@*
+@* Known Limitations
+@* \Assumptions      : The threshold buffer holds at least 9 halfwords (8 loaded into q11, one read at offset 16)
+@*
+@* Revision History  :
+@*         DD MM YYYY    Author(s)          Changes
+@*         14 04 2014    Harinarayanan K K  First version
+@*
+@*****************************************************************************
+    .global ime_compute_satqd_16x16_lumainter_a9q
+ime_compute_satqd_16x16_lumainter_a9q:
+    @R0 :pointer to src buffer
+    @R1 :pointer to est buffer
+    @R2 :Source stride
+    @R3 :Pred stride
+    @R4 :Threshold pointer
+    @R5 :Distortion,ie SAD
+    @R6 :is nonzero
+
+    push          {r4-r12, lr}          @push all the variables first
+    vpush         {d8-d15}              @q4-q7 are used below and are callee-saved under the AAPCS
+    ldr           r4, [sp, #104]        @load the threshold address (40 bytes core regs + 64 bytes NEON regs)
+
+    mov           r8, #8                @Number of 4x8 blocks to be processed
+    mov           r10, #0               @Sad
+    mov           r7, #0                @Nonzero info
+    @----------------------------------------------------
+
+    vld1.u8       d30, [r0], r2         @I load 8 pix src row 1
+
+    vld1.u8       d31, [r1], r3         @I load 8 pix pred row 1
+
+    vld1.u8       d28, [r0], r2         @I load 8 pix src row 2
+
+    vld1.u8       d29, [r1], r3         @I load 8 pix pred row 2
+
+    vld1.u8       d26, [r0], r2         @I load 8 pix src row 3
+    vabdl.u8      q0, d30, d31          @I Abs diff r1 blk 12
+
+    vld1.u8       d27, [r1], r3         @I load 8 pix pred row 3
+
+    vld1.u8       d24, [r0], r2         @I load 8 pix src row 4
+
+    vld1.u8       d25, [r1], r3         @I load 8 pix pred row 4
+    vabdl.u8      q1, d28, d29          @I Abs diff r2 blk 12
+
+    vld1.u16      {q11}, [r4]           @I load the threshold
+    vabdl.u8      q2, d26, d27          @I Abs diff r3 blk 12
+
+    vabdl.u8      q3, d24, d25          @I Abs diff r4 blk 12
+
+
+
+core_loop:
+    @S1  S2  S3  S4     A1  A2  A3  A4
+    @S5  S6  S7  S8     A5  A6  A7  A8
+    @S9  S10 S11 S12    A9  A10 A11 A12
+    @S13 S14 S15 S16    A13 A14 A15 A16
+    ands          r11, r8, #1           @II See if we are at even or odd block
+    vadd.u16      q4 , q0, q3           @I Add r1 r4
+    lsl           r11, r2, #2           @II Move back src 4 rows (flags from ands are preserved)
+
+    subeq         r0, r0, r11           @II Move back src 4 rows if we are at even block
+    vadd.u16      q5 , q1, q2           @I Add r2 r3
+    addeq         r0, r0, #8            @II Move src 8 cols forward if we are at even block
+
+    lsl           r11, r3, #2           @II Move back pred 4 rows
+    vtrn.16       d8 , d10              @I trnspse 1
+    subeq         r1, r1, r11           @II Move back pred 4 rows if we are at even block
+
+    addeq         r1, r1, #8            @II Move pred 8 cols forward if we are at even block
+    vtrn.16       d9 , d11              @I trnspse 2
+    subne         r0, r0, #8            @II Src 8 cols back for odd rows
+
+    subne         r1, r1, #8            @II Pred 8 cols back for odd rows
+    vtrn.32       d10, d11              @I trnspse 4
+
+
+    vtrn.32       d8 , d9               @I trnspse 3
+    vswp          d10, d11              @I rearrange so that the q4 and q5 add properly
+    @D8     S1 S4 A1 A4
+    @D9     S2 S3 A2 A3
+    @D11    S1 S4 A1 A4
+    @D10    S2 S3 A2 A3
+
+    vadd.s16      q6, q4, q5            @I Get s1 s4
+    vld1.u8       d30, [r0], r2         @II load first 8 pix src row 1
+
+    vtrn.s16      d12, d13              @I Get s2 s3
+    @D12 S1 S4 A1 A4
+    @D13 S2 S3 A2 A3
+
+    vshl.s16      q7, q6 , #1           @I si = si<<1
+    vld1.u8       d31, [r1], r3         @II load first 8 pix pred row 1
+
+    vpadd.s16     d16, d12, d13         @I (s1 + s4) (s2 + s3)
+    vld1.u8       d28, [r0], r2         @II load first 8 pix src row 2
+    @ D16 S14 A14 S23 A23
+    vrev32.16     d0, d16               @I
+    vuzp.s16      d16, d0               @I
+    @D16 S14 S23 A14 A23
+    vadd.s16      d17, d12, d13         @I (s1 + s2) (s3 + s4)
+    vld1.u8       d29, [r1], r3         @II load first 8 pix pred row 2
+    @D17 S12 S34 A12 A34
+
+    vrev32.16     q9, q7                @I Rearrange si's
+    @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
+
+    @D12 S1 S4 A1 A4
+    @D19 Z3 Z2 Y3 Y2
+    vsub.s16      d8, d12, d19          @I (s1 - (s3<<1)) (s4 - (s2<<1))
+    vld1.u8       d26, [r0], r2         @II load first 8 pix src row 3
+    @D13 S2 S3 A2 A3
+    @D18 Z4 Z1 Y4 Y1
+    vsub.s16      d9, d13, d18          @I (s2 - (s4<<1)) (s3 - (s1<<1))
+    vld1.u8       d27, [r1], r3         @II load first 8 pix pred row 3
+    @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+
+    @D16 S14 S23 A14 A23
+    vpadd.s16     d10, d16, d17         @I Get sad by adding s1 s2 s3 s4
+    vld1.u8       d24, [r0], r2         @II load first 8 pix src row 4
+    @D22 SAD1 SAD2 junk junk
+
+
+    @Q8     S2 S1 A2 A1 S6 S3 A6 A3
+    @Q10    S8 S5 A8 A5 S7 S4 A7 A4
+    vtrn.32       q8, q4                @I Rearrange to bring the ls of each block together
+    @Q8     S2 S1 S8 S5 S6 S3 S7 S4
+    @Q10    A2 A1 A8 A5 A6 A3 A7 A4
+
+
+    ldrh          r11, [r4, #16]        @I Load the threshold for DC val blk 1
+    vdup.s16      q6, d10[0]            @I Get the sad blk 1
+    vabdl.u8      q0, d30, d31          @II Abs diff r1 blk 12
+
+    vshl.s16      q7, q6, #1            @I sad_2 = sad_1<<1
+    vmov.s16      r9, d10[0]            @I Get the sad for block 1
+
+    vsub.s16      q9, q7, q8            @I Add to the lss
+    vmov.s16      r5, d10[1]            @I Get the sad for block 2
+
+    vcle.s16      q7, q11, q9           @I Compare with the thresholds, blk 1
+    vld1.u8       d25, [r1], r3         @II load first 8 pix pred row 4
+
+    vdup.s16      q15, d10[1]           @I Get the sad blk 2
+    vabdl.u8      q1, d28, d29          @II Abs diff r2 blk 12
+
+
+    vshl.s16      q14, q15, #1          @I sad_2 = sad_1<<1
+    vsub.s16      q3, q14, q4           @I Add to the lss
+    vcle.s16      q15, q11, q3          @I Compare with the thresholds, blk 2
+
+    ADD           R10, R10, R9          @I Add to the global sad blk 1
+    vtrn.u8       q15, q7               @I get all comparison bits to one reg
+    vabdl.u8      q2, d26, d27          @II Abs diff r3 blk 12
+
+    ADD           R10, R10, R5          @I Add to the global sad blk 2
+    vshr.u8       q14, q15, #7          @I Shift the bits so that no overflow occurs
+    cmp           r11, r9               @I Compare with threshold blk 1
+
+    movle         r7, #0xf              @I If not met mark it by moving non zero val to R7 blk 1
+    vadd.u8       d28, d28, d29         @I Add the bits
+    cmp           r11, r5               @I Compare with threshold blk 2
+
+    movle         r7, #0xf              @I If not met mark it by moving non zero val to R7 blk 2
+    vpadd.u8      d28, d28, d29         @I Add the bits
+
+    vmov.u32      r11, d28[0]           @I A set bit now represents an unsatisfied condition, move it to r11
+    vabdl.u8      q3, d24, d25          @II Abs diff r4 blk 12
+
+    orr           r7, r7, r11           @I merge the comparison bits into r7
+
+
+    sub           r8, r8, #1            @I Decrement block count
+
+    cmp           r7, #0                @I If we have at least one non zero block
+    bne           compute_sad_only      @I if a non zero block is there, from now on compute sad only
+
+    cmp           r8, #1                @I See if we are at the last block
+    bne           core_loop             @I If the blocks are zero, lets continue the satqd
+
+
+    @EPILOGUE for core loop
+    @S1  S2  S3  S4     A1  A2  A3  A4
+    @S5  S6  S7  S8     A5  A6  A7  A8
+    @S9  S10 S11 S12    A9  A10 A11 A12
+    @S13 S14 S15 S16    A13 A14 A15 A16
+    vadd.u16      q4 , q0, q3           @Add r1 r4
+    vadd.u16      q5 , q1, q2           @Add r2 r3
+    @D8     S1 S2 S2 S1
+    @D10    S4 S3 S3 S4
+    @D9     A1 A2 A2 A1
+    @D11    A4 A3 A3 A4
+    vtrn.16       d8 , d10              @I trnspse 1
+    vtrn.16       d9 , d11              @I trnspse 2
+    vtrn.32       d8 , d9               @I trnspse 3
+    vtrn.32       d10, d11              @I trnspse 4
+
+    vswp          d10, d11              @I rearrange so that the q4 and q5 add properly
+    @D8     S1 S4 A1 A4
+    @D9     S2 S3 A2 A3
+    @D11    S1 S4 A1 A4
+    @D10    S2 S3 A2 A3
+    vadd.s16      q6, q4, q5            @Get s1 s4
+    vtrn.s16      d12, d13              @Get s2 s3
+    @D12 S1 S4 A1 A4
+    @D13 S2 S3 A2 A3
+
+    vshl.s16      q7, q6 , #1           @si = si<<1
+    vmov.s16      r9, d10[0]            @(value is reloaded below once d10 holds the sad)
+
+    vpadd.s16     d16, d12, d13         @(s1 + s4) (s2 + s3)
+    vmov.s16      r5, d10[1]            @(value is reloaded below once d10 holds the sad)
+    @D16 S14 A14 S23 A23
+    vrev32.16     d30, d16              @
+    vuzp.s16      d16, d30              @
+    @D16 S14 S23 A14 A23
+    vadd.s16      d17, d12, d13         @(s1 + s2) (s3 + s4)
+    @D17 S12 S34 A12 A34
+
+    vrev32.16     q9, q7                @Rearrange si's
+    @Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
+
+    @D12 S1 S4 A1 A4
+    @D19 Z3 Z2 Y3 Y2
+    vsub.s16      d8, d12, d19          @(s1 - (s3<<1)) (s4 - (s2<<1))
+    @D13 S2 S3 A2 A3
+    @D18 Z4 Z1 Y4 Y1
+    vsub.s16      d9, d13, d18          @(s2 - (s4<<1)) (s3 - (s1<<1))
+    @Q10 S8 S5 A8 A5 S7 S4 A7 A4
+
+    @D16 S14 S23 A14 A23
+    vpadd.s16     d10, d16, d17         @I Get sad by adding s1 s2 s3 s4
+    @D22 SAD1 SAD2 junk junk
+    vmov.u16      r9, d10[0]            @Get the sad for block 1
+    vmov.u16      r5, d10[1]            @Get the sad for block 2
+
+    @Q8     S2 S1 A2 A1 S6 S3 A6 A3
+    @Q10    S8 S5 A8 A5 S7 S4 A7 A4
+    ldrh          r11, [r4, #16]        @Load the threshold for DC val blk 1
+    vtrn.32       q8, q4                @Rearrange to bring the ls of each block together
+    ADD           R10, R10, R9          @Add to the global sad blk 1
+
+    @Q8     S2 S1 S8 S5 S6 S3 S7 S4
+    @Q10    A2 A1 A8 A5 A6 A3 A7 A4
+
+    vld1.u16      {q11}, [r4]           @load the threshold
+    ADD           R10, R10, R5          @Add to the global sad blk 2
+
+    vdup.u16      q6, d10[0]            @Get the sad blk 1
+
+    cmp           r11, r9               @Compare with threshold blk 1
+    vshl.u16      q7, q6, #1            @sad_2 = sad_1<<1
+
+    vsub.s16      q9, q7, q8            @Add to the lss
+
+    vcle.s16      q15, q11, q9          @Compare with the thresholds, blk 1
+    movle         r7, #0xf              @If not met mark it by moving non zero val to R7 blk 1
+
+    cmp           r11, r5               @Compare with threshold blk 2
+    vdup.u16      q14, d10[1]           @Get the sad blk 2
+
+    vshl.u16      q13, q14, #1          @sad_2 = sad_1<<1
+    vsub.s16      q12, q13, q4          @Add to the lss
+    vcle.s16      q14, q11, q12         @Compare with the thresholds, blk 2
+    movle         r7, #0xf              @If not met mark it by moving non zero val to R7 blk 2
+
+    vtrn.u8       q14, q15              @get all comparison bits to one reg
+    vshr.u8       q14, q14, #7          @Shift the bits so that no overflow occurs
+    vadd.u8       d28, d28, d29         @Add the bits
+    vpadd.u8      d28, d28, d29         @Add the bits
+    vmov.u32      r11, d28[0]           @A set bit now represents an unsatisfied condition, move it to r11
+    orr           r7, r7, r11           @merge the comparison bits into r7
+
+    b             funcend_sad_16x16     @Since all blocks are processed now, go to end
+
+compute_sad_only:                       @This block computes SAD only, so will be lighter
+                                        @It will start processing at an odd block
+                                        @It will compute sad for the odd block,
+                                        @and then for two blocks at a time
+                                        @The counter is r8, hence r8 blocks will be processed
+
+    and           r11, r8, #1           @Get the last bit of counter
+    cmp           r11, #0               @See if we are at even or odd block
+                                        @if the blk is even we just have to set the pointer to the
+                                        @start of current row
+
+    lsleq         r11, r2, #2           @I Move back src 4 rows
+    subeq         r0, r0, r11           @I Move back src 4 rows if we are at even block
+
+    lsleq         r11, r3, #2           @I Move back pred 4 rows
+    subeq         r1, r1, r11           @I Move back pred 4 rows if we are at even block
+    @ADDEQ R8,R8,#2         ;Inc counter
+    beq           skip_odd_blk          @If the blk is odd we have to compute sad
+
+
+    vadd.u16      q4, q0, q1            @Add SAD of row1 and row2
+    vadd.u16      q5, q2, q3            @Add SAD of row3 and row4
+    vadd.u16      q6, q4, q5            @Add SAD of row 1-4
+    vadd.u16      d14, d12, d13         @Add Blk1 and blk2
+    vpadd.u16     d16, d14, d15         @Add col 1-2 and 3-4
+    vpadd.u16     d18, d16, d17         @Add col 12-34
+
+    vmov.u16      r9, d18[0]            @Move sad to arm
+    ADD           R10, R10, R9          @Add to the global sad
+
+    sub           r8, r8, #1            @Dec counter
+    cmp           r8, #0                @See if we processed last block
+    beq           funcend_sad_16x16     @if processed last block goto end of func
+
+    sub           r0, r0, #8            @Since we processed odd block move back src by 8 cols
+    sub           r1, r1, #8            @Since we processed odd block move back pred by 8 cols
+
+skip_odd_blk:
+
+    vmov.s16      q0, #0                @Initialize the accumulator
+    vmov.s16      q1, #0                @Initialize the accumulator
+
+    vld1.u8       {q15}, [r0], r2       @load src r1
+    vld1.u8       {q14}, [r1], r3       @load pred r1
+
+    vld1.u8       {q13}, [r0], r2       @load src r2
+    vld1.u8       {q12}, [r1], r3       @load pred r2
+
+    vld1.u8       {q11}, [r0], r2       @load src r3
+    vld1.u8       {q10}, [r1], r3       @load pred r3
+
+    vld1.u8       {q9}, [r0], r2        @load src r4
+    vld1.u8       {q8}, [r1], r3        @load pred r4
+
+    cmp           r8, #2
+    beq           sad_epilouge
+
+sad_loop:
+
+    vabal.u8      q0, d30, d28          @I accumulate Abs diff R1
+    vabal.u8      q1, d31, d29          @I accumulate Abs diff R1
+
+    vld1.u8       {q15}, [r0], r2       @II load r1 src
+    vabal.u8      q0, d26, d24          @I accumulate Abs diff R2
+
+    vld1.u8       {q14}, [r1], r3       @II load r1 pred
+    vabal.u8      q1, d27, d25          @I accumulate Abs diff R2
+
+    vld1.u8       {q13}, [r0], r2       @II load r2 src
+    vabal.u8      q0, d22, d20          @I accumulate Abs diff R3
+
+    vld1.u8       {q12}, [r1], r3       @II load r2 pred
+    vabal.u8      q1, d23, d21          @I accumulate Abs diff R3
+
+    vld1.u8       {q11}, [r0], r2       @II load r3 src
+    vabal.u8      q0, d18, d16          @I accumulate Abs diff R4
+
+
+    sub           r8, r8, #2            @Since we process 16 pix at a time, dec by 2
+    vld1.u8       {q10}, [r1], r3       @II load r3 pred
+    vabal.u8      q1, d19, d17          @I accumulate Abs diff R4
+
+    cmp           r8, #2                @Check if last loop
+    vld1.u8       {q9}, [r0], r2        @II load r4 src
+    vld1.u8       {q8}, [r1], r3        @II load r4 pred
+
+    bne           sad_loop              @Go back to SAD computation
+
+sad_epilouge:
+    vabal.u8      q0, d30, d28          @Accumulate Abs diff R1
+    vabal.u8      q1, d31, d29          @Accumulate Abs diff R1
+
+    vabal.u8      q0, d26, d24          @Accumulate Abs diff R2
+    vabal.u8      q1, d27, d25          @Accumulate Abs diff R2
+
+    vabal.u8      q0, d22, d20          @Accumulate Abs diff R3
+    vabal.u8      q1, d23, d21          @Accumulate Abs diff R3
+
+    vabal.u8      q0, d18, d16          @Accumulate Abs diff R4
+    vabal.u8      q1, d19, d17          @Accumulate Abs diff R4
+
+    vadd.u16      q2, q0, q1            @ADD two accumulators
+    vadd.u16      d6, d4, d5            @Add two blk sad
+    vpadd.u16     d8, d6, d7            @Add col 1-2 and 3-4 sad
+    vpadd.u16     d10, d8, d9           @Add col 12-34 sad
+
+    vmov.u16      r9, d10[0]            @move SAD to ARM
+    ADD           R10, R10, R9          @Add to the global sad
+
+funcend_sad_16x16:                      @End of function process
+    ldr           r5, [sp, #108]        @distortion pointer (stack arg, above the 104 saved bytes)
+    ldr           r6, [sp, #112]        @is_nonzero pointer
+
+    str           r7, [r6]              @Store the is zero reg
+    str           r10, [r5]             @Store sad
+
+    vpop          {d8-d15}              @restore the callee-saved NEON registers
+    pop           {r4-r12, pc}
+
+
diff --git a/encoder/arm/ime_platform_macros.h b/encoder/arm/ime_platform_macros.h
new file mode 100755
index 0000000..0f5b2f2
--- /dev/null
+++ b/encoder/arm/ime_platform_macros.h
@@ -0,0 +1,51 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+*  ime_platform_macros.h
+*
+* @brief
+*  Platform specific Macro definitions used in the codec
+*
+* @author
+*  Ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef _IME_PLATFORM_MACROS_H_
+#define _IME_PLATFORM_MACROS_H_
+
+/*****************************************************************************/
+/* Function macro definitions                                                */
+/*****************************************************************************/
+/* USADA8: sad += |src[i] - est[i]| summed over i = 0..3; ABS is not defined in this header */
+#define USADA8(src,est,sad) \
+    ((sad) += ABS((src)[0]-(est)[0]) + \
+              ABS((src)[1]-(est)[1]) + \
+              ABS((src)[2]-(est)[2]) + \
+              ABS((src)[3]-(est)[3]))
+
+
+#endif /* _IME_PLATFORM_MACROS_H_ */
diff --git a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
new file mode 100755
index 0000000..c442077
--- /dev/null
+++ b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
@@ -0,0 +1,592 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore
+//*/
+///**
+
+///**
+//******************************************************************************
+//*
+//* @brief :Evaluate best intra 16x16 mode (among VERT, HORZ and DC )
+//* and do the prediction.
+//*
+//* @par Description
+//* This function evaluates the first three 16x16 modes, computes the corresponding sad
+//* and returns the buffer predicted with the best mode.
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[in] pu1_ngbr_pels_i16
+//* UWORD8 pointer to neighbouring pels
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] u4_n_avblty
+//* availability of neighbouring pixels
+//*
+//* @param[out] u4_intra_mode
+//* Pointer to the variable in which best mode is returned (32-bit store)
+//*
+//* @param[out] pu4_sadmin
+//* Pointer to the variable in which minimum sad is returned (32-bit store)
+//*
+//* @param[in] u4_valid_intra_modes
+//* Says what all modes are valid
+//*
+//*
+//* @return none
+//*
+//******************************************************************************
+//*/
+//
+//void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
+//                                      UWORD8 *pu1_ngbr_pels_i16,
+//                                      UWORD8 *pu1_dst,
+//                                      UWORD32 src_strd,
+//                                      UWORD32 dst_strd,
+//                                      WORD32 u4_n_avblty,
+//                                      UWORD32 *u4_intra_mode,
+//                                      WORD32 *pu4_sadmin,
+//                                      UWORD32 u4_valid_intra_modes)
+//
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+.globl ih264e_evaluate_intra16x16_modes_av8
+
+ih264e_evaluate_intra16x16_modes_av8:
+
+//x0 = pu1_src,
+//x1 = pu1_ngbr_pels_i16,
+//x2 = pu1_dst,
+//x3 = src_strd,
+//x4 = dst_strd,
+//x5 = u4_n_avblty,
+//x6 = u4_intra_mode,
+//x7 = pu4_sadmin
+
+
+
+    // STMFD sp!, {x4-x12, x14}                //store register values to stack
+    push_v_regs
+    stp       x19, x20, [sp, #-16]!
+
+    ldr       x16, [sp, #80]            // u4_valid_intra_modes; assumes push_v_regs stores 64 bytes (+16 for x19/x20) - only bits 0-2 are used
+    mov       w17, w4                   // keep dst_strd, zero-extended (upper half of a 32-bit arg is unspecified)
+    mov       w3, w3                    // zero-extend src_strd before it is used as a 64-bit post-index
+    mov       x14, x6
+    mov       x15, x7
+
+
+    sub       v0.16b, v0.16b, v0.16b
+    sub       v1.16b, v1.16b, v1.16b
+    mov       w10, #0
+    mov       w11 , #3
+
+    ands      x6, x5, #0x01
+    beq       top_available             //LEFT NOT AVAILABLE
+    ld1       {v0.16b}, [x1]
+    add       w10, w10, #8
+    add       w11, w11, #1
+top_available:
+    ands      x6, x5, #0x04
+    beq       none_available
+    add       x6, x1, #17
+    ld1       {v1.16b}, [x6]
+    add       w10, w10, #8
+    add       w11, w11, #1
+    b         summation
+none_available:
+    cmp       w5, #0                    // u4_n_avblty is a 32-bit argument
+    bne       summation
+    mov       w6, #128
+    dup       v30.16b, w6
+    dup       v31.16b, w6
+    b         sad_comp
+summation:
+    uaddl     v2.8h, v0.8b, v1.8b
+    uaddl2    v3.8h, v0.16b, v1.16b
+    dup       v10.8h, w10
+    neg       w11, w11
+    dup       v20.8h, w11
+    add       v0.8h, v2.8h, v3.8h
+    mov       v1.d[0], v0.d[1]
+    add       v0.4h, v0.4h, v1.4h
+    addp      v0.4h, v0.4h , v0.4h
+    addp      v0.4h, v0.4h , v0.4h
+    add       v0.4h, v0.4h, v10.4h
+    uqshl     v0.8h, v0.8h, v20.8h
+    sqxtun    v0.8b, v0.8h
+
+    dup       v30.16b, v0.b[0]
+    dup       v31.16b, v0.b[0]
+
+
+sad_comp:
+    ld1       { v0.2s, v1.2s }, [x0], x3 // source row 0
+
+    ld1       { v2.2s, v3.2s}, [x0], x3 //row 1
+
+    ld1       { v4.2s, v5.2s}, [x0], x3 //row 2
+
+    ld1       { v6.2s, v7.2s}, [x0], x3 //row 3
+
+    //---------------------
+
+    //values for vertical prediction
+    add       x6, x1, #17
+    ld1       {v10.8b}, [x6], #8
+    ld1       {v11.8b}, [x6], #8
+    ld1       {v9.16b}, [x1]
+
+
+
+    dup       v20.8b, v9.b[15]          ///HORIZONTAL VALUE ROW=0//
+    dup       v21.8b, v9.b[15]          ///HORIZONTAL VALUE ROW=0//
+
+
+///* computing SADs for all three modes*/
+    ///vertical row 0@
+    uabdl     v16.8h, v0.8b, v10.8b
+    uabdl     v18.8h, v1.8b, v11.8b
+
+    ///HORZ row 0@
+    uabdl     v26.8h, v0.8b, v20.8b
+    uabdl     v28.8h, v1.8b, v21.8b
+
+    ///dc row 0@
+    uabdl     v22.8h, v0.8b, v30.8b
+    uabdl     v24.8h, v1.8b, v31.8b
+
+
+
+
+
+    dup       v20.8b, v9.b[14]          ///HORIZONTAL VALUE ROW=1//
+    dup       v21.8b, v9.b[14]
+
+
+    ///vertical row 1@
+    uabal     v16.8h, v2.8b, v10.8b
+    uabal     v18.8h, v3.8b, v11.8b
+
+    ld1       { v0.2s, v1.2s }, [x0], x3 //row 4
+    ///HORZ row 1@
+    uabal     v26.8h, v2.8b, v20.8b
+    uabal     v28.8h, v3.8b, v21.8b
+
+    ///dc row 1@
+    uabal     v22.8h, v2.8b, v30.8b
+    uabal     v24.8h, v3.8b, v31.8b
+
+    dup       v20.8b, v9.b[13]          ///HORIZONTAL VALUE ROW=2//
+    dup       v21.8b, v9.b[13]
+
+    ///vertical row 2@
+    uabal     v16.8h, v4.8b, v10.8b
+    uabal     v18.8h, v5.8b, v11.8b
+
+    ld1       { v2.2s, v3.2s}, [x0], x3 //row 5
+    ///HORZ row 2@
+    uabal     v26.8h, v4.8b, v20.8b
+    uabal     v28.8h, v5.8b, v21.8b
+
+    ///dc row 2@
+    uabal     v22.8h, v4.8b, v30.8b
+    uabal     v24.8h, v5.8b, v31.8b
+
+    dup       v20.8b, v9.b[12]          ///HORIZONTAL VALUE ROW=3//
+    dup       v21.8b, v9.b[12]
+
+    ///vertical row 3@
+    uabal     v16.8h, v6.8b, v10.8b
+    uabal     v18.8h, v7.8b, v11.8b
+
+    ld1       { v4.2s, v5.2s}, [x0], x3 //row 6
+    ///HORZ row 3@
+    uabal     v26.8h, v6.8b, v20.8b
+    uabal     v28.8h, v7.8b, v21.8b
+
+    ///dc row 3@
+    uabal     v22.8h, v6.8b, v30.8b
+    uabal     v24.8h, v7.8b, v31.8b
+//----------------------------------------------------------------------------------------------
+
+    dup       v20.8b, v9.b[11]          ///HORIZONTAL VALUE ROW=4//
+    dup       v21.8b, v9.b[11]
+
+    ///vertical row 4@
+    uabal     v16.8h, v0.8b, v10.8b
+    uabal     v18.8h, v1.8b, v11.8b
+
+    ld1       { v6.2s, v7.2s}, [x0], x3 //row 7
+    ///HORZ row 4@
+    uabal     v26.8h, v0.8b, v20.8b
+    uabal     v28.8h, v1.8b, v21.8b
+
+    ///dc row 4@
+    uabal     v22.8h, v0.8b, v30.8b
+    uabal     v24.8h, v1.8b, v31.8b
+
+    dup       v20.8b, v9.b[10]          ///HORIZONTAL VALUE ROW=5//
+    dup       v21.8b, v9.b[10]
+
+    ///vertical row 5@
+    uabal     v16.8h, v2.8b, v10.8b
+    uabal     v18.8h, v3.8b, v11.8b
+
+    ld1       { v0.2s, v1.2s }, [x0], x3 //row 8
+    ///HORZ row 5@
+    uabal     v26.8h, v2.8b, v20.8b
+    uabal     v28.8h, v3.8b, v21.8b
+
+    ///dc row 5@
+    uabal     v22.8h, v2.8b, v30.8b
+    uabal     v24.8h, v3.8b, v31.8b
+
+    dup       v20.8b, v9.b[9]           ///HORIZONTAL VALUE ROW=6//
+    dup       v21.8b, v9.b[9]
+
+    ///vertical row 6@
+    uabal     v16.8h, v4.8b, v10.8b
+    uabal     v18.8h, v5.8b, v11.8b
+
+    ld1       { v2.2s, v3.2s}, [x0], x3 //row 9
+
+    ///HORZ row 6@
+    uabal     v26.8h, v4.8b, v20.8b
+    uabal     v28.8h, v5.8b, v21.8b
+
+    ///dc row 6@
+    uabal     v22.8h, v4.8b, v30.8b
+    uabal     v24.8h, v5.8b, v31.8b
+
+    dup       v20.8b, v9.b[8]           ///HORIZONTAL VALUE ROW=7//
+    dup       v21.8b, v9.b[8]
+
+    ///vertical row 7@
+    uabal     v16.8h, v6.8b, v10.8b
+    uabal     v18.8h, v7.8b, v11.8b
+
+    ld1       { v4.2s, v5.2s}, [x0], x3 //row 10
+
+    ///HORZ row 7@
+    uabal     v26.8h, v6.8b, v20.8b
+    uabal     v28.8h, v7.8b, v21.8b
+
+    ///dc row 7@
+    uabal     v22.8h, v6.8b, v30.8b
+    uabal     v24.8h, v7.8b, v31.8b
+
+
+//-------------------------------------------
+
+    dup       v20.8b, v9.b[7]           ///HORIZONTAL VALUE ROW=8//
+    dup       v21.8b, v9.b[7]
+
+    ///vertical row 8@
+    uabal     v16.8h, v0.8b, v10.8b
+    uabal     v18.8h, v1.8b, v11.8b
+
+    ld1       { v6.2s, v7.2s}, [x0], x3 //row11
+
+    ///HORZ row 8@
+    uabal     v26.8h, v0.8b, v20.8b
+    uabal     v28.8h, v1.8b, v21.8b
+
+    ///dc row 8@
+    uabal     v22.8h, v0.8b, v30.8b
+    uabal     v24.8h, v1.8b, v31.8b
+
+    dup       v20.8b, v9.b[6]           ///HORIZONTAL VALUE ROW=9//
+    dup       v21.8b, v9.b[6]
+
+    ///vertical row 9@
+    uabal     v16.8h, v2.8b, v10.8b
+    uabal     v18.8h, v3.8b, v11.8b
+
+    ld1       { v0.2s, v1.2s }, [x0], x3 //row12
+
+    ///HORZ row 9@
+    uabal     v26.8h, v2.8b, v20.8b
+    uabal     v28.8h, v3.8b, v21.8b
+
+    ///dc row 9@
+    uabal     v22.8h, v2.8b, v30.8b
+    uabal     v24.8h, v3.8b, v31.8b
+
+    dup       v20.8b, v9.b[5]           ///HORIZONTAL VALUE ROW=10//
+    dup       v21.8b, v9.b[5]
+
+    ///vertical row 10@
+    uabal     v16.8h, v4.8b, v10.8b
+    uabal     v18.8h, v5.8b, v11.8b
+
+    ld1       { v2.2s, v3.2s}, [x0], x3 //row13
+
+    ///HORZ row 10@
+    uabal     v26.8h, v4.8b, v20.8b
+    uabal     v28.8h, v5.8b, v21.8b
+
+    ///dc row 10@
+    uabal     v22.8h, v4.8b, v30.8b
+    uabal     v24.8h, v5.8b, v31.8b
+
+    dup       v20.8b, v9.b[4]           ///HORIZONTAL VALUE ROW=11//
+    dup       v21.8b, v9.b[4]
+
+    ///vertical row 11@
+    uabal     v16.8h, v6.8b, v10.8b
+    uabal     v18.8h, v7.8b, v11.8b
+
+    ld1       { v4.2s, v5.2s}, [x0], x3 //row14
+
+    ///HORZ row 11@
+    uabal     v26.8h, v6.8b, v20.8b
+    uabal     v28.8h, v7.8b, v21.8b
+
+    ///dc row 11@
+    uabal     v22.8h, v6.8b, v30.8b
+    uabal     v24.8h, v7.8b, v31.8b
+    //-----------------------------------------------------------------
+
+    dup       v20.8b, v9.b[3]           ///HORIZONTAL VALUE ROW=12//
+    dup       v21.8b, v9.b[3]
+
+    ///vertical row 12@
+    uabal     v16.8h, v0.8b, v10.8b
+    uabal     v18.8h, v1.8b, v11.8b
+
+    ld1       { v6.2s, v7.2s}, [x0], x3 //row15
+
+    ///HORZ row 12@
+    uabal     v26.8h, v0.8b, v20.8b
+    uabal     v28.8h, v1.8b, v21.8b
+
+    ///dc row 12@
+    uabal     v22.8h, v0.8b, v30.8b
+    uabal     v24.8h, v1.8b, v31.8b
+
+    dup       v20.8b, v9.b[2]           ///HORIZONTAL VALUE ROW=13//
+    dup       v21.8b, v9.b[2]
+
+    ///vertical row 13@
+    uabal     v16.8h, v2.8b, v10.8b
+    uabal     v18.8h, v3.8b, v11.8b
+
+    ///HORZ row 13@
+    uabal     v26.8h, v2.8b, v20.8b
+    uabal     v28.8h, v3.8b, v21.8b
+
+    ///dc row 13@
+    uabal     v22.8h, v2.8b, v30.8b
+    uabal     v24.8h, v3.8b, v31.8b
+
+    dup       v20.8b, v9.b[1]           ///HORIZONTAL VALUE ROW=14//
+    dup       v21.8b, v9.b[1]
+
+    ///vertical row 14@
+    uabal     v16.8h, v4.8b, v10.8b
+    uabal     v18.8h, v5.8b, v11.8b
+
+    ///HORZ row 14@
+    uabal     v26.8h, v4.8b, v20.8b
+    uabal     v28.8h, v5.8b, v21.8b
+
+    ///dc row 14@
+    uabal     v22.8h, v4.8b, v30.8b
+    uabal     v24.8h, v5.8b, v31.8b
+
+    dup       v20.8b, v9.b[0]           ///HORIZONTAL VALUE ROW=15//
+    dup       v21.8b, v9.b[0]
+
+    ///vertical row 15@
+    uabal     v16.8h, v6.8b, v10.8b
+    uabal     v18.8h, v7.8b, v11.8b
+
+    ///HORZ row 15@
+    uabal     v26.8h, v6.8b, v20.8b
+    uabal     v28.8h, v7.8b, v21.8b
+
+    ///dc row 15@
+    uabal     v22.8h, v6.8b, v30.8b
+    uabal     v24.8h, v7.8b, v31.8b
+    //------------------------------------------------------------------------------
+
+
+    //vert sum
+
+    add       v16.8h, v16.8h , v18.8h
+    mov       v18.d[0], v16.d[1]
+    add       v16.4h, v16.4h , v18.4h
+    uaddlp    v16.2s, v16.4h
+    addp      v16.2s, v16.2s, v16.2s
+    smov      x8, v16.s[0]              //vertical sad
+
+
+    //horz sum
+
+    add       v26.8h, v26.8h , v28.8h
+    mov       v28.d[0], v26.d[1]
+    add       v26.4h, v26.4h , v28.4h
+    uaddlp    v26.2s, v26.4h
+    addp      v26.2s, v26.2s, v26.2s
+    smov      x9, v26.s[0]              //horizontal sad
+
+    //dc sum
+
+    add       v24.8h, v22.8h , v24.8h   ///DC
+    mov       v25.d[0], v24.d[1]
+    add       v24.4h, v24.4h , v25.4h   ///DC
+    uaddlp    v24.2s, v24.4h            ///DC
+    addp      v24.2s, v24.2s, v24.2s    ///DC
+    smov      x10, v24.s[0]             //dc sad
+
+
+    //-----------------------
+    mov       x11, #1
+    lsl       x11, x11, #30             // 1<<30: sentinel sad for a mode that is not valid
+
+    mov       x0, x16
+    //--------------------------------------------
+    ands      x7, x0, #01               // vert mode valid?
+    csel      x8, x11, x8, eq
+
+
+    ands      x6, x0, #02               // horz mode valid?
+    csel      x9, x11, x9, eq
+
+    ands      x6, x0, #04               // dc mode valid?
+    csel      x10, x11, x10, eq
+
+
+
+
+//--------------------------------
+
+    mov       x4, x17
+    mov       x7, x15
+    mov       x6, x14
+
+    //---------------------------
+
+    //--------------------------
+
+    cmp       x8, x9
+    bgt       not_vert
+    cmp       x8, x10
+    bgt       do_dc
+
+    ///----------------------
+    //DO VERTICAL PREDICTION
+    str       w8 , [x7]                 //MIN SAD (32-bit store: pu4_sadmin points to a WORD32)
+    mov       w8, #0
+    str       w8 , [x6]                 // MODE (32-bit store: u4_intra_mode points to a UWORD32)
+    add       x6, x1, #17
+    ld1       {v30.16b}, [x6]
+    b         do_dc_vert
+    //-----------------------------
+not_vert: cmp x9, x10
+    bgt       do_dc
+
+    ///----------------------
+    //DO HORIZONTAL
+    str       w9 , [x7]                 //MIN SAD (32-bit store)
+    mov       w9, #1
+    str       w9 , [x6]                 // MODE (32-bit store)
+
+    ld1       {v0.16b}, [x1]
+    dup       v10.16b, v0.b[15]
+    dup       v11.16b, v0.b[14]
+    dup       v12.16b, v0.b[13]
+    dup       v13.16b, v0.b[12]
+    st1       {v10.16b}, [x2], x4
+    dup       v14.16b, v0.b[11]
+    st1       {v11.16b}, [x2], x4
+    dup       v15.16b, v0.b[10]
+    st1       {v12.16b}, [x2], x4
+    dup       v16.16b, v0.b[9]
+    st1       {v13.16b}, [x2], x4
+    dup       v17.16b, v0.b[8]
+    st1       {v14.16b}, [x2], x4
+    dup       v18.16b, v0.b[7]
+    st1       {v15.16b}, [x2], x4
+    dup       v19.16b, v0.b[6]
+    st1       {v16.16b}, [x2], x4
+    dup       v20.16b, v0.b[5]
+    st1       {v17.16b}, [x2], x4
+    dup       v21.16b, v0.b[4]
+    st1       {v18.16b}, [x2], x4
+    dup       v22.16b, v0.b[3]
+    st1       {v19.16b}, [x2], x4
+    dup       v23.16b, v0.b[2]
+    st1       {v20.16b}, [x2], x4
+    dup       v24.16b, v0.b[1]
+    st1       {v21.16b}, [x2], x4
+    dup       v25.16b, v0.b[0]
+    st1       {v22.16b}, [x2], x4
+    st1       {v23.16b}, [x2], x4
+    st1       {v24.16b}, [x2], x4
+    st1       {v25.16b}, [x2], x4
+
+
+
+    b         end_func
+
+
+    ///-----------------------------
+
+do_dc: ///---------------------------------
+    //DO DC
+    str       w10 , [x7]                //MIN SAD (32-bit store)
+    mov       w10, #2
+    str       w10 , [x6]                // MODE (32-bit store)
+do_dc_vert:
+    st1       {v30.4s}, [x2], x4        //0
+    st1       {v30.4s}, [x2], x4        //1
+    st1       {v30.4s}, [x2], x4        //2
+    st1       {v30.4s}, [x2], x4        //3
+    st1       {v30.4s}, [x2], x4        //4
+    st1       {v30.4s}, [x2], x4        //5
+    st1       {v30.4s}, [x2], x4        //6
+    st1       {v30.4s}, [x2], x4        //7
+    st1       {v30.4s}, [x2], x4        //8
+    st1       {v30.4s}, [x2], x4        //9
+    st1       {v30.4s}, [x2], x4        //10
+    st1       {v30.4s}, [x2], x4        //11
+    st1       {v30.4s}, [x2], x4        //12
+    st1       {v30.4s}, [x2], x4        //13
+    st1       {v30.4s}, [x2], x4        //14
+    st1       {v30.4s}, [x2], x4        //15
+    ///------------------
+end_func:
+    // LDMFD sp!,{x4-x12,PC}                //Restoring registers from stack
+    ldp       x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
diff --git a/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s
new file mode 100755
index 0000000..b02afd1
--- /dev/null
+++ b/encoder/armv8/ih264e_evaluate_intra_chroma_modes_av8.s
@@ -0,0 +1,467 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+
+///**
+//******************************************************************************
+//*
+//* @brief :Evaluate best intr chroma mode (among VERT, HORZ and DC )
+//* and do the prediction.
+//*
+//* @par Description
+//* This function evaluates first three intra chroma modes and compute corresponding sad
+//* and return the buffer predicted with best mode.
+//*
+//* @param[in] pu1_src
+//* UWORD8 pointer to the source
+//*
+//* @param[in] pu1_ngbr_pels
+//* UWORD8 pointer to neighbouring pels
+//*
+//* @param[out] pu1_dst
+//* UWORD8 pointer to the destination
+//*
+//* @param[in] src_strd
+//* integer source stride
+//*
+//* @param[in] dst_strd
+//* integer destination stride
+//*
+//* @param[in] u4_n_avblty
+//* availability of neighbouring pixels
+//*
+//* @param[out] u4_intra_mode
+//* Pointer to the variable in which best mode is returned (32-bit store)
+//*
+//* @param[out] pu4_sadmin
+//* Pointer to the variable in which minimum sad is returned (32-bit store)
+//*
+//* @param[in] u4_valid_intra_modes
+//* Says what all modes are valid
+//*
+//*
+//* @return none
+//*
+//******************************************************************************
+//*/
+//
+//void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
+//                                        UWORD8 *pu1_ngbr_pels_i16,
+//                                        UWORD8 *pu1_dst,
+//                                        UWORD32 src_strd,
+//                                        UWORD32 dst_strd,
+//                                        WORD32 u4_n_avblty,
+//                                        UWORD32 *u4_intra_mode,
+//                                        WORD32 *pu4_sadmin,
+//                                        UWORD32 u4_valid_intra_modes)
+//
+.text
+.p2align 2
+.include "ih264_neon_macros.s"
+
+.global ih264e_evaluate_intra_chroma_modes_av8
+
+ih264e_evaluate_intra_chroma_modes_av8:
+
+//x0 = pu1_src,
+//x1 = pu1_ngbr_pels_i16,
+//x2 = pu1_dst,
+//x3 = src_strd,
+//x4 = dst_strd,
+//x5 = u4_n_avblty,
+//x6 = u4_intra_mode,
+//x7 = pu4_sadmin
+
+
+
+    // STMFD sp!, {x4-x12, x14}                //store register values to stack
+    push_v_regs
+    stp       x19, x20, [sp, #-16]!
+    //-----------------------
+    ldr       x16, [sp, #80]            // u4_valid_intra_modes; assumes push_v_regs stores 64 bytes (+16 for x19/x20) - only bits 0-2 are used
+    mov       w17, w4                   // keep dst_strd, zero-extended (upper half of a 32-bit arg is unspecified)
+    mov       w3, w3                    // zero-extend src_strd before it is used as a 64-bit post-index
+    mov       x14, x6
+    mov       x15, x7
+
+    mov       x19, #5                   // mask: bit 0 = left, bit 2 = top
+    ands      x6, x5, x19
+    beq       none_available
+    cmp       x6, #1
+    beq       left_only_available
+    cmp       x6, #4
+    beq       top_only_available
+
+all_available:
+    ld1       {v0.8b, v1.8b}, [x1]
+    add       x6, x1, #18
+    ld1       {v2.8b, v3.8b}, [x6]
+    uxtl      v0.8h, v0.8b
+    uxtl      v1.8h, v1.8b
+    addp      v0.4s, v0.4s , v0.4s
+    addp      v1.4s, v1.4s , v1.4s
+    addp      v0.4s, v0.4s , v0.4s
+    addp      v1.4s, v1.4s , v1.4s
+    uxtl      v2.8h, v2.8b
+    uxtl      v3.8h, v3.8b
+    addp      v2.4s, v2.4s , v2.4s
+    addp      v3.4s, v3.4s , v3.4s
+    addp      v2.4s, v2.4s , v2.4s
+    addp      v3.4s, v3.4s , v3.4s
+    rshrn     v5.8b, v0.8h, #2
+    dup       v21.8h, v5.h[0]
+    rshrn     v6.8b, v3.8h, #2
+    dup       v20.8h, v6.h[0]
+    add       v1.8h, v1.8h, v2.8h
+    rshrn     v1.8b, v1.8h, #3
+    dup       v23.8h, v1.h[0]
+    mov       v20.d[0], v23.d[0]
+    add       v0.8h, v0.8h, v3.8h
+    rshrn     v0.8b, v0.8h, #3
+    dup       v23.8h, v0.h[0]
+    mov       v31.d[0], v23.d[0]
+    mov       v28.d[0], v20.d[0]
+    mov       v29.d[0], v20.d[1]
+    mov       v30.d[0], v21.d[0]
+    b         sad_comp
+
+left_only_available:
+    ld1       {v0.8b, v1.8b}, [x1]
+    uxtl      v0.8h, v0.8b
+    uxtl      v1.8h, v1.8b
+    addp      v0.4s, v0.4s , v0.4s
+    addp      v1.4s, v1.4s , v1.4s
+    addp      v0.4s, v0.4s , v0.4s
+    addp      v1.4s, v1.4s , v1.4s
+    rshrn     v0.8b, v0.8h, #2
+    rshrn     v1.8b, v1.8h, #2
+
+    dup       v28.8h , v1.h[0]
+    dup       v29.8h , v1.h[0]
+    dup       v30.8h, v0.h[0]
+    dup       v31.8h, v0.h[0]
+    b         sad_comp
+
+top_only_available:
+    add       x6, x1, #18
+    ld1       {v0.8b, v1.8b}, [x6]
+    uxtl      v0.8h, v0.8b
+    uxtl      v1.8h, v1.8b
+    addp      v0.4s, v0.4s , v0.4s
+    addp      v1.4s, v1.4s , v1.4s
+    addp      v0.4s, v0.4s , v0.4s
+    addp      v1.4s, v1.4s , v1.4s
+    rshrn     v0.8b, v0.8h, #2
+    rshrn     v1.8b, v1.8h, #2
+    dup       v28.8h , v0.h[0]
+    dup       v30.8h, v1.h[0]
+    mov       v29.d[0], v30.d[1]
+    mov       v30.d[0], v28.d[0]
+    mov       v31.d[0], v30.d[1]
+    b         sad_comp
+none_available:
+    mov       w20, #128
+    dup       v28.16b, w20
+    dup       v29.16b, w20
+    dup       v30.16b, w20
+    dup       v31.16b, w20
+
+
+
+sad_comp:
+    add       x6, x1, #18
+    ld1       {v10.8b, v11.8b}, [x6]    // vertical values
+
+    ld1       {v27.8h}, [x1]
+
+    dup       v20.8h, v27.h[7]          ///HORIZONTAL VALUE ROW=0//
+    dup       v21.8h, v27.h[7]
+
+    ld1       { v0.8b, v1.8b}, [x0], x3
+
+
+    ///vertical row 0@
+    uabdl     v16.8h, v0.8b, v10.8b
+    uabdl     v18.8h, v1.8b, v11.8b
+
+    ///HORZ row 0@
+    uabdl     v26.8h, v0.8b, v20.8b
+    uabdl     v14.8h, v1.8b, v21.8b
+
+    ld1       {v2.8b, v3.8b}, [x0], x3
+
+
+
+    ///dc row 0@
+    uabdl     v22.8h, v0.8b, v28.8b
+    uabdl     v24.8h, v1.8b, v29.8b
+
+
+    dup       v20.8h, v27.h[6]
+    dup       v21.8h, v27.h[6]          ///HORIZONTAL VALUE ROW=1//
+
+    ///vertical row 1@
+    uabal     v16.8h, v2.8b, v10.8b
+    uabal     v18.8h, v3.8b, v11.8b
+
+    ld1       { v4.8b, v5.8b}, [x0], x3
+
+    ///HORZ row 1@
+    uabal     v26.8h, v2.8b, v20.8b
+    uabal     v14.8h, v3.8b, v21.8b
+
+    ///dc row 1@
+    uabal     v22.8h, v2.8b, v28.8b
+    uabal     v24.8h, v3.8b, v29.8b
+
+    dup       v20.8h, v27.h[5]
+    dup       v21.8h, v27.h[5]          ///HORIZONTAL VALUE ROW=2//
+
+    ///vertical row 2@
+    uabal     v16.8h, v4.8b, v10.8b
+    uabal     v18.8h, v5.8b, v11.8b
+
+    ld1       { v6.8b, v7.8b}, [x0], x3
+    ///HORZ row 2@
+    uabal     v26.8h, v4.8b, v20.8b
+    uabal     v14.8h, v5.8b, v21.8b
+
+    ///dc row 2@
+    uabal     v22.8h, v4.8b, v28.8b
+    uabal     v24.8h, v5.8b, v29.8b
+
+    dup       v20.8h, v27.h[4]
+    dup       v21.8h, v27.h[4]          ///HORIZONTAL VALUE ROW=3//
+
+    ///vertical row 3@
+    uabal     v16.8h, v6.8b, v10.8b
+    uabal     v18.8h, v7.8b, v11.8b
+
+    ///HORZ row 3@
+    uabal     v26.8h, v6.8b, v20.8b
+    uabal     v14.8h, v7.8b, v21.8b
+
+    ///dc row 3@
+    uabal     v22.8h, v6.8b, v28.8b
+    uabal     v24.8h, v7.8b, v29.8b
+
+    //----------------------------------------------------------------------------------------------
+    ld1       { v0.8b, v1.8b}, [x0], x3
+
+
+    dup       v20.8h, v27.h[3]
+    dup       v21.8h, v27.h[3]          ///HORIZONTAL VALUE ROW=4//
+
+    ///vertical row 4@
+    uabal     v16.8h, v0.8b, v10.8b
+    uabal     v18.8h, v1.8b, v11.8b
+
+    ///HORZ row 4@
+    uabal     v26.8h, v0.8b, v20.8b
+    uabal     v14.8h, v1.8b, v21.8b
+
+    ld1       { v2.8b, v3.8b}, [x0], x3
+
+    ///dc row 4@
+    uabal     v22.8h, v0.8b, v30.8b
+    uabal     v24.8h, v1.8b, v31.8b
+
+    dup       v20.8h, v27.h[2]
+    dup       v21.8h, v27.h[2]          ///HORIZONTAL VALUE ROW=5//
+
+    ///vertical row 5@
+    uabal     v16.8h, v2.8b, v10.8b
+    uabal     v18.8h, v3.8b, v11.8b
+
+    ///HORZ row 5@
+    uabal     v26.8h, v2.8b, v20.8b
+    uabal     v14.8h, v3.8b, v21.8b
+
+    ld1       { v4.8b, v5.8b}, [x0], x3
+
+    ///dc row 5@
+    uabal     v22.8h, v2.8b, v30.8b
+    uabal     v24.8h, v3.8b, v31.8b
+
+    dup       v20.8h, v27.h[1]
+    dup       v21.8h, v27.h[1]          ///HORIZONTAL VALUE ROW=6//
+
+    ///vertical row 6@
+    uabal     v16.8h, v4.8b, v10.8b
+    uabal     v18.8h, v5.8b, v11.8b
+
+    ///HORZ row 6@
+    uabal     v26.8h, v4.8b, v20.8b
+    uabal     v14.8h, v5.8b, v21.8b
+
+    ld1       {v6.8b, v7.8b}, [x0], x3
+
+    ///dc row 6@
+    uabal     v22.8h, v4.8b, v30.8b
+    uabal     v24.8h, v5.8b, v31.8b
+
+    dup       v20.8h, v27.h[0]
+    dup       v21.8h, v27.h[0]          ///HORIZONTAL VALUE ROW=7//
+
+    ///vertical row 7@
+    uabal     v16.8h, v6.8b, v10.8b
+    uabal     v18.8h, v7.8b, v11.8b
+
+    ///HORZ row 7@
+    uabal     v26.8h, v6.8b, v20.8b
+    uabal     v14.8h, v7.8b, v21.8b
+
+    ///dc row 7@
+    uabal     v22.8h, v6.8b, v30.8b
+    uabal     v24.8h, v7.8b, v31.8b
+
+
+//-------------------------------------------
+
+
+//vert sum
+
+    add       v16.8h, v16.8h , v18.8h
+    mov       v18.d[0], v16.d[1]
+    add       v16.4h, v16.4h , v18.4h
+    uaddlp    v16.2s, v16.4h
+    addp      v16.2s, v16.2s, v16.2s
+    smov      x8, v16.s[0]              //vertical sad
+
+
+    //horz sum
+
+    add       v26.8h, v26.8h , v14.8h
+    mov       v14.d[0], v26.d[1]
+    add       v26.4h, v26.4h , v14.4h
+    uaddlp    v26.2s, v26.4h
+    addp      v26.2s, v26.2s, v26.2s
+    smov      x9, v26.s[0]              //horizontal sad
+
+    //dc sum
+
+    add       v24.8h, v22.8h , v24.8h   ///DC
+    mov       v25.d[0], v24.d[1]
+    add       v24.4h, v24.4h , v25.4h   ///DC
+    uaddlp    v24.2s, v24.4h            ///DC
+    addp      v24.2s, v24.2s, v24.2s    ///DC
+    smov      x10, v24.s[0]             //dc sad
+
+
+
+
+    mov       x11, #1
+//-----------------------
+    mov       x0, x16                   // u4_valid_intra_modes
+
+//--------------------------------------------
+
+
+    lsl       x11, x11, #30             // 1<<30: sentinel sad for a mode that is not valid
+
+    ands      x7, x0, #04               // vert mode valid?
+    csel      x8, x11, x8, eq
+
+    ands      x6, x0, #02               // horz mode valid?
+    csel      x9, x11, x9, eq
+
+    ands      x6, x0, #01               // dc mode valid?
+    csel      x10, x11, x10, eq
+
+
+    //---------------------------
+
+    mov       x4, x17
+    mov       x6, x14
+    mov       x7, x15
+
+    //--------------------------
+
+    cmp       x10, x9
+    bgt       not_dc
+    cmp       x10, x8
+    bgt       do_vert
+
+    ///----------------------
+    //DO DC PREDICTION
+    str       w10 , [x7]                //MIN SAD (32-bit store: pu4_sadmin points to a WORD32)
+
+    mov       w10, #0
+    str       w10 , [x6]                // MODE (32-bit store: u4_intra_mode points to a UWORD32)
+
+    b         do_dc_vert
+    //-----------------------------
+
+not_dc:
+    cmp       x9, x8
+    bgt       do_vert
+    ///----------------------
+    //DO HORIZONTAL
+    str       w9 , [x7]                 //MIN SAD (32-bit store)
+
+    mov       w10, #1
+    str       w10 , [x6]                // MODE (32-bit store)
+    ld1       {v0.8h}, [x1]
+
+    dup       v10.8h, v0.h[7]
+    dup       v11.8h, v0.h[6]
+    dup       v12.8h, v0.h[5]
+    dup       v13.8h, v0.h[4]
+    st1       {v10.8h}, [x2], x4
+    dup       v14.8h, v0.h[3]
+    st1       {v11.8h}, [x2], x4
+    dup       v15.8h, v0.h[2]
+    st1       {v12.8h}, [x2], x4
+    dup       v16.8h, v0.h[1]
+    st1       {v13.8h}, [x2], x4
+    dup       v17.8h, v0.h[0]
+    st1       {v14.8h}, [x2], x4
+    st1       {v15.8h}, [x2], x4
+    st1       {v16.8h}, [x2], x4
+    st1       {v17.8h}, [x2], x4
+
+    b         end_func
+
+do_vert:
+    //DO VERTICAL PREDICTION
+    str       w8 , [x7]                 //MIN SAD (32-bit store)
+    mov       w8, #2
+    str       w8 , [x6]                 // MODE (32-bit store)
+    add       x6, x1, #18
+    ld1       {v28.8b, v29.8b}, [x6]    // vertical values
+    ld1       {v30.8b, v31.8b}, [x6]    // vertical values
+
+do_dc_vert:
+    st1       {v28.2s, v29.2s} , [x2], x4 //0
+    st1       {v28.2s, v29.2s} , [x2], x4 //1
+    st1       {v28.2s, v29.2s} , [x2], x4 //2
+    st1       {v28.2s, v29.2s} , [x2], x4 //3
+    st1       {v30.2s, v31.2s} , [x2], x4 //4
+    st1       {v30.2s, v31.2s} , [x2], x4 //5
+    st1       {v30.2s, v31.2s} , [x2], x4 //6
+    st1       {v30.2s, v31.2s} , [x2], x4 //7
+
+end_func:
+    // LDMFD sp!,{x4-x12,PC}                //Restoring registers from stack
+    ldp       x19, x20, [sp], #16
+    pop_v_regs
+    ret
+
+
diff --git a/encoder/armv8/ih264e_half_pel_av8.s b/encoder/armv8/ih264e_half_pel_av8.s
new file mode 100755
index 0000000..6dbd8f8
--- /dev/null
+++ b/encoder/armv8/ih264e_half_pel_av8.s
@@ -0,0 +1,1024 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the
"License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +// ******************************************************************************* +// * @file +// * ih264e_half_pel.s +// * +// * @brief +// * +// * +// * @author +// * Ittiam +// * +// * @par List of Functions: +// * ih264e_sixtapfilter_horz +// * ih264e_sixtap_filter_2dvh_vert +// +// * +// * @remarks +// * None +// * +// ******************************************************************************* +// */ + + +.text +.p2align 2 +.include "ih264_neon_macros.s" + +// /** +///******************************************************************************* +//* +//* @brief +//* Interprediction luma filter for horizontal input(Filter run for width = 17 and height =16) +//* +//* @par Description: +//* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +//* sec 8.4.2.2.1 titled "Luma sample interpolation process" +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst +//* UWORD8 pointer to the destination +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride +//* +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ +//void ih264e_sixtapfilter_horz(UWORD8 *pu1_src, +// UWORD8 
//                              *pu1_dst,
//                              WORD32 src_strd,
//                              WORD32 dst_strd);


.equ halfpel_width ,  17 + 1        //( make it even, two rows are processed at a time)


//------------------------------------------------------------------------------
// ih264e_sixtapfilter_horz_av8
//
// Horizontal 6-tap (1, -5, 20, 20, -5, 1) half-pel luma filter.
//
// ABI : AAPCS64
// In  : x0 = pu1_src   (source pointer; the routine backs it up by 2 bytes)
//       x1 = pu1_dst   (destination pointer)
//       w2 = src_strd  (WORD32, sign-extended to x2 on entry)
//       w3 = dst_strd  (WORD32, sign-extended to x3 on entry)
// Out : 16 rows are written, two rows per loop iteration. Each row is
//       18 bytes: one 16-byte store followed by one 2-byte store.
// Regs: x14       = remaining row count (16, 14, ... 2)
//       v0 / v1   = filter constants 5 / 20 in every byte lane
//       v2 - v4   = 24 source bytes of the even row
//       v5 - v7   = 24 source bytes of the odd row
//       v8,v10,v12  = 16-bit accumulators, even row (columns 1..3)
//       v14,v16,v18 = 16-bit accumulators, odd row  (columns 1..3)
//       v20 - v25 = rounded and saturated 8-bit results
//       v26 - v31 = shifted source windows produced by ext
//
// push_v_regs / pop_v_regs are macros that are not defined in this file;
// presumably they save and restore the callee-saved SIMD registers (v8-v15
// are written here) - TODO confirm against ih264_neon_macros.s.
// x19/x20 are saved and restored although this routine never touches them.
//------------------------------------------------------------------------------

    .global ih264e_sixtapfilter_horz_av8
ih264e_sixtapfilter_horz_av8:
    // STMFD sp!,{x14}
    push_v_regs
    stp       x19, x20, [sp, #-16]!

    // The strides arrive as 32-bit values in w2/w3. AAPCS64 leaves the upper
    // 32 bits of those registers unspecified, so widen them before they are
    // used as 64-bit post-index offsets below.
    sxtw      x2, w2
    sxtw      x3, w3

    movi      v0.8b, #5
    sub       x0, x0, #2                        // start two pixels to the left (tap a0)
    sub       x3, x3, #16                       // each row advances x1 by 16 first, then by x3
    movi      v1.8b, #20
    mov       x14, #16                          // row counter

filter_horz_loop:

    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   //// Load row0
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   //// Load row1

    //// Processing row0 and row1

    ext       v31.8b, v2.8b, v3.8b, #5
    ext       v30.8b, v3.8b, v4.8b, #5

    uaddl     v8.8h, v31.8b, v2.8b              //// a0 + a5 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #5
    uaddl     v10.8h, v30.8b, v3.8b             //// a0 + a5 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #5
    uaddl     v12.8h, v29.8b, v4.8b             //// a0 + a5 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #5
    uaddl     v14.8h, v28.8b, v5.8b             //// a0 + a5 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #5

    uaddl     v16.8h, v27.8b, v6.8b             //// a0 + a5 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #2
    uaddl     v18.8h, v26.8b, v7.8b             //// a0 + a5 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #2
    umlal     v8.8h, v31.8b, v1.8b              //// a0 + a5 + 20a2 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #2
    umlal     v10.8h, v30.8b, v1.8b             //// a0 + a5 + 20a2 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #2
    umlal     v12.8h, v29.8b, v1.8b             //// a0 + a5 + 20a2 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #2
    umlal     v14.8h, v28.8b, v1.8b             //// a0 + a5 + 20a2 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #2

    umlal     v16.8h, v27.8b, v1.8b             //// a0 + a5 + 20a2 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #3
    umlal     v18.8h, v26.8b, v1.8b             //// a0 + a5 + 20a2 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #3
    umlal     v8.8h, v31.8b, v1.8b              //// a0 + a5 + 20a2 + 20a3 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #3
    umlal     v10.8h, v30.8b, v1.8b             //// a0 + a5 + 20a2 + 20a3 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #3
    umlal     v12.8h, v29.8b, v1.8b             //// a0 + a5 + 20a2 + 20a3 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #3
    umlal     v14.8h, v28.8b, v1.8b             //// a0 + a5 + 20a2 + 20a3 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #3

    umlal     v16.8h, v27.8b, v1.8b             //// a0 + a5 + 20a2 + 20a3 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #1
    umlal     v18.8h, v26.8b, v1.8b             //// a0 + a5 + 20a2 + 20a3 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #1
    umlsl     v8.8h, v31.8b, v0.8b              //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #1
    umlsl     v10.8h, v30.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #1
    umlsl     v12.8h, v29.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #1
    umlsl     v14.8h, v28.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #1

    umlsl     v16.8h, v27.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row1)
    ext       v31.8b, v2.8b, v3.8b, #4
    umlsl     v18.8h, v26.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row1)
    ext       v30.8b, v3.8b, v4.8b, #4
    umlsl     v8.8h, v31.8b, v0.8b              //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0)
    ext       v29.8b, v4.8b, v4.8b, #4
    umlsl     v10.8h, v30.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0)
    ext       v28.8b, v5.8b, v6.8b, #4
    umlsl     v12.8h, v29.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0)
    ext       v27.8b, v6.8b, v7.8b, #4
    umlsl     v14.8h, v28.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row1)
    ext       v26.8b, v7.8b, v7.8b, #4

    umlsl     v16.8h, v27.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row1)
    umlsl     v18.8h, v26.8b, v0.8b             //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row1)

    // Round (+16), shift right by 5 and saturate each sum to unsigned 8 bits.
    sqrshrun  v20.8b, v8.8h, #5                 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0)
    sqrshrun  v21.8b, v10.8h, #5                //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0)
    sqrshrun  v22.8b, v12.8h, #5                //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0)
    sqrshrun  v23.8b, v14.8h, #5                //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row1)
    sqrshrun  v24.8b, v16.8h, #5                //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row1)
    sqrshrun  v25.8b, v18.8h, #5                //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row1)

    st1       {v20.8b, v21.8b}, [x1], #16       ////Store dest row0
    st1       {v22.h}[0], [x1], x3              // last 2 bytes of row0, then step to the next row
    st1       {v23.8b, v24.8b}, [x1], #16       ////Store dest row1
    st1       {v25.h}[0], [x1], x3              // last 2 bytes of row1, then step to the next row

    subs      x14, x14, #2                      // decrement counter

    bne       filter_horz_loop


    // LDMFD sp!,{pc}
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret









///**
//*******************************************************************************
//*
//* @brief
//* This function implements a two stage cascaded six tap filter. It
//* applies the six tap filter in the vertical direction on the
//* predictor values, followed by applying the same filter in the
//* horizontal direction on the output of the first stage. The six tap
//* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
//* interpolation process"
//* (Filter run for width = 17 and height =17)
//* @par Description:
//* The function interpolates
//* the predictors first in the vertical direction and then in the
//* horizontal direction to output the (1/2,1/2). The output of the first
//* stage of the filter is stored in the buffer pointed to by pi16_pred1(only in C)
//* in 16 bit precision.
+//* +//* +//* @param[in] pu1_src +//* UWORD8 pointer to the source +//* +//* @param[out] pu1_dst1 +//* UWORD8 pointer to the destination(vertical filtered output) +//* +//* @param[out] pu1_dst2 +//* UWORD8 pointer to the destination(out put after applying horizontal filter to the intermediate vertical output) +//* +//* @param[in] src_strd +//* integer source stride +//* +//* @param[in] dst_strd +//* integer destination stride of pu1_dst +//* +//* @param[in]pi16_pred1 +//* Pointer to 16bit intermediate buffer(used only in c) +//* +//* @param[in] pi16_pred1_strd +//* integer destination stride of pi16_pred1 +//* +//* +//* @returns +//* +//* @remarks +//* None +//* +//******************************************************************************* +//*/ +//void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src, +// UWORD8 *pu1_dst1, +// UWORD8 *pu1_dst2, +// WORD32 src_strd, +// WORD32 dst_strd, +// WORD32 *pi16_pred1,/* Pointer to 16bit intermmediate buffer (used only in c)*/ +// WORD32 pi16_pred1_strd) + + + + + .global ih264e_sixtap_filter_2dvh_vert_av8 + +ih264e_sixtap_filter_2dvh_vert_av8: + // STMFD sp!,{x10,x11,x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! 
+ +////x0 - pu1_ref +////x3 - u4_ref_width + + //// Load six rows for vertical interpolation + lsl x12, x3, #1 + sub x0, x0, x12 + sub x0, x0, #2 + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x3 + ld1 {v5.8b, v6.8b, v7.8b}, [x0], x3 + ld1 {v8.8b, v9.8b, v10.8b}, [x0], x3 + mov x12, #5 + ld1 {v11.8b, v12.8b, v13.8b}, [x0], x3 + mov x14, #20 + ld1 {v14.8b, v15.8b, v16.8b}, [x0], x3 + mov v0.4h[0], w12 + mov v0.4h[1], w14 + ld1 {v17.8b, v18.8b, v19.8b}, [x0], x3 + movi v1.8b, #20 + +//// x12 - u2_buff1_width +//// x14 - u2_buff2_width + mov x12, x4 + add x11, x1, #16 + + mov x14, x12 + + mov x10, #3 //loop counter + sub x16 , x12, #8 + sub x19, x14, #16 +filter_2dvh_loop: + + //// ////////////// ROW 1 /////////////////////// + +//// Process first vertical interpolated row +//// each column is + uaddl v20.8h, v2.8b, v17.8b //// a0 + a5 (column1,row0) + movi v31.8b, #5 + umlal v20.8h, v8.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) + umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) + umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + mov v21.d[0], v20.d[1] + + uaddl v22.8h, v3.8b, v18.8b //// a0 + a5 (column2,row0) + umlal v22.8h, v9.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) + umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) + umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + umlsl v22.8h, v15.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + ext v30.8b, v20.8b , v21.8b , #4 + mov v23.d[0], v22.d[1] + + + uaddl v24.8h, v4.8b, v19.8b //// a0 + a5 (column3,row0) + ext v29.8b, v20.8b , v21.8b , #6 + umlal v24.8h, v10.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) + umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) + umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 
(column3,row0) + mov v25.d[0], v24.d[1] + + sqrshrun v2.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + ext v31.8b, v21.8b , v22.8b , #2 + sqrshrun v3.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + ext v28.8b, v20.8b , v21.8b , #2 + + saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) + ext v31.8b, v22.8b , v23.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set1) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set1) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + smlsl v26.4s, v21.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + ext v30.8b, v21.8b , v22.8b , #4 + + sqrshrun v4.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + ext v29.8b, v21.8b , v22.8b , #6 + + ext v28.8b, v21.8b , v22.8b , #2 + saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) + smlal v20.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set2) + smlal v20.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set2) + smlsl v20.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + smlsl v20.4s, v22.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + ext v31.8b, v23.8b , v24.8b , #2 + mov v21.d[0], v20.d[1] + ext v2.8b, v2.8b , v3.8b , #2 + ext v3.8b, v3.8b , v4.8b , #2 + ext v4.8b, v4.8b , v4.8b , #2 + + st1 {v2.8b, v3.8b}, [x1], x12 //// store row1 - 1,1/2 grid + st1 {v4.h}[0], [x11], x12 //// store row1 - 1,1/2 grid + + ext v30.8b, v22.8b , v23.8b , #4 + ext v29.8b, v22.8b , v23.8b , #6 + + saddl v2.4s, v31.4h, v22.4h //// a0 + a5 (set3) + ext v28.8b, v22.8b , v23.8b , #2 + smlal v2.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set3) + smlal v2.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set3) + smlsl v2.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + smlsl v2.4s, v23.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + ext v31.8b, v24.8b , v25.8b , #2 + + shrn v21.4h, v20.4s, #8 //// shift by 8 and 
later we will shift by 2 more with rounding (set2) + ext v30.8b, v23.8b , v24.8b , #4 + shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) + ext v29.8b, v23.8b , v24.8b , #6 + + saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) + ext v28.8b, v23.8b , v24.8b , #2 + ext v31.8b, v25.8b , v25.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set4) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set4) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + smlsl v26.4s, v24.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + ext v30.8b, v24.8b , v25.8b , #4 + + saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) + ext v29.8b, v24.8b , v25.8b , #6 + + ext v31.8b, v24.8b , v25.8b , #2 + shrn v28.4h, v2.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) + + ld1 {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data + smlal v22.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set5) + smlal v22.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set5) + smlsl v22.4s, v31.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + smlsl v22.4s, v25.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) + mov v20.d[1], v21.d[0] + sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 + + + ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values + //// ////////////// ROW 2 /////////////////////// + +//// Process first vertical interpolated row +//// each column is + uaddl v20.8h, v5.8b, v2.8b //// a0 + a5 (column1,row0) + movi v31.8b, #5 + umlal v20.8h, v11.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) + umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 
(column1,row0) + umlsl v20.8h, v8.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + umlsl v20.8h, v17.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + mov v21.d[0], v20.d[1] + + mov v28.d[1], v29.d[0] + sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 + + shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) + + uaddl v22.8h, v6.8b, v3.8b //// a0 + a5 (column2,row0) + umlal v22.8h, v12.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) + umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) + umlsl v22.8h, v9.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + umlsl v22.8h, v18.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + mov v23.d[0], v22.d[1] + + sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 + ext v30.8b, v20.8b , v21.8b , #4 + + uaddl v24.8h, v7.8b, v4.8b //// a0 + a5 (column3,row0) + ext v29.8b, v20.8b , v21.8b , #6 + umlal v24.8h, v13.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) + umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) + umlsl v24.8h, v10.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + umlsl v24.8h, v19.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + mov v25.d[0], v24.d[1] + + st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values + st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values + + sqrshrun v5.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + ext v31.8b, v21.8b , v22.8b , #2 + sqrshrun v6.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + ext v28.8b, v20.8b , v21.8b , #2 + + saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) + ext v31.8b, v22.8b , v23.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set1) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set1) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + smlsl v26.4s, v21.4h, v0.4h[0] //// a0 
+ a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + ext v30.8b, v21.8b , v22.8b , #4 + + sqrshrun v7.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + ext v29.8b, v21.8b , v22.8b , #6 + + ext v28.8b, v21.8b , v22.8b , #2 + saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) + smlal v20.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set2) + smlal v20.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set2) + smlsl v20.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + smlsl v20.4s, v22.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + ext v31.8b, v23.8b , v24.8b , #2 + + ext v5.8b, v5.8b , v6.8b , #2 + ext v6.8b, v6.8b , v7.8b , #2 + ext v7.8b, v7.8b , v7.8b , #2 + + st1 {v5.8b, v6.8b}, [x1], x12 //// store row1 - 1,1/2 grid + st1 {v7.h}[0], [x11], x12 //// store row1 - 1,1/2 grid + + ext v30.8b, v22.8b , v23.8b , #4 + ext v29.8b, v22.8b , v23.8b , #6 + + saddl v6.4s, v31.4h, v22.4h //// a0 + a5 (set3) + ext v28.8b, v22.8b , v23.8b , #2 + smlal v6.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set3) + smlal v6.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set3) + smlsl v6.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + smlsl v6.4s, v23.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + ext v31.8b, v24.8b , v25.8b , #2 + + shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) + ext v30.8b, v23.8b , v24.8b , #4 + shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) + ext v29.8b, v23.8b , v24.8b , #6 + + saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) + ext v28.8b, v23.8b , v24.8b , #2 + ext v31.8b, v25.8b , v25.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set4) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set4) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + smlsl v26.4s, v24.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + ext v30.8b, v24.8b , v25.8b , #4 + 
+ saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) + ext v29.8b, v24.8b , v25.8b , #6 + + ext v31.8b, v24.8b , v25.8b , #2 + shrn v28.4h, v6.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) + + ld1 {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data + smlal v22.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set5) + smlal v22.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set5) + smlsl v22.4s, v31.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + smlsl v22.4s, v25.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) + mov v20.d[1], v21.d[0] + sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 + + + ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values + //// ////////////// ROW 3 /////////////////////// + +//// Process first vertical interpolated row +//// each column is + uaddl v20.8h, v8.8b, v5.8b //// a0 + a5 (column1,row0) + movi v31.8b, #5 + umlal v20.8h, v14.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) + umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) + umlsl v20.8h, v11.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + umlsl v20.8h, v2.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + mov v21.d[0], v20.d[1] + + mov v28.d[1], v29.d[0] + sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 + shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) + + uaddl v22.8h, v9.8b, v6.8b //// a0 + a5 (column2,row0) + umlal v22.8h, v15.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) + umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) + umlsl v22.8h, v12.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) 
+ umlsl v22.8h, v3.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + mov v23.d[0], v22.d[1] + + sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 + ext v30.8b, v20.8b , v21.8b , #4 + + uaddl v24.8h, v10.8b, v7.8b //// a0 + a5 (column3,row0) + ext v29.8b, v20.8b , v21.8b , #6 + umlal v24.8h, v16.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) + umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) + umlsl v24.8h, v13.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + umlsl v24.8h, v4.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + mov v25.d[0], v24.d[1] + + st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values + st1 { v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values + + sqrshrun v8.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + ext v31.8b, v21.8b , v22.8b , #2 + sqrshrun v9.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + ext v28.8b, v20.8b , v21.8b , #2 + + saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) + ext v31.8b, v22.8b , v23.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set1) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set1) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + smlsl v26.4s, v21.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + ext v30.8b, v21.8b , v22.8b , #4 + + sqrshrun v10.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + ext v29.8b, v21.8b , v22.8b , #6 + + ext v28.8b, v21.8b , v22.8b , #2 + saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) + smlal v20.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set2) + smlal v20.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set2) + smlsl v20.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + smlsl v20.4s, v22.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + ext v31.8b, v23.8b , v24.8b , #2 + + ext v8.8b, v8.8b , v9.8b , #2 + ext v9.8b, v9.8b 
, v10.8b , #2 + ext v10.8b, v10.8b , v10.8b , #2 + + st1 {v8.8b, v9.8b}, [x1], x12 //// store row1 - 1,1/2 grid + st1 {v10.h}[0], [x11], x12 //// store row1 - 1,1/2 grid + + ext v30.8b, v22.8b , v23.8b , #4 + ext v29.8b, v22.8b , v23.8b , #6 + + saddl v8.4s, v31.4h, v22.4h //// a0 + a5 (set3) + ext v28.8b, v22.8b , v23.8b , #2 + smlal v8.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set3) + smlal v8.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set3) + smlsl v8.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + smlsl v8.4s, v23.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + ext v31.8b, v24.8b , v25.8b , #2 + + shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) + ext v30.8b, v23.8b , v24.8b , #4 + shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) + ext v29.8b, v23.8b , v24.8b , #6 + + saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) + ext v28.8b, v23.8b , v24.8b , #2 + ext v31.8b, v25.8b , v25.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set4) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set4) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + smlsl v26.4s, v24.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + ext v30.8b, v24.8b , v25.8b , #4 + + saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) + ext v29.8b, v24.8b , v25.8b , #6 + + ext v31.8b, v24.8b , v25.8b , #2 + shrn v28.4h, v8.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) + + ld1 {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data + smlal v22.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set5) + smlal v22.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set5) + smlsl v22.4s, v31.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + smlsl v22.4s, v25.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding 
(set4) + mov v20.d[1], v21.d[0] + sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 + + + ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values + //// ////////////// ROW 4 /////////////////////// + +//// Process first vertical interpolated row +//// each column is + uaddl v20.8h, v11.8b, v8.8b //// a0 + a5 (column1,row0) + movi v31.8b, #5 + umlal v20.8h, v17.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) + umlal v20.8h, v2.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) + umlsl v20.8h, v14.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + umlsl v20.8h, v5.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + mov v21.d[0], v20.d[1] + mov v28.d[1], v29.d[0] + sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 + shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) + + uaddl v22.8h, v12.8b, v9.8b //// a0 + a5 (column2,row0) + umlal v22.8h, v18.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) + umlal v22.8h, v3.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) + umlsl v22.8h, v15.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + umlsl v22.8h, v6.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + mov v23.d[0], v22.d[1] + + sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 + ext v30.8b, v20.8b , v21.8b , #4 + + uaddl v24.8h, v13.8b, v10.8b //// a0 + a5 (column3,row0) + ext v29.8b, v20.8b , v21.8b , #6 + umlal v24.8h, v19.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) + umlal v24.8h, v4.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) + umlsl v24.8h, v16.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + umlsl v24.8h, v7.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + mov v25.d[0], v24.d[1] + + st1 {v26.8b, v27.8b}, [x2], 
#16 //// store 1/2,1,2 grif values + st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values + + sqrshrun v11.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + ext v31.8b, v21.8b , v22.8b , #2 + sqrshrun v12.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + ext v28.8b, v20.8b , v21.8b , #2 + + saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) + ext v31.8b, v22.8b , v23.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set1) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set1) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + smlsl v26.4s, v21.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + ext v30.8b, v21.8b , v22.8b , #4 + + sqrshrun v13.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + ext v29.8b, v21.8b , v22.8b , #6 + + ext v28.8b, v21.8b , v22.8b , #2 + saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) + smlal v20.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set2) + smlal v20.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set2) + smlsl v20.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + smlsl v20.4s, v22.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + ext v31.8b, v23.8b , v24.8b , #2 + + ext v11.8b, v11.8b , v12.8b , #2 + ext v12.8b, v12.8b , v13.8b , #2 + ext v13.8b, v13.8b , v13.8b , #2 + + st1 {v11.8b, v12.8b}, [x1], x12 //// store row1 - 1,1/2 grid + st1 {v13.h}[0], [x11], x12 //// store row1 - 1,1/2 grid + + ext v30.8b, v22.8b , v23.8b , #4 + ext v29.8b, v22.8b , v23.8b , #6 + + saddl v12.4s, v31.4h, v22.4h //// a0 + a5 (set3) + ext v28.8b, v22.8b , v23.8b , #2 + smlal v12.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set3) + smlal v12.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set3) + smlsl v12.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + smlsl v12.4s, v23.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + ext v31.8b, v24.8b , v25.8b , #2 
+ + shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) + ext v30.8b, v23.8b , v24.8b , #4 + shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) + ext v29.8b, v23.8b , v24.8b , #6 + + saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) + ext v28.8b, v23.8b , v24.8b , #2 + ext v31.8b, v25.8b , v25.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set4) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set4) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + smlsl v26.4s, v24.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + ext v30.8b, v24.8b , v25.8b , #4 + + saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) + ext v29.8b, v24.8b , v25.8b , #6 + + ext v31.8b, v24.8b , v25.8b , #2 + shrn v28.4h, v12.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) + + ld1 {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next Row data + smlal v22.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set5) + smlal v22.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set5) + smlsl v22.4s, v31.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + smlsl v22.4s, v25.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) + mov v20.d[1], v21.d[0] + sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 + + + ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values + //// ////////////// ROW 5 /////////////////////// + +//// Process first vertical interpolated row +//// each column is + uaddl v20.8h, v14.8b, v11.8b //// a0 + a5 (column1,row0) + movi v31.8b, #5 + umlal v20.8h, v2.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) + 
umlal v20.8h, v5.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) + umlsl v20.8h, v17.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + umlsl v20.8h, v8.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + mov v21.d[0], v20.d[1] + mov v28.d[1], v29.d[0] + sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 + shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) + + uaddl v22.8h, v15.8b, v12.8b //// a0 + a5 (column2,row0) + umlal v22.8h, v3.8b, v1.8b //// a0 + a5 + 20a2 (column2,row0) + umlal v22.8h, v6.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) + umlsl v22.8h, v18.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + umlsl v22.8h, v9.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + mov v23.d[0], v22.d[1] + + sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 + ext v30.8b, v20.8b , v21.8b , #4 + + uaddl v24.8h, v16.8b, v13.8b //// a0 + a5 (column3,row0) + ext v29.8b, v20.8b , v21.8b , #6 + umlal v24.8h, v4.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) + umlal v24.8h, v7.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) + umlsl v24.8h, v19.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + umlsl v24.8h, v10.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + mov v25.d[0], v24.d[1] + + st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values + st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values + + sqrshrun v14.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + ext v31.8b, v21.8b , v22.8b , #2 + sqrshrun v15.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + ext v28.8b, v20.8b , v21.8b , #2 + + saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) + ext v31.8b, v22.8b , v23.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set1) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set1) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 
5a1 (set1) + smlsl v26.4s, v21.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + ext v30.8b, v21.8b , v22.8b , #4 + + sqrshrun v16.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + ext v29.8b, v21.8b , v22.8b , #6 + + ext v28.8b, v21.8b , v22.8b , #2 + saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) + smlal v20.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set2) + smlal v20.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set2) + smlsl v20.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) + smlsl v20.4s, v22.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + ext v31.8b, v23.8b , v24.8b , #2 + + ext v14.8b, v14.8b , v15.8b , #2 + ext v15.8b, v15.8b , v16.8b , #2 + ext v16.8b, v16.8b , v16.8b , #2 + + st1 {v14.8b, v15.8b}, [x1], x12 //// store row1 - 1,1/2 grid + st1 {v16.h}[0], [x11], x12 //// store row1 - 1,1/2 grid + + ext v30.8b, v22.8b , v23.8b , #4 + ext v29.8b, v22.8b , v23.8b , #6 + + saddl v14.4s, v31.4h, v22.4h //// a0 + a5 (set3) + ext v28.8b, v22.8b , v23.8b , #2 + smlal v14.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set3) + smlal v14.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set3) + smlsl v14.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + smlsl v14.4s, v23.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + ext v31.8b, v24.8b , v25.8b , #2 + + shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) + ext v30.8b, v23.8b , v24.8b , #4 + shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) + ext v29.8b, v23.8b , v24.8b , #6 + + saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) + ext v28.8b, v23.8b , v24.8b , #2 + ext v31.8b, v25.8b , v25.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set4) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set4) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + smlsl v26.4s, v24.4h, v0.4h[0] //// a0 + a5 
+ 20a2 + 20a3 - 5a1 - 5a4 (set4) + ext v30.8b, v24.8b , v25.8b , #4 + + saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) + ext v29.8b, v24.8b , v25.8b , #6 + + ext v31.8b, v24.8b , v25.8b , #2 + shrn v28.4h, v14.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) + + ld1 {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next Row data + smlal v22.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set5) + smlal v22.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set5) + smlsl v22.4s, v31.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set5) + smlsl v22.4s, v25.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) + mov v20.d[1], v21.d[0] + sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 + + + ////VQRSHRUN.s16 D27,Q14,#2 ;// half,half gird set3,4 + ////VSHRN.s32 D28,Q11,#8 ;// shift by 8 and later we will shift by 2 more with rounding (set5) + + ////VQRSHRUN.s16 D28,Q14,#2 ;// half,half gird set5 + + ////VST1.8 {D26,D27,D28},[x2],x14 ;// store 1/2,1,2 grif values + //// ////////////// ROW 6 /////////////////////// + +//// Process first vertical interpolated row +//// each column is + + cmp x10, #1 //// if it 17 rows are complete skip + beq filter_2dvh_skip_row + uaddl v20.8h, v17.8b, v14.8b //// a0 + a5 (column1,row0) + movi v31.8b, #5 + umlal v20.8h, v5.8b, v1.8b //// a0 + a5 + 20a2 (column1,row0) + umlal v20.8h, v8.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column1,row0) + umlsl v20.8h, v2.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column1,row0) + umlsl v20.8h, v11.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column1,row0) + mov v21.d[0], v20.d[1] + mov v28.d[1], v29.d[0] + sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 + shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) + + uaddl v22.8h, v18.8b, v15.8b //// a0 + a5 (column2,row0) + umlal v22.8h, v6.8b, v1.8b //// a0 + a5 + 20a2 
(column2,row0) + umlal v22.8h, v9.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column2,row0) + umlsl v22.8h, v3.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column2,row0) + umlsl v22.8h, v12.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column2,row0) + mov v23.d[0], v22.d[1] + + sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 + ext v30.8b, v20.8b , v21.8b , #4 + + uaddl v24.8h, v19.8b, v16.8b //// a0 + a5 (column3,row0) + ext v29.8b, v20.8b , v21.8b , #6 + umlal v24.8h, v7.8b, v1.8b //// a0 + a5 + 20a2 (column3,row0) + umlal v24.8h, v10.8b, v1.8b //// a0 + a5 + 20a2 + 20a3 (column3,row0) + umlsl v24.8h, v4.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 (column3,row0) + umlsl v24.8h, v13.8b, v31.8b //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (column3,row0) + mov v25.d[0], v24.d[1] + + st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values + st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values + + sqrshrun v17.8b, v20.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column1,row0) + ext v31.8b, v21.8b , v22.8b , #2 + sqrshrun v18.8b, v22.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column2,row0) + ext v28.8b, v20.8b , v21.8b , #2 + + saddl v26.4s, v31.4h, v20.4h //// a0 + a5 (set1) + ext v31.8b, v22.8b , v23.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set1) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set1) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set1) + smlsl v26.4s, v21.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set1) + ext v30.8b, v21.8b , v22.8b , #4 + + sqrshrun v19.8b, v24.8h, #5 //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5 (column3,row0) + ext v29.8b, v21.8b , v22.8b , #6 + + ext v28.8b, v21.8b , v22.8b , #2 + saddl v20.4s, v31.4h, v21.4h //// a0 + a5 (set2) + smlal v20.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set2) + smlal v20.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set2) + smlsl v20.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set2) 
+ smlsl v20.4s, v22.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set2) + ext v31.8b, v23.8b , v24.8b , #2 + + ext v17.8b, v17.8b , v18.8b , #2 + ext v18.8b, v18.8b , v19.8b , #2 + ext v19.8b, v19.8b , v19.8b , #2 + + st1 {v17.8b, v18.8b}, [x1], x12 //// store row1 - 1,1/2 grid + st1 {v19.h}[0], [x11], x12 //// store row1 - 1,1/2 grid + + ext v30.8b, v22.8b , v23.8b , #4 + ext v29.8b, v22.8b , v23.8b , #6 + + saddl v18.4s, v31.4h, v22.4h //// a0 + a5 (set3) + ext v28.8b, v22.8b , v23.8b , #2 + smlal v18.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set3) + smlal v18.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set3) + smlsl v18.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set3) + smlsl v18.4s, v23.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set3) + ext v31.8b, v24.8b , v25.8b , #2 + + shrn v21.4h, v20.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set2) + ext v30.8b, v23.8b , v24.8b , #4 + shrn v20.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set1) + ext v29.8b, v23.8b , v24.8b , #6 + + saddl v26.4s, v31.4h, v23.4h //// a0 + a5 (set4) + ext v28.8b, v23.8b , v24.8b , #2 + ext v31.8b, v25.8b , v25.8b , #2 + smlal v26.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set4) + smlal v26.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set4) + smlsl v26.4s, v28.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 (set4) + smlsl v26.4s, v24.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set4) + ext v30.8b, v24.8b , v25.8b , #4 + + saddl v22.4s, v31.4h, v24.4h //// a0 + a5 (set5) + ext v29.8b, v24.8b , v25.8b , #6 + + ext v31.8b, v24.8b , v25.8b , #2 + shrn v28.4h, v18.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set3) + + ld1 {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next Row data + smlal v22.4s, v30.4h, v0.4h[1] //// a0 + a5 + 20a2 (set5) + smlal v22.4s, v29.4h, v0.4h[1] //// a0 + a5 + 20a2 + 20a3 (set5) + smlsl v22.4s, v31.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 
5a1 (set5) + smlsl v22.4s, v25.4h, v0.4h[0] //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (set5) + shrn v29.4h, v26.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set4) + mov v20.d[1], v21.d[0] + sqrshrun v26.8b, v20.8h, #2 //// half,half gird set1,2 + + mov v28.d[1], v29.d[0] + sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 + shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) + + sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 + + st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values + st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values + + subs x10, x10, #1 ////decrement loop counter + + bne filter_2dvh_loop + + +//// Process first vertical interpolated row +//// each column is + //// ////////////// ROW 13 /////////////////////// + +//// Process first vertical interpolated row +//// each column is + + // LDMFD sp!,{x10,x11,x12,pc} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + +filter_2dvh_skip_row: + mov v28.d[1], v29.d[0] + sqrshrun v27.8b, v28.8h, #2 //// half,half gird set3,4 + shrn v28.4h, v22.4s, #8 //// shift by 8 and later we will shift by 2 more with rounding (set5) + + sqrshrun v28.8b, v28.8h, #2 //// half,half gird set5 + + st1 {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values + st1 {v28.h}[0], [x2], x19 //// store 1/2,1,2 grif values + // LDMFD sp!,{x10,x11,x12,pc} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + +///***************************************** + + + + + + + .section .note.gnu-stack,"",%progbits diff --git a/encoder/armv8/ih264e_platform_macros.h b/encoder/armv8/ih264e_platform_macros.h new file mode 100755 index 0000000..39cac96 --- /dev/null +++ b/encoder/armv8/ih264e_platform_macros.h @@ -0,0 +1,143 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this 
file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_platform_macros.h +* +* @brief +* Contains platform specific routines used for codec context intialization +* +* @author +* ittiam +* +* @remarks +* none +* +******************************************************************************* +*/ + +#ifndef IH264E_PLATFORM_MACROS_H_ +#define IH264E_PLATFORM_MACROS_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_neon_a9q(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* 
@par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_neon_av8(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_generic(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr(void *pv_codec); + +/** +******************************************************************************* +* +* @brief Determine the architecture of the encoder executing environment +* +* @par Description: This routine returns the architecture of the enviro- +* ment in which the current encoder is being tested +* +* @param[in] void +* +* @returns IV_ARCH_T +* architecture +* +* @remarks none +* +******************************************************************************* +*/ +IV_ARCH_T ih264e_default_arch(void); + +#endif /* 
IH264E_PLATFORM_MACROS_H_ */ diff --git a/encoder/armv8/ime_distortion_metrics_av8.s b/encoder/armv8/ime_distortion_metrics_av8.s new file mode 100755 index 0000000..99ebc8a --- /dev/null +++ b/encoder/armv8/ime_distortion_metrics_av8.s @@ -0,0 +1,978 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
//* Ltd, Bangalore
//*/
//**

///**
//******************************************************************************
//*
//*
//* @brief
//*  This file contains definitions of routines that compute distortion
//*  between two macro/sub blocks of identical dimensions
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*  - ime_compute_sad_16x16()
//*  - ime_compute_sad_8x8()
//*  - ime_compute_sad_4x4()
//*  - ime_compute_sad_16x8()
//*  - ime_compute_satqd_16x16_lumainter_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//


///**
//******************************************************************************
//*
//* @brief approximate SAD between two 16x16 blocks (fast mode)
//*
//* @par Description
//*  Both strides are doubled on entry, so only every second row (8 rows of
//*  16 bytes) is read from each block. The absolute differences of those rows
//*  are summed and the total is doubled before it is stored.
//*  This routine never reads i4_max_sad; there is no early exit here.
//*
//* @param[in] pu1_src (x0)
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst (x1)
//*  UWORD8 pointer to the block the source is compared against
//*
//* @param[in] src_strd (x2)
//*  integer source stride
//*
//* @param[in] dst_strd (x3)
//*  integer destination stride
//*
//* @param[in] i4_max_sad (x4)
//*  integer maximum allowed distortion (unused by this routine)
//*
//* @param[out] pi4_mb_distortion (x5)
//*  receives the 32-bit evaluated sad
//*
//* @remarks
//*  Clobbers x0-x3, x6, v0-v7, v30 and the flags.
//*
//******************************************************************************
//*/
.text
.p2align 2

// Spill d8-d15 into a 64-byte stack frame.
// Frame layout from the new sp upwards: d14/d15, d12/d13, d10/d11, d8/d9.
.macro push_v_regs
    stp       d14, d15, [sp, #-64]!
    stp       d12, d13, [sp, #16]
    stp       d10, d11, [sp, #32]
    stp       d8, d9, [sp, #48]
.endm
// Reload d8-d15 from the frame built by push_v_regs and release it.
.macro pop_v_regs
    ldp       d8, d9, [sp, #48]
    ldp       d10, d11, [sp, #32]
    ldp       d12, d13, [sp, #16]
    ldp       d14, d15, [sp], #64
.endm

    .global ime_compute_sad_16x16_fast_av8
ime_compute_sad_16x16_fast_av8:
    push_v_regs
    movi      v30.8h, #0                    // eight 16-bit partial sums
    add       x2, x2, x2                    // source advances two rows per load
    add       x3, x3, x3                    // reference advances two rows per load
    mov       x6, #2                        // two passes, four sampled rows each

core_loop_ime_compute_sad_16x16_fast_av8:

    // fetch four sampled rows of each block
    ld1       {v0.16b}, [x0], x2
    ld1       {v2.16b}, [x0], x2
    ld1       {v4.16b}, [x0], x2
    ld1       {v6.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v3.16b}, [x1], x3
    ld1       {v5.16b}, [x1], x3
    ld1       {v7.16b}, [x1], x3

    // accumulate |src - ref| for the low and high halves of every row
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    b.ne      core_loop_ime_compute_sad_16x16_fast_av8

    // horizontal reduction of the eight partial sums to one 32-bit value
    addp      v30.8h, v30.8h, v30.8h
    uaddlp    v30.4s, v30.8h
    addp      v30.2s, v30.2s, v30.2s
    shl       v30.2s, v30.2s, #1            // half the rows were sampled: scale by 2

    str       s30, [x5]                     // *pi4_mb_distortion = sad
    pop_v_regs
    ret


///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x8 blocks
//*
//*
//* @par Description
//* This functions computes SAD between 2 16x8 blocks. There is a provision
//* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
//* compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u4_max_sad
//*  integer maximum allowed distortion
//*
//* @param[in] pi4_mb_distortion
//*  integer evaluated sad
//*
//* @remarks
//*
//******************************************************************************
//*/
//
    .global ime_compute_sad_16x8_av8
ime_compute_sad_16x8_av8:

    // Registers as used below:
    //   x0 = first block, advanced by x2 after every row
    //   x1 = second block, advanced by x3 after every row
    //   x5 = address that receives the 32-bit SAD
    // The strides are used exactly as passed (no doubling), so the two passes
    // of four rows cover eight consecutive rows. x4 is not read.
    push_v_regs
    movi      v30.8h, #0                    // eight 16-bit partial sums
    mov       x6, #2                        // pass counter

core_loop_ime_compute_sad_16x8_av8:

    // four rows of each block
    ld1       {v0.16b}, [x0], x2
    ld1       {v2.16b}, [x0], x2
    ld1       {v4.16b}, [x0], x2
    ld1       {v6.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v3.16b}, [x1], x3
    ld1       {v5.16b}, [x1], x3
    ld1       {v7.16b}, [x1], x3

    // accumulate absolute differences, low half then high half of each row
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    b.ne      core_loop_ime_compute_sad_16x8_av8

    // fold the eight partial sums into lane 0
    addp      v30.8h, v30.8h, v30.8h
    uaddlp    v30.4s, v30.8h
    addp      v30.2s, v30.2s, v30.2s

    str       s30, [x5]                     // store the SAD
    pop_v_regs
    ret

///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
//*
//* @par Description
//* This functions computes SAD between 2 16x16 blocks. There is a provision
//* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
//* compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] i4_max_sad
//*  integer maximum allowed distortion
//*
//* @param[in] pi4_mb_distortion
//*  integer evaluated sad
//*
//* @remarks
//*
//******************************************************************************
//*/

    .global ime_compute_sad_16x16_ea8_av8
ime_compute_sad_16x16_ea8_av8:

    // Two passes over the block:
    //   pass 1 - rows 0,2,..,14 (x0/x1 with doubled strides)
    //   pass 2 - rows 1,3,..,15 (x7/x8 with doubled strides)
    // After pass 1 the running total is compared (signed) with w4; if it is
    // greater, the partial total is stored to [x5] and pass 2 is skipped.
    // v8-v15 are used as scratch, hence the push_v_regs/pop_v_regs pair.
    push_v_regs

    add       x7, x0, x2                    // x7 = second row of the first block
    add       x8, x1, x3                    // x8 = second row of the second block
    add       x2, x2, x2                    // both strides now skip one row
    add       x3, x3, x3
    movi      v30.8h, #0                    // eight 16-bit partial sums

    // ---- pass 1: even rows ----
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b

    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b

    ld1       {v8.16b}, [x0], x2
    ld1       {v9.16b}, [x1], x3
    uabal     v30.8h, v8.8b, v9.8b
    uabal2    v30.8h, v8.16b, v9.16b

    ld1       {v10.16b}, [x0], x2
    ld1       {v11.16b}, [x1], x3
    uabal     v30.8h, v10.8b, v11.8b
    uabal2    v30.8h, v10.16b, v11.16b

    ld1       {v12.16b}, [x0], x2
    ld1       {v13.16b}, [x1], x3
    uabal     v30.8h, v12.8b, v13.8b
    uabal2    v30.8h, v12.16b, v13.16b

    ld1       {v14.16b}, [x0], x2
    ld1       {v15.16b}, [x1], x3
    uabal     v30.8h, v14.8b, v15.8b
    uabal2    v30.8h, v14.16b, v15.16b

    ld1       {v16.16b}, [x0], x2
    ld1       {v17.16b}, [x1], x3
    uabal     v30.8h, v16.8b, v17.8b
    uabal2    v30.8h, v16.16b, v17.16b

    ld1       {v18.16b}, [x0], x2
    ld1       {v19.16b}, [x1], x3
    uabal     v30.8h, v18.8b, v19.8b
    uabal2    v30.8h, v18.16b, v19.16b

    // reduce into v31 so that v30 keeps accumulating in pass 2
    addp      v31.8h, v30.8h, v30.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s
    mov       w6, v31.s[0]
    cmp       w6, w4                        // partial SAD vs. maximum allowed SAD
    b.gt      end_func_16x16                // already above the limit: stop early

    // ---- pass 2: odd rows ----
    ld1       {v0.16b}, [x7], x2
    ld1       {v1.16b}, [x8], x3
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b

    ld1       {v2.16b}, [x7], x2
    ld1       {v3.16b}, [x8], x3
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b

    ld1       {v8.16b}, [x7], x2
    ld1       {v9.16b}, [x8], x3
    uabal     v30.8h, v8.8b, v9.8b
    uabal2    v30.8h, v8.16b, v9.16b

    ld1       {v10.16b}, [x7], x2
    ld1       {v11.16b}, [x8], x3
    uabal     v30.8h, v10.8b, v11.8b
    uabal2    v30.8h, v10.16b, v11.16b

    ld1       {v12.16b}, [x7], x2
    ld1       {v13.16b}, [x8], x3
    uabal     v30.8h, v12.8b, v13.8b
    uabal2    v30.8h, v12.16b, v13.16b

    ld1       {v14.16b}, [x7], x2
    ld1       {v15.16b}, [x8], x3
    uabal     v30.8h, v14.8b, v15.8b
    uabal2    v30.8h, v14.16b, v15.16b

    ld1       {v16.16b}, [x7], x2
    ld1       {v17.16b}, [x8], x3
    uabal     v30.8h, v16.8b, v17.8b
    uabal2    v30.8h, v16.16b, v17.16b

    ld1       {v18.16b}, [x7], x2
    ld1       {v19.16b}, [x8], x3
    uabal     v30.8h, v18.8b, v19.8b
    uabal2    v30.8h, v18.16b, v19.16b

    // total over all 16 rows
    addp      v31.8h, v30.8h, v30.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s

end_func_16x16:
    str       s31, [x5]                     // store partial or full SAD
    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name : ime_calculate_sad2_prog_av8()
////
//// Detail Description : This function find the sad values of 4 Progressive MBs
//// at one shot
////
//// Platform : CortexAv8/NEON .
////
////-----------------------------------------------------------------------------
//*/

    .global ime_calculate_sad2_prog_av8
ime_calculate_sad2_prog_av8:

    // Computes the SAD of one source block against two reference blocks and
    // writes the two 32-bit results to psad[0] (ref1) and psad[1] (ref2).
    //
    // Register usage (AAPCS64):
    // x0 = ref1 <UWORD8 *>
    // x1 = ref2 <UWORD8 *>
    // x2 = src <UWORD8 *>
    // x3 = RefBufferWidth <UWORD32>   (row step for ref1 and ref2)
    // x4 = CurBufferWidth <UWORD32>   (row step for src)
    // x5 = psad <UWORD32 *>
    //
    // Clobbers x0-x2, x6, v0-v11, v30, v31 and the flags (v8-v11 are restored
    // by pop_v_regs).
    //
    // NOTE(review): the loop makes 8 passes of 4 rows (32 rows in total) and
    // the totals are doubled before the store. Both are kept exactly as in the
    // original code - confirm the intended block height and scaling.
    // NOTE(review): x3/x4 are declared as 32-bit values but are used as 64-bit
    // post-increments; presumably callers pass them zero-extended - confirm.
    push_v_regs
    mov       x6, #8                        // pass counter
    movi      v30.8h, #0                    // partial sums: src vs ref1
    movi      v31.8h, #0                    // partial sums: src vs ref2

core_loop_ime_calculate_sad2_prog_av8:

    // Source rows are read through x2. The original used x3 (the reference
    // stride) as the base address here, which is not a pointer.
    ld1       {v0.16b}, [x0], x3
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x2], x4

    ld1       {v3.16b}, [x0], x3
    ld1       {v4.16b}, [x1], x3
    ld1       {v5.16b}, [x2], x4


    uabal     v30.8h, v0.8b, v2.8b
    uabal2    v30.8h, v0.16b, v2.16b
    uabal     v31.8h, v1.8b, v2.8b
    uabal2    v31.8h, v1.16b, v2.16b

    uabal     v30.8h, v3.8b, v5.8b
    uabal2    v30.8h, v3.16b, v5.16b
    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b


    ld1       {v6.16b}, [x0], x3
    ld1       {v7.16b}, [x1], x3
    ld1       {v8.16b}, [x2], x4

    ld1       {v9.16b}, [x0], x3
    ld1       {v10.16b}, [x1], x3
    ld1       {v11.16b}, [x2], x4

    uabal     v30.8h, v6.8b, v8.8b
    uabal2    v30.8h, v6.16b, v8.16b
    uabal     v31.8h, v7.8b, v8.8b
    uabal2    v31.8h, v7.16b, v8.16b

    uabal     v30.8h, v9.8b, v11.8b
    uabal2    v30.8h, v9.16b, v11.16b
    uabal     v31.8h, v10.8b, v11.8b
    uabal2    v31.8h, v10.16b, v11.16b     // high half of the same ref2 row (was v0)

    subs      x6, x6, #1
    bne       core_loop_ime_calculate_sad2_prog_av8

    addp      v30.8h, v30.8h, v31.8h        // lanes 0-3: ref1 partials, 4-7: ref2 partials
    uaddlp    v30.4s, v30.8h                // [ref1.a, ref1.b, ref2.a, ref2.b]
    // A 4s pairwise add is required: the 2s form folds only the low half and
    // would put the ref1 total in both output lanes.
    addp      v30.4s, v30.4s, v30.4s        // [sad1, sad2, sad1, sad2]
    shl       v30.2s, v30.2s, #1

    st1       {v30.2s}, [x5]                // psad[0] = sad1, psad[1] = sad2
    pop_v_regs
    ret

///*
////---------------------------------------------------------------------------
//// Function Name : Calculate_Mad3_prog()
////
//// Detail Description : This function find the sad values of 4 Progressive MBs
//// at one shot
////
//// Platform : CortexA8/NEON .
////
////-----------------------------------------------------------------------------
//*/

    .global ime_calculate_sad3_prog_av8
ime_calculate_sad3_prog_av8:

    // Computes the SAD of one source block against three reference blocks and
    // writes the three 32-bit results to psad[0] (ref1), psad[1] (ref2) and
    // psad[2] (ref3).
    //
    // Register usage (AAPCS64):
    // x0 = ref1 <UWORD8 *>
    // x1 = ref2 <UWORD8 *>
    // x2 = ref3 <UWORD8 *>
    // x3 = src <UWORD8 *>
    // x4 = RefBufferWidth <UWORD32>   (row step for ref1, ref2 and ref3)
    // x5 = CurBufferWidth <UWORD32>   (row step for src)
    // x6 = psad <UWORD32 *>           (7th argument; must survive the loop)
    //
    // Clobbers x0-x3, x6, x7, v0-v7, v29-v31 and the flags.
    //
    // NOTE(review): the loop makes 16 passes of 2 rows (32 rows in total) and
    // the totals are doubled before the store. Both are kept exactly as in the
    // original code - confirm the intended block height and scaling.
    // NOTE(review): x4/x5 are declared as 32-bit values but are used as 64-bit
    // post-increments; presumably callers pass them zero-extended - confirm.
    push_v_regs
    mov       x7, #16                       // pass counter; x7 so that psad in x6 is preserved
    movi      v29.8h, #0                    // partial sums: src vs ref1
    movi      v30.8h, #0                    // partial sums: src vs ref2
    movi      v31.8h, #0                    // partial sums: src vs ref3

core_loop_ime_calculate_sad3_prog_av8:

    ld1       {v0.16b}, [x0], x4
    ld1       {v1.16b}, [x1], x4
    ld1       {v2.16b}, [x2], x4
    ld1       {v3.16b}, [x3], x5

    uabal     v29.8h, v0.8b, v3.8b
    uabal2    v29.8h, v0.16b, v3.16b
    uabal     v30.8h, v1.8b, v3.8b
    uabal2    v30.8h, v1.16b, v3.16b
    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x4
    ld1       {v5.16b}, [x1], x4
    ld1       {v6.16b}, [x2], x4
    ld1       {v7.16b}, [x3], x5

    uabal     v29.8h, v4.8b, v7.8b
    uabal2    v29.8h, v4.16b, v7.16b
    uabal     v30.8h, v5.8b, v7.8b
    uabal2    v30.8h, v5.16b, v7.16b
    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

    subs      x7, x7, #1
    // Branch to this function's own loop; the original jumped into the loop
    // of ime_calculate_sad2_prog_av8.
    bne       core_loop_ime_calculate_sad3_prog_av8

    // Reduce all three accumulators; the original left v29 (ref1) out.
    addp      v29.8h, v29.8h, v30.8h        // lanes 0-3: ref1 partials, 4-7: ref2 partials
    addp      v31.8h, v31.8h, v31.8h        // ref3 partials (duplicated)
    uaddlp    v29.4s, v29.8h                // [ref1.a, ref1.b, ref2.a, ref2.b]
    uaddlp    v31.4s, v31.8h                // [ref3.a, ref3.b, ref3.a, ref3.b]
    addp      v29.4s, v29.4s, v31.4s        // [sad1, sad2, sad3, sad3]
    shl       v29.4s, v29.4s, #1

    // Store through psad (x6); the original stored through x5, the src stride.
    st1       {v29.2s}, [x6], #8            // psad[0], psad[1]
    st1       {v29.s}[2], [x6]              // psad[2]
    pop_v_regs
    ret




///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) for sub-pel motion estimation
//*
//* @par Description
//* This functions computes SAD for all the 8 half pel points
//*
//* @param[out] pi4_sad
//* integer evaluated sad
//* pi4_sad[0] - half x
//* pi4_sad[1] - half x - 1
//* pi4_sad[2] - half y
//* pi4_sad[3] - half y - 1
//* pi4_sad[4] - half xy
//* pi4_sad[5] - half xy - 1
//* pi4_sad[6] - half xy - strd
//* pi4_sad[7] - half xy - 1 - strd
//*
//*
//* @remarks
//*
//******************************************************************************
//*/

.text
.p2align 2

//
// Register arguments as used by the code below:
//    x0 = source block                  (advanced by x4 per row)
//    x1 = half-x plane                  (advanced by x5 per row)
//    x2 = half-y plane                  (advanced by x5 per row)
//    x3 = half-xy plane                 (advanced by x5 per row)
//    x4 = source stride   (32-bit)
//    x5 = reference stride (32-bit)
//    x6 = pi4_sad, receives eight 32-bit SADs
// NOTE(review): strides are sign-extended on the assumption that the C
// prototype declares them as 32-bit integers - confirm against the caller.
//
    .global ime_sub_pel_compute_sad_16x16_av8
ime_sub_pel_compute_sad_16x16_av8:
    push_v_regs
    sxtw      x4, w4                    // upper 32 bits of a 32-bit argument are
    sxtw      x5, w5                    // unspecified; extend before any pointer math

    sub       x7, x1, #1                //x left
    sub       x8, x2, x5                //y top
    sub       x9, x3, #1                //xy left
    sub       x10, x3, x5               //xy top
    sub       x11, x10, #1              //xy top left

    // One 8x16-bit accumulator per candidate position.
    movi      v24.8h, #0                // half x
    movi      v25.8h, #0                // half x - 1
    movi      v26.8h, #0                // half y
    movi      v27.8h, #0                // half y - strd
    movi      v28.8h, #0                // half xy
    movi      v29.8h, #0                // half xy - 1
    movi      v30.8h, #0                // half xy - strd
    movi      v31.8h, #0                // half xy - 1 - strd

    mov       x12, #16                  // 16 rows, one per pass
core_loop_ime_sub_pel_compute_sad_16x16_av8:

    ld1       {v0.16b}, [x0], x4        //src
    ld1       {v1.16b}, [x1], x5        //x
    ld1       {v2.16b}, [x7], x5        //x left
    ld1       {v3.16b}, [x2], x5        //y
    ld1       {v9.16b}, [x8], x5        //y top
    ld1       {v10.16b}, [x3], x5       //xy
    ld1       {v11.16b}, [x9], x5       //xy left
    ld1       {v12.16b}, [x10], x5      //xy top
    ld1       {v13.16b}, [x11], x5      //xy top left

    uabal     v24.8h, v0.8b, v1.8b
    uabal2    v24.8h, v0.16b, v1.16b
    uabal     v25.8h, v0.8b, v2.8b
    uabal2    v25.8h, v0.16b, v2.16b
    uabal     v26.8h, v0.8b, v3.8b
    uabal2    v26.8h, v0.16b, v3.16b
    uabal     v27.8h, v0.8b, v9.8b
    uabal2    v27.8h, v0.16b, v9.16b
    uabal     v28.8h, v0.8b, v10.8b
    uabal2    v28.8h, v0.16b, v10.16b
    uabal     v29.8h, v0.8b, v11.8b
    uabal2    v29.8h, v0.16b, v11.16b
    uabal     v30.8h, v0.8b, v12.8b
    uabal2    v30.8h, v0.16b, v12.16b
    uabal     v31.8h, v0.8b, v13.8b
    uabal2    v31.8h, v0.16b, v13.16b

    subs      x12, x12, #1
    bne       core_loop_ime_sub_pel_compute_sad_16x16_av8

    // Pairwise reduction: each addp packs two accumulators into one register.
    addp      v24.8h, v24.8h, v25.8h
    addp      v26.8h, v26.8h, v27.8h
    addp      v28.8h, v28.8h, v29.8h
    addp      v30.8h, v30.8h, v31.8h

    uaddlp    v24.4s, v24.8h
    uaddlp    v26.4s, v26.8h
    uaddlp    v28.4s, v28.8h
    uaddlp    v30.4s, v30.8h

    addp      v24.4s, v24.4s, v26.4s    // pi4_sad[0..3]
    addp      v25.4s, v28.4s, v30.4s    // pi4_sad[4..7]

    st1       {v24.4s-v25.4s}, [x6]


    pop_v_regs
    ret


///**
//******************************************************************************
//*
//* @brief computes distortion (SAD) between 2 16x16 blocks
//*
//* @par Description
//*   This functions computes SAD between
//*   2 16x16 blocks. There is a provision
//*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
//*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] i4_max_sad
//*  integer maximum allowed distortion
//*
//* @param[out] pi4_mb_distortion
//*  integer evaluated sad
//*
//* @remarks
//*  This implementation never reads i4_max_sad (x4): all 16 rows are always
//*  accumulated and no early exit is taken.
//*
//******************************************************************************
//*/
    .global ime_compute_sad_16x16_av8
ime_compute_sad_16x16_av8:
    // x0 = pu1_src, x1 = pu1_dst, x2 = src_strd, x3 = dst_strd,
    // x4 = i4_max_sad (unused), x5 = pi4_mb_distortion
    push_v_regs
    sxtw      x2, w2                    // integer strides: the upper 32 bits of the
    sxtw      x3, w3                    // argument registers are unspecified (AAPCS64)
    mov       x6, #4                    // 4 passes x 4 rows = 16 rows
    movi      v30.8h, #0                // 8 x 16-bit running SAD

core_loop_ime_compute_sad_16x16_av8:

    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b

    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b

    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    bne       core_loop_ime_compute_sad_16x16_av8


    // Fold the eight 16-bit partial sums into a single 32-bit total.
    addp      v30.8h, v30.8h, v30.8h
    uaddlp    v30.4s, v30.8h
    addp      v30.2s, v30.2s, v30.2s

    st1       {v30.s}[0], [x5]
    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      : Calculate_Mad4_prog()
////
//// Detail Description : This function find the sad values of 4 Progressive MBs
////                        at one shot
////
//// Platform           : CortexA8/NEON            .
////
////-----------------------------------------------------------------------------
//*/

//
// Register arguments as used by the code below:
//    x0 = reference block centre
//    x1 = source block                  (advanced by x3 per row)
//    x2 = reference stride (32-bit)
//    x3 = source stride    (32-bit)
//    x4 = output, receives four 32-bit SADs in the order
//         left (x0 - 1), right (x0 + 1), top (x0 - x2), bottom (x0 + x2)
// NOTE(review): strides are sign-extended on the assumption that the C
// prototype declares them as 32-bit integers - confirm against the caller.
//
    .global ime_calculate_sad4_prog_av8
ime_calculate_sad4_prog_av8:
    push_v_regs
    sxtw      x2, w2                    // upper 32 bits of a 32-bit argument are
    sxtw      x3, w3                    // unspecified; extend before deriving top/bottom

    sub       x5, x0, #1                //left
    add       x6, x0, #1                //right
    sub       x7, x0, x2                //top
    add       x8, x0, x2                //bottom

    movi      v28.8h, #0                // SAD vs left
    movi      v29.8h, #0                // SAD vs right
    movi      v30.8h, #0                // SAD vs top
    movi      v31.8h, #0                // SAD vs bottom

    mov       x9, #16                   // 16 rows, one per pass
core_loop_ime_calculate_sad4_prog_av8:

    ld1       {v0.16b}, [x1], x3
    ld1       {v1.16b}, [x5], x2
    ld1       {v2.16b}, [x6], x2
    ld1       {v3.16b}, [x7], x2
    ld1       {v9.16b}, [x8], x2

    uabal     v28.8h, v0.8b, v1.8b
    uabal2    v28.8h, v0.16b, v1.16b
    uabal     v29.8h, v0.8b, v2.8b
    uabal2    v29.8h, v0.16b, v2.16b
    uabal     v30.8h, v0.8b, v3.8b
    uabal2    v30.8h, v0.16b, v3.16b
    uabal     v31.8h, v0.8b, v9.8b
    uabal2    v31.8h, v0.16b, v9.16b

    subs      x9, x9, #1
    bne       core_loop_ime_calculate_sad4_prog_av8

    // Pack two accumulators per register, widen, then fold to four totals.
    addp      v28.8h, v28.8h, v29.8h
    addp      v30.8h, v30.8h, v31.8h

    uaddlp    v28.4s, v28.8h
    uaddlp    v30.4s, v30.8h

    addp      v28.4s, v28.4s, v30.4s    // {left, right, top, bottom}
    st1       {v28.4s}, [x4]
    pop_v_regs
    ret



//*****************************************************************************
//*
//* Function Name         : ime_compute_satqd_16x16_lumainter_av8
//* Description           : This function computes SAD for a 16x16 block.
//                        : It also computes if any 4x4 block will have a nonzero coefficient after transform and quant
//
//  Arguments             :   x0 :pointer to src buffer
//                            x1 :pointer to est buffer
//                            x2 :source stride
//                            x3 :est stride
//                            x4 :pointer to thresholds (9 halfwords are read)
//                            x5 :pointer to distortion (SAD) output
//                            x6 :pointer to is_nonzero output
//*
//* Values Returned   : NONE
//*
//* Register Usage    : x0-x7
//* Stack Usage       :
//* Cycles            : Around
//* Interruptibility  : Interruptible
//*
//* Known Limitations
//*   \Assumptions    :
//*
//* Revision History  :
//*         DD MM YYYY    Author(s)           Changes
//*         14 04 2014    Harinarayanan K K   First version
//*
//*****************************************************************************
    .global ime_compute_satqd_16x16_lumainter_av8
ime_compute_satqd_16x16_lumainter_av8:
    //x0 :pointer to src buffer
    //x1 :pointer to est buffer
    //x2 :Source stride
    //x3 :Pred stride
    //x4 :Threshold pointer
    //x5 :Distortion,ie SAD
    //x6 :is nonzero
    //x7 :loop counter
    push_v_regs
    stp       d8, d9, [sp, #-16]!
    stp       d10, d11, [sp, #-16]!
    stp       d12, d13, [sp, #-16]!
    stp       d14, d15, [sp, #-16]!

    // Strides are used as 64-bit post-index offsets; the upper 32 bits of a
    // 32-bit argument register are unspecified, so extend them first.
    // NOTE(review): assumes the C prototype passes 32-bit integer strides.
    sxtw      x2, w2
    sxtw      x3, w3

    ld1       {v30.8h}, [x4]            // first eight thresholds

    dup       v20.4h, v30.h[1]          //ls1
    dup       v24.4h, v30.h[0]          //ls2
    dup       v21.4h, v30.h[5]          //ls3
    dup       v25.4h, v30.h[7]          //ls4
    dup       v22.4h, v30.h[3]          //ls5
    dup       v26.4h, v30.h[4]          //ls6
    dup       v23.4h, v30.h[6]          //ls7
    dup       v27.4h, v30.h[2]          //ls8

    mov       v20.d[1], v24.d[0]
    mov       v21.d[1], v25.d[0]
    mov       v22.d[1], v26.d[0]
    mov       v23.d[1], v27.d[0]

    add       x4, x4, #16
    ld1       {v29.h}[0], [x4]          // ninth threshold, compared against the 4x4 SAD
    dup       v29.4h, v29.h[0]

    movi      v31.8h, #0                // running SAD

    mov       x7, #4                    // four bands of four rows
core_loop_satqd_ime_compute_satqd_16x16_lumainter:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabdl     v10.8h, v0.8b, v1.8b
    uabdl2    v15.8h, v0.16b, v1.16b
    uabdl     v11.8h, v2.8b, v3.8b
    uabdl2    v16.8h, v2.16b, v3.16b
    uabdl     v12.8h, v4.8b, v5.8b
    uabdl2    v17.8h, v4.16b, v5.16b
    uabdl     v13.8h, v6.8b, v7.8b
    uabdl2    v18.8h, v6.16b, v7.16b

    add       v0.8h, v10.8h, v13.8h
    add       v1.8h, v11.8h, v12.8h
    add       v2.8h, v15.8h, v18.8h
    add       v3.8h, v16.8h, v17.8h

    //v0 : S1 S4 S4 S1 A1 A4 A4 A1
    //v1 : S2 S3 S3 S2 A2 A3 A3 A2
    //v2 : B1 B4 B4 B1 X1 X4 X4 X1
    //v3 : B3 B2 B2 B3 X3 X2 X2 X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    trn1      v6.8h, v2.8h, v3.8h
    trn2      v7.8h, v2.8h, v3.8h

    trn1      v0.4s, v4.4s, v6.4s
    trn2      v2.4s, v4.4s, v6.4s
    trn1      v1.4s, v5.4s, v7.4s
    trn2      v3.4s, v5.4s, v7.4s

    add       v4.8h, v0.8h, v3.8h
    add       v5.8h, v1.8h, v2.8h
    //v4 : S1 S2 B1 B2 A1 A2 X1 X2
    //v5 : S4 S3 B4 B3 A4 A3 X4 X3

    //compute sad for each 4x4 block
    add       v6.8h, v4.8h, v5.8h
    addp      v19.8h, v6.8h, v6.8h
    //duplicate the sad into 128 bit so that we can compare using 128bit
    add       v31.4h, v31.4h, v19.4h

    //sad_2 = sad_1<<1;
    shl       v28.8h, v19.8h, #1

    //sad_2 - pu2_thrsh
    sub       v24.8h, v28.8h, v20.8h
    sub       v25.8h, v28.8h, v21.8h
    sub       v26.8h, v28.8h, v22.8h
    sub       v27.8h, v28.8h, v23.8h

    trn1      v0.4s, v4.4s, v5.4s
    trn2      v1.4s, v4.4s, v5.4s
    //v0 : S1 S2 S4 S3 A1 A2 A4 A3
    //v1 : B1 B2 B4 B3 X1 X2 X4 X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    //v4 : S1 B1 S4 B4 A1 X1 A4 X4
    //v5 : S2 B2 S3 B3 A2 X2 A3 X3

    mov       v7.s[0], v4.s[1]
    mov       v7.s[1], v4.s[3]
    mov       v6.s[0], v5.s[1]          // V4 //S1 B1 A1 X1
    mov       v6.s[1], v5.s[3]          // V5 //S2 B2 A2 X2
    mov       v4.s[1], v4.s[2]          // V6 //S3 B3 A3 X3
    mov       v5.s[1], v5.s[2]          // V7 //S4 B4 A4 X4

    shl       v0.4h, v4.4h, #1          //S1<<1
    shl       v1.4h, v5.4h, #1          //S2<<1
    shl       v2.4h, v6.4h, #1          //S3<<1
    shl       v3.4h, v7.4h, #1          //S4<<1

    add       v8.4h, v5.4h, v6.4h       //(s2[j] + s3[j]))
    add       v9.4h, v4.4h, v7.4h       //(s1[j] + s4[j]))
    add       v10.4h, v6.4h, v7.4h      //(s3[j] + s4[j]))
    sub       v11.4h, v6.4h, v0.4h      //(s3[j] - (s1[j]<<1))
    sub       v12.4h, v7.4h, v1.4h      //(s4[j] - (s2[j]<<1))
    add       v13.4h, v4.4h, v5.4h      //(s1[j] + s2[j]))
    sub       v14.4h, v5.4h, v3.4h      //(s2[j] - (s4[j]<<1)))
    sub       v15.4h, v4.4h, v2.4h      //(s1[j] - (s3[j]<<1)))

    mov       v8.d[1], v9.d[0]
    mov       v10.d[1], v11.d[0]
    mov       v12.d[1], v13.d[0]
    mov       v14.d[1], v15.d[0]

    cmge      v0.8h, v24.8h, v8.8h      //ls1 ls2
    cmge      v1.8h, v25.8h, v10.8h     //ls3 ls4
    cmge      v2.8h, v26.8h, v12.8h     //ls5 ls6
    cmge      v3.8h, v27.8h, v14.8h     //ls7 ls8
    cmge      v4.4h, v19.4h, v29.4h     //sad

    orr       v0.16b, v0.16b, v1.16b
    orr       v2.16b, v0.16b, v2.16b
    orr       v2.16b, v2.16b, v3.16b
    xtn       v2.8b, v2.8h
    orr       v2.8b, v2.8b, v4.8b

    //if the comparison is non zero, out
    mov       x4, v2.d[0]               // x4 doubles as the "found nonzero" flag from here on
    cmp       x4, #0
    bne       core_loop_compute_sad_pre

    subs      x7, x7, #1
    bne       core_loop_satqd_ime_compute_satqd_16x16_lumainter
    b         satdq_end_func


    // Plain SAD over the remaining four-row bands once a nonzero block is found.
core_loop_compute_sad:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3

    uabal     v31.8h, v0.8b, v1.8b
    uabal2    v31.8h, v0.16b, v1.16b

    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b

    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

core_loop_compute_sad_pre:
    subs      x7, x7, #1                // accounts for the band just processed by the satqd loop
    bne       core_loop_compute_sad

satdq_end_func:

    mov       x7, #1
    cmp       x4, #0
    csel      x7, x4, x7, eq            // is_nonzero = (x4 == 0) ? 0 : 1
    str       w7, [x6]

    addp      v31.8h, v31.8h, v31.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s
    st1       {v31.s}[0], [x5]


    ldp       d14, d15, [sp], #16
    ldp       d12, d13, [sp], #16
    ldp       d10, d11, [sp], #16
    ldp       d8, d9, [sp], #16
    pop_v_regs
    ret
    .section .note.gnu-stack,"",%progbits
diff --git a/encoder/armv8/ime_platform_macros.h b/encoder/armv8/ime_platform_macros.h
new file mode 100755
index 0000000..0f5b2f2
--- /dev/null
+++ b/encoder/armv8/ime_platform_macros.h
@@ -0,0 +1,51 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ime_platform_macros.h +* +* @brief +* Platform specific Macro definitions used in the codec +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef _IME_PLATFORM_MACROS_H_ +#define _IME_PLATFORM_MACROS_H_ + +/*****************************************************************************/ +/* Function macro definitions */ +/*****************************************************************************/ + +#define USADA8(src,est,sad) \ + sad += ABS(src[0]-est[0]) + \ + ABS(src[1]-est[1]) + \ + ABS(src[2]-est[2]) + \ + ABS(src[3]-est[3]) + + +#endif /* _IH264_PLATFORM_MACROS_H_ */ diff --git a/encoder/ih264e.h b/encoder/ih264e.h new file mode 100755 index 0000000..15a9d8f --- /dev/null +++ b/encoder/ih264e.h @@ -0,0 +1,620 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ih264e.h */ +/* */ +/* Description : This file contains all the necessary structure and */ +/* enumeration definitions needed for the Application */ +/* Program Interface(API) of the Ittiam H.264 */ +/* Encoder on Cortex A8 - Neon platform */ +/* */ +/* List of Functions : ih264e_api_function */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 26 08 2010 100239(RCY) Draft */ +/* */ +/*****************************************************************************/ + +#ifndef _IH264E_H_ +#define _IH264E_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "iv2.h" +#include "ive2.h" +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + + +/*****************************************************************************/ +/* API Function Prototype */ +/*****************************************************************************/ +IV_STATUS_T ih264e_api_function(iv_obj_t *ps_handle, void *pv_api_ip,void *pv_api_op); + +/*****************************************************************************/ +/* Enums */ +/*****************************************************************************/ +typedef enum +{ + IH264E_CMD_CTL_SET_ME_INFO_ENABLE, +}IH264E_CMD_CTL_SUB_CMDS; + + +/*****************************************************************************/ +/* Extended Structures */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Get Number of Memory Records */ +/*****************************************************************************/ + + +typedef struct +{ + iv_num_mem_rec_ip_t s_ive_ip; 
+}ih264e_num_mem_rec_ip_t; + + +typedef struct +{ + iv_num_mem_rec_op_t s_ive_op; +}ih264e_num_mem_rec_op_t; + + +/*****************************************************************************/ +/* Fill Memory Records */ +/*****************************************************************************/ + + +typedef struct +{ + iv_fill_mem_rec_ip_t s_ive_ip; +}ih264e_fill_mem_rec_ip_t; + + +typedef struct +{ + iv_fill_mem_rec_op_t s_ive_op; +}ih264e_fill_mem_rec_op_t; + +/*****************************************************************************/ +/* Retrieve Memory Records */ +/*****************************************************************************/ + + +typedef struct +{ + iv_retrieve_mem_rec_ip_t s_ive_ip; +}ih264e_retrieve_mem_rec_ip_t; + + +typedef struct +{ + iv_retrieve_mem_rec_op_t s_ive_op; +}ih264e_retrieve_mem_rec_op_t; + + +/*****************************************************************************/ +/* Initialize encoder */ +/*****************************************************************************/ + +typedef struct +{ + ive_init_ip_t s_ive_ip; +}ih264e_init_ip_t; + + +typedef struct +{ + ive_init_op_t s_ive_op; +}ih264e_init_op_t; + + +/*****************************************************************************/ +/* Queue Input raw buffer - Send the YUV buffer to be encoded */ +/*****************************************************************************/ +typedef struct +{ + ive_queue_inp_ip_t s_ive_ip; +}ih264e_queue_inp_ip_t; + +typedef struct +{ + ive_queue_inp_op_t s_ive_op; +}ih264e_queue_inp_op_t; + +/*****************************************************************************/ +/* Dequeue Input raw buffer - Get free YUV buffer from the encoder */ +/*****************************************************************************/ +typedef struct +{ + ive_dequeue_inp_ip_t s_ive_ip; +}ih264e_dequeue_inp_ip_t; + +typedef struct +{ + ive_dequeue_inp_op_t s_ive_op; +}ih264e_dequeue_inp_op_t; + + 
+/*****************************************************************************/ +/* Queue Output bitstream buffer - Send the bitstream buffer to be filled */ +/*****************************************************************************/ +typedef struct +{ + ive_queue_out_ip_t s_ive_ip; +}ih264e_queue_out_ip_t; + +typedef struct +{ + ive_queue_out_op_t s_ive_op; +}ih264e_queue_out_op_t; + +/*****************************************************************************/ +/* Dequeue Output bitstream buffer - Get the bitstream buffer filled */ +/*****************************************************************************/ +typedef struct +{ + ive_dequeue_out_ip_t s_ive_ip; +}ih264e_dequeue_out_ip_t; + +typedef struct +{ + ive_dequeue_out_op_t s_ive_op; +}ih264e_dequeue_out_op_t; + + +/*****************************************************************************/ +/* Get Recon data - Get the reconstructed data from encoder */ +/*****************************************************************************/ +typedef struct +{ + ive_get_recon_ip_t s_ive_ip; +}ih264e_get_recon_ip_t; + +typedef struct +{ + ive_get_recon_op_t s_ive_op; +}ih264e_get_recon_op_t; +/*****************************************************************************/ +/* Video control Flush */ +/*****************************************************************************/ + + +typedef struct +{ + ive_ctl_flush_ip_t s_ive_ip; +}ih264e_ctl_flush_ip_t; + + +typedef struct +{ + ive_ctl_flush_op_t s_ive_op; +}ih264e_ctl_flush_op_t; + +/*****************************************************************************/ +/* Video control reset */ +/*****************************************************************************/ + + +typedef struct +{ + ive_ctl_reset_ip_t s_ive_ip; +}ih264e_ctl_reset_ip_t; + + +typedef struct +{ + ive_ctl_reset_op_t s_ive_op; +}ih264e_ctl_reset_op_t; + + +/*****************************************************************************/ +/* Video control:Get Buf Info */ 
+/*****************************************************************************/ + + +typedef struct +{ + ive_ctl_getbufinfo_ip_t s_ive_ip; +}ih264e_ctl_getbufinfo_ip_t; + + + +typedef struct +{ + ive_ctl_getbufinfo_op_t s_ive_op; +}ih264e_ctl_getbufinfo_op_t; + + + +/*****************************************************************************/ +/* Video control:Get Version Info */ +/*****************************************************************************/ + + +typedef struct +{ + ive_ctl_getversioninfo_ip_t s_ive_ip; +}ih264e_ctl_getversioninfo_ip_t; + + + +typedef struct +{ + ive_ctl_getversioninfo_op_t s_ive_op; +}ih264e_ctl_getversioninfo_op_t; + +/*****************************************************************************/ +/* Video control:Set default params */ +/*****************************************************************************/ + + +typedef struct +{ + ive_ctl_setdefault_ip_t s_ive_ip; +}ih264e_ctl_setdefault_ip_t; + + + +typedef struct +{ + ive_ctl_setdefault_op_t s_ive_op; +}ih264e_ctl_setdefault_op_t; + +/*****************************************************************************/ +/* Video control Set IPE params */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_ipe_params_ip_t s_ive_ip; +}ih264e_ctl_set_ipe_params_ip_t; + +typedef struct +{ + ive_ctl_set_ipe_params_op_t s_ive_op; +}ih264e_ctl_set_ipe_params_op_t; + +/*****************************************************************************/ +/* Video control Set Frame dimensions */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_dimensions_ip_t s_ive_ip; +}ih264e_ctl_set_dimensions_ip_t; + +typedef struct +{ + ive_ctl_set_dimensions_op_t s_ive_op; +}ih264e_ctl_set_dimensions_op_t; + +/*****************************************************************************/ +/* Video control Set Frame rates */ 
+/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_frame_rate_ip_t s_ive_ip; +}ih264e_ctl_set_frame_rate_ip_t; +typedef struct +{ + ive_ctl_set_frame_rate_op_t s_ive_op; +}ih264e_ctl_set_frame_rate_op_t; + + +/*****************************************************************************/ +/* Video control Set Bitrate */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_bitrate_ip_t s_ive_ip; +}ih264e_ctl_set_bitrate_ip_t; + +typedef struct +{ + ive_ctl_set_bitrate_op_t s_ive_op; +}ih264e_ctl_set_bitrate_op_t; + + +/*****************************************************************************/ +/* Video control Set Frame type */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_frame_type_ip_t s_ive_ip; +}ih264e_ctl_set_frame_type_ip_t; + +typedef struct +{ + ive_ctl_set_frame_type_op_t s_ive_op; +}ih264e_ctl_set_frame_type_op_t; + +/*****************************************************************************/ +/* Video control Set Encode mode */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_enc_mode_ip_t s_ive_ip; +}ih264e_ctl_set_enc_mode_ip_t; + +typedef struct +{ + ive_ctl_set_enc_mode_op_t s_ive_op; +}ih264e_ctl_set_enc_mode_op_t; + +/*****************************************************************************/ +/* Video control Set QP */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_qp_ip_t s_ive_ip; +}ih264e_ctl_set_qp_ip_t; + +typedef struct +{ + ive_ctl_set_qp_op_t s_ive_op; +}ih264e_ctl_set_qp_op_t; + +/*****************************************************************************/ +/* Video control Set AIR params */ +/*****************************************************************************/ +typedef struct +{ + 
ive_ctl_set_air_params_ip_t s_ive_ip; +}ih264e_ctl_set_air_params_ip_t; + +typedef struct +{ + ive_ctl_set_air_params_op_t s_ive_op; +}ih264e_ctl_set_air_params_op_t; + +/*****************************************************************************/ +/* Video control Set VBV params */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_vbv_params_ip_t s_ive_ip; +}ih264e_ctl_set_vbv_params_ip_t; + +typedef struct +{ + ive_ctl_set_vbv_params_op_t s_ive_op; +}ih264e_ctl_set_vbv_params_op_t; + +/*****************************************************************************/ +/* Video control Set Processor Details */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_num_cores_ip_t s_ive_ip; +}ih264e_ctl_set_num_cores_ip_t; + +typedef struct +{ + ive_ctl_set_num_cores_op_t s_ive_op; +}ih264e_ctl_set_num_cores_op_t; + +/*****************************************************************************/ +/* Video control Set Motion estimation params */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_me_params_ip_t s_ive_ip; +}ih264e_ctl_set_me_params_ip_t; + +typedef struct +{ + ive_ctl_set_me_params_op_t s_ive_op; +}ih264e_ctl_set_me_params_op_t; + +/*****************************************************************************/ +/* Video control Set GOP params */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_gop_params_ip_t s_ive_ip; +}ih264e_ctl_set_gop_params_ip_t; + +typedef struct +{ + ive_ctl_set_gop_params_op_t s_ive_op; +}ih264e_ctl_set_gop_params_op_t; + +/*****************************************************************************/ +/* Video control Set Deblock params */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_deblock_params_ip_t s_ive_ip; 
+}ih264e_ctl_set_deblock_params_ip_t; + +typedef struct +{ + ive_ctl_set_deblock_params_op_t s_ive_op; +}ih264e_ctl_set_deblock_params_op_t; + +/*****************************************************************************/ +/* Video control Set Profile params */ +/*****************************************************************************/ +typedef struct +{ + ive_ctl_set_profile_params_ip_t s_ive_ip; +}ih264e_ctl_set_profile_params_ip_t; + +typedef struct +{ + ive_ctl_set_profile_params_op_t s_ive_op; +}ih264e_ctl_set_profile_params_op_t; + +/*****************************************************************************/ +/* Synchronous video encode call */ +/*****************************************************************************/ +typedef struct +{ + ive_video_encode_ip_t s_ive_ip; +}ih264e_video_encode_ip_t; + +typedef struct +{ + ive_video_encode_op_t s_ive_op; +}ih264e_video_encode_op_t; + + +/* The enum values should not have greater than 8 bits as this is assigned to WORD8 */ +typedef enum +{ + INTRA16x16 = 0, + INTRA4x4, + INTER16x16 +}IV_MB_TYPE_T; + +/*****************************************************************************/ +/* Pic info structures */ +/*****************************************************************************/ +typedef struct +{ + /** Qp */ + UWORD32 u4_qp; + + /** Pic Type */ + IV_PICTURE_CODING_TYPE_T e_frame_type; + +}ih264e_pic_info1_t; + +typedef struct +{ + /** Qp */ + UWORD32 u4_qp; + + /** Pic Type */ + IV_PICTURE_CODING_TYPE_T e_frame_type; + + /** Disable deblock level (0: Enable completely, 3: Disable completely) */ + UWORD32 u4_disable_deblock_level; + +}ih264e_pic_info2_t; + + +/*****************************************************************************/ +/* MB info structures */ +/*****************************************************************************/ +typedef struct +{ + /** MV X */ + WORD16 i2_mv_x; + + /** MV Y */ + WORD16 i2_mv_y; +}ih264e_mv_t; + +typedef struct +{ + /** Intra / Inter */ + WORD8 
i1_mb_type; + union + { + ih264e_mv_t as_mv[1]; + + /** Intra mode */ + WORD8 ai1_intra_mode[1]; + }; +}ih264e_mb_info1_t; + +typedef struct +{ + /** Intra / Inter */ + WORD8 i1_mb_type; + + + /** SAD */ + UWORD16 u2_sad; + + union + { + ih264e_mv_t as_mv[1]; + + /** Intra mode */ + WORD8 ai1_intra_mode[1]; + }; + + +}ih264e_mb_info2_t; + +typedef struct +{ + /** Intra / Inter */ + WORD8 i1_mb_type; + + union + { + ih264e_mv_t as_mv[4]; + + /** Intra mode */ + WORD8 ai1_intra_mode[16]; + }; + +}ih264e_mb_info3_t; + +typedef struct +{ + /** Intra / Inter */ + WORD8 i1_mb_type; + + /** Intra Mode */ + WORD8 i1_intra_mode; + + /** SAD */ + UWORD16 u2_sad; + + union + { + ih264e_mv_t as_mv[16]; + + /** Intra mode */ + WORD8 ai1_intra_mode[16]; + }; + + + +}ih264e_mb_info4_t; + +/* Add any new structures to the following union. It is used to calculate the max size needed for allocation of memory */ +typedef struct +{ + union + { + ih264e_mb_info1_t s_mb_info1; + ih264e_mb_info2_t s_mb_info2; + ih264e_mb_info3_t s_mb_info3; + ih264e_mb_info4_t s_mb_info4; + }; +}ih264e_mb_info_t; + +#ifdef __cplusplus +} /* closing brace for extern "C" */ +#endif +#endif /* _IH264E_H_ */ diff --git a/encoder/ih264e_api.c b/encoder/ih264e_api.c new file mode 100755 index 0000000..e5c66ea --- /dev/null +++ b/encoder/ih264e_api.c @@ -0,0 +1,5559 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_api.c +* +* @brief +* Contains api function definitions for H264 encoder +* +* @author +* ittiam +* +* @par List of Functions: +* - api_check_struct_sanity() +* - ih264e_codec_update_config() +* - ih264e_set_default_params() +* - ih264e_init() +* - ih264e_get_num_rec() +* - ih264e_fill_num_mem_rec() +* - ih264e_init_mem_rec() +* - ih264e_retrieve_memrec() +* - ih264e_set_flush_mode() +* - ih264e_get_buf_info() +* - ih264e_set_dimensions() +* - ih264e_set_frame_rate() +* - ih264e_set_bit_rate() +* - ih264e_set_frame_type() +* - ih264e_set_qp() +* - ih264e_set_enc_mode() +* - ih264e_set_vbv_params() +* - ih264_set_air_params() +* - ih264_set_me_params() +* - ih264_set_ipe_params() +* - ih264_set_gop_params() +* - ih264_set_profile_params() +* - ih264_set_deblock_params() +* - ih264e_set_num_cores() +* - ih264e_reset() +* - ih264e_ctl() +* - ih264e_api_function() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include Files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +/* User Include Files */ +#include "ih264e_config.h" +#include "ih264_typedefs.h" +#include "ih264_size_defs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264e.h" +#include "ithread.h" +#include "ih264_debug.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include 
"ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_macros.h" +#include "ih264e_defs.h" +#include "ih264e_globals.h" +#include "ih264_buf_mgr.h" +#include "irc_mem_req_and_acq.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "irc_rate_control_api.h" +#include "ih264e_time_stamp.h" +#include "ih264e_modify_frm_rate.h" +#include "ih264e_rate_control.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_defs.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264e_structs.h" +#include "ih264e_utils.h" +#include "ih264e_core_coding.h" +#include "ih264_buf_mgr.h" +#include "ih264_platform_macros.h" +#include "ih264e_platform_macros.h" +#include "ih264_list.h" +#include "ih264_dpb_mgr.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264_common_tables.h" +#include "ih264e_master.h" +#include "ih264e_fmt_conv.h" +#include "ih264e_version.h" + + +/*****************************************************************************/ +/* Function Declarations */ +/*****************************************************************************/ +WORD32 ih264e_get_rate_control_mem_tab(void *pv_rate_control, + iv_mem_rec_t *ps_mem, + ITT_FUNC_TYPE_E e_func_type); + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* Used to test arguments for corresponding API call +* +* @par Description: +* For each command the arguments are validated +* +* @param[in] ps_handle +* Codec handle at API level +* +* @param[in] pv_api_ip +* Pointer to input structure +* +* @param[out] 
pv_api_op +* Pointer to output structure +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T api_check_struct_sanity(iv_obj_t *ps_handle, + void *pv_api_ip, + void *pv_api_op) +{ + /* api call */ + WORD32 command = IV_CMD_NA; + + /* input structure expected by the api call */ + UWORD32 *pu4_api_ip = pv_api_ip; + + /* output structure expected by the api call */ + UWORD32 *pu4_api_op = pv_api_op; + + /* temp var */ + WORD32 i, j; + + if (NULL == pv_api_op || NULL == pv_api_ip) + { + return (IV_FAIL); + } + + /* get command */ + command = pu4_api_ip[1]; + + /* set error code */ + pu4_api_op[1] = 0; + + /* error checks on handle */ + switch (command) + { + case IV_CMD_GET_NUM_MEM_REC: + case IV_CMD_FILL_NUM_MEM_REC: + break; + + case IV_CMD_INIT: + if (ps_handle == NULL) + { + *(pu4_api_op + 1) |= 1 << IVE_UNSUPPORTEDPARAM; + *(pu4_api_op + 1) |= IVE_ERR_HANDLE_NULL; + return IV_FAIL; + } + + if (ps_handle->u4_size != sizeof(iv_obj_t)) + { + *(pu4_api_op + 1) |= 1 << IVE_UNSUPPORTEDPARAM; + *(pu4_api_op + 1) |= IVE_ERR_HANDLE_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + break; + + case IVE_CMD_QUEUE_INPUT: + case IVE_CMD_QUEUE_OUTPUT: + case IVE_CMD_DEQUEUE_OUTPUT: + case IVE_CMD_GET_RECON: + case IV_CMD_RETRIEVE_MEMREC: + case IVE_CMD_VIDEO_CTL: + case IVE_CMD_VIDEO_ENCODE: + + if (ps_handle == NULL) + { + *(pu4_api_op + 1) |= 1 << IVE_UNSUPPORTEDPARAM; + *(pu4_api_op + 1) |= IVE_ERR_HANDLE_NULL; + return IV_FAIL; + } + + if (ps_handle->u4_size != sizeof(iv_obj_t)) + { + *(pu4_api_op + 1) |= 1 << IVE_UNSUPPORTEDPARAM; + *(pu4_api_op + 1) |= IVE_ERR_HANDLE_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_handle->pv_fxns != ih264e_api_function) + { + *(pu4_api_op + 1) |= 1 << IVE_UNSUPPORTEDPARAM; + *(pu4_api_op + 1) |= IVE_ERR_API_FUNCTION_PTR_NULL; + return IV_FAIL; + } + + if (ps_handle->pv_codec_handle == NULL) + { + *(pu4_api_op + 1) |= 1 << 
IVE_UNSUPPORTEDPARAM; + *(pu4_api_op + 1) |= IVE_ERR_INVALID_CODEC_HANDLE; + return IV_FAIL; + } + break; + + default: + *(pu4_api_op + 1) |= 1 << IVE_UNSUPPORTEDPARAM; + *(pu4_api_op + 1) |= IVE_ERR_INVALID_API_CMD; + return IV_FAIL; + } + + /* error checks on input output structures */ + switch (command) + { + case IV_CMD_GET_NUM_MEM_REC: + { + ih264e_num_mem_rec_ip_t *ps_ip = pv_api_ip; + ih264e_num_mem_rec_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + if (ps_ip->s_ive_ip.u4_size != sizeof(ih264e_num_mem_rec_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_GET_MEM_REC_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + + if (ps_op->s_ive_op.u4_size != sizeof(ih264e_num_mem_rec_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_GET_MEM_REC_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + break; + } + + case IV_CMD_FILL_NUM_MEM_REC: + { + ih264e_fill_mem_rec_ip_t *ps_ip = pv_api_ip; + ih264e_fill_mem_rec_op_t *ps_op = pv_api_op; + + iv_mem_rec_t *ps_mem_rec = NULL; + + WORD32 max_wd = ALIGN16(ps_ip->s_ive_ip.u4_max_wd); + WORD32 max_ht = ALIGN16(ps_ip->s_ive_ip.u4_max_ht); + + ps_op->s_ive_op.u4_error_code = 0; + + if (ps_ip->s_ive_ip.u4_size != sizeof(ih264e_fill_mem_rec_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_FILL_MEM_REC_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + + if (ps_op->s_ive_op.u4_size != sizeof(ih264e_fill_mem_rec_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_FILL_MEM_REC_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + + if (max_wd < MIN_WD || max_wd > MAX_WD) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_WIDTH_NOT_SUPPORTED; + return (IV_FAIL); + } + + if 
(max_ht < MIN_HT || max_ht > MAX_HT) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_HEIGHT_NOT_SUPPORTED; + return (IV_FAIL); + } + + /* verify number of mem rec ptr */ + if (NULL == ps_ip->s_ive_ip.ps_mem_rec) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_FILL_NUM_MEM_RECS_POINTER_NULL; + return (IV_FAIL); + } + + /* verify number of mem records */ + if (ps_ip->s_ive_ip.u4_num_mem_rec != MEM_REC_CNT) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_NUM_MEM_REC_NOT_SUFFICIENT; + return IV_FAIL; + } + + /* check mem records sizes are correct */ + ps_mem_rec = ps_ip->s_ive_ip.ps_mem_rec; + for (i = 0; i < MEM_REC_CNT; i++) + { + if (ps_mem_rec[i].u4_size != sizeof(iv_mem_rec_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_MEM_REC_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + } + break; + } + + case IV_CMD_INIT: + { + ih264e_init_ip_t *ps_ip = pv_api_ip; + ih264e_init_op_t *ps_op = pv_api_op; + + iv_mem_rec_t *ps_mem_rec = NULL; + + WORD32 max_wd = ALIGN16(ps_ip->s_ive_ip.u4_max_wd); + WORD32 max_ht = ALIGN16(ps_ip->s_ive_ip.u4_max_ht); + + ps_op->s_ive_op.u4_error_code = 0; + + if (ps_ip->s_ive_ip.u4_size != sizeof(ih264e_init_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_INIT_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + + if (ps_op->s_ive_op.u4_size != sizeof(ih264e_init_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_INIT_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + + if (max_wd < MIN_WD || max_wd > MAX_WD) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_WIDTH_NOT_SUPPORTED; + return (IV_FAIL); 
+ } + + if (max_ht < MIN_HT || max_ht > MAX_HT) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_HEIGHT_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_ref_cnt != 1) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_NUM_REF_UNSUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_reorder_cnt != 0) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_NUM_REORDER_UNSUPPORTED; + return (IV_FAIL); + } + + if ((ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_10) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_1B) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_11) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_12) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_13) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_20) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_21) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_22) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_30) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_31) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_32) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_40) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_41) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_42) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_50) + && (ps_ip->s_ive_ip.u4_max_level != IH264_LEVEL_51)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_CODEC_LEVEL_NOT_SUPPORTED; + return (IV_FAIL); + } + + if ((ps_ip->s_ive_ip.e_inp_color_fmt != IV_YUV_420P) + && (ps_ip->s_ive_ip.e_inp_color_fmt != IV_YUV_422ILE) + && (ps_ip->s_ive_ip.e_inp_color_fmt != IV_YUV_420SP_UV) + && (ps_ip->s_ive_ip.e_inp_color_fmt != IV_YUV_420SP_VU)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + 
IH264E_INPUT_CHROMA_FORMAT_NOT_SUPPORTED; + return (IV_FAIL); + } + + if ((ps_ip->s_ive_ip.e_recon_color_fmt != IV_YUV_420P) + && (ps_ip->s_ive_ip.e_recon_color_fmt != IV_YUV_420SP_UV) + && (ps_ip->s_ive_ip.e_recon_color_fmt != IV_YUV_420SP_VU)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_RECON_CHROMA_FORMAT_NOT_SUPPORTED; + return (IV_FAIL); + } + + if ((ps_ip->s_ive_ip.e_rc_mode != IVE_RC_NONE) + && (ps_ip->s_ive_ip.e_rc_mode != IVE_RC_STORAGE) + && (ps_ip->s_ive_ip.e_rc_mode != IVE_RC_CBR_NON_LOW_DELAY)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_RATE_CONTROL_MODE_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_framerate > DEFAULT_MAX_FRAMERATE) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_FRAME_RATE_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_bitrate > DEFAULT_MAX_BITRATE) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_BITRATE_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_num_bframes != 0) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_BFRAMES_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.e_content_type != IV_PROGRESSIVE) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_CONTENT_TYPE_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_srch_rng_x > DEFAULT_MAX_SRCH_RANGE_X) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_HORIZONTAL_SEARCH_RANGE_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_srch_rng_y > DEFAULT_MAX_SRCH_RANGE_Y) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + 
ps_op->s_ive_op.u4_error_code |= + IH264E_VERTICAL_SEARCH_RANGE_NOT_SUPPORTED; + return (IV_FAIL); + } + + if ((ps_ip->s_ive_ip.e_slice_mode != IVE_SLICE_MODE_NONE) + && (ps_ip->s_ive_ip.e_slice_mode != IVE_SLICE_MODE_BLOCKS)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_SLICE_TYPE_INPUT_INVALID; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.e_slice_mode == IVE_SLICE_MODE_BLOCKS) + { + if (ps_ip->s_ive_ip.u4_slice_param == 0 + || ps_ip->s_ive_ip.u4_slice_param > ((UWORD32)max_ht >> 4)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_SLICE_PARAM_INPUT_INVALID; + return (IV_FAIL); + } + } + + if (NULL == ps_ip->s_ive_ip.ps_mem_rec) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_FILL_NUM_MEM_RECS_POINTER_NULL; + return (IV_FAIL); + } + + /* verify number of mem records */ + if (ps_ip->s_ive_ip.u4_num_mem_rec != MEM_REC_CNT) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_NUM_MEM_REC_NOT_SUFFICIENT; + return (IV_FAIL); + } + + ps_mem_rec = ps_ip->s_ive_ip.ps_mem_rec; + + /* check memrecords sizes are correct */ + for (i = 0; i <((WORD32)ps_ip->s_ive_ip.u4_num_mem_rec); i++) + { + if (ps_mem_rec[i].u4_size != sizeof(iv_mem_rec_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_MEM_REC_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + /* check memrecords pointers are not NULL */ + if (ps_mem_rec[i].pv_base == NULL) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_MEM_REC_BASE_POINTER_NULL; + return IV_FAIL; + } + } + + /* verify memtabs for overlapping regions */ + { + void *start[MEM_REC_CNT]; + void *end[MEM_REC_CNT]; + + start[0] = (ps_mem_rec[0].pv_base); + end[0] = ((UWORD8 *) 
ps_mem_rec[0].pv_base) + + ps_mem_rec[0].u4_mem_size - 1; + + for (i = 1; i < MEM_REC_CNT; i++) + { + /* This array is populated to check memtab overlap */ + start[i] = (ps_mem_rec[i].pv_base); + end[i] = ((UWORD8 *) ps_mem_rec[i].pv_base) + + ps_mem_rec[i].u4_mem_size - 1; + + for (j = 0; j < i; j++) + { + if ((start[i] >= start[j]) && (start[i] <= end[j])) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_MEM_REC_OVERLAP_ERR; + return IV_FAIL; + } + + if ((end[i] >= start[j]) && (end[i] <= end[j])) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_MEM_REC_OVERLAP_ERR; + return IV_FAIL; + } + + if ((start[i] < start[j]) && (end[i] > end[j])) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_MEM_REC_OVERLAP_ERR; + return IV_FAIL; + } + } + } + } + + /* re-validate mem records with init config */ + { + /* mem records */ + iv_mem_rec_t s_mem_rec_ittiam_api[MEM_REC_CNT]; + + /* api interface structs */ + ih264e_fill_mem_rec_ip_t s_ip; + ih264e_fill_mem_rec_op_t s_op; + + /* error status */ + IV_STATUS_T e_status; + + /* temp var */ + WORD32 i; + + s_ip.s_ive_ip.u4_size = sizeof(ih264e_fill_mem_rec_ip_t); + s_op.s_ive_op.u4_size = sizeof(ih264e_fill_mem_rec_op_t); + + s_ip.s_ive_ip.e_cmd = IV_CMD_FILL_NUM_MEM_REC; + s_ip.s_ive_ip.ps_mem_rec = s_mem_rec_ittiam_api; + s_ip.s_ive_ip.u4_max_wd = max_wd; + s_ip.s_ive_ip.u4_max_ht = max_ht; + s_ip.s_ive_ip.u4_num_mem_rec = ps_ip->s_ive_ip.u4_num_mem_rec; + s_ip.s_ive_ip.u4_max_level = ps_ip->s_ive_ip.u4_max_level; + s_ip.s_ive_ip.u4_max_ref_cnt = ps_ip->s_ive_ip.u4_max_ref_cnt; + s_ip.s_ive_ip.u4_max_reorder_cnt = + ps_ip->s_ive_ip.u4_max_reorder_cnt; + s_ip.s_ive_ip.e_color_format = ps_ip->s_ive_ip.e_inp_color_fmt; + s_ip.s_ive_ip.u4_max_srch_rng_x = + ps_ip->s_ive_ip.u4_max_srch_rng_x; + s_ip.s_ive_ip.u4_max_srch_rng_y = + 
ps_ip->s_ive_ip.u4_max_srch_rng_y; + + for (i = 0; i < MEM_REC_CNT; i++) + { + s_mem_rec_ittiam_api[i].u4_size = sizeof(iv_mem_rec_t); + } + + /* fill mem records */ + e_status = ih264e_api_function(NULL, (void *) &s_ip, + (void *) &s_op); + + if (IV_FAIL == e_status) + { + ps_op->s_ive_op.u4_error_code = s_op.s_ive_op.u4_error_code; + return (IV_FAIL); + } + + /* verify mem records */ + for (i = 0; i < MEM_REC_CNT; i++) + { + if (ps_mem_rec[i].u4_mem_size + < s_mem_rec_ittiam_api[i].u4_mem_size) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_MEM_REC_INSUFFICIENT_SIZE; + + return IV_FAIL; + } + + if (ps_mem_rec[i].u4_mem_alignment + != s_mem_rec_ittiam_api[i].u4_mem_alignment) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_MEM_REC_ALIGNMENT_ERR; + + return IV_FAIL; + } + + if (ps_mem_rec[i].e_mem_type + != s_mem_rec_ittiam_api[i].e_mem_type) + { + UWORD32 check = IV_SUCCESS; + UWORD32 diff = s_mem_rec_ittiam_api[i].e_mem_type + - ps_mem_rec[i].e_mem_type; + + if ((ps_mem_rec[i].e_mem_type + <= IV_EXTERNAL_CACHEABLE_SCRATCH_MEM) + && (s_mem_rec_ittiam_api[i].e_mem_type + >= IV_INTERNAL_NONCACHEABLE_PERSISTENT_MEM)) + { + check = IV_FAIL; + } + + if (3 != (s_mem_rec_ittiam_api[i].e_mem_type % 4)) + { + /* It is not IV_EXTERNAL_NONCACHEABLE_PERSISTENT_MEM or + * IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM */ + + if ((diff < 1) || (diff > 3)) + { + /* Difference between 1 and 3 is okay for all cases other than the + * two filtered with the MOD condition above */ + check = IV_FAIL; + } + } + else + { + if (diff == 1) + { + /* This particular case is when codec asked for External Persistent, + * but got Internal Scratch */ + check = IV_FAIL; + } + if ((diff != 2) && (diff != 3)) + { + check = IV_FAIL; + } + } + + if (check == IV_FAIL) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + 
IVE_ERR_MEM_REC_INCORRECT_TYPE; + + return IV_FAIL; + } + } + } + } + break; + } + + case IVE_CMD_QUEUE_INPUT: + case IVE_CMD_QUEUE_OUTPUT: + case IVE_CMD_DEQUEUE_OUTPUT: + case IVE_CMD_GET_RECON: + break; + + case IV_CMD_RETRIEVE_MEMREC: + { + ih264e_retrieve_mem_rec_ip_t *ps_ip = pv_api_ip; + ih264e_retrieve_mem_rec_op_t *ps_op = pv_api_op; + + iv_mem_rec_t *ps_mem_rec = NULL; + + ps_op->s_ive_op.u4_error_code = 0; + + if (ps_ip->s_ive_ip.u4_size != sizeof(ih264e_retrieve_mem_rec_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_RETRIEVE_MEM_REC_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + + if (ps_op->s_ive_op.u4_size != sizeof(ih264e_retrieve_mem_rec_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_RETRIEVE_MEM_REC_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + + if (NULL == ps_ip->s_ive_ip.ps_mem_rec) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_FILL_NUM_MEM_RECS_POINTER_NULL; + return (IV_FAIL); + } + + ps_mem_rec = ps_ip->s_ive_ip.ps_mem_rec; + + /* check memrecords sizes are correct */ + for (i = 0; i < MEM_REC_CNT; i++) + { + if (ps_mem_rec[i].u4_size != sizeof(iv_mem_rec_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_MEM_REC_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + } + break; + } + + case IVE_CMD_VIDEO_ENCODE: + { + ih264e_video_encode_ip_t *ps_ip = pv_api_ip; + ih264e_video_encode_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size != sizeof(ih264e_video_encode_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_ENCODE_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + + if (ps_op->s_ive_op.u4_size != sizeof(ih264e_video_encode_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 << 
IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_ENCODE_API_STRUCT_SIZE_INCORRECT; + return (IV_FAIL); + } + break; + } + + case IVE_CMD_VIDEO_CTL: + { + /* ptr to input structure */ + WORD32 *pu4_ptr_cmd = pv_api_ip; + + /* sub command */ + WORD32 sub_command = pu4_ptr_cmd[2]; + + switch (sub_command) + { + case IVE_CMD_CTL_SETDEFAULT: + { + ih264e_ctl_setdefault_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_setdefault_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_setdefault_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETDEF_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_setdefault_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETDEF_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + break; + } + + case IVE_CMD_CTL_GETBUFINFO: + { + codec_t *ps_codec = (codec_t *) (ps_handle->pv_codec_handle); + + ih264e_ctl_getbufinfo_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_getbufinfo_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_getbufinfo_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_GETBUFINFO_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_getbufinfo_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_GETBUFINFO_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_ip->s_ive_ip.u4_max_wd < MIN_WD) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_WIDTH_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_wd > ps_codec->s_cfg.u4_max_wd) + { + ps_op->s_ive_op.u4_error_code |= 1 + << 
IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_WIDTH_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_ht < MIN_HT) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_HEIGHT_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_max_ht > ps_codec->s_cfg.u4_max_ht) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_HEIGHT_NOT_SUPPORTED; + return (IV_FAIL); + } + + if ((ps_ip->s_ive_ip.e_inp_color_fmt != IV_YUV_420P) + && (ps_ip->s_ive_ip.e_inp_color_fmt != IV_YUV_422ILE) + && (ps_ip->s_ive_ip.e_inp_color_fmt != IV_YUV_420SP_UV) + && (ps_ip->s_ive_ip.e_inp_color_fmt != IV_YUV_420SP_VU)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INPUT_CHROMA_FORMAT_NOT_SUPPORTED; + return (IV_FAIL); + } + break; + } + + case IVE_CMD_CTL_GETVERSION: + { + ih264e_ctl_getversioninfo_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_getversioninfo_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_getversioninfo_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_GETVERSION_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_getversioninfo_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_GETVERSION_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_ip->s_ive_ip.pu1_version == NULL) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_CTL_GET_VERSION_BUFFER_IS_NULL; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_FLUSH: + { + ih264e_ctl_flush_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_flush_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != 
sizeof(ih264e_ctl_flush_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_FLUSH_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_flush_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_FLUSH_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_RESET: + { + ih264e_ctl_reset_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_reset_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_reset_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_RESET_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_reset_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_RESET_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_SET_NUM_CORES: + { + ih264e_ctl_set_num_cores_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_num_cores_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_num_cores_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETCORES_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_num_cores_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETCORES_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_num_cores < 1) + || (ps_ip->s_ive_ip.u4_num_cores > MAX_NUM_CORES)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_NUM_CORES; + return IV_FAIL; + } + + break; + 
} + + case IVE_CMD_CTL_SET_DIMENSIONS: + { + codec_t *ps_codec = (codec_t *) (ps_handle->pv_codec_handle); + + ih264e_ctl_set_dimensions_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_dimensions_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_dimensions_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETDIM_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_dimensions_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETDIM_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_ip->s_ive_ip.u4_wd < MIN_WD) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_WIDTH_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_wd > ps_codec->s_cfg.u4_max_wd) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_WIDTH_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_ht < MIN_HT) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_HEIGHT_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_ht > ps_codec->s_cfg.u4_max_ht) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_HEIGHT_NOT_SUPPORTED; + return (IV_FAIL); + } + + break; + } + + case IVE_CMD_CTL_SET_FRAMERATE: + { + ih264e_ctl_set_frame_rate_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_frame_rate_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_frame_rate_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETFRAMERATE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if 
(ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_frame_rate_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETFRAMERATE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (((ps_ip->s_ive_ip.u4_src_frame_rate * 1000) > DEFAULT_MAX_FRAMERATE) + || ((ps_ip->s_ive_ip.u4_tgt_frame_rate * 1000) > DEFAULT_MAX_FRAMERATE)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_FRAME_RATE_NOT_SUPPORTED; + return (IV_FAIL); + } + + if ((ps_ip->s_ive_ip.u4_src_frame_rate == 0) + || (ps_ip->s_ive_ip.u4_tgt_frame_rate == 0)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_FRAME_RATE_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_tgt_frame_rate + > ps_ip->s_ive_ip.u4_src_frame_rate) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_TGT_FRAME_RATE_EXCEEDS_SRC_FRAME_RATE; + return (IV_FAIL); + } + + break; + } + + case IVE_CMD_CTL_SET_BITRATE: + { + ih264e_ctl_set_bitrate_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_bitrate_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_bitrate_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETBITRATE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_bitrate_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETBITRATE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_target_bitrate > DEFAULT_MAX_BITRATE) + || (ps_ip->s_ive_ip.u4_target_bitrate == 0)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_BITRATE_NOT_SUPPORTED; + return 
(IV_FAIL); + } + + break; + } + + case IVE_CMD_CTL_SET_FRAMETYPE: + { + ih264e_ctl_set_frame_type_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_frame_type_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_frame_type_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETFRAMETYPE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_frame_type_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETFRAMETYPE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.e_frame_type != IV_NA_FRAME) + && (ps_ip->s_ive_ip.e_frame_type != IV_I_FRAME) + && (ps_ip->s_ive_ip.e_frame_type != IV_P_FRAME) + && (ps_ip->s_ive_ip.e_frame_type != IV_IDR_FRAME)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_FORCE_FRAME_INPUT; + return IV_FAIL; + } + break; + } + + case IVE_CMD_CTL_SET_ME_PARAMS: + { + codec_t *ps_codec = (codec_t *) (ps_handle->pv_codec_handle); + + ih264e_ctl_set_me_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_me_params_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_me_params_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETMEPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_me_params_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETMEPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_me_speed_preset != FULL_SRCH) + && (ps_ip->s_ive_ip.u4_me_speed_preset != DMND_SRCH) + && (ps_ip->s_ive_ip.u4_me_speed_preset != HEX_SRCH)) + { + 
ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_ME_SPEED_PRESET; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_enable_hpel != 0) + && (ps_ip->s_ive_ip.u4_enable_hpel != 1)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_HALFPEL_OPTION; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_enable_qpel != 0) + && (ps_ip->s_ive_ip.u4_enable_qpel != 1)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_QPEL_OPTION; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_enable_fast_sad != 0) + && (ps_ip->s_ive_ip.u4_enable_fast_sad != 1)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_FAST_SAD_OPTION; + return IV_FAIL; + } + + if (ps_ip->s_ive_ip.u4_enable_alt_ref > 255) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_ALT_REF_OPTION; + return IV_FAIL; + } + + if (ps_ip->s_ive_ip.u4_srch_rng_x + > ps_codec->s_cfg.u4_max_srch_rng_x) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_HORIZONTAL_SEARCH_RANGE_NOT_SUPPORTED; + return (IV_FAIL); + } + + if (ps_ip->s_ive_ip.u4_srch_rng_y + > ps_codec->s_cfg.u4_max_srch_rng_y) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_VERTICAL_SEARCH_RANGE_NOT_SUPPORTED; + return (IV_FAIL); + } + + break; + } + + case IVE_CMD_CTL_SET_IPE_PARAMS: + { + ih264e_ctl_set_ipe_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_ipe_params_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_ipe_params_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + 
IVE_ERR_IP_CTL_SETIPEPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_ipe_params_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETIPEPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_enable_intra_4x4 != 0) + && (ps_ip->s_ive_ip.u4_enable_intra_4x4 != 1)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_INTRA4x4_OPTION; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_enc_speed_preset != IVE_CONFIG) + && (ps_ip->s_ive_ip.u4_enc_speed_preset != IVE_SLOWEST) + && (ps_ip->s_ive_ip.u4_enc_speed_preset != IVE_NORMAL) + && (ps_ip->s_ive_ip.u4_enc_speed_preset != IVE_FAST) + && (ps_ip->s_ive_ip.u4_enc_speed_preset != IVE_HIGH_SPEED) + && (ps_ip->s_ive_ip.u4_enc_speed_preset != IVE_FASTEST)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_ENC_SPEED_PRESET; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_SET_GOP_PARAMS: + { + ih264e_ctl_set_gop_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_gop_params_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_gop_params_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETGOPPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_gop_params_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETGOPPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_i_frm_interval < DEFAULT_MIN_INTRA_FRAME_RATE) + || (ps_ip->s_ive_ip.u4_i_frm_interval > DEFAULT_MAX_INTRA_FRAME_RATE)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << 
IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_INTRA_FRAME_INTERVAL; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_idr_frm_interval < DEFAULT_MIN_INTRA_FRAME_RATE) + || (ps_ip->s_ive_ip.u4_idr_frm_interval > DEFAULT_MAX_INTRA_FRAME_RATE)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_IDR_FRAME_INTERVAL; + return IV_FAIL; + } + + if (ps_ip->s_ive_ip.u4_num_b_frames != 0) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_BFRAMES_NOT_SUPPORTED; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_SET_DEBLOCK_PARAMS: + { + ih264e_ctl_set_deblock_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_deblock_params_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_deblock_params_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETDEBLKPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_deblock_params_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETDEBLKPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_disable_deblock_level != DISABLE_DEBLK_LEVEL_0) + && (ps_ip->s_ive_ip.u4_disable_deblock_level != DISABLE_DEBLK_LEVEL_2) + && (ps_ip->s_ive_ip.u4_disable_deblock_level != DISABLE_DEBLK_LEVEL_3) + && (ps_ip->s_ive_ip.u4_disable_deblock_level != DISABLE_DEBLK_LEVEL_4)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_DEBLOCKING_TYPE_INPUT; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_SET_QP: + { + ih264e_ctl_set_qp_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_qp_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != 
sizeof(ih264e_ctl_set_qp_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETQPPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_qp_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETQPPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_i_qp_max > MAX_H264_QP) + || (ps_ip->s_ive_ip.u4_p_qp_max > MAX_H264_QP) + || (ps_ip->s_ive_ip.u4_b_qp_max > MAX_H264_QP)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_MAX_FRAME_QP; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_i_qp_min > ps_ip->s_ive_ip.u4_i_qp_max) + || (ps_ip->s_ive_ip.u4_p_qp_min > ps_ip->s_ive_ip.u4_p_qp_max) + || (ps_ip->s_ive_ip.u4_b_qp_min > ps_ip->s_ive_ip.u4_b_qp_max)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_MIN_FRAME_QP; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_i_qp > ps_ip->s_ive_ip.u4_i_qp_max) + || (ps_ip->s_ive_ip.u4_p_qp > ps_ip->s_ive_ip.u4_p_qp_max) + || (ps_ip->s_ive_ip.u4_b_qp > ps_ip->s_ive_ip.u4_b_qp_max)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_INVALID_INIT_QP; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_i_qp < ps_ip->s_ive_ip.u4_i_qp_min) + || (ps_ip->s_ive_ip.u4_p_qp < ps_ip->s_ive_ip.u4_p_qp_min) + || (ps_ip->s_ive_ip.u4_b_qp < ps_ip->s_ive_ip.u4_b_qp_min)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= IH264E_INVALID_INIT_QP; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_SET_ENC_MODE: + { + ih264e_ctl_set_enc_mode_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_enc_mode_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != 
sizeof(ih264e_ctl_set_enc_mode_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETENCMODE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_enc_mode_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETENCMODE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.e_enc_mode != IVE_ENC_MODE_HEADER) + && (ps_ip->s_ive_ip.e_enc_mode != IVE_ENC_MODE_PICTURE)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_ENC_OPERATION_MODE; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_SET_VBV_PARAMS: + { + ih264e_ctl_set_vbv_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_vbv_params_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_vbv_params_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETVBVPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_vbv_params_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETVBVPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.u4_vbv_buffer_delay < DEFAULT_MIN_BUFFER_DELAY) + || (ps_ip->s_ive_ip.u4_vbv_buffer_delay > DEFAULT_MAX_BUFFER_DELAY)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_BUFFER_DELAY; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_SET_AIR_PARAMS: + { + ih264e_ctl_set_air_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_air_params_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_air_params_ip_t)) + { + 
ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETAIRPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_air_params_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETAIRPARAMS_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if ((ps_ip->s_ive_ip.e_air_mode != IVE_AIR_MODE_NONE) + && (ps_ip->s_ive_ip.e_air_mode != IVE_AIR_MODE_CYCLIC) + && (ps_ip->s_ive_ip.e_air_mode != IVE_AIR_MODE_RANDOM)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_AIR_MODE; + return IV_FAIL; + } + + if (ps_ip->s_ive_ip.u4_air_refresh_period == 0) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_INVALID_AIR_REFRESH_PERIOD; + return IV_FAIL; + } + + break; + } + + case IVE_CMD_CTL_SET_PROFILE_PARAMS: + { + ih264e_ctl_set_profile_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_profile_params_op_t *ps_op = pv_api_op; + + if (ps_ip->s_ive_ip.u4_size + != sizeof(ih264e_ctl_set_profile_params_ip_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_IP_CTL_SETPROFILE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_op->s_ive_op.u4_size + != sizeof(ih264e_ctl_set_profile_params_op_t)) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IVE_ERR_OP_CTL_SETPROFILE_API_STRUCT_SIZE_INCORRECT; + return IV_FAIL; + } + + if (ps_ip->s_ive_ip.e_profile != IV_PROFILE_BASE) + { + ps_op->s_ive_op.u4_error_code |= 1 + << IVE_UNSUPPORTEDPARAM; + ps_op->s_ive_op.u4_error_code |= + IH264E_PROFILE_NOT_SUPPORTED; + return IV_FAIL; + } + + break; + } + + default: + *(pu4_api_op + 1) |= 1 << IVE_UNSUPPORTEDPARAM; + *(pu4_api_op + 1) |= 
IVE_ERR_INVALID_API_SUB_CMD; + return IV_FAIL; + } + + break; + } + + default: + *(pu4_api_op + 1) |= 1 << IVE_UNSUPPORTEDPARAM; + *(pu4_api_op + 1) |= IVE_ERR_INVALID_API_CMD; + return IV_FAIL; + } + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief update encoder configuration parameters +* +* @par Description: +* updates encoder configuration parameters from the given config set. +* Initialize/reinitialize codec parameters according to new configurations. +* +* @param[in] ps_codec +* Pointer to codec context +* +* @param[in] ps_cfg +* Pointer to config param set +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_codec_update_config(codec_t *ps_codec, + cfg_params_t *ps_cfg) +{ + /* config params */ + cfg_params_t *ps_curr_cfg = &ps_codec->s_cfg; + + /* error status */ + IH264E_ERROR_T err = IH264E_SUCCESS; + + /* temp var */ + UWORD32 u4_init_rc = 0; + + /***********************/ + /* UPDATE CODEC CONFIG */ + /***********************/ + if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_DIMENSIONS) + { + UWORD32 wd_aln = ALIGN16(ps_cfg->u4_wd); + UWORD32 ht_aln = ALIGN16(ps_cfg->u4_ht); + + if (ps_curr_cfg->u4_wd != wd_aln || ps_curr_cfg->u4_ht != ht_aln + || ps_curr_cfg->u4_strd != ps_cfg->u4_strd + || ps_curr_cfg->u4_disp_wd != ps_cfg->u4_disp_wd + || ps_curr_cfg->u4_disp_ht != ps_cfg->u4_disp_ht) + { + ps_curr_cfg->u4_wd = wd_aln; + ps_curr_cfg->u4_ht = ht_aln; + ps_curr_cfg->u4_strd = ps_cfg->u4_strd; + + if (ps_curr_cfg->u4_strd == 0) + { + ps_curr_cfg->u4_strd = ps_curr_cfg->u4_wd; + } + + ps_curr_cfg->u4_disp_wd = ps_cfg->u4_disp_wd; + ps_curr_cfg->u4_disp_ht = ps_cfg->u4_disp_ht; + + ps_curr_cfg->i4_wd_mbs = ps_curr_cfg->u4_wd >> 4; + ps_curr_cfg->i4_ht_mbs = ps_curr_cfg->u4_ht >> 4; + + ps_codec->i4_src_strd = ps_codec->s_cfg.u4_strd; + ps_codec->i4_rec_strd = ALIGN16(ps_cfg->u4_wd) + PAD_WD; + + /* If number of 
MBs in a frame changes the air map also changes. + * Hence recompute air map also reset air pic cnt */ + if (ps_codec->s_cfg.e_air_mode != IVE_AIR_MODE_NONE) + { + /* re-init the air map */ + ih264e_init_air_map(ps_codec); + + /* reset air counter */ + ps_codec->i4_air_pic_cnt = -1; + } + + /* initialize mv bank buffer manager */ + err = ih264e_mv_buf_mgr_add_bufs(ps_codec); + if (err != IH264E_SUCCESS) + return err; + + /* initialize ref bank buffer manager */ + err = ih264e_pic_buf_mgr_add_bufs(ps_codec); + if (err != IH264E_SUCCESS) + return err; + + /* since dimension changed, start new sequence by forcing IDR */ + ps_codec->force_curr_frame_type = IV_IDR_FRAME; + + /* in case dimension changes, we need to reinitialize RC as the + * old model shall not fit further */ + u4_init_rc = 1; + + /* when the dimension changes, the header needs to be regenerated */ + ps_codec->i4_header_mode = 1; + } + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_FRAMERATE) + { + /* temp var */ + UWORD32 u4_src_ticks, u4_tgt_ticks; + + u4_src_ticks = ih264e_frame_time_get_src_ticks( + ps_codec->s_rate_control.pps_frame_time); + + u4_tgt_ticks = ih264e_frame_time_get_tgt_ticks( + ps_codec->s_rate_control.pps_frame_time); + + /* Change frame rate */ + if (ps_codec->s_cfg.u4_src_frame_rate + != ps_cfg->u4_src_frame_rate * 1000) + { + ps_codec->s_cfg.u4_src_frame_rate = ps_cfg->u4_src_frame_rate + * 1000; + + ih264e_frame_time_update_src_frame_rate( + ps_codec->s_rate_control.pps_frame_time, + ps_codec->s_cfg.u4_src_frame_rate); + + ih264_time_stamp_update_frame_rate( + ps_codec->s_rate_control.pps_time_stamp, + ps_codec->s_cfg.u4_src_frame_rate); + + irc_change_frame_rate(ps_codec->s_rate_control.pps_rate_control_api, + ps_codec->s_cfg.u4_src_frame_rate, + u4_src_ticks, u4_tgt_ticks); + } + + if (ps_codec->s_cfg.u4_tgt_frame_rate + != ps_cfg->u4_tgt_frame_rate * 1000) + { + ps_codec->s_cfg.u4_tgt_frame_rate = ps_cfg->u4_tgt_frame_rate + * 1000; + + ih264e_frame_time_update_tgt_frame_rate( 
+ ps_codec->s_rate_control.pps_frame_time, + ps_codec->s_cfg.u4_tgt_frame_rate); + + irc_change_frame_rate(ps_codec->s_rate_control.pps_rate_control_api, + ps_codec->s_cfg.u4_src_frame_rate, + u4_src_ticks, u4_tgt_ticks); + + irc_change_frm_rate_for_bit_alloc( + ps_codec->s_rate_control.pps_rate_control_api, + ps_codec->s_cfg.u4_tgt_frame_rate); + } + + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_BITRATE) + { + if (ps_curr_cfg->u4_target_bitrate != ps_cfg->u4_target_bitrate) + { + if (IVE_RC_NONE != ps_curr_cfg->e_rc_mode) + irc_change_avg_bit_rate( + ps_codec->s_rate_control.pps_rate_control_api, + ps_cfg->u4_target_bitrate); + + ps_curr_cfg->u4_target_bitrate = ps_cfg->u4_target_bitrate; + } + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_FRAMETYPE) + { + switch (ps_cfg->e_frame_type) + { + case IV_I_FRAME: + ps_codec->force_curr_frame_type = IV_I_FRAME; + break; + + case IV_IDR_FRAME: + ps_codec->force_curr_frame_type = IV_IDR_FRAME; + break; + + case IV_P_FRAME: + default: + break; + } + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_ME_PARAMS) + { + if (ps_curr_cfg->u4_enc_speed_preset == IVE_CONFIG) + { + ps_codec->s_cfg.u4_enable_hpel = ps_cfg->u4_enable_hpel; + ps_codec->s_cfg.u4_enable_fast_sad = ps_cfg->u4_enable_fast_sad; + ps_codec->s_cfg.u4_me_speed_preset = ps_cfg->u4_me_speed_preset; + ps_codec->s_cfg.u4_enable_qpel = ps_cfg->u4_enable_qpel; + } + else if (ps_curr_cfg->u4_enc_speed_preset == IVE_FASTEST) + { + ps_codec->s_cfg.u4_enable_fast_sad = ps_cfg->u4_enable_fast_sad; + } + ps_codec->s_cfg.u4_srch_rng_x = ps_cfg->u4_srch_rng_x; + ps_codec->s_cfg.u4_srch_rng_y = ps_cfg->u4_srch_rng_y; + + if (ps_codec->s_cfg.u4_enable_alt_ref != ps_cfg->u4_enable_alt_ref) + { + ps_codec->s_cfg.u4_enable_alt_ref = ps_cfg->u4_enable_alt_ref; + ps_codec->u4_is_curr_frm_ref = 1; + } + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_IPE_PARAMS) + { + ps_curr_cfg->u4_enc_speed_preset = ps_cfg->u4_enc_speed_preset; + + if (ps_curr_cfg->u4_enc_speed_preset == 
IVE_SLOWEST) + {/* high quality */ + /* enable diamond search */ + ps_curr_cfg->u4_me_speed_preset = DMND_SRCH; + ps_curr_cfg->u4_enable_fast_sad = 0; + + /* disable intra 4x4 */ + ps_curr_cfg->u4_enable_intra_4x4 = 1; + ps_codec->luma_energy_compaction[1] = + ih264e_code_luma_intra_macroblock_4x4_rdopt_on; + + /* sub pel off */ + ps_curr_cfg->u4_enable_hpel = 1; + + /* deblocking off */ + ps_curr_cfg->u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_0; + + /* disabled intra inter gating in Inter slices */ + ps_codec->u4_inter_gate = 0; + } + else if (ps_curr_cfg->u4_enc_speed_preset == IVE_NORMAL) + {/* normal */ + /* enable diamond search */ + ps_curr_cfg->u4_me_speed_preset = DMND_SRCH; + ps_curr_cfg->u4_enable_fast_sad = 0; + + /* disable intra 4x4 */ + ps_curr_cfg->u4_enable_intra_4x4 = 1; + + /* sub pel off */ + ps_curr_cfg->u4_enable_hpel = 1; + + /* deblocking off */ + ps_curr_cfg->u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_0; + + /* disabled intra inter gating in Inter slices */ + ps_codec->u4_inter_gate = 0; + } + else if (ps_curr_cfg->u4_enc_speed_preset == IVE_FAST) + {/* normal */ + /* enable diamond search */ + ps_curr_cfg->u4_me_speed_preset = DMND_SRCH; + ps_curr_cfg->u4_enable_fast_sad = 0; + + /* disable intra 4x4 */ + ps_curr_cfg->u4_enable_intra_4x4 = 0; + + /* sub pel off */ + ps_curr_cfg->u4_enable_hpel = 1; + + /* deblocking off */ + ps_curr_cfg->u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_0; + + /* disabled intra inter gating in Inter slices */ + ps_codec->u4_inter_gate = 1; + } + else if (ps_curr_cfg->u4_enc_speed_preset == IVE_HIGH_SPEED) + {/* fast */ + /* enable diamond search */ + ps_curr_cfg->u4_me_speed_preset = DMND_SRCH; + ps_curr_cfg->u4_enable_fast_sad = 0; + + /* disable intra 4x4 */ + ps_curr_cfg->u4_enable_intra_4x4 = 0; + + /* sub pel off */ + ps_curr_cfg->u4_enable_hpel = 0; + + /* deblocking off */ + ps_curr_cfg->u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_4; + + /* disabled intra inter gating in Inter slices */ + 
ps_codec->u4_inter_gate = 0; + } + else if (ps_curr_cfg->u4_enc_speed_preset == IVE_FASTEST) + {/* fastest */ + /* enable diamond search */ + ps_curr_cfg->u4_me_speed_preset = DMND_SRCH; + //u4_num_layers = 4; + + /* disable intra 4x4 */ + ps_curr_cfg->u4_enable_intra_4x4 = 0; + + /* sub pel off */ + ps_curr_cfg->u4_enable_hpel = 0; + + /* deblocking off */ + ps_curr_cfg->u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_4; + + /* disabled intra inter gating in Inter slices */ + ps_codec->u4_inter_gate = 1; + } + else if (ps_curr_cfg->u4_enc_speed_preset == IVE_CONFIG) + { + ps_curr_cfg->u4_enable_intra_4x4 = ps_cfg->u4_enable_intra_4x4; + } + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_GOP_PARAMS) + { + if (ps_curr_cfg->u4_i_frm_interval != ps_cfg->u4_i_frm_interval) + { + ps_curr_cfg->u4_i_frm_interval = ps_cfg->u4_i_frm_interval; + + /* reset air counter */ + ps_codec->i4_air_pic_cnt = -1; + + /* re-init air map */ + ih264e_init_air_map(ps_codec); + + /*Effect intra frame interval change*/ + + irc_change_intra_frm_int_call( + ps_codec->s_rate_control.pps_rate_control_api, + ps_curr_cfg->u4_i_frm_interval); + } + + ps_curr_cfg->u4_idr_frm_interval = ps_cfg->u4_idr_frm_interval; + + ps_curr_cfg->u4_num_b_frames = ps_cfg->u4_num_b_frames; + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_DEBLOCK_PARAMS) + { + if (ps_curr_cfg->u4_enc_speed_preset == IVE_CONFIG) + { + ps_curr_cfg->u4_disable_deblock_level = + ps_cfg->u4_disable_deblock_level; + } + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_QP) + { + UWORD8 au1_init_qp[MAX_PIC_TYPE]; + UWORD8 au1_min_max_qp[2 * MAX_PIC_TYPE]; + + ps_codec->s_cfg.u4_i_qp_max = ps_cfg->u4_i_qp_max; + ps_codec->s_cfg.u4_i_qp_min = ps_cfg->u4_i_qp_min; + ps_codec->s_cfg.u4_i_qp = ps_cfg->u4_i_qp; + + ps_codec->s_cfg.u4_p_qp_max = ps_cfg->u4_p_qp_max; + ps_codec->s_cfg.u4_p_qp_min = ps_cfg->u4_p_qp_min; + ps_codec->s_cfg.u4_p_qp = ps_cfg->u4_p_qp; + + ps_codec->s_cfg.u4_b_qp_max = ps_cfg->u4_b_qp_max; + ps_codec->s_cfg.u4_b_qp_min = 
ps_cfg->u4_b_qp_min; + ps_codec->s_cfg.u4_b_qp = ps_cfg->u4_b_qp; + + /* update rc lib with modified qp */ + au1_init_qp[0] = gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_i_qp]; + au1_init_qp[1] = gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_p_qp]; + au1_init_qp[2] = gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_b_qp]; + + irc_change_init_qp(ps_codec->s_rate_control.pps_rate_control_api, + au1_init_qp); + + au1_min_max_qp[2 * I_PIC] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_i_qp_min]; + au1_min_max_qp[2 * I_PIC + 1] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_i_qp_max]; + + au1_min_max_qp[2 * P_PIC] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_p_qp_min]; + au1_min_max_qp[2 * P_PIC + 1] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_p_qp_max]; + + au1_min_max_qp[2 * B_PIC] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_b_qp_min]; + au1_min_max_qp[2 * B_PIC + 1] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_b_qp_max]; + + irc_change_min_max_qp(ps_codec->s_rate_control.pps_rate_control_api, + au1_min_max_qp); + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_ENC_MODE) + { + ps_codec->s_cfg.e_enc_mode = ps_cfg->e_enc_mode; + + if (ps_codec->s_cfg.e_enc_mode == IVE_ENC_MODE_HEADER) + { + ps_codec->i4_header_mode = 1; + ps_codec->s_cfg.e_enc_mode = IVE_ENC_MODE_PICTURE; + } + else + { + ps_codec->i4_header_mode = 0; + } + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_VBV_PARAMS + && IVE_RC_NONE != ps_codec->s_cfg.e_rc_mode) + { + ps_codec->s_cfg.u4_vbv_buf_size = ps_cfg->u4_vbv_buf_size; + ps_codec->s_cfg.u4_vbv_buffer_delay = ps_cfg->u4_vbv_buffer_delay; + + // irc_change_buffer_delay(ps_codec->s_rate_control.pps_rate_control_api, ps_codec->s_cfg.u4_vbv_buffer_delay); + + // TODO: remove this when the support for changing buffer dynamically + // is yet to be added. 
+ u4_init_rc = 1; + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_AIR_PARAMS) + { + if (ps_curr_cfg->e_air_mode != ps_cfg->e_air_mode + || ps_curr_cfg->u4_air_refresh_period + != ps_cfg->u4_air_refresh_period) + { + ps_curr_cfg->e_air_mode = ps_cfg->e_air_mode; + ps_curr_cfg->u4_air_refresh_period = ps_cfg->u4_air_refresh_period; + + ih264e_init_air_map(ps_codec); + + /* reset air counter */ + ps_codec->i4_air_pic_cnt = -1; + } + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_PROFILE_PARAMS) + { + ps_codec->s_cfg.e_profile = ps_cfg->e_profile; + } + else if (ps_cfg->e_cmd == IVE_CMD_CTL_SET_NUM_CORES) + { + ps_codec->s_cfg.u4_num_cores = ps_cfg->u4_num_cores; + } + + /* reset RC model */ + if (u4_init_rc) + { + /* init qp */ + UWORD8 au1_init_qp[MAX_PIC_TYPE]; + + /* min max qp */ + UWORD8 au1_min_max_qp[2 * MAX_PIC_TYPE]; + + /* init i,p,b qp */ + au1_init_qp[0] = gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_i_qp]; + au1_init_qp[1] = gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_p_qp]; + au1_init_qp[2] = gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_b_qp]; + + /* init min max qp */ + au1_min_max_qp[2 * I_PIC] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_i_qp_min]; + au1_min_max_qp[2 * I_PIC + 1] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_i_qp_max]; + + au1_min_max_qp[2 * P_PIC] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_p_qp_min]; + au1_min_max_qp[2 * P_PIC + 1] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_p_qp_max]; + + au1_min_max_qp[2 * B_PIC] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_b_qp_min]; + au1_min_max_qp[2 * B_PIC + 1] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_b_qp_max]; + + /* get rc mode */ + switch (ps_codec->s_cfg.e_rc_mode) + { + case IVE_RC_STORAGE: + ps_codec->s_rate_control.e_rc_type = VBR_STORAGE; + break; + + case IVE_RC_CBR_NON_LOW_DELAY: + ps_codec->s_rate_control.e_rc_type = CBR_NLDRC; + break; + + case IVE_RC_CBR_LOW_DELAY: + ps_codec->s_rate_control.e_rc_type = CBR_LDRC; + break; + + case IVE_RC_NONE: + 
ps_codec->s_rate_control.e_rc_type = CONST_QP; + break; + + default: + break; + } + + /* init rate control */ + ih264e_rc_init(ps_codec->s_rate_control.pps_rate_control_api, + ps_codec->s_rate_control.pps_frame_time, + ps_codec->s_rate_control.pps_time_stamp, + ps_codec->s_rate_control.pps_pd_frm_rate, + ps_codec->s_cfg.u4_max_framerate, + ps_codec->s_cfg.u4_src_frame_rate, + ps_codec->s_cfg.u4_tgt_frame_rate, + ps_codec->s_rate_control.e_rc_type, + ps_codec->s_cfg.u4_target_bitrate, + ps_codec->s_cfg.u4_max_bitrate, + ps_codec->s_cfg.u4_vbv_buffer_delay, + ps_codec->s_cfg.u4_i_frm_interval, au1_init_qp, + H264_ALLOC_INTER_FRM_INTV, au1_min_max_qp, + ps_codec->s_cfg.u4_max_level); + } + + return err; +} + +/** +******************************************************************************* +* +* @brief +* Sets default encoder config parameters +* +* @par Description: +* Sets default dynamic parameters. Will be called in ih264e_init() to ensure +* that even if set_params is not called, codec continues to work +* +* @param[in] ps_cfg +* Pointer to encoder config params +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static WORD32 ih264e_set_default_params(cfg_params_t *ps_cfg) +{ + WORD32 ret = IV_SUCCESS; + + ps_cfg->u4_max_wd = MAX_WD; + ps_cfg->u4_max_ht = MAX_HT; + ps_cfg->u4_max_ref_cnt = MAX_REF_CNT; + ps_cfg->u4_max_reorder_cnt = MAX_REF_CNT; + ps_cfg->u4_max_level = DEFAULT_MAX_LEVEL; + ps_cfg->e_inp_color_fmt = IV_YUV_420SP_UV; + ps_cfg->u4_enable_recon = DEFAULT_RECON_ENABLE; + ps_cfg->e_recon_color_fmt = IV_YUV_420P; + ps_cfg->u4_enc_speed_preset = IVE_FASTEST; + ps_cfg->e_rc_mode = DEFAULT_RC; + ps_cfg->u4_max_framerate = DEFAULT_MAX_FRAMERATE; + ps_cfg->u4_max_bitrate = DEFAULT_MAX_BITRATE; + ps_cfg->u4_max_num_bframes = 0; + ps_cfg->e_content_type = IV_PROGRESSIVE; + ps_cfg->u4_max_srch_rng_x = DEFAULT_MAX_SRCH_RANGE_X; + ps_cfg->u4_max_srch_rng_y = 
DEFAULT_MAX_SRCH_RANGE_Y; + ps_cfg->e_slice_mode = IVE_SLICE_MODE_NONE; + ps_cfg->u4_slice_param = DEFAULT_SLICE_PARAM; + ps_cfg->e_arch = ih264e_default_arch(); + ps_cfg->e_soc = SOC_GENERIC; + ps_cfg->u4_disp_wd = MAX_WD; + ps_cfg->u4_disp_ht = MAX_HT; + ps_cfg->u4_wd = MAX_WD; + ps_cfg->u4_ht = MAX_HT; + ps_cfg->u4_strd = ALIGN16(MAX_WD); + ps_cfg->u4_src_frame_rate = DEFAULT_SRC_FRAME_RATE; + ps_cfg->u4_tgt_frame_rate = DEFAULT_TGT_FRAME_RATE; + ps_cfg->u4_target_bitrate = DEFAULT_BITRATE; + ps_cfg->e_frame_type = IV_NA_FRAME; + ps_cfg->e_enc_mode = IVE_ENC_MODE_DEFAULT; + ps_cfg->u4_i_qp = DEFAULT_I_QP; + ps_cfg->u4_p_qp = DEFAULT_P_QP; + ps_cfg->u4_b_qp = DEFAULT_B_QP; + ps_cfg->u4_i_qp_min = DEFAULT_QP_MIN; + ps_cfg->u4_i_qp_max = DEFAULT_QP_MAX; + ps_cfg->u4_p_qp_min = DEFAULT_QP_MIN; + ps_cfg->u4_p_qp_max = DEFAULT_QP_MAX; + ps_cfg->u4_b_qp_min = DEFAULT_QP_MIN; + ps_cfg->u4_b_qp_max = DEFAULT_QP_MAX; + ps_cfg->e_air_mode = DEFAULT_AIR_MODE; + ps_cfg->u4_air_refresh_period = DEFAULT_AIR_REFRESH_PERIOD; + ps_cfg->u4_vbv_buffer_delay = DEFAULT_VBV_DELAY; + ps_cfg->u4_vbv_buf_size = DEFAULT_VBV_SIZE; + ps_cfg->u4_num_cores = DEFAULT_NUM_CORES; + ps_cfg->u4_me_speed_preset = DEFAULT_ME_SPEED_PRESET; + ps_cfg->u4_enable_hpel = DEFAULT_HPEL; + ps_cfg->u4_enable_qpel = DEFAULT_QPEL; + ps_cfg->u4_enable_intra_4x4 = DEFAULT_I4; + ps_cfg->u4_enable_intra_8x8 = DEFAULT_I8; + ps_cfg->u4_enable_intra_16x16 = DEFAULT_I16; + ps_cfg->u4_enable_fast_sad = DEFAULT_ENABLE_FAST_SAD; + ps_cfg->u4_enable_satqd = DEFAULT_ENABLE_SATQD; + ps_cfg->i4_min_sad = + (ps_cfg->u4_enable_satqd == DEFAULT_ENABLE_SATQD) ? 
+ DEFAULT_MIN_SAD_ENABLE : + DEFAULT_MIN_SAD_DISABLE; + ps_cfg->u4_srch_rng_x = DEFAULT_SRCH_RNG_X; + ps_cfg->u4_srch_rng_y = DEFAULT_SRCH_RNG_Y; + ps_cfg->u4_i_frm_interval = DEFAULT_I_INTERVAL; + ps_cfg->u4_idr_frm_interval = DEFAULT_IDR_INTERVAL; + ps_cfg->u4_num_b_frames = DEFAULT_B_FRAMES; + ps_cfg->u4_disable_deblock_level = DEFAULT_DISABLE_DEBLK_LEVEL; + ps_cfg->e_profile = DEFAULT_PROFILE; + ps_cfg->u4_timestamp_low = 0; + ps_cfg->u4_timestamp_high = 0; + ps_cfg->u4_is_valid = 1; + ps_cfg->e_cmd = IVE_CMD_CT_NA; + ps_cfg->i4_wd_mbs = ps_cfg->u4_max_wd >> 4; + ps_cfg->i4_ht_mbs = ps_cfg->u4_max_ht >> 4; + ps_cfg->u4_entropy_coding_mode = CAVLC; + ps_cfg->u4_weighted_prediction = 0; + ps_cfg->u4_constrained_intra_pred = 0; + ps_cfg->u4_pic_info_type = 0; + ps_cfg->u4_mb_info_type = 0; + + return ret; +} + +/** +******************************************************************************* +* +* @brief +* Initialize encoder context. This will be called by init_mem_rec and during +* codec reset +* +* @par Description: +* Initializes the context +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static WORD32 ih264e_init(codec_t *ps_codec) +{ + /* enc config param set */ + cfg_params_t *ps_cfg = &(ps_codec->s_cfg); + + /* temp var */ + WORD32 i; + + /* coded pic count */ + ps_codec->i4_coded_pic_cnt = 0; + + /* Number of API calls to encode are made */ + ps_codec->i4_encode_api_call_cnt = -1; + + /* Indicates no header has been generated yet */ + ps_codec->u4_header_generated = 0; + + /* Number of pictures encoded */ + ps_codec->i4_pic_cnt = -1; + + /* Number of threads created */ + ps_codec->i4_proc_thread_cnt = 0; + + /* ctl mutex init */ + ithread_mutex_init(ps_codec->pv_ctl_mutex); + + /* Set encoder chroma format */ + ps_codec->e_codec_color_format = + (ps_cfg->e_inp_color_fmt == IV_YUV_420SP_VU) ? 
+ IV_YUV_420SP_VU : IV_YUV_420SP_UV; + + /* Number of continuous frames where deblocking was disabled */ + ps_codec->i4_disable_deblk_pic_cnt = 0; + + /* frame num */ + ps_codec->i4_frame_num = -1; + + /* set the current frame type to I frame, since we are going to start encoding*/ + ps_codec->force_curr_frame_type = IV_NA_FRAME; + + /* idr_pic_id */ + ps_codec->i4_idr_pic_id = -1; + + /* Flush mode */ + ps_codec->i4_flush_mode = 0; + + /* Encode header mode */ + ps_codec->i4_header_mode = 0; + + /* Encode generate header */ + ps_codec->i4_gen_header = 0; + + /* To signal successful completion of init */ + ps_codec->i4_init_done = 1; + + /* To signal that at least one picture was decoded */ + ps_codec->i4_first_pic_done = 0; + + /* Reset Codec */ + ps_codec->i4_reset_flag = 0; + + /* Current error code */ + ps_codec->i4_error_code = IH264E_SUCCESS; + + /* threshold residue */ + ps_codec->u4_thres_resi = 1; + + /* inter gating enable */ + ps_codec->u4_inter_gate = 0; + + /* entropy mutex init */ + ithread_mutex_init(ps_codec->pv_entropy_mutex); + + /* sps id */ + ps_codec->i4_sps_id = 0; + + /* sps id */ + ps_codec->i4_pps_id = 0; + + /* Process thread created status */ + memset(ps_codec->ai4_process_thread_created, 0, MAX_PROCESS_THREADS); + + /* Number of MBs processed together */ + ps_codec->i4_proc_nmb = 8; + + /* Previous POC msb */ + ps_codec->i4_prev_poc_msb = 0; + + /* Previous POC lsb */ + ps_codec->i4_prev_poc_lsb = -1; + + /* max Previous POC lsb */ + ps_codec->i4_max_prev_poc_lsb = -1; + + /* sps, pps status */ + { + sps_t *ps_sps = ps_codec->ps_sps_base; + pps_t *ps_pps = ps_codec->ps_pps_base; + + for (i = 0; i < MAX_SPS_CNT; i++) + { + ps_sps->i1_sps_valid = 0; + ps_sps++; + } + + for (i = 0; i < MAX_PPS_CNT; i++) + { + ps_pps->i1_pps_valid = 0; + ps_pps++; + } + } + + { + WORD32 max_mb_rows = ps_cfg->i4_ht_mbs; + + WORD32 num_jobs = max_mb_rows * 2; + WORD32 clz; + + /* Use next power of two number of entries*/ + clz = CLZ(num_jobs); + num_jobs = 1 
<< (32 - clz); + + /* init process jobq */ + ps_codec->pv_proc_jobq = ih264_list_init( + ps_codec->pv_proc_jobq_buf, + ps_codec->i4_proc_jobq_buf_size, num_jobs, + sizeof(job_t), 10); + RETURN_IF((ps_codec->pv_proc_jobq == NULL), IV_FAIL); + ih264_list_reset(ps_codec->pv_proc_jobq); + + /* init entropy jobq */ + ps_codec->pv_entropy_jobq = ih264_list_init( + ps_codec->pv_entropy_jobq_buf, + ps_codec->i4_entropy_jobq_buf_size, num_jobs, + sizeof(job_t), 10); + RETURN_IF((ps_codec->pv_entropy_jobq == NULL), IV_FAIL); + ih264_list_reset(ps_codec->pv_entropy_jobq); + } + + /* Update the jobq context to all the threads */ + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + ps_codec->as_process[i].pv_proc_jobq = ps_codec->pv_proc_jobq; + ps_codec->as_process[i].pv_entropy_jobq = ps_codec->pv_entropy_jobq; + + /* i4_id always stays between 0 and MAX_PROCESS_THREADS */ + ps_codec->as_process[i].i4_id = + (i >= MAX_PROCESS_THREADS) ? + (i - MAX_PROCESS_THREADS) : i; + ps_codec->as_process[i].ps_codec = ps_codec; + + ps_codec->as_process[i].s_entropy.pv_proc_jobq = ps_codec->pv_proc_jobq; + ps_codec->as_process[i].s_entropy.pv_entropy_jobq = + ps_codec->pv_entropy_jobq; + ps_codec->as_process[i].s_entropy.i4_abs_pic_order_cnt = -1; + } + + /* Initialize MV Bank buffer manager */ + ps_codec->pv_mv_buf_mgr = ih264_buf_mgr_init(ps_codec->pv_mv_buf_mgr_base); + + /* Initialize Picture buffer manager for reference buffers*/ + ps_codec->pv_ref_buf_mgr = ih264_buf_mgr_init( + ps_codec->pv_ref_buf_mgr_base); + + /* Initialize Picture buffer manager for input buffers*/ + ps_codec->pv_inp_buf_mgr = ih264_buf_mgr_init( + ps_codec->pv_inp_buf_mgr_base); + + /* Initialize buffer manager for output buffers*/ + ps_codec->pv_out_buf_mgr = ih264_buf_mgr_init( + ps_codec->pv_out_buf_mgr_base); + + /* buffer cnt in buffer manager */ + ps_codec->i4_inp_buf_cnt = 0; + ps_codec->i4_out_buf_cnt = 0; + ps_codec->i4_ref_buf_cnt = 0; + + ps_codec->ps_pic_buf = (pic_buf_t *) ps_codec->pv_pic_buf_base; + 
memset(ps_codec->ps_pic_buf, 0, BUF_MGR_MAX_CNT * sizeof(pic_buf_t)); + + /* Initialize dpb manager */ + ih264_dpb_mgr_init((dpb_mgr_t*) ps_codec->pv_dpb_mgr); + + memset(ps_codec->as_ref_set, 0, + sizeof(ref_set_t) * (MAX_DPB_SIZE + MAX_CTXT_SETS)); + for (i = 0; i < (MAX_DPB_SIZE + MAX_CTXT_SETS); i++) + { + ps_codec->as_ref_set[i].i4_pic_cnt = -1; + } + + /* fn ptr init */ + ih264e_init_function_ptr(ps_codec); + + /* reset status flags */ + for (i = 0; i < MAX_CTXT_SETS; i++) + { + ps_codec->au4_entropy_thread_active[i] = 0; + ps_codec->ai4_pic_cnt[i] = -1; + + ps_codec->s_rate_control.pre_encode_skip[i] = 0; + ps_codec->s_rate_control.post_encode_skip[i] = 0; + } + + ps_codec->s_rate_control.num_intra_in_prev_frame = 0; + ps_codec->s_rate_control.i4_avg_activity = 0; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Gets number of memory records required by the codec +* +* @par Description: +* Gets codec memory requirements +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns status +* +* @remarks +* +******************************************************************************* +*/ +static WORD32 ih264e_get_num_rec(void *pv_api_ip, void *pv_api_op) +{ + UNUSED(pv_api_ip); + /* api call I/O structures */ + ih264e_num_mem_rec_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_num_mem_rec = MEM_REC_CNT; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Fills memory records of the codec +* +* @par Description: +* Fills codec memory requirements +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ 
+static WORD32 ih264e_fill_num_mem_rec(void *pv_api_ip, void *pv_api_op) +{ + /* api call I/O structures */ + ih264e_fill_mem_rec_ip_t *ps_ip = pv_api_ip; + ih264e_fill_mem_rec_op_t *ps_op = pv_api_op; + + /* profile / level info */ + WORD32 level; + WORD32 num_reorder_frames; + WORD32 num_ref_frames; + + /* mem records */ + WORD32 no_of_mem_rec; + iv_mem_rec_t *ps_mem_rec_base, *ps_mem_rec; + + /* frame dimensions */ + WORD32 max_wd_luma, max_ht_luma; + WORD32 max_mb_rows, max_mb_cols, max_mb_cnt; + + /* temp var */ + WORD32 i; + + /* error status */ + IV_STATUS_T status = IV_SUCCESS; + + /* profile / level info */ + level = ps_ip->s_ive_ip.u4_max_level; + num_reorder_frames = ps_ip->s_ive_ip.u4_max_reorder_cnt; + num_ref_frames = ps_ip->s_ive_ip.u4_max_ref_cnt; + + /* mem records */ + ps_mem_rec_base = ps_ip->s_ive_ip.ps_mem_rec; + no_of_mem_rec = ps_ip->s_ive_ip.u4_num_mem_rec; + + /* frame dimensions */ + max_ht_luma = ps_ip->s_ive_ip.u4_max_ht; + max_wd_luma = ps_ip->s_ive_ip.u4_max_wd; + max_ht_luma = ALIGN16(max_ht_luma); + max_wd_luma = ALIGN16(max_wd_luma); + max_mb_rows = max_ht_luma / MB_SIZE; + max_mb_cols = max_wd_luma / MB_SIZE; + max_mb_cnt = max_mb_rows * max_mb_cols; + + /* validate params */ + if ((level < MIN_LEVEL) || (level > MAX_LEVEL)) + { + ps_op->s_ive_op.u4_error_code |= IH264E_CODEC_LEVEL_NOT_SUPPORTED; + level = MAX_LEVEL; + } + + if (num_ref_frames > MAX_REF_CNT) + { + ps_op->s_ive_op.u4_error_code |= IH264E_NUM_REF_UNSUPPORTED; + num_ref_frames = MAX_REF_CNT; + } + + if (num_reorder_frames > MAX_REF_CNT) + { + ps_op->s_ive_op.u4_error_code |= IH264E_NUM_REORDER_UNSUPPORTED; + num_reorder_frames = MAX_REF_CNT; + } + + /* Set all memory records as persistent and alignment as 128 by default */ + ps_mem_rec = ps_mem_rec_base; + for (i = 0; i < no_of_mem_rec; i++) + { + ps_mem_rec->u4_mem_alignment = 128; + ps_mem_rec->e_mem_type = IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM; + ps_mem_rec++; + } + + 
/************************************************************************ + * Request memory for h264 encoder handle * + ***********************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_IV_OBJ]; + { + ps_mem_rec->u4_mem_size = sizeof(iv_obj_t); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_IV_OBJ, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory for h264 encoder context * + ***********************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_CODEC]; + { + ps_mem_rec->u4_mem_size = sizeof(codec_t); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_CODEC, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory for entropy context * + * In multi core encoding, each row is assumed to be launched on a * + * thread. The rows below can only start after its neighbors are coded * + * The status of an mb coded/uncoded is signaled via entropy map. * + * 1. One word32 to store skip run cnt * + * 2. mb entropy map (mb status entropy coded/uncoded). The size* + * of the entropy map is max mb cols. Further allocate one * + * more additional row to evade checking for row -1. * + * 3. size of bit stream buffer to store bit stream ctxt. * + * 4. Entropy coding is dependent on nnz coefficient count for * + * the neighbor blocks. 
It is sufficient to maintain one row * + * worth of nnz as entropy for lower row waits on entropy map* + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTROPY]; + { + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size of skip mb run */ + total_size += sizeof(WORD32); + total_size = ALIGN8(total_size); + + /* size in bytes to store entropy status of an entire frame */ + total_size += (max_mb_cols * max_mb_rows); + /* add an additional 1 row of bytes to evade the special case of row 0 */ + total_size += max_mb_cols; + total_size = ALIGN128(total_size); + + /* size of bit stream buffer */ + total_size += sizeof(bitstrm_t); + total_size = ALIGN128(total_size); + + /* top nnz luma */ + total_size += (max_mb_cols * 4 * sizeof(UWORD8)); + total_size = ALIGN128(total_size); + + /* top nnz cbcr */ + total_size += (max_mb_cols * 4 * sizeof(UWORD8)); + total_size = ALIGN128(total_size); + + /* total size per each proc ctxt */ + total_size *= MAX_CTXT_SETS; + + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_ENTROPY, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * The residue coefficients that needs to be entropy coded are packed * + * at a buffer space by the proc threads. The entropy thread shall * + * read from the buffer space, unpack them and encode the same. The * + * buffer space required to pack a row of mbs are as follows. * + * Assuming transform_8x8_flag is disabled, * + * In the worst case, 1 mb contains 1 dc 4x4 luma sub block, followed * + * by 16 ac 4x4 luma sub blocks, 2 dc chroma 2x2 sub blocks, followed * + * by 8 ac 4x4 chroma sub blocks. * + * For the sake of simplicity we assume that all sub blocks are of * + * type 4x4. 
The packing of each 4x4 is depicted by the structure * + * tu_sblk_coeff_data_t * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MB_COEFF_DATA]; + { + /* temp var */ + WORD32 size = 0; + + /* size of coeff data of 1 mb */ + size += sizeof(tu_sblk_coeff_data_t) * MAX_4x4_SUBBLKS; + + /* size of coeff data of 1 row of mb's */ + size *= max_mb_cols; + + /* align to avoid any false sharing across threads */ + size = ALIGN64(size); + + /* size for one full frame */ + size *= max_mb_rows; + + /* size of each proc buffer set (ping, pong) */ + size *= MAX_CTXT_SETS; + + ps_mem_rec->u4_mem_size = size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_MB_COEFF_DATA, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * while encoding an mb, the mb header data is signaled to the entropy* + * thread by writing to a buffer space. the size of header data per mb * + * is assumed to be 40 bytes * + * TODO: revisit this inference * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MB_HEADER_DATA]; + { + /* temp var */ + WORD32 size; + + /* size per MB */ + size = 40; + + /* size for 1 row of mbs */ + size = size * max_mb_cols; + + /* align to avoid any false sharing across threads */ + size = ALIGN64(size); + + /* size for one full frame */ + size *= max_mb_rows; + + /* size of each proc buffer set (ping, pong) */ + size *= MAX_CTXT_SETS; + + ps_mem_rec->u4_mem_size = size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_MB_HEADER_DATA, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Size for holding mv_buf_t for each MV Bank. 
* + * Note this allocation is done for BUF_MGR_MAX_CNT instead of * + * MAX_DPB_SIZE or max_dpb_size for following reasons * + * max_dpb_size will be based on max_wd and max_ht * + * For higher max_wd and max_ht this number will be smaller than * + * MAX_DPB_SIZE But during actual initialization number of buffers * + * allocated can be more. * + * * + * One extra MV Bank is needed to hold current pics MV bank. * + * Since this is only a structure allocation and not actual buffer * + * allocation, it is allocated for BUF_MGR_MAX_CNT entries * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MVBANK]; + { + /* max luma samples */ + WORD32 max_luma_samples = 0; + + /* determine max luma samples */ + for (i = 0; i < 16; i++) + if (level ==(WORD32)gas_ih264_lvl_tbl[i].u4_level_idc) + max_luma_samples = gas_ih264_lvl_tbl[i].u4_max_fs + << (BLK_SIZE + BLK_SIZE); + + ps_mem_rec->u4_mem_size = ih264_buf_mgr_size(); + + /************************************************************************ + * Allocate for pu_map, enc_pu_t and pic_pu_idx for each MV bank * + * Note: Number of luma samples is not max_wd * max_ht here, instead it * + * is set to maximum number of luma samples allowed at the given level. * + * This is done to ensure that any stream with width and height lesser * + * than max_wd and max_ht is supported. Number of buffers required can * + * be greater for lower width and heights at a given level and this * + * increased number of buffers might require more memory than what * + * max_wd and max_ht buffer would have required Also note one extra * + * buffer is allocated to store current pictures MV bank. 
* + ***********************************************************************/ + + ps_mem_rec->u4_mem_size += BUF_MGR_MAX_CNT * sizeof(mv_buf_t); + + ps_mem_rec->u4_mem_size += (num_ref_frames + num_reorder_frames + + MAX_CTXT_SETS) + * ih264e_get_pic_mv_bank_size(max_luma_samples); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_MVBANK, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * While encoding inter slices, to compute the cost of encoding an mb * + * with the mv's at hand, we employ the expression cost = sad + lambda * + * x mv_bits. Here mv_bits is the total number of bits taken to represe* + * nt the mv in the stream. The mv bits for all the possible mv are * + * stored in the look up table. The mem record for this look up table * + * is given below. * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MVBITS]; + { + /* max srch range x */ + UWORD32 u4_srch_range_x = ps_ip->s_ive_ip.u4_max_srch_rng_x; + + /* max srch range y */ + UWORD32 u4_srch_range_y = ps_ip->s_ive_ip.u4_max_srch_rng_y; + + /* max srch range */ + UWORD32 u4_max_srch_range = MAX(u4_srch_range_x, u4_srch_range_y); + + /* due to subpel */ + u4_max_srch_range <<= 2; + + /* due to mv on either direction */ + u4_max_srch_range = (u4_max_srch_range << 1); + + /* due to pred mv + zero */ + u4_max_srch_range = (u4_max_srch_range << 1) + 1; + + u4_max_srch_range = ALIGN128(u4_max_srch_range); + + ps_mem_rec->u4_mem_size = u4_max_srch_range; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_MVBITS, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory for SPS * + ***********************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_SPS]; + { + ps_mem_rec->u4_mem_size = MAX_SPS_CNT * sizeof(sps_t); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SPS, 
ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory for PPS * + ***********************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_PPS]; + { + ps_mem_rec->u4_mem_size = MAX_PPS_CNT * sizeof(pps_t); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PPS, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory for Slice Header * + ***********************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_SLICE_HDR]; + { + ps_mem_rec->u4_mem_size = MAX_CTXT_SETS * MAX_SLICE_HDR_CNT + * sizeof(slice_header_t); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SLICE_HDR, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory for Adaptive Intra Refresh * + ***********************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_AIR_MAP]; + { + /* total size of the mem record */ + WORD32 total_size = 0; + + /* intra coded map */ + total_size += max_mb_cnt; + total_size *= MAX_CTXT_SETS; + + /* mb refresh map */ + total_size += sizeof(UWORD16) * max_mb_cnt; + + /* alignment */ + total_size = ALIGN128(total_size); + + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_AIR_MAP, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * In multi slice encoding, this memory record helps tracking the start* + * of slice with reference to mb. * + * MEM RECORD for holding * + * 1. 
mb slice map * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_SLICE_MAP]; + { + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in bytes to slice index of all mbs of a frame */ + total_size = ALIGN64(max_mb_cnt); + + /* total size per each proc ctxt */ + total_size *= MAX_CTXT_SETS; + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_SLICE_MAP, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory to hold thread handles for each processing thread * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_THREAD_HANDLE]; + { + WORD32 handle_size = ithread_get_handle_size(); + + ps_mem_rec->u4_mem_size = MAX_PROCESS_THREADS * handle_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_THREAD_HANDLE, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory to hold mutex for control calls * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_CTL_MUTEX]; + { + ps_mem_rec->u4_mem_size = ithread_get_mutex_lock_size(); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_CTL_MUTEX, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory to hold mutex for entropy calls * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTROPY_MUTEX]; + { + ps_mem_rec->u4_mem_size = ithread_get_mutex_lock_size(); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_ENTROPY_MUTEX, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory to hold process jobs * + 
***********************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_JOBQ]; + { + /* One process job per row of MBs */ + /* Allocate for two pictures, so that wrap around can be handled easily */ + WORD32 num_jobs = max_mb_rows * 2; + + WORD32 job_queue_size = ih264_list_size(num_jobs, sizeof(job_t)); + + ps_mem_rec->u4_mem_size = job_queue_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PROC_JOBQ, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory to hold entropy jobs * + ***********************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTROPY_JOBQ]; + { + /* One entropy job per row of MBs (sized like the process job queue) */ + /* Allocate for two pictures, so that wrap around can be handled easily */ + WORD32 num_jobs = max_mb_rows * 2; + + WORD32 job_queue_size = ih264_list_size(num_jobs, sizeof(job_t)); + + ps_mem_rec->u4_mem_size = job_queue_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_ENTROPY_JOBQ, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * In multi core encoding, each row is assumed to be launched on a * + * thread. The rows below can only start after its neighbors are coded * + * The status of an mb coded/uncoded is signaled via proc map. * + * MEM RECORD for holding * + * 1. 
mb proc map (mb status core coded/uncoded) * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_MAP]; + { + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in bytes to mb core coding status of an entire frame */ + total_size = max_mb_cnt; + + /* add an additional 1 row of bytes to evade the special case of row 0 */ + total_size += max_mb_cols; + + /* total size per each proc ctxt */ + total_size *= MAX_CTXT_SETS; + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PROC_MAP, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * mem record for holding a particular MB is deblocked or not * + * 1. mb deblk map (mb status deblocked/not deblocked) * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_DBLK_MAP]; + { + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in bytes to mb core coding status of an entire frame */ + total_size = max_mb_cnt; + + /* add an additional 1 row of bytes to evade the special case of row 0 */ + total_size += max_mb_cols; + + total_size = ALIGN64(total_size); + + /* total size per each proc ctxt */ + total_size *= MAX_CTXT_SETS; + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_DBLK_MAP, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * mem record for holding a particular MB's me is done or not * + * 1. 
mb me map * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_ME_MAP]; + { + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in bytes to mb core coding status of an entire frame */ + total_size = max_mb_cnt; + + /* add an additional 1 row of bytes to evade the special case of row 0 */ + total_size += max_mb_cols; + + /* total size per each proc ctxt */ + total_size *= MAX_CTXT_SETS; + + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_ME_MAP, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * size for holding dpb manager context * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_DPB_MGR]; + { + ps_mem_rec->u4_mem_size = sizeof(dpb_mgr_t); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_DPB_MGR, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * luma or chroma core coding involves mb estimation, error computation* + * between the estimated singnal and the actual signal, transform the * + * error, quantize the error, then inverse transform and inverse quant * + * ize the residue and add the result back to estimated signal. * + * To perform all these, a set of temporary buffers are needed. * + * MEM RECORD for holding scratch buffers * + * 1. prediction buffer used during mb mode analysis * + * 2 temp. reference buffer when intra 4x4 with rdopt on is * + * enabled * + * - when intra 4x4 is enabled, rdopt is on, to store the * + * reconstructed values and use them later this temp. buffer * + * is used. * + * 3. prediction buffer used during intra mode analysis * + * 4. prediction buffer used during intra 16x16 plane mode * + * analysis + * 5. prediction buffer used during intra chroma mode analysis * + * 6. 
prediction buffer used during intra chroma 16x16 plane * + * mode analysis + * 7. forward transform output buffer * + * - to store the error between estimated and the actual inp * + * ut and to store the fwd transformed quantized output * + * 8. forward transform output buffer * + * - when intra 4x4 is enabled, rdopt is on, to store the * + * fwd transform values and use them later this temp. buffer * + * is used. * + * 9. temporary buffer for inverse transform * + * - temporary buffer used in inverse transform and inverse * + * quantization * + * A. Buffers for holding half_x , half_y and half_xy planes * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_SCRATCH]; + { + WORD32 total_size = 0; + + /* size to hold prediction buffer */ + total_size += sizeof(UWORD8) * 16 * 16; + total_size = ALIGN64(total_size); + + /* size to hold recon for intra 4x4 buffer */ + total_size += sizeof(UWORD8) * 16 * 16; + total_size = ALIGN64(total_size); + + /* prediction buffer intra 16x16 */ + total_size += sizeof(UWORD8) * 16 * 16; + total_size = ALIGN64(total_size); + + /* prediction buffer intra 16x16 plane*/ + total_size += sizeof(UWORD8) * 16 * 16; + total_size = ALIGN64(total_size); + + /* prediction buffer intra chroma*/ + total_size += sizeof(UWORD8) * 16 * 8; + total_size = ALIGN64(total_size); + + /* prediction buffer intra chroma plane*/ + total_size += sizeof(UWORD8) * 16 * 8; + total_size = ALIGN64(total_size); + + /* size to hold fwd transform output */ + total_size += sizeof(WORD16) * SIZE_TRANS_BUFF; + total_size = ALIGN64(total_size); + + /* size to hold fwd transform output */ + total_size += sizeof(WORD16) * SIZE_TRANS_BUFF; + total_size = ALIGN64(total_size); + + /* size to hold temporary data during inverse transform */ + total_size += sizeof(WORD32) * SIZE_TMP_BUFF_ITRANS; + total_size = ALIGN64(total_size); + + /* Buffers for holding half_x , half_y and half_xy planes */ + total_size += 
sizeof(UWORD8) * (HP_BUFF_WD * HP_BUFF_HT); + total_size = ALIGN64(total_size); + + total_size += sizeof(UWORD8) * (HP_BUFF_WD * HP_BUFF_HT); + total_size = ALIGN64(total_size); + + total_size += sizeof(UWORD8) * (HP_BUFF_WD * HP_BUFF_HT); + total_size = ALIGN64(total_size); + + /* Allocate for each process thread */ + total_size *= MAX_PROCESS_CTXT; + + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_PROC_SCRATCH, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * When transform_8x8_flag is disabled, the size of a sub block is * + * 4x4 and when the transform_8x8_flag is enabled the size of the sub * + * block is 8x8. The threshold matrix and the forward scaling list * + * is of the size of the sub block. * + * MEM RECORD for holding * + * 1. quantization parameters for plane y, cb, cr * + * - threshold matrix for quantization * + * - forward weight matrix * + * - satqd threshold matrix * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_QUANT_PARAM]; + { + /* total size of the mem record */ + WORD32 total_size = 0; + + /* quantization parameter list for planes y,cb and cr */ + total_size += ALIGN64(sizeof(quant_params_t)) * 3; + + /* size of threshold matrix for quantization + * (assuming the transform_8x8_flag is disabled). + * for all 3 planes */ + total_size += ALIGN64(sizeof(WORD16) * 4 * 4) * 3; + + /* size of forward weight matrix for quantization + * (assuming the transform_8x8_flag is disabled). 
+ * for all 3 planes */ + total_size += ALIGN64(sizeof(WORD16) * 4 * 4) * 3; + + /* Size for SATDQ threshold matrix for planes y, cb and cr */ + total_size += ALIGN64(sizeof(UWORD16) * 9) * 3; + + /* total size per each proc thread */ + total_size *= MAX_PROCESS_CTXT; + + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_QUANT_PARAM, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * While computing blocking strength for the current mb, the csbp, mb * + * type for the neighboring mbs are necessary. memtab for storing top * + * row mbtype and csbp is evaluated here. * + * * + * when encoding intra 4x4 or intra 8x8 the submb types are estimated * + * and sent. The estimation is dependent on neighbor mbs. For this * + * store the top row sub mb types for intra mbs * + * * + * During motion vector prediction, the curr mb mv is predicted from * + * neighbors left, top, top right and sometimes top left depending on * + * the availability. The top and top right content is accessed from * + * the memtab specified below. 
* + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_TOP_ROW_SYN_INFO]; + { + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in bytes to store 1 row of mb_info_t */ + /* one additional mb, to avoid checking end of row condition */ + total_size += (max_mb_cols + 1) * sizeof(mb_info_t); + + /* size in bytes to store 1 row of intra macroblock sub modes */ + total_size += max_mb_cols * sizeof(UWORD8) * 16; + + /* size in bytes to store 1 row + 1 of enc_pu_t */ + /* one additional mb, to avoid checking end of row condition */ + total_size += (max_mb_cols + 1) * sizeof(enc_pu_t); + + /* total size per proc ctxt */ + total_size = ALIGN128(total_size); + + /* total size per each proc ctxt */ + total_size *= MAX_CTXT_SETS; + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_TOP_ROW_SYN_INFO, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * When transform_8x8_flag is enabled, the mb is partitioned into * + * 4 sub blocks. This corresponds to 1 vertical left edge and 1 * + * vertical inner edge, 1 horizontal top edge and 1 horizontal * + * inner edge per mb. Further, when transform_8x8_flag is disabled, * + * the mb is partitioned in to 16 sub blocks. This corresponds to * + * 1 vertical left edge and 3 vertical inner edges, 1 horizontal top * + * edge and 3 horizontal inner edges per mb. The sizes below assume * + * transform_8x8_flag = 0, i.e. 4 edges per direction per mb. * + * MEM RECORD for holding * + * 1. vertical edge blocking strength * + * 2. horizontal edge blocking strength * + * 3. 
mb qp * + * all are frame level * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_BS_QP]; + { + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in bytes to store vertical edge bs, horizontal edge bs and qp of every mb*/ + WORD32 vert_bs_size, horz_bs_size, qp_size; + + /* vertical edge bs = total number of vertical edges * number of bytes per each edge */ + /* total num of v edges = total mb * 4 (assuming transform_8x8_flag = 0), + * each edge is formed by 4 pairs of subblks, requiring 4 bytes to store bs */ + vert_bs_size = ALIGN64(max_mb_cnt * 4 * 4); + + /* horizontal edge bs = total number of horizontal edges * number of bytes per each edge */ + /* total num of h edges = total mb * 4 (assuming transform_8x8_flag = 0), + * each edge is formed by 4 pairs of subblks, requiring 4 bytes to store bs */ + horz_bs_size = ALIGN64(max_mb_cnt * 4 * 4); + + /* qp of each mb requires 1 byte */ + qp_size = ALIGN64(max_mb_cnt); + + /* total size */ + total_size = vert_bs_size + horz_bs_size + qp_size; + + /* total size per each proc ctxt */ + total_size *= MAX_CTXT_SETS; + + ps_mem_rec->u4_mem_size = total_size; + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_BS_QP, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * size for holding buffer manager context (MEM_REC_INP_PIC) * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_INP_PIC]; + { + ps_mem_rec->u4_mem_size = ih264_buf_mgr_size(); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_INP_PIC, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * size for holding buffer manager context (MEM_REC_OUT) * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_OUT]; + { + ps_mem_rec->u4_mem_size = ih264_buf_mgr_size(); + } + 
DEBUG("\nMemory record Id %d = %d \n", MEM_REC_OUT, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Size for color space conversion * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_CSC]; + { + /* We need a total a memory for a single frame of 420 sp, ie + * (wd * ht) for luma and (wd * ht / 2) for chroma*/ + ps_mem_rec->u4_mem_size = MAX_CTXT_SETS + * ((3 * max_ht_luma * max_wd_luma) >> 1); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_CSC, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Size for holding pic_buf_t for each reference picture * + * Note this allocation is done for BUF_MGR_MAX_CNT instead of * + * MAX_DPB_SIZE or max_dpb_size for following reasons * + * max_dpb_size will be based on max_wd and max_ht * + * For higher max_wd and max_ht this number will be smaller than * + * MAX_DPB_SIZE But during actual initialization number of buffers * + * allocated can be more. * + * * + * Also to handle display depth application can allocate more than * + * what codec asks for in case of non-shared mode * + * Since this is only a structure allocation and not actual buffer * + * allocation, it is allocated for BUF_MGR_MAX_CNT entries * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_REF_PIC]; + { + ps_mem_rec->u4_mem_size = ih264_buf_mgr_size(); + ps_mem_rec->u4_mem_size += BUF_MGR_MAX_CNT * sizeof(pic_buf_t); + + /************************************************************************ + * Note: Number of luma samples is not max_wd * max_ht here, instead it * + * is set to maximum number of luma samples allowed at the given level. * + * This is done to ensure that any stream with width and height lesser * + * than max_wd and max_ht is supported. 
Number of buffers required can * + * be greater for lower width and heights at a given level and this * + * increased number of buffers might require more memory than what * + * max_wd and max_ht buffer would have required. Number of buffers is * + * doubled in order to return one frame at a time instead of sending * + * multiple outputs during dpb full case. Also note one extra buffer is * + * allocted to store current picture. * + * * + * Half-pel planes for each reference buffer are allocated along with * + * the reference buffer. So each reference buffer is 4 times the * + * required size. This way buffer management for the half-pel planes is * + * easier and while using the half-pel planes in MC, an offset can be * + * used from a single pointer * + ***********************************************************************/ + ps_mem_rec->u4_mem_size += HPEL_PLANES_CNT + * ih264e_get_total_pic_buf_size( + max_wd_luma * max_ht_luma, level, + PAD_WD, PAD_HT, num_ref_frames, + num_reorder_frames); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_REF_PIC, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * Request memory to hold mem recs to be returned during retrieve call * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_BACKUP]; + { + ps_mem_rec->u4_mem_size = MEM_REC_CNT * sizeof(iv_mem_rec_t); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_BACKUP, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * size for memory required by NMB info structs and buffer for storing * + * half pel plane * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MB_INFO_NMB]; + { + ps_mem_rec->u4_mem_size = MAX_PROCESS_CTXT * MAX_NMB + * (sizeof(mb_info_nmb_t) + + MB_SIZE * MB_SIZE * sizeof(UWORD8)); + } + DEBUG("\nMemory record Id %d = 
%d \n", MEM_REC_MB_INFO_NMB, ps_mem_rec->u4_mem_size); + + /************************************************************************ + * RC mem records * + ************************************************************************/ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_RC]; + { + ih264e_get_rate_control_mem_tab(NULL, ps_mem_rec, FILL_MEMTAB); + } + DEBUG("\nMemory record Id %d = %d \n", MEM_REC_RC, ps_mem_rec->u4_mem_size); + + /* Each memtab size is aligned to next multiple of 128 bytes */ + /* This is to ensure all the memtabs start at different cache lines */ + ps_mem_rec = ps_mem_rec_base; + for (i = 0; i < MEM_REC_CNT; i++) + { + ps_mem_rec->u4_mem_size = ALIGN128(ps_mem_rec->u4_mem_size); + ps_mem_rec++; + } + + ps_op->s_ive_op.u4_num_mem_rec = MEM_REC_CNT; + + DEBUG("Num mem recs in fill call : %d\n", ps_op->s_ive_op.u4_num_mem_rec); + + return (status); +} + +/** +******************************************************************************* +* +* @brief +* Initializes from mem records passed to the codec +* +* @par Description: +* Initializes pointers based on mem records passed +* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static WORD32 ih264e_init_mem_rec(iv_obj_t *ps_codec_obj, + void *pv_api_ip, + void *pv_api_op) +{ + /* api call I/O structures */ + ih264e_init_ip_t *ps_ip = pv_api_ip; + ih264e_init_op_t *ps_op = pv_api_op; + + /* mem records */ + iv_mem_rec_t *ps_mem_rec_base, *ps_mem_rec; + + /* codec variables */ + codec_t * ps_codec; + cfg_params_t *ps_cfg; + + /* frame dimensions */ + WORD32 max_wd_luma, max_ht_luma; + WORD32 max_mb_rows, max_mb_cols, max_mb_cnt; + + /* temp var */ + WORD32 i; + WORD32 status = IV_SUCCESS; + + /* frame dimensions */ + 
max_ht_luma = ALIGN16(ps_ip->s_ive_ip.u4_max_ht); + max_wd_luma = ALIGN16(ps_ip->s_ive_ip.u4_max_wd); + max_mb_rows = max_ht_luma / MB_SIZE; + max_mb_cols = max_wd_luma / MB_SIZE; + max_mb_cnt = max_mb_rows * max_mb_cols; + + /* mem records */ + ps_mem_rec_base = ps_ip->s_ive_ip.ps_mem_rec; + + /* Init mem records */ + ps_mem_rec = &ps_mem_rec_base[MEM_REC_CODEC]; + { + ps_codec_obj->pv_codec_handle = ps_mem_rec->pv_base; + ps_codec = (codec_t *) (ps_codec_obj->pv_codec_handle); + } + + /* Note this memset can not be done in init() call, since init will called + during reset as well. And calling this during reset will mean all pointers + need to reinitialized */ + memset(ps_codec, 0, sizeof(codec_t)); + + /* Set default Config Params */ + ps_cfg = &ps_codec->s_cfg; + ih264e_set_default_params(ps_cfg); + + /* Update config params as per input */ + ps_cfg->u4_max_wd = ALIGN16(ps_ip->s_ive_ip.u4_max_wd); + ps_cfg->u4_max_ht = ALIGN16(ps_ip->s_ive_ip.u4_max_ht); + ps_cfg->i4_wd_mbs = ps_cfg->u4_max_wd >> 4; + ps_cfg->i4_ht_mbs = ps_cfg->u4_max_ht >> 4; + ps_cfg->u4_max_ref_cnt = ps_ip->s_ive_ip.u4_max_ref_cnt; + ps_cfg->u4_max_reorder_cnt = ps_ip->s_ive_ip.u4_max_reorder_cnt; + ps_cfg->u4_max_level = ps_ip->s_ive_ip.u4_max_level; + ps_cfg->e_inp_color_fmt = ps_ip->s_ive_ip.e_inp_color_fmt; + ps_cfg->e_recon_color_fmt = ps_ip->s_ive_ip.e_recon_color_fmt; + ps_cfg->u4_max_framerate = ps_ip->s_ive_ip.u4_max_framerate; + ps_cfg->u4_max_bitrate = ps_ip->s_ive_ip.u4_max_bitrate; + ps_cfg->u4_max_num_bframes = ps_ip->s_ive_ip.u4_max_num_bframes; + ps_cfg->e_content_type = ps_ip->s_ive_ip.e_content_type; + ps_cfg->u4_max_srch_rng_x = ps_ip->s_ive_ip.u4_max_srch_rng_x; + ps_cfg->u4_max_srch_rng_y = ps_ip->s_ive_ip.u4_max_srch_rng_y; + ps_cfg->e_slice_mode = ps_ip->s_ive_ip.e_slice_mode; + ps_cfg->u4_slice_param = ps_ip->s_ive_ip.u4_slice_param; + ps_cfg->e_arch = ps_ip->s_ive_ip.e_arch; + ps_cfg->e_soc = ps_ip->s_ive_ip.e_soc; + ps_cfg->u4_enable_recon = 
ps_ip->s_ive_ip.u4_enable_recon; + ps_cfg->e_rc_mode = ps_ip->s_ive_ip.e_rc_mode; + + /* Validate params */ + if ((ps_ip->s_ive_ip.u4_max_level < MIN_LEVEL) + || (ps_ip->s_ive_ip.u4_max_level > MAX_LEVEL)) + { + ps_op->s_ive_op.u4_error_code |= IH264E_CODEC_LEVEL_NOT_SUPPORTED; + ps_cfg->u4_max_level = DEFAULT_MAX_LEVEL; + } + + if (ps_ip->s_ive_ip.u4_max_ref_cnt > MAX_REF_CNT) + { + ps_op->s_ive_op.u4_error_code |= IH264E_NUM_REF_UNSUPPORTED; + ps_cfg->u4_max_ref_cnt = MAX_REF_CNT; + } + + if (ps_ip->s_ive_ip.u4_max_reorder_cnt > MAX_REF_CNT) + { + ps_op->s_ive_op.u4_error_code |= IH264E_NUM_REORDER_UNSUPPORTED; + ps_cfg->u4_max_reorder_cnt = MAX_REF_CNT; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_BACKUP]; + { + ps_codec->ps_mem_rec_backup = (iv_mem_rec_t *) ps_mem_rec->pv_base; + + memcpy(ps_codec->ps_mem_rec_backup, ps_mem_rec_base, + MEM_REC_CNT * sizeof(iv_mem_rec_t)); + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTROPY]; + { + /* temp var */ + WORD32 size = 0, offset; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + /* base ptr */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* reset size */ + size = 0; + + /* skip mb run */ + ps_codec->as_process[i].s_entropy.pi4_mb_skip_run = + (void *) (pu1_buf + size); + size += sizeof(WORD32); + size = ALIGN8(size); + + /* entropy map */ + ps_codec->as_process[i].s_entropy.pu1_entropy_map = + (void *) (pu1_buf + size + max_mb_cols); + /* size in bytes to store entropy status of an entire frame */ + size += (max_mb_cols * max_mb_rows); + /* add an additional 1 row of bytes to evade the special case of row 0 */ + size += max_mb_cols; + size = ALIGN128(size); + + /* bit stream ptr */ + ps_codec->as_process[i].s_entropy.ps_bitstrm = (void *) (pu1_buf + + size); + size += sizeof(bitstrm_t); + size = ALIGN128(size); + + /* nnz luma */ + ps_codec->as_process[i].s_entropy.pu1_top_nnz_luma = + (void *) (pu1_buf + size); + size += (max_mb_cols * 4 * sizeof(UWORD8)); + size = ALIGN128(size); 
+ + /* nnz chroma */ + ps_codec->as_process[i].s_entropy.pu1_top_nnz_cbcr = + (void *) (pu1_buf + size); + size += (max_mb_cols * 4 * sizeof(UWORD8)); + size = ALIGN128(size); + offset = size; + } + else + { + /* base ptr */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* reset size */ + size = offset; + + /* skip mb run */ + ps_codec->as_process[i].s_entropy.pi4_mb_skip_run = + (void *) (pu1_buf + size); + size += sizeof(WORD32); + size = ALIGN8(size); + + /* entropy map */ + ps_codec->as_process[i].s_entropy.pu1_entropy_map = + (void *) (pu1_buf + size + max_mb_cols); + /* size in bytes to store entropy status of an entire frame */ + size += (max_mb_cols * max_mb_rows); + /* add an additional 1 row of bytes to evade the special case of row 0 */ + size += max_mb_cols; + size = ALIGN128(size); + + /* bit stream ptr */ + ps_codec->as_process[i].s_entropy.ps_bitstrm = (void *) (pu1_buf + + size); + size += sizeof(bitstrm_t); + size = ALIGN128(size); + + /* nnz luma */ + ps_codec->as_process[i].s_entropy.pu1_top_nnz_luma = + (void *) (pu1_buf + size); + size += (max_mb_cols * 4 * sizeof(UWORD8)); + size = ALIGN128(size); + + /* nnz chroma */ + ps_codec->as_process[i].s_entropy.pu1_top_nnz_cbcr = + (void *) (pu1_buf + size); + size += (max_mb_cols * 4 * sizeof(UWORD8)); + size = ALIGN128(size); + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MB_COEFF_DATA]; + { + /* temp var */ + WORD32 size = 0, size_of_row; + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* size of coeff data of 1 mb */ + size += sizeof(tu_sblk_coeff_data_t) * MAX_4x4_SUBBLKS; + + /* size of coeff data of 1 row of mb's */ + size *= max_mb_cols; + + /* align to avoid false sharing */ + size = ALIGN64(size); + size_of_row = size; + + /* size for one full frame */ + size *= max_mb_rows; + + ps_codec->u4_size_coeff_data = size_of_row; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + ps_codec->as_process[i].pv_pic_mb_coeff_data = pu1_buf; + 
ps_codec->as_process[i].s_entropy.pv_pic_mb_coeff_data = + pu1_buf; + } + else + { + ps_codec->as_process[i].pv_pic_mb_coeff_data = pu1_buf + size; + ps_codec->as_process[i].s_entropy.pv_pic_mb_coeff_data = pu1_buf + + size; + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MB_HEADER_DATA]; + { + /* temp var */ + WORD32 size, size_of_row; + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* size of header data of 1 mb */ + size = 40; + + /* size for 1 row of mbs */ + size = size * max_mb_cols; + + /* align to avoid any false sharing across threads */ + size = ALIGN64(size); + size_of_row = size; + + /* size for one full frame */ + size *= max_mb_rows; + + ps_codec->u4_size_header_data = size_of_row; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + ps_codec->as_process[i].pv_pic_mb_header_data = pu1_buf; + ps_codec->as_process[i].s_entropy.pv_pic_mb_header_data = + pu1_buf; + } + else + { + ps_codec->as_process[i].pv_pic_mb_header_data = pu1_buf + size; + ps_codec->as_process[i].s_entropy.pv_pic_mb_header_data = + pu1_buf + size; + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MVBANK]; + { + /* size of buf mgr struct */ + WORD32 size = ih264_buf_mgr_size(); + + /* temp var */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* mv buffer mgr */ + ps_codec->pv_mv_buf_mgr_base = pu1_buf; + + /* mv bank */ + ps_codec->pv_mv_bank_buf_base = pu1_buf + size; + ps_codec->i4_total_mv_bank_size = ps_mem_rec->u4_mem_size - size; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MVBITS]; + { + /* max srch range x */ + UWORD32 u4_srch_range_x = ps_ip->s_ive_ip.u4_max_srch_rng_x; + + /* max srch range y */ + UWORD32 u4_srch_range_y = ps_ip->s_ive_ip.u4_max_srch_rng_y; + + /* max srch range */ + UWORD32 u4_max_srch_range = MAX(u4_srch_range_x, u4_srch_range_y); + + /* temp var */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* due to subpel */ + u4_max_srch_range <<= 2; + +// /* due to mv on either direction */ +// u4_max_srch_range = 
(u4_max_srch_range << 1); + + /* due to pred mv + zero */ + u4_max_srch_range = (u4_max_srch_range << 1) + 1; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + /* me ctxt */ + me_ctxt_t *ps_mem_ctxt = &(ps_codec->as_process[i].s_me_ctxt); + + /* init at zero mv */ + ps_mem_ctxt->pu1_mv_bits = pu1_buf + u4_max_srch_range; + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_SPS]; + { + ps_codec->ps_sps_base = (sps_t *) ps_mem_rec->pv_base; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_PPS]; + { + ps_codec->ps_pps_base = (pps_t *) ps_mem_rec->pv_base; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_SLICE_HDR]; + { + ps_codec->ps_slice_hdr_base = ps_mem_rec->pv_base; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + ps_codec->as_process[i].ps_slice_hdr_base = ps_mem_rec->pv_base; + } + else + { + /* temp var */ + WORD32 size = MAX_SLICE_HDR_CNT * sizeof(slice_header_t); + void *pv_buf = (UWORD8 *) ps_mem_rec->pv_base + size; + + ps_codec->as_process[i].ps_slice_hdr_base = pv_buf; + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_AIR_MAP]; + { + /* temp var */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + ps_codec->as_process[i].pu1_is_intra_coded = pu1_buf; + } + else + { + ps_codec->as_process[i].pu1_is_intra_coded = pu1_buf + + max_mb_cnt; + } + } + + ps_codec->pu2_intr_rfrsh_map = (UWORD16 *) (pu1_buf + max_mb_cnt * 2); + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_SLICE_MAP]; + { + /* pointer to storage space */ + UWORD8 *pu1_buf_ping, *pu1_buf_pong; + + /* init pointer */ + pu1_buf_ping = ps_mem_rec->pv_base; + pu1_buf_pong = pu1_buf_ping + ALIGN64(max_mb_cnt); + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + ps_codec->as_process[i].pu1_slice_idx = pu1_buf_ping; + } + else + { + ps_codec->as_process[i].pu1_slice_idx = pu1_buf_pong; + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_THREAD_HANDLE]; + { + WORD32 
handle_size = ithread_get_handle_size(); + + for (i = 0; i < MAX_PROCESS_THREADS; i++) + { + ps_codec->apv_proc_thread_handle[i] = (UWORD8 *) ps_mem_rec->pv_base + + (i * handle_size); + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_CTL_MUTEX]; + { + ps_codec->pv_ctl_mutex = ps_mem_rec->pv_base; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTROPY_MUTEX]; + { + ps_codec->pv_entropy_mutex = ps_mem_rec->pv_base; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_JOBQ]; + { + ps_codec->pv_proc_jobq_buf = ps_mem_rec->pv_base; + ps_codec->i4_proc_jobq_buf_size = ps_mem_rec->u4_mem_size; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_ENTROPY_JOBQ]; + { + ps_codec->pv_entropy_jobq_buf = ps_mem_rec->pv_base; + ps_codec->i4_entropy_jobq_buf_size = ps_mem_rec->u4_mem_size; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_MAP]; + { + /* pointer to storage space */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in bytes to mb core coding status of an entire frame */ + total_size = max_mb_cnt; + + /* add an additional 1 row of bytes to evade the special case of row 0 */ + total_size += max_mb_cols; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + ps_codec->as_process[i].pu1_proc_map = pu1_buf + max_mb_cols; + } + else + { + ps_codec->as_process[i].pu1_proc_map = pu1_buf + total_size + + max_mb_cols; + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_DBLK_MAP]; + { + /* pointer to storage space */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in bytes to mb core coding status of an entire frame */ + total_size = max_mb_cnt; + + /* add an additional 1 row of bytes to evade the special case of row 0 */ + total_size += max_mb_cols; + + /*Align the memory offsets*/ + total_size = ALIGN64(total_size); + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + 
ps_codec->as_process[i].pu1_deblk_map = pu1_buf + max_mb_cols; + + } + else + { + ps_codec->as_process[i].pu1_deblk_map = pu1_buf + total_size + + max_mb_cols; + + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_ME_MAP]; + { + /* pointer to storage space */ + UWORD8 *pu1_buf = (UWORD8 *) ps_mem_rec->pv_base; + + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in bytes to mb core coding status of an entire frame */ + total_size = max_mb_cnt; + + /* add an additional 1 row of bytes to evade the special case of row 0 */ + total_size += max_mb_cols; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + ps_codec->as_process[i].pu1_me_map = pu1_buf + max_mb_cols; + } + else + { + ps_codec->as_process[i].pu1_me_map = pu1_buf + total_size + + max_mb_cols; + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_DPB_MGR]; + { + ps_codec->pv_dpb_mgr = ps_mem_rec->pv_base; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_PROC_SCRATCH]; + { + /* pointer to storage space */ + UWORD8 *pu1_buf = (UWORD8 *) ps_mem_rec->pv_base; + + /* size of pred buffer, fwd transform output, temp buffer for inv tra */ + WORD32 size_pred_luma, size_pred_chroma, size_fwd, size_inv, size_hp; + + /* temp var */ + WORD32 size = 0; + + /* size to hold intra/inter prediction buffer */ + size_pred_luma = sizeof(UWORD8) * 16 * 16; + size_pred_chroma = sizeof(UWORD8) * 8 * 16; + + /* size to hold fwd transform output */ + size_fwd = sizeof(WORD16) * SIZE_TRANS_BUFF; + + /* size to hold temporary data during inverse transform */ + size_inv = sizeof(WORD32) * SIZE_TMP_BUFF_ITRANS; + + /* size to hold half pel plane buffers */ + size_hp = sizeof(UWORD8) * (HP_BUFF_WD * HP_BUFF_HT); + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + /* prediction buffer */ + ps_codec->as_process[i].pu1_pred_mb = (void *) (pu1_buf + size); + ps_codec->as_process[i].i4_pred_strd = 16; + size += size_pred_luma; + size = ALIGN64(size); + + /* prediction buffer */ + 
ps_codec->as_process[i].pu1_ref_mb_intra_4x4 = (void *) (pu1_buf + + size); + size += size_pred_luma; + size = ALIGN64(size); + + /* prediction buffer intra 16x16 */ + ps_codec->as_process[i].pu1_pred_mb_intra_16x16 = (void *) (pu1_buf + + size); + size += size_pred_luma; + size = ALIGN64(size); + + /* prediction buffer intra 16x16 plane*/ + ps_codec->as_process[i].pu1_pred_mb_intra_16x16_plane = + (void *) (pu1_buf + size); + size += size_pred_luma; + size = ALIGN64(size); + + /* prediction buffer intra chroma*/ + ps_codec->as_process[i].pu1_pred_mb_intra_chroma = (void *) (pu1_buf + + size); + size += size_pred_chroma; + size = ALIGN64(size); + + /* prediction buffer intra chroma plane*/ + ps_codec->as_process[i].pu1_pred_mb_intra_chroma_plane = + (void *) (pu1_buf + size); + size += size_pred_chroma; + size = ALIGN64(size); + + /* Fwd transform output */ + ps_codec->as_process[i].pi2_res_buf = (void *) (pu1_buf + size); + ps_codec->as_process[i].i4_res_strd = 16; + size += size_fwd; + size = ALIGN64(size); + + /* Fwd transform output */ + ps_codec->as_process[i].pi2_res_buf_intra_4x4 = (void *) (pu1_buf + + size); + size += size_fwd; + size = ALIGN64(size); + + /* scratch buffer used during inverse transform */ + ps_codec->as_process[i].pv_scratch_buff = (void *) (pu1_buf + size); + size += size_inv; + size = ALIGN64(size); + + /* Buffers for holding half_x , half_y and half_xy values */ + ps_codec->as_process[i].pu1_half_x = (void *) (pu1_buf + size); + size += size_hp; + size = ALIGN64(size); + + ps_codec->as_process[i].pu1_half_y = (void *) (pu1_buf + size); + size += size_hp; + size = ALIGN64(size); + + ps_codec->as_process[i].pu1_half_xy = (void *) (pu1_buf + size); + size += size_hp; + size = ALIGN64(size); + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_QUANT_PARAM]; + { + /* pointer to storage space */ + UWORD8 *pu1_buf = (UWORD8 *) ps_mem_rec->pv_base; + + /* size of qp, threshold matrix, fwd scaling list for one plane */ + WORD32 size_quant_param, 
size_thres_mat, size_fwd_weight_mat, + size_satqd_weight_mat; + + /* temp var */ + WORD32 total_size = 0; + + /* size of quantization parameter list of 1 plane */ + size_quant_param = ALIGN64(sizeof(quant_params_t)); + + /* size of threshold matrix for quantization + * (assuming the transform_8x8_flag is disabled). + * for 1 plane */ + size_thres_mat = ALIGN64(sizeof(WORD16) * 4 * 4); + + /* size of forward weight matrix for quantization + * (assuming the transform_8x8_flag is disabled). + * for 1 plane */ + size_fwd_weight_mat = ALIGN64(sizeof(WORD16) * 4 * 4); + + /* size of SATQD matrix*/ + size_satqd_weight_mat = ALIGN64(sizeof(UWORD16) * 9); + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + quant_params_t **ps_qp_params = ps_codec->as_process[i].ps_qp_params; + + /* quantization param structure */ + ps_qp_params[0] = (quant_params_t *) (pu1_buf + total_size); + total_size = total_size + size_quant_param; + ps_qp_params[1] = (quant_params_t *) (pu1_buf + total_size); + total_size = total_size + size_quant_param; + ps_qp_params[2] = (quant_params_t *) (pu1_buf + total_size); + total_size = total_size + size_quant_param; + + /* threshold matrix for quantization */ + ps_qp_params[0]->pu2_thres_mat = (void *) (pu1_buf + total_size); + total_size = total_size + size_thres_mat; + ps_qp_params[1]->pu2_thres_mat = (void *) (pu1_buf + total_size); + total_size = total_size + size_thres_mat; + ps_qp_params[2]->pu2_thres_mat = (void *) (pu1_buf + total_size); + total_size = total_size + size_thres_mat; + + /* fwd weight matrix */ + ps_qp_params[0]->pu2_weigh_mat = (void *) (pu1_buf + total_size); + total_size = total_size + size_fwd_weight_mat; + ps_qp_params[1]->pu2_weigh_mat = (void *) (pu1_buf + total_size); + total_size = total_size + size_fwd_weight_mat; + ps_qp_params[2]->pu2_weigh_mat = (void *) (pu1_buf + total_size); + total_size = total_size + size_fwd_weight_mat; + + /* threshold matrix for SATQD */ + ps_qp_params[0]->pu2_sad_thrsh = (void *) (pu1_buf + 
total_size); + total_size = total_size + size_satqd_weight_mat; + ps_qp_params[1]->pu2_sad_thrsh = (void *) (pu1_buf + total_size); + total_size = total_size + size_satqd_weight_mat; + ps_qp_params[2]->pu2_sad_thrsh = (void *) (pu1_buf + total_size); + total_size = total_size + size_satqd_weight_mat; + + total_size = ALIGN128(total_size); + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_TOP_ROW_SYN_INFO]; + { + /* total size of the mem record */ + WORD32 total_size = 0, size_csbp, size_intra_modes, size_mv; + + /* pointer to buffer */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* size in bytes to store 1 row of mb_info_t */ + /* one additional mb, to avoid checking end of row condition */ + size_csbp = (max_mb_cols + 1) * sizeof(mb_info_t); + + /* size in bytes to store 1 row of intra macroblock sub modes */ + size_intra_modes = max_mb_cols * sizeof(UWORD8) * 16; + + /* size in bytes to store 1 row + 1 of enc_pu_t */ + /* one additional mb, to avoid checking end of row condition */ + size_mv = (max_mb_cols + 1) * sizeof(enc_pu_t); + + /* total size per proc ctxt */ + total_size = size_csbp + size_intra_modes + size_mv; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + ps_codec->as_process[i].ps_top_row_mb_syntax_ele_base = + (mb_info_t *) pu1_buf; + ps_codec->as_process[i].pu1_top_mb_intra_modes_base = pu1_buf + + size_csbp; + ps_codec->as_process[i].ps_top_row_pu_base = + (enc_pu_t *) (pu1_buf + size_csbp + + size_intra_modes); + } + else + { + ps_codec->as_process[i].ps_top_row_mb_syntax_ele_base = + (mb_info_t *) (pu1_buf + total_size); + ps_codec->as_process[i].pu1_top_mb_intra_modes_base = pu1_buf + + total_size + size_csbp; + ps_codec->as_process[i].ps_top_row_pu_base = + (enc_pu_t *) (pu1_buf + total_size + size_csbp + + size_intra_modes); + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_BS_QP]; + { + UWORD8 *pu1_buf_ping, *pu1_buf_pong; + + /* total size of the mem record */ + WORD32 total_size = 0; + + /* size in 
bytes to store vertical edge bs, horizontal edge bs and qp of every mb*/ + WORD32 vert_bs_size, horz_bs_size, qp_size; + + /* vertical edge bs = total number of vertical edges * number of bytes per each edge */ + /* total num of v edges = total mb * 4 (assuming transform_8x8_flag = 0), + * each edge is formed by 4 pairs of subblks, requiring 4 bytes to storing bs */ + vert_bs_size = ALIGN64(max_mb_cnt * 4 * 4); + + /* horizontal edge bs = total number of horizontal edges * number of bytes per each edge */ + /* total num of h edges = total mb * 4 (assuming transform_8x8_flag = 0), + * each edge is formed by 4 pairs of subblks, requiring 4 bytes to storing bs */ + horz_bs_size = ALIGN64(max_mb_cnt * 4 * 4); + + /* qp of each mb requires 1 byte */ + qp_size = ALIGN64(max_mb_cnt); + + /* total size */ + total_size = vert_bs_size + horz_bs_size + qp_size; + + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + if (i < MAX_PROCESS_CTXT / 2) + { + pu1_buf_ping = (UWORD8 *) ps_mem_rec->pv_base; + + /* vertical edge bs storage space */ + ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = + (UWORD32 *) pu1_buf_ping; + pu1_buf_ping += vert_bs_size; + + /* horizontal edge bs storage space */ + ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = + (UWORD32 *) pu1_buf_ping; + pu1_buf_ping += horz_bs_size; + + /* qp */ + ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = + (UWORD8 *) pu1_buf_ping; + pu1_buf_ping += qp_size; + } + else + { + pu1_buf_pong = (UWORD8 *) ps_mem_rec->pv_base; + pu1_buf_pong += total_size; + + /* vertical edge bs storage space */ + ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_vert_bs = + (UWORD32 *) pu1_buf_pong; + pu1_buf_pong += vert_bs_size; + + /* horizontal edge bs storage space */ + ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu4_pic_horz_bs = + (UWORD32 *) pu1_buf_pong; + pu1_buf_pong += horz_bs_size; + + /* qp */ + ps_codec->as_process[i].s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp = + (UWORD8 *) pu1_buf_pong; + 
pu1_buf_pong += qp_size; + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_INP_PIC]; + { + ps_codec->pv_inp_buf_mgr_base = ps_mem_rec->pv_base; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_OUT]; + { + ps_codec->pv_out_buf_mgr_base = ps_mem_rec->pv_base; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_CSC]; + { + ps_codec->pu1_y_csc_buf_base = ps_mem_rec->pv_base; + ps_codec->pu1_uv_csc_buf_base = (UWORD8 *) ps_mem_rec->pv_base + + (max_ht_luma * max_wd_luma); + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_REF_PIC]; + { + /* size of buf mgr struct */ + WORD32 size = ih264_buf_mgr_size(); + + /* temp var */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* pic buffer mgr */ + ps_codec->pv_ref_buf_mgr_base = pu1_buf; + + /* picture bank */ + ps_codec->pv_pic_buf_base = pu1_buf + size; + ps_codec->i4_total_pic_buf_size = ps_mem_rec->u4_mem_size - size; + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_MB_INFO_NMB]; + { + /* temp var */ + UWORD8 *pu1_buf = ps_mem_rec->pv_base; + + /* size of nmb ctxt */ + WORD32 size = MAX_NMB * sizeof(mb_info_nmb_t); + + UWORD32 nmb_cntr, subpel_buf_size; + + /* init nmb info structure pointer in all proc ctxts */ + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + ps_codec->as_process[i].ps_nmb_info = (mb_info_nmb_t *) (pu1_buf); + + pu1_buf += size; + } + + subpel_buf_size = MB_SIZE * MB_SIZE * sizeof(UWORD8); + + /* adjusting pointers for nmb halfpel buffer */ + for (i = 0; i < MAX_PROCESS_CTXT; i++) + { + mb_info_nmb_t* ps_mb_info_nmb = + &ps_codec->as_process[i].ps_nmb_info[0]; + + for (nmb_cntr = 0; nmb_cntr < MAX_NMB; nmb_cntr++) + { + ps_mb_info_nmb[nmb_cntr].pu1_best_sub_pel_buf = pu1_buf; + + pu1_buf = pu1_buf + subpel_buf_size; + + ps_mb_info_nmb[nmb_cntr].u4_bst_spel_buf_strd = MB_SIZE; + } + } + } + + ps_mem_rec = &ps_mem_rec_base[MEM_REC_RC]; + { + ih264e_get_rate_control_mem_tab(&ps_codec->s_rate_control, ps_mem_rec, + USE_BASE); + } + + /* init codec ctxt */ + status = ih264e_init(ps_codec); + + return status; +} + +/** 
+******************************************************************************* +* +* @brief +* Retrieves mem records passed to the codec +* +* @par Description: +* Retrieves mem recs passed during init +* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static WORD32 ih264e_retrieve_memrec(iv_obj_t *ps_codec_obj, + void *pv_api_ip, + void *pv_api_op) +{ + /* codec ctxt */ + codec_t *ps_codec = (codec_t *) ps_codec_obj->pv_codec_handle; + + /* ctrl call I/O structures */ + ih264e_retrieve_mem_rec_ip_t *ps_ip = pv_api_ip; + ih264e_retrieve_mem_rec_op_t *ps_op = pv_api_op; + + if (ps_codec->i4_init_done != 1) + { + ps_op->s_ive_op.u4_error_code |= 1 << IVE_FATALERROR; + ps_op->s_ive_op.u4_error_code |= IH264E_INIT_NOT_DONE; + return IV_FAIL; + } + + /* join threads upon at end of sequence */ + ih264e_join_threads(ps_codec); + + /* collect list of memory records used by the encoder library */ + memcpy(ps_ip->s_ive_ip.ps_mem_rec, ps_codec->ps_mem_rec_backup, + MEM_REC_CNT * (sizeof(iv_mem_rec_t))); + ps_op->s_ive_op.u4_num_mem_rec_filled = MEM_REC_CNT; + + /* clean up mutex memory */ + ih264_list_free(ps_codec->pv_entropy_jobq); + ih264_list_free(ps_codec->pv_proc_jobq); + ithread_mutex_destroy(ps_codec->pv_ctl_mutex); + ithread_mutex_destroy(ps_codec->pv_entropy_mutex); + + + ih264_buf_mgr_free((buf_mgr_t *)ps_codec->pv_mv_buf_mgr); + ih264_buf_mgr_free((buf_mgr_t *)ps_codec->pv_ref_buf_mgr); + ih264_buf_mgr_free((buf_mgr_t *)ps_codec->pv_inp_buf_mgr); + ih264_buf_mgr_free((buf_mgr_t *)ps_codec->pv_out_buf_mgr); + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets the encoder in flush mode. 
+* +* @par Description: +* Sets the encoder in flush mode +* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns error status +* +* @remarks This call has no real effect on encoder +* +******************************************************************************* +*/ +static WORD32 ih264e_set_flush_mode(iv_obj_t *ps_codec_obj, + void *pv_api_ip, + void *pv_api_op) +{ + UNUSED(pv_api_ip); + /* codec ctxt */ + codec_t *ps_codec = (codec_t *) ps_codec_obj->pv_codec_handle; + + /* ctrl call I/O structures */ + ih264e_ctl_flush_op_t *ps_ctl_op = pv_api_op; + + ps_ctl_op->s_ive_op.u4_error_code = 0; + + /* signal flush frame control call */ + ps_codec->i4_flush_mode = 1; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Gets encoder buffer requirements +* +* @par Description: +* Gets the encoder buffer requirements. Basing on max width and max height +* configuration settings, this routine, computes the sizes of necessary input, +* output buffers returns this info to callee. 
+* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static WORD32 ih264e_get_buf_info(iv_obj_t *ps_codec_obj, + void *pv_api_ip, + void *pv_api_op) +{ + UNUSED(ps_codec_obj); + /* ctrl call I/O structures */ + ih264e_ctl_getbufinfo_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_getbufinfo_op_t *ps_op = pv_api_op; + + /* temp var */ + WORD32 wd = ALIGN16(ps_ip->s_ive_ip.u4_max_wd); + WORD32 ht = ALIGN16(ps_ip->s_ive_ip.u4_max_ht); + WORD32 i; + + ps_op->s_ive_op.u4_error_code = 0; + + /* Number of components in input buffers required for codec & + * Minimum sizes of each component in input buffer required */ + if (ps_ip->s_ive_ip.e_inp_color_fmt == IV_YUV_420P) + { + ps_op->s_ive_op.u4_inp_comp_cnt = MIN_RAW_BUFS_420_COMP; + + ps_op->s_ive_op.au4_min_in_buf_size[0] = wd * ht; + ps_op->s_ive_op.au4_min_in_buf_size[1] = (wd >> 1) * (ht >> 1); + ps_op->s_ive_op.au4_min_in_buf_size[2] = (wd >> 1) * (ht >> 1); + } + else if (ps_ip->s_ive_ip.e_inp_color_fmt == IV_YUV_422ILE) + { + ps_op->s_ive_op.u4_inp_comp_cnt = MIN_RAW_BUFS_422ILE_COMP; + + ps_op->s_ive_op.au4_min_in_buf_size[0] = wd * ht * 2; + ps_op->s_ive_op.au4_min_in_buf_size[1] = + ps_op->s_ive_op.au4_min_in_buf_size[2] = 0; + } + else if (ps_ip->s_ive_ip.e_inp_color_fmt == IV_RGB_565) + { + ps_op->s_ive_op.u4_inp_comp_cnt = MIN_RAW_BUFS_RGB565_COMP; + + ps_op->s_ive_op.au4_min_in_buf_size[0] = wd * ht * 2; + ps_op->s_ive_op.au4_min_in_buf_size[1] = + ps_op->s_ive_op.au4_min_in_buf_size[2] = 0; + } + else if (ps_ip->s_ive_ip.e_inp_color_fmt == IV_RGBA_8888) + { + ps_op->s_ive_op.u4_inp_comp_cnt = MIN_RAW_BUFS_RGBA8888_COMP; + + ps_op->s_ive_op.au4_min_in_buf_size[0] = wd * ht * 4; + ps_op->s_ive_op.au4_min_in_buf_size[1] = + 
ps_op->s_ive_op.au4_min_in_buf_size[2] = 0; + } + else if ((ps_ip->s_ive_ip.e_inp_color_fmt == IV_YUV_420SP_UV) + || (ps_ip->s_ive_ip.e_inp_color_fmt == IV_YUV_420SP_VU)) + { + ps_op->s_ive_op.u4_inp_comp_cnt = MIN_RAW_BUFS_420SP_COMP; + + ps_op->s_ive_op.au4_min_in_buf_size[0] = wd * ht; + ps_op->s_ive_op.au4_min_in_buf_size[1] = wd * (ht >> 1); + ps_op->s_ive_op.au4_min_in_buf_size[2] = 0; + } + + /* Number of components in output buffers required for codec & + * Minimum sizes of each component in output buffer required */ + ps_op->s_ive_op.u4_out_comp_cnt = MIN_BITS_BUFS_COMP; + + for (i = 0; i < (WORD32) ps_op->s_ive_op.u4_out_comp_cnt; i++) + { + ps_op->s_ive_op.au4_min_out_buf_size[i] = (wd * ht * 3) >> 1; + } + + ps_op->s_ive_op.u4_min_inp_bufs = MIN_INP_BUFS; + ps_op->s_ive_op.u4_min_out_bufs = MIN_OUT_BUFS; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets the picture dimensions +* +* @par Description: +* Sets width, height, display width, display height and strides +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264e_set_dimensions(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_dimensions_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_dimensions_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_wd = ALIGN16(ps_ip->s_ive_ip.u4_wd); + ps_cfg->u4_ht = ALIGN16(ps_ip->s_ive_ip.u4_ht); + ps_cfg->u4_strd = ps_ip->s_ive_ip.u4_strd; + ps_cfg->i4_wd_mbs = ps_cfg->u4_wd >> 4; + ps_cfg->i4_ht_mbs = ps_cfg->u4_ht >> 4; + ps_cfg->u4_disp_wd = ps_ip->s_ive_ip.u4_wd; + ps_cfg->u4_disp_ht = 
ps_ip->s_ive_ip.u4_ht; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets source and target frame rates +* +* @par Description: +* Sets source and target frame rates +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264e_set_frame_rate(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_frame_rate_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_frame_rate_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_src_frame_rate = ps_ip->s_ive_ip.u4_src_frame_rate; + ps_cfg->u4_tgt_frame_rate = ps_ip->s_ive_ip.u4_tgt_frame_rate; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets target bit rate +* +* @par Description: +* Sets target bit rate +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264e_set_bit_rate(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_bitrate_ip_t *ps_ip = pv_api_ip; + 
ih264e_ctl_set_bitrate_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_target_bitrate = ps_ip->s_ive_ip.u4_target_bitrate; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets frame type +* +* @par Description: +* Sets frame type +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks not a sticky tag +* +******************************************************************************* +*/ +static IV_STATUS_T ih264e_set_frame_type(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_frame_type_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_frame_type_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->e_frame_type = ps_ip->s_ive_ip.e_frame_type; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets quantization params +* +* @par Description: +* Sets the max, min and default qp for I frame, P frame and B frame +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264e_set_qp(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O 
structures */ + ih264e_ctl_set_qp_ip_t *ps_set_qp_ip = pv_api_ip; + ih264e_ctl_set_qp_op_t *ps_set_qp_op = pv_api_op; + + ps_set_qp_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_i_qp_max = ps_set_qp_ip->s_ive_ip.u4_i_qp_max; + ps_cfg->u4_i_qp_min = ps_set_qp_ip->s_ive_ip.u4_i_qp_min; + ps_cfg->u4_i_qp = ps_set_qp_ip->s_ive_ip.u4_i_qp; + ps_cfg->u4_p_qp_max = ps_set_qp_ip->s_ive_ip.u4_p_qp_max; + ps_cfg->u4_p_qp_min = ps_set_qp_ip->s_ive_ip.u4_p_qp_min; + ps_cfg->u4_p_qp = ps_set_qp_ip->s_ive_ip.u4_p_qp; + ps_cfg->u4_b_qp_max = ps_set_qp_ip->s_ive_ip.u4_b_qp_max; + ps_cfg->u4_b_qp_min = ps_set_qp_ip->s_ive_ip.u4_b_qp_min; + ps_cfg->u4_b_qp = ps_set_qp_ip->s_ive_ip.u4_b_qp; + + ps_cfg->u4_timestamp_high = ps_set_qp_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_set_qp_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets encoding mode +* +* @par Description: +* Sets encoding mode +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264e_set_enc_mode(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_enc_mode_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_enc_mode_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->e_enc_mode = ps_ip->s_ive_ip.e_enc_mode; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets vbv parameters +* +* @par Description: +* 
Sets vbv parameters +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264e_set_vbv_params(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_vbv_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_vbv_params_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_vbv_buf_size = ps_ip->s_ive_ip.u4_vbv_buf_size; + ps_cfg->u4_vbv_buffer_delay = ps_ip->s_ive_ip.u4_vbv_buffer_delay; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets AIR parameters +* +* @par Description: +* Sets AIR parameters +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264_set_air_params(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_air_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_air_params_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->e_air_mode = ps_ip->s_ive_ip.e_air_mode; + ps_cfg->u4_air_refresh_period = ps_ip->s_ive_ip.u4_air_refresh_period; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return 
IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets motion estimation parameters +* +* @par Description: +* Sets motion estimation parameters +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264_set_me_params(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_me_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_me_params_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_enable_hpel = ps_ip->s_ive_ip.u4_enable_hpel; + ps_cfg->u4_enable_qpel = ps_ip->s_ive_ip.u4_enable_qpel; + ps_cfg->u4_enable_fast_sad = ps_ip->s_ive_ip.u4_enable_fast_sad; + ps_cfg->u4_enable_alt_ref = ps_ip->s_ive_ip.u4_enable_alt_ref; + ps_cfg->u4_srch_rng_x = ps_ip->s_ive_ip.u4_srch_rng_x; + ps_cfg->u4_srch_rng_y = ps_ip->s_ive_ip.u4_srch_rng_y; + ps_cfg->u4_me_speed_preset = ps_ip->s_ive_ip.u4_me_speed_preset; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets Intra/Inter Prediction estimation parameters +* +* @par Description: +* Sets Intra/Inter Prediction estimation parameters +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ 
+static IV_STATUS_T ih264_set_ipe_params(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_ipe_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_ipe_params_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_enable_intra_4x4 = ps_ip->s_ive_ip.u4_enable_intra_4x4; + ps_cfg->u4_enc_speed_preset = ps_ip->s_ive_ip.u4_enc_speed_preset; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets GOP parameters +* +* @par Description: +* Sets GOP parameters +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264_set_gop_params(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_gop_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_gop_params_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_i_frm_interval = ps_ip->s_ive_ip.u4_i_frm_interval; + ps_cfg->u4_idr_frm_interval = ps_ip->s_ive_ip.u4_idr_frm_interval; + ps_cfg->u4_num_b_frames = ps_ip->s_ive_ip.u4_num_b_frames; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets profile parameters +* +* @par Description: +* Sets profile parameters +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* 
Pointer to output argument structure +* +* @param[out] ps_cfg +* Pointer to config structure to be updated +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IV_STATUS_T ih264_set_profile_params(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_profile_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_profile_params_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->e_profile = ps_ip->s_ive_ip.e_profile; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets disable deblock level +* +* @par Description: +* Sets disable deblock level. Level 0 means no disabling and level 4 means +* disable completely. 1, 2, 3 are intermediate levels that control amount +* of deblocking done. 
+* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static WORD32 ih264_set_deblock_params(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_deblock_params_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_deblock_params_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_disable_deblock_level = ps_ip->s_ive_ip.u4_disable_deblock_level; + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Sets number of cores +* +* @par Description: +* Sets number of cores +* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns error status +* +* @remarks The number of encoder threads is limited to MAX_PROCESS_THREADS +* +******************************************************************************* +*/ +static WORD32 ih264e_set_num_cores(void *pv_api_ip, + void *pv_api_op, + cfg_params_t *ps_cfg) +{ + /* ctrl call I/O structures */ + ih264e_ctl_set_num_cores_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_set_num_cores_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + ps_cfg->u4_num_cores = MIN(ps_ip->s_ive_ip.u4_num_cores, MAX_PROCESS_THREADS); + + ps_cfg->u4_timestamp_high = ps_ip->s_ive_ip.u4_timestamp_high; + ps_cfg->u4_timestamp_low = ps_ip->s_ive_ip.u4_timestamp_low; + + return IV_SUCCESS; +} + +/** 
+******************************************************************************* +* +* @brief +* Resets encoder state +* +* @par Description: +* Resets encoder state by calling ih264e_init() +* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static WORD32 ih264e_reset(iv_obj_t *ps_codec_obj, + void *pv_api_ip, + void *pv_api_op) +{ + UNUSED(pv_api_ip); + /* codec ctxt */ + codec_t * ps_codec = (codec_t *) (ps_codec_obj->pv_codec_handle); + + /* ctrl call I/O structures */ + ih264e_ctl_reset_op_t *ps_op = pv_api_op; + + ps_op->s_ive_op.u4_error_code = 0; + + if (ps_codec != NULL) + { + ih264e_init(ps_codec); + } + else + { + ps_op->s_ive_op.u4_error_code = IH264E_INIT_NOT_DONE; + } + + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Codec control call +* +* @par Description: +* Codec control call which in turn calls appropriate calls based on sub-command +* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static WORD32 ih264e_ctl(iv_obj_t *ps_codec_obj, + void *pv_api_ip, + void *pv_api_op) +{ + /* codec ctxt */ + codec_t *ps_codec = (codec_t *) ps_codec_obj->pv_codec_handle; + + /* ctrl call I/O structures */ + ih264e_ctl_setdefault_ip_t *ps_ctl_ip = pv_api_ip; + ih264e_ctl_setdefault_op_t *ps_ctl_op = pv_api_op; + + /* ctrl call sub cmd */ + IVE_CONTROL_API_COMMAND_TYPE_T sub_cmd = ps_ctl_ip->s_ive_ip.e_sub_cmd; + + /* 
error status */ + IV_STATUS_T ret = 0; + + /* temp var */ + WORD32 i; + cfg_params_t *ps_cfg = NULL; + + /* control call is for configuring encoding params, this is not to be called + * before a successful init call */ + if (ps_codec->i4_init_done != 1) + { + ps_ctl_op->s_ive_op.u4_error_code |= 1 << IVE_FATALERROR; + ps_ctl_op->s_ive_op.u4_error_code |= IH264E_INIT_NOT_DONE; + return IV_FAIL; + } + + /* make it thread safe */ + ithread_mutex_lock(ps_codec->pv_ctl_mutex); + + /* find a free config param set to hold current parameters */ + for (i = 0; i < MAX_ACTIVE_CONFIG_PARAMS; i++) + { + if (0 == ps_codec->as_cfg[i].u4_is_valid) + { + ps_cfg = &ps_codec->as_cfg[i]; + break; + } + } + + /* If all are invalid, then start overwriting from the head config params */ + if (NULL == ps_cfg) + { + ps_cfg = &ps_codec->as_cfg[0]; + } + + ps_cfg->u4_is_valid = 1; + + ps_cfg->e_cmd = sub_cmd; + + switch (sub_cmd) + { + case IVE_CMD_CTL_SET_DIMENSIONS: + ret = ih264e_set_dimensions(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_FRAMERATE: + ret = ih264e_set_frame_rate(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_BITRATE: + ret = ih264e_set_bit_rate(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_FRAMETYPE: + ret = ih264e_set_frame_type(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_QP: + ret = ih264e_set_qp(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_ENC_MODE: + ret = ih264e_set_enc_mode(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_VBV_PARAMS: + ret = ih264e_set_vbv_params(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_AIR_PARAMS: + ret = ih264_set_air_params(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_ME_PARAMS: + ret = ih264_set_me_params(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_IPE_PARAMS: + ret = ih264_set_ipe_params(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_GOP_PARAMS: + ret 
= ih264_set_gop_params(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_PROFILE_PARAMS: + ret = ih264_set_profile_params(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_SET_DEBLOCK_PARAMS: + ret = ih264_set_deblock_params(pv_api_ip, pv_api_op, ps_cfg); + break; + + case IVE_CMD_CTL_RESET: + + /* invalidate config param struct as it is being served right away */ + ps_codec->as_cfg[i].u4_is_valid = 0; + + ret = ih264e_reset(ps_codec_obj, pv_api_ip, pv_api_op); + break; + + case IVE_CMD_CTL_SETDEFAULT: + { + /* ctrl call I/O structures */ + ih264e_ctl_setdefault_op_t *ps_op = pv_api_op; + + /* invalidate config param struct as it is being served right away */ + ps_codec->as_cfg[i].u4_is_valid = 0; + + /* error status */ + ret = ih264e_set_default_params(ps_cfg); + + ps_op->s_ive_op.u4_error_code = ret; + + break; + } + + case IVE_CMD_CTL_FLUSH: + + /* invalidate config param struct as it is being served right away */ + ps_codec->as_cfg[i].u4_is_valid = 0; + + ret = ih264e_set_flush_mode(ps_codec_obj, pv_api_ip, pv_api_op); + break; + + case IVE_CMD_CTL_GETBUFINFO: + + /* invalidate config param struct as it is being served right away */ + ps_codec->as_cfg[i].u4_is_valid = 0; + + ret = ih264e_get_buf_info(ps_codec_obj, pv_api_ip, pv_api_op); + break; + + case IVE_CMD_CTL_GETVERSION: + { + /* ctrl call I/O structures */ + ih264e_ctl_getversioninfo_ip_t *ps_ip = pv_api_ip; + ih264e_ctl_getversioninfo_op_t *ps_op = pv_api_op; + + /* invalidate config param struct as it is being served right away */ + ps_codec->as_cfg[i].u4_is_valid = 0; + + /* error status */ + ps_op->s_ive_op.u4_error_code = IV_SUCCESS; + + if (ps_ip->s_ive_ip.u4_version_bufsize <= 0) + { + ps_op->s_ive_op.u4_error_code = + IH264E_CXA_VERS_BUF_INSUFFICIENT; + ret = IV_FAIL; + } + else + { + ret = ih264e_get_version((CHAR *) ps_ip->s_ive_ip.pu1_version, + ps_ip->s_ive_ip.u4_version_bufsize); + + if (ret != IV_SUCCESS) + { + ps_op->s_ive_op.u4_error_code = + 
IH264E_CXA_VERS_BUF_INSUFFICIENT; + ret = IV_FAIL; + } + } + break; + } + + case IVE_CMD_CTL_SET_NUM_CORES: + ret = ih264e_set_num_cores(pv_api_ip, pv_api_op, ps_cfg); + break; + + default: + /* invalidate config param struct as it is being served right away */ + ps_codec->as_cfg[i].u4_is_valid = 0; + + DEBUG("Warning !! unrecognized control api command \n"); + break; + } + + ithread_mutex_unlock(ps_codec->pv_ctl_mutex); + + return ret; +} + +/** +******************************************************************************* +* +* @brief +* Codec entry point function. All the function calls to the codec are done +* using this function with different values specified in command +* +* @par Description: +* Arguments are tested for validity and then based on the command +* appropriate function is called +* +* @param[in] ps_handle +* API level handle for codec +* +* @param[in] pv_api_ip +* Input argument structure +* +* @param[out] pv_api_op +* Output argument structure +* +* @returns error_status +* +* @remarks +* +******************************************************************************* +*/ +IV_STATUS_T ih264e_api_function(iv_obj_t *ps_handle, + void *pv_api_ip, + void *pv_api_op) +{ + /* api command */ + WORD32 command = IV_CMD_NA; + + /* error status */ + IV_STATUS_T e_status; + WORD32 ret; + + /* tmp var */ + WORD32 *pu4_ptr_cmd = (WORD32 *) pv_api_ip; + + /* validate input / output structures */ + e_status = api_check_struct_sanity(ps_handle, pv_api_ip, pv_api_op); + + if (e_status != IV_SUCCESS) + { + DEBUG("error code = %d\n", *((UWORD32 *)pv_api_op + 1)); + return IV_FAIL; + } + + pu4_ptr_cmd++; + + command = *pu4_ptr_cmd; + + switch (command) + { + case IV_CMD_GET_NUM_MEM_REC: + ret = ih264e_get_num_rec(pv_api_ip, pv_api_op); + break; + + case IV_CMD_FILL_NUM_MEM_REC: + ret = ih264e_fill_num_mem_rec(pv_api_ip, pv_api_op); + break; + + case IV_CMD_INIT: + ret = ih264e_init_mem_rec(ps_handle, pv_api_ip, pv_api_op); + break; + + case 
IV_CMD_RETRIEVE_MEMREC: + ret = ih264e_retrieve_memrec(ps_handle, pv_api_ip, pv_api_op); + break; + + case IVE_CMD_VIDEO_CTL: + ret = ih264e_ctl(ps_handle, pv_api_ip, pv_api_op); + break; + + case IVE_CMD_VIDEO_ENCODE: + ret = ih264e_encode(ps_handle, pv_api_ip, pv_api_op); + break; + + default: + ret = IV_FAIL; + break; + } + + return (IV_STATUS_T) ret; +} diff --git a/encoder/ih264e_bitstream.c b/encoder/ih264e_bitstream.c new file mode 100755 index 0000000..e5bfbe4 --- /dev/null +++ b/encoder/ih264e_bitstream.c @@ -0,0 +1,472 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_bitstream.c +* +* @brief +* This file contains function definitions related to bitstream generation +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_bitstrm_init() +* - ih264e_put_bits() +* - ih264e_put_bit() +* - ih264e_put_rbsp_trailing_bits() +* - ih264e_put_uev() +* - ih264e_put_sev() +* - ih264e_put_nal_start_code_prefix() +* +****************************************************************************** +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <assert.h> +#include <stdarg.h> +#include <math.h> + +/* User include files */ +#include "ih264e_config.h" +#include "ih264_typedefs.h" +#include "ih264_platform_macros.h" +#include "ih264_debug.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ih264_defs.h" +#include "ih264_macros.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief Initializes the encoder bitstream engine +* +* @par Description +* This routine needs to be called at start of slice/frame encode +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] p1_bitstrm_buf +* bitstream buffer pointer where the encoded stream is generated in byte order +* +* @param[in] u4_max_bitstrm_size +* indicates maximum bitstream buffer size. (in bytes) +* If actual stream size exceeds the maximum size, encoder should +* 1. Not corrupt data beyond u4_max_bitstrm_size bytes +* 2. 
Report an error back to application indicating overflow +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_bitstrm_init(bitstrm_t *ps_bitstrm, + UWORD8 *pu1_bitstrm_buf, + UWORD32 u4_max_bitstrm_size) +{ + ps_bitstrm->pu1_strm_buffer = pu1_bitstrm_buf; + ps_bitstrm->u4_max_strm_size = u4_max_bitstrm_size; + + /* Default init values for other members of bitstream context */ + ps_bitstrm->u4_strm_buf_offset = 0; + ps_bitstrm->u4_cur_word = 0; + ps_bitstrm->i4_bits_left_in_cw = WORD_SIZE; + ps_bitstrm->i4_zero_bytes_run = 0; + + return(IH264E_SUCCESS); +} + +/** +****************************************************************************** +* +* @brief puts a code with specified number of bits into the bitstream +* +* @par Description +* inserts code_len number of bits from lsb of code_val into the +* bitstream. updates context members like u4_cur_word, u4_strm_buf_offset and +* i4_bits_left_in_cw. If the total words (u4_strm_buf_offset) exceeds max +* available size (u4_max_strm_size), returns error without corrupting data +* beyond it +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] u4_code_val +* code value that needs to be inserted in the stream. +* +* @param[in] code_len +* indicates code length (in bits) of code_val that would be inserted in +* bitstream buffer size. 
Range of length[1:WORD_SIZE] +* +* @remarks Assumptions: all bits from bit position code_len to msb of +* code_val shall be zero +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_bits(bitstrm_t *ps_bitstrm, + UWORD32 u4_code_val, + WORD32 code_len) +{ + UWORD32 u4_cur_word = ps_bitstrm->u4_cur_word; + WORD32 bits_left_in_cw = ps_bitstrm->i4_bits_left_in_cw; + + + /* check assumptions made in the module */ + ASSERT(code_len > 0 && code_len <= WORD_SIZE); + + if(code_len < WORD_SIZE) + ASSERT((u4_code_val >> code_len) == 0); + + + /* sanity check on the bitstream engine state */ + ASSERT(bits_left_in_cw > 0 && bits_left_in_cw <= WORD_SIZE); + + ASSERT(ps_bitstrm->i4_zero_bytes_run <= EPB_ZERO_BYTES); + + ASSERT(ps_bitstrm->pu1_strm_buffer != NULL); + + + if(bits_left_in_cw > code_len) + { + /*******************************************************************/ + /* insert the code in local bitstream word and return */ + /* code is inserted in position of bits left (post decrement) */ + /*******************************************************************/ + bits_left_in_cw -= code_len; + u4_cur_word |= (u4_code_val << bits_left_in_cw); + + ps_bitstrm->u4_cur_word = u4_cur_word; + ps_bitstrm->i4_bits_left_in_cw = bits_left_in_cw; + + return(IH264E_SUCCESS); + } + else + { + /********************************************************************/ + /* 1. insert partial code corresponding to bits left in cur word */ + /* 2. flush all the bits of cur word to bitstream */ + /* 3. insert emulation prevention bytes while flushing the bits */ + /* 4. insert remaining bits of code starting from msb of cur word */ + /* 5. 
update bitsleft in current word and stream buffer offset */ + /********************************************************************/ + UWORD32 u4_strm_buf_offset = ps_bitstrm->u4_strm_buf_offset; + + UWORD32 u4_max_strm_size = ps_bitstrm->u4_max_strm_size; + + WORD32 zero_run = ps_bitstrm->i4_zero_bytes_run; + + UWORD8* pu1_strm_buf = ps_bitstrm->pu1_strm_buffer; + + WORD32 i, rem_bits = (code_len - bits_left_in_cw); + + + /*********************************************************************/ + /* Bitstream overflow check */ + /* NOTE: corner case of epb bytes (max 2 for 32bit word) not handled */ + /*********************************************************************/ + if((u4_strm_buf_offset + (WORD_SIZE>>3)) >= u4_max_strm_size) + { + /* return without corrupting the buffer beyond its size */ + return(IH264E_BITSTREAM_BUFFER_OVERFLOW); + } + + /* insert parital code corresponding to bits left in cur word */ + u4_cur_word |= u4_code_val >> rem_bits; + + for(i = WORD_SIZE; i > 0; i -= 8) + { + /* flush the bits in cur word byte by byte and copy to stream */ + UWORD8 u1_next_byte = (u4_cur_word >> (i-8)) & 0xFF; + + PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, u1_next_byte, zero_run); + } + + /* insert the remaining bits from code val into current word */ + u4_cur_word = rem_bits ? (u4_code_val << (WORD_SIZE - rem_bits)) : 0; + + /* update the state variables and return success */ + ps_bitstrm->u4_cur_word = u4_cur_word; + ps_bitstrm->i4_bits_left_in_cw = WORD_SIZE - rem_bits; + ps_bitstrm->i4_zero_bytes_run = zero_run; + ps_bitstrm->u4_strm_buf_offset = u4_strm_buf_offset; + return (IH264E_SUCCESS); + } +} + +/** +****************************************************************************** +* +* @brief inserts a 1-bit code into the bitstream +* +* @par Description +* inserts 1bit lsb of code_val into the bitstream +* updates context members like u4_cur_word, u4_strm_buf_offset and +* i4_bits_left_in_cw. 
If the total words (u4_strm_buf_offset) exceeds max +* available size (u4_max_strm_size), returns error without corrupting data +* beyond it +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] u4_code_val +* code value that needs to be inserted in the stream. +* +* @remarks Assumptions: all bits from bit position 1 to msb of code_val +* shall be zero +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_bit(bitstrm_t *ps_bitstrm, UWORD32 u4_code_val) +{ + /* call the put bits function for 1 bit and return */ + return(ih264e_put_bits(ps_bitstrm, u4_code_val, 1)); +} + +/** +****************************************************************************** +* +* @brief inserts rbsp trailing bits at the end of stream buffer (NAL) +* +* @par Description +* inserts rbsp trailing bits, updates context members like u4_cur_word and +* i4_bits_left_in_cw and flushes the same in the bitstream buffer. 
If the +* total words (u4_strm_buf_offset) exceeds max available size +* (u4_max_strm_size), returns error without corrupting data beyond it +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_rbsp_trailing_bits(bitstrm_t *ps_bitstrm) +{ + WORD32 i; + UWORD32 u4_cur_word = ps_bitstrm->u4_cur_word; + WORD32 bits_left_in_cw = ps_bitstrm->i4_bits_left_in_cw; + WORD32 bytes_left_in_cw = (bits_left_in_cw - 1) >> 3; + + UWORD32 u4_strm_buf_offset = ps_bitstrm->u4_strm_buf_offset; + UWORD32 u4_max_strm_size = ps_bitstrm->u4_max_strm_size; + WORD32 zero_run = ps_bitstrm->i4_zero_bytes_run; + UWORD8* pu1_strm_buf = ps_bitstrm->pu1_strm_buffer; + + /*********************************************************************/ + /* Bitstream overflow check */ + /* NOTE: corner case of epb bytes (max 2 for 32bit word) not handled */ + /*********************************************************************/ + if((u4_strm_buf_offset + (WORD_SIZE>>3) - bytes_left_in_cw) >= + u4_max_strm_size) + { + /* return without corrupting the buffer beyond its size */ + return(IH264E_BITSTREAM_BUFFER_OVERFLOW); + } + + /* insert a 1 at the end of current word and flush all the bits */ + u4_cur_word |= (1 << (bits_left_in_cw - 1)); + + /* get the bits to be inserted in msbdb of the word */ + //u4_cur_word <<= (WORD_SIZE - bytes_left_in_cw + 1); + + for(i = WORD_SIZE; i > (bytes_left_in_cw*8); i -= 8) + { + /* flush the bits in cur word byte by byte and copy to stream */ + UWORD8 u1_next_byte = (u4_cur_word >> (i-8)) & 0xFF; + + PUTBYTE_EPB(pu1_strm_buf, u4_strm_buf_offset, u1_next_byte, zero_run); + } + + /* update the stream offset */ + ps_bitstrm->u4_strm_buf_offset = u4_strm_buf_offset; + + /* Default init values for scratch variables of bitstream context */ + ps_bitstrm->u4_cur_word = 0; + ps_bitstrm->i4_bits_left_in_cw = 
WORD_SIZE; + ps_bitstrm->i4_zero_bytes_run = 0; + + return (IH264E_SUCCESS); +} + +/** +****************************************************************************** +* +* @brief puts exponential golomb code of a unsigned integer into bitstream +* +* @par Description +* computes uev code for given syntax element and inserts the same into +* bitstream by calling ih264e_put_bits() interface. +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] u4_code_num +* unsigned integer input whose golomb code is written in stream +* +* @remarks Assumptions: code value can be represented in less than 16bits +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_uev(bitstrm_t *ps_bitstrm, UWORD32 u4_code_num) +{ + UWORD32 u4_bit_str, u4_range; + IH264E_ERROR_T e_error; + + /* convert the codenum to exp-golomb bit code: Table 9-2 JCTVC-J1003_d7 */ + u4_bit_str = u4_code_num + 1; + + /* get range of the bit string and put using put_bits() */ + GETRANGE(u4_range, u4_bit_str); + + e_error = ih264e_put_bits(ps_bitstrm, u4_bit_str, (2 * u4_range - 1)); + + return(e_error); +} + +/** +****************************************************************************** +* +* @brief puts exponential golomb code of a signed integer into bitstream +* +* @par Description +* computes sev code for given syntax element and inserts the same into +* bitstream by calling ih264e_put_bits() interface. 
+* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] syntax_elem +* signed integer input whose golomb code is written in stream +* +* @remarks Assumptions: code value can be represented in less than 16bits +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_sev(bitstrm_t *ps_bitstrm, WORD32 syntax_elem) +{ + UWORD32 u4_code_num, u4_bit_str, u4_range; + IH264E_ERROR_T e_error; + + /************************************************************************/ + /* convert the codenum to exp-golomb bit code for signed syntax element */ + /* See Table9-2 and Table 9-3 of standard JCTVC-J1003_d7 */ + /************************************************************************/ + if(syntax_elem <= 0) + { + /* codeNum for non-positive integer = 2*abs(x) : Table9-3 */ + u4_code_num = ((-syntax_elem) << 1); + } + else + { + /* codeNum for positive integer = 2x-1 : Table9-3 */ + u4_code_num = (syntax_elem << 1) - 1; + } + + /* convert the codenum to exp-golomb bit code: Table 9-2 JCTVC-J1003_d7 */ + u4_bit_str = u4_code_num + 1; + + /* get range of the bit string and put using put_bits() */ + GETRANGE(u4_range, u4_bit_str); + + e_error = ih264e_put_bits(ps_bitstrm, u4_bit_str, (2 * u4_range - 1)); + + return(e_error); +} + +/** +****************************************************************************** +* +* @brief insert NAL start code prefix (0x000001) into bitstream with an option +* of inserting leading_zero_8bits (which makes startcode prefix as 0x00000001) +* +* @par Description +* Although start code prefix could have been put by calling ih264e_put_bits(), +* ih264e_put_nal_start_code_prefix() is specially added to make sure emulation +* prevention insertion is not done for the NAL start code prefix which will +* surely happen otherwise by calling ih264e_put_bits() interface. 
+* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] insert_leading_zero_8bits +* flag indicating if one more zero bytes needs to prefixed before start code +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_nal_start_code_prefix(bitstrm_t *ps_bitstrm, + WORD32 insert_leading_zero_8bits) +{ + UWORD32 u4_strm_buf_offset = ps_bitstrm->u4_strm_buf_offset; + UWORD8* pu1_strm_buf = ps_bitstrm->pu1_strm_buffer; + + /* Bitstream buffer overflow check assuming worst case of 4 bytes */ + if((u4_strm_buf_offset + 4) >= ps_bitstrm->u4_max_strm_size) + { + return(IH264E_BITSTREAM_BUFFER_OVERFLOW); + } + + /* Insert leading zero 8 bits conditionally */ + if(insert_leading_zero_8bits) + { + pu1_strm_buf[u4_strm_buf_offset] = 0x00; + u4_strm_buf_offset++; + } + + /* Insert NAL start code prefix 0x00 00 01 */ + pu1_strm_buf[u4_strm_buf_offset] = 0x00; + u4_strm_buf_offset++; + + pu1_strm_buf[u4_strm_buf_offset] = 0x00; + u4_strm_buf_offset++; + + pu1_strm_buf[u4_strm_buf_offset] = 0x01; + u4_strm_buf_offset++; + + /* update the stream offset */ + ps_bitstrm->u4_strm_buf_offset = u4_strm_buf_offset; + + return (IH264E_SUCCESS); +} + diff --git a/encoder/ih264e_bitstream.h b/encoder/ih264e_bitstream.h new file mode 100755 index 0000000..21360cc --- /dev/null +++ b/encoder/ih264e_bitstream.h @@ -0,0 +1,401 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_bitstream.h +* +* @brief +* This file contains encoder bitstream engine related structures and +* interface prototypes +* +* @author +* ittiam +* +* @remarks +* none +* +******************************************************************************* +*/ + +#ifndef IH264E_BITSTREAM_H_ +#define IH264E_BITSTREAM_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief defines the maximum number of bits in a bitstream word +****************************************************************************** + */ +#define WORD_SIZE 32 + +/** +****************************************************************************** + * @brief The number of consecutive zero bytes for emulation prevention check +****************************************************************************** + */ +#define EPB_ZERO_BYTES 2 + +/** +****************************************************************************** + * @brief Emulation prevention insertion byte +****************************************************************************** + */ +#define EPB_BYTE 0x03 + + 
+/*****************************************************************************/ +/* Function Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief Macro to check if emulation prevention byte insertion is required +****************************************************************************** + */ +#define INSERT_EPB(zero_run, next_byte) \ + ((zero_run) == EPB_ZERO_BYTES) && (0 == ((next_byte) & 0xFC)) + +/** +****************************************************************************** + * @brief returns the bit position of a leading 1 (msb) in a code value +****************************************************************************** + */ +#if !MSVC +#define GETRANGE(r,value) \ +{ \ + r = 0; \ + if(0 == value) \ + r = 1; \ + else \ + { \ + r = 32-CLZ(value); \ + }\ +} +#else +#define GETRANGE(r,value) \ +{ \ + unsigned long msb_one_bit = 0; \ + r = _BitScanReverse(&msb_one_bit, value) ? (UWORD32)(msb_one_bit + 1) : 1 ; \ +} +#endif + +/** +****************************************************************************** + * @brief returns bits required to code a value +****************************************************************************** + */ +#define UE_LENGTH(bits,x) \ +{ \ + UWORD32 r_bit; \ + GETRANGE(r_bit,x+1) \ + bits =(((r_bit - 1) << 1)+1); \ +} \ + +/** +****************************************************************************** + * @brief Inserts 1 byte and Emulation Prevention Byte(if any) into bitstream + * Increments the stream offset and zero run correspondingly +****************************************************************************** + */ +#define PUTBYTE_EPB(ptr,off,byte,zero_run) \ +{ \ + if( INSERT_EPB(zero_run, byte) ) \ + { \ + ptr[off] = EPB_BYTE; \ + off++; \ + zero_run = 0; \ + } \ + \ + ptr[off] = byte; \ + off++; \ + zero_run = byte ? 
0 : zero_run+1; \ +} \ + +/** +****************************************************************************** + * @brief Ensures Byte alignment of the slice header +****************************************************************************** + */ +#define BYTE_ALIGNMENT(ps_bitstrm) ih264e_put_rbsp_trailing_bits(ps_bitstrm) + + +/*****************************************************************************/ +/* Structures */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief Bitstream context for encoder +****************************************************************************** + */ +typedef struct bitstrm +{ + /** points to start of stream buffer. */ + UWORD8 *pu1_strm_buffer; + + /** + * max bitstream size (in bytes). + * Encoded stream shall not exceed this size. + */ + UWORD32 u4_max_strm_size; + + /** + * byte offset (w.r.t pu1_strm_buffer) where next byte would be written + * Bitstream engine makes sure it would not corrupt data beyond + * u4_max_strm_size bytes + */ + UWORD32 u4_strm_buf_offset; + + /** + * current bitstream word; It is a scratch word containing max of + * WORD_SIZE bits. Will be copied to stream buffer when the word is + * full + */ + UWORD32 u4_cur_word; + + /** + * signifies number of bits available in u4_cur_word + * bits from msb to i4_bits_left_in_cw of u4_cur_word have already been + * inserted next bits would be inserted from pos [i4_bits_left_in_cw-1] + * Range of this variable [1 : WORD_SIZE] + */ + WORD32 i4_bits_left_in_cw; + + /** + * signifies the number of consecutive zero bytes propogated from previous + * word. 
It is used for emulation prevention byte insertion in the stream + */ + WORD32 i4_zero_bytes_run; + +} bitstrm_t; + + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief Initializes the encoder bitstream engine +* +* @par Description +* This routine needs to be called at start of slice/frame encode +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] p1_bitstrm_buf +* bitstream buffer pointer where the encoded stream is generated in byte order +* +* @param[in] u4_max_bitstrm_size +* indicates maximum bitstream buffer size. (in bytes) +* If actual stream size exceeds the maximum size, encoder should +* 1. Not corrupt data beyond u4_max_bitstrm_size bytes +* 2. Report an error back to application indicating overflow +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_bitstrm_init + ( + bitstrm_t *ps_bitstrm, + UWORD8 *pu1_bitstrm_buf, + UWORD32 u4_max_bitstrm_size + ); + +/** +****************************************************************************** +* +* @brief puts a code with specified number of bits into the bitstream +* +* @par Description +* inserts code_len number of bits from lsb of code_val into the +* bitstream. If the total bytes (u4_strm_buf_offset) exceeds max +* available size (u4_max_strm_size), returns error without corrupting data +* beyond it +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] u4_code_val +* code value that needs to be inserted in the stream. +* +* @param[in] code_len +* indicates code length (in bits) of code_val that would be inserted in +* bitstream buffer size. 
+* +* @remarks Assumptions: all bits from bit position code_len to msb of +* code_val shall be zero +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_bits + ( + bitstrm_t *ps_bitstrm, + UWORD32 u4_code_val, + WORD32 code_len + ); + +/** +****************************************************************************** +* +* @brief inserts a 1-bit code into the bitstream +* +* @par Description +* inserts 1bit lsb of code_val into the bitstream +* updates context members like u4_cur_word, u4_strm_buf_offset and +* i4_bits_left_in_cw. If the total words (u4_strm_buf_offset) exceeds max +* available size (u4_max_strm_size), returns error without corrupting data +* beyond it +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] u4_code_val +* code value that needs to be inserted in the stream. +* +* @remarks Assumptions: all bits from bit position 1 to msb of code_val +* shall be zero +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_bit + ( + bitstrm_t *ps_bitstrm, + UWORD32 u4_code_val + ); + +/** +****************************************************************************** +* +* @brief inserts rbsp trailing bits at the end of stream buffer (NAL) +* +* @par Description +* inserts rbsp trailing bits, updates context members like u4_cur_word and +* i4_bits_left_in_cw and flushes the same in the bitstream buffer. 
If the +* total words (u4_strm_buf_offset) exceeds max available size +* (u4_max_strm_size), returns error without corrupting data beyond it +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_rbsp_trailing_bits + ( + bitstrm_t *ps_bitstrm + ); + +/** +****************************************************************************** +* +* @brief puts exponential golomb code of a unsigned integer into bitstream +* +* @par Description +* computes uev code for given syntax element and inserts the same into +* bitstream by calling ih264e_put_bits() interface. +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] u4_code_num +* unsigned integer input whose golomb code is written in stream +* +* @remarks Assumptions: code value can be represented in less than 16bits +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_uev + ( + bitstrm_t *ps_bitstrm, + UWORD32 u4_code_num + ); + +/** +****************************************************************************** +* +* @brief puts exponential golomb code of a signed integer into bitstream +* +* @par Description +* computes sev code for given syntax element and inserts the same into +* bitstream by calling ih264e_put_bits() interface. 
+* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] syntax_elem +* signed integer input whose golomb code is written in stream +* +* @remarks Assumptions: code value can be represented in less than 16bits +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_sev + ( + bitstrm_t *ps_bitstrm, + WORD32 syntax_elem + ); + +/** +****************************************************************************** +* +* @brief insert NAL start code prefix (0x000001) into bitstream with an option +* of inserting leading_zero_8bits (which makes startcode prefix as 0x00000001) +* +* @par Description +* Although start code prefix could have been put by calling ih264e_put_bits(), +* ih264e_put_nal_start_code_prefix() is specially added to make sure emulation +* prevention insertion is not done for the NAL start code prefix which will +* surely happen otherwise by calling ih264e_put_bits() interface. +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] insert_leading_zero_8bits +* flag indicating if one more zero bytes needs to prefixed before start code +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_put_nal_start_code_prefix + ( + bitstrm_t *ps_bitstrm, + WORD32 insert_leading_zero_8bits + ); + +#endif /* IH264E_BITSTREAM_H_ */ diff --git a/encoder/ih264e_cavlc.c b/encoder/ih264e_cavlc.c new file mode 100755 index 0000000..1341dcd --- /dev/null +++ b/encoder/ih264e_cavlc.c @@ -0,0 +1,1448 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_cavlc.c +* +* @brief +* Contains all the routines to code syntax elements and residuals when entropy +* coding chosen is CAVLC +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_compute_zeroruns_and_trailingones() +* - ih264e_write_coeff4x4_cavlc() +* - ih264e_write_coeff8x8_cavlc() +* - ih264e_encode_residue() +* - ih264e_write_islice_mb() +* - ih264e_write_pslice_mb() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <assert.h> +#include <limits.h> + +/* User include files */ +#include "ih264e_config.h" +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_debug.h" +#include "ih264_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include 
"ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_encode_header.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264e_statistics.h" +#include "ih264e_trace.h" + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* This function computes run of zero, number of trailing ones and sign of +* trailing ones basing on the significant coeff map, residual block and +* total nnz. +* +* @param[in] pi2_res_block +* Pointer to residual block containing levels in scan order +* +* @param[in] u4_total_coeff +* Total non-zero coefficients in that sub block +* +* @param[in] pu1_zero_run +* Pointer to array to store run of zeros +* +* @param[in] u4_sig_coeff_map +* significant coefficient map +* +* @returns u4_totzero_sign_trailone +* Bits 0-8 contains number of trailing ones. +* Bits 8-16 contains bitwise sign information of trailing one +* Bits 16-24 contains total number of zeros. 
+* +* @remarks +* None +* +******************************************************************************* +*/ +static UWORD32 ih264e_compute_zeroruns_and_trailingones(WORD16 *pi2_res_block, + UWORD32 u4_total_coeff, + UWORD8 *pu1_zero_run, + UWORD32 u4_sig_coeff_map) +{ + UWORD32 i = 0; + UWORD32 u4_nnz_coeff = 0; + WORD32 i4_run = -1; + UWORD32 u4_sign = 0; + UWORD32 u4_tot_zero = 0; + UWORD32 u4_trailing1 = 0; + WORD32 i4_val; + UWORD32 u4_totzero_sign_trailone; + UWORD32 *pu4_zero_run; + + pu4_zero_run = (void *)pu1_zero_run; + pu4_zero_run[0] = 0; + pu4_zero_run[1] = 0; + pu4_zero_run[2] = 0; + pu4_zero_run[3] = 0; + + /* Compute Runs of zeros for all nnz coefficients except the last 3 */ + if (u4_total_coeff > 3) + { + for (i = 0; u4_nnz_coeff < (u4_total_coeff-3); i++) + { + i4_run++; + + i4_val = (u4_sig_coeff_map & 0x1); + u4_sig_coeff_map >>= 1; + + if (i4_val != 0) + { + pu1_zero_run[u4_nnz_coeff++] = i4_run; + i4_run = -1; + } + } + } + + /* Compute T1's, Signof(T1's) and Runs of zeros for the last 3 */ + while (u4_nnz_coeff != u4_total_coeff) + { + i4_run++; + + i4_val = (u4_sig_coeff_map & 0x1); + u4_sig_coeff_map >>= 1; + + if (i4_val != 0) + { + if (pi2_res_block[u4_nnz_coeff] == 1) + { + pu1_zero_run[u4_nnz_coeff] = i4_run; + u4_trailing1++; + } + else + { + if (pi2_res_block[u4_nnz_coeff] == -1) + { + pu1_zero_run[u4_nnz_coeff] = i4_run; + u4_sign |= 1 << u4_trailing1; + u4_trailing1++; + } + else + { + pu1_zero_run[u4_nnz_coeff] = i4_run; + u4_trailing1 = 0; + u4_sign = 0; + } + } + i4_run = -1; + u4_nnz_coeff++; + } + i++; + } + + u4_tot_zero = i - u4_total_coeff; + u4_totzero_sign_trailone = (u4_tot_zero << 16)|(u4_sign << 8)|u4_trailing1; + + return (u4_totzero_sign_trailone); +} + +/** +******************************************************************************* +* +* @brief +* This function generates CAVLC coded bit stream for the given residual block +* +* @param[in] pi2_res_block +* Pointer to residual block containing levels in scan 
order
*
* @param[in] u4_total_coeff
* Total non-zero coefficients in the sub block
*
* @param[in] u4_block_type
* block type
*
* @param[in] pu1_zero_run
* Pointer to array to store run of zeros
*
* @param[in] u4_nc
* average of non zero coeff from top and left blocks (when available)
*
* @param[in, out] ps_bit_stream
* structure pointing to a buffer holding output bit stream
*
* @param[in] u4_sig_coeff_map
* significant coefficient map of the residual block
*
* @returns
* error code of the last ih264e_put_bits() call made for this block
*
* @remarks
* If the block type is CAVLC_CHROMA_4x4_DC, then u4_nc is non-significant
*
* Syntax elements are written in this order: coeff_token, signs of trailing
* ones, levels (last level in pi2_res_block first), total zeros, run before.
*
*******************************************************************************
*/
static IH264E_ERROR_T ih264e_write_coeff4x4_cavlc(WORD16 *pi2_res_block,
                                                  UWORD32 u4_total_coeff,
                                                  ENTROPY_BLK_TYPE u4_block_type,
                                                  UWORD8 *pu1_zero_run,
                                                  UWORD32 u4_nc,
                                                  bitstrm_t *ps_bit_stream,
                                                  UWORD32 u4_sig_coeff_map)
{
    IH264E_ERROR_T error_status = IH264E_SUCCESS;
    UWORD32 u4_totzero_sign_trailone = 0;
    UWORD32 u4_trailing_ones = 0;
    UWORD32 u4_tot_zeros = 0;
    UWORD32 u4_remaining_coeff = 0;
    UWORD32 u4_sign1 = 0;
    UWORD32 u4_max_num_coeff = 0;
    /* max number of coefficients a block can carry, indexed by block type */
    const UWORD32 au4_max_num_nnz_coeff[] = {16, 15, 16, 4, 15};

    /* validate inputs */
    ASSERT(u4_block_type <= CAVLC_CHROMA_4x4_AC);

    u4_max_num_coeff = au4_max_num_nnz_coeff[u4_block_type];

    ASSERT(u4_total_coeff <= u4_max_num_coeff);

    /* NOTE(review): error_status is overwritten by every ih264e_put_bits() */
    /* call below, so only the status of the last call reaches the caller - */
    /* confirm that an earlier overflow can not be masked this way          */

    if (!u4_total_coeff)
    {
        /* empty block: only a coeff_token (0 coeffs, 0 trailing ones) is sent */
        UWORD32 u4_codeword = 15;
        UWORD32 u4_codesize = 1;
        if (u4_block_type == CAVLC_CHROMA_4x4_DC)
        {
            u4_codeword = 1;
            u4_codesize = 2;
            DEBUG("\n[%d numcoeff, %d numtrailing ones]",u4_total_coeff, 0);
            ENTROPY_TRACE("\tnumber of non zero coeffs ",u4_total_coeff);
            ENTROPY_TRACE("\tnumber of trailing ones ",0);
        }
        else
        {
            /* the table used for coeff_token is picked from nC / 2 */
            UWORD32 u4_vlcnum = u4_nc >> 1;

            /* write coeff_token */
            if (u4_vlcnum > 3)
            {
                /* Num-FLC */
                u4_codeword = 3;
                u4_codesize = 6;
            }
            else
            {
                /* Num-VLC 0, 1, 2 */
                if (u4_vlcnum > 1)
                {
                    u4_vlcnum = 2;
                }
                /* size becomes 1, 2 or 4 bits; the code word is that many */
                /* ones (15 shifted down to 1, 3 or 15)                    */
                u4_codesize <<= u4_vlcnum;
                u4_codeword >>= (4 - u4_codesize);
            }

            DEBUG("\n[%d numcoeff, %d numtrailing ones, %d nnz]",u4_total_coeff, 0, u4_nc);
            ENTROPY_TRACE("\tnumber of non zero coeffs ",u4_total_coeff);
            ENTROPY_TRACE("\tnC ",u4_nc);
        }


        DEBUG("\nCOEFF TOKEN 0: %d u4_codeword, %d u4_codesize",u4_codeword, u4_codesize);
        ENTROPY_TRACE("\tcodeword ",u4_codeword);
        ENTROPY_TRACE("\tcodesize ",u4_codesize);

        error_status = ih264e_put_bits(ps_bit_stream, u4_codeword, u4_codesize);

        return error_status;
    }
    else
    {
        /* Compute zero run, number of trailing ones and their sign. */
        u4_totzero_sign_trailone =
                ih264e_compute_zeroruns_and_trailingones(pi2_res_block,
                                                         u4_total_coeff,
                                                         pu1_zero_run,
                                                         u4_sig_coeff_map);
        /* unpack: byte 0 = trailing ones, byte 1 = their signs, byte 2 = total zeros */
        u4_trailing_ones = u4_totzero_sign_trailone & 0xFF;
        u4_sign1 = (u4_totzero_sign_trailone >> 8)& 0xFF;
        u4_tot_zeros = (u4_totzero_sign_trailone >> 16) & 0xFF;
        /* levels that are not trailing ones need an explicit level code */
        u4_remaining_coeff = u4_total_coeff - u4_trailing_ones;

        /* write coeff_token */
        {
            UWORD32 u4_codeword;
            UWORD32 u4_codesize;
            if (u4_block_type == CAVLC_CHROMA_4x4_DC)
            {
                u4_codeword = gu1_code_coeff_token_table_chroma[u4_trailing_ones][u4_total_coeff-1];
                u4_codesize = gu1_size_coeff_token_table_chroma[u4_trailing_ones][u4_total_coeff-1];

                DEBUG("\n[%d numcoeff, %d numtrailing ones]",u4_total_coeff, u4_trailing_ones);
                ENTROPY_TRACE("\tnumber of non zero coeffs ",u4_total_coeff);
                ENTROPY_TRACE("\tnumber of trailing ones ",u4_trailing_ones);
            }
            else
            {
                UWORD32 u4_vlcnum = u4_nc >> 1;

                if (u4_vlcnum > 3)
                {
                    /* Num-FLC */
                    u4_codeword = ((u4_total_coeff-1) << 2 ) + u4_trailing_ones;
                    u4_codesize = 6;
                }
                else
                {
                    /* Num-VLC 0, 1, 2 */
                    if (u4_vlcnum > 1)
                    {
                        u4_vlcnum = 2;
                    }
                    u4_codeword = gu1_code_coeff_token_table[u4_vlcnum][u4_trailing_ones][u4_total_coeff-1];
                    u4_codesize = gu1_size_coeff_token_table[u4_vlcnum][u4_trailing_ones][u4_total_coeff-1];
                }

                DEBUG("\n[%d numcoeff, %d numtrailing ones, %d nnz]",u4_total_coeff, u4_trailing_ones, u4_nc);
                ENTROPY_TRACE("\tnumber of non zero coeffs ",u4_total_coeff);
                ENTROPY_TRACE("\tnumber of trailing ones ",u4_trailing_ones);
                ENTROPY_TRACE("\tnC ",u4_nc);
            }

            DEBUG("\nCOEFF TOKEN 0: %d u4_codeword, %d u4_codesize",u4_codeword, u4_codesize);
            ENTROPY_TRACE("\tcodeword ",u4_codeword);
            ENTROPY_TRACE("\tcodesize ",u4_codesize);

            error_status = ih264e_put_bits(ps_bit_stream, u4_codeword, u4_codesize);
        }

        /* write sign of trailing ones */
        if (u4_trailing_ones)
        {
            /* one sign bit per trailing one, sent as a single code */
            DEBUG("\nT1's: %d u4_codeword, %d u4_codesize",u4_sign1, u4_trailing_ones);
            error_status = ih264e_put_bits(ps_bit_stream, u4_sign1, u4_trailing_ones);
            ENTROPY_TRACE("\tnumber of trailing ones ",u4_trailing_ones);
            ENTROPY_TRACE("\tsign of trailing ones ",u4_sign1);
        }

        /* write level codes */
        if (u4_remaining_coeff)
        {
            /* levels are coded starting from the last non trailing-one level */
            WORD32 i4_level = pi2_res_block[u4_remaining_coeff-1];
            UWORD32 u4_escape;
            UWORD32 u4_suffix_length = 0; // Level-VLC[N]
            UWORD32 u4_abs_level, u4_abs_level_actual = 0;
            WORD32 i4_sign;
            /* rounding offset (2^(suffix_length - 1) - 1) added before the shift */
            const UWORD32 u4_rndfactor[] = {0, 0, 1, 3, 7, 15, 31};

            DEBUG("\n \t%d coeff,",i4_level);
            ENTROPY_TRACE("\tcoeff ",i4_level);

            if (u4_trailing_ones < 3)
            {
                /* If there are less than 3 T1s, then the first non-T1 level is incremented if negative (decremented if positive)*/
                if (i4_level < 0)
                {
                    i4_level += 1;
                }
                else
                {
                    i4_level -= 1;
                }

                /* remember the magnitude taken off above; it is added back */
                /* into u4_abs_level_actual below                           */
                u4_abs_level_actual = 1;

                /* Initialize VLC table (Suffix Length) to encode the level */
                if (u4_total_coeff > 10)
                {
                    u4_suffix_length = 1;
                }
            }

            /* i4_sign is 0 for non-negative and -1 for negative levels;   */
            /* (x + s) ^ s then gives abs(x) without a branch              */
            /* (relies on >> of a negative WORD32 being an arithmetic shift) */
            i4_sign = (i4_level >> (sizeof(WORD32) * CHAR_BIT - 1));
            u4_abs_level = ((i4_level + i4_sign) ^ i4_sign);

            u4_abs_level_actual += u4_abs_level;

            u4_escape = (u4_abs_level + u4_rndfactor[u4_suffix_length]) >> u4_suffix_length;

            while (1)
            {
                UWORD32 u4_codesize;
                UWORD32 u4_codeword;
                UWORD32 u4_codeval;

                u4_remaining_coeff--;

GATHER_CAVLC_STATS1();

                {
                    /* code value = 2 * abs - 2 for positive levels and    */
                    /* 2 * abs - 1 for negative ones (i4_sign is 0 or -1)  */
                    u4_codeval = u4_abs_level << 1;
                    u4_codeval = u4_codeval - 2 - i4_sign;

                    if ((!u4_suffix_length) && (u4_escape > 7) && (u4_abs_level < 16))
                    {
                        /* 19 bit code, only used with suffix length 0 */
                        u4_codeword = (1 << 4) + (u4_codeval - 14);
                        u4_codesize = 19;
                    }
                    else if (u4_escape > 7)
                    {
                        /* 28 bit escape code */
                        u4_codeword = (1 << 12) + (u4_codeval - (15 << u4_suffix_length));
                        u4_codesize = 28;
                        if (!u4_suffix_length)
                        {
                            u4_codeword -= 15;
                        }
                    }
                    else
                    {
                        /* regular code: zeros, a 1, then u4_suffix_length suffix bits */
                        u4_codeword = (1 << u4_suffix_length) + (u4_codeval & ((1 << u4_suffix_length)-1));
                        u4_codesize = (u4_codeval >> u4_suffix_length) + 1 + u4_suffix_length;
                    }
                }

                /*put the level code in bitstream*/
                DEBUG("\nLEVEL: %d u4_codeword, %d u4_codesize",u4_codeword, u4_codesize);
                ENTROPY_TRACE("\tcodeword ",u4_codeword);
                ENTROPY_TRACE("\tcodesize ",u4_codesize);
                error_status = ih264e_put_bits(ps_bit_stream, u4_codeword, u4_codesize);

                if (u4_remaining_coeff == 0) break;

                /*update suffix length for next level*/
                if (u4_suffix_length == 0)
                {
                    u4_suffix_length++;
                }
                if (u4_suffix_length < 6)
                {
                    /* move to the next table once the level just coded is */
                    /* above the threshold of the current one              */
                    if (u4_abs_level_actual > gu1_threshold_vlc_level[u4_suffix_length])
                    {
                        u4_suffix_length++;
                    }
                }

                /* next level */
                i4_level = pi2_res_block[u4_remaining_coeff-1];

                DEBUG("\n \t%d coeff,",i4_level);
                ENTROPY_TRACE("\tcoeff ",i4_level);

                i4_sign = (i4_level >> (sizeof(WORD32) * CHAR_BIT - 1));
                u4_abs_level = ((i4_level + i4_sign) ^ i4_sign);

                /* no magnitude adjustment is applied after the first level */
                u4_abs_level_actual = u4_abs_level;

                u4_escape = (u4_abs_level + u4_rndfactor[u4_suffix_length]) >> u4_suffix_length;
            }
        }

        DEBUG("\n \t %d totalzeros",u4_tot_zeros);
        ENTROPY_TRACE("\ttotal zeros ",u4_tot_zeros);

        /* Write Total Zeros */
        /* (skipped when the block already holds its maximum number of coeffs) */
        if (u4_total_coeff < u4_max_num_coeff)
        {
            WORD32 index;
            UWORD32 u4_codeword;
            UWORD32 u4_codesize;

            if (u4_block_type == CAVLC_CHROMA_4x4_DC)
            {
                /* start offset of each total_coeff row inside the chroma table */
                UWORD8 gu1_index_zero_table_chroma[] = {0, 4, 7};
                index = gu1_index_zero_table_chroma[u4_total_coeff-1] + u4_tot_zeros;
                u4_codesize = gu1_size_zero_table_chroma[index];
                u4_codeword = gu1_code_zero_table_chroma[index];
            }
            else
            {
                index = gu1_index_zero_table[u4_total_coeff-1] + u4_tot_zeros;
                u4_codesize = gu1_size_zero_table[index];
                u4_codeword = gu1_code_zero_table[index];
            }

            DEBUG("\nTOTAL ZEROS: %d u4_codeword, %d u4_codesize",u4_codeword, u4_codesize);
            ENTROPY_TRACE("\tcodeword ",u4_codeword);
            ENTROPY_TRACE("\tcodesize ",u4_codesize);
            error_status = ih264e_put_bits(ps_bit_stream, u4_codeword, u4_codesize);
        }

        /* Write Run Before */
        if (u4_tot_zeros)
        {
            /* walk the coefficients from the last one down to index 1; the */
            /* loop never codes a run for index 0                           */
            UWORD32 u4_max_num_coef = u4_total_coeff-1;
            UWORD32 u4_codeword;
            UWORD32 u4_codesize;
            UWORD32 u4_zeros_left = u4_tot_zeros;

            while (u4_max_num_coef)
            {
                UWORD32 u4_run_before = pu1_zero_run[u4_max_num_coef];
                UWORD32 u4_index;

                /* the run table row depends on the zeros still to be placed */
                if (u4_zeros_left > MAX_ZERO_LEFT)
                {
                    u4_index = gu1_index_run_table[MAX_ZERO_LEFT];
                }
                else
                {
                    u4_index = gu1_index_run_table[u4_zeros_left - 1];
                }

                u4_codesize = gu1_size_run_table[u4_index + u4_run_before];
                u4_codeword = gu1_code_run_table[u4_index + u4_run_before];

                DEBUG("\nRUN BEFORE ZEROS: %d u4_codeword, %d u4_codesize",u4_codeword, u4_codesize);
                ENTROPY_TRACE("\tcodeword ",u4_codeword);
                ENTROPY_TRACE("\tcodesize ",u4_codesize);
                error_status = ih264e_put_bits(ps_bit_stream, u4_codeword, u4_codesize);

                u4_zeros_left -= u4_run_before;
                /* stop as soon as every zero has been accounted for */
                if (!u4_zeros_left)
                {
                    break;
                }
                u4_max_num_coef--;
            }
        }
    }

    return error_status;
}

/**
*******************************************************************************
*
* @brief
* This function generates CAVLC coded bit stream for the given subblock
*
* @param[in] ps_ent_ctxt
* Pointer to entropy context
*
* @param[in] pi2_res_block
* Pointers to residual blocks of all the partitions for the current subblk
* (containing levels in scan order)
*
* @param[in] pu1_nnz
* Total non-zero coefficients of all the partitions for the current subblk
*
* @param[in] pu2_sig_coeff_map
* Significant coefficient map of all the partitions for the current subblk
*
*
@param[in] u4_block_type +* entropy coding block type +* +* @param[in] u4_ngbr_avbl +* top and left availability of all the partitions for the current subblk +* (packed) +* +* @param[in] pu1_top_nnz +* pointer to the buffer containing nnz of all the subblks to the top +* +* @param[in] pu1_left_nnz +* pointer to the buffer containing nnz of all the subblks to the left +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +static IH264E_ERROR_T ih264e_write_coeff8x8_cavlc(entropy_ctxt_t *ps_ent_ctxt, + WORD16 **pi2_res_block, + UWORD8 *pu1_nnz, + UWORD16 *pu2_sig_coeff_map, + ENTROPY_BLK_TYPE u4_block_type, + UWORD32 u4_ngbr_avlb, + UWORD8 *pu1_top_nnz, + UWORD8 *pu1_left_nnz) +{ + IH264E_ERROR_T error_status = IH264E_SUCCESS; + bitstrm_t *ps_bitstream = ps_ent_ctxt->ps_bitstrm; + UWORD8 *pu1_zero_run = ps_ent_ctxt->au1_zero_run, *pu1_ngbr_avbl; + UWORD32 u4_nC; + UWORD8 u1_mb_a, u1_mb_b; + + pu1_ngbr_avbl = (void *)(&u4_ngbr_avlb); + + /* encode ac block index 4x4 = 0*/ + u1_mb_a = pu1_ngbr_avbl[0] & 0x0F; + u1_mb_b = pu1_ngbr_avbl[0] & 0xF0; + u4_nC = 0; + if (u1_mb_a) + u4_nC += pu1_left_nnz[0]; + if (u1_mb_b) + u4_nC += pu1_top_nnz[0]; + if (u1_mb_a && u1_mb_b) + u4_nC = (u4_nC + 1) >> 1; + pu1_left_nnz[0] = pu1_top_nnz[0] = pu1_nnz[0]; + error_status = ih264e_write_coeff4x4_cavlc(pi2_res_block[0], pu1_nnz[0], u4_block_type, pu1_zero_run, u4_nC, ps_bitstream, pu2_sig_coeff_map[0]); + + /* encode ac block index 4x4 = 1*/ + u1_mb_a = pu1_ngbr_avbl[1] & 0x0F; + u1_mb_b = pu1_ngbr_avbl[1] & 0xF0; + u4_nC = 0; + if (u1_mb_a) + u4_nC += pu1_left_nnz[0]; + if (u1_mb_b) + u4_nC += pu1_top_nnz[1]; + if (u1_mb_a && u1_mb_b) + u4_nC = (u4_nC + 1) >> 1; + pu1_left_nnz[0] = pu1_top_nnz[1] = pu1_nnz[1]; + error_status = ih264e_write_coeff4x4_cavlc(pi2_res_block[1], pu1_nnz[1], u4_block_type, pu1_zero_run, u4_nC, ps_bitstream, pu2_sig_coeff_map[1]); + + /* encode ac block index 4x4 = 2*/ + u1_mb_a 
= pu1_ngbr_avbl[2] & 0x0F; + u1_mb_b = pu1_ngbr_avbl[2] & 0xF0; + u4_nC = 0; + if (u1_mb_a) + u4_nC += pu1_left_nnz[1]; + if (u1_mb_b) + u4_nC += pu1_top_nnz[0]; + if (u1_mb_a && u1_mb_b) + u4_nC = (u4_nC + 1) >> 1; + pu1_left_nnz[1] = pu1_top_nnz[0] = pu1_nnz[2]; + error_status = ih264e_write_coeff4x4_cavlc(pi2_res_block[2], pu1_nnz[2], u4_block_type, pu1_zero_run, u4_nC, ps_bitstream, pu2_sig_coeff_map[2]); + + /* encode ac block index 4x4 = 0*/ + u1_mb_a = pu1_ngbr_avbl[3] & 0x0F; + u1_mb_b = pu1_ngbr_avbl[3] & 0xF0; + u4_nC = 0; + if (u1_mb_a) + u4_nC += pu1_left_nnz[1]; + if (u1_mb_b) + u4_nC += pu1_top_nnz[1]; + if (u1_mb_a && u1_mb_b) + u4_nC = (u4_nC + 1) >> 1; + pu1_left_nnz[1] = pu1_top_nnz[1] = pu1_nnz[3]; + error_status = ih264e_write_coeff4x4_cavlc(pi2_res_block[3], pu1_nnz[3], u4_block_type, pu1_zero_run, u4_nC, ps_bitstream, pu2_sig_coeff_map[3]); + + return error_status; +} + +/** +******************************************************************************* +* +* @brief +* This function encodes luma and chroma residues of a macro block when +* the entropy coding mode chosen is cavlc. 
+* +* @param[in] ps_ent_ctxt +* Pointer to entropy context +* +* @param[in] u4_mb_type +* current mb type +* +* @param[in] u4_cbp +* coded block pattern for the current mb +* +* @returns error code +* +* @remarks none +* +******************************************************************************* +*/ +static IH264E_ERROR_T ih264e_encode_residue(entropy_ctxt_t *ps_ent_ctxt, + UWORD32 u4_mb_type, + UWORD32 u4_cbp) +{ + /* error status */ + IH264E_ERROR_T error_status = IH264E_SUCCESS; + + /* packed residue */ + void *pv_mb_coeff_data = ps_ent_ctxt->pv_mb_coeff_data; + + /* bit stream buffer */ + bitstrm_t *ps_bitstream = ps_ent_ctxt->ps_bitstrm; + + /* zero run */ + UWORD8 *pu1_zero_run = ps_ent_ctxt->au1_zero_run; + + /* temp var */ + UWORD32 u4_nC, u4_ngbr_avlb; + UWORD8 au1_nnz[4], *pu1_ngbr_avlb, *pu1_top_nnz, *pu1_left_nnz; + UWORD16 au2_sig_coeff_map[4]; + WORD16 *pi2_res_block[4]; + UWORD8 *pu1_slice_idx = ps_ent_ctxt->pu1_slice_idx; + tu_sblk_coeff_data_t *ps_mb_coeff_data; + ENTROPY_BLK_TYPE e_entropy_blk_type = CAVLC_LUMA_4x4; + + /* ngbr availability */ + UWORD8 u1_mb_a, u1_mb_b; + + /* cbp */ + UWORD32 u4_cbp_luma = u4_cbp & 0xF, u4_cbp_chroma = u4_cbp >> 4; + + /* mb indices */ + WORD32 i4_mb_x, i4_mb_y; + + /* derive neighbor availability */ + i4_mb_x = ps_ent_ctxt->i4_mb_x; + i4_mb_y = ps_ent_ctxt->i4_mb_y; + pu1_slice_idx += (i4_mb_y * ps_ent_ctxt->i4_wd_mbs); + /* left macroblock availability */ + u1_mb_a = (i4_mb_x == 0 || + (pu1_slice_idx[i4_mb_x - 1 ] != pu1_slice_idx[i4_mb_x]))? 0 : 1; + /* top macroblock availability */ + u1_mb_b = (i4_mb_y == 0 || + (pu1_slice_idx[i4_mb_x-ps_ent_ctxt->i4_wd_mbs] != pu1_slice_idx[i4_mb_x]))? 
0 : 1; + + pu1_ngbr_avlb = (void *)(&u4_ngbr_avlb); + pu1_top_nnz = ps_ent_ctxt->pu1_top_nnz_luma[ps_ent_ctxt->i4_mb_x]; + pu1_left_nnz = (UWORD8 *)&ps_ent_ctxt->u4_left_nnz_luma; + + /* encode luma residue */ + + /* mb type intra 16x16 */ + if (u4_mb_type == I16x16) + { + /* parse packed coeff data structure for residual data */ + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[0], au2_sig_coeff_map[0], pi2_res_block[0]); + /* estimate nnz for the current mb */ + u4_nC = 0; + if (u1_mb_a) + u4_nC += pu1_left_nnz[0]; + if (u1_mb_b) + u4_nC += pu1_top_nnz[0]; + if (u1_mb_a && u1_mb_b) + u4_nC = (u4_nC + 1) >> 1; + + /* encode dc block */ + ENTROPY_TRACE("Luma DC blk idx %d",0); + error_status = ih264e_write_coeff4x4_cavlc(pi2_res_block[0], au1_nnz[0], CAVLC_LUMA_4x4_DC, pu1_zero_run, u4_nC, ps_bitstream, au2_sig_coeff_map[0]); + + e_entropy_blk_type = CAVLC_LUMA_4x4_AC; + } + + if (u4_cbp_luma & 1) + { + /* encode ac block index 8x8 = 0*/ + /* parse packed coeff data structure for residual data */ + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[0], au2_sig_coeff_map[0], pi2_res_block[0]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[1], au2_sig_coeff_map[1], pi2_res_block[1]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[2], au2_sig_coeff_map[2], pi2_res_block[2]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[3], au2_sig_coeff_map[3], pi2_res_block[3]); + /* derive sub block neighbor availability */ + + pu1_ngbr_avlb[0] = (u1_mb_b << 4) | (u1_mb_a); + pu1_ngbr_avlb[1] = (u1_mb_b << 4) | 1; + pu1_ngbr_avlb[2] = (1 << 4) | (u1_mb_a); + pu1_ngbr_avlb[3] = 0x11; + /* encode sub blk */ + ENTROPY_TRACE("Luma blk idx %d",0); + error_status = ih264e_write_coeff8x8_cavlc(ps_ent_ctxt, pi2_res_block, au1_nnz, au2_sig_coeff_map, e_entropy_blk_type, u4_ngbr_avlb, pu1_top_nnz, pu1_left_nnz); + } + else + { + pu1_top_nnz[0] = pu1_top_nnz[1] = 0; + 
pu1_left_nnz[0] = pu1_left_nnz[1] = 0; + } + + if (u4_cbp_luma & 2) + { + /* encode ac block index 8x8 = 1*/ + /* parse packed coeff data structure for residual data */ + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[0], au2_sig_coeff_map[0], pi2_res_block[0]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[1], au2_sig_coeff_map[1], pi2_res_block[1]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[2], au2_sig_coeff_map[2], pi2_res_block[2]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[3], au2_sig_coeff_map[3], pi2_res_block[3]); + + /* derive sub block neighbor availability */ + pu1_ngbr_avlb[1] = pu1_ngbr_avlb[0] = (u1_mb_b << 4) | 1; + pu1_ngbr_avlb[3] = pu1_ngbr_avlb[2] = 0x11; + /* encode sub blk */ + ENTROPY_TRACE("Luma blk idx %d",1); + error_status = ih264e_write_coeff8x8_cavlc(ps_ent_ctxt, pi2_res_block, au1_nnz, au2_sig_coeff_map, e_entropy_blk_type, u4_ngbr_avlb, pu1_top_nnz+2, pu1_left_nnz); + } + else + { + (pu1_top_nnz + 2)[0] = (pu1_top_nnz + 2)[1] = 0; + pu1_left_nnz[0] = pu1_left_nnz[1] = 0; + } + + if (u4_cbp_luma & 0x4) + { + /* encode ac block index 8x8 = 2*/ + /* parse packed coeff data structure for residual data */ + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[0], au2_sig_coeff_map[0], pi2_res_block[0]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[1], au2_sig_coeff_map[1], pi2_res_block[1]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[2], au2_sig_coeff_map[2], pi2_res_block[2]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[3], au2_sig_coeff_map[3], pi2_res_block[3]); + + /* derive sub block neighbor availability */ + pu1_ngbr_avlb[2] = pu1_ngbr_avlb[0] = (1 << 4) | u1_mb_a; + pu1_ngbr_avlb[1] = pu1_ngbr_avlb[3] = 0x11; + /* encode sub blk */ + ENTROPY_TRACE("Luma blk idx %d",2); + error_status = 
ih264e_write_coeff8x8_cavlc(ps_ent_ctxt, pi2_res_block, au1_nnz, au2_sig_coeff_map, e_entropy_blk_type, u4_ngbr_avlb, pu1_top_nnz, (pu1_left_nnz+2)); + } + else + { + pu1_top_nnz[0] = pu1_top_nnz[1] = 0; + (pu1_left_nnz + 2)[0] = (pu1_left_nnz + 2)[1] = 0; + } + + if (u4_cbp_luma & 0x8) + { + /* encode ac block index 8x8 = 3*/ + /* parse packed coeff data structure for residual data */ + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[0], au2_sig_coeff_map[0], pi2_res_block[0]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[1], au2_sig_coeff_map[1], pi2_res_block[1]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[2], au2_sig_coeff_map[2], pi2_res_block[2]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[3], au2_sig_coeff_map[3], pi2_res_block[3]); + + /* derive sub block neighbor availability */ + u4_ngbr_avlb = 0x11111111; + /* encode sub blk */ + ENTROPY_TRACE("Luma blk idx %d",3); + error_status = ih264e_write_coeff8x8_cavlc(ps_ent_ctxt, pi2_res_block, au1_nnz, au2_sig_coeff_map, e_entropy_blk_type, u4_ngbr_avlb, pu1_top_nnz+2, pu1_left_nnz+2); + } + else + { + (pu1_top_nnz + 2)[0] = (pu1_top_nnz + 2)[1] = 0; + (pu1_left_nnz + 2)[0] = (pu1_left_nnz + 2)[1] = 0; + } + + /* encode chroma residue */ + if (u4_cbp_chroma & 3) + { + /* parse packed coeff data structure for residual data */ + /* cb, cr */ + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[0], au2_sig_coeff_map[0], pi2_res_block[0]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[1], au2_sig_coeff_map[1], pi2_res_block[1]); + + /* encode dc block */ + /* cb, cr */ + ENTROPY_TRACE("Chroma DC blk idx %d",0); + error_status = ih264e_write_coeff4x4_cavlc(pi2_res_block[0], au1_nnz[0], CAVLC_CHROMA_4x4_DC, pu1_zero_run, 0, ps_bitstream, au2_sig_coeff_map[0]); + ENTROPY_TRACE("Chroma DC blk idx %d",1); + error_status = 
ih264e_write_coeff4x4_cavlc(pi2_res_block[1], au1_nnz[1], CAVLC_CHROMA_4x4_DC, pu1_zero_run, 0, ps_bitstream, au2_sig_coeff_map[1]); + } + + pu1_top_nnz = ps_ent_ctxt->pu1_top_nnz_cbcr[ps_ent_ctxt->i4_mb_x]; + pu1_left_nnz = (UWORD8 *) &ps_ent_ctxt->u4_left_nnz_cbcr; + + /* encode sub blk */ + if (u4_cbp_chroma & 0x2) + { + /* encode ac block index 8x8 = 0*/ + /* derive sub block neighbor availability */ + pu1_ngbr_avlb[0] = (u1_mb_b << 4) | (u1_mb_a); + pu1_ngbr_avlb[1] = (u1_mb_b << 4) | 1; + pu1_ngbr_avlb[2] = (1 << 4) | (u1_mb_a); + pu1_ngbr_avlb[3] = 0x11; + + /* parse packed coeff data structure for residual data */ + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[0], au2_sig_coeff_map[0], pi2_res_block[0]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[1], au2_sig_coeff_map[1], pi2_res_block[1]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[2], au2_sig_coeff_map[2], pi2_res_block[2]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[3], au2_sig_coeff_map[3], pi2_res_block[3]); + + ENTROPY_TRACE("Chroma AC blk idx %d",0); + error_status = ih264e_write_coeff8x8_cavlc(ps_ent_ctxt, pi2_res_block, au1_nnz, au2_sig_coeff_map, CAVLC_CHROMA_4x4_AC, u4_ngbr_avlb, pu1_top_nnz, pu1_left_nnz); + } + else + { + pu1_top_nnz[0] = pu1_top_nnz[1] = 0; + pu1_left_nnz[0] = pu1_left_nnz[1] = 0; + } + + pu1_top_nnz += 2; + pu1_left_nnz += 2; + + /* encode sub blk */ + if (u4_cbp_chroma & 0x2) + { + /* parse packed coeff data structure for residual data */ + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[0], au2_sig_coeff_map[0], pi2_res_block[0]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[1], au2_sig_coeff_map[1], pi2_res_block[1]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[2], au2_sig_coeff_map[2], pi2_res_block[2]); + PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, au1_nnz[3], 
au2_sig_coeff_map[3], pi2_res_block[3]); + + ENTROPY_TRACE("Chroma AC blk idx %d",1); + error_status = ih264e_write_coeff8x8_cavlc(ps_ent_ctxt, pi2_res_block, au1_nnz, au2_sig_coeff_map, CAVLC_CHROMA_4x4_AC, u4_ngbr_avlb, pu1_top_nnz, pu1_left_nnz); + } + else + { + pu1_top_nnz[0] = pu1_top_nnz[1] = 0; + pu1_left_nnz[0] = pu1_left_nnz[1] = 0; + } + + /* store the index of the next mb coeff data */ + ps_ent_ctxt->pv_mb_coeff_data = pv_mb_coeff_data; + + return error_status; +} + +#define GET_NUM_BITS(ps_bitstream) ((ps_bitstream->u4_strm_buf_offset << 3) + 32 - ps_bitstream->i4_bits_left_in_cw) + +/** +******************************************************************************* +* +* @brief +* This function generates CAVLC coded bit stream for an Intra Slice. +* +* @description +* The mb syntax layer for intra slices constitutes luma mb mode, luma sub modes +* (if present), mb qp delta, coded block pattern, chroma mb mode and +* luma/chroma residue. These syntax elements are written as directed by table +* 7.3.5 of h264 specification. 
+* +* @param[in] ps_ent_ctxt +* pointer to entropy context +* +* @returns error code +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_write_islice_mb(entropy_ctxt_t *ps_ent_ctxt) +{ + /* error status */ + IH264E_ERROR_T error_status = IH264E_SUCCESS; + + /* bit stream ptr */ + bitstrm_t *ps_bitstream = ps_ent_ctxt->ps_bitstrm; + + /* packed header data */ + UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data; + + /* mb header info */ + /* + * mb_tpm : mb type plus mode + * mb_type : luma mb type and chroma mb type are packed + * cbp : coded block pattern + * mb_qp_delta : mb qp delta + * chroma_intra_mode : chroma intra mode + * luma_intra_mode : luma intra mode + */ + WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode; + WORD8 mb_qp_delta; + + /* temp var */ + WORD32 i, mb_type_stream; + + WORD32 bitstream_start_offset, bitstream_end_offset; + + /* Starting bitstream offset for header in bits */ + bitstream_start_offset = GET_NUM_BITS(ps_bitstream); + + + /********************************************************************/ + /* BEGIN HEADER GENERATION */ + /********************************************************************/ + + /* mb header info */ + mb_tpm = *pu1_byte++; + cbp = *pu1_byte++; + mb_qp_delta = *pu1_byte++; + + /* mb type */ + mb_type = mb_tpm & 0xF; + /* is intra ? 
*/ + if (mb_type == I16x16) + { + UWORD32 u4_cbp_l, u4_cbp_c; + + u4_cbp_c = (cbp >> 4); + u4_cbp_l = (cbp & 0xF); + luma_intra_mode = (mb_tpm >> 4) & 3; + chroma_intra_mode = (mb_tpm >> 6); + + mb_type_stream = luma_intra_mode + 1 + (u4_cbp_c << 2) + (u4_cbp_l == 15) * 12; + + /* write mb type */ + PUT_BITS_UEV(ps_bitstream, mb_type_stream, error_status, "mb type"); + + /* intra_chroma_pred_mode */ + PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + } + else if (mb_type == I4x4) + { + /* mb sub blk modes */ + WORD32 intra_pred_mode_flag, rem_intra_mode; + WORD32 byte; + + chroma_intra_mode = (mb_tpm >> 6); + + /* write mb type */ + PUT_BITS_UEV(ps_bitstream, 0, error_status, "mb type"); + + for (i = 0; i < 16; i += 2) + { + /* sub blk idx 1 */ + byte = *pu1_byte++; + + intra_pred_mode_flag = byte & 0x1; + + /* prev_intra4x4_pred_mode_flag */ + PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag"); + + /* rem_intra4x4_pred_mode */ + if (!intra_pred_mode_flag) + { + rem_intra_mode = (byte & 0xF) >> 1; + PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode"); + } + + /* sub blk idx 2 */ + byte >>= 4; + + intra_pred_mode_flag = byte & 0x1; + + /* prev_intra4x4_pred_mode_flag */ + PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag"); + + /* rem_intra4x4_pred_mode */ + if (!intra_pred_mode_flag) + { + rem_intra_mode = (byte & 0xF) >> 1; + PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode"); + } + } + + /* intra_chroma_pred_mode */ + PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + } + else if (mb_type == I8x8) + { + /* transform 8x8 flag */ + UWORD32 u4_transform_size_8x8_flag = ps_ent_ctxt->i1_transform_8x8_mode_flag; + + /* mb sub blk modes */ + WORD32 intra_pred_mode_flag, rem_intra_mode; + WORD32 byte; + + chroma_intra_mode = (mb_tpm >> 6); + + 
ASSERT(0); + + /* write mb type */ + PUT_BITS_UEV(ps_bitstream, 0, error_status, "mb type"); + + /* u4_transform_size_8x8_flag */ + PUT_BITS(ps_bitstream, u4_transform_size_8x8_flag, 1, error_status, "u4_transform_size_8x8_flag"); + + /* write sub block modes */ + for (i = 0; i < 4; i++) + { + /* sub blk idx 1 */ + byte = *pu1_byte++; + + intra_pred_mode_flag = byte & 0x1; + + /* prev_intra4x4_pred_mode_flag */ + PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag"); + + /* rem_intra4x4_pred_mode */ + if (!intra_pred_mode_flag) + { + rem_intra_mode = (byte & 0xF) >> 1; + PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode"); + } + + /* sub blk idx 2 */ + byte >>= 4; + + intra_pred_mode_flag = byte & 0x1; + + /* prev_intra4x4_pred_mode_flag */ + PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag"); + + /* rem_intra4x4_pred_mode */ + if (!intra_pred_mode_flag) + { + rem_intra_mode = (byte & 0xF) >> 1; + PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode"); + } + } + + /* intra_chroma_pred_mode */ + PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + } + else + { + } + + /* coded_block_pattern */ + if (mb_type != I16x16) + { + PUT_BITS_UEV(ps_bitstream, gu1_cbp_map_tables[cbp][0], error_status, "coded_block_pattern"); + } + + if (cbp || mb_type == I16x16) + { + /* mb_qp_delta */ + PUT_BITS_SEV(ps_bitstream, mb_qp_delta, error_status, "mb_qp_delta"); + } + + /* Ending bitstream offset for header in bits */ + bitstream_end_offset = GET_NUM_BITS(ps_bitstream); + + ps_ent_ctxt->u4_header_bits[0] += bitstream_end_offset - bitstream_start_offset; + + /* Starting bitstream offset for residue */ + bitstream_start_offset = bitstream_end_offset; + + /* residual */ + error_status = ih264e_encode_residue(ps_ent_ctxt, mb_type, cbp); + + /* Ending bitstream offset for reside in bits */ + bitstream_end_offset = 
GET_NUM_BITS(ps_bitstream); + ps_ent_ctxt->u4_residue_bits[0] += bitstream_end_offset - bitstream_start_offset; + + /* store the index of the next mb syntax layer */ + ps_ent_ctxt->pv_mb_header_data = pu1_byte; + + return error_status; +} + +/** +******************************************************************************* +* +* @brief +* This function generates CAVLC coded bit stream for Inter slices +* +* @description +* The mb syntax layer for inter slices constitutes luma mb mode, luma sub modes +* (if present), mb qp delta, coded block pattern, chroma mb mode and +* luma/chroma residue. These syntax elements are written as directed by table +* 7.3.5 of h264 specification +* +* @param[in] ps_ent_ctxt +* pointer to entropy context +* +* @returns error code +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_write_pslice_mb(entropy_ctxt_t *ps_ent_ctxt) +{ + /* error status */ + IH264E_ERROR_T error_status = IH264E_SUCCESS; + + /* bit stream ptr */ + bitstrm_t *ps_bitstream = ps_ent_ctxt->ps_bitstrm; + + /* packed header data */ + UWORD8 *pu1_byte = ps_ent_ctxt->pv_mb_header_data; + + /* mb header info */ + /* + * mb_tpm : mb type plus mode + * mb_type : luma mb type and chroma mb type are packed + * cbp : coded block pattern + * mb_qp_delta : mb qp delta + * chroma_intra_mode : chroma intra mode + * luma_intra_mode : luma intra mode + * ps_pu : Pointer to the array of structures having motion vectors, size + * and position of sub partitions + */ + WORD32 mb_tpm, mb_type, cbp, chroma_intra_mode, luma_intra_mode; + WORD8 mb_qp_delta; + + /* temp var */ + WORD32 i, mb_type_stream, cbptable = 1; + + WORD32 is_inter = 0; + + WORD32 bitstream_start_offset, bitstream_end_offset; + + /* Starting bitstream offset for header in bits */ + bitstream_start_offset = GET_NUM_BITS(ps_bitstream); + + /********************************************************************/ + /* BEGIN HEADER GENERATION 
*/ + /********************************************************************/ + + /* mb header info */ + mb_tpm = *pu1_byte++; + + /* mb type */ + mb_type = mb_tpm & 0xF; + + /* check for skip */ + if (mb_type == PSKIP) + { + UWORD32 *nnz; + + is_inter = 1; + + /* increment skip counter */ + (*ps_ent_ctxt->pi4_mb_skip_run)++; + + /* store the index of the next mb syntax layer */ + ps_ent_ctxt->pv_mb_header_data = pu1_byte; + + /* set nnz to zero */ + ps_ent_ctxt->u4_left_nnz_luma = 0; + nnz = (UWORD32 *)ps_ent_ctxt->pu1_top_nnz_luma[ps_ent_ctxt->i4_mb_x]; + *nnz = 0; + ps_ent_ctxt->u4_left_nnz_cbcr = 0; + nnz = (UWORD32 *)ps_ent_ctxt->pu1_top_nnz_cbcr[ps_ent_ctxt->i4_mb_x]; + *nnz = 0; + + /* residual */ + error_status = ih264e_encode_residue(ps_ent_ctxt, P16x16, 0); + + bitstream_end_offset = GET_NUM_BITS(ps_bitstream); + + ps_ent_ctxt->u4_header_bits[is_inter] += bitstream_end_offset - bitstream_start_offset; + + return error_status; + } + + /* remaining mb header info */ + cbp = *pu1_byte++; + mb_qp_delta = *pu1_byte++; + + /* mb skip run */ + PUT_BITS_UEV(ps_bitstream, *ps_ent_ctxt->pi4_mb_skip_run, error_status, "mb skip run"); + + /* reset skip counter */ + *ps_ent_ctxt->pi4_mb_skip_run = 0; + + /* is intra ? 
*/ + if (mb_type == I16x16) + { + UWORD32 u4_cbp_l, u4_cbp_c; + + is_inter = 0; + + u4_cbp_c = (cbp >> 4); + u4_cbp_l = (cbp & 0xF); + luma_intra_mode = (mb_tpm >> 4) & 3; + chroma_intra_mode = (mb_tpm >> 6); + + mb_type_stream = luma_intra_mode + 1 + (u4_cbp_c << 2) + (u4_cbp_l == 15) * 12; + + mb_type_stream += 5; + + /* write mb type */ + PUT_BITS_UEV(ps_bitstream, mb_type_stream, error_status, "mb type"); + + /* intra_chroma_pred_mode */ + PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + } + else if (mb_type == I4x4) + { + /* mb sub blk modes */ + WORD32 intra_pred_mode_flag, rem_intra_mode; + WORD32 byte; + + is_inter = 0; + + chroma_intra_mode = (mb_tpm >> 6); + cbptable = 0; + + /* write mb type */ + PUT_BITS_UEV(ps_bitstream, 5, error_status, "mb type"); + + for (i = 0; i < 16; i += 2) + { + /* sub blk idx 1 */ + byte = *pu1_byte++; + + intra_pred_mode_flag = byte & 0x1; + + /* prev_intra4x4_pred_mode_flag */ + PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag"); + + /* rem_intra4x4_pred_mode */ + if (!intra_pred_mode_flag) + { + rem_intra_mode = (byte & 0xF) >> 1; + PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode"); + } + + /* sub blk idx 2 */ + byte >>= 4; + + intra_pred_mode_flag = byte & 0x1; + + /* prev_intra4x4_pred_mode_flag */ + PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag"); + + /* rem_intra4x4_pred_mode */ + if (!intra_pred_mode_flag) + { + rem_intra_mode = (byte & 0xF) >> 1; + PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode"); + } + } + + /* intra_chroma_pred_mode */ + PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + } + else if (mb_type == I8x8) + { + /* transform 8x8 flag */ + UWORD32 u4_transform_size_8x8_flag = ps_ent_ctxt->i1_transform_8x8_mode_flag; + + /* mb sub blk modes */ + WORD32 intra_pred_mode_flag, 
rem_intra_mode; + WORD32 byte; + + is_inter = 0; + + chroma_intra_mode = (mb_tpm >> 6); + cbptable = 0; + + ASSERT(0); + + /* write mb type */ + PUT_BITS_UEV(ps_bitstream, 5, error_status, "mb type"); + + /* u4_transform_size_8x8_flag */ + PUT_BITS(ps_bitstream, u4_transform_size_8x8_flag, 1, error_status, "u4_transform_size_8x8_flag"); + + /* write sub block modes */ + for (i = 0; i < 4; i++) + { + /* sub blk idx 1 */ + byte = *pu1_byte++; + + intra_pred_mode_flag = byte & 0x1; + + /* prev_intra4x4_pred_mode_flag */ + PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag"); + + /* rem_intra4x4_pred_mode */ + if (!intra_pred_mode_flag) + { + rem_intra_mode = (byte & 0xF) >> 1; + PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode"); + } + + /* sub blk idx 2 */ + byte >>= 4; + + intra_pred_mode_flag = byte & 0x1; + + /* prev_intra4x4_pred_mode_flag */ + PUT_BITS(ps_bitstream, intra_pred_mode_flag, 1, error_status, "prev_intra4x4_pred_mode_flag"); + + /* rem_intra4x4_pred_mode */ + if (!intra_pred_mode_flag) + { + rem_intra_mode = (byte & 0xF) >> 1; + PUT_BITS(ps_bitstream, rem_intra_mode, 3, error_status, "rem_intra4x4_pred_mode"); + } + } + + /* intra_chroma_pred_mode */ + PUT_BITS_UEV(ps_bitstream, chroma_intra_mode, error_status, "intra_chroma_pred_mode"); + } + else + { + /* inter macro block partition cnt */ + const UWORD8 au1_part_cnt[] = { 1, 2, 2, 4 }; + + /* mv ptr */ + WORD16 *pi2_mv_ptr = (WORD16 *)pu1_byte; + + /* number of partitions for the current mb */ + UWORD32 u4_part_cnt = au1_part_cnt[mb_type - 3]; + + is_inter = 1; + + /* write mb type */ + PUT_BITS_UEV(ps_bitstream, mb_type - 3, error_status, "mb type"); + + for (i = 0; i < (WORD32)u4_part_cnt; i++) + { + PUT_BITS_SEV(ps_bitstream, *pi2_mv_ptr++, error_status, "mv x"); + + PUT_BITS_SEV(ps_bitstream, *pi2_mv_ptr++, error_status, "mv y"); + } + + pu1_byte = (UWORD8 *)pi2_mv_ptr; + } + + /* coded_block_pattern */ + if (mb_type != 
I16x16) + { + PUT_BITS_UEV(ps_bitstream, gu1_cbp_map_tables[cbp][cbptable], error_status, "coded_block_pattern"); + } + + if (cbp || mb_type == I16x16) + { + /* mb_qp_delta */ + PUT_BITS_SEV(ps_bitstream, mb_qp_delta, error_status, "mb_qp_delta"); + } + + + /* Ending bitstream offset for header in bits */ + bitstream_end_offset = GET_NUM_BITS(ps_bitstream); + + ps_ent_ctxt->u4_header_bits[is_inter] += bitstream_end_offset - bitstream_start_offset; + + /* start bitstream offset for residue in bits */ + bitstream_start_offset = bitstream_end_offset; + + /* residual */ + error_status = ih264e_encode_residue(ps_ent_ctxt, mb_type, cbp); + + /* Ending bitstream offset for residue in bits */ + bitstream_end_offset = GET_NUM_BITS(ps_bitstream); + + ps_ent_ctxt->u4_residue_bits[is_inter] += bitstream_end_offset - bitstream_start_offset; + + /* store the index of the next mb syntax layer */ + ps_ent_ctxt->pv_mb_header_data = pu1_byte; + + return error_status; +} diff --git a/encoder/ih264e_cavlc.h b/encoder/ih264e_cavlc.h new file mode 100755 index 0000000..86f4cd4 --- /dev/null +++ b/encoder/ih264e_cavlc.h @@ -0,0 +1,112 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_cavlc.h +* +* @brief +* This file contains enumerations, macros and extern declarations of H264 +* cavlc tables +* +* @author +* ittiam +* +* @remarks +* none +****************************************************************************** +*/ + +#ifndef IH264E_CAVLC_H_ +#define IH264E_CAVLC_H_ + +/*****************************************************************************/ +/* Function macro definitions */ +/*****************************************************************************/ + +#define PARSE_COEFF_DATA_BLOCK_4x4(pv_mb_coeff_data, ps_mb_coeff_data, u4_nnz, u4_sig_coeff_map, pi2_res_block) \ + {\ + ps_mb_coeff_data = pv_mb_coeff_data; \ + u4_nnz = ps_mb_coeff_data->i4_sig_map_nnz & 0xff; \ + if (u4_nnz)\ + {\ + u4_sig_coeff_map = ps_mb_coeff_data->i4_sig_map_nnz >> 16; \ + pi2_res_block = ps_mb_coeff_data->ai2_residue; \ + pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue + u4_nnz; \ + }\ + else\ + {\ + pv_mb_coeff_data = ps_mb_coeff_data->ai2_residue;\ + }\ + } + + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* This function generates CAVLC coded bit stream for an Intra Slice. +* +* @description +* The mb syntax layer for intra slices constitutes luma mb mode, luma sub modes +* (if present), mb qp delta, coded block pattern, chroma mb mode and +* luma/chroma residue. These syntax elements are written as directed by table +* 7.3.5 of h264 specification. 
+* +* @param[in] ps_ent_ctxt +* pointer to entropy context +* +* @returns error code +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_write_islice_mb(entropy_ctxt_t *ps_ent_ctxt); + +/** +******************************************************************************* +* +* @brief +* This function generates CAVLC coded bit stream for Inter slices +* +* @description +* The mb syntax layer for inter slices constitutes luma mb mode, luma sub modes +* (if present), mb qp delta, coded block pattern, chroma mb mode and +* luma/chroma residue. These syntax elements are written as directed by table +* 7.3.5 of h264 specification +* +* @param[in] ps_ent_ctxt +* pointer to entropy context +* +* @returns error code +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_write_pslice_mb(entropy_ctxt_t *ps_ent_ctxt); + +#endif /* IH264E_CAVLC_H_ */ diff --git a/encoder/ih264e_config.h b/encoder/ih264e_config.h new file mode 100755 index 0000000..2446cdb --- /dev/null +++ b/encoder/ih264e_config.h @@ -0,0 +1,52 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_config.h +* +* @brief +* contains any necessary declarations/definitions that are used during codec +* build +* +* @author +* ittiam +* +* @remarks +* none +****************************************************************************** +*/ + +#ifndef IH264E_CONFIG_H_ +#define IH264E_CONFIG_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +#define CAVLC_LEVEL_STATS 0 +#define GATING_STATS 0 +#define DEBUG_PRINT 0 +#define ENABLE_TRACE 0 +#define DEBUG_RC 0 +#define TRACE_SUPPORT 0 + +#endif /* IH264E_CONFIG_H_ */ diff --git a/encoder/ih264e_core_coding.c b/encoder/ih264e_core_coding.c new file mode 100755 index 0000000..5ba18de --- /dev/null +++ b/encoder/ih264e_core_coding.c @@ -0,0 +1,2365 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * ih264e_core_coding.c + * + * @brief + * This file contains routines that perform luma and chroma core coding for + * intra macroblocks + * + * @author + * ittiam + * + * @par List of Functions: + * - ih264e_pack_l_mb_i16() + * - ih264e_pack_c_mb_i8() + * - ih264e_code_luma_intra_macroblock_16x16() + * - ih264e_code_luma_intra_macroblock_4x4() + * - ih264e_code_chroma_intra_macroblock_8x8() + * + * @remarks + * None + * + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <string.h> +#include <assert.h> + +/* User include files */ +#include "ih264e_config.h" +#include "ih264_typedefs.h" +#include "ih264_platform_macros.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264e_defs.h" +#include "ih264_trans_data.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_globals.h" +#include "ih264e_core_coding.h" +#include "ih264e_mc.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** 
+******************************************************************************* +* +* @brief +* This function performs does the DCT transform then Hadamard transform +* and quantization for a macroblock when the mb mode is intra 16x16 mode +* +* @par Description: +* First cf4 is done on all 16 4x4 blocks of the 16x16 input block. +* Then hadamard transform is done on the DC coefficients +* Quantization is then performed on the 16x16 block, 4x4 wise +* +* @param[in] pu1_src +* Pointer to source sub-block +* +* @param[in] pu1_pred +* Pointer to prediction sub-block +* +* @param[in] pi2_out +* Pointer to residual sub-block +* The output will be in linear format +* The first 16 continuous locations will contain the values of Dc block +* After DC block and a stride 1st AC block will follow +* After one more stride next AC block will follow +* The blocks will be in raster scan order +* +* @param[in] src_strd +* Source stride +* +* @param[in] pred_strd +* Prediction stride +* +* @param[in] dst_strd +* Destination stride +* +* @param[in] pu2_scale_matrix +* The quantization matrix for 4x4 transform +* +* @param[in] pu2_threshold_matrix +* Threshold matrix +* +* @param[in] u4_qbits +* 15+QP/6 +* +* @param[in] u4_round_factor +* Round factor for quant +* +* @param[out] pu1_nnz +* Memory to store the non-zeros after transform +* The first byte will be the nnz of DC block +* From the next byte the AC nnzs will be stored in raster scan order +* +* @param u4_dc_flag +* Signals if Dc transform is to be done or not +* 1 -> Dc transform will be done +* 0 -> Dc transform will not be done +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_luma_16x16_resi_trans_dctrans_quant(codec_t *ps_codec, + UWORD8 *pu1_src, + UWORD8 *pu1_pred, + WORD16 *pi2_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, + UWORD32 u4_qbits, + UWORD32 
u4_round_factor, + UWORD8 *pu1_nnz, + UWORD32 u4_dc_flag) + +{ + WORD32 blk_cntr; + WORD32 i4_offsetx, i4_offsety; + UWORD8 *pu1_curr_src, *pu1_curr_pred; + + WORD16 *pi2_dc_str = pi2_out; + + /* Move to the ac addresses */ + pu1_nnz++; + pi2_out += dst_strd; + + for (blk_cntr = 0; blk_cntr < NUM_LUMA4x4_BLOCKS_IN_MB; blk_cntr++) + { + IND2SUB_LUMA_MB(blk_cntr, i4_offsetx, i4_offsety); + + pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd; + pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd; + + ps_codec->pf_resi_trans_quant_4x4(pu1_curr_src, pu1_curr_pred, + pi2_out + blk_cntr * dst_strd, + src_strd, pred_strd, pu2_scale_matrix, + pu2_threshold_matrix, u4_qbits, + u4_round_factor, &pu1_nnz[blk_cntr], + &pi2_dc_str[blk_cntr]); + + } + + if (!u4_dc_flag) + return; + + /* + * In case of i16x16, we need to remove the contribution of dc coeffs into + * nnz of each block. We are doing that in the packing function + */ + + /* Adjust pointers to point to dc values */ + pi2_out -= dst_strd; + pu1_nnz--; + + u4_qbits++; + u4_round_factor <<= 1; + + ps_codec->pf_hadamard_quant_4x4(pi2_dc_str, pi2_out, pu2_scale_matrix, + pu2_threshold_matrix, u4_qbits, + u4_round_factor, &pu1_nnz[0]); +} + +/** +******************************************************************************* +* +* @brief +* This function performs the intra 16x16 inverse transform process for H264 +* it includes inverse Dc transform, inverse quant and then inverse transform +* +* @par Description: +* +* @param[in] pi2_src +* Input data, 16x16 size +* First 16 mem locations will have the Dc coffs in rater scan order in linear fashion +* after a stride 1st AC clock will be present again in raster can order +* Then each AC block of the 16x16 block will follow in raster scan order +* +* @param[in] pu1_pred +* The predicted data, 16x16 size +* Block by block form +* +* @param[in] pu1_out +* Output 16x16 +* In block by block form +* +* @param[in] src_strd +* Source stride +* +* @param[in] 
pred_strd +* input stride for prediction buffer +* +* @param[in] out_strd +* input stride for output buffer +* +* @param[in] pu2_iscale_mat +* Inverse quantization matrix for 4x4 transform +* +* @param[in] pu2_weigh_mat +* weight matrix of 4x4 transform +* +* @param[in] qp_div +* QP/6 +* +* @param[in] pi4_tmp +* Input temporary buffer +* needs to be at least 20 in size +* +* @param[in] pu4_cntrl +* Controls the transform path +* total Last 17 bits are used +* the 16th th bit will correspond to DC block +* and 32-17 will correspond to the ac blocks in raster scan order +* bit equaling zero indicates that the entire 4x4 block is zero for DC +* For AC blocks a bit equaling zero will mean that all 15 AC coffs of the block is nonzero +* +* @param[in] pi4_tmp +* Input temporary buffer +* needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 size +* +* @returns +* none +* +* @remarks +* The all zero case must be taken care outside +* +******************************************************************************* +*/ +void ih264e_luma_16x16_idctrans_iquant_itrans_recon(codec_t *ps_codec, + WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + UWORD32 u4_cntrl, + UWORD32 u4_dc_trans_flag, + WORD32 *pi4_tmp) +{ + /* Start index for inverse quant in a 4x4 block */ + WORD32 iq_start_idx = (u4_dc_trans_flag == 0) ? 
0 : 1; + + /* Cntrl bits for 4x4 transforms + * u4_blk_cntrl : controls if a 4x4 block should be processed in ac path + * u4_dc_cntrl : controls is a 4x4 block is to be processed in dc path + * : dc block must contain only single dc coefficient + * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac + * : ie not (ac or dc) + */ + UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl; + + /* tmp registers for block ids */ + UWORD32 u4_blk_id; + + /* Subscrripts */ + WORD32 i4_offset_x, i4_offset_y; + + UWORD8 *pu1_cur_prd_blk, *pu1_cur_out_blk; + + /* Src and stride for dc coeffs */ + UWORD32 u4_dc_inc; + WORD16 *pi2_dc_src; + + /* + * For intra blocks we need to do inverse dc transform + * In case if intra blocks, its here that we populate the dc bits in cntrl + * as they cannot be populated any earlier + */ + if (u4_dc_trans_flag) + { + UWORD32 cntr, u4_dc_cntrl; + /* Do inv hadamard and place the results at the start of each AC block */ + ps_codec->pf_ihadamard_scaling_4x4(pi2_src, pi2_src, pu2_iscale_mat, + pu2_weigh_mat, qp_div, pi4_tmp); + + /* Update the cntrl flag */ + u4_dc_cntrl = 0; + for (cntr = 0; cntr < DC_COEFF_CNT_LUMA_MB; cntr++) + { + u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr)); + } + /* Mark dc bits as 1 if corresponding ac bit is 0 */ + u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl); + /* Combine both ac and dc bits */ + u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA) + | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_LUMA); + } + + /* Source for dc coeffs + * If the block is intra, we have to read dc values from first row of src + * then stride for each block is 1, other wise its src stride + */ + pi2_dc_src = (iq_start_idx == 0) ? (pi2_src + src_strd) : pi2_src; + u4_dc_inc = (iq_start_idx == 0) ? 
src_strd : 1; + + /* The AC blocks starts from 2nd row */ + pi2_src += src_strd; + + /* Get the block bits */ + u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA); + u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_LUMA) << 16; + u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFFFF0000; + + /* Get first block to process */ + DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id); + while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB) + { + /* Compute address of src blocks */ + WORD32 i4_src_offset = u4_dc_inc * u4_blk_id; + + IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y); + + /* Compute address of out and pred blocks */ + pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd; + pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd; + + /* Do inv dc transform */ + ps_codec->pf_iquant_itrans_recon_4x4_dc(pi2_dc_src + i4_src_offset, + pu1_cur_prd_blk, + pu1_cur_out_blk, pred_strd, + out_strd, pu2_iscale_mat, + pu2_weigh_mat, qp_div, NULL, + iq_start_idx, + pi2_dc_src + i4_src_offset); + /* Get next DC block to process */ + DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id); + } + + /* now process ac/mixed blocks */ + DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id); + while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB) + { + + WORD32 i4_src_offset = src_strd * u4_blk_id; + + IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y); + + pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd; + pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd; + + ps_codec->pf_iquant_itrans_recon_4x4(pi2_src + i4_src_offset, + pu1_cur_prd_blk, pu1_cur_out_blk, + pred_strd, out_strd, + pu2_iscale_mat, pu2_weigh_mat, + qp_div, (WORD16*) pi4_tmp, + iq_start_idx, + pi2_dc_src + u4_blk_id); + + DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id); + } + + /* Now process empty blocks */ + DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id); + while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB) + { + IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, 
i4_offset_y); + + pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd; + pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd; + + ps_codec->pf_inter_pred_luma_copy(pu1_cur_prd_blk, pu1_cur_out_blk, + pred_strd, out_strd, SIZE_4X4_BLK_HRZ, + SIZE_4X4_BLK_VERT, 0, 0); + + DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id); + } +} + +/** +******************************************************************************* +* +* @brief +* This function performs does the DCT transform then Hadamard transform +* and quantization for a chroma macroblock +* +* @par Description: +* First cf4 is done on all 16 4x4 blocks of the 8x8input block +* Then hadamard transform is done on the DC coefficients +* Quantization is then performed on the 8x8 block, 4x4 wise +* +* @param[in] pu1_src +* Pointer to source sub-block +* The input is in interleaved format for two chroma planes +* +* @param[in] pu1_pred +* Pointer to prediction sub-block +* Prediction is in inter leaved format +* +* @param[in] pi2_out +* Pointer to residual sub-block +* The output will be in linear format +* The first 4 continuous locations will contain the values of DC block for U +* and then next 4 will contain for V. 
+* After DC block and a stride 1st AC block of U plane will follow +* After one more stride next AC block of V plane will follow +* The blocks will be in raster scan order +* +* After all the AC blocks of U plane AC blocks of V plane will follow in exact +* same way +* +* @param[in] src_strd +* Source stride +* +* @param[in] pred_strd +* Prediction stride +* +* @param[in] dst_strd +* Destination stride +* +* @param[in] pu2_scale_matrix +* The quantization matrix for 4x4 transform +* +* @param[in] pu2_threshold_matrix +* Threshold matrix +* +* @param[in] u4_qbits +* 15+QP/6 +* +* @param[in] u4_round_factor +* Round factor for quant +* +* @param[out] pu1_nnz +* Memory to store the non-zeros after transform +* The first byte will be the nnz od DC block for U plane +* From the next byte the AC nnzs will be storerd in raster scan order +* The fifth byte will be nnz of Dc block of V plane +* Then Ac blocks will follow +* +* @param u4_dc_flag +* Signals if Dc transform is to be done or not +* 1 -> Dc transform will be done +* 0 -> Dc transform will not be done +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_chroma_8x8_resi_trans_dctrans_quant(codec_t *ps_codec, + UWORD8 *pu1_src, + UWORD8 *pu1_pred, + WORD16 *pi2_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, + UWORD32 u4_qbits, + UWORD32 u4_round_factor, + UWORD8 *pu1_nnz_c) +{ + WORD32 blk_cntr; + WORD32 i4_offsetx, i4_offsety; + UWORD8 *pu1_curr_src, *pu1_curr_pred; + + WORD16 pi2_dc_str[8]; + UWORD8 au1_dcnnz[2]; + + /* Move to the ac addresses */ + pu1_nnz_c++; + pi2_out += out_strd; + + for (blk_cntr = 0; blk_cntr < NUM_CHROMA4x4_BLOCKS_IN_MB; blk_cntr++) + { + IND2SUB_CHROMA_MB(blk_cntr, i4_offsetx, i4_offsety); + + pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd; + pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd; + + /* For chroma, 
v plane nnz is populated from position 5 */ + ps_codec->pf_resi_trans_quant_chroma_4x4( + pu1_curr_src, pu1_curr_pred, + pi2_out + blk_cntr * out_strd, src_strd, pred_strd, + pu2_scale_matrix, pu2_threshold_matrix, u4_qbits, + u4_round_factor, &pu1_nnz_c[blk_cntr + (blk_cntr > 3)], + &pi2_dc_str[blk_cntr]); + } + + /* Adjust pointers to point to dc values */ + pi2_out -= out_strd; + pu1_nnz_c--; + + u4_qbits++; + u4_round_factor <<= 1; + + ps_codec->pf_hadamard_quant_2x2_uv(pi2_dc_str, pi2_out, pu2_scale_matrix, + pu2_threshold_matrix, u4_qbits, + u4_round_factor, au1_dcnnz); + + /* Copy the dc nnzs */ + pu1_nnz_c[0] = au1_dcnnz[0]; + pu1_nnz_c[5] = au1_dcnnz[1]; + +} + +/** +******************************************************************************* +* @brief +* This function performs the inverse transform with process for chroma MB of H264 +* +* @par Description: +* Does inverse DC transform ,inverse quantization inverse transform +* +* @param[in] pi2_src +* Input data, 16x16 size +* The input is in the form of, first 4 locations will contain DC coeffs of +* U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane +* in raster scan order will follow, each block as linear array in raster scan order. +* After a stride next AC block will follow. After all AC blocks of U plane +* V plane AC blocks will follow in exact same order. 
+* +* @param[in] pu1_pred +* The predicted data, 8x16 size, U and V interleaved +* +* @param[in] pu1_out +* Output 8x16, U and V interleaved +* +* @param[in] src_strd +* Source stride +* +* @param[in] pred_strd +* input stride for prediction buffer +* +* @param[in] out_strd +* input stride for output buffer +* +* @param[in] pu2_iscale_mat +* Inverse quantization martix for 4x4 transform +* +* @param[in] pu2_weigh_mat +* weight matrix of 4x4 transform +* +* @param[in] qp_div +* QP/6 +* +* @param[in] pi4_tmp +* Input temporary buffer +* needs to be at least COFF_CNT_SUB_BLK_4x4 + Number of Dc cofss for chroma * number of planes +* in size +* +* @param[in] pu4_cntrl +* Controls the transform path +* the 15 th bit will correspond to DC block of U plane , 14th will indicate the V plane Dc block +* 32-28 bits will indicate AC blocks of U plane in raster scan order +* 27-23 bits will indicate AC blocks of V plane in rater scan order +* The bit 1 implies that there is at least one non zero coeff in a block +* +* @returns +* none +* +* @remarks +******************************************************************************* +*/ +void ih264e_chroma_8x8_idctrans_iquant_itrans_recon(codec_t *ps_codec, + WORD16 *pi2_src, + UWORD8 *pu1_pred, + UWORD8 *pu1_out, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 out_strd, + const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, + UWORD32 u4_cntrl, + WORD32 *pi4_tmp) +{ + /* Cntrl bits for 4x4 transforms + * u4_blk_cntrl : controls if a 4x4 block should be processed in ac path + * u4_dc_cntrl : controls is a 4x4 block is to be processed in dc path + * : dc block must contain only single dc coefficient + * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac + * : ie not (ac or dc) + */ + + UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl; + + /* tmp registers for block ids */ + WORD32 u4_blk_id; + + /* Offsets for pointers */ + WORD32 i4_offset_x, i4_offset_y; + + /* Pointer to 4x4 
blocks */ + UWORD8 *pu1_cur_4x4_prd_blk, *pu1_cur_4x4_out_blk; + + /* Tmp register for pointer to dc coffs */ + WORD16 *pi2_dc_src; + + WORD16 i2_zero = 0; + + /* Increment for dc block */ + WORD32 i4_dc_inc; + + /* + * Lets do the inverse transform for dc coeffs in chroma + */ + if (u4_cntrl & CNTRL_FLAG_DCBLK_MASK_CHROMA) + { + UWORD32 cntr, u4_dc_cntrl; + /* Do inv hadamard for u an v block */ + + ps_codec->pf_ihadamard_scaling_2x2_uv(pi2_src, pi2_src, pu2_iscale_mat, + pu2_weigh_mat, qp_div, NULL); + /* + * Update the cntrl flag + * Flag is updated as follows bits 15-11 -> u block dc bits + */ + u4_dc_cntrl = 0; + for (cntr = 0; cntr < 8; cntr++) + { + u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr)); + } + + /* Mark dc bits as 1 if corresponding ac bit is 0 */ + u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl); + /* Combine both ac and dc bits */ + u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA) + | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_CHROMA); + + /* Since we populated the dc coffs, we have to read them from there */ + pi2_dc_src = pi2_src; + i4_dc_inc = 1; + } + else + { + u4_cntrl = u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA; + pi2_dc_src = &i2_zero; + i4_dc_inc = 0; + } + + /* Get the block bits */ + u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA); + u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_CHROMA) << 16; + u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFF000000; + + /* The AC blocks starts from 2nd row */ + pi2_src += src_strd; + + DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id); + while (u4_blk_id < 8) + { + WORD32 dc_src_offset = u4_blk_id * i4_dc_inc; + + IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y); + + pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd; + pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd; + + ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc( + pi2_dc_src + dc_src_offset, pu1_cur_4x4_prd_blk, + pu1_cur_4x4_out_blk, pred_strd, out_strd, NULL, NULL, 0, + NULL, pi2_dc_src 
+ dc_src_offset); + /* Get next DC block to process */ + DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id); + } + + /* now process ac/mixed blocks */ + DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id); + while (u4_blk_id < 8) + { + WORD32 i4_src_offset = src_strd * u4_blk_id; + WORD32 dc_src_offset = i4_dc_inc * u4_blk_id; + + IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y); + + pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd; + pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd; + + ps_codec->pf_iquant_itrans_recon_chroma_4x4(pi2_src + i4_src_offset, + pu1_cur_4x4_prd_blk, + pu1_cur_4x4_out_blk, + pred_strd, out_strd, + pu2_iscale_mat, + pu2_weigh_mat, qp_div, + (WORD16 *) pi4_tmp, + pi2_dc_src + dc_src_offset); + + DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id); + } + + /* Now process empty blocks */ + DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id); + while (u4_blk_id < 8) + { + IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y); + + pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd; + pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd; + + ps_codec->pf_interleave_copy(pu1_cur_4x4_prd_blk, pu1_cur_4x4_out_blk, + pred_strd, out_strd, SIZE_4X4_BLK_VERT, + SIZE_4X4_BLK_HRZ); + + DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id); + } +} + +/** +****************************************************************************** +* +* @brief This function packs residue of an i16x16 luma mb for entropy coding +* +* @par Description +* An i16 macro block contains two classes of units, dc 4x4 block and +* 4x4 ac blocks. while packing the mb, the dc block is sent first, and +* the 16 ac blocks are sent next in scan order. Each and every block is +* represented by 3 parameters (nnz, significant coefficient map and the +* residue coefficients itself). If a 4x4 unit does not have any coefficients +* then only nnz is sent. 
Inside a 4x4 block the individual coefficients are +* sent in scan order. +* +* The first byte of each block will be nnz of the block, if it is non zero, +* a 2 byte significance map is sent. This is followed by nonzero coefficients. +* This is repeated for 1 dc + 16 ac blocks. +* +* @param[in] pi2_res_mb +* pointer to residue mb +* +* @param[in, out] pv_mb_coeff_data +* buffer pointing to packed residue coefficients +* +* @param[in] u4_res_strd +* residual block stride +* +* @param[out] u1_cbp_l +* coded block pattern luma +* +* @param[in] pu1_nnz +* number of non zero coefficients in each 4x4 unit +* +* @param[out] +* Control signal for inverse transform of 16x16 blocks +* +* @return none +* +* @ remarks +* +****************************************************************************** +*/ +void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb, + void **pv_mb_coeff_data, + WORD32 i4_res_strd, + UWORD8 *u1_cbp_l, + UWORD8 *pu1_nnz, + UWORD32 *pu4_cntrl) +{ + /* pointer to packed sub block buffer space */ + tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data), *ps_mb_coeff_data_ac; + + /* no of non zero coefficients in the current sub block */ + UWORD32 u4_nnz_cnt; + + /* significant coefficient map */ + UWORD32 u4_s_map; + + /* pointer to scanning matrix */ + const UWORD8 *pu1_scan_order; + + /* number of non zeros in sub block */ + UWORD32 u4_nnz; + + /* coeff scan order */ + const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}; + + /* temp var */ + UWORD32 coeff_cnt, mask, b4,u4_cntrl=0; + + /*DC and AC coeff pointers*/ + WORD16 *pi2_res_mb_ac,*pi2_res_mb_dc; + + /********************************************************/ + /* pack dc coeff data for entropy coding */ + /********************************************************/ + + pi2_res_mb_dc = pi2_res_mb; + pu1_scan_order = gu1_luma_scan_order_dc; + + u4_nnz = *pu1_nnz; + u4_cntrl = 0; + + /* write number of non zero coefficients */ + ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz; 
+ + if (u4_nnz) + { + for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++) + { + if (pi2_res_mb_dc[pu1_scan_order[coeff_cnt]]) + { + /* write residue */ + ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_dc[pu1_scan_order[coeff_cnt]]; + u4_s_map |= mask; + } + mask <<= 1; + } + /* write significant coeff map */ + ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16); + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt; + + u4_cntrl = 0x00008000;// Set DC bit in ctrl code + } + else + { + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue; + } + + /********************************************************/ + /* pack ac coeff data for entropy coding */ + /********************************************************/ + + pu1_nnz ++; + pu1_scan_order = gu1_luma_scan_order; + pi2_res_mb += i4_res_strd; /*Move to AC block*/ + + ps_mb_coeff_data_ac = (*pv_mb_coeff_data); + + for (b4 = 0; b4 < 16; b4++) + { + ps_mb_coeff_data = (*pv_mb_coeff_data); + + u4_nnz = pu1_nnz[u1_scan_order[b4]]; + + /* Jump according to the scan order */ + pi2_res_mb_ac = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]); + + /* + * Since this is a i16x16 block, we should not count dc coeff on indi + * vidual 4x4 blocks to nnz. But due to the implementation of 16x16 + * trans function, we add dc's nnz to u4_nnz too. 
Hence we adjust that + * here + */ + u4_nnz -= (pi2_res_mb_ac[0] != 0); + + /* write number of non zero coefficients */ + ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz; + + if (u4_nnz) + { + for (u4_nnz_cnt = 0, coeff_cnt = 1, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++) + { + if (pi2_res_mb_ac[pu1_scan_order[coeff_cnt]]) + { + /* write residue */ + ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_ac[pu1_scan_order[coeff_cnt]]; + u4_s_map |= mask; + } + mask <<= 1; + } + /* write significant coeff map */ + ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16); + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt; + *u1_cbp_l = 15; + + u4_cntrl |= (1 << (31 - u1_scan_order[b4])); + } + else + { + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue; + } + + } + + if (!(*u1_cbp_l)) + { + (*pv_mb_coeff_data) = ps_mb_coeff_data_ac; + } + + /* Store the cntrl signal */ + (*pu4_cntrl) = u4_cntrl; + return; +} + +/** +****************************************************************************** +* +* @brief This function packs residue of an p16x16 luma mb for entropy coding +* +* @par Description +* A p16x16 macro block contains two classes of units 16 4x4 ac blocks. +* while packing the mb, the dc block is sent first, and +* the 16 ac blocks are sent next in scan order. Each and every block is +* represented by 3 parameters (nnz, significant coefficient map and the +* residue coefficients itself). If a 4x4 unit does not have any coefficients +* then only nnz is sent. Inside a 4x4 block the individual coefficients are +* sent in scan order. +* +* The first byte of each block will be nnz of the block, if it is non zero, +* a 2 byte significance map is sent. This is followed by nonzero coefficients. +* This is repeated for 1 dc + 16 ac blocks. 
+* +* @param[in] pi2_res_mb +* pointer to residue mb +* +* @param[in, out] pv_mb_coeff_data +* buffer pointing to packed residue coefficients +* +* @param[in] i4_res_strd +* residual block stride +* +* @param[out] u1_cbp_l +* coded block pattern luma +* +* @param[in] pu1_nnz +* number of non zero coefficients in each 4x4 unit +* +* @param[out] pu4_cntrl +* Control signal for inverse transform +* +* @return none +* +* @remarks Killing coffs not yet coded +* +****************************************************************************** +*/ +void ih264e_pack_l_mb(WORD16 *pi2_res_mb, + void **pv_mb_coeff_data, + WORD32 i4_res_strd, + UWORD8 *u1_cbp_l, + UWORD8 *pu1_nnz, + UWORD32 u4_thres_resi, + UWORD32 *pu4_cntrl) +{ + /* pointer to packed sub block buffer space */ + tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8, *ps_mb_coeff_data_mb; + + /* no of non zero coefficients in the current sub block */ + UWORD32 u4_nnz_cnt; + + /* significant coefficient map */ + UWORD32 u4_s_map; + + /* pointer to scanning matrix */ + const UWORD8 *pu1_scan_order = gu1_luma_scan_order; + + /* number of non zeros in sub block */ + UWORD32 u4_nnz; + + /* pointer to residual sub block */ + WORD16 *pi2_res_sb; + + /* coeff scan order */ + const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}; + + /* coeff cost */ + const UWORD8 *pu1_coeff_cost = gu1_coeff_cost; + + /* temp var */ + UWORD32 u4_mb_coeff_cost = 0, u4_b8_coeff_cost = 0, coeff_cnt, mask, u4_cntrl = 0, b4, b8; + + /* temp var */ + WORD32 i4_res_val, i4_run = -1, dcac_block; + + /* When Hadamard transform is disabled, first row values are dont care, ignore them */ + pi2_res_mb += i4_res_strd; + + /* When Hadamard transform is disabled, first unit value is dont care, ignore this */ + pu1_nnz ++; + + ps_mb_coeff_data_mb = ps_mb_coeff_data_b8 = (*pv_mb_coeff_data); + + /********************************************************/ + /* pack coeff data for entropy coding */ + 
/********************************************************/ + + for (b4 = 0; b4 < 16; b4++) + { + ps_mb_coeff_data = (*pv_mb_coeff_data); + + b8 = b4 >> 2; + + u4_nnz = pu1_nnz[u1_scan_order[b4]]; + + /* Jump according to the scan order */ + pi2_res_sb = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]); + + /* write number of non zero coefficients */ + ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz; + + if (u4_nnz) + { + for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++) + { + /* number of runs of zero before, this is used to compute coeff cost */ + i4_run++; + + i4_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]]; + + if (i4_res_val) + { + /* write residue */ + ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i4_res_val; + u4_s_map |= mask; + + if (u4_thres_resi) + { + /* compute coeff cost */ + if (i4_res_val == 1 || i4_res_val == -1) + { + if (i4_run < 6) + u4_b8_coeff_cost += pu1_coeff_cost[i4_run]; + } + else + u4_b8_coeff_cost += 9; + + i4_run = -1; + } + } + + mask <<= 1; + } + + /* write significant coeff map */ + ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16); + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt; + + /* cbp */ + *u1_cbp_l |= (1 << b8); + + /* Cntrl map for inverse transform computation + * + * If coeff_cnt is zero, it means that only nonzero was a dc coeff + * Hence we have to set the 16 - u1_scan_order[b4]) position instead + * of 31 - u1_scan_order[b4] + */ + dcac_block = (coeff_cnt == 0)?16:31; + u4_cntrl |= (1 << (dcac_block - u1_scan_order[b4])); + } + else + { + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue; + } + + /* Decide if the 8x8 unit has to be sent for entropy coding? 
*/ + if ((b4+1) % 4 == 0) + { + if ( u4_thres_resi && (u4_b8_coeff_cost <= LUMA_SUB_BLOCK_SKIP_THRESHOLD) && + (*u1_cbp_l & (1 << b8)) ) + { + + + /* + * When we want to reset the full 8x8 block, we have to reset + * both the dc and ac coeff bits hence we have the symmetric + * arrangement of bits + */ + const UWORD32 cntrl_mask_map[4] = {0xcc00cc00, 0x33003300, 0x00cc00cc, 0x00330033}; + + /* restore cbp */ + *u1_cbp_l = (*u1_cbp_l & (~(1 << b8))); + + /* correct cntrl flag */ + u4_cntrl = u4_cntrl & (~cntrl_mask_map[(b4 >> 2)]); + + /* correct nnz */ + pu1_nnz[u1_scan_order[b4 - 3]] = 0; + pu1_nnz[u1_scan_order[b4 - 2]] = 0; + pu1_nnz[u1_scan_order[b4 - 1]] = 0; + pu1_nnz[u1_scan_order[b4]] = 0; + + /* reset blk cost */ + u4_b8_coeff_cost = 0; + } + + if (!(*u1_cbp_l & (1 << b8))) + { + (*pv_mb_coeff_data) = ps_mb_coeff_data_b8; + } + + u4_mb_coeff_cost += u4_b8_coeff_cost; + + u4_b8_coeff_cost = 0; + i4_run = -1; + ps_mb_coeff_data_b8 = (*pv_mb_coeff_data); + } + } + + if (u4_thres_resi && (u4_mb_coeff_cost <= LUMA_BLOCK_SKIP_THRESHOLD) + && (*u1_cbp_l)) + { + (*pv_mb_coeff_data) = ps_mb_coeff_data_mb; + *u1_cbp_l = 0; + u4_cntrl = 0; + memset(pu1_nnz, 0, 16); + } + + (*pu4_cntrl) = u4_cntrl; + + return; +} + +/** +****************************************************************************** +* +* @brief This function packs residue of an i8x8 chroma mb for entropy coding +* +* @par Description +* An i8 chroma macro block contains two classes of units, dc 2x2 block and +* 4x4 ac blocks. while packing the mb, the dc block is sent first, and +* the 4 ac blocks are sent next in scan order. Each and every block is +* represented by 3 parameters (nnz, significant coefficient map and the +* residue coefficients itself). If a 4x4 unit does not have any coefficients +* then only nnz is sent. Inside a 4x4 block the individual coefficients are +* sent in scan order. 
+* +* The first byte of each block will be nnz of the block, if it is non zero, +* a 2 byte significance map is sent. This is followed by nonzero coefficients. +* This is repeated for 1 dc + 4 ac blocks. +* +* @param[in] pi2_res_mb +* pointer to residue mb +* +* @param[in, out] pv_mb_coeff_data +* buffer pointing to packed residue coefficients +* +* @param[in] u4_res_strd +* residual block stride +* +* @param[out] u1_cbp_c +* coded block pattern chroma +* +* @param[in] pu1_nnz +* number of non zero coefficients in each 4x4 unit +* +* @param[out] pu1_nnz +* Control signal for inverse transform +* +* @param[in] u4_swap_uv +* Swaps the order of U and V planes in entropy bitstream +* +* @return none +* +* @ remarks +* +****************************************************************************** +*/ +void ih264e_pack_c_mb(WORD16 *pi2_res_mb, + void **pv_mb_coeff_data, + WORD32 i4_res_strd, + UWORD8 *u1_cbp_c, + UWORD8 *pu1_nnz, + UWORD32 u4_thres_resi, + UWORD32 *pu4_cntrl, + UWORD32 u4_swap_uv) +{ + /* pointer to packed sub block buffer space */ + tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data); + tu_sblk_coeff_data_t *ps_mb_coeff_data_dc, *ps_mb_coeff_data_ac; + + /* nnz pointer */ + UWORD8 *pu1_nnz_ac, *pu1_nnz_dc; + + /* nnz counter */ + UWORD32 u4_nnz_cnt; + + /* significant coefficient map */ + UWORD32 u4_s_map; + + /* pointer to scanning matrix */ + const UWORD8 *pu1_scan_order; + + /* no of non zero coefficients in the current sub block */ + UWORD32 u4_nnz; + + /* pointer to residual sub block, res val */ + WORD16 *pi2_res_sb, i2_res_val; + + /* temp var */ + UWORD32 coeff_cnt, mask, b4,plane; + + /* temp var */ + UWORD32 u4_coeff_cost; + WORD32 i4_run; + + /* coeff cost */ + const UWORD8 *pu1_coeff_cost = gu1_coeff_cost; + + /* pointer to packed buffer space */ + UWORD32 *pu4_mb_coeff_data = NULL; + + /* ac coded block pattern */ + UWORD8 u1_cbp_ac; + + /* Variable to store the current bit pos in cntrl variable*/ + UWORD32 cntrl_pos = 0; + + 
/********************************************************/ + /* pack dc coeff data for entropy coding */ + /********************************************************/ + pu1_scan_order = gu1_chroma_scan_order_dc; + pi2_res_sb = pi2_res_mb; + pu1_nnz_dc = pu1_nnz; + (*pu4_cntrl) = 0; + cntrl_pos = 15; + ps_mb_coeff_data_dc = (*pv_mb_coeff_data); + + /* Color space conversion between SP_UV and SP_VU + * We always assume SP_UV for all the processing + * Hence to get proper stream output we need to swap U and V channels here + * + * For that there are two paths we need to look for + * One is the path to bitstream , these variables should have the proper input + * configured UV or VU + * For the other path the inverse transform variables should have ehat ever 0ordering the + * input had + */ + + if (u4_swap_uv) + { + pu1_nnz_dc += 5;/* Move to NNZ of V planve */ + pi2_res_sb += 4;/* Move to DC coff of V plane */ + + cntrl_pos = 14; /* Control bit for V plane */ + } + + for (plane = 0; plane < 2; plane++) + { + ps_mb_coeff_data = (*pv_mb_coeff_data); + + u4_nnz = *pu1_nnz_dc; + /* write number of non zero coefficients U/V */ + ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz; + + if (u4_nnz) + { + for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++) + { + i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]]; + if (i2_res_val) + { + /* write residue U/V */ + ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val; + u4_s_map |= mask; + } + mask <<= 1; + } + /* write significant coeff map U/V */ + ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16); + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt; + *u1_cbp_c = 1; + + (*pu4_cntrl) |= (1 << cntrl_pos); + } + else + { + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue; + } + + if (u4_swap_uv) + { + cntrl_pos++; /* Control bit for U plane */ + pu1_nnz_dc -= 5; /* Move to NNZ of U plane */ + pi2_res_sb -= 4; /* Move to DC coff of U plane */ + + } + else + { + cntrl_pos--; 
/* Control bit for U plane */ + pu1_nnz_dc += 5; /* 4 for AC NNZ and 1 for DC */ + pi2_res_sb += 4; /* Move to DC coff of V plane */ + } + } + + /********************************************************/ + /* pack ac coeff data for entropy coding */ + /********************************************************/ + + pu1_scan_order = gu1_chroma_scan_order; + ps_mb_coeff_data_ac = (*pv_mb_coeff_data); + + if (u4_swap_uv) + { + pi2_res_sb = pi2_res_mb + i4_res_strd * 5; /* Move to V plane ,ie 1dc row+ 4 ac row */ + cntrl_pos = 27; /* The control bits are to be added for V bloc ie 31-4 th bit */ + pu1_nnz_ac = pu1_nnz + 6;/*Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */ + } + else + { + pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to U plane ,ie 1dc row */ + cntrl_pos = 31; + pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc */ + } + + for (plane = 0; plane < 2; plane++) + { + pu4_mb_coeff_data = (*pv_mb_coeff_data); + + u4_coeff_cost = 0; + i4_run = -1; + + /* get the current cbp, so that it automatically + * gets reverted in case of zero ac values */ + u1_cbp_ac = *u1_cbp_c; + + for (b4 = 0; b4 < 4; b4++) + { + ps_mb_coeff_data = (*pv_mb_coeff_data); + + u4_nnz = *pu1_nnz_ac; + + /* + * We are scanning only ac coeffs, but the nnz is for the + * complete 4x4 block. 
Hence we have to discount the nnz contributed + * by the dc coefficient + */ + u4_nnz -= (pi2_res_sb[0]!=0); + + /* write number of non zero coefficients U/V */ + ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz; + + if (u4_nnz) + { + for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++) + { + i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]]; + + i4_run++; + + if (i2_res_val) + { + /* write residue U/V */ + ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val; + u4_s_map |= mask; + + if ( u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD) ) + { + /* compute coeff cost */ + if (i2_res_val == 1 || i2_res_val == -1) + { + if (i4_run < 6) + u4_coeff_cost += pu1_coeff_cost[i4_run]; + } + else + u4_coeff_cost += 9; + + i4_run = -1; + } + } + mask <<= 1; + } + + /* write significant coeff map U/V */ + ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16); + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt; + u1_cbp_ac = 2; + + (*pu4_cntrl) |= 1 << cntrl_pos; + } + else + { + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue; + } + + pu1_nnz_ac++; + pi2_res_sb += i4_res_strd; + cntrl_pos--; + } + + /* reset block */ + if (u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD)) + { + pu4_mb_coeff_data[0] = 0; + pu4_mb_coeff_data[1] = 0; + pu4_mb_coeff_data[2] = 0; + pu4_mb_coeff_data[3] = 0; + (*pv_mb_coeff_data) = pu4_mb_coeff_data + 4; + + /* Generate the control signal */ + /* Zero out the current plane's AC coefficients */ + (*pu4_cntrl) &= ((plane == u4_swap_uv) ? 
0x0FFFFFFF : 0xF0FFFFFF); + + /* Similarly do for the NNZ also */ + *(pu1_nnz_ac - 4) = 0; + *(pu1_nnz_ac - 3) = 0; + *(pu1_nnz_ac - 2) = 0; + *(pu1_nnz_ac - 1) = 0; + } + else + { + *u1_cbp_c = u1_cbp_ac; + } + + if (u4_swap_uv) + { + pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to V plane ,ie 1dc row+ 4 ac row + 1 dc row */ + cntrl_pos = 31; /* The control bits are to be added for V bloc ie 31-4 th bit */ + pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */ + + pu1_nnz_ac = pu1_nnz + 1; + } + else + pu1_nnz_ac = pu1_nnz + 6; /* Go to nnz of V plane */ + } + + /* restore the ptr basing on cbp */ + if (*u1_cbp_c == 0) + { + (*pv_mb_coeff_data) = ps_mb_coeff_data_dc; + } + else if (*u1_cbp_c == 1) + { + (*pv_mb_coeff_data) = ps_mb_coeff_data_ac; + } + + return ; +} + +/** +******************************************************************************* +* +* @brief performs luma core coding when intra mode is i16x16 +* +* @par Description: +* If the current mb is to be coded as intra of mb type i16x16, the mb is first +* predicted using one of i16x16 prediction filters, basing on the intra mode +* chosen. Then, error is computed between the input blk and the estimated blk. +* This error is transformed (hierarchical transform i.e., dct followed by hada- +* -mard), quantized. The quantized coefficients are packed in scan order for +* entropy coding. 
+* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_l +* coded block pattern luma +* +* @remarks none +* +******************************************************************************* +*/ + +UWORD8 ih264e_code_luma_intra_macroblock_16x16(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* pointer to ref macro block */ + UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma; + + /* pointer to src macro block */ + UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma; + + /* pointer to prediction macro block */ + UWORD8 *pu1_pred_mb = NULL; + + /* pointer to residual macro block */ + WORD16 *pi2_res_mb = ps_proc->pi2_res_buf; + + /* strides */ + WORD32 i4_src_strd = ps_proc->i4_src_strd; + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + WORD32 i4_res_strd = ps_proc->i4_res_strd; + + /* intra mode */ + UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode; + + /* coded block pattern */ + UWORD8 u1_cbp_l = 0; + + /* number of non zero coeffs*/ + UWORD32 au4_nnz[5]; + UWORD8 *pu1_nnz = (UWORD8 *)au4_nnz; + + /*Cntrol signal for itrans*/ + UWORD32 u4_cntrl; + + /* quantization parameters */ + quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0]; + + /* pointer to packed mb coeff data */ + void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data); + + /* init nnz */ + au4_nnz[0] = 0; + au4_nnz[1] = 0; + au4_nnz[2] = 0; + au4_nnz[3] = 0; + au4_nnz[4] = 0; + + if (u1_intra_mode == PLANE_I16x16) + { + pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16_plane; + } + else + { + pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16; + } + + /********************************************************/ + /* error estimation, */ + /* transform */ + /* quantization */ + /********************************************************/ + ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb, + pu1_pred_mb, pi2_res_mb, + i4_src_strd, i4_pred_strd, + i4_res_strd, + 
ps_qp_params->pu2_scale_mat, + ps_qp_params->pu2_thres_mat, + ps_qp_params->u1_qbits, + ps_qp_params->u4_dead_zone, + pu1_nnz, ENABLE_DC_TRANSFORM); + + /********************************************************/ + /* pack coeff data for entropy coding */ + /********************************************************/ + ih264e_pack_l_mb_i16(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l, + pu1_nnz, &u4_cntrl); + + /********************************************************/ + /* ierror estimation, */ + /* itransform */ + /* iquantization */ + /********************************************************/ + /* + *if refernce frame is not to be computed + *we only need the right and bottom border 4x4 blocks to predict next intra + *blocks, hence only compute them + */ + if (!ps_proc->u4_compute_recon) + { + u4_cntrl &= 0x111F8000; + } + + if (u4_cntrl) + { + ih264e_luma_16x16_idctrans_iquant_itrans_recon( + ps_codec, pi2_res_mb, pu1_pred_mb, pu1_ref_mb, + i4_res_strd, i4_pred_strd, i4_rec_strd, + ps_qp_params->pu2_iscale_mat, + ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div, + u4_cntrl, ENABLE_DC_TRANSFORM, + ps_proc->pv_scratch_buff); + } + else + { + ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb, i4_pred_strd, + i4_rec_strd, MB_SIZE, MB_SIZE, NULL, + 0); + } + + return (u1_cbp_l); +} + + +/** +******************************************************************************* +* +* @brief performs luma core coding when intra mode is i4x4 +* +* @par Description: +* If the current mb is to be coded as intra of mb type i4x4, the mb is first +* predicted using one of i4x4 prediction filters, basing on the intra mode +* chosen. Then, error is computed between the input blk and the estimated blk. +* This error is dct transformed and quantized. The quantized coefficients are +* packed in scan order for entropy coding. 
+* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_l +* coded block pattern luma +* +* @remarks +* The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order +* mentioned in h.264 specification +* +******************************************************************************* +*/ +UWORD8 ih264e_code_luma_intra_macroblock_4x4(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* pointer to ref macro block */ + UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma; + + /* pointer to src macro block */ + UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma; + + /* pointer to prediction macro block */ + UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb; + + /* pointer to residual macro block */ + WORD16 *pi2_res_mb = ps_proc->pi2_res_buf; + + /* strides */ + WORD32 i4_src_strd = ps_proc->i4_src_strd; + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + + /* pointer to neighbors: left, top, top-left */ + UWORD8 *pu1_mb_a; + UWORD8 *pu1_mb_b; + UWORD8 *pu1_mb_c; + UWORD8 *pu1_mb_d; + + /* intra mode */ + UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode; + + /* neighbor availability */ + WORD32 i4_ngbr_avbl; + + /* neighbor pels for intra prediction */ + UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels; + + /* coded block pattern */ + UWORD8 u1_cbp_l = 0; + + /* number of non zero coeffs*/ + UWORD8 u1_nnz; + + /* quantization parameters */ + quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0]; + + /* pointer to packed mb coeff data */ + void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data); + + /* pointer to packed mb coeff data */ + tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8; + + /* no of non zero coefficients in the current sub block */ + UWORD32 u4_nnz_cnt; + + /* significant coefficient map */ + UWORD32 u4_s_map; + + /* pointer to scanning matrix */ + const UWORD8 *pu1_scan_order = gu1_luma_scan_order; + + /*Dummy 
variable for 4x4 trans fucntion*/ + WORD16 i2_dc_dummy; + + /* temp var */ + UWORD32 i, b8, b4, u1_blk_x, u1_blk_y, u1_pix_x, u1_pix_y, coeff_cnt, mask; + + /* Process 16 4x4 lum sub-blocks of the MB in scan order */ + for (b8 = 0; b8 < 4; b8++) + { + u1_blk_x = GET_BLK_RASTER_POS_X(b8) << 3; + u1_blk_y = GET_BLK_RASTER_POS_Y(b8) << 3; + + /* if in case cbp for the 8x8 block is zero, send no residue */ + ps_mb_coeff_data_b8 = *pv_mb_coeff_data; + + for (b4 = 0; b4 < 4; b4++) + { + /* index of pel in MB */ + u1_pix_x = u1_blk_x + (GET_SUB_BLK_RASTER_POS_X(b4) << 2); + u1_pix_y = u1_blk_y + (GET_SUB_BLK_RASTER_POS_Y(b4) << 2); + + /* Initialize source and reference pointers */ + pu1_curr_mb = ps_proc->pu1_src_buf_luma + u1_pix_x + (u1_pix_y * i4_src_strd); + pu1_ref_mb = ps_proc->pu1_rec_buf_luma + u1_pix_x + (u1_pix_y * i4_rec_strd); + + /* pointer to left of ref macro block */ + pu1_mb_a = pu1_ref_mb - 1; + /* pointer to top of ref macro block */ + pu1_mb_b = pu1_ref_mb - i4_rec_strd; + /* pointer to topright of ref macro block */ + pu1_mb_c = pu1_mb_b + 4; + /* pointer to topleft macro block */ + pu1_mb_d = pu1_mb_b - 1; + + /* compute neighbor availability */ + i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4]; + + /* sub block intra mode */ + u1_intra_mode = ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4]; + + /********************************************************/ + /* gather prediction pels from neighbors for prediction */ + /********************************************************/ + /* left pels */ + if (i4_ngbr_avbl & LEFT_MB_AVAILABLE_MASK) + { + for (i = 0; i < 4; i++) + pu1_ngbr_pels_i4[4 - 1 - i] = pu1_mb_a[i * i4_rec_strd]; + } + else + { + memset(pu1_ngbr_pels_i4, 0, 4); + } + + /* top pels */ + if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK) + { + memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4); + } + else + { + memset(pu1_ngbr_pels_i4 + 5, 0, 4); + } + /* top left pels */ + if (i4_ngbr_avbl & TOP_LEFT_MB_AVAILABLE_MASK) + { + 
pu1_ngbr_pels_i4[4] = *pu1_mb_d; + } + else + { + pu1_ngbr_pels_i4[4] = 0; + } + /* top right pels */ + if (i4_ngbr_avbl & TOP_RIGHT_MB_AVAILABLE_MASK) + { + memcpy(pu1_ngbr_pels_i4+8+1,pu1_mb_c,4); + } + else if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK) + { + memset(pu1_ngbr_pels_i4+8+1,pu1_ngbr_pels_i4[8],4); + } + + /********************************************************/ + /* prediction */ + /********************************************************/ + (ps_codec->apf_intra_pred_4_l)[u1_intra_mode](pu1_ngbr_pels_i4, + pu1_pred_mb, 0, + i4_pred_strd, + i4_ngbr_avbl); + + /********************************************************/ + /* error estimation, */ + /* transform */ + /* quantization */ + /********************************************************/ + ps_codec->pf_resi_trans_quant_4x4(pu1_curr_mb, pu1_pred_mb, + pi2_res_mb, i4_src_strd, + i4_pred_strd, + ps_qp_params->pu2_scale_mat, + ps_qp_params->pu2_thres_mat, + ps_qp_params->u1_qbits, + ps_qp_params->u4_dead_zone, + &u1_nnz, &i2_dc_dummy); + + /********************************************************/ + /* pack coeff data for entropy coding */ + /********************************************************/ + ps_mb_coeff_data = *pv_mb_coeff_data; + + /* write number of non zero coefficients */ + ps_mb_coeff_data->i4_sig_map_nnz = u1_nnz; + + if (u1_nnz) + { + for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u1_nnz; coeff_cnt++) + { + if (pi2_res_mb[pu1_scan_order[coeff_cnt]]) + { + /* write residue */ + ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]]; + u4_s_map |= mask; + } + mask <<= 1; + } + /* write significant coeff map */ + ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16); + + /* update ptr to coeff data */ + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt; + + /* cbp */ + u1_cbp_l |= (1 << b8); + } + else + { + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue; + } + + 
/********************************************************/ + /* ierror estimation, */ + /* itransform */ + /* iquantization */ + /********************************************************/ + /* If the frame is not to be used for P frame reference or dumping recon + * we only will use the recon for only predicting intra Mbs + * This will need only right and bottom edge 4x4 blocks recon + * Hence we selectively enable them + */ + if (ps_proc->u4_compute_recon || (0xF888 & (1 << ((b8 << 2) + b4)))) + { + if (u1_nnz) + ps_codec->pf_iquant_itrans_recon_4x4( + pi2_res_mb, pu1_pred_mb, pu1_ref_mb, + /*No input stride,*/i4_pred_strd, + i4_rec_strd, ps_qp_params->pu2_iscale_mat, + ps_qp_params->pu2_weigh_mat, + ps_qp_params->u1_qp_div, + ps_proc->pv_scratch_buff, 0, 0); + else + ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb, + i4_pred_strd, i4_rec_strd, + BLK_SIZE, BLK_SIZE, NULL, + 0); + } + + } + + /* if the 8x8 block has no residue, nothing needs to be sent to entropy */ + if (!(u1_cbp_l & (1 << b8))) + { + *pv_mb_coeff_data = ps_mb_coeff_data_b8; + } + } + + return (u1_cbp_l); +} + +/** +******************************************************************************* +* +* @brief performs luma core coding when intra mode is i4x4 +* +* @par Description: +* If the current mb is to be coded as intra of mb type i4x4, the mb is first +* predicted using one of i4x4 prediction filters, basing on the intra mode +* chosen. Then, error is computed between the input blk and the estimated blk. +* This error is dct transformed and quantized. The quantized coefficients are +* packed in scan order for entropy coding. 
+* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_l +* coded block pattern luma +* +* @remarks +* The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order +* mentioned in h.264 specification +* +******************************************************************************* +*/ +UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* pointer to ref macro block */ + UWORD8 *pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4; + + /* pointer to recon buffer */ + UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma; + + /* pointer to residual macro block */ + WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4; + + /* strides */ + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + + /* number of non zero coeffs*/ + UWORD8 *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4; + + /* coded block pattern */ + UWORD8 u1_cbp_l = 0; + + /* pointer to packed mb coeff data */ + void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data); + + /* pointer to packed mb coeff data */ + tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8; + + /* no of non zero coefficients in the current sub block */ + UWORD32 u4_nnz_cnt; + + /* significant coefficient map */ + UWORD32 u4_s_map; + + /* pointer to scanning matrix */ + const UWORD8 *pu1_scan_order = gu1_luma_scan_order; + + /* temp var */ + UWORD32 b8, b4, coeff_cnt, mask; + + /* Process 16 4x4 lum sub-blocks of the MB in scan order */ + for (b8 = 0; b8 < 4; b8++) + { + /* if in case cbp for the 8x8 block is zero, send no residue */ + ps_mb_coeff_data_b8 = *pv_mb_coeff_data; + + for (b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE) + { + /********************************************************/ + /* pack coeff data for entropy coding */ + /********************************************************/ + ps_mb_coeff_data = *pv_mb_coeff_data; + + /* write number of non zero 
coefficients */ + ps_mb_coeff_data->i4_sig_map_nnz = *pu1_nnz; + + if (*pu1_nnz) + { + for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < *pu1_nnz; coeff_cnt++) + { + if (pi2_res_mb[pu1_scan_order[coeff_cnt]]) + { + /* write residue */ + ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]]; + u4_s_map |= mask; + } + mask <<= 1; + } + /* write significant coeff map */ + ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16); + + /* update ptr to coeff data */ + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt; + + /* cbp */ + u1_cbp_l |= (1 << b8); + } + else + { + (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue; + } + } + + /* if the 8x8 block has no residue, nothing needs to be sent to entropy */ + if (!(u1_cbp_l & (1 << b8))) + { + *pv_mb_coeff_data = ps_mb_coeff_data_b8; + } + } + + /* memcpy recon */ + ps_codec->pf_inter_pred_luma_copy(pu1_ref_mb_intra_4x4, pu1_rec_mb, MB_SIZE, i4_rec_strd, MB_SIZE, MB_SIZE, NULL, 0); + + return (u1_cbp_l); +} + + +/** +******************************************************************************* +* +* @brief performs chroma core coding for intra macro blocks +* +* @par Description: +* If the current MB is to be intra coded with mb type chroma I8x8, the MB is +* first predicted using intra 8x8 prediction filters. The predicted data is +* compared with the input for error and the error is transformed. The DC +* coefficients of each transformed sub blocks are further transformed using +* Hadamard transform. The resulting coefficients are quantized, packed and sent +* for entropy coding. 
+* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_c +* coded block pattern chroma +* +* @remarks +* The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan order +* mentioned in h.264 specification +* +******************************************************************************* +*/ +UWORD8 ih264e_code_chroma_intra_macroblock_8x8(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* pointer to ref macro block */ + UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma; + + /* pointer to src macro block */ + UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma; + + /* pointer to prediction macro block */ + UWORD8 *pu1_pred_mb = NULL; + + /* pointer to residual macro block */ + WORD16 *pi2_res_mb = ps_proc->pi2_res_buf; + + /* strides */ + WORD32 i4_src_strd = ps_proc->i4_src_strd; + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + WORD32 i4_res_strd = ps_proc->i4_res_strd; + + /* intra mode */ + UWORD8 u1_intra_mode = ps_proc->u1_c_i8_mode; + + /* coded block pattern */ + UWORD8 u1_cbp_c = 0; + + /* number of non zero coeffs*/ + UWORD8 au1_nnz[18] = {0}; + + /* quantization parameters */ + quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1]; + + /* Control signal for inverse transform */ + UWORD32 u4_cntrl; + + /* pointer to packed mb coeff data */ + void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data); + + /* See if we need to swap U and V plances for entropy */ + UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU; + + if (PLANE_CH_I8x8 == u1_intra_mode) + { + pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma_plane; + } + else + { + pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma; + } + + /********************************************************/ + /* error estimation, */ + /* transform */ + /* quantization */ + /********************************************************/ + 
ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb, + pu1_pred_mb, pi2_res_mb, + i4_src_strd, i4_pred_strd, + i4_res_strd, + ps_qp_params->pu2_scale_mat, + ps_qp_params->pu2_thres_mat, + ps_qp_params->u1_qbits, + ps_qp_params->u4_dead_zone, + au1_nnz); + + /********************************************************/ + /* pack coeff data for entropy coding */ + /********************************************************/ + ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c, + au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv); + + /********************************************************/ + /* ierror estimation, */ + /* itransform */ + /* iquantization */ + /********************************************************/ + ih264e_chroma_8x8_idctrans_iquant_itrans_recon(ps_codec, pi2_res_mb, + pu1_pred_mb, pu1_ref_mb, + i4_res_strd, i4_pred_strd, + i4_rec_strd, + ps_qp_params->pu2_iscale_mat, + ps_qp_params->pu2_weigh_mat, + ps_qp_params->u1_qp_div, + u4_cntrl, + ps_proc->pv_scratch_buff); + return (u1_cbp_c); +} + + +/** +******************************************************************************* +* +* @brief performs luma core coding when mode is inter +* +* @par Description: +* If the current mb is to be coded as inter the mb is predicted based on the +* sub mb partitions and corresponding motion vectors generated by ME. Then, +* error is computed between the input blk and the estimated blk. This error is +* transformed, quantized. 
The quantized coefficients are packed in scan order +* for entropy coding +* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_l +* coded block pattern luma +* +* @remarks none +* +******************************************************************************* +*/ + +UWORD8 ih264e_code_luma_inter_macroblock_16x16(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* pointer to ref macro block */ + UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma; + + /* pointer to src macro block */ + UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma; + + /* pointer to prediction macro block */ + UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb; + + /* pointer to residual macro block */ + WORD16 *pi2_res_mb = ps_proc->pi2_res_buf; + + /* strides */ + WORD32 i4_src_strd = ps_proc->i4_src_strd; + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + WORD32 i4_res_strd = ps_proc->i4_res_strd; + + /* coded block pattern */ + UWORD8 u1_cbp_l = 0; + + /*Control signal of itrans*/ + UWORD32 u4_cntrl; + + /* number of non zero coeffs*/ + UWORD8 *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz; + + /* quantization parameters */ + quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0]; + + /* pointer to packed mb coeff data */ + void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data); + + /* pseudo pred buffer */ + UWORD8 *pu1_pseudo_pred = pu1_pred_mb; + + /* pseudo pred buffer stride */ + WORD32 i4_pseudo_pred_strd = i4_pred_strd; + + /* init nnz */ + ps_proc->au4_nnz[0] = 0; + ps_proc->au4_nnz[1] = 0; + ps_proc->au4_nnz[2] = 0; + ps_proc->au4_nnz[3] = 0; + ps_proc->au4_nnz[4] = 0; + + /********************************************************/ + /* prediction */ + /********************************************************/ + ih264e_motion_comp_luma(ps_proc, &pu1_pseudo_pred, &i4_pseudo_pred_strd); + + /********************************************************/ + /* error estimation, */ + 
/* transform */ + /* quantization */ + /********************************************************/ + if (ps_proc->u4_min_sad_reached == 0 || ps_proc->u4_min_sad != 0) + { + ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb, + pu1_pseudo_pred, pi2_res_mb, + i4_src_strd, + i4_pseudo_pred_strd, + i4_res_strd, + ps_qp_params->pu2_scale_mat, + ps_qp_params->pu2_thres_mat, + ps_qp_params->u1_qbits, + ps_qp_params->u4_dead_zone, + pu1_nnz, + DISABLE_DC_TRANSFORM); + + /********************************************************/ + /* pack coeff data for entropy coding */ + /********************************************************/ + ih264e_pack_l_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l, + pu1_nnz, ps_codec->u4_thres_resi, &u4_cntrl); + } + else + { + u1_cbp_l = 0; + u4_cntrl = 0; + } + + /********************************************************/ + /* ierror estimation, */ + /* itransform */ + /* iquantization */ + /********************************************************/ + + /*If the frame is not to be used for P frame reference or dumping recon + * we only will use the reocn for only predicting intra Mbs + * THis will need only right and bottom edge 4x4 blocks recon + * Hence we selectively enable them using control signal(including DC) + */ + if (ps_proc->u4_compute_recon != 1) + { + u4_cntrl &= 0x111F0000; + } + + if (u4_cntrl) + { + ih264e_luma_16x16_idctrans_iquant_itrans_recon( + ps_codec, pi2_res_mb, pu1_pseudo_pred, pu1_rec_mb, + i4_res_strd, i4_pseudo_pred_strd, i4_rec_strd, + ps_qp_params->pu2_iscale_mat, + ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div, + u4_cntrl /*Cntrl*/, DISABLE_DC_TRANSFORM, + ps_proc->pv_scratch_buff); + } + else + { + ps_codec->pf_inter_pred_luma_copy(pu1_pseudo_pred, pu1_rec_mb, + i4_pseudo_pred_strd, i4_rec_strd, + MB_SIZE, MB_SIZE, NULL, 0); + } + + + return (u1_cbp_l); +} + +/** +******************************************************************************* +* +* @brief performs chroma core coding 
for inter macro blocks +* +* @par Description: +* If the current mb is to be coded as inter predicted mb,based on the sub mb partitions +* and corresponding motion vectors generated by ME ,prediction is done. +* Then, error is computed between the input blk and the estimated blk. +* This error is transformed , quantized. The quantized coefficients +* are packed in scan order for +* entropy coding. +* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_l +* coded block pattern chroma +* +* @remarks none +* +******************************************************************************* +*/ +UWORD8 ih264e_code_chroma_inter_macroblock_8x8(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* pointer to ref macro block */ + UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_chroma; + + /* pointer to src macro block */ + UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma; + + /* pointer to prediction macro block */ + UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb; + + /* pointer to residual macro block */ + WORD16 *pi2_res_mb = ps_proc->pi2_res_buf; + + /* strides */ + WORD32 i4_src_strd = ps_proc->i4_src_strd; + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + WORD32 i4_res_strd = ps_proc->i4_res_strd; + + /* coded block pattern */ + UWORD8 u1_cbp_c = 0; + + /*Control signal for inverse transform*/ + UWORD32 u4_cntrl; + + /* number of non zero coeffs*/ + UWORD8 au1_nnz[10] = {0}; + + /* quantization parameters */ + quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1]; + + /* pointer to packed mb coeff data */ + void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data); + + /*See if we need to swap U and V plances for entropy*/ + UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU; + + /********************************************************/ + /* prediction */ + /********************************************************/ + 
ih264e_motion_comp_chroma(ps_proc); + + /********************************************************/ + /* error estimation, */ + /* transform */ + /* quantization */ + /********************************************************/ + ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb, + pu1_pred_mb, pi2_res_mb, + i4_src_strd, i4_pred_strd, + i4_res_strd, + ps_qp_params->pu2_scale_mat, + ps_qp_params->pu2_thres_mat, + ps_qp_params->u1_qbits, + ps_qp_params->u4_dead_zone, + au1_nnz); + + /********************************************************/ + /* pack coeff data for entropy coding */ + /********************************************************/ + ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c, + au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv); + + /********************************************************/ + /* ierror estimation, */ + /* itransform */ + /* iquantization */ + /********************************************************/ + + /* If the frame is not to be used for P frame reference or dumping recon + * we only will use the reocn for only predicting intra Mbs + * THis will need only right and bottom edge 4x4 blocks recon + * Hence we selectively enable them using control signal(including DC) + */ + if (!ps_proc->u4_compute_recon) + { + u4_cntrl &= 0x7700C000; + } + + if (u4_cntrl) + { + ih264e_chroma_8x8_idctrans_iquant_itrans_recon( + ps_codec, pi2_res_mb, pu1_pred_mb, pu1_rec_mb, + i4_res_strd, i4_pred_strd, i4_rec_strd, + ps_qp_params->pu2_iscale_mat, + ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div, + u4_cntrl, ps_proc->pv_scratch_buff); + } + else + { + ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_rec_mb, i4_pred_strd, + i4_rec_strd, MB_SIZE >> 1, MB_SIZE, + NULL, 0); + } + + return (u1_cbp_c); +} diff --git a/encoder/ih264e_core_coding.h b/encoder/ih264e_core_coding.h new file mode 100755 index 0000000..1237d25 --- /dev/null +++ b/encoder/ih264e_core_coding.h @@ -0,0 +1,653 @@ 
+/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_core_coding.h +* +* @brief +* This file contains extern declarations of core coding routines +* +* @author +* ittiam +* +* @remarks +* none +****************************************************************************** +*/ + +#ifndef IH264E_CORE_CODING_H_ +#define IH264E_CORE_CODING_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief Enable/Disable Hadamard transform of DC Coeff's +****************************************************************************** + */ +#define DISABLE_DC_TRANSFORM 0 +#define ENABLE_DC_TRANSFORM 1 + +/** +******************************************************************************* + * @brief bit masks for DC and AC control flags +******************************************************************************* + */ + +#define 
DC_COEFF_CNT_LUMA_MB 16 +#define NUM_4X4_BLKS_LUMA_MB_ROW 4 +#define NUM_LUMA4x4_BLOCKS_IN_MB 16 +#define NUM_CHROMA4x4_BLOCKS_IN_MB 8 + +#define SIZE_4X4_BLK_HRZ TRANS_SIZE_4 +#define SIZE_4X4_BLK_VERT TRANS_SIZE_4 + +#define CNTRL_FLAG_DC_MASK_LUMA 0x0000FFFF +#define CNTRL_FLAG_AC_MASK_LUMA 0xFFFF0000 + +#define CNTRL_FLAG_AC_MASK_CHROMA_U 0xF0000000 +#define CNTRL_FLAG_DC_MASK_CHROMA_U 0x0000F000 + +#define CNTRL_FLAG_AC_MASK_CHROMA_V 0x0F000000 +#define CNTRL_FLAG_DC_MASK_CHROMA_V 0x00000F00 + +#define CNTRL_FLAG_AC_MASK_CHROMA ( CNTRL_FLAG_AC_MASK_CHROMA_U | CNTRL_FLAG_AC_MASK_CHROMA_V ) +#define CNTRL_FLAG_DC_MASK_CHROMA ( CNTRL_FLAG_DC_MASK_CHROMA_U | CNTRL_FLAG_DC_MASK_CHROMA_V ) + +#define CNTRL_FLAG_DCBLK_MASK_CHROMA 0x0000C000 + +/** +******************************************************************************* + * @brief macros for transforms +******************************************************************************* + */ +#define DEQUEUE_BLKID_FROM_CONTROL( u4_cntrl, blk_lin_id) \ +{ \ + blk_lin_id = CLZ(u4_cntrl); \ + u4_cntrl &= (0x7FFFFFFF >> blk_lin_id); \ +}; + +#define IND2SUB_LUMA_MB(u4_blk_id,i4_offset_x,i4_offset_y) \ +{ \ + i4_offset_x = (u4_blk_id % 4) << 2; \ + i4_offset_y = (u4_blk_id / 4) << 2; \ +} + +#define IND2SUB_CHROMA_MB(u4_blk_id,i4_offset_x,i4_offset_y) \ +{ \ + i4_offset_x = ((u4_blk_id & 0x1 ) << 3) + (u4_blk_id > 3); \ + i4_offset_y = (u4_blk_id & 0x2) << 1; \ +} + + +/*****************************************************************************/ +/* Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* This function performs does the DCT transform then Hadamard transform +* and quantization for a macroblock when the mb mode is intra 16x16 mode +* +* @par Description: +* First cf4 is done on all 16 4x4 blocks of the 16x16 input block. 
+* Then hadamard transform is done on the DC coefficients +* Quantization is then performed on the 16x16 block, 4x4 wise +* +* @param[in] pu1_src +* Pointer to source sub-block +* +* @param[in] pu1_pred +* Pointer to prediction sub-block +* +* @param[in] pi2_out +* Pointer to residual sub-block +* The output will be in linear format +* The first 16 continuous locations will contain the values of Dc block +* After DC block and a stride 1st AC block will follow +* After one more stride next AC block will follow +* The blocks will be in raster scan order +* +* @param[in] src_strd +* Source stride +* +* @param[in] pred_strd +* Prediction stride +* +* @param[in] dst_strd +* Destination stride +* +* @param[in] pu2_scale_matrix +* The quantization matrix for 4x4 transform +* +* @param[in] pu2_threshold_matrix +* Threshold matrix +* +* @param[in] u4_qbits +* 15+QP/6 +* +* @param[in] u4_round_factor +* Round factor for quant +* +* @param[out] pu1_nnz +* Memory to store the non-zeros after transform +* The first byte will be the nnz of DC block +* From the next byte the AC nnzs will be stored in raster scan order +* +* @param u4_dc_flag +* Signals if Dc transform is to be done or not +* 1 -> Dc transform will be done +* 0 -> Dc transform will not be done +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_luma_16x16_resi_trans_dctrans_quant( + codec_t *ps_codec, UWORD8 *pu1_src, UWORD8 *pu1_pred, + WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd, + WORD32 dst_strd, const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, + UWORD32 u4_round_factor, UWORD8 *pu1_nnz, UWORD32 u4_dc_flag); + +/** +******************************************************************************* +* +* @brief +* This function performs the intra 16x16 inverse transform process for H264 +* it includes inverse Dc transform, inverse quant and then inverse transform +* +* @par Description: +* +* @param[in] 
pi2_src +* Input data, 16x16 size +* First 16 mem locations will have the Dc coffs in rater scan order in linear fashion +* after a stride 1st AC clock will be present again in raster can order +* Then each AC block of the 16x16 block will follow in raster scan order +* +* @param[in] pu1_pred +* The predicted data, 16x16 size +* Block by block form +* +* @param[in] pu1_out +* Output 16x16 +* In block by block form +* +* @param[in] src_strd +* Source stride +* +* @param[in] pred_strd +* input stride for prediction buffer +* +* @param[in] out_strd +* input stride for output buffer +* +* @param[in] pu2_iscale_mat +* Inverse quantization matrix for 4x4 transform +* +* @param[in] pu2_weigh_mat +* weight matrix of 4x4 transform +* +* @param[in] qp_div +* QP/6 +* +* @param[in] pi4_tmp +* Input temporary buffer +* needs to be at least 20 in size +* +* @param[in] pu4_cntrl +* Controls the transform path +* total Last 17 bits are used +* the 16th th bit will correspond to DC block +* and 32-17 will correspond to the ac blocks in raster scan order +* bit equaling zero indicates that the entire 4x4 block is zero for DC +* For AC blocks a bit equaling zero will mean that all 15 AC coffs of the block is nonzero +* +* @param[in] pi4_tmp +* Input temporary buffer +* needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 size +* +* @returns +* none +* +* @remarks +* The all zero case must be taken care outside +* +******************************************************************************* +*/ +void ih264e_luma_16x16_idctrans_iquant_itrans_recon( + codec_t *ps_codec, WORD16 *pi2_src, UWORD8 *pu1_pred, + UWORD8 *pu1_out, WORD32 src_strd, WORD32 pred_strd, + WORD32 out_strd, const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, UWORD32 qp_div, UWORD32 u4_cntrl, + UWORD32 u4_dc_trans_flag, WORD32 *pi4_tmp); + +/** +******************************************************************************* +* +* @brief +* This function performs does the DCT transform then 
Hadamard transform +* and quantization for a chroma macroblock +* +* @par Description: +* First cf4 is done on all 16 4x4 blocks of the 8x8input block +* Then hadamard transform is done on the DC coefficients +* Quantization is then performed on the 8x8 block, 4x4 wise +* +* @param[in] pu1_src +* Pointer to source sub-block +* The input is in interleaved format for two chroma planes +* +* @param[in] pu1_pred +* Pointer to prediction sub-block +* Prediction is in inter leaved format +* +* @param[in] pi2_out +* Pointer to residual sub-block +* The output will be in linear format +* The first 4 continuous locations will contain the values of DC block for U +* and then next 4 will contain for V. +* After DC block and a stride 1st AC block of U plane will follow +* After one more stride next AC block of V plane will follow +* The blocks will be in raster scan order +* +* After all the AC blocks of U plane AC blocks of V plane will follow in exact +* same way +* +* @param[in] src_strd +* Source stride +* +* @param[in] pred_strd +* Prediction stride +* +* @param[in] dst_strd +* Destination stride +* +* @param[in] pu2_scale_matrix +* The quantization matrix for 4x4 transform +* +* @param[in] pu2_threshold_matrix +* Threshold matrix +* +* @param[in] u4_qbits +* 15+QP/6 +* +* @param[in] u4_round_factor +* Round factor for quant +* +* @param[out] pu1_nnz +* Memory to store the non-zeros after transform +* The first byte will be the nnz od DC block for U plane +* From the next byte the AC nnzs will be storerd in raster scan order +* The fifth byte will be nnz of Dc block of V plane +* Then Ac blocks will follow +* +* @param u4_dc_flag +* Signals if Dc transform is to be done or not +* 1 -> Dc transform will be done +* 0 -> Dc transform will not be done +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_chroma_8x8_resi_trans_dctrans_quant( + codec_t *ps_codec, UWORD8 *pu1_src, UWORD8 *pu1_pred, + WORD16 *pi2_out, 
WORD32 src_strd, WORD32 pred_strd, + WORD32 out_strd, const UWORD16 *pu2_scale_matrix, + const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits, + UWORD32 u4_round_factor, UWORD8 *pu1_nnz_c); + +/** +******************************************************************************* +* @brief +* This function performs the inverse transform with process for chroma MB of H264 +* +* @par Description: +* Does inverse DC transform ,inverse quantization inverse transform +* +* @param[in] pi2_src +* Input data, 16x16 size +* The input is in the form of, first 4 locations will contain DC coeffs of +* U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane +* in raster scan order will follow, each block as linear array in raster scan order. +* After a stride next AC block will follow. After all AC blocks of U plane +* V plane AC blocks will follow in exact same order. +* +* @param[in] pu1_pred +* The predicted data, 8x16 size, U and V interleaved +* +* @param[in] pu1_out +* Output 8x16, U and V interleaved +* +* @param[in] src_strd +* Source stride +* +* @param[in] pred_strd +* input stride for prediction buffer +* +* @param[in] out_strd +* input stride for output buffer +* +* @param[in] pu2_iscale_mat +* Inverse quantization martix for 4x4 transform +* +* @param[in] pu2_weigh_mat +* weight matrix of 4x4 transform +* +* @param[in] qp_div +* QP/6 +* +* @param[in] pi4_tmp +* Input temporary buffer +* needs to be at least COFF_CNT_SUB_BLK_4x4 + Number of Dc cofss for chroma * number of planes +* in size +* +* @param[in] pu4_cntrl +* Controls the transform path +* the 15 th bit will correspond to DC block of U plane , 14th will indicate the V plane Dc block +* 32-28 bits will indicate AC blocks of U plane in raster scan order +* 27-23 bits will indicate AC blocks of V plane in rater scan order +* The bit 1 implies that there is at least one non zero coff in a block +* +* @returns +* none +* +* @remarks 
+******************************************************************************* +*/ +void ih264e_chroma_8x8_idctrans_iquant_itrans_recon( + codec_t *ps_codec, WORD16 *pi2_src, UWORD8 *pu1_pred, + UWORD8 *pu1_out, WORD32 src_strd, WORD32 pred_strd, + WORD32 out_strd, const UWORD16 *pu2_iscale_mat, + const UWORD16 *pu2_weigh_mat, UWORD32 qp_div, UWORD32 u4_cntrl, + WORD32 *pi4_tmp); + +/** +****************************************************************************** +* +* @brief This function packs residue of an i16x16 luma mb for entropy coding +* +* @par Description +* An i16 macro block contains two classes of units, dc 4x4 block and +* 4x4 ac blocks. while packing the mb, the dc block is sent first, and +* the 16 ac blocks are sent next in scan order. Each and every block is +* represented by 3 parameters (nnz, significant coefficient map and the +* residue coefficients itself). If a 4x4 unit does not have any coefficients +* then only nnz is sent. Inside a 4x4 block the individual coefficients are +* sent in scan order. +* +* The first byte of each block will be nnz of the block, if it is non zero, +* a 2 byte significance map is sent. This is followed by nonzero coefficients. +* This is repeated for 1 dc + 16 ac blocks. 
+* +* @param[in] pi2_res_mb +* pointer to residue mb +* +* @param[in, out] pv_mb_coeff_data +* buffer pointing to packed residue coefficients +* +* @param[in] u4_res_strd +* residual block stride +* +* @param[out] u1_cbp_l +* coded block pattern luma +* +* @param[in] pu1_nnz +* number of non zero coefficients in each 4x4 unit +* +* @param[out] +* Control signal for inverse transform of 16x16 blocks +* +* @return none +* +* @ remarks +* +****************************************************************************** +*/ +void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb, void **pv_mb_coeff_data, + WORD32 i4_res_strd, UWORD8 *u1_cbp_l, UWORD8 *pu1_nnz, + UWORD32 *pu4_cntrl); + +/** +****************************************************************************** +* +* @brief This function packs residue of an i8x8 chroma mb for entropy coding +* +* @par Description +* An i8 chroma macro block contains two classes of units, dc 2x2 block and +* 4x4 ac blocks. while packing the mb, the dc block is sent first, and +* the 4 ac blocks are sent next in scan order. Each and every block is +* represented by 3 parameters (nnz, significant coefficient map and the +* residue coefficients itself). If a 4x4 unit does not have any coefficients +* then only nnz is sent. Inside a 4x4 block the individual coefficients are +* sent in scan order. +* +* The first byte of each block will be nnz of the block, if it is non zero, +* a 2 byte significance map is sent. This is followed by nonzero coefficients. +* This is repeated for 1 dc + 4 ac blocks. 
+* +* @param[in] pi2_res_mb +* pointer to residue mb +* +* @param[in, out] pv_mb_coeff_data +* buffer pointing to packed residue coefficients +* +* @param[in] i4_res_strd +* residual block stride +* +* @param[out] u1_cbp_c +* coded block pattern chroma +* +* @param[in] pu1_nnz +* number of non zero coefficients in each 4x4 unit +* +* @param[out] pu4_cntrl +* Control signal for inverse transform +* +* @param[in] u4_swap_uv +* Swaps the order of U and V planes in entropy bitstream +* +* @return none +* +* @remarks +* +****************************************************************************** +*/ +void ih264e_pack_c_mb(WORD16 *pi2_res_mb, void **pv_mb_coeff_data, + WORD32 i4_res_strd, UWORD8 *u1_cbp_c, UWORD8 *pu1_nnz, + UWORD32 u4_kill_coffs_flag, UWORD32 *pu4_cntrl, + UWORD32 u4_swap_uv); + +/** +******************************************************************************* +* +* @brief performs luma core coding when intra mode is i16x16 +* +* @par Description: +* If the current mb is to be coded as intra of mb type i16x16, the mb is first +* predicted using one of i16x16 prediction filters, basing on the intra mode +* chosen. Then, error is computed between the input blk and the estimated blk. +* This error is transformed (hierarchical transform i.e., dct followed by hada- +* -mard), quantized. The quantized coefficients are packed in scan order for +* entropy coding. 
+* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_l +* coded block pattern luma +* +* @remarks none +* +******************************************************************************* +*/ +UWORD8 ih264e_code_luma_intra_macroblock_16x16 + ( + process_ctxt_t *ps_proc + ); + +/** +******************************************************************************* +* +* @brief performs luma core coding when intra mode is i4x4 +* +* @par Description: +* If the current mb is to be coded as intra of mb type i4x4, the mb is first +* predicted using one of i4x4 prediction filters, basing on the intra mode +* chosen. Then, error is computed between the input blk and the estimated blk. +* This error is dct transformed and quantized. The quantized coefficients are +* packed in scan order for entropy coding. +* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_l +* coded block pattern luma +* +* @remarks +* The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order +* mentioned in h.264 specification +* +******************************************************************************* +*/ +UWORD8 ih264e_code_luma_intra_macroblock_4x4 + ( + process_ctxt_t *ps_proc + ); + +/** +******************************************************************************* +* +* @brief performs luma core coding when intra mode is i4x4 +* +* @par Description: +* If the current mb is to be coded as intra of mb type i4x4, the mb is first +* predicted using one of i4x4 prediction filters, basing on the intra mode +* chosen. Then, error is computed between the input blk and the estimated blk. +* This error is dct transformed and quantized. The quantized coefficients are +* packed in scan order for entropy coding. 
+* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_l +* coded block pattern luma +* +* @remarks +* The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order +* mentioned in h.264 specification +* +******************************************************************************* +*/ +UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on + ( + process_ctxt_t *ps_proc + ); + +/** +******************************************************************************* +* +* @brief performs chroma core coding for intra macro blocks +* +* @par Description: +* If the current MB is to be intra coded with mb type chroma I8x8, the MB is +* first predicted using intra 8x8 prediction filters. The predicted data is +* compared with the input for error and the error is transformed. The DC +* coefficients of each transformed sub blocks are further transformed using +* Hadamard transform. The resulting coefficients are quantized, packed and sent +* for entropy coding. +* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_c +* coded block pattern chroma +* +* @remarks +* The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan order +* mentioned in h.264 specification +* +******************************************************************************* +*/ +UWORD8 ih264e_code_chroma_intra_macroblock_8x8 + ( + process_ctxt_t *ps_proc + ); + +/** +******************************************************************************* +* @brief performs luma core coding when mode is inter +* +* @par Description: +* If the current mb is to be coded as inter predicted mb,based on the sub mb +* partitions and corresponding motion vectors generated by ME, prediction is done. +* Then, error is computed between the input blk and the estimated blk. +* This error is transformed ( dct and with out hadamard), quantized. 
The +* quantized coefficients are packed in scan order for entropy coding. +* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_l +* coded block pattern luma +* +* @remarks none +* +******************************************************************************* +*/ +UWORD8 ih264e_code_luma_inter_macroblock_16x16 + ( + process_ctxt_t *ps_proc + ); + +/** +******************************************************************************* +* @brief performs chroma core coding for inter macro blocks +* +* @par Description: +* If the current mb is to be coded as inter predicted mb, based on the sub mb +* partitions and corresponding motion vectors generated by ME, prediction is done. +* Then, error is computed between the input blk and the estimated blk. +* This error is transformed, quantized. The quantized coefficients +* are packed in scan order for entropy coding. +* +* @param[in] ps_proc_ctxt +* pointer to the current macro block context +* +* @returns u1_cbp_c +* coded block pattern chroma +* +* @remarks none +* +******************************************************************************* +*/ +UWORD8 ih264e_code_chroma_inter_macroblock_8x8 + ( + process_ctxt_t *ps_proc + ); + +#endif /* IH264E_CORE_CODING_H_ */ diff --git a/encoder/ih264e_deblk.c b/encoder/ih264e_deblk.c new file mode 100755 index 0000000..8a11bdb --- /dev/null +++ b/encoder/ih264e_deblk.c @@ -0,0 +1,854 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * ih264e_deblk.c + * + * @brief + * This file contains functions that are associated with deblocking + * + * @author + * ittiam + * + * @par List of Functions: + * - ih264e_fill_bs_1mv_1ref_non_mbaff + * - ih264e_calculate_csbp + * - ih264e_compute_bs + * - ih264e_filter_top_edge + * - ih264e_filter_left_edge + * - ih264e_deblock_mb + * + * @remarks + * None + * + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <string.h> +#include <assert.h> + +/* User include files */ +#include "ih264e_config.h" +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_macros.h" +#include "ih264_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" 
+#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264_trans_data.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_deblk_tables.h" +#include "ih264e_deblk.h" + + +/*****************************************************************************/ +/* Extern global definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* @brief BS Table Lookup +* input : +* output : +* @remarks none +****************************************************************************** +*/ +static const UWORD32 gu4_bs_table[][16] = +{ + { + 0x00000000, 0x02000000, 0x00020000, 0x02020000, + 0x00000200, 0x02000200, 0x00020200, 0x02020200, + 0x00000002, 0x02000002, 0x00020002, 0x02020002, + 0x00000202, 0x02000202, 0x00020202, 0x02020202 + }, + { + 0x01010101, 0x02010101, 0x01020101, 0x02020101, + 0x01010201, 0x02010201, 0x01020201, 0x02020201, + 0x01010102, 0x02010102, 0x01020102, 0x02020102, + 0x01010202, 0x02010202, 0x01020202, 0x02020202 + } +}; + +/** +****************************************************************************** +* @brief Transpose Matrix used in BS +* input : +* output : +* @remarks none +****************************************************************************** +*/ +static const UWORD16 ih264e_gu2_4x4_v2h_reorder[16] = +{ + 0x0000, 0x0001, 0x0010, 0x0011, + 0x0100, 0x0101, 0x0110, 0x0111, + 0x1000, 0x1001, 0x1010, 0x1011, + 0x1100, 0x1101, 0x1110, 0x1111 +}; + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief Fill BS value for all the edges of an mb +* +* @par Description: +* Fill BS value for all the 
edges of an mb +* +* @param[in] pu4_horz_bs +* Base pointer of horizontal BS table +* +* @param[in] pu4_vert_bs +* Base pointer of vertical BS table +* +* @param[in] u4_left_mb_csbp +* coded sub block pattern of left mb +* +* @param[in] u4_left_mb_csbp +* coded sub block pattern of top mb +* +* @param[in] ps_leftMvPred +* MV of left mb +* +* @param[in] ps_topMvPred +* MV of top mb +* +* @param[in] ps_curMvPred +* MV of curr mb +* +* @param[in] u1_left_intra +* is left intra +* +* @param[in] u1_top_intra +* is top intra +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +static void ih264e_fill_bs_1mv_1ref_non_mbaff(UWORD32 *pu4_horz_bs, + UWORD32 *pu4_vert_bs, + UWORD32 u4_left_mb_csbp, + UWORD32 u4_top_mb_csbp, + UWORD32 u4_cur_mb_csbp, + mv_t *ps_leftMvPred, + mv_t *ps_topMvPred, + mv_t *ps_curMvPred, + UWORD8 u1_left_intra, + UWORD8 u1_top_intra) +{ + /* motion vectors of blks p & q */ + WORD16 i16_qMv0, i16_qMv1, i16_pMv0, i16_pMv1; + + /* temp var */ + UWORD32 u4_lft_flag, u4_top_flag; + const UWORD32 *bs_map; + UWORD32 u4_reordered_vert_bs_enc, u4_temp; + + /* Coded Pattern for Horizontal Edge */ + /*-----------------------------------------------------------------------*/ + /*u4_nbr_horz_csbp=11C|10C|9C|8C|7C|6C|5C|4C|3C|2C|1C|0C|15T|14T|13T|12T */ + /*-----------------------------------------------------------------------*/ + UWORD32 u4_nbr_horz_csbp = (u4_cur_mb_csbp << 4) | (u4_top_mb_csbp >> 12); + UWORD32 u4_horz_bs_enc = u4_cur_mb_csbp | u4_nbr_horz_csbp; + + /* Coded Pattern for Vertical Edge */ + /*-----------------------------------------------------------------------*/ + /*u4_left_mb_masked_csbp = 15L|0|0|0|11L|0|0|0|7L|0|0|0|3L|0|0|0 */ + /*-----------------------------------------------------------------------*/ + UWORD32 u4_left_mb_masked_csbp = u4_left_mb_csbp & CSBP_RIGHT_BLOCK_MASK; + + /*-----------------------------------------------------------------------*/ + 
/*u4_cur_mb_masked_csbp =14C|13C|12C|x|10C|9C|8C|x|6C|5C|4C|x|2C|1C|0C|x */ + /*-----------------------------------------------------------------------*/ + UWORD32 u4_cur_mb_masked_csbp =(u4_cur_mb_csbp<<1)&(~CSBP_LEFT_BLOCK_MASK); + + /*-----------------------------------------------------------------------*/ + /*u4_nbr_vert_csbp=14C|13C|12C|15L|10C|9C|8C|11L|6C|5C|4C|7L|2C|1C|0C|3L */ + /*-----------------------------------------------------------------------*/ + UWORD32 u4_nbr_vert_csbp = (u4_cur_mb_masked_csbp) | (u4_left_mb_masked_csbp >> 3); + UWORD32 u4_vert_bs_enc = u4_cur_mb_csbp | u4_nbr_vert_csbp; + + /* BS Calculation for MB Boundary Edges */ + + /* BS calculation for 1 2 3 horizontal boundary */ + bs_map = gu4_bs_table[0]; + pu4_horz_bs[1] = bs_map[(u4_horz_bs_enc >> 4) & 0xF]; + pu4_horz_bs[2] = bs_map[(u4_horz_bs_enc >> 8) & 0xF]; + pu4_horz_bs[3] = bs_map[(u4_horz_bs_enc >> 12) & 0xF]; + + /* BS calculation for 5 6 7 vertical boundary */ + /* Do 4x4 tranpose of u4_vert_bs_enc by using look up table for reorder */ + u4_reordered_vert_bs_enc = ih264e_gu2_4x4_v2h_reorder[u4_vert_bs_enc & 0xF]; + + u4_temp = ih264e_gu2_4x4_v2h_reorder[(u4_vert_bs_enc >> 4) & 0xF]; + u4_reordered_vert_bs_enc |= (u4_temp << 1); + + u4_temp = ih264e_gu2_4x4_v2h_reorder[(u4_vert_bs_enc >> 8) & 0xF]; + u4_reordered_vert_bs_enc |= (u4_temp << 2); + + u4_temp = ih264e_gu2_4x4_v2h_reorder[(u4_vert_bs_enc >> 12) & 0xF]; + u4_reordered_vert_bs_enc |= (u4_temp << 3); + + pu4_vert_bs[1] = bs_map[(u4_reordered_vert_bs_enc >> 4) & 0xF]; + pu4_vert_bs[2] = bs_map[(u4_reordered_vert_bs_enc >> 8) & 0xF]; + pu4_vert_bs[3] = bs_map[(u4_reordered_vert_bs_enc >> 12) & 0xF]; + + + /* BS Calculation for MB Boundary Edges */ + i16_qMv0 = ps_curMvPred->i2_mvx; + i16_qMv1 = ps_curMvPred->i2_mvy; + + if (u1_top_intra) + { + pu4_horz_bs[0] = 0x04040404; + } + else + { + i16_pMv0 = ps_topMvPred->i2_mvx; + i16_pMv1 = ps_topMvPred->i2_mvy; + + u4_top_flag = (ABS((i16_pMv0 - i16_qMv0)) >= 4 ) | + 
(ABS((i16_pMv1 - i16_qMv1)) >= 4); + + bs_map = gu4_bs_table[!!u4_top_flag]; + pu4_horz_bs[0] = bs_map[u4_horz_bs_enc & 0xF]; + } + + if (u1_left_intra) + { + pu4_vert_bs[0] = 0x04040404; + } + else + { + i16_pMv0 = ps_leftMvPred->i2_mvx; + i16_pMv1 = ps_leftMvPred->i2_mvy; + + + u4_lft_flag = (ABS((i16_pMv0 - i16_qMv0)) >= 4 ) | + (ABS((i16_pMv1 - i16_qMv1)) >= 4); + + bs_map = gu4_bs_table[!!u4_lft_flag]; + pu4_vert_bs[0] = bs_map[u4_reordered_vert_bs_enc & 0xF]; + } +} + +/** +******************************************************************************* +* +* @brief calculate coded subblock pattern from nnz +* +* @par Description: +* calculate coded subblock pattern from nnz +* +* @param[in] ps_proc +* process context +* +* @returns csbp +* +* @remarks none +* +******************************************************************************* +*/ +static UWORD32 ih264e_calculate_csbp(process_ctxt_t *ps_proc) +{ + /* number of non zeros for each tx blk */ + UWORD8 *pu1_curr_nnz = (UWORD8 *)ps_proc->au4_nnz; + + /* csbp */ + UWORD32 u4_csbp = 0; + + /* temp var */ + WORD32 i4_i; + + pu1_curr_nnz += 1; + + /* Creating Subblock pattern for current MB */ + /* 15C|14C|13C|12C|11C|10C|9C|8C|7C|6C|5C|4C|3C|2C|1C|0C */ + for (i4_i = 0; i4_i < 16; i4_i++ ) + { + u4_csbp |= ((!!*(pu1_curr_nnz + i4_i))<< i4_i); + } + + return u4_csbp; +} + +/** +******************************************************************************* +* +* @brief This function computes blocking strength for an mb +* +* @par Description: +* This function computes blocking strength for an mb +* +* @param[in] ps_proc +* process context +* +* @returns none +* +* @remarks In this module it is assumed that their is only single reference +* frame and is always the most recently used anchor frame +* +******************************************************************************* +*/ +void ih264e_compute_bs(process_ctxt_t * ps_proc) +{ + /* deblk bs context */ + bs_ctxt_t *ps_bs = 
&(ps_proc->s_deblk_ctxt.s_bs_ctxt); + + /* vertical blocking strength */ + UWORD32 *pu4_pic_vert_bs; + + /* horizontal blocking strength */ + UWORD32 *pu4_pic_horz_bs; + + /* mb indices */ + WORD32 i4_mb_x, i4_mb_y; + + /* is intra */ + WORD32 i4_intra; + + /* temp var */ + WORD32 i4_wd_mbs = ps_proc->i4_wd_mbs; + + /* init indices */ + i4_mb_x = ps_bs->i4_mb_x; + i4_mb_y = ps_bs->i4_mb_y; + + /* init pointers */ + pu4_pic_vert_bs = ps_bs->pu4_pic_vert_bs + ((i4_mb_y * i4_wd_mbs) + i4_mb_x) * 4; + pu4_pic_horz_bs = ps_bs->pu4_pic_horz_bs + ((i4_mb_y * i4_wd_mbs) + i4_mb_x) * 4; + + /* is intra? */ + i4_intra = ps_proc->u4_is_intra; + + /* compute blocking strength */ + if (i4_intra) + { + pu4_pic_vert_bs[0] = 0x04040404; + pu4_pic_vert_bs[1] = pu4_pic_vert_bs[2] = pu4_pic_vert_bs[3] = 0x03030303; + + pu4_pic_horz_bs[0] = 0x04040404; + pu4_pic_horz_bs[1] = pu4_pic_horz_bs[2] = pu4_pic_horz_bs[3] = 0x03030303; + } + else + { + /* left mb syntax info */ + mb_info_t *ps_left_mb_syntax_ele = &ps_proc->s_left_mb_syntax_ele; + + /* top mb syntax info */ + mb_info_t *ps_top_mb_syntax_ele = ps_proc->ps_top_row_mb_syntax_ele + i4_mb_x; + + /* top row motion vector info */ + enc_pu_t *ps_top_row_pu = ps_proc->ps_top_row_pu + i4_mb_x; + + /* csbp for curr mb */ + ps_proc->u4_csbp = ih264e_calculate_csbp(ps_proc); + + /* csbp for ngbrs */ + if (i4_mb_x == 0) + { + ps_left_mb_syntax_ele->u4_csbp = 0; + ps_left_mb_syntax_ele->u2_is_intra = 0; + ps_proc->s_left_mb_pu.s_l0_mv = ps_proc->ps_pu->s_l0_mv; + } + if (i4_mb_y == 0) + { + ps_top_mb_syntax_ele->u4_csbp = 0; + ps_top_mb_syntax_ele->u2_is_intra = 0; + ps_top_row_pu->s_l0_mv = ps_proc->ps_pu->s_l0_mv; + } + + ih264e_fill_bs_1mv_1ref_non_mbaff(pu4_pic_horz_bs, + pu4_pic_vert_bs, + ps_left_mb_syntax_ele->u4_csbp, + ps_top_mb_syntax_ele->u4_csbp, + ps_proc->u4_csbp, + &ps_proc->s_left_mb_pu.s_l0_mv, + &ps_top_row_pu->s_l0_mv, + &ps_proc->ps_pu->s_l0_mv, + ps_left_mb_syntax_ele->u2_is_intra, + ps_top_mb_syntax_ele->u2_is_intra); 
+ } + + return ; +} + +/** +******************************************************************************* +* +* @brief This function performs deblocking of top horizontal edge +* +* @par Description: +* This function performs deblocking of top horizontal edge +* +* @param[in] ps_codec +* pointer to codec context +* +* @param[in] ps_proc +* pointer to proc context +* +* @param[in] pu1_mb_qp +* pointer to mb quantization param +* +* @param[in] pu1_cur_pic_luma +* pointer to recon buffer luma +* +* @param[in] pu1_cur_pic_chroma +* pointer to recon buffer chroma +* +* @param[in] pu4_pic_horz_bs +* pointer to horizontal blocking strength +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +static void ih264e_filter_top_edge(codec_t *ps_codec, + process_ctxt_t *ps_proc, + UWORD8 *pu1_mb_qp, + UWORD8 *pu1_cur_pic_luma, + UWORD8 *pu1_cur_pic_chroma, + UWORD32 *pu4_pic_horz_bs) +{ + /* strd */ + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + + /* deblk params */ + UWORD32 u4_alpha_luma, u4_beta_luma, u4_qp_luma, u4_idx_A_luma, u4_idx_B_luma, u4_qp_p, u4_qp_q; + UWORD32 u4_alpha_chroma, u4_beta_chroma, u4_qp_chroma, u4_idx_A_chroma, u4_idx_B_chroma; + + /* collect qp of left & top mb */ + u4_qp_p = pu1_mb_qp[-ps_proc->i4_wd_mbs]; + u4_qp_q = pu1_mb_qp[0]; + + /********/ + /* luma */ + /********/ + u4_qp_luma = (u4_qp_p + u4_qp_q + 1) >> 1; + + /* filter offset A and filter offset B have to be received from slice header */ + /* TODO : for now lets set these offsets as zero */ + + + u4_idx_A_luma = MIN(51, u4_qp_luma + 0); + u4_idx_B_luma = MIN(51, u4_qp_luma + 0); + + /* alpha, beta computation */ + u4_alpha_luma = gu1_ih264_alpha_table[u4_idx_A_luma]; + u4_beta_luma = gu1_ih264_beta_table[u4_idx_B_luma]; + + /**********/ + /* chroma */ + /**********/ + u4_qp_chroma = (gu1_qpc_fqpi[u4_qp_p] + gu1_qpc_fqpi[u4_qp_q] + 1) >> 1; + + /* filter offset A and filter offset B have to be received from slice 
header */ + /* TODO : for now lets set these offsets as zero */ + + + u4_idx_A_chroma = MIN(51, u4_qp_chroma + 0); + u4_idx_B_chroma = MIN(51, u4_qp_chroma + 0); + + /* alpha, beta computation */ + u4_alpha_chroma = gu1_ih264_alpha_table[u4_idx_A_chroma]; + u4_beta_chroma = gu1_ih264_beta_table[u4_idx_B_chroma]; + + /* deblk edge */ + /* top Horizontal edge - allowed to be deblocked ? */ + if (pu4_pic_horz_bs[0] == 0x04040404) + { + /* strong filter */ + ps_codec->pf_deblk_luma_horz_bs4(pu1_cur_pic_luma, i4_rec_strd, u4_alpha_luma, u4_beta_luma); + ps_codec->pf_deblk_chroma_horz_bs4(pu1_cur_pic_chroma, i4_rec_strd, u4_alpha_chroma, u4_beta_chroma, u4_alpha_chroma, u4_beta_chroma); + } + else + { + /* normal filter */ + ps_codec->pf_deblk_luma_horz_bslt4(pu1_cur_pic_luma, i4_rec_strd, u4_alpha_luma, + u4_beta_luma, pu4_pic_horz_bs[0], + gu1_ih264_clip_table[u4_idx_A_luma]); + + ps_codec->pf_deblk_chroma_horz_bslt4(pu1_cur_pic_chroma, i4_rec_strd, u4_alpha_chroma, + u4_beta_chroma, u4_alpha_chroma, u4_beta_chroma, pu4_pic_horz_bs[0], + gu1_ih264_clip_table[u4_idx_A_chroma], gu1_ih264_clip_table[u4_idx_A_chroma]); + } +} + +/** +******************************************************************************* +* +* @brief This function performs deblocking of left vertical edge +* +* @par Description: +* This function performs deblocking of top horizontal edge +* +* @param[in] ps_codec +* pointer to codec context +* +* @param[in] ps_proc +* pointer to proc context +* +* @param[in] pu1_mb_qp +* pointer to mb quantization param +* +* @param[in] pu1_cur_pic_luma +* pointer to recon buffer luma +* +* @param[in] pu1_cur_pic_chroma +* pointer to recon buffer chroma +* +* @param[in] pu4_pic_vert_bs +* pointer to vertical blocking strength +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +static void ih264e_filter_left_edge(codec_t *ps_codec, + process_ctxt_t *ps_proc, + UWORD8 *pu1_mb_qp, + UWORD8 
*pu1_cur_pic_luma, + UWORD8 *pu1_cur_pic_chroma, + UWORD32 *pu4_pic_vert_bs) +{ + /* strd */ + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + + /* deblk params */ + UWORD32 u4_alpha_luma, u4_beta_luma, u4_qp_luma, u4_idx_A_luma, u4_idx_B_luma, u4_qp_p, u4_qp_q; + UWORD32 u4_alpha_chroma, u4_beta_chroma, u4_qp_chroma, u4_idx_A_chroma, u4_idx_B_chroma; + + /* collect qp of left & curr mb */ + u4_qp_p = pu1_mb_qp[-1]; + u4_qp_q = pu1_mb_qp[0]; + + /********/ + /* luma */ + /********/ + u4_qp_luma = (u4_qp_p + u4_qp_q + 1) >> 1; + + /* filter offset A and filter offset B have to be received from slice header */ + /* TODO : for now lets set these offsets as zero */ + + + u4_idx_A_luma = MIN(51, u4_qp_luma + 0); + u4_idx_B_luma = MIN(51, u4_qp_luma + 0); + + /* alpha, beta computation */ + u4_alpha_luma = gu1_ih264_alpha_table[u4_idx_A_luma]; + u4_beta_luma = gu1_ih264_beta_table[u4_idx_B_luma]; + + /**********/ + /* chroma */ + /**********/ + u4_qp_chroma = (gu1_qpc_fqpi[u4_qp_p] + gu1_qpc_fqpi[u4_qp_q] + 1) >> 1; + + /* filter offset A and filter offset B have to be received from slice header */ + /* TODO : for now lets set these offsets as zero */ + + + u4_idx_A_chroma = MIN(51, u4_qp_chroma + 0); + u4_idx_B_chroma = MIN(51, u4_qp_chroma + 0); + + /* alpha, beta computation */ + u4_alpha_chroma = gu1_ih264_alpha_table[u4_idx_A_chroma]; + u4_beta_chroma = gu1_ih264_beta_table[u4_idx_B_chroma]; + + /* deblk edge */ + if (pu4_pic_vert_bs[0] == 0x04040404) + { + /* strong filter */ + ps_codec->pf_deblk_luma_vert_bs4(pu1_cur_pic_luma, i4_rec_strd, u4_alpha_luma, u4_beta_luma); + ps_codec->pf_deblk_chroma_vert_bs4(pu1_cur_pic_chroma, i4_rec_strd, u4_alpha_chroma, u4_beta_chroma, u4_alpha_chroma, u4_beta_chroma); + } + else + { + /* normal filter */ + ps_codec->pf_deblk_luma_vert_bslt4(pu1_cur_pic_luma, i4_rec_strd, + u4_alpha_luma, u4_beta_luma, + pu4_pic_vert_bs[0], + gu1_ih264_clip_table[u4_idx_A_luma]); + + ps_codec->pf_deblk_chroma_vert_bslt4(pu1_cur_pic_chroma, 
i4_rec_strd, u4_alpha_chroma, + u4_beta_chroma, u4_alpha_chroma, u4_beta_chroma, pu4_pic_vert_bs[0], + gu1_ih264_clip_table[u4_idx_A_chroma], gu1_ih264_clip_table[u4_idx_A_chroma]); + } +} + +/** +******************************************************************************* +* +* @brief This function performs deblocking on an mb +* +* @par Description: +* This function performs deblocking on an mb +* +* @param[in] ps_proc +* process context corresponding to the job +* +* @param[in] ps_deblk +* pointer to deblock context +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_deblock_mb(process_ctxt_t *ps_proc, deblk_ctxt_t * ps_deblk) +{ + /* codec ctxt */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* ngbr availability */ + UWORD8 u1_mb_a, u1_mb_b; + + /* mb indices */ + WORD32 i4_mb_x = ps_deblk->i4_mb_x, i4_mb_y = ps_deblk->i4_mb_y; + + /* pic qp ptr */ + UWORD8 *pu1_pic_qp = ps_deblk->s_bs_ctxt.pu1_pic_qp; + + /* vertical blocking strength */ + UWORD32 *pu4_pic_vert_bs = ps_deblk->s_bs_ctxt.pu4_pic_vert_bs; + + /* horizontal blocking strength */ + UWORD32 *pu4_pic_horz_bs = ps_deblk->s_bs_ctxt.pu4_pic_horz_bs; + + /* src buffers luma */ + UWORD8 *pu1_cur_pic_luma = ps_deblk->pu1_cur_pic_luma; + + /* src buffers chroma */ + UWORD8 *pu1_cur_pic_chroma = ps_deblk->pu1_cur_pic_chroma; + + /* strd */ + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + + /* deblk params */ + UWORD32 u4_alpha_luma, u4_beta_luma, u4_qp_luma, u4_idx_A_luma, u4_idx_B_luma; + UWORD32 u4_alpha_chroma, u4_beta_chroma, u4_qp_chroma, u4_idx_A_chroma, u4_idx_B_chroma; + + /* temp var */ + UWORD32 push_ptr = (i4_mb_y * ps_proc->i4_wd_mbs) + i4_mb_x; + + /* derive neighbor availability */ + /* In slice mode the edges of mbs that lie on the slice boundary are not deblocked */ + /* deblocking filter idc '2' */ + if (ps_codec->s_cfg.e_slice_mode != IVE_SLICE_MODE_NONE) + { + /* slice index */ + UWORD8 *pu1_slice_idx 
= ps_deblk->pu1_slice_idx; + + pu1_slice_idx += (i4_mb_y * ps_proc->i4_wd_mbs); + /* left macroblock availability */ + u1_mb_a = (i4_mb_x == 0 || + (pu1_slice_idx[i4_mb_x - 1 ] != pu1_slice_idx[i4_mb_x]))? 0 : 1; + /* top macroblock availability */ + u1_mb_b = (i4_mb_y == 0 || + (pu1_slice_idx[i4_mb_x-ps_proc->i4_wd_mbs] != pu1_slice_idx[i4_mb_x]))? 0 : 1; + } + else + { + /* left macroblock availability */ + u1_mb_a = (i4_mb_x == 0)? 0 : 1; + /* top macroblock availability */ + u1_mb_b = (i4_mb_y == 0)? 0 : 1; + } + + pu1_pic_qp += push_ptr; + pu4_pic_vert_bs += push_ptr * 4; + pu4_pic_horz_bs += push_ptr * 4; + + /********/ + /* luma */ + /********/ + u4_qp_luma = pu1_pic_qp[0]; + + /* filter offset A and filter offset B have to be received from slice header */ + /* TODO : for now lets set these offsets as zero */ + + + u4_idx_A_luma = MIN(51, u4_qp_luma + 0); + u4_idx_B_luma = MIN(51, u4_qp_luma + 0); + + /* alpha, beta computation */ + u4_alpha_luma = gu1_ih264_alpha_table[u4_idx_A_luma]; + u4_beta_luma = gu1_ih264_beta_table[u4_idx_B_luma]; + + /**********/ + /* chroma */ + /**********/ + u4_qp_chroma = gu1_qpc_fqpi[u4_qp_luma]; + + /* filter offset A and filter offset B have to be received from slice header */ + /* TODO : for now lets set these offsets as zero */ + + + u4_idx_A_chroma = MIN(51, u4_qp_chroma + 0); + u4_idx_B_chroma = MIN(51, u4_qp_chroma + 0); + + /* alpha, beta computation */ + u4_alpha_chroma = gu1_ih264_alpha_table[u4_idx_A_chroma]; + u4_beta_chroma = gu1_ih264_beta_table[u4_idx_B_chroma]; + + /* Deblock vertical edges */ + /* left vertical edge 0 - allowed to be deblocked ? 
*/ + if (u1_mb_a) + { + ih264e_filter_left_edge(ps_codec, ps_proc, pu1_pic_qp, pu1_cur_pic_luma, pu1_cur_pic_chroma, pu4_pic_vert_bs); + } + + /* vertical edge 1 */ + if (pu4_pic_vert_bs[1] == 0x04040404) + { + /* strong filter */ + ps_codec->pf_deblk_luma_vert_bs4(pu1_cur_pic_luma + 4, i4_rec_strd, u4_alpha_luma, u4_beta_luma); + } + else + { + /* normal filter */ + ps_codec->pf_deblk_luma_vert_bslt4(pu1_cur_pic_luma + 4, i4_rec_strd, + u4_alpha_luma, u4_beta_luma, + pu4_pic_vert_bs[1], + gu1_ih264_clip_table[u4_idx_A_luma]); + } + + /* vertical edge 2 */ + if (pu4_pic_vert_bs[2] == 0x04040404) + { + /* strong filter */ + ps_codec->pf_deblk_luma_vert_bs4(pu1_cur_pic_luma + 8, i4_rec_strd, u4_alpha_luma, u4_beta_luma); + ps_codec->pf_deblk_chroma_vert_bs4(pu1_cur_pic_chroma + 8, i4_rec_strd, u4_alpha_chroma, u4_beta_chroma, u4_alpha_chroma, u4_beta_chroma); + } + else + { + /* normal filter */ + ps_codec->pf_deblk_luma_vert_bslt4(pu1_cur_pic_luma + 8, i4_rec_strd, u4_alpha_luma, + u4_beta_luma, pu4_pic_vert_bs[2], + gu1_ih264_clip_table[u4_idx_A_luma]); + + ps_codec->pf_deblk_chroma_vert_bslt4(pu1_cur_pic_chroma + 8, i4_rec_strd, u4_alpha_chroma, + u4_beta_chroma, u4_alpha_chroma, u4_beta_chroma, pu4_pic_vert_bs[2], + gu1_ih264_clip_table[u4_idx_A_chroma], gu1_ih264_clip_table[u4_idx_A_chroma]); + } + + /* vertical edge 3 */ + if (pu4_pic_vert_bs[3] == 0x04040404) + { + /* strong filter */ + ps_codec->pf_deblk_luma_vert_bs4(pu1_cur_pic_luma + 12, i4_rec_strd, u4_alpha_luma, u4_beta_luma); + } + else + { + /* normal filter */ + ps_codec->pf_deblk_luma_vert_bslt4(pu1_cur_pic_luma + 12, i4_rec_strd, u4_alpha_luma, + u4_beta_luma, pu4_pic_vert_bs[3], + gu1_ih264_clip_table[u4_idx_A_luma]); + } + + /* Deblock Horizontal edges */ + /* Horizontal edge 0 */ + if (u1_mb_b) + { + ih264e_filter_top_edge(ps_codec, ps_proc, pu1_pic_qp, pu1_cur_pic_luma, pu1_cur_pic_chroma, pu4_pic_horz_bs); + } + + /* horizontal edge 1 */ + if (pu4_pic_horz_bs[1] == 0x04040404) + { + /* strong 
filter */ + ps_codec->pf_deblk_luma_horz_bs4(pu1_cur_pic_luma + 4 * i4_rec_strd, i4_rec_strd, u4_alpha_luma, u4_beta_luma); + } + else + { + /* normal filter */ + ps_codec->pf_deblk_luma_horz_bslt4(pu1_cur_pic_luma + 4 * i4_rec_strd, i4_rec_strd, u4_alpha_luma, + u4_beta_luma, pu4_pic_horz_bs[1], + gu1_ih264_clip_table[u4_idx_A_luma]); + } + + /* horizontal edge 2 */ + if (pu4_pic_horz_bs[2] == 0x04040404) + { + /* strong filter */ + ps_codec->pf_deblk_luma_horz_bs4(pu1_cur_pic_luma + 8 * i4_rec_strd, i4_rec_strd, u4_alpha_luma, u4_beta_luma); + ps_codec->pf_deblk_chroma_horz_bs4(pu1_cur_pic_chroma + 4 * i4_rec_strd, i4_rec_strd, u4_alpha_chroma, u4_beta_chroma, u4_alpha_chroma, u4_beta_chroma); + } + else + { + /* normal filter */ + ps_codec->pf_deblk_luma_horz_bslt4(pu1_cur_pic_luma + 8 * i4_rec_strd, i4_rec_strd, u4_alpha_luma, + u4_beta_luma, pu4_pic_horz_bs[2], + gu1_ih264_clip_table[u4_idx_A_luma]); + + ps_codec->pf_deblk_chroma_horz_bslt4(pu1_cur_pic_chroma + 4 * i4_rec_strd, i4_rec_strd, u4_alpha_chroma, + u4_beta_chroma, u4_alpha_chroma, u4_beta_chroma, pu4_pic_horz_bs[2], + gu1_ih264_clip_table[u4_idx_A_chroma], gu1_ih264_clip_table[u4_idx_A_chroma]); + } + + /* horizontal edge 3 */ + if (pu4_pic_horz_bs[3] == 0x04040404) + { + /* strong filter */ + ps_codec->pf_deblk_luma_horz_bs4(pu1_cur_pic_luma + 12 * i4_rec_strd, i4_rec_strd, u4_alpha_luma, u4_beta_luma); + } + else + { + /* normal filter */ + ps_codec->pf_deblk_luma_horz_bslt4(pu1_cur_pic_luma + 12 * i4_rec_strd, i4_rec_strd, u4_alpha_luma, + u4_beta_luma, pu4_pic_horz_bs[3], + gu1_ih264_clip_table[u4_idx_A_luma]); + } + + return ; +} diff --git a/encoder/ih264e_deblk.h b/encoder/ih264e_deblk.h new file mode 100755 index 0000000..9b3b67b --- /dev/null +++ b/encoder/ih264e_deblk.h @@ -0,0 +1,99 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_deblk.h +* +* @brief +* This file contains extern declarations of deblocking routines +* +* @author +* ittiam +* +* @remarks +* none +****************************************************************************** +*/ + +#ifndef IH264E_DEBLK_H_ +#define IH264E_DEBLK_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief masks to extract csbp +****************************************************************************** + */ +#define CSBP_LEFT_BLOCK_MASK 0x1111 +#define CSBP_RIGHT_BLOCK_MASK 0x8888 + + +/*****************************************************************************/ +/* Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief This function computes blocking strength for an mb +* +* @par Description: +* This function computes blocking strength for an mb +* +* @param[in] ps_proc +* process context +* +* @returns 
none +* +* @remarks In this module it is assumed that their is only single reference +* frame and is always the most recently used anchor frame +* +******************************************************************************* +*/ +void ih264e_compute_bs(process_ctxt_t * ps_proc); + +/** +******************************************************************************* +* +* @brief This function performs deblocking on an mb +* +* @par Description: +* This function performs deblocking on an mb +* +* @param[in] ps_proc +* process context corresponding to the job +* +* @param[in] ps_deblk +* pointer to deblock context +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_deblock_mb(process_ctxt_t *ps_proc, deblk_ctxt_t * ps_deblk); + +#endif /* IH264E_DEBLK_H_ */ diff --git a/encoder/ih264e_debug.h b/encoder/ih264e_debug.h new file mode 100755 index 0000000..5cb0434 --- /dev/null +++ b/encoder/ih264e_debug.h @@ -0,0 +1,65 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
/**
******************************************************************************
* @file
*  ih264e_debug.h
*
* @brief
*  This file contains the debug hooks of the encoder. Each hook expands to a
*  call of the matching ih264e_debug_* routine when DEBUG_RC is non zero and
*  to a no-op otherwise.
*
* @author
*  ittiam
*
* @remarks
*  Every macro carries its own terminating ';' (kept for compatibility with
*  existing call sites), hence a hook must not be used as the unbraced body
*  of an if that is followed by an else.
******************************************************************************
*/

#ifndef IH264E_DEBUG_H_
#define IH264E_DEBUG_H_

#if DEBUG_RC

#define DEBUG_DUMP_QP(pic_cnt, qp, num_cores) \
    ih264e_debug_dump_qp(pic_cnt, qp, num_cores);

#define DEBUG_DUMP_RC(ps_rc) ih264e_debug_print_rc(ps_rc);

#define DEBUG_DUMP_COST_SAD_PU(ps_proc) ih264e_debug_dump_cost_sad_pu(ps_proc);

#define DEBUG_DUMP_INP_TO_RC_POST_ENC(ps_frame_info, pic_cnt, num_cores) \
    ih264e_debug_dump_inp_to_post_enc(ps_frame_info, pic_cnt, num_cores);

#else

/* No-op variants. A bare "(void);" is not a valid C statement (a cast needs */
/* an operand), so the hooks expand to the conventional "((void)0)".         */
/* The arguments are not evaluated.                                          */

#define DEBUG_DUMP_QP(pic_cnt, qp, num_cores) ((void)0);

#define DEBUG_DUMP_RC(ps_rc) ((void)0);

#define DEBUG_DUMP_COST_SAD_PU(ps_proc) ((void)0);

#define DEBUG_DUMP_INP_TO_RC_POST_ENC(ps_frame_info, pic_cnt, num_cores) ((void)0);

#endif

#endif /* IH264E_DEBUG_H_ */
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_defs.h +* +* @brief +* Definitions used in the encoder +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_DEFS_H_ +#define IH264E_DEFS_H_ + + +/*****************************************************************************/ +/* Width and height restrictions */ +/*****************************************************************************/ +/** + * Minimum width supported by codec + */ +#define MIN_WD 16 + +/** + * Maximum width supported by codec + */ + +#define MAX_WD 1920 + +/** + * Minimum height supported by codec + */ +#define MIN_HT 16 + +/** + * Maximum height supported by codec + */ + +#define MAX_HT 1920 + +/*****************************************************************************/ +/* Padding sizes */ +/*****************************************************************************/ +/** + * Padding used for top of the frame + */ +#define PAD_TOP 32 + +/** + * Padding used for bottom of the frame + */ +#define PAD_BOT 32 + +/** + * Padding used at left of the frame + */ +#define PAD_LEFT 32 + +/** + * Padding used at right of the frame + */ +#define PAD_RIGHT 32 +/** + * Padding for width + */ +#define PAD_WD (PAD_LEFT + PAD_RIGHT) +/** + * Padding for height + */ +#define PAD_HT (PAD_TOP + PAD_BOT) + +/* + * buffer width and height for half pel buffers + */ +#define HP_BUFF_WD 24 +#define HP_BUFF_HT 18 + +/*****************************************************************************/ +/* Number of frame restrictions */ +/*****************************************************************************/ +/** + * Maximum number of reference buffers in DPB manager + */ +#define MAX_REF_CNT 32 
/*****************************************************************************/
/* Num cores related defs                                                    */
/*****************************************************************************/

/** Maximum number of cores */
#define MAX_NUM_CORES           8

/** Maximum number of threads for pixel processing */
#define MAX_PROCESS_THREADS     MAX_NUM_CORES

/**
 * Maximum process context sets
 * Used to stagger encoding of MAX_CTXT_SETS in parallel
 */
#define MAX_CTXT_SETS           2

/**
 * Maximum number of contexts
 * Kept as twice the number of threads, to make it easier to initialize the
 * contexts from master thread.
 * The product is parenthesized so that the macro behaves as a single value
 * inside larger expressions (e.g. x / MAX_PROCESS_CTXT, x % MAX_PROCESS_CTXT).
 */
#define MAX_PROCESS_CTXT        (MAX_NUM_CORES * MAX_CTXT_SETS)

/*****************************************************************************/
/* Profile and level restrictions                                            */
/*****************************************************************************/

/** Max level supported by the codec */
#define MAX_LEVEL               IH264_LEVEL_51

/** Min level supported by the codec */
#define MIN_LEVEL               IH264_LEVEL_10
/**
 * Maximum number of slice headers that are held in memory simultaneously
 * For single core implementation only 1 slice header is enough.
 * But for multi-core, the parsing thread needs to ensure that slice headers
 * are stored till the last CB in a slice is decoded.
 * Parsing thread has to wait till the last CB of a slice is consumed before
 * overwriting the slice header
 * MAX_SLICE_HDR_CNT is assumed to be a power of 2
 */
#define LOG2_MAX_SLICE_HDR_CNT 8
#define MAX_SLICE_HDR_CNT (1 << LOG2_MAX_SLICE_HDR_CNT)

/* Generic declarations : default values of the encoder configuration */
#define DEFAULT_MAX_LEVEL               40
#define DEFAULT_RECON_ENABLE            0
#define DEFAULT_RC                      IVE_RC_STORAGE
#define DEFAULT_MAX_FRAMERATE           120000
#define DEFAULT_MAX_BITRATE             20000000
#define DEFAULT_MAX_SRCH_RANGE_X        256
#define DEFAULT_MAX_SRCH_RANGE_Y        256
#define DEFAULT_SLICE_PARAM             256
#define DEFAULT_SRC_FRAME_RATE          30000
#define DEFAULT_TGT_FRAME_RATE          30000
#define DEFAULT_BITRATE                 6000000
#define DEFAULT_QP_MIN                  10
#define DEFAULT_QP_MAX                  51
#define DEFAULT_I_QP                    25
#define DEFAULT_P_QP                    28
#define DEFAULT_B_QP                    28
#define DEFAULT_AIR_MODE                IVE_AIR_MODE_NONE
#define DEFAULT_AIR_REFRESH_PERIOD      30
#define DEFAULT_VBV_DELAY               1000
#define DEFAULT_VBV_SIZE                16800000 /* level 3.1 */
#define DEFAULT_NUM_CORES               1
#define DEFAULT_ME_SPEED_PRESET         100
#define DEFAULT_HPEL                    1
#define DEFAULT_QPEL                    1
#define DEFAULT_I4                      1
#define DEFAULT_I8                      0
#define DEFAULT_I16                     1
#define DEFAULT_ENABLE_FAST_SAD         0
#define DEFAULT_ENABLE_SATQD            1
#define DEFAULT_MIN_SAD_ENABLE          0
/* negative constant parenthesized so that it stays one operand in expressions */
#define DEFAULT_MIN_SAD_DISABLE         (-1)
#define DEFAULT_SRCH_RNG_X              64
#define DEFAULT_SRCH_RNG_Y              48
#define DEFAULT_I_INTERVAL              30
#define DEFAULT_IDR_INTERVAL            1000
#define DEFAULT_B_FRAMES                0
#define DEFAULT_DISABLE_DEBLK_LEVEL     0
#define DEFAULT_PROFILE                 IV_PROFILE_BASE
#define DEFAULT_MIN_INTRA_FRAME_RATE    1
#define DEFAULT_MAX_INTRA_FRAME_RATE    2147483647
#define DEFAULT_MIN_BUFFER_DELAY        30
#define DEFAULT_MAX_BUFFER_DELAY        20000
#define DEFAULT_STRIDE                  0
#define DEFAULT_ENC_SPEED_PRESET        IVE_USER_DEFINED
#define DEFAULT_PRE_ENC_ME              0
#define DEFAULT_PRE_ENC_IPE             0

/** Maximum number of entries in input buffer list */
#define MAX_INP_BUF_LIST_ENTRIES         32

/** Maximum number of entries in output buffer list */
#define MAX_OUT_BUF_LIST_ENTRIES         32

/** Maximum number of entries in recon buffer list used within the encoder */
#define MAX_REC_LIST_ENTRIES             16

/** Number of buffers created to hold half-pel planes for every reference buffer */
#define HPEL_PLANES_CNT                  1

/**
 *****************************************************************************
 * Macro to compute total size required to hold one set of scaling matrices
 * Expands to a braced block that assigns to its argument (an lvalue)
 *****************************************************************************
 */
#define SCALING_MAT_SIZE(m_scaling_mat_size)                                 \
{                                                                            \
    m_scaling_mat_size = 6 * TRANS_SIZE_4 * TRANS_SIZE_4;                    \
    m_scaling_mat_size += 6 * TRANS_SIZE_8 * TRANS_SIZE_8;                   \
    m_scaling_mat_size += 6 * TRANS_SIZE_16 * TRANS_SIZE_16;                 \
    m_scaling_mat_size += 2 * TRANS_SIZE_32 * TRANS_SIZE_32;                 \
}

/**
 ******************************************************************************
 *  @brief Macros to get raster scan position of a block[8x8] / sub block[4x4]
 *  X is bit 0 of the index, Y is the index shifted right by one.
 *  The argument is parenthesized so that an expression such as (a | b) or
 *  (a + b) is taken as a whole before the mask / shift is applied.
 ******************************************************************************
 */
#define GET_BLK_RASTER_POS_X(x)     (((x) & 0x01))
#define GET_BLK_RASTER_POS_Y(y)     (((y) >> 1))
#define GET_SUB_BLK_RASTER_POS_X(x) (((x) & 0x01))
#define GET_SUB_BLK_RASTER_POS_Y(y) (((y) >> 1))

/** Number of memory records requested for rate control */
#define NUM_RC_MEMTABS 17
+ */ + MEM_REC_BACKUP, + + /** + * Holds SPS + */ + MEM_REC_SPS, + + /** + * Holds PPS + */ + MEM_REC_PPS, + + /** + * Holds Slice Headers + */ + MEM_REC_SLICE_HDR, + + /** + * Contains map indicating slice index per MB basis + */ + MEM_REC_SLICE_MAP, + + /** + * Holds thread handles + */ + MEM_REC_THREAD_HANDLE, + + /** + * Holds control call mutex + */ + MEM_REC_CTL_MUTEX, + + /** + * Holds entropy call mutex + */ + MEM_REC_ENTROPY_MUTEX, + + /** + * Holds memory for Process JOB Queue + */ + MEM_REC_PROC_JOBQ, + + /** + * Holds memory for Entropy JOB Queue + */ + MEM_REC_ENTROPY_JOBQ, + + /** + * Contains status map indicating processing status per MB basis + */ + MEM_REC_PROC_MAP, + + /** + * Contains status map indicating deblocking status per MB basis + */ + MEM_REC_DBLK_MAP, + + /* + * Contains AIR map and mask + */ + MEM_REC_AIR_MAP, + + /** + * Contains status map indicating ME status per MB basis + */ + MEM_REC_ME_MAP, + + /** + * Holds dpb manager context + */ + MEM_REC_DPB_MGR, + + /** + * Holds intermediate buffers needed during processing stage + * Memory for process contexts is allocated in this memtab + */ + MEM_REC_PROC_SCRATCH, + + /** + * Holds buffers for vert_bs, horz_bs and QP (all frame level) + */ + MEM_REC_QUANT_PARAM, + + /** + * Holds top row syntax information + */ + MEM_REC_TOP_ROW_SYN_INFO, + + /** + * Holds buffers for vert_bs, horz_bs and QP (all frame level) + */ + MEM_REC_BS_QP, + + /** + * Holds input buffer manager context + */ + MEM_REC_INP_PIC, + + /** + * Holds output buffer manager context + */ + MEM_REC_OUT, + + /** + * Holds picture buffer manager context and array of pic_buf_ts + * Also holds reference picture buffers in non-shared mode + */ + MEM_REC_REF_PIC, + + /* + * Mem record for color space conversion + */ + MEM_REC_CSC, + + /** + * NMB info struct + */ + MEM_REC_MB_INFO_NMB, + + /** + * Rate control of memory records. + */ + MEM_REC_RC, + + /** + * Place holder to compute number of memory records. 
+ */ + MEM_REC_CNT = MEM_REC_RC + NUM_RC_MEMTABS, + + /* + * Do not add anything below + */ +}; + +#define DISABLE_DEBLOCK_INTERVAL 8 + +/** + **************************************************************************** + * Disable deblock levels + * Level 0 enables deblocking completely and level 4 disables completely + * Other levels are intermediate values to control deblocking level + **************************************************************************** + */ +enum +{ + /** + * Enable deblocking completely + */ + DISABLE_DEBLK_LEVEL_0, + + /** + * Disable only within MB edges - Not supported currently + */ + DISABLE_DEBLK_LEVEL_1, + + /** + * Enable deblocking once in DEBLOCK_INTERVAL number of pictures + * and for I slices + */ + DISABLE_DEBLK_LEVEL_2, + + /** + * Enable deblocking only for I slices + */ + DISABLE_DEBLK_LEVEL_3, + + /** + * Disable deblocking completely + */ + DISABLE_DEBLK_LEVEL_4 +}; + +/** + **************************************************************************** + * Number of buffers for I/O based on format + **************************************************************************** + */ + +/** Minimum number of input buffers */ +#define MIN_INP_BUFS 2 + +/** Minimum number of output buffers */ +#define MIN_OUT_BUFS 1 + +/** Minimum number of components in bitstream buffer */ +#define MIN_BITS_BUFS_COMP 1 + +/** Minimum number of components in raw buffer */ +#define MIN_RAW_BUFS_420_COMP 3 +#define MIN_RAW_BUFS_422ILE_COMP 1 +#define MIN_RAW_BUFS_RGB565_COMP 1 +#define MIN_RAW_BUFS_RGBA8888_COMP 1 +#define MIN_RAW_BUFS_420SP_COMP 2 + +#define MAX_NMB 120 + +/** Maximum number of active config parameter sets */ +#define MAX_ACTIVE_CONFIG_PARAMS 32 + +/** +****************************************************************************** + * @brief Thresholds for luma & chroma to determine if the 8x8 subblock needs + * to be encoded or skipped +****************************************************************************** +*/ +#define
LUMA_SUB_BLOCK_SKIP_THRESHOLD 4 +#define LUMA_BLOCK_SKIP_THRESHOLD 5 +#define CHROMA_BLOCK_SKIP_THRESHOLD 4 + +/** +****************************************************************************** + * @brief defines the first byte of a NAL unit + * forbidden zero bit - nal_ref_idc - nal_unit_type +****************************************************************************** +*/ +/* [0 - 11 - 00111] */ +#define NAL_SPS_FIRST_BYTE 0x67 + +/* [0 - 11 - 01000] */ +#define NAL_PPS_FIRST_BYTE 0x68 + +/* [0 - 11 - 00001] */ +#define NAL_SLICE_FIRST_BYTE 0x61 + +/* [0 - 00 - 00001] */ +#define NAL_NON_REF_SLICE_FIRST_BYTE 0x01 + +/* [0 - 11 - 00101] */ +#define NAL_IDR_SLICE_FIRST_BYTE 0x65 + +/* [0 - 00 - 01100] */ +#define NAL_FILLER_FIRST_BYTE 0x0C + +/* [0 - 00 - 00110] */ +#define NAL_SEI_FIRST_BYTE 0x06 + +#define H264_ALLOC_INTER_FRM_INTV 1 + +#define H264_MPEG_QP_MAP 191 + +#define MPEG2_QP_ELEM (H264_MPEG_QP_MAP + 1) +#define H264_QP_ELEM (MAX_H264_QP + 1) + +#define H264_INIT_QUANT_I 26 +#define H264_INIT_QUANT_P 34 + +#endif /*IH264E_DEFS_H_*/ diff --git a/encoder/ih264e_encode.c b/encoder/ih264e_encode.c new file mode 100755 index 0000000..ffc6fb7 --- /dev/null +++ b/encoder/ih264e_encode.c @@ -0,0 +1,580 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_encode.c +* +* @brief +* This file contains functions for encoding the input yuv frame in synchronous +* api mode +* +* @author +* ittiam +* +* List of Functions +* - ih264e_join_threads() +* - ih264e_wait_for_thread() +* - ih264e_encode() +* +****************************************************************************** +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +/* User Include files */ +#include "ih264e_config.h" +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264e.h" +#include "ithread.h" +#include "ih264_defs.h" +#include "ih264_macros.h" +#include "ih264_debug.h" +#include "ih264_structs.h" +#include "ih264_platform_macros.h" +#include "ih264_error.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_list.h" +#include "ih264e_error.h" +#include "ih264e_defs.h" +#include "ih264_padding.h" +#include "ih264e_bitstream.h" +#include "irc_mem_req_and_acq.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_time_stamp.h" +#include "ih264e_structs.h" +#include "ih264e_master.h" +#include
"ih264e_process.h" +#include "ih264_buf_mgr.h" +#include "ih264_dpb_mgr.h" +#include "ih264e_utils.h" +#include "ih264e_fmt_conv.h" +#include "ih264e_config.h" +#include "ih264e_statistics.h" +#include "ih264e_trace.h" +#include "ih264e_debug.h" +#ifdef LOGO_EN +#include "ih264e_ittiam_logo.h" +#endif + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief +* This function joins all the spawned threads after successful completion of +* their tasks +* +* @par Description +* Walks the first i4_proc_thread_cnt thread slots, joins every slot that is +* flagged in ai4_process_thread_created, clears that flag and finally resets +* i4_proc_thread_cnt to 0. Slots that are not flagged are skipped. A failed +* join prints a message and asserts. +* +* @param[in] ps_codec +* pointer to codec context +* +* @returns none +* +****************************************************************************** +*/ +void ih264e_join_threads(codec_t *ps_codec) +{ + /* temp var */ + WORD32 i = 0; + WORD32 ret = 0; + + /* join spawned threads */ + while (i < ps_codec->i4_proc_thread_cnt) + { + if (ps_codec->ai4_process_thread_created[i]) + { + ret = ithread_join(ps_codec->apv_proc_thread_handle[i], NULL); + if (ret != 0) + { + printf("pthread Join Failed"); + assert(0); + } + ps_codec->ai4_process_thread_created[i] = 0; + } + + /* advance even when slot i holds no live thread; incrementing only + * after a join would spin forever on a slot whose flag is 0 */ + i++; + } + + ps_codec->i4_proc_thread_cnt = 0; +} + +/** +****************************************************************************** +* +* @brief This function puts the current thread to sleep for a duration +* of sleep_us +* +* @par Description +* ithread_yield() method causes the calling thread to yield execution to another +* thread that is ready to run on the current processor. The operating system +* selects the thread to yield to. ithread_usleep blocks the current thread for +* the specified number of milliseconds. In other words, yield just says, +* end my timeslice prematurely, look around for other threads to run. If there +* is nothing better than me, continue.
Sleep says I don't want to run for x +* milliseconds. Even if no other thread wants to run, don't make me run. +* +* @param[in] sleep_us +* thread sleep duration +* +* @returns error_status +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_wait_for_thread(UWORD32 sleep_us) +{ + /* yield thread */ + ithread_yield(); + + /* put thread to sleep */ + ithread_usleep(sleep_us); + + return IH264E_SUCCESS; +} + +/** +****************************************************************************** +* +* @brief +* Encodes in synchronous api mode +* +* @par Description +* This routine processes input yuv, encodes it and outputs bitstream and recon +* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns Status +* +****************************************************************************** +*/ +WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op) +{ + /* error status */ + IH264E_ERROR_T error_status = IH264E_SUCCESS; + + /* codec ctxt */ + codec_t *ps_codec = (codec_t *)ps_codec_obj->pv_codec_handle; + + /* input frame to encode */ + ih264e_video_encode_ip_t *ps_video_encode_ip = pv_api_ip; + + /* output buffer to write stream */ + ih264e_video_encode_op_t *ps_video_encode_op = pv_api_op; + + /* i/o structures */ + inp_buf_t s_inp_buf; + out_buf_t s_out_buf; + + /* temp var */ + WORD32 ctxt_sel = 0, i; + + /********************************************************************/ + /* BEGIN INIT */ + /********************************************************************/ + /* reset output structure */ + ps_video_encode_op->s_ive_op.u4_error_code = IV_SUCCESS; + ps_video_encode_op->s_ive_op.output_present = 0; + ps_video_encode_op->s_ive_op.dump_recon = 0; + ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_NA_FRAME; + + /* copy 
input info. to internal structure */ + s_inp_buf.s_raw_buf = ps_video_encode_ip->s_ive_ip.s_inp_buf; + s_inp_buf.u4_timestamp_low = ps_video_encode_ip->s_ive_ip.u4_timestamp_low; + s_inp_buf.u4_timestamp_high = ps_video_encode_ip->s_ive_ip.u4_timestamp_high; + s_inp_buf.u4_is_last = ps_video_encode_ip->s_ive_ip.u4_is_last; + s_inp_buf.pv_mb_info = ps_video_encode_ip->s_ive_ip.pv_mb_info; + s_inp_buf.u4_mb_info_type = ps_video_encode_ip->s_ive_ip.u4_mb_info_type; + s_inp_buf.pv_pic_info = ps_video_encode_ip->s_ive_ip.pv_pic_info; + s_inp_buf.u4_pic_info_type = ps_video_encode_ip->s_ive_ip.u4_pic_info_type; + + /* copy output info. to internal structure */ + s_out_buf.s_bits_buf = ps_video_encode_ip->s_ive_ip.s_out_buf; + s_out_buf.u4_is_last = ps_video_encode_ip->s_ive_ip.u4_is_last; + s_out_buf.u4_timestamp_low = ps_video_encode_ip->s_ive_ip.u4_timestamp_low; + s_out_buf.u4_timestamp_high = ps_video_encode_ip->s_ive_ip.u4_timestamp_high; + + /* api call cnt */ + ps_codec->i4_encode_api_call_cnt += 1; + + /* curr pic cnt */ + ps_codec->i4_pic_cnt += 1; + + /* codec context selector */ + ctxt_sel = ps_codec->i4_encode_api_call_cnt & 1; + + /* reset status flags */ + ps_codec->ai4_pic_cnt[ctxt_sel] = -1; + ps_codec->s_rate_control.post_encode_skip[ctxt_sel] = 0; + ps_codec->s_rate_control.pre_encode_skip[ctxt_sel] = 0; + + /* pass output buffer to codec */ + ps_codec->as_out_buf[ctxt_sel] = s_out_buf; + + /* initialize codec ctxt with default params for the first encode api call */ + if (ps_codec->i4_encode_api_call_cnt == 0) + { + ih264e_codec_init(ps_codec); + } + + /* parse configuration params */ + for (i = 0; i < MAX_ACTIVE_CONFIG_PARAMS; i++) + { + cfg_params_t *ps_cfg = &ps_codec->as_cfg[i]; + + if (1 == ps_cfg->u4_is_valid) + { + if ( ((ps_cfg->u4_timestamp_high == s_inp_buf.u4_timestamp_high) && + (ps_cfg->u4_timestamp_low == s_inp_buf.u4_timestamp_low)) || + ((WORD32)ps_cfg->u4_timestamp_high == -1) || + ((WORD32)ps_cfg->u4_timestamp_low == -1) ) + { + 
error_status |= ih264e_codec_update_config(ps_codec, ps_cfg); + SET_ERROR_ON_RETURN(error_status, + IVE_UNSUPPORTEDPARAM, + ps_video_encode_op->s_ive_op.u4_error_code, + IV_FAIL); + + ps_cfg->u4_is_valid = 0; + } + } + } + + /****************************************************************** + * INSERT LOGO + *****************************************************************/ +#ifdef LOGO_EN + if (s_inp_buf.s_raw_buf.apv_bufs[0] != NULL && + ps_codec->i4_header_mode != 1) + { + ih264e_insert_logo(s_inp_buf.s_raw_buf.apv_bufs[0], + s_inp_buf.s_raw_buf.apv_bufs[1], + s_inp_buf.s_raw_buf.apv_bufs[2], + s_inp_buf.s_raw_buf.au4_strd[0], + 0, + 0, + ps_codec->s_cfg.e_inp_color_fmt, + ps_codec->s_cfg.u4_disp_wd, + ps_codec->s_cfg.u4_disp_ht); + } +#endif /*LOGO_EN*/ + + if (ps_codec->i4_encode_api_call_cnt == 0) + { + /********************************************************************/ + /* number of mv/ref bank buffers used by the codec, */ + /* 1 to handle curr frame */ + /* 1 to store information of ref frame */ + /* 1 more additional because of the codec employs 2 ctxt sets */ + /* to assist asynchronous API */ + /********************************************************************/ + + /* initialize mv bank buffer manager */ + error_status |= ih264e_mv_buf_mgr_add_bufs(ps_codec); + SET_ERROR_ON_RETURN(error_status, + IVE_FATALERROR, + ps_video_encode_op->s_ive_op.u4_error_code, + IV_FAIL); + + /* initialize ref bank buffer manager */ + error_status |= ih264e_pic_buf_mgr_add_bufs(ps_codec); + SET_ERROR_ON_RETURN(error_status, + IVE_FATALERROR, + ps_video_encode_op->s_ive_op.u4_error_code, + IV_FAIL); + + /* for the first frame, generate header when not requested explicitly */ + if (ps_codec->i4_header_mode == 0 && + ps_codec->u4_header_generated == 0) + { + ps_codec->i4_gen_header = 1; + } + } + + /* generate header and return when encoder is operated in header mode */ + if (ps_codec->i4_header_mode == 1) + { + /* whenever the header is generated, this implies a 
start of sequence + * and a sequence needs to be started with IDR + */ + ps_codec->force_curr_frame_type = IV_IDR_FRAME; + + /* generate header */ + error_status |= ih264e_generate_sps_pps(ps_codec); + + /* api call cnt */ + ps_codec->i4_encode_api_call_cnt --; + + /* curr pic cnt */ + ps_codec->i4_pic_cnt --; + + /* header mode tag is not sticky */ + ps_codec->i4_header_mode = 0; + + /* send the input to app */ + ps_video_encode_op->s_ive_op.s_inp_buf = s_inp_buf.s_raw_buf; + + /* send the output to app */ + ps_video_encode_op->s_ive_op.output_present = 1; + ps_video_encode_op->s_ive_op.dump_recon = 0; + ps_video_encode_op->s_ive_op.s_out_buf = ps_codec->as_out_buf[ctxt_sel].s_bits_buf; + + /* error status */ + SET_ERROR_ON_RETURN(error_status, + IVE_FATALERROR, + ps_video_encode_op->s_ive_op.u4_error_code, + IV_FAIL); + + /* indicates that header has been generated previously */ + ps_codec->u4_header_generated = 1; + + return IV_SUCCESS; + } + + + if (s_inp_buf.s_raw_buf.apv_bufs[0] != NULL) + { + /* array giving pic cnt that is being processed in curr context set */ + ps_codec->ai4_pic_cnt[ctxt_sel] = ps_codec->i4_pic_cnt; + + /* initialize all relevant process ctxts */ + error_status |= ih264e_pic_init(ps_codec, &s_inp_buf); + SET_ERROR_ON_RETURN(error_status, + IVE_FATALERROR, + ps_video_encode_op->s_ive_op.u4_error_code, + IV_FAIL); + + if (ps_codec->s_rate_control.pre_encode_skip[ctxt_sel] == 0) + { + /* proc ctxt base idx */ + WORD32 proc_ctxt_select = ctxt_sel * MAX_PROCESS_THREADS; + + /* proc ctxt */ + process_ctxt_t *ps_proc = &ps_codec->as_process[proc_ctxt_select]; + + WORD32 ret = 0; + + /* number of addl. 
threads to be created */ + WORD32 num_thread_cnt = ps_codec->s_cfg.u4_num_cores - 1; + + for (i = 0; i < num_thread_cnt; i++) + { + ret = ithread_create(ps_codec->apv_proc_thread_handle[i], + NULL, + (void*)ih264e_process_thread, + &ps_codec->as_process[i + 1]); + if (ret != 0) + { + printf("pthread Create Failed"); + assert(0); + } + + ps_codec->ai4_process_thread_created[i] = 1; + + ps_codec->i4_proc_thread_cnt++; + } + + + /* launch job */ + ih264e_process_thread(ps_proc); + + /* Join threads at the end of encoding a frame */ + ih264e_join_threads(ps_codec); + + ih264_list_reset(ps_codec->pv_proc_jobq); + + ih264_list_reset(ps_codec->pv_entropy_jobq); + } + } + + if (-1 != ps_codec->ai4_pic_cnt[ctxt_sel]) + { + /* proc ctxt base idx */ + WORD32 proc_ctxt_select = ctxt_sel * MAX_PROCESS_THREADS; + + /* proc ctxt */ + process_ctxt_t *ps_proc = &ps_codec->as_process[proc_ctxt_select]; + + /* receive output back from codec */ + s_out_buf = ps_codec->as_out_buf[ctxt_sel]; + + /* send the output to app */ + ps_video_encode_op->s_ive_op.output_present = 1; + ps_video_encode_op->s_ive_op.dump_recon = 1; + ps_video_encode_op->s_ive_op.s_out_buf = s_out_buf.s_bits_buf; + ps_video_encode_op->s_ive_op.u4_error_code = IV_SUCCESS; + + /* receive input back from codec */ + s_inp_buf = ps_proc->s_inp_buf; + + /* send the input to app */ + ps_video_encode_op->s_ive_op.s_inp_buf = s_inp_buf.s_raw_buf; + + if (ps_codec->s_cfg.u4_enable_recon && + ps_codec->s_rate_control.pre_encode_skip[ctxt_sel] == 0) + { + /* error status */ + IH264_ERROR_T ret = IH264_SUCCESS; + + /* recon buffer */ + rec_buf_t *ps_rec_buf = &ps_codec->as_rec_buf[ctxt_sel]; + + ps_video_encode_op->s_ive_op.s_recon_buf = ps_video_encode_ip->s_ive_ip.s_recon_buf; + + /* copy/convert the recon buffer and return */ + ih264e_fmt_conv(ps_codec, &ps_rec_buf->s_pic_buf, + ps_video_encode_ip->s_ive_ip.s_recon_buf.apv_bufs[0], + ps_video_encode_ip->s_ive_ip.s_recon_buf.apv_bufs[1], + 
ps_video_encode_ip->s_ive_ip.s_recon_buf.apv_bufs[2], + ps_video_encode_ip->s_ive_ip.s_recon_buf.au4_wd[0], + ps_video_encode_ip->s_ive_ip.s_recon_buf.au4_wd[1], + 0, + ps_codec->s_cfg.u4_disp_ht); + + ret = ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr, ps_rec_buf->s_pic_buf.i4_buf_id, BUF_MGR_IO); + if (IH264_SUCCESS != ret) + { + SET_ERROR_ON_RETURN((IH264E_ERROR_T)ret, + IVE_FATALERROR, + ps_video_encode_op->s_ive_op.u4_error_code, + IV_FAIL); + } + } + + /* release buffers from ref list */ + if (ps_codec->s_rate_control.post_encode_skip[ctxt_sel] == 1) + { + /* pic info */ + pic_buf_t *ps_cur_pic; + + /* mv info */ + mv_buf_t *ps_cur_mv_buf; + + /* error status */ + IH264_ERROR_T ret = IH264_SUCCESS; + + /* Decrement coded pic count */ + ps_codec->i4_coded_pic_cnt--; + + /* loop through to get the min pic cnt among the list of pics stored in ref list */ + /* since the skipped frame may not be on reference list, we may not have an MV bank + * hence free only if we have allocated */ + for (i = 0; i < ps_codec->i4_ref_buf_cnt; i++) + { + if (ps_codec->i4_pic_cnt == ps_codec->as_ref_set[i].i4_pic_cnt) + { + ps_codec->as_ref_set[i].i4_pic_cnt = -1; + ps_codec->as_ref_set[i].i4_poc = -1; + + ps_cur_pic = ps_codec->as_ref_set[i].ps_pic_buf; + + ps_cur_mv_buf = ps_codec->as_ref_set[i].ps_mv_buf; + + /* release this frame from reference list */ + ret = ih264_buf_mgr_release(ps_codec->pv_mv_buf_mgr, ps_cur_mv_buf->i4_buf_id , BUF_MGR_REF); + SET_ERROR_ON_RETURN((IH264E_ERROR_T)ret, + IVE_FATALERROR, + ps_video_encode_op->s_ive_op.u4_error_code, + IV_FAIL); + + ret = ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr, ps_cur_pic->i4_buf_id , BUF_MGR_REF); + SET_ERROR_ON_RETURN((IH264E_ERROR_T)ret, + IVE_FATALERROR, + ps_video_encode_op->s_ive_op.u4_error_code, + IV_FAIL); + break; + } + } + } + + if ((ps_codec->s_rate_control.post_encode_skip[ctxt_sel] == 1) || + (ps_codec->s_rate_control.pre_encode_skip[ctxt_sel] == 1)) + { + ps_video_encode_op->s_ive_op.dump_recon = 0; + 
} + else + { + /* set output pic type */ + if (ps_codec->i4_slice_type == PSLICE) + { + ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_P_FRAME; + } + else if (ps_codec->i4_slice_type == ISLICE && ps_codec->u4_is_idr != 1) + { + ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_I_FRAME; + } + else + { + ps_video_encode_op->s_ive_op.u4_encoded_frame_type = IV_IDR_FRAME; + } + } + + /* loop through to get the error status */ + for (i = 0; i < (WORD32)ps_codec->s_cfg.u4_num_cores; i++) + { + error_status |= ps_codec->as_process[ctxt_sel + i].i4_error_code; + } + SET_ERROR_ON_RETURN(error_status, + IVE_FATALERROR, + ps_video_encode_op->s_ive_op.u4_error_code, + IV_FAIL); + } + + if (1 == s_inp_buf.u4_is_last) + { + ps_video_encode_op->s_ive_op.output_present = 0; + ps_video_encode_op->s_ive_op.dump_recon = 0; + } + + return IV_SUCCESS; +} diff --git a/encoder/ih264e_encode_header.c b/encoder/ih264e_encode_header.c new file mode 100755 index 0000000..67e5409 --- /dev/null +++ b/encoder/ih264e_encode_header.c @@ -0,0 +1,1187 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_encode_header.c +* +* @brief +* This file contains function definitions related to header encoding. +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_generate_nal_unit_header() +* - ih264e_generate_sps() +* - ih264e_generate_pps() +* - ih264e_generate_slice_header() +* - ih264e_get_level() +* - ih264e_populate_sps() +* - ih264e_populate_pps() +* - ih264e_populate_slice_header() +* - ih264e_add_filler_nal_unit() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +/* User Include Files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264e.h" +#include "ithread.h" +#include "ih264e_config.h" +#include "ih264e_trace.h" +#include "ih264_typedefs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ih264_debug.h" +#include "ih264_defs.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_defs.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_encode_header.h" +#include "ih264_common_tables.h" +#include "ih264_macros.h" + + 
+/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief Generate nal unit header in the stream as per section 7.4.1 +* +* @par Description +* Inserts Nal unit header syntax as per section 7.4.1 +* +* @param[inout] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] nal_unit_type +* nal type to be inserted +* +* @param[in] nal_ref_idc +* nal ref idc to be inserted +* +* @return success or failure error code +* +****************************************************************************** +*/ +static WORD32 ih264e_generate_nal_unit_header(bitstrm_t *ps_bitstrm, + WORD32 nal_unit_type, + WORD32 nal_ref_idc) +{ + WORD32 return_status = IH264E_SUCCESS; + + /* sanity checks */ + ASSERT((nal_unit_type > 0) && (nal_unit_type < 32)); + + /* forbidden_zero_bit + nal_ref_idc + nal_unit_type */ + PUT_BITS(ps_bitstrm, + ((nal_ref_idc << 5) + nal_unit_type), + (1+2+5), /*1 forbidden zero bit + 2 nal_ref_idc + 5 nal_unit_type */ + return_status, + "nal_unit_header"); + + return(return_status); +} + +/** +****************************************************************************** +* +* @brief Generates SPS (Sequence Parameter Set) +* +* @par Description +* This function generates Sequence Parameter Set header as per the spec +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] ps_sps +* pointer to structure containing SPS data +* +* @return success or failure error code +* +****************************************************************************** +*/ +WORD32 ih264e_generate_sps(bitstrm_t *ps_bitstrm, sps_t *ps_sps) +{ + WORD32 return_status = IH264E_SUCCESS; + WORD32 i; + WORD8 i1_nal_unit_type = 7; + WORD8 i1_nal_ref_idc = 3; + + /* Insert Start Code */ + return_status |= 
ih264e_put_nal_start_code_prefix(ps_bitstrm, 1); + + /* Insert Nal Unit Header */ + return_status |= ih264e_generate_nal_unit_header(ps_bitstrm, i1_nal_unit_type, i1_nal_ref_idc); + + /* profile_idc */ + PUT_BITS(ps_bitstrm, ps_sps->u1_profile_idc, 8, return_status, "profile_idc"); + + /* constrained_set_flags */ + PUT_BITS(ps_bitstrm, ps_sps->u1_constraint_set0_flag, 1, return_status, "constrained_set0_flag"); + PUT_BITS(ps_bitstrm, ps_sps->u1_constraint_set1_flag, 1, return_status, "constrained_set1_flag"); + PUT_BITS(ps_bitstrm, ps_sps->u1_constraint_set2_flag, 1, return_status, "constrained_set2_flag"); + PUT_BITS(ps_bitstrm, ps_sps->u1_constraint_set3_flag, 1, return_status, "constrained_set3_flag"); + + /* reserved_zero_four_bits */ + PUT_BITS(ps_bitstrm, 0, 4, return_status, "reserved_zero_four_bits"); + + /* level_idc */ + PUT_BITS(ps_bitstrm, ps_sps->u1_level_idc, 8, return_status, "level_idc"); + + /* seq_parameter_set_id */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->u1_sps_id, return_status, "seq_parameter_set_id"); + + if (ps_sps->u1_profile_idc >= IH264_PROFILE_HIGH) + { + /* chroma_format_idc */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->u1_chroma_format_idc, return_status, "chroma_format_idc"); + + if (ps_sps->u1_chroma_format_idc == CHROMA_FMT_IDC_YUV444) + { + /* i1_residual_colour_transform_flag */ + PUT_BITS(ps_bitstrm, ps_sps->i1_residual_colour_transform_flag, 1, return_status, "i1_residual_colour_transform_flag"); + } + + /* bit_depth_luma_minus8 */ + PUT_BITS_UEV(ps_bitstrm, (ps_sps->i1_bit_depth_luma - 8), return_status, "bit_depth_luma_minus8"); + + /* bit_depth_chroma_minus8 */ + PUT_BITS_UEV(ps_bitstrm, (ps_sps->i1_bit_depth_chroma - 8), return_status, "bit_depth_chroma_minus8"); + + /* qpprime_y_zero_transform_bypass_flag */ + PUT_BITS(ps_bitstrm, ps_sps->i1_qpprime_y_zero_transform_bypass_flag, 1, return_status, "qpprime_y_zero_transform_bypass_flag"); + + /* seq_scaling_matrix_present_flag */ + PUT_BITS(ps_bitstrm, 
ps_sps->i1_seq_scaling_matrix_present_flag, 1, return_status, "seq_scaling_matrix_present_flag"); + + /* seq_scaling_list */ + if (ps_sps->i1_seq_scaling_matrix_present_flag) + { + /* TODO_LATER: Will be enabled once scaling list support is added */ + } + } + + /* log2_max_frame_num_minus4 */ + PUT_BITS_UEV(ps_bitstrm, (ps_sps->i1_log2_max_frame_num - 4), return_status, "log2_max_frame_num_minus4"); + + /* pic_order_cnt_type */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->i1_pic_order_cnt_type, return_status, "pic_order_cnt_type"); + + if (ps_sps->i1_pic_order_cnt_type == 0) + { + /* log2_max_pic_order_cnt_lsb_minus4 */ + PUT_BITS_UEV(ps_bitstrm, (ps_sps->i1_log2_max_pic_order_cnt_lsb - 4), return_status, "log2_max_pic_order_cnt_lsb_minus4"); + } + else if (ps_sps->i1_pic_order_cnt_type == 1) + { + /* delta_pic_order_always_zero_flag */ + PUT_BITS(ps_bitstrm, ps_sps->i1_delta_pic_order_always_zero_flag, 1, return_status, "delta_pic_order_always_zero_flag"); + + /* offset_for_non_ref_pic */ + PUT_BITS_SEV(ps_bitstrm, ps_sps->i4_offset_for_non_ref_pic, return_status, "offset_for_non_ref_pic"); + + /* offset_for_top_to_bottom_field */ + PUT_BITS_SEV(ps_bitstrm, ps_sps->i4_offset_for_top_to_bottom_field, return_status, "offset_for_top_to_bottom_field"); + + /* num_ref_frames_in_pic_order_cnt_cycle */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->u1_num_ref_frames_in_pic_order_cnt_cycle, return_status, "num_ref_frames_in_pic_order_cnt_cycle"); + + /* Offset for ref frame */ + for (i=0; i<ps_sps->u1_num_ref_frames_in_pic_order_cnt_cycle; i++) + { + /* offset_for_ref_frame */ + PUT_BITS_SEV(ps_bitstrm, ps_sps->ai4_offset_for_ref_frame[i], return_status, "offset_for_ref_frame"); + } + } + + /* num_ref_frames */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->u1_max_num_ref_frames, return_status, "num_ref_frames"); + + /* gaps_in_frame_num_value_allowed_flag */ + PUT_BITS(ps_bitstrm, ps_sps->i1_gaps_in_frame_num_value_allowed_flag, 1, return_status, "gaps_in_frame_num_value_allowed_flag"); + + /* 
pic_width_in_mbs_minus1 */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->i2_pic_width_in_mbs_minus1, return_status, "pic_width_in_mbs_minus1"); + + /* pic_height_in_map_units_minus1 */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->i2_pic_height_in_map_units_minus1, return_status, "pic_height_in_map_units_minus1"); + + /* frame_mbs_only_flag */ + PUT_BITS(ps_bitstrm, ps_sps->i1_frame_mbs_only_flag, 1, return_status, "frame_mbs_only_flag"); + + if (!ps_sps->i1_frame_mbs_only_flag) + { + /* mb_adaptive_frame_field_flag */ + PUT_BITS(ps_bitstrm, ps_sps->i1_mb_adaptive_frame_field_flag, 1, return_status, "mb_adaptive_frame_field_flag"); + } + + /* direct_8x8_inference_flag */ + PUT_BITS(ps_bitstrm, ps_sps->i1_direct_8x8_inference_flag, 1, return_status, "direct_8x8_inference_flag"); + + /* frame_cropping_flag */ + PUT_BITS(ps_bitstrm, ps_sps->i1_frame_cropping_flag, 1, return_status, "frame_cropping_flag"); + + if (ps_sps->i1_frame_cropping_flag) + { + /* frame_crop_left_offset */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->i2_frame_crop_left_offset, return_status, "frame_crop_left_offset"); + + /* frame_crop_right_offset */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->i2_frame_crop_right_offset, return_status, "frame_crop_right_offset"); + + /* frame_crop_top_offset */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->i2_frame_crop_top_offset, return_status, "frame_crop_top_offset"); + + /* frame_crop_bottom_offset */ + PUT_BITS_UEV(ps_bitstrm, ps_sps->i2_frame_crop_bottom_offset, return_status, "frame_crop_bottom_offset"); + } + + /* vui_parameters_present_flag */ + PUT_BITS(ps_bitstrm, ps_sps->i1_vui_parameters_present_flag, 1, return_status, "vui_parameters_present_flag"); + + if (ps_sps->i1_vui_parameters_present_flag) + { + /* Add vui parameters to the bitstream */; + } + + /* rbsp trailing bits */ + return_status |= ih264e_put_rbsp_trailing_bits(ps_bitstrm); + + return return_status; +} + +/** +****************************************************************************** +* +* @brief Generates PPS (Picture Parameter 
Set) +* +* @par Description +* Generate Picture Parameter Set as per Section 7.3.2.2 +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] ps_pps +* pointer to structure containing PPS data +* +* @return success or failure error code +* +****************************************************************************** +*/ +WORD32 ih264e_generate_pps(bitstrm_t *ps_bitstrm, pps_t *ps_pps, sps_t *ps_sps) +{ + WORD32 return_status = IH264E_SUCCESS; + + /* Insert the NAL start code */ + return_status |= ih264e_put_nal_start_code_prefix(ps_bitstrm, 1); + + /* Insert Nal Unit Header */ + PUT_BITS(ps_bitstrm, NAL_PPS_FIRST_BYTE, 8, return_status, "pps_header"); + + /* pic_parameter_set_id */ + PUT_BITS_UEV(ps_bitstrm, ps_pps->u1_pps_id, return_status, "pic_parameter_set_id"); + + /* seq_parameter_set_id */ + PUT_BITS_UEV(ps_bitstrm, ps_pps->u1_sps_id, return_status, "seq_parameter_set_id"); + + /* Entropy coding : 0-VLC; 1 - CABAC */ + PUT_BITS(ps_bitstrm, ps_pps->u1_entropy_coding_mode_flag, 1, return_status, "Entropy coding : 0-VLC; 1 - CABAC"); + + /* Pic order present flag */ + PUT_BITS(ps_bitstrm, ps_pps->u1_pic_order_present_flag, 1, return_status, "Pic order present flag"); + + /* Number of slice groups */ + PUT_BITS_UEV(ps_bitstrm, ps_pps->u1_num_slice_groups - 1, return_status, "Number of slice groups"); + + if (ps_pps->u1_num_slice_groups > 1) + { + /* TODO_LATER: Currently the number of slice groups minus 1 is 0. 
+ * If this is not the case, we have to add Slice group map type to the bit stream*/ + } + + /* num_ref_idx_l0_default_active_minus1 */ + PUT_BITS_UEV(ps_bitstrm, ps_pps->i1_num_ref_idx_l0_default_active - 1, return_status, "num_ref_idx_l0_default_active_minus1"); + + /* num_ref_idx_l1_default_active_minus1 */ + PUT_BITS_UEV(ps_bitstrm, ps_pps->i1_num_ref_idx_l1_default_active - 1, return_status, "num_ref_idx_l1_default_active_minus1"); + + /* weighted_pred_flag */ + PUT_BITS(ps_bitstrm, ps_pps->i1_weighted_pred_flag, 1, return_status, "weighted_pred_flag"); + + /* weighted_bipred_flag */ + PUT_BITS(ps_bitstrm, ps_pps->i1_weighted_bipred_idc, 2, return_status, "weighted_bipred_idc"); + + /* pic_init_qp_minus26 */ + PUT_BITS_SEV(ps_bitstrm, ps_pps->i1_pic_init_qp - 26, return_status, "pic_init_qp_minus26"); + + /* pic_init_qs_minus26 */ + PUT_BITS_SEV(ps_bitstrm, ps_pps->i1_pic_init_qs - 26, return_status, "pic_init_qs_minus26"); + + /* chroma_qp_index_offset */ + PUT_BITS_SEV(ps_bitstrm, ps_pps->i1_chroma_qp_index_offset, return_status, "chroma_qp_index_offset"); + + /* deblocking_filter_control_present_flag */ + PUT_BITS(ps_bitstrm, ps_pps->i1_deblocking_filter_control_present_flag, 1, return_status, "deblocking_filter_control_present_flag"); + + /* constrained_intra_pred_flag */ + PUT_BITS(ps_bitstrm, ps_pps->i1_constrained_intra_pred_flag, 1, return_status, "constrained_intra_pred_flag"); + + /*redundant_pic_cnt_present_flag */ + PUT_BITS(ps_bitstrm, ps_pps->i1_redundant_pic_cnt_present_flag, 1, return_status, "redundant_pic_cnt_present_flag"); + + if (ps_sps->u1_profile_idc >= IH264_PROFILE_HIGH) + { + /* transform_8x8_mode_flag */ + PUT_BITS(ps_bitstrm, ps_pps->i1_transform_8x8_mode_flag, 1, return_status, "transform_8x8_mode_flag"); + + /* pic_scaling_matrix_present_flag */ + PUT_BITS(ps_bitstrm, ps_pps->i1_pic_scaling_matrix_present_flag, 1, return_status, "pic_scaling_matrix_present_flag"); + + if(ps_pps->i1_pic_scaling_matrix_present_flag) + { + /* 
TODO_LATER: Will be enabled once scaling list support is added */ + } + + /* Second chroma QP offset */ + PUT_BITS_SEV(ps_bitstrm, ps_pps->i1_second_chroma_qp_index_offset, return_status, "Second chroma QP offset"); + } + + return_status |= ih264e_put_rbsp_trailing_bits(ps_bitstrm); + + return return_status; +} + +/** +****************************************************************************** +* +* @brief Generates Slice Header +* +* @par Description +* Generate Slice Header as per Section 7.3.5.1 +* +* @param[inout] ps_bitstrm +* pointer to bitstream context for generating slice header +* +* @param[in] ps_slice_hdr +* pointer to slice header params +* +* @param[in] ps_pps +* pointer to pps params referred by slice +* +* @param[in] ps_sps +* pointer to sps params referred by slice +* +* @param[out] ps_dup_bit_strm_ent_offset +* Bitstream struct to store bitstream state +* +* @param[out] pu4_first_slice_start_offset +* first slice offset is returned +* +* @return success or failure error code +* +****************************************************************************** +*/ +WORD32 ih264e_generate_slice_header(bitstrm_t *ps_bitstrm, + slice_header_t *ps_slice_hdr, + pps_t *ps_pps, + sps_t *ps_sps) +{ + + WORD32 return_status = IH264E_SUCCESS; + + /* Insert start code */ + return_status |= ih264e_put_nal_start_code_prefix(ps_bitstrm, 1); + + /* Insert Nal Unit Header */ + return_status |= ih264e_generate_nal_unit_header(ps_bitstrm, ps_slice_hdr->i1_nal_unit_type, ps_slice_hdr->i1_nal_unit_idc); + + /* first_mb_in_slice */ + PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->u2_first_mb_in_slice, return_status, "first_mb_in_slice"); + + /* slice_type */ + PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->u1_slice_type, return_status, "slice_type"); + + /* pic_parameter_set_id */ + PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->u1_pps_id, return_status, "pic_parameter_set_id"); + + /* frame_num */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->i4_frame_num, ps_sps->i1_log2_max_frame_num, 
return_status, "frame_num"); + + if (!ps_sps->i1_frame_mbs_only_flag) + { + /* field_pic_flag */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->i1_field_pic_flag, 1, return_status, "field_pic_flag"); + + if(ps_slice_hdr->i1_field_pic_flag) + { + /* bottom_field_flag */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->i1_bottom_field_flag, 1, return_status, "bottom_field_flag"); + } + } + + if (ps_slice_hdr->i1_nal_unit_type == 5) + { + /* u2_idr_pic_id */ + PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->u2_idr_pic_id, return_status, "u2_idr_pic_id"); + } + + if (ps_sps->i1_pic_order_cnt_type == 0) + { + /* pic_order_cnt_lsb */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->i4_pic_order_cnt_lsb, ps_sps->i1_log2_max_pic_order_cnt_lsb, return_status, "pic_order_cnt_lsb"); + + if(ps_pps->u1_pic_order_present_flag && !ps_slice_hdr->i1_field_pic_flag) + { + /* delta_pic_order_cnt_bottom */ + PUT_BITS_SEV(ps_bitstrm, ps_slice_hdr->i4_delta_pic_order_cnt_bottom, return_status, "delta_pic_order_cnt_bottom"); + } + } + + if (ps_sps->i1_pic_order_cnt_type == 1 && !ps_sps->i1_delta_pic_order_always_zero_flag) + { + /* delta_pic_order_cnt[0] */ + PUT_BITS_SEV(ps_bitstrm, ps_slice_hdr->ai4_delta_pic_order_cnt[0], return_status, "delta_pic_order_cnt[0]"); + + if (ps_pps->u1_pic_order_present_flag && !ps_slice_hdr->i1_field_pic_flag) + { + /* delta_pic_order_cnt[1] */ + PUT_BITS_SEV(ps_bitstrm, ps_slice_hdr->ai4_delta_pic_order_cnt[1], return_status, "delta_pic_order_cnt[1]"); + } + } + + if (ps_pps->i1_redundant_pic_cnt_present_flag) + { + /* redundant_pic_cnt */ + PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->u1_redundant_pic_cnt, return_status, "redundant_pic_cnt"); + } + + if (ps_slice_hdr->u1_slice_type == BSLICE) + { + /* direct_spatial_mv_pred_flag */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->u1_direct_spatial_mv_pred_flag, 1, return_status, "direct_spatial_mv_pred_flag"); + } + + if (ps_slice_hdr->u1_slice_type == PSLICE || ps_slice_hdr->u1_slice_type == SPSLICE || ps_slice_hdr->u1_slice_type == BSLICE) + { + /* 
num_ref_idx_active_override_flag */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->u1_num_ref_idx_active_override_flag, 1, return_status, "num_ref_idx_active_override_flag"); + + if (ps_slice_hdr->u1_num_ref_idx_active_override_flag) + { + /* num_ref_idx_l0_active_minus1 */ + PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->i1_num_ref_idx_l0_active - 1, return_status, "num_ref_idx_l0_active_minus1"); + } + if (ps_slice_hdr->u1_slice_type == BSLICE) + { + /* num_ref_idx_l1_active_minus1 */ + PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->i1_num_ref_idx_l1_active - 1, return_status, "num_ref_idx_l1_active_minus1"); + } + } + + /* ref_idx_reordering */ + /* TODO: ref_idx_reordering */ + if ((ps_slice_hdr->u1_slice_type != ISLICE) && (ps_slice_hdr->u1_slice_type != SISLICE)) + { + /* ref_pic_list_reordering_flag_l0 */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->u1_ref_idx_reordering_flag_l0, 1, return_status, "ref_pic_list_reordering_flag_l0"); + + if (ps_slice_hdr->u1_ref_idx_reordering_flag_l0) + { + + } + } + + if ((ps_pps->i1_weighted_pred_flag && + (ps_slice_hdr->u1_slice_type == PSLICE || ps_slice_hdr->u1_slice_type == SPSLICE)) || + (ps_slice_hdr->u1_weighted_bipred_idc == 1 && ps_slice_hdr->u1_slice_type == BSLICE)) + { + /* TODO_LATER: Currently there is no support for weighted prediction. 
+ This needs to be updated when the support is added */ + } + + if (ps_slice_hdr->i1_nal_unit_idc != 0) + { + if (ps_slice_hdr->i1_nal_unit_type == 5) + { + /* no_output_of_prior_pics_flag */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->u1_no_output_of_prior_pics_flag , 1, return_status, "no_output_of_prior_pics_flag "); + + /* long_term_reference_flag */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->u1_long_term_reference_flag , 1, return_status, "long_term_reference_flag "); + } + else + { + /* adaptive_ref_pic_marking_mode_flag */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->u1_adaptive_ref_pic_marking_mode_flag , 1, return_status, "adaptive_ref_pic_marking_mode_flag "); + + if (ps_slice_hdr->u1_adaptive_ref_pic_marking_mode_flag) + { + /* TODO: if the reference picture marking mode is adaptive + add these fields in the bit-stream */ + } + } + } + + if (ps_slice_hdr->u1_entropy_coding_mode_flag && ps_slice_hdr->u1_slice_type != ISLICE && + ps_slice_hdr->u1_slice_type != SISLICE) + { + /* cabac_init_idc */ + PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->i1_cabac_init_idc, return_status, "cabac_init_idc"); + } + + /* slice_qp_delta */ + PUT_BITS_SEV(ps_bitstrm, ps_slice_hdr->i1_slice_qp - ps_pps->i1_pic_init_qp, return_status, "slice_qp_delta"); + + if (ps_slice_hdr->u1_slice_type == SPSLICE || ps_slice_hdr->u1_slice_type == SISLICE) + { + if (ps_slice_hdr->u1_slice_type == SPSLICE) + { + /* sp_for_switch_flag */ + PUT_BITS(ps_bitstrm, ps_slice_hdr->u1_sp_for_switch_flag , 1, return_status, "sp_for_switch_flag"); + } + /* slice_qs_delta */ + PUT_BITS_SEV(ps_bitstrm, ps_slice_hdr->u1_slice_qs - ps_pps->i1_pic_init_qs, return_status, "slice_qs_delta"); + } + + if (ps_pps->i1_deblocking_filter_control_present_flag) + { + /* disable_deblocking_filter_idc */ + PUT_BITS_UEV(ps_bitstrm, ps_slice_hdr->u1_disable_deblocking_filter_idc, return_status, "disable_deblocking_filter_idc"); + + if(ps_slice_hdr->u1_disable_deblocking_filter_idc != 1) + { + /* slice_alpha_c0_offset_div2 */ + 
PUT_BITS_SEV(ps_bitstrm, ps_slice_hdr->i1_slice_alpha_c0_offset_div2, return_status, "slice_alpha_c0_offset_div2"); + + /* slice_beta_offset_div2 */ + PUT_BITS_SEV(ps_bitstrm, ps_slice_hdr->i1_slice_beta_offset_div2, return_status, "slice_beta_offset_div2"); + } + } + + if (ps_slice_hdr->u1_num_slice_groups_minus1 > 0 && + ps_pps->u1_slice_group_map_type >= 3 && + ps_pps->u1_slice_group_map_type <= 5) + { + /* slice_group_change_cycle */ + /* TODO_LATER: Currently the number of slice groups minus 1 is 0. + * If this is not the case, we have to add Slice group map type to the bit stream */ + } + + return return_status; +} + + + +/** +****************************************************************************** +* +* @brief Populates sps structure +* +* @par Description +* Populates sps structure for its use in header generation +* +* @param[in] ps_codec +* pointer to encoder context +* +* @param[out] ps_sps +* pointer to sps params that needs to be populated +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_populate_sps(codec_t *ps_codec, sps_t *ps_sps) +{ + /* active config parameters */ + cfg_params_t *ps_cfg = &(ps_codec->s_cfg); + +// /* level */ +// IH264_LEVEL_T level_idc; + + /* error_status */ + IH264E_ERROR_T i4_err_code = IH264E_FAIL; + + /* profile */ + /* + * Baseline profile supports, 8 bits per sample, 4:2:0 format, CAVLC. + * B frames are not allowed. Further, Flexible mb ordering, Redundant slices, Arbitrary slice ordering are supported. + * The constrained baseline profile is baseline profile minus ASO, FMO and redundant slices. + * To the constrained baseline profile if we add support for B slices, support for encoding interlaced frames, + * support for weighted prediction and introduce CABAC entropy coding then we have Main Profile. 
+ */ + if ((ps_cfg->u4_num_b_frames) || (ps_cfg->e_content_type != IV_PROGRESSIVE) || + (ps_cfg->u4_entropy_coding_mode == CABAC) || (ps_cfg->u4_weighted_prediction)) + { + ps_sps->u1_profile_idc = IH264_PROFILE_MAIN; + } + else + { + ps_sps->u1_profile_idc = IH264_PROFILE_BASELINE; + } + + /* level */ + ps_sps->u1_level_idc = ps_cfg->u4_max_level; +// i4_err_code = ih264e_get_level(ps_cfg, &level_idc); +// if (i4_err_code == IH264E_SUCCESS) +// { +// ps_sps->u1_level_idc = level_idc; +// +// } +// else +// { +// return i4_err_code; +// } + + /* constrained flags */ + /* + * baseline profile automatically implies set 0 flag + */ + ps_sps->u1_constraint_set0_flag = (ps_sps->u1_profile_idc == IH264_PROFILE_BASELINE); + /* + * main profile automatically implies set 1 flag + * Although the encoder says it supports Baseline profile it actually supports constrained + * baseline profile as ASO, FMO and redundant slices are not supported + */ + ps_sps->u1_constraint_set1_flag = (ps_sps->u1_profile_idc <= IH264_PROFILE_MAIN); + /* + * extended profile is not supported + */ + ps_sps->u1_constraint_set2_flag = 0x00; + /* + * level 1b or level 11 + */ + if (ps_sps->u1_level_idc == IH264_LEVEL_1B) + { + ps_sps->u1_constraint_set3_flag = 0; + ps_sps->u1_level_idc = IH264_LEVEL_11; + } + else + { + ps_sps->u1_constraint_set3_flag = 0; + } + + /* active sps id */ + ps_sps->u1_sps_id = ps_codec->i4_sps_id; + + if (ps_sps->u1_profile_idc >= IH264_PROFILE_HIGH) + { + /* chroma format idc */ + ps_sps->u1_chroma_format_idc = CHROMA_FMT_IDC_YUV420; + + /* residual_colour_transform_flag */ + ps_sps->i1_residual_colour_transform_flag = 0; + + /* luma bit depth 8 */ + ps_sps->i1_bit_depth_luma = 8; + + /* chroma bit depth 8 */ + ps_sps->i1_bit_depth_chroma = 8; + + /* qpprime_y_zero_transform_bypass_flag */ + ps_sps->i1_qpprime_y_zero_transform_bypass_flag = 0; + + /* seq_scaling_matrix_present_flag */ + ps_sps->i1_seq_scaling_matrix_present_flag = 0; + + if 
(ps_sps->i1_seq_scaling_matrix_present_flag) + { + /* TODO_LATER: Will be enabled once scaling list support is added */ + } + } + + /* log2_max_frame_num_minus4 */ + ps_sps->i1_log2_max_frame_num = 16; + + /* pic_order_cnt_type */ + ps_sps->i1_pic_order_cnt_type = 2; + + if(ps_cfg->u4_enable_alt_ref) + ps_sps->i1_pic_order_cnt_type = 0; + + /* log2_max_pic_order_cnt_lsb_minus4 */ + ps_sps->i1_log2_max_pic_order_cnt_lsb = 8; + + /* TODO : add support for other poc types */ + if (ps_sps->i1_pic_order_cnt_type == 0) + { + + } + else if (ps_sps->i1_pic_order_cnt_type == 1) + { + + } + + /* num_ref_frames */ + /* FIXME : Fix this hard coding */ + ps_sps->u1_max_num_ref_frames = 1; + + /* gaps_in_frame_num_value_allowed_flag */ + ps_sps->i1_gaps_in_frame_num_value_allowed_flag = 0; + + /* pic width in mb - 1 */ + ps_sps->i2_pic_width_in_mbs_minus1 = ps_cfg->i4_wd_mbs - 1; + + /* pic height in mb - 1 */ + ps_sps->i2_pic_height_in_map_units_minus1 = ps_cfg->i4_ht_mbs - 1;; + + /* frame_mbs_only_flag, no support for interlace encoding */ + ps_sps->i1_frame_mbs_only_flag = 1; + + /* mb_adaptive_frame_field_flag */ + if (ps_sps->i1_frame_mbs_only_flag == 0) + { + ps_sps->i1_mb_adaptive_frame_field_flag = 0; + } + + /* direct_8x8_inference_flag */ + ps_sps->i1_direct_8x8_inference_flag = 0; + + /* cropping params */ + /*NOTE : Cropping values depend on the chroma format + * For our case ,decoder interprets the cropping values as 2*num pixels + * Hence the difference in the disp width and width must be halved before sending + * to get the expected results + */ + ps_sps->i1_frame_cropping_flag = 0; + ps_sps->i2_frame_crop_left_offset = 0; + ps_sps->i2_frame_crop_right_offset = (ps_codec->s_cfg.u4_wd - ps_codec->s_cfg.u4_disp_wd)>>1; + ps_sps->i2_frame_crop_top_offset = 0; + ps_sps->i2_frame_crop_bottom_offset = (ps_codec->s_cfg.u4_ht - ps_codec->s_cfg.u4_disp_ht)>>1; + + if (ps_sps->i2_frame_crop_left_offset || + ps_sps->i2_frame_crop_right_offset || + 
ps_sps->i2_frame_crop_top_offset || + ps_sps->i2_frame_crop_bottom_offset) + { + ps_sps->i1_frame_cropping_flag = 1; + } + + /* vui params */ + ps_sps->i1_vui_parameters_present_flag = 0; + + if (ps_sps->i1_vui_parameters_present_flag) + { + /* populate vui params */ + } + + return i4_err_code; +} + +/** +****************************************************************************** +* +* @brief Populates pps structure +* +* @par Description +* Populates pps structure for its use in header generation +* +* @param[in] ps_codec +* pointer to encoder context +* +* @param[out] ps_pps +* pointer to pps params that needs to be populated +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_populate_pps(codec_t *ps_codec, pps_t *ps_pps) +{ + /* active config parameters */ + cfg_params_t *ps_cfg = &(ps_codec->s_cfg); + + /* seq_parameter_set_id */ + ps_pps->u1_sps_id = ps_codec->i4_sps_id; + + /* pic_parameter_set_id */ + ps_pps->u1_pps_id = ps_codec->i4_pps_id; + + /* entropy_coding_mode */ + ps_pps->u1_entropy_coding_mode_flag = ps_cfg->u4_entropy_coding_mode; + + /* pic_order_present_flag is unset for POC type 2 */ + ps_pps->u1_pic_order_present_flag = 0; + + /* Currently number of slice groups supported are 1 */ + ps_pps->u1_num_slice_groups = 1; + + if (ps_pps->u1_num_slice_groups - 1) + { + /* TODO_LATER: Currently the number of slice groups minus 1 is 0. + * If this is not the case, we have to add Slice group map type to the bit stream*/ + } + + /* number of reference frames for list 0 */ + /* FIXME : fix this hard coded value */ + ps_pps->i1_num_ref_idx_l0_default_active = 1; + + /* number of reference frames for list 1 */ + ps_pps->i1_num_ref_idx_l1_default_active = 1; + + /* weighted prediction for now is disabled */ + ps_pps->i1_weighted_pred_flag = 0; + ps_pps->i1_weighted_bipred_idc = 0; + + /* The intent is to not signal qp from pps. 
Rather send the same in slice headers */ + ps_pps->i1_pic_init_qp = 0; + + /* The intent is to not signal qp from pps. Rather send the same in slice headers */ + ps_pps->i1_pic_init_qs = 0; + + /* The intent is to not signal qp from pps. Rather send the same in slice headers */ + ps_pps->i1_chroma_qp_index_offset = 0; + + /* deblocking filter flags present in slice header */ + ps_pps->i1_deblocking_filter_control_present_flag = 1; + + /* constrained intra prediction */ + ps_pps->i1_constrained_intra_pred_flag = ps_cfg->u4_constrained_intra_pred; + + /* sending redundant slices is not supported for now */ + ps_pps->i1_redundant_pic_cnt_present_flag = 0; + + ps_pps->u1_slice_group_map_type = 0; + return IH264E_SUCCESS; +} + +/** +****************************************************************************** +* +* @brief Populates slice header structure +* +* @par Description +* Populates slice header structure for its use in header generation +* +* @param[in] ps_proc +* pointer to proc context +* +* @param[out] ps_slice_hdr +* pointer to slice header structure that needs to be populated +* +* @param[in] ps_pps +* pointer to pps params structure referred by the slice +* +* @param[in] ps_sps +* pointer to sps params referred by the pps +* +* @return success or failure error code +* +****************************************************************************** +*/ +WORD32 ih264e_populate_slice_header(process_ctxt_t *ps_proc, + slice_header_t *ps_slice_hdr, + pps_t *ps_pps, + sps_t *ps_sps) +{ + /* entropy context */ + entropy_ctxt_t *ps_entropy = &ps_proc->s_entropy; + + codec_t *ps_codec = ps_proc->ps_codec; + + if (ps_proc->ps_codec->u4_is_curr_frm_ref) + { + ps_slice_hdr->i1_nal_unit_idc = 3; + } + else + { + ps_slice_hdr->i1_nal_unit_idc = 0; + } + + /* start mb address */ + ps_slice_hdr->u2_first_mb_in_slice = ps_entropy->i4_mb_start_add; + + /* slice type */ + ps_slice_hdr->u1_slice_type = ps_proc->i4_slice_type; + + /* pic_parameter_set_id */ + 
ps_slice_hdr->u1_pps_id = ps_pps->u1_pps_id; + + /* Separate color plane flag is 0, + * hence the syntax element color_plane_id not included */ + + /* frame num */ + ps_slice_hdr->i4_frame_num = ps_proc->i4_frame_num; + + /* frame_mbs_only_flag, no support for interlace encoding */ + if (!ps_sps->i1_frame_mbs_only_flag) + { + ps_slice_hdr->i1_field_pic_flag = 0; + + if (ps_slice_hdr->i1_field_pic_flag) + { + ps_slice_hdr->i1_bottom_field_flag = 0; + } + } + + /* idr pic id */ + if (ps_proc->u4_is_idr) + { + ps_slice_hdr->u2_idr_pic_id = ps_proc->u4_idr_pic_id; + ps_slice_hdr->i1_nal_unit_type = 5; + } + else + { + ps_slice_hdr->i1_nal_unit_type = 1; + } + + if (ps_sps->i1_pic_order_cnt_type == 0) + { + + WORD32 val; + val = ps_codec->i4_coded_pic_cnt; + val %= (1 << ps_sps->i1_log2_max_pic_order_cnt_lsb); + ps_slice_hdr->i4_pic_order_cnt_lsb = val; + } + else if (ps_sps->i1_pic_order_cnt_type == 1) + { + + } + + if(0 == ps_slice_hdr->u2_first_mb_in_slice) + ps_codec->i4_coded_pic_cnt++; + + /* + * redundant slices are not currently supported. 
+ * Hence the syntax element redundant slice cnt is not initialized + */ + if (ps_pps->i1_redundant_pic_cnt_present_flag) + { + + } + + /* direct spatial mv pred flag */ + if (ps_proc->i4_slice_type == BSLICE) + { + + } + + if (ps_proc->i4_slice_type == PSLICE || ps_proc->i4_slice_type == SPSLICE || ps_proc->i4_slice_type == BSLICE) + { + /* num_ref_idx_active_override_flag */ + ps_slice_hdr->u1_num_ref_idx_active_override_flag = 0; + + if (ps_slice_hdr->u1_num_ref_idx_active_override_flag) + { + /* num_ref_idx_l0_active_minus1 */ + + if (ps_proc->i4_slice_type == BSLICE) + { + /* num_ref_idx_l1_active_minus1 */ + + } + } + } + + /* ref_idx_reordering */ + /* TODO: ref_idx_reordering */ + if ((ps_proc->i4_slice_type != ISLICE) && (ps_proc->i4_slice_type != SISLICE)) + { + /* ref_pic_list_reordering_flag_l0 */ + ps_slice_hdr->u1_ref_idx_reordering_flag_l0 = 0; + + if (ps_slice_hdr->u1_ref_idx_reordering_flag_l0) + { + + } + } + + if ((ps_pps->i1_weighted_pred_flag && + (ps_proc->i4_slice_type == PSLICE || ps_proc->i4_slice_type == SPSLICE)) || + (ps_slice_hdr->u1_weighted_bipred_idc == 1 && ps_proc->i4_slice_type == BSLICE)) + { + /* TODO_LATER: Currently there is no support for weighted prediction. 
+ This needs to be updated when the support is added */ + } + + if (ps_slice_hdr->i1_nal_unit_idc != 0) + { + if (ps_slice_hdr->i1_nal_unit_type == 5) + { + /* no_output_of_prior_pics_flag */ + ps_slice_hdr->u1_no_output_of_prior_pics_flag = 0; + + /* long_term_reference_flag */ + ps_slice_hdr->u1_long_term_reference_flag = 0; + } + else + { + /* adaptive_ref_pic_marking_mode_flag */ + ps_slice_hdr->u1_adaptive_ref_pic_marking_mode_flag = 0; + + if (ps_slice_hdr->u1_adaptive_ref_pic_marking_mode_flag) + { + /* TODO: if the reference picture marking mode is adaptive + add these fields in the bit-stream */ + } + } + } + + /* entropy coding mode flag */ + ps_slice_hdr->u1_entropy_coding_mode_flag = ps_entropy->u1_entropy_coding_mode_flag; + + if (ps_slice_hdr->u1_entropy_coding_mode_flag && ps_proc->i4_slice_type != ISLICE && + ps_proc->i4_slice_type != SISLICE) + { + /* cabac_init_idc */ + } + + /* slice qp */ + ps_slice_hdr->i1_slice_qp = ps_proc->u4_frame_qp; + + if (ps_proc->i4_slice_type == SPSLICE || ps_proc->i4_slice_type == SISLICE) + { + if (ps_proc->i4_slice_type == SPSLICE) + { + /* sp_for_switch_flag */ + } + /* slice_qs_delta */ + } + + if (ps_pps->i1_deblocking_filter_control_present_flag) + { + /* disable_deblocking_filter_idc */ + ps_slice_hdr->u1_disable_deblocking_filter_idc = ps_proc->u4_disable_deblock_level; + + if (ps_slice_hdr->u1_disable_deblocking_filter_idc != 1) + { + /* slice_alpha_c0_offset_div2 */ + ps_slice_hdr->i1_slice_alpha_c0_offset_div2 = 0; + + /* slice_beta_offset_div2 */ + ps_slice_hdr->i1_slice_beta_offset_div2 = 0; + } + } + ps_slice_hdr->u1_num_slice_groups_minus1 = 0; + if(ps_slice_hdr->u1_num_slice_groups_minus1 > 0 && + ps_pps->u1_slice_group_map_type >= 3 && + ps_pps->u1_slice_group_map_type <= 5) + { + /* slice_group_change_cycle */ + /* TODO_LATER: Currently the number of slice groups minus 1 is 0. 
+ * If this is not the case, we have to add Slice group map type to the bit stream */ + } + + return IH264E_SUCCESS; +} + +/** +****************************************************************************** +* +* @brief inserts FILLER Nal Unit. +* +* @par Description +* In constant bit rate rc mode, when the bits generated by the codec is +* underflowing the target bit rate, the encoder library inserts filler nal unit. +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] insert_fill_bytes +* Number of fill bytes to be inserted +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_add_filler_nal_unit(bitstrm_t *ps_bitstrm, + WORD32 insert_fill_bytes) +{ + WORD32 i4_num_words_to_fill, i4_words_filled; + + IH264E_ERROR_T return_status = IH264E_SUCCESS; + + /* Insert the NAL start code */ + return_status |= ih264e_put_nal_start_code_prefix(ps_bitstrm, 1); + + if (ps_bitstrm->u4_strm_buf_offset + insert_fill_bytes >= ps_bitstrm->u4_max_strm_size) + { + return (IH264E_BITSTREAM_BUFFER_OVERFLOW); + } + + /* Insert Nal Unit Header */ + PUT_BITS(ps_bitstrm, NAL_FILLER_FIRST_BYTE, 8, return_status, "filler_header"); + + PUT_BITS(ps_bitstrm, 0xFFFFFF, 24, return_status, "fill bytes"); + + /* Initializing Variables */ + i4_words_filled = 1; + + /****************************************************/ + /* Flooring the number of bytes for be stuffed to */ + /* WORD unit */ + /****************************************************/ + i4_num_words_to_fill = (insert_fill_bytes >> 2); + + /****************************************************/ + /* Reducing already 4 bytes filled. 
In case stuffing*/ + /* is <= 4 bytes, we are actually not stuffing */ + /* anything */ + /****************************************************/ + i4_num_words_to_fill -= i4_words_filled; + + while (i4_num_words_to_fill > 0) + { + /* Insert Nal Unit Header */ + PUT_BITS(ps_bitstrm, 0xFFFFFFFF, 32, return_status, "fill bytes"); + + i4_num_words_to_fill-- ; + } + + return_status |= ih264e_put_rbsp_trailing_bits(ps_bitstrm); + + return return_status; +} + diff --git a/encoder/ih264e_encode_header.h b/encoder/ih264e_encode_header.h new file mode 100755 index 0000000..acae5b6 --- /dev/null +++ b/encoder/ih264e_encode_header.h @@ -0,0 +1,278 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_encode_header.h +* +* @brief +* This file contains structures and interface prototypes for h264 bitstream +* header encoding +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_ENCODE_HEADER_H_ +#define IH264E_ENCODE_HEADER_H_ + +/*****************************************************************************/ +/* Function Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief Macro to put a code with specified number of bits into the + * bitstream +****************************************************************************** + */ +#define PUT_BITS(ps_bitstrm, code_val, code_len, ret_val, syntax_string) \ + ENTROPY_TRACE(syntax_string, code_val);\ + ret_val |= ih264e_put_bits((ps_bitstrm), (code_val), (code_len)) + +/** +****************************************************************************** + * @brief Macro to put a code with specified number of bits into the + * bitstream using 0th order exponential Golomb encoding for + * signed numbers +****************************************************************************** + */ +#define PUT_BITS_UEV(ps_bitstrm, code_val, ret_val, syntax_string) \ + ENTROPY_TRACE(syntax_string, code_val);\ + ret_val |= ih264e_put_uev((ps_bitstrm), (code_val)) + +/** +****************************************************************************** + * @brief Macro to put a code with specified number of bits into the + * bitstream using 0th order exponential Golomb encoding for + * signed numbers +****************************************************************************** + */ +#define PUT_BITS_SEV(ps_bitstrm, code_val, ret_val, syntax_string) \ + ENTROPY_TRACE(syntax_string, 
code_val);\ + ret_val |= ih264e_put_sev((ps_bitstrm), (code_val)) + + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief Generates SPS (Sequence Parameter Set) +* +* @par Description +* This function generates Sequence Parameter Set header as per the spec +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] ps_sps +* pointer to structure containing SPS data +* +* @return success or failure error code +* +****************************************************************************** +*/ +WORD32 ih264e_generate_sps + ( + bitstrm_t *ps_bitstrm, + sps_t *ps_sps + ); + +/** +****************************************************************************** +* +* @brief Generates PPS (Picture Parameter Set) +* +* @par Description +* Generate Picture Parameter Set as per Section 7.3.2.2 +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] ps_pps +* pointer to structure containing PPS data +* +* @return success or failure error code +* +****************************************************************************** +*/ +WORD32 ih264e_generate_pps + ( + bitstrm_t *ps_bitstrm, + pps_t *ps_pps, + sps_t *ps_sps + ); + +/** +****************************************************************************** +* +* @brief Generates Slice Header +* +* @par Description +* Generate Slice Header as per Section 7.3.5.1 +* +* @param[inout] ps_bitstrm +* pointer to bitstream context for generating slice header +* +* @param[in] ps_slice_hdr +* pointer to slice header params +* +* @param[in] ps_pps +* pointer to pps params referred by slice +* +* @param[in] ps_sps +* pointer to sps params referred by slice +* +* @param[out] ps_dup_bit_strm_ent_offset +* Bitstream struct to store bitstream 
state +* +* @param[out] pu4_first_slice_start_offset +* first slice offset is returned +* +* @return success or failure error code +* +****************************************************************************** +*/ +WORD32 ih264e_generate_slice_header + ( + bitstrm_t *ps_bitstrm, + slice_header_t *ps_slice_hdr, + pps_t *ps_pps, + sps_t *ps_sps + ); + +/** +****************************************************************************** +* +* @brief Populates sps structure +* +* @par Description +* Populates sps structure for its use in header generation +* +* @param[in] ps_codec +* pointer to encoder context +* +* @param[out] ps_sps +* pointer to sps params that needs to be populated +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_populate_sps + ( + codec_t *ps_codec, + sps_t *ps_sps + ); + +/** +****************************************************************************** +* +* @brief Populates pps structure +* +* @par Description +* Populates pps structure for its use in header generation +* +* @param[in] ps_codec +* pointer to encoder context +* +* @param[out] ps_pps +* pointer to pps params that needs to be populated +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_populate_pps + ( + codec_t *ps_codec, + pps_t *ps_pps + ); + + +/** +****************************************************************************** +* +* @brief Populates slice header structure +* +* @par Description +* Populates slice header structure for its use in header generation +* +* @param[in] ps_proc +* pointer to proc context +* +* @param[out] ps_slice_hdr +* pointer to slice header structure that needs to be populated +* +* @param[in] ps_pps +* pointer to pps params structure referred by the slice +* +* @param[in] ps_sps +* pointer to sps params referred by the pps +* +* @return 
success or failure error code +* +****************************************************************************** +*/ +WORD32 ih264e_populate_slice_header + ( + process_ctxt_t *ps_proc, + slice_header_t *ps_slice_hdr, + pps_t *ps_pps, + sps_t *ps_sps + ); + + +/** +****************************************************************************** +* +* @brief inserts FILLER Nal Unit. +* +* @par Description +* In constant bit rate rc mode, when the bits generated by the codec is +* underflowing the target bit rate, the encoder library inserts filler nal unit. +* +* @param[in] ps_bitstrm +* pointer to bitstream context (handle) +* +* @param[in] insert_fill_bytes +* Number of fill bytes to be inserted +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_add_filler_nal_unit + ( + bitstrm_t *ps_bitstrm, + WORD32 insert_fill_bytes + ); + + +#endif //IH264E_ENCODE_HEADER_H_ diff --git a/encoder/ih264e_error.h b/encoder/ih264e_error.h new file mode 100755 index 0000000..8fe9dac --- /dev/null +++ b/encoder/ih264e_error.h @@ -0,0 +1,229 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_error.h +* +* @brief +* Definitions related to error handling +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_ERROR_H_ +#define IH264E_ERROR_H_ + +/** +****************************************************************************** +* @brief If error is not IH264E_SUCCESS, stores ((1 << severity) | error) in +* out_status and returns ret_code from the enclosing function +* +* @remarks Expands to a bare if-block (no do/while wrapper) and uses its +* arguments unparenthesized. NOTE(review): take care when using it next to an +* else branch or with compound expressions as arguments +****************************************************************************** +*/ +#define SET_ERROR_ON_RETURN(error, severity, out_status, ret_code) \ + if (error != IH264E_SUCCESS) \ + {\ + out_status = ((1 << severity) | error);\ + return (ret_code);\ + } + + +/** +****************************************************************************** + * @brief Extended error code for each error in H264 encoder +****************************************************************************** + */ +typedef enum +{ + /* NOTE: the ive error codes ends at 0x80 */ + IVE_ERR_CODEC_EXTENSIONS = 0x80, + + /* bit stream error start */ + IH264E_BITSTREAM_ERROR_START = IVE_ERR_CODEC_EXTENSIONS, + + /* codec error start */ + IH264E_CODEC_ERROR_START = IH264E_BITSTREAM_ERROR_START + 0x10, + + /** no error */ + IH264E_SUCCESS = 0, + + /** bitstream init failure, buffer ptr not aligned to WORD (32bits) */ + IH264E_BITSTREAM_BUFPTR_ALIGN_FAIL = IH264E_BITSTREAM_ERROR_START + 0x01, + + /** bitstream init failure, buf size not multiple of WORD size (32bits) */ + IH264E_BITSTREAM_BUFSIZE_ALIGN_FAIL = IH264E_BITSTREAM_ERROR_START + 0x02, + + /** bitstream runtime failure, buf size limit exceeded during encode */ + IH264E_BITSTREAM_BUFFER_OVERFLOW = IH264E_BITSTREAM_ERROR_START + 0x03, + + /**width not set within supported limit */ + IH264E_WIDTH_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x01, + + /**height not set within supported limit */ + 
IH264E_HEIGHT_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x02, + + /**Unsupported number of reference pictures passed as an argument */ + IH264E_NUM_REF_UNSUPPORTED = IH264E_CODEC_ERROR_START + 0x03, + + /**Unsupported number of reorder pictures passed as an argument */ + IH264E_NUM_REORDER_UNSUPPORTED = IH264E_CODEC_ERROR_START + 0x04, + + /**codec level not supported */ + IH264E_CODEC_LEVEL_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x05, + + /**input chroma format not supported */ + IH264E_INPUT_CHROMA_FORMAT_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x06, + + /**recon chroma format not supported */ + IH264E_RECON_CHROMA_FORMAT_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x07, + + /**rate control option configured is not supported */ + IH264E_RATE_CONTROL_MODE_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x08, + + /**frame rate configured is not supported */ + IH264E_FRAME_RATE_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x09, + + /**bit rate configured is not supported */ + IH264E_BITRATE_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x0A, + + /**B frames not supported */ + IH264E_BFRAMES_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x0B, + + /**content type not supported */ + IH264E_CONTENT_TYPE_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x0C, + + /**unsupported horizontal search range */ + IH264E_HORIZONTAL_SEARCH_RANGE_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x0D, + + /**unsupported vertical search range */ + IH264E_VERTICAL_SEARCH_RANGE_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x0E, + + /**Unsupported slice type input */ + IH264E_SLICE_TYPE_INPUT_INVALID = IH264E_CODEC_ERROR_START + 0x0F, + + /**unsupported architecture type */ + IH264E_ARCH_TYPE_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x10, + + /**unsupported soc type */ + IH264E_SOC_TYPE_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x11, + + /**target frame rate exceeds source frame rate */ + IH264E_TGT_FRAME_RATE_EXCEEDS_SRC_FRAME_RATE = IH264E_CODEC_ERROR_START + 0x12, + + /**invalid force frame input 
*/ + IH264E_INVALID_FORCE_FRAME_INPUT = IH264E_CODEC_ERROR_START + 0x13, + + /**invalid me speed preset */ + IH264E_INVALID_ME_SPEED_PRESET = IH264E_CODEC_ERROR_START + 0x14, + + /**invalid encoder speed preset */ + IH264E_INVALID_ENC_SPEED_PRESET = IH264E_CODEC_ERROR_START + 0x15, + + /**invalid deblocking param */ + IH264E_INVALID_DEBLOCKING_TYPE_INPUT = IH264E_CODEC_ERROR_START + 0x16, + + /**invalid max qp */ + IH264E_INVALID_MAX_FRAME_QP = IH264E_CODEC_ERROR_START + 0x17, + + /**invalid min qp */ + IH264E_INVALID_MIN_FRAME_QP = IH264E_CODEC_ERROR_START + 0x18, + + /**invalid init qp */ + IH264E_INVALID_INIT_QP = IH264E_CODEC_ERROR_START + 0x19, + + /**version buffer size is insufficient */ + IH264E_CXA_VERS_BUF_INSUFFICIENT = IH264E_CODEC_ERROR_START + 0x1A, + + /**init not done */ + IH264E_INIT_NOT_DONE = IH264E_CODEC_ERROR_START + 0x1B, + + /**invalid AIR mode input */ + IH264E_INVALID_AIR_MODE = IH264E_CODEC_ERROR_START + 0x1C, + + /**invalid AIR refresh period */ + IH264E_INVALID_AIR_REFRESH_PERIOD = IH264E_CODEC_ERROR_START + 0x1D, + + /**Insufficient memory allocated for MV Bank */ + IH264E_INSUFFICIENT_MEM_MVBANK = IH264E_CODEC_ERROR_START + 0x1E, + + /**Insufficient memory allocated for picture buffers */ + IH264E_INSUFFICIENT_MEM_PICBUF = IH264E_CODEC_ERROR_START + 0x1F, + + /**Buffer manager error */ + IH264E_BUF_MGR_ERROR = IH264E_CODEC_ERROR_START + 0x20, + + /**No free MV Bank buffer available to store current pic */ + IH264E_NO_FREE_MVBANK = IH264E_CODEC_ERROR_START + 0x21, + + /**No free picture buffer available to store current pic */ + IH264E_NO_FREE_PICBUF = IH264E_CODEC_ERROR_START + 0x22, + + /**Invalid encoder operation mode */ + IH264E_INVALID_ENC_OPERATION_MODE = IH264E_CODEC_ERROR_START + 0x23, + + /**Invalid half pel option */ + IH264E_INVALID_HALFPEL_OPTION = IH264E_CODEC_ERROR_START + 0x24, + + /**Invalid quarter pel option */ + IH264E_INVALID_QPEL_OPTION = IH264E_CODEC_ERROR_START + 0x25, + + /**Invalid fast sad option */ + 
IH264E_INVALID_FAST_SAD_OPTION = IH264E_CODEC_ERROR_START + 0x26, + + /**Invalid intra 4x4 option */ + IH264E_INVALID_INTRA4x4_OPTION = IH264E_CODEC_ERROR_START + 0x27, + + /**Invalid intra frame interval */ + IH264E_INVALID_INTRA_FRAME_INTERVAL = IH264E_CODEC_ERROR_START + 0x28, + + /**Invalid idr frame interval */ + IH264E_INVALID_IDR_FRAME_INTERVAL = IH264E_CODEC_ERROR_START + 0x29, + + /**Invalid buffer delay */ + IH264E_INVALID_BUFFER_DELAY = IH264E_CODEC_ERROR_START + 0x2A, + + /**Invalid num cores */ + IH264E_INVALID_NUM_CORES = IH264E_CODEC_ERROR_START + 0x2B, + + /**profile not supported */ + IH264E_PROFILE_NOT_SUPPORTED = IH264E_CODEC_ERROR_START + 0x2C, + + /**Invalid slice param input */ + IH264E_SLICE_PARAM_INPUT_INVALID = IH264E_CODEC_ERROR_START + 0x2D, + + /**Invalid alt ref option */ + IH264E_INVALID_ALT_REF_OPTION = IH264E_CODEC_ERROR_START + 0x2E, + + /**No free picture buffer available to store recon pic */ + IH264E_NO_FREE_RECONBUF = IH264E_CODEC_ERROR_START + 0x2F, + + /**max failure error code to ensure enum is 32 bits wide */ + IH264E_FAIL = -1, + +}IH264E_ERROR_T; + + +#endif /* IH264E_ERROR_H_ */ diff --git a/encoder/ih264e_fmt_conv.c b/encoder/ih264e_fmt_conv.c new file mode 100755 index 0000000..393d6ca --- /dev/null +++ b/encoder/ih264e_fmt_conv.c @@ -0,0 +1,864 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_fmt_conv.c +* +* @brief +* Contains functions for format conversion or frame copy of output buffer +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_fmt_conv_420sp_to_rgb565() +* - ih264e_fmt_conv_420sp_to_rgba8888() +* - ih264e_fmt_conv_420sp_to_420sp() +* - ih264e_fmt_conv_420sp_to_420sp_swap_uv() +* - ih264e_fmt_conv_420sp_to_420p() +* - ih264e_fmt_conv_420p_to_420sp() +* - ih264e_fmt_conv_422i_to_420sp() +* - ih264e_fmt_conv() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +/* User Include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264e.h" +#include "ithread.h" +#include "ih264_defs.h" +#include "ih264_debug.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_error.h" +#include "ih264_buf_mgr.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include 
"ih264e_bitstream.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_fmt_conv.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +void ih264e_fmt_conv_420sp_to_rgb565(UWORD8 *pu1_y_src, + UWORD8 *pu1_uv_src, + UWORD16 *pu2_rgb_dst, + WORD32 wd, + WORD32 ht, + WORD32 src_y_strd, + WORD32 src_uv_strd, + WORD32 dst_strd, + WORD32 is_u_first) +{ + WORD16 i2_r, i2_g, i2_b; + UWORD32 u4_r, u4_g, u4_b; + WORD16 i2_i, i2_j; + UWORD8 *pu1_y_src_nxt; + UWORD16 *pu2_rgb_dst_NextRow; + + UWORD8 *pu1_u_src, *pu1_v_src; + + if (is_u_first) + { + pu1_u_src = (UWORD8 *) pu1_uv_src; + pu1_v_src = (UWORD8 *) pu1_uv_src + 1; + } + else + { + pu1_u_src = (UWORD8 *) pu1_uv_src + 1; + pu1_v_src = (UWORD8 *) pu1_uv_src; + } + + pu1_y_src_nxt = pu1_y_src + src_y_strd; + pu2_rgb_dst_NextRow = pu2_rgb_dst + dst_strd; + + for (i2_i = 0; i2_i < (ht >> 1); i2_i++) + { + for (i2_j = (wd >> 1); i2_j > 0; i2_j--) + { + i2_b = ((*pu1_u_src - 128) * COEFF4 >> 13); + i2_g = ((*pu1_u_src - 128) * COEFF2 + (*pu1_v_src - 128) * COEFF3) + >> 13; + i2_r = ((*pu1_v_src - 128) * COEFF1) >> 13; + + pu1_u_src += 2; + pu1_v_src += 2; + /* pixel 0 */ + /* B */ + u4_b = CLIP_U8(*pu1_y_src + i2_b); + u4_b >>= 3; + /* G */ + u4_g = CLIP_U8(*pu1_y_src + i2_g); + u4_g >>= 2; + /* R */ + u4_r = CLIP_U8(*pu1_y_src + i2_r); + u4_r >>= 3; + + pu1_y_src++; + *pu2_rgb_dst++ = ((u4_r << 11) | (u4_g << 5) | u4_b); + + /* pixel 1 */ + /* B */ + u4_b = CLIP_U8(*pu1_y_src + i2_b); + u4_b >>= 3; + /* G */ + u4_g = CLIP_U8(*pu1_y_src + i2_g); + u4_g >>= 2; + /* R */ + u4_r = CLIP_U8(*pu1_y_src + i2_r); + u4_r >>= 3; + + pu1_y_src++; + *pu2_rgb_dst++ = ((u4_r << 11) | (u4_g << 5) | u4_b); + + /* pixel 2 */ + /* B */ + u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b); + u4_b >>= 3; + /* G */ + 
u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g); + u4_g >>= 2; + /* R */ + u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r); + u4_r >>= 3; + + pu1_y_src_nxt++; + *pu2_rgb_dst_NextRow++ = ((u4_r << 11) | (u4_g << 5) | u4_b); + + /* pixel 3 */ + /* B */ + u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b); + u4_b >>= 3; + /* G */ + u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g); + u4_g >>= 2; + /* R */ + u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r); + u4_r >>= 3; + + pu1_y_src_nxt++; + *pu2_rgb_dst_NextRow++ = ((u4_r << 11) | (u4_g << 5) | u4_b); + + } + + pu1_u_src = pu1_u_src + src_uv_strd - wd; + pu1_v_src = pu1_v_src + src_uv_strd - wd; + + pu1_y_src = pu1_y_src + (src_y_strd << 1) - wd; + pu1_y_src_nxt = pu1_y_src_nxt + (src_y_strd << 1) - wd; + + pu2_rgb_dst = pu2_rgb_dst_NextRow - wd + dst_strd; + pu2_rgb_dst_NextRow = pu2_rgb_dst_NextRow + (dst_strd << 1) - wd; + } + +} + +void ih264e_fmt_conv_420sp_to_rgba8888(UWORD8 *pu1_y_src, + UWORD8 *pu1_uv_src, + UWORD32 *pu4_rgba_dst, + WORD32 wd, + WORD32 ht, + WORD32 src_y_strd, + WORD32 src_uv_strd, + WORD32 dst_strd, + WORD32 is_u_first) +{ + WORD16 i2_r, i2_g, i2_b; + UWORD32 u4_r, u4_g, u4_b; + WORD16 i2_i, i2_j; + UWORD8 *pu1_y_src_nxt; + UWORD32 *pu4_rgba_dst_NextRow; + UWORD8 *pu1_u_src, *pu1_v_src; + + if (is_u_first) + { + pu1_u_src = (UWORD8 *) pu1_uv_src; + pu1_v_src = (UWORD8 *) pu1_uv_src + 1; + } + else + { + pu1_u_src = (UWORD8 *) pu1_uv_src + 1; + pu1_v_src = (UWORD8 *) pu1_uv_src; + } + + pu1_y_src_nxt = pu1_y_src + src_y_strd; + + pu4_rgba_dst_NextRow = pu4_rgba_dst + dst_strd; + + for (i2_i = 0; i2_i < (ht >> 1); i2_i++) + { + for (i2_j = (wd >> 1); i2_j > 0; i2_j--) + { + i2_b = ((*pu1_u_src - 128) * COEFF4 >> 13); + i2_g = ((*pu1_u_src - 128) * COEFF2 + (*pu1_v_src - 128) * COEFF3) + >> 13; + i2_r = ((*pu1_v_src - 128) * COEFF1) >> 13; + + pu1_u_src += 2; + pu1_v_src += 2; + /* pixel 0 */ + /* B */ + u4_b = CLIP_U8(*pu1_y_src + i2_b); + /* G */ + u4_g = CLIP_U8(*pu1_y_src + i2_g); + /* R */ + u4_r = CLIP_U8(*pu1_y_src + i2_r); + + pu1_y_src++; 
+ *pu4_rgba_dst++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0)); + + /* pixel 1 */ + /* B */ + u4_b = CLIP_U8(*pu1_y_src + i2_b); + /* G */ + u4_g = CLIP_U8(*pu1_y_src + i2_g); + /* R */ + u4_r = CLIP_U8(*pu1_y_src + i2_r); + + pu1_y_src++; + *pu4_rgba_dst++ = ((u4_r << 16) | (u4_g << 8) | (u4_b << 0)); + + /* pixel 2 */ + /* B */ + u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b); + /* G */ + u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g); + /* R */ + u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r); + + pu1_y_src_nxt++; + *pu4_rgba_dst_NextRow++ = + ((u4_r << 16) | (u4_g << 8) | (u4_b << 0)); + + /* pixel 3 */ + /* B */ + u4_b = CLIP_U8(*pu1_y_src_nxt + i2_b); + /* G */ + u4_g = CLIP_U8(*pu1_y_src_nxt + i2_g); + /* R */ + u4_r = CLIP_U8(*pu1_y_src_nxt + i2_r); + + pu1_y_src_nxt++; + *pu4_rgba_dst_NextRow++ = + ((u4_r << 16) | (u4_g << 8) | (u4_b << 0)); + + } + + pu1_u_src = pu1_u_src + src_uv_strd - wd; + pu1_v_src = pu1_v_src + src_uv_strd - wd; + + pu1_y_src = pu1_y_src + (src_y_strd << 1) - wd; + pu1_y_src_nxt = pu1_y_src_nxt + (src_y_strd << 1) - wd; + + pu4_rgba_dst = pu4_rgba_dst_NextRow - wd + dst_strd; + pu4_rgba_dst_NextRow = pu4_rgba_dst_NextRow + (dst_strd << 1) - wd; + } + +} + +/** +******************************************************************************* +* +* @brief Function used for copying a 420SP buffer +* +* @par Description +* Copies ht rows of wd luma bytes, then ht >> 1 rows of wd interleaved chroma +* bytes, from the source planes to the destination planes +* +* @param[in] pu1_y_src +* Input Y pointer +* +* @param[in] pu1_uv_src +* Input UV pointer (UV is interleaved either in UV or VU format) +* +* @param[out] pu1_y_dst +* Output Y pointer +* +* @param[out] pu1_uv_dst +* Output UV pointer (UV is interleaved in the same format as that of input) +* +* @param[in] wd +* Width +* +* @param[in] ht +* Height +* +* @param[in] src_y_strd +* Input Y Stride +* +* @param[in] src_uv_strd +* Input UV stride +* +* @param[in] dst_y_strd +* Output Y stride +* +* @param[in] dst_uv_strd +* Output UV stride +* +* @returns None +* +* @remarks In case there is a need to perform 
partial frame copy then +* by passing appropriate source and destination pointers and appropriate +* values for wd and ht it can be done +* +******************************************************************************* +*/ +void ih264e_fmt_conv_420sp_to_420sp(UWORD8 *pu1_y_src, + UWORD8 *pu1_uv_src, + UWORD8 *pu1_y_dst, + UWORD8 *pu1_uv_dst, + WORD32 wd, + WORD32 ht, + WORD32 src_y_strd, + WORD32 src_uv_strd, + WORD32 dst_y_strd, + WORD32 dst_uv_strd) +{ + UWORD8 *pu1_src, *pu1_dst; + WORD32 num_rows, num_cols, src_strd, dst_strd; + WORD32 i; + + /* copy luma: ht rows of wd bytes each */ + pu1_src = (UWORD8 *) pu1_y_src; + pu1_dst = (UWORD8 *) pu1_y_dst; + + num_rows = ht; + num_cols = wd; + + src_strd = src_y_strd; + dst_strd = dst_y_strd; + + for (i = 0; i < num_rows; i++) + { + memcpy(pu1_dst, pu1_src, num_cols); + pu1_dst += dst_strd; + pu1_src += src_strd; + } + + /* copy interleaved U and V: ht >> 1 rows, each row still wd bytes wide */ + pu1_src = (UWORD8 *) pu1_uv_src; + pu1_dst = (UWORD8 *) pu1_uv_dst; + + num_rows = ht >> 1; + num_cols = wd; + + src_strd = src_uv_strd; + dst_strd = dst_uv_strd; + + for (i = 0; i < num_rows; i++) + { + memcpy(pu1_dst, pu1_src, num_cols); + pu1_dst += dst_strd; + pu1_src += src_strd; + } + return; +} + +/** +* @brief Copies a 420SP buffer like ih264e_fmt_conv_420sp_to_420sp, but +* exchanges the two bytes of every chroma pair while copying +*/ +void ih264e_fmt_conv_420sp_to_420sp_swap_uv(UWORD8 *pu1_y_src, + UWORD8 *pu1_uv_src, + UWORD8 *pu1_y_dst, + UWORD8 *pu1_uv_dst, + WORD32 wd, + WORD32 ht, + WORD32 src_y_strd, + WORD32 src_uv_strd, + WORD32 dst_y_strd, + WORD32 dst_uv_strd) +{ + UWORD8 *pu1_src, *pu1_dst; + WORD32 num_rows, num_cols, src_strd, dst_strd; + WORD32 i; + + /* copy luma */ + pu1_src = (UWORD8 *) pu1_y_src; + pu1_dst = (UWORD8 *) pu1_y_dst; + + num_rows = ht; + num_cols = wd; + + src_strd = src_y_strd; + dst_strd = dst_y_strd; + + for (i = 0; i < num_rows; i++) + { + memcpy(pu1_dst, pu1_src, num_cols); + pu1_dst += dst_strd; + pu1_src += src_strd; + } + + /* copy U and V */ + pu1_src = (UWORD8 *) pu1_uv_src; + pu1_dst = (UWORD8 *) pu1_uv_dst; + + num_rows = ht >> 1; + num_cols = wd; + + src_strd = src_uv_strd; + dst_strd = 
dst_uv_strd; + + for (i = 0; i < num_rows; i++) + { + WORD32 j; + for (j = 0; j < num_cols; j += 2) + { + pu1_dst[j + 0] = pu1_src[j + 1]; + pu1_dst[j + 1] = pu1_src[j + 0]; + } + pu1_dst += dst_strd; + pu1_src += src_strd; + } + return; +} +/** +******************************************************************************* +* +* @brief Converts a 420SP buffer to 420P +* +* @par Description +* Optionally copies the luma plane, then de-interleaves the semi-planar chroma +* plane into separate U and V planes +* +* @param[in] pu1_y_src +* Input Y pointer +* +* @param[in] pu1_uv_src +* Input interleaved chroma pointer +* +* @param[out] pu1_y_dst +* Output Y pointer (written only when disable_luma_copy is 0) +* +* @param[out] pu1_u_dst +* Output U pointer +* +* @param[out] pu1_v_dst +* Output V pointer +* +* @param[in] wd +* Width +* +* @param[in] ht +* Height +* +* @param[in] src_y_strd +* Input Y stride +* +* @param[in] src_uv_strd +* Input interleaved chroma stride +* +* @param[in] dst_y_strd +* Output Y stride +* +* @param[in] dst_uv_strd +* Output stride, applied to both the U and the V plane +* +* @param[in] is_u_first +* Non-zero when U is the first byte of each source chroma pair, zero when V is +* +* @param[in] disable_luma_copy +* When 0 the luma plane is copied; any other value skips the luma copy +* +* @returns None +* +* @remarks The chroma loop covers ht >> 1 rows of wd >> 1 samples per plane +* +******************************************************************************* +*/ +void ih264e_fmt_conv_420sp_to_420p(UWORD8 *pu1_y_src, + UWORD8 *pu1_uv_src, + UWORD8 *pu1_y_dst, + UWORD8 *pu1_u_dst, + UWORD8 *pu1_v_dst, + WORD32 wd, + WORD32 ht, + WORD32 src_y_strd, + WORD32 src_uv_strd, + WORD32 dst_y_strd, + WORD32 dst_uv_strd, + WORD32 is_u_first, + WORD32 disable_luma_copy) +{ + UWORD8 *pu1_src, *pu1_dst; + UWORD8 *pu1_u_src, *pu1_v_src; + WORD32 num_rows, num_cols, src_strd, dst_strd; + WORD32 i, j; + + if (0 == disable_luma_copy) + { + /* copy luma */ + pu1_src = (UWORD8 *) pu1_y_src; + pu1_dst = (UWORD8 *) pu1_y_dst; + + num_rows = ht; + num_cols = wd; + + src_strd = src_y_strd; + dst_strd = dst_y_strd; + + for (i = 0; i < num_rows; i++) + { + memcpy(pu1_dst, pu1_src, num_cols); + pu1_dst += dst_strd; + pu1_src += src_strd; + } + } + /* de-interleave U and V and copy to destination; the two source pointers are one byte apart and both step by 2 */ + if (is_u_first) + { + pu1_u_src = (UWORD8 *) pu1_uv_src; + pu1_v_src = (UWORD8 *) pu1_uv_src + 1; + } + else + { + pu1_u_src = (UWORD8 *) pu1_uv_src + 1; + pu1_v_src = (UWORD8 *) pu1_uv_src; + } + + num_rows = ht >> 1; + num_cols = wd >> 1; + + src_strd = src_uv_strd; + dst_strd = dst_uv_strd; + + for (i = 0; i < num_rows; i++) + { + for (j = 0; j < num_cols; j++) + { + pu1_u_dst[j] = pu1_u_src[j * 2]; + pu1_v_dst[j] = pu1_v_src[j * 2]; + } + + pu1_u_dst += dst_strd; + pu1_v_dst += dst_strd; + pu1_u_src += src_strd; + pu1_v_src += src_strd; + } + return; +} + +/** +******************************************************************************* +* +* @brief Function used to perform color space conversion from 420P to 420SP +* +* @par Description +* Function used to perform color space conversion from 420P to 420SP +* +* @param[in] pu1_y_src +* Input Y pointer +* +* @param[in] pu1_u_src +* Input U pointer +* +* 
@param[in] pu1_v_dst +* Input V pointer +* +* @param[in] pu1_y_dst +* Output Y pointer +* +* @param[in] pu1_uv_dst +* Output UV pointer +* +* @param[in] u4_width +* Width +* +* @param[in] u4_height +* Height +* +* @param[in] src_y_strd +* Input Y Stride +* +* @param[in] src_u_strd +* Input U stride +* +* @param[in] src_v_strd +* Input V stride +* +* @param[in] dst_y_strd +* Output Y stride +* +* @param[in] dst_uv_strd +* Output UV stride +* +* @param[in] convert_uv_only +* Flag to indicate if only UV copy needs to be done +* +* @returns none +* +* @remarks In case there is a need to perform partial frame copy then +* by passion appropriate source and destination pointers and appropriate +* values for wd and ht it can be done +* +******************************************************************************* +*/ +void ih264e_fmt_conv_420p_to_420sp(UWORD8 *pu1_y_src, + UWORD8 *pu1_u_src, + UWORD8 *pu1_v_src, + UWORD8 *pu1_y_dst, + UWORD8 *pu1_uv_dst, + UWORD16 u2_height, + UWORD16 u2_width, + UWORD16 src_y_strd, + UWORD16 src_u_strd, + UWORD16 src_v_strd, + UWORD16 dst_y_strd, + UWORD16 dst_uv_strd, + UWORD32 convert_uv_only) +{ + UWORD8 *pu1_src, *pu1_dst; + UWORD8 *pu1_src_u, *pu1_src_v; + UWORD16 i; + UWORD32 u2_width_uv; + UWORD32 dest_inc_Y = 0, dest_inc_UV = 0; + + dest_inc_UV = dst_uv_strd; + + if (0 == convert_uv_only) + { + + /* Copy Y buffer */ + pu1_dst = (UWORD8 *) pu1_y_dst; + pu1_src = (UWORD8 *) pu1_y_src; + + dest_inc_Y = dst_y_strd; + + for (i = 0; i < u2_height; i++) + { + memcpy((void *) pu1_dst, (void *) pu1_src, u2_width); + pu1_dst += dest_inc_Y; + pu1_src += src_y_strd; + } + } + + /* Interleave Cb and Cr buffers */ + pu1_src_u = pu1_u_src; + pu1_src_v = pu1_v_src; + pu1_dst = pu1_uv_dst; + + u2_height = (u2_height + 1) >> 1; + u2_width_uv = (u2_width + 1) >> 1; + for (i = 0; i < u2_height; i++) + { + UWORD32 j; + for (j = 0; j < u2_width_uv; j++) + { + *pu1_dst++ = *pu1_src_u++; + *pu1_dst++ = *pu1_src_v++; + } + + pu1_dst += dest_inc_UV - 
u2_width; + pu1_src_u += src_u_strd - u2_width_uv; + pu1_src_v += src_v_strd - u2_width_uv; + } +} + +/** +******************************************************************************* +* +* @brief Function used to convert 422 interleaved to 420sp +* +* @par Description +* Function used to convert 422 interleaved to 420sp +* +* @param[in] pu1_y_buf +* Output Y pointer +* +* @param[in] pu1_u_buf +* Output u pointer +* +* @param[in[ pu1_v_buf +* Output V pointer +* +* @param[in] pu1_422i_buf +* Input 422i pointer +* +* @param[in] u4_y_width +* Width of Y component +* +* @param[in] u4_y_height +* Height of Y component +* +* @param[in] u4_y_stride +* Stride of pu1_y_buf +* +* @param[in] u4_u_stride +* Stride of pu1_u_buf +* +* @param[in] u4_v_stride +* Stride of pu1_v_buf +* +* @param[in] u4_422i_stride +* Stride of pu1_422i_buf +* +* @returns None +* +* @remarks For conversion +* pu1_v_buf = pu1_u_buf+1 +* u4_u_stride = u4_v_stride +* +* The extra parameters are for maintaining API with assembly function +* +******************************************************************************* +*/ +void ih264e_fmt_conv_422i_to_420sp(UWORD8 *pu1_y_buf, + UWORD8 *pu1_u_buf, + UWORD8 *pu1_v_buf, + UWORD8 *pu1_422i_buf, + WORD32 u4_y_width, + WORD32 u4_y_height, + WORD32 u4_y_stride, + WORD32 u4_u_stride, + WORD32 u4_v_stride, + WORD32 u4_422i_stride) +{ + WORD32 row, col; + UWORD8 *row_even_422 = pu1_422i_buf; + UWORD8 *row_odd_422 = row_even_422 + (u4_422i_stride << 1); + UWORD8 *row_even_luma = pu1_y_buf; + /* Since at the end of loop, we have row_even_luma += (luma_width << 1), + * it should be same here right? 
*/ + UWORD8 *row_odd_luma = row_even_luma + u4_y_stride; + UWORD8 *row_cb = pu1_u_buf; + UWORD8 *row_cr = pu1_v_buf; + + for (row = 0; row < u4_y_height; row = row + 2) + { + for (col = 0; col < (u4_y_width << 1); col = col + 4) + { + UWORD8 cb_even = row_even_422[col]; + UWORD8 cr_even = row_even_422[col + 2]; + + row_cb[col >> 1] = cb_even; + row_cr[col >> 1] = cr_even; + + row_even_luma[col >> 1] = row_even_422[col + 1]; + row_even_luma[(col >> 1) + 1] = row_even_422[col + 3]; + + row_odd_luma[col >> 1] = row_odd_422[col + 1]; + row_odd_luma[(col >> 1) + 1] = row_odd_422[col + 3]; + } + + row_even_422 += (u4_422i_stride << 2); + row_odd_422 += (u4_422i_stride << 2); + + row_even_luma += (u4_y_stride << 1); + row_odd_luma += (u4_y_stride << 1); + + row_cb += u4_u_stride; + row_cr += u4_v_stride; + } +} + +/** +******************************************************************************* +* +* @brief Function used for format conversion or frame copy +* +* @par Description +* Function used for copying or converting a reference frame to display buffer +* in non shared mode +* +* @param[in] ps_codec +* Codec context; supplies the recon stride (i4_rec_strd), the display width +* (s_cfg.u4_disp_wd) and the color formats that select the conversion +* +* @param[in] ps_pic +* Picture buffer whose pu1_luma and pu1_chroma planes are the source +* +* @param[out] pu1_y_dst +* Output Y pointer +* +* @param[out] pu1_u_dst +* Output U/UV pointer ( UV is interleaved in the same format as that of input) +* +* @param[out] pu1_v_dst +* Output V pointer ( used in 420P output case) +* +* @param[in] u4_dst_y_strd +* Stride of destination Y buffer +* +* @param[in] u4_dst_uv_strd +* Stride of destination U/V buffer +* +* @param[in] cur_row +* First luma row to process; source and destination pointers are offset by +* cur_row rows (cur_row / 2 rows for chroma) +* +* @param[in] num_rows +* Number of luma rows to process; the function returns immediately when it is 0 +* +* @returns error status (always IH264E_SUCCESS in the code below) +* +* @remarks +* The signature has no blocking parameter, although earlier documentation +* described one. +* Assumes that the stride of U and V buffers are same. 
+* This is correct in most cases +* If a case comes where this is not true we need to modify the fmt conversion +* functions called inside also +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_fmt_conv(codec_t *ps_codec, + pic_buf_t *ps_pic, + UWORD8 *pu1_y_dst, + UWORD8 *pu1_u_dst, + UWORD8 *pu1_v_dst, + UWORD32 u4_dst_y_strd, + UWORD32 u4_dst_uv_strd, + WORD32 cur_row, + WORD32 num_rows) +{ + IH264E_ERROR_T ret = IH264E_SUCCESS; + UWORD8 *pu1_y_src, *pu1_uv_src; + UWORD8 *pu1_y_dst_tmp, *pu1_uv_dst_tmp; + UWORD8 *pu1_u_dst_tmp, *pu1_v_dst_tmp; + UWORD16 *pu2_rgb_dst_tmp; + UWORD32 *pu4_rgb_dst_tmp; + WORD32 is_u_first; + UWORD8 *pu1_luma; + UWORD8 *pu1_chroma; + WORD32 dst_stride, wd; + + /* nothing to convert */ + if (0 == num_rows) + return ret; + + pu1_luma = ps_pic->pu1_luma; + pu1_chroma = ps_pic->pu1_chroma; + + + dst_stride = ps_codec->s_cfg.u4_wd; + wd = ps_codec->s_cfg.u4_disp_wd; + is_u_first = (IV_YUV_420SP_UV == ps_codec->e_codec_color_format) ? 
1 : 0; + + /* In case of 420P output luma copy is disabled for shared mode. NOTE(review): the 420P call below passes disable_luma_copy = 0, so luma is always copied here */ + { + pu1_y_src = pu1_luma + cur_row * ps_codec->i4_rec_strd; + pu1_uv_src = pu1_chroma + (cur_row / 2) * ps_codec->i4_rec_strd; + + /* NOTE(review): the two RGB destination pointers are computed but not passed to any call in this function */ pu2_rgb_dst_tmp = (UWORD16 *) pu1_y_dst; + pu2_rgb_dst_tmp += cur_row * dst_stride; + pu4_rgb_dst_tmp = (UWORD32 *) pu1_y_dst; + pu4_rgb_dst_tmp += cur_row * dst_stride; + + pu1_y_dst_tmp = pu1_y_dst + cur_row * u4_dst_y_strd; + pu1_uv_dst_tmp = pu1_u_dst + (cur_row / 2) * u4_dst_uv_strd; + pu1_u_dst_tmp = pu1_u_dst + (cur_row / 2) * u4_dst_uv_strd; + pu1_v_dst_tmp = pu1_v_dst + (cur_row / 2) * u4_dst_uv_strd; + + /* The zero-row early return is at the top of the function; there is no blocking flag here */ + /* In non-shared mode, reference buffers are in 420SP UV format, + * if output also is in 420SP_UV, then just copy + * if output is in 420SP_VU then swap UV values + * NOTE(review): both 420SP_UV and 420SP_VU take the plain copy path below; + * no swap is performed in this function - confirm this is intended + */ + if ((IV_YUV_420SP_UV == ps_codec->s_cfg.e_recon_color_fmt) || + (IV_YUV_420SP_VU == ps_codec->s_cfg.e_recon_color_fmt)) + { + ih264e_fmt_conv_420sp_to_420sp(pu1_y_src, pu1_uv_src, pu1_y_dst_tmp, + pu1_uv_dst_tmp, wd, num_rows, + ps_codec->i4_rec_strd, + ps_codec->i4_rec_strd, u4_dst_y_strd, + u4_dst_uv_strd); + } + else if (IV_YUV_420P == ps_codec->s_cfg.e_recon_color_fmt) + { + ih264e_fmt_conv_420sp_to_420p(pu1_y_src, pu1_uv_src, pu1_y_dst_tmp, + pu1_u_dst_tmp, pu1_v_dst_tmp, wd, + num_rows, ps_codec->i4_rec_strd, + ps_codec->i4_rec_strd, u4_dst_y_strd, + u4_dst_uv_strd, is_u_first, 0); + } + } + return(ret); +} + diff --git a/encoder/ih264e_fmt_conv.h b/encoder/ih264e_fmt_conv.h new file mode 100755 index 0000000..6b33bf0 --- /dev/null +++ b/encoder/ih264e_fmt_conv.h @@ -0,0 +1,142 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_fmt_conv.h +* +* @brief +* The file contains extern declarations of color space conversion routines +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_FMT_CONV_H_ +#define IH264E_FMT_CONV_H_ + +#define COEFF1 13073 +#define COEFF2 -3207 +#define COEFF3 -6664 +#define COEFF4 16530 + +IH264E_ERROR_T ih264e_fmt_conv(codec_t *ps_codec, + pic_buf_t *ps_pic, + UWORD8 *pu1_y_dst, + UWORD8 *pu1_u_dst, + UWORD8 *pu1_v_dst, + UWORD32 u4_dst_y_strd, + UWORD32 u4_dst_uv_strd, + WORD32 cur_row, + WORD32 num_rows); + +typedef void ih264e_fmt_conv_420sp_to_rgba8888_ft(UWORD8 *pu1_y_src, + UWORD8 *pu1_uv_src, + UWORD32 *pu4_rgba_dst, + WORD32 wd, + WORD32 ht, + WORD32 src_y_strd, + WORD32 src_uv_strd, + WORD32 dst_strd, + WORD32 is_u_first); + +typedef void ih264e_fmt_conv_420sp_to_rgb565_ft(UWORD8 *pu1_y_src, + UWORD8 *pu1_uv_src, + UWORD16 *pu2_rgb_dst, + WORD32 wd, + WORD32 ht, + WORD32 src_y_strd, + WORD32 src_uv_strd, + WORD32 dst_strd, + WORD32 is_u_first); + +typedef void ih264e_fmt_conv_420sp_to_420sp_ft(UWORD8 *pu1_y_src, + UWORD8 *pu1_uv_src, + UWORD8 *pu1_y_dst, + UWORD8 *pu1_uv_dst, + WORD32 wd, + WORD32 ht, + WORD32 src_y_strd, + WORD32 src_uv_strd, + WORD32 dst_y_strd, + WORD32 
dst_uv_strd); + +typedef void ih264e_fmt_conv_420sp_to_420p_ft(UWORD8 *pu1_y_src, + UWORD8 *pu1_uv_src, + UWORD8 *pu1_y_dst, + UWORD8 *pu1_u_dst, + UWORD8 *pu1_v_dst, + WORD32 wd, + WORD32 ht, + WORD32 src_y_strd, + WORD32 src_uv_strd, + WORD32 dst_y_strd, + WORD32 dst_uv_strd, + WORD32 is_u_first, + WORD32 disable_luma_copy); + +typedef void ih264e_fmt_conv_420p_to_420sp_ft(UWORD8 *pu1_y_src, UWORD8 *pu1_u_src, UWORD8 *pu1_v_src, + UWORD8 *pu1_y_dst, UWORD8 *pu1_uv_dst, + UWORD16 u2_height, UWORD16 u2_width, UWORD16 src_y_strd, + UWORD16 src_u_strd, UWORD16 src_v_strd, + UWORD16 dst_y_strd, UWORD16 dst_uv_strd, + UWORD32 convert_uv_only); + +typedef void ih264e_fmt_conv_422i_to_420sp_ft(UWORD8 *pu1_y_buf,UWORD8 *pu1_u_buf,UWORD8 *pu1_v_buf, + UWORD8 *pu1_422i_buf, + WORD32 u4_y_width,WORD32 u4_y_height, + WORD32 u4_y_stride,WORD32 u4_u_stride,WORD32 u4_v_stride, + WORD32 u4_422i_stride); + + +/* C function declarations */ +ih264e_fmt_conv_420sp_to_rgba8888_ft ih264e_fmt_conv_420sp_to_rgba8888; +ih264e_fmt_conv_420sp_to_rgb565_ft ih264e_fmt_conv_420sp_to_rgb565; +ih264e_fmt_conv_420sp_to_420sp_ft ih264e_fmt_conv_420sp_to_420sp; +ih264e_fmt_conv_420sp_to_420p_ft ih264e_fmt_conv_420sp_to_420p; +ih264e_fmt_conv_420p_to_420sp_ft ih264e_fmt_conv_420p_to_420sp; +ih264e_fmt_conv_422i_to_420sp_ft ih264e_fmt_conv_422i_to_420sp; + +/* A9Q function declarations */ +ih264e_fmt_conv_420sp_to_rgba8888_ft ih264e_fmt_conv_420sp_to_rgba8888_a9q; +ih264e_fmt_conv_420sp_to_420sp_ft ih264e_fmt_conv_420sp_to_420sp_a9q; +ih264e_fmt_conv_420sp_to_420p_ft ih264e_fmt_conv_420sp_to_420p_a9q; +ih264e_fmt_conv_420p_to_420sp_ft ih264e_fmt_conv_420p_to_420sp_a9q; +ih264e_fmt_conv_422i_to_420sp_ft ih264e_fmt_conv_422i_to_420sp_a9q; + + +/* A9A function declarations */ +ih264e_fmt_conv_420sp_to_rgba8888_ft ih264e_fmt_conv_420sp_to_rgba8888_a9a; +ih264e_fmt_conv_420sp_to_420sp_ft ih264e_fmt_conv_420sp_to_420sp_a9a; +ih264e_fmt_conv_420sp_to_420p_ft ih264e_fmt_conv_420sp_to_420p_a9a; + +/* SSSe31 
function declarations */ +ih264e_fmt_conv_420sp_to_420p_ft ih264e_fmt_conv_420sp_to_420p_ssse31; + +/* SSE4 function declarations */ +ih264e_fmt_conv_420sp_to_420p_ft ih264e_fmt_conv_420sp_to_420p_sse42; + +#endif /* IH264E_FMT_CONV_H_ */ diff --git a/encoder/ih264e_function_selector_generic.c b/encoder/ih264e_function_selector_generic.c new file mode 100755 index 0000000..65f943a --- /dev/null +++ b/encoder/ih264e_function_selector_generic.c @@ -0,0 +1,259 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_function_selector_generic.c +* +* @brief +* Contains functions to initialize function pointers of codec context +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_init_function_ptr_generic +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + + +/* System Include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264e_defs.h" +#include "ih264e_structs.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_core_coding.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264_padding.h" +#include "ih264e_intra_modes_eval.h" +#include "ih264_mem_fns.h" +#include "ih264e_fmt_conv.h" +#include "ih264e_half_pel.h" + + 
+/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_generic(codec_t *ps_codec) +{ + WORD32 i = 0; + + /* curr proc ctxt */ + process_ctxt_t *ps_proc = NULL; + me_ctxt_t *ps_me_ctxt = NULL; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 16x16 */ + ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert; + ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz; + ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc; + ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 4x4 */ + ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert; + ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz; + ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc; + ps_codec->apf_intra_pred_4_l[3] = ih264_intra_pred_luma_4x4_mode_diag_dl; + ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr; + ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r; + ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d; + ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l; + ps_codec->apf_intra_pred_4_l[8] = 
ih264_intra_pred_luma_4x4_mode_horz_u; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 8x8 */ + ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert; + ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc; + ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl; + ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr; + ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r; + ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d; + ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l; + ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u; + + /* Init function pointers for intra pred leaf level functions chroma + * Intra 8x8 */ + ps_codec->apf_intra_pred_c[0] = ih264_intra_pred_chroma_8x8_mode_dc; + ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz; + ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert; + ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane; + + /* Init luma forward transform fn ptr */ + ps_codec->pf_resi_trans_quant_8x8 = ih264_resi_trans_quant_8x8; + ps_codec->pf_resi_trans_quant_4x4 = ih264_resi_trans_quant_4x4; + ps_codec->pf_resi_trans_quant_chroma_4x4 = ih264_resi_trans_quant_chroma_4x4; + ps_codec->pf_hadamard_quant_4x4 = ih264_hadamard_quant_4x4; + ps_codec->pf_hadamard_quant_2x2_uv = ih264_hadamard_quant_2x2_uv; + + /* Init inverse transform fn ptr */ + ps_codec->pf_iquant_itrans_recon_8x8 = ih264_iquant_itrans_recon_8x8; + ps_codec->pf_iquant_itrans_recon_4x4 = ih264_iquant_itrans_recon_4x4; + ps_codec->pf_iquant_itrans_recon_4x4_dc = ih264_iquant_itrans_recon_4x4_dc; + ps_codec->pf_iquant_itrans_recon_chroma_4x4 = ih264_iquant_itrans_recon_chroma_4x4; + ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc = ih264_iquant_itrans_recon_chroma_4x4_dc; + + ps_codec->pf_ihadamard_scaling_4x4 = 
ih264_ihadamard_scaling_4x4; + ps_codec->pf_ihadamard_scaling_2x2_uv = ih264_ihadamard_scaling_2x2_uv; + ps_codec->pf_interleave_copy = ih264_interleave_copy; + + /* Init fn ptr luma core coding */ + ps_codec->luma_energy_compaction[0] = ih264e_code_luma_intra_macroblock_16x16; + ps_codec->luma_energy_compaction[1] = ih264e_code_luma_intra_macroblock_4x4; + ps_codec->luma_energy_compaction[3] = ih264e_code_luma_inter_macroblock_16x16; + + /* Init fn ptr chroma core coding */ + ps_codec->chroma_energy_compaction[0] = ih264e_code_chroma_intra_macroblock_8x8; + ps_codec->chroma_energy_compaction[1] = ih264e_code_chroma_inter_macroblock_8x8; + + /* Init fn ptr luma deblocking */ + ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4; + ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4; + ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4; + ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4; + + /* Init fn ptr chroma deblocking */ + ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4; + ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4; + ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4; + ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4; + + /* write mb syntax layer */ + ps_codec->pf_write_mb_syntax_layer[ISLICE] = ih264e_write_islice_mb; + ps_codec->pf_write_mb_syntax_layer[PSLICE] = ih264e_write_pslice_mb; + + /* Padding Functions */ + ps_codec->pf_pad_top = ih264_pad_top; + ps_codec->pf_pad_bottom = ih264_pad_bottom; + ps_codec->pf_pad_left_luma = ih264_pad_left_luma; + ps_codec->pf_pad_left_chroma = ih264_pad_left_chroma; + ps_codec->pf_pad_right_luma = ih264_pad_right_luma; + ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma; + + /* Inter pred leaf level functions */ + ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy; + ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz; + ps_codec->pf_inter_pred_luma_vert = 
ih264_inter_pred_luma_vert; + ps_codec->pf_inter_pred_luma_bilinear = ih264_inter_pred_luma_bilinear; + ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma; + + /* sad me level functions */ + ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16; + ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast; + ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8; + + /* memory handling operations */ + ps_codec->pf_mem_cpy = ih264_memcpy; + ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8; + ps_codec->pf_mem_set = ih264_memset; + ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8; + + /* sad me level functions */ + for (i = 0; i < (MAX_PROCESS_CTXT); i++) + { + ps_proc = &ps_codec->as_process[i]; + + ps_me_ctxt = &ps_proc->s_me_ctxt; + ps_me_ctxt->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16; + ps_me_ctxt->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast; + ps_me_ctxt->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8; + ps_me_ctxt->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog; + ps_me_ctxt->pf_ime_compute_sad3_diamond = ime_calculate_sad3_prog; + ps_me_ctxt->pf_ime_compute_sad2_diamond = ime_calculate_sad2_prog; + ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16; + ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter; + } + + /* intra mode eval -encoder level function */ + ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes; + ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes; + ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes; + + /* csc */ + ps_codec->pf_ih264e_conv_420p_to_420sp = ih264e_fmt_conv_420p_to_420sp; + ps_codec->pf_ih264e_fmt_conv_422i_to_420sp = ih264e_fmt_conv_422i_to_420sp; + + /* Halp pel generation function - encoder level*/ + ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz; + ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = 
ih264e_sixtap_filter_2dvh_vert; + + return; +} diff --git a/encoder/ih264e_globals.c b/encoder/ih264e_globals.c new file mode 100755 index 0000000..e2b46a4 --- /dev/null +++ b/encoder/ih264e_globals.c @@ -0,0 +1,261 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_globals.c +* +* @brief +* Contains definitions of global variables used across the encoder +* +* @author +* ittiam +* +* @par List of functions +* +* +* @remarks +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "ih264e_defs.h" +#include "ih264e_globals.h" + +/*****************************************************************************/ +/* Extern global definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* @brief lamda for varying quantizer scales that would be used to +* compute the RD cost while deciding on the MB modes. +* input : qp +* output : lambda +* @remarks lambda = 0.85 * pow(2, (qp - 12)/3), when SSD is used as metric +* for computing distortion (Bit rate estimation for cost function of H.264/ +* AVC by Mohd Golam Sarwer et. al.) If the use of distortion metric is SAD +* rather than SSD in the stage of encoding, consider sqrt(lambda) simply to +* adjust lambda for the lack of squaring operation in the error computation +* (from rate distortion optimization for video compression by sullivan). 
+****************************************************************************** +*/ +const UWORD16 gu2_qp_lambda[52]= +{ + 0, 0, 0, 0, 0, 0, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 3, 3, 3, + 4, 4, 5, 5, 6, 7, 7, 8, + 9, 10, 12, 13, 15, 17, 19, 21, + 23, 26, 30, 33, 37, 42, 47, 53, + 59, 66, 74, 83, +}; + +/** +****************************************************************************** +* @brief Lamda for varying quantizer scales that would be used to +* compute the RD cost while deciding on the MB modes. +* input : qp +* output : lambda +* @remarks lambda = pow(2, (qp - 12)/6) +****************************************************************************** +*/ +const UWORD8 gu1_qp0[52]= +{ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, + 2, 2, 2, 2, 3, 3, 3, 4, + 4, 4, 5, 6, 6, 7, 8, 9, + 10, 11, 13, 14, 16, 18, 20, 23, + 25, 29, 32, 36, 40, 45, 51, 57, + 64, 72, 81, 91, +}; + +/** +****************************************************************************** +* @brief unsigned exp. goulumb codelengths to assign cost to a coefficient of +* mb types. +* input : Integer +* output : codelength +* @remarks Refer sec. 
9-1 in h264 specification +****************************************************************************** +*/ +const UWORD8 u1_uev_codelength[32] = +{ + 1, 3, 3, 5, 5, 5, 5, 7, + 7, 7, 7, 7, 7, 7, 7, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 11, +}; + + +/** +****************************************************************************** +* @brief Look up table to assign cost to a coefficient of a residual block +* basing on its surrounding coefficients +* input : Numbers of T1's +* output : coeff_cost +* @remarks Refer Section 2.3 Elimination of single coefficients in inter +* macroblocks in document JVT-O079 +****************************************************************************** +*/ +const UWORD8 gu1_coeff_cost[6] = +{ + 3, 2, 2, 1, 1, 1 +}; + +/** +****************************************************************************** +* @brief Indices map to raster scan for luma 4x4 block +* input : scan index +* output : scan location +* @remarks None +****************************************************************************** +*/ +const UWORD8 gu1_luma_scan_order[16] = +{ + 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 +}; + +/** +****************************************************************************** +* @brief Indices map to raster scan for chroma AC block +* input : scan index +* output : scan location +* @remarks None +****************************************************************************** +*/ +const UWORD8 gu1_chroma_scan_order[15] = +{ + 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 +}; + +/** +****************************************************************************** +* @brief Indices map to raster scan for luma 4x4 dc block +* input : scan index +* output : scan location +* @remarks : None +****************************************************************************** +*/ +const UWORD8 gu1_luma_scan_order_dc[16] = +{ + 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 +}; + +/** 
+****************************************************************************** +* @brief Indices map to raster scan for chroma 2x2 dc block +* input : scan index +* output : scan location +* @remarks None +****************************************************************************** +*/ +const UWORD8 gu1_chroma_scan_order_dc[4] = +{ + 0, 1, 2, 3 +}; + +/** +****************************************************************************** +* @brief choice of motion vectors to be used during mv prediction +* input : formatted reference idx comparison metric +* output : mv prediction has to be median or a simple straight forward selec +* tion from neighbors. +* @remarks If only one of the candidate blocks has a reference frame equal to + the current block then use the same block as the final predictor. A simple + look up table to assist this mv prediction condition +****************************************************************************** +*/ +const WORD8 gi1_mv_pred_condition[8] = +{ + -1, 0, 1, -1, 2, -1, -1, -1 +}; + +/** +****************************************************************************** +* @brief maps the h264 quantizer to the mpeg2 quantizer scale +* input : h264 qp +* output : equivalent mpeg 2 qp +* @remarks mpeg2qscale = 2 ^ [((h264qp - 12) / 6) + 1] +****************************************************************************** +*/ +const UWORD8 gau1_h264_to_mpeg2_qmap[H264_QP_ELEM] = +{ + 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 3, 3, 3, 4, + 4, 4, 5, 6, 6, 7, 8, 9, + 10, 11, 13, 14, 16, 18, 20, 23, + 25, 29, 32, 36, 40, 45, 51, 57, + 64, 72, 81, 91, 102, 114, 128, 144, + 161, 181, 203, 228, +}; + +/** +****************************************************************************** +* @brief maps the mpeg2 quantizer to the h264 quantizer scale +* input : mpeg2 qp +* output : equivalent h264qp +* @remarks MPEG-2 dequantization: (2*QFij + k)*Wij*qscale/32 +* k = 0 (for intra) k = sign(QFij) +* H.264 dequantization: (QFij*R(QP%6,i,j))>>(6 - QP/6) 
+* +* Excluding the portion of R(QP%6,i,j) that is due to +* the DCT scale factors, the 6 entries after dividing by 64 (2^6) +* correspond to dequant values of +* 2.5, 2.8125, 3.125, 3.5625, 3.9375, 4.4375. +* (a=0.5 b=sqrt(2/5) - refer to JVT-B038.doc) +* +* Assuming that h264Qp=12 corresponds to MPEG2 qscale of 2 +* (the actual mapping seems to be to MPEG2 qscale of 2.5), +* and the fact that the effective h264 quantizer changes by +* a factor of 2 for every 6 steps, the following mapping is +* obtained: +* h264qp = 6*(log2(mpeg2qscale/2)) + 12. +* +* Note that the quant matrix entry assumed for the above +* equality is 16. Hence when the mpeg2 quant matrix entries +* are all 16, this lookup can be used as is (which is the +* default inter quant matrix in mpeg-2). +****************************************************************************** +*/ +const UWORD8 gau1_mpeg2_to_h264_qmap[MPEG2_QP_ELEM] = +{ + 0, 4, 10, 14, 16, 18, 20, 21, 22, 23, 24, 25, 26, 26, 27, 27, + 28, 29, 29, 29, 30, 30, 31, 31, 32, 32, 32, 33, 33, 33, 33, 34, + 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 37, + 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 40, 40, 40, + 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 41, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, 43, 43, 43, + 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, +}; + diff --git a/encoder/ih264e_globals.h b/encoder/ih264e_globals.h new file mode 100755 index 0000000..4c3de23 --- /dev/null +++ b/encoder/ih264e_globals.h @@ -0,0 +1,192 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The 
Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_globals.h +* +* @brief +* Contains declarations of global variables for H264 encoder +* +* @author +* Ittiam +* +* @remarks +* +******************************************************************************* +*/ + +#ifndef IH264E_GLOBALS_H_ +#define IH264E_GLOBALS_H_ + + +/*****************************************************************************/ +/* Extern global declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* @brief Computes the lamda for varying quantizer scales that would be used to +* compute the RD cost while deciding on the MB modes. +* input : qp +* output : lambda +* @remarks lambda = 0.85 * pow(2, (qp - 12)/3), when SSD is used as metric +* for computing distortion (Bit rate estimation for cost function of H.264/ +* AVC by Mohd Golam Sarwer et. al.) 
If the use of distortion metric is SAD +* rather than SSD in the stage of encoding, consider sqrt(lambda) simply to +* adjust lambda for the lack of squaring operation in the error computation +* (from rate distortion optimization for video compression by sullivan). +****************************************************************************** +*/ +extern const UWORD16 gu2_qp_lambda[52]; + +/** +****************************************************************************** +* @brief Computes the lamda for varying quantizer scales that would be used to +* compute the RD cost while deciding on the MB modes. +* input : qp +* output : lambda +* @remarks lambda = pow(2, (qp - 12)/6). When Lagrangian multiplier is disabled +* the same constant is used across mode decision and mv decisions. +****************************************************************************** +*/ +extern const UWORD8 gu1_qp0[52]; + +/** +****************************************************************************** +* @brief unsigned exp. goulumb codelengths to assign cost to a coefficient of +* mb types. +* input : Integer +* output : codelength +* @remarks Refer sec. 
9-1 in h264 specification +****************************************************************************** +*/ +extern const UWORD8 u1_uev_codelength[32]; + +/** +****************************************************************************** +* @brief Look up table to assign cost to a coefficient of a residual block +* basing on its surrounding coefficients +* input : Numbers of T1's +* output : coeff_cost +* @remarks Refer Section 2.3 Elimination of single coefficients in inter +* macroblocks in document JVT-O079 +****************************************************************************** +*/ +extern const UWORD8 gu1_coeff_cost[6]; + +/** +****************************************************************************** +* @brief Indices map to raster scan for luma 4x4 block +* input : scan index +* output : scan location +* @remarks The scan order assumes the stride to access the next row is 16 +****************************************************************************** +*/ +extern const UWORD8 gu1_luma_scan_order[16]; + +/** +****************************************************************************** +* @brief Indices map to raster scan for chroma AC block +* input : scan index +* output : scan location +* @remarks The scan order assumes the stride to access the next row is 32 +****************************************************************************** +*/ +extern const UWORD8 gu1_chroma_scan_order[15]; + +/** +****************************************************************************** +* @brief Indices map to raster scan for luma 4x4 dc block +* input : scan index +* output : scan location +* @remarks The scan order assumes the stride to access the next row is 16 +****************************************************************************** +*/ +extern const UWORD8 gu1_luma_scan_order_dc[16]; + +/** +****************************************************************************** +* @brief Indices map to raster scan for chroma 2x2 dc block +* input : 
scan index +* output : scan location +* @remarks The scan order assumes the stride to access the next row is 16 +****************************************************************************** +*/ +extern const UWORD8 gu1_chroma_scan_order_dc[4]; + + +/** +****************************************************************************** +* @brief choice of motion vectors to be used during mv prediction +* input : formatted reference idx comparison metric +* output : mv prediction has to be median or a simple straight forward selec +* tion from neighbors. +* @remarks If only one of the candidate blocks has a reference frame equal to + the current block then use the same block as the final predictor. A simple + look up table to assist this mv prediction condition +****************************************************************************** +*/ +extern const WORD8 gi1_mv_pred_condition[8]; + + +/** +****************************************************************************** +* @brief maps the h264 quantizer to the mpeg2 quantizer scale +* input : h264 qp +* output : eqvivalent mpeg 2 qp +* @remarks mpeg2qscale = 2 ^ [((h264qp - 12) / 6) + 1] +****************************************************************************** +*/ +extern const UWORD8 gau1_h264_to_mpeg2_qmap[H264_QP_ELEM]; + +/** +****************************************************************************** +* @brief maps the mpeg2 quantizer to the h264 quantizer scale +* input : mpeg2 qp +* output : eqvivalent h264q p +* @remarks MPEG-2 dequantization: (2*QFij + k)*Wij*qscale/32 +* k = 0 (for intra) k = sign(QFij) +* H.264 dequantization: (QFij*R(QP%6,i,j))>>(6 - QP/6) +* +* Excluding the portion of R(QP%6,i,j) that is due to +* the DCT scale factors, the 6 entries after dividing by 64 (2^6) +* correspond to dequant values of +* 2.5, 2.8125, 3.125, 3.5625, 3.9375, 4.4375. 
+* (a=0.5 b=sqrt(2/5) - refer to JVT-B038.doc) +* +* Assuming that h264Qp=12 corresponds to MPEG2 qscale of 2 +* (the actual mapping seems to be to MPEG2 qscale of 2.5), +* and the fact that the effective h264 quantizer changes by +* a factor of 2 for every 6 steps, the following mapping is +* obtained: +* h264qp = 6*(log2(mpeg2qscale/2)) + 12. +* +* Note that the quant matrix entry assumed for the above +* equality is 16. Hence when the mpeg2 quant matrix entries +* are all 16, this lookup can be used as is (which is the +* default inter quant matrix in mpeg-2). +****************************************************************************** +*/ +extern const UWORD8 gau1_mpeg2_to_h264_qmap[MPEG2_QP_ELEM]; + + +#endif /* IH264E_GLOBALS_H_ */ diff --git a/encoder/ih264e_half_pel.c b/encoder/ih264e_half_pel.c new file mode 100755 index 0000000..cb475a1 --- /dev/null +++ b/encoder/ih264e_half_pel.c @@ -0,0 +1,226 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_half_pel.c +* +* @brief +* This file contains functions that are used for computing subpixel planes +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_sixtapfilter_horz +* - ih264e_sixtap_filter_2dvh_vert +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <assert.h> +#include <limits.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "ithread.h" +#include "ih264_platform_macros.h" +#include "ih264_defs.h" +#include "ih264e_half_pel.h" +#include "ih264_macros.h" +#include "ih264e_half_pel.h" +#include "ih264e_debug.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* Interprediction luma filter for horizontal input (Filter run for width = 17 +* and height =16) +* +* @par Description: +* Applies a 6 tap horizontal filter .The output is clipped to 8 bits +* sec 8.4.2.2.1 titled "Luma sample interpolation process" +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @returns +* +* @remarks +* None +* 
+******************************************************************************* +*/ +void ih264e_sixtapfilter_horz(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd) +{ + UWORD32 u4_i, u4_j; + UWORD32 u4_w, u4_h; + + /* width and height of interpolation */ + u4_w = HP_PL_WD; + u4_h = MB_SIZE; + + pu1_src -= 2; + + for (u4_i = 0; u4_i < u4_h; u4_i++) + { + for (u4_j = 0; u4_j < u4_w; u4_j++, pu1_dst++, pu1_src++) + { + WORD16 i16_temp; + + i16_temp = ih264_g_six_tap[0] * (*pu1_src + pu1_src[5]) + + ih264_g_six_tap[1] * (pu1_src[1] + pu1_src[4]) + + ih264_g_six_tap[2] * (pu1_src[2] + pu1_src[3]); + + i16_temp = (i16_temp + 16) >> 5; + + *pu1_dst = CLIP_U8(i16_temp); + } + pu1_src += src_strd - u4_w; + pu1_dst += dst_strd - u4_w; + } +} + +/** +******************************************************************************* +* +* @brief +* This function implements a two stage cascaded six tap filter. It applies +* the six tap filter in the vertical direction on the predictor values, +* followed by applying the same filter in the horizontal direction on the +* output of the first stage. The six tap filtering operation is described in +* sec 8.4.2.2.1 titled "Luma sample interpolation process" (Filter run for +* width = 17 and height = 17) +* +* @par Description: +* The function interpolates the predictors first in the vertical direction and +* then in the horizontal direction to output the (1/2,1/2). The output of the +* first stage of the filter is stored in the buffer pointed to by +* pi16_pred1(only in C) in 16 bit precision. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst1 +* UWORD8 pointer to the destination (Horizontal filtered output) +* +* @param[out] pu1_dst2 +* UWORD8 pointer to the destination (output after applying vertical filter to +* the intermediate horizontal output) +* +* @param[in] src_strd +* integer source stride + +* @param[in] dst_strd +* integer destination stride of pu1_dst +* +* @param[in] pi4_pred +* Pointer to 16bit intermediate buffer (used only in c) +* +* @param[in] i4_pred_strd +* integer destination stride of pi16_pred1 +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ +void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src, + UWORD8 *pu1_dst1, + UWORD8 *pu1_dst2, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 *pi4_pred, + WORD32 i4_pred_strd) +{ + WORD32 row, col; + WORD32 tmp; + WORD32 *pi4_pred_temp = pi4_pred; + WORD32 ht = HP_PL_HT, wd = HP_PL_WD; + + for (row = 0; row < ht; row++) + { + for (col = -2; col < wd + 3; col++) + { + tmp = ih264_g_six_tap[0] * (pu1_src[col - 2 * src_strd] + pu1_src[col + 3 * src_strd]) + + ih264_g_six_tap[1] * (pu1_src[col - 1 * src_strd] + pu1_src[col + 2 * src_strd]) + + ih264_g_six_tap[2] * (pu1_src[col] + pu1_src[col + 1 * src_strd]); + + pi4_pred_temp[col] = tmp; + } + + pu1_src += src_strd; + pi4_pred_temp += i4_pred_strd; + } + + for (row = 0; row < ht; row++) + { + for (col = 0; col < wd; col++) + { + tmp = (pi4_pred[col - 2] + pi4_pred[col + 3]) + + ih264_g_six_tap[1] * (pi4_pred[col - 1] + pi4_pred[col + 2]) + + ih264_g_six_tap[2] * (pi4_pred[col] + pi4_pred[col + 1]); + + tmp = (tmp + 512) >> 10; + + pu1_dst2[col] = CLIP_U8(tmp); + pu1_dst1[col] = CLIP_U8((pi4_pred[col] + 16) >> 5); + } + pi4_pred += i4_pred_strd; + pu1_dst2 += dst_strd; + pu1_dst1 += dst_strd; + } +} + diff --git a/encoder/ih264e_half_pel.h b/encoder/ih264e_half_pel.h new file mode 100755 index 0000000..92bd37f --- /dev/null +++ 
b/encoder/ih264e_half_pel.h @@ -0,0 +1,162 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * ih264e_half_pel.h + * + * @brief + * Contains extern declarations of subpel functions used by the encoder + * + * @author + * ittiam + * + * @remarks + * none + * + ******************************************************************************* + */ + +#ifndef IH264E_HALF_PEL_H_ +#define IH264E_HALF_PEL_H_ + +/*****************************************************************************/ +/* Global constants */ +/*****************************************************************************/ +/* + * Dimensions of subpel plane buffers + */ +#define HP_PL_WD MB_SIZE + 1 +#define HP_PL_HT MB_SIZE + 1 + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* Interprediction luma filter for horizontal input (Filter run for 
*   width = 17
*   and height =16)
*
* @par Description:
*    Applies a 6 tap horizontal filter. The output is clipped to 8 bits
*    sec 8.4.2.2.1 titled "Luma sample interpolation process"
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/
typedef void ih264e_sixtapfilter_horz_ft(UWORD8 *pu1_src,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd,
                                         WORD32 dst_strd);

/* C implementation */
ih264e_sixtapfilter_horz_ft ih264e_sixtapfilter_horz;

/* arm assembly */
ih264e_sixtapfilter_horz_ft ih264e_sixtapfilter_horz_a9q;
ih264e_sixtapfilter_horz_ft ih264e_sixtapfilter_horz_av8;

/* x86 intrinsics*/
ih264e_sixtapfilter_horz_ft ih264e_sixtapfilter_horz_ssse3;

/**
*******************************************************************************
*
* @brief
*   This function implements a two stage cascaded six tap filter. It applies
*   the six tap filter in the vertical direction on the predictor values,
*   followed by applying the same filter in the horizontal direction on the
*   output of the first stage. The six tap filtering operation is described in
*   sec 8.4.2.2.1 titled "Luma sample interpolation process" (Filter run for
*   width = 17 and height = 17)
*
* @par Description:
*   The function interpolates the predictors first in the vertical direction and
*   then in the horizontal direction to output the (1/2,1/2). The output of the
*   first stage of the filter is stored in the buffer pointed to by
*   pi4_pred (only in C).
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst1
*  UWORD8 pointer to the destination of the first stage (vertically filtered)
*  output
*
* @param[out] pu1_dst2
*  UWORD8 pointer to the destination of the second stage output (horizontal
*  filter applied on the intermediate vertical output)
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride of pu1_dst1 and pu1_dst2
*
* @param[in] pi4_pred
*  Pointer to the WORD32 intermediate buffer (used only in c)
*
* @param[in] i4_pred_strd
*  integer stride of pi4_pred
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/
typedef void ih264e_sixtap_filter_2dvh_vert_ft(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst1,
                                               UWORD8 *pu1_dst2,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 *pi4_pred,
                                               WORD32 i4_pred_strd);

/* C implementation */
ih264e_sixtap_filter_2dvh_vert_ft ih264e_sixtap_filter_2dvh_vert;

/* assembly */
ih264e_sixtap_filter_2dvh_vert_ft ih264e_sixtap_filter_2dvh_vert_a9q;

ih264e_sixtap_filter_2dvh_vert_ft ih264e_sixtap_filter_2dvh_vert_av8;

/* x86 intrinsics */
ih264e_sixtap_filter_2dvh_vert_ft ih264e_sixtap_filter_2dvh_vert_ssse3;

#endif /* IH264E_HALF_PEL_H_ */
diff --git a/encoder/ih264e_intra_modes_eval.c b/encoder/ih264e_intra_modes_eval.c new file mode 100755 index 0000000..b41d717 --- /dev/null +++ b/encoder/ih264e_intra_modes_eval.c @@ -0,0 +1,2296 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_intra_modes_eval.c +* +* @brief +* This file contains definitions of routines that perform rate distortion +* analysis on a macroblock if they are to be coded as intra. +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_derive_neighbor_availability_of_mbs() +* - ih264e_derive_ngbr_avbl_of_mb_partitions() +* - ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff() +* - ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff() +* - ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff() +* - ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton() +* - ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff() +* - ih264e_evaluate_intra16x16_modes() +* - ih264e_evaluate_intra4x4_modes() +* - ih264e_evaluate_intra_chroma_modes() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <string.h> +#include <limits.h> +#include <assert.h> + +/* User include files */ +#include "ih264e_config.h" +#include "ih264_typedefs.h" +#include "ih264e_defs.h" +#include "iv2.h" 
+#include "ive2.h" +#include "ih264_debug.h" +#include "ih264_defs.h" +#include "ih264_macros.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_structs.h" +#include "ih264_common_tables.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ime_distortion_metrics.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_structs.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_intra_modes_eval.h" +#include "ih264e_globals.h" +#include "ime_platform_macros.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief +* derivation process for macroblock availability +* +* @par Description +* Calculates the availability of the left, top, topright and topleft macroblocks. 
+* +* @param[in] ps_proc_ctxt +* pointer to proc context (handle) +* +* @remarks Based on section 6.4.5 in H264 spec +* +* @return none +* +****************************************************************************** +*/ +void ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t *ps_proc) +{ + UWORD8 *pu1_slice_idx_curr = ps_proc->pu1_slice_idx; + UWORD8 *pu1_slice_idx_b; + UWORD8 *pu1_slice_idx_a; + UWORD8 *pu1_slice_idx_c; + UWORD8 *pu1_slice_idx_d; + block_neighbors_t *ps_ngbr_avbl; + WORD32 i4_mb_x, i4_mb_y; + WORD32 i4_wd_mbs; + + i4_mb_x = ps_proc->i4_mb_x; + i4_mb_y = ps_proc->i4_mb_y; + + i4_wd_mbs = ps_proc->i4_wd_mbs; + + pu1_slice_idx_curr += (i4_mb_y * i4_wd_mbs) + i4_mb_x; + pu1_slice_idx_a = pu1_slice_idx_curr - 1; + pu1_slice_idx_b = pu1_slice_idx_curr - i4_wd_mbs; + pu1_slice_idx_c = pu1_slice_idx_b + 1; + pu1_slice_idx_d = pu1_slice_idx_b - 1; + ps_ngbr_avbl = ps_proc->ps_ngbr_avbl; + + /**********************************************************************/ + /* The macroblock is marked as available, unless one of the following */ + /* conditions is true in which case the macroblock shall be marked as */ + /* not available. */ + /* 1. mbAddr < 0 */ + /* 2 mbAddr > CurrMbAddr */ + /* 3. the macroblock with address mbAddr belongs to a different slice */ + /* than the macroblock with address CurrMbAddr */ + /**********************************************************************/ + + /* left macroblock availability */ + if (i4_mb_x == 0) + { /* macroblocks along first column */ + ps_ngbr_avbl->u1_mb_a = 0; + } + else + { /* macroblocks belong to same slice? */ + if (*pu1_slice_idx_a != *pu1_slice_idx_curr) + ps_ngbr_avbl->u1_mb_a = 0; + else + ps_ngbr_avbl->u1_mb_a = 1; + } + + /* top macroblock availability */ + if (i4_mb_y == 0) + { /* macroblocks along first row */ + ps_ngbr_avbl->u1_mb_b = 0; + } + else + { /* macroblocks belong to same slice? 
*/ + if (*pu1_slice_idx_b != *pu1_slice_idx_curr) + ps_ngbr_avbl->u1_mb_b = 0; + else + ps_ngbr_avbl->u1_mb_b = 1; + } + + /* top right macroblock availability */ + if (i4_mb_x == i4_wd_mbs-1 || i4_mb_y == 0) + { /* macroblocks along last column */ + ps_ngbr_avbl->u1_mb_c = 0; + } + else + { /* macroblocks belong to same slice? */ + if (*pu1_slice_idx_c != *pu1_slice_idx_curr) + ps_ngbr_avbl->u1_mb_c = 0; + else + ps_ngbr_avbl->u1_mb_c = 1; + } + + /* top left macroblock availability */ + if (i4_mb_x == 0 || i4_mb_y == 0) + { /* macroblocks along first column */ + ps_ngbr_avbl->u1_mb_d = 0; + } + else + { /* macroblocks belong to same slice? */ + if (*pu1_slice_idx_d != *pu1_slice_idx_curr) + ps_ngbr_avbl->u1_mb_d = 0; + else + ps_ngbr_avbl->u1_mb_d = 1; + } +} + +/** +****************************************************************************** +* +* @brief +* derivation process for subblock/partition availability +* +* @par Description +* Calculates the availability of the left, top, topright and topleft subblock +* or partitions. +* +* @param[in] ps_proc_ctxt +* pointer to macroblock context (handle) +* +* @param[in] i1_pel_pos_x +* column position of the pel wrt the current block +* +* @param[in] i1_pel_pos_y +* row position of the pel in wrt current block +* +* @remarks Assumptions: before calling this function it is assumed that +* the neighbor availability of the current macroblock is already derived. 
+* Based on table 6-3 of H264 specification +* +* @return availability status (yes or no) +* +****************************************************************************** +*/ +UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t *ps_ngbr_avbl, + WORD8 i1_pel_pos_x, + WORD8 i1_pel_pos_y) +{ + UWORD8 u1_neighbor_avail=0; + + /**********************************************************************/ + /* values of i1_pel_pos_x in the range 0-15 inclusive correspond to */ + /* various columns of a macroblock */ + /* */ + /* values of i1_pel_pos_y in the range 0-15 inclusive correspond to */ + /* various rows of a macroblock */ + /* */ + /* other values of i1_pel_pos_x & i1_pel_pos_y represents elements */ + /* outside the bound of an mb ie., represents its neighbors. */ + /**********************************************************************/ + if (i1_pel_pos_x < 0) + { /* column(-1) */ + if (i1_pel_pos_y < 0) + { /* row(-1) */ + u1_neighbor_avail = ps_ngbr_avbl->u1_mb_d; /* current mb topleft availability */ + } + else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16) + { /* all rows of a macroblock */ + u1_neighbor_avail = ps_ngbr_avbl->u1_mb_a; /* current mb left availability */ + } + else /* if (i1_pel_pos_y >= 16) */ + { /* rows(+16) */ + u1_neighbor_avail = 0; /* current mb bottom left availability */ + } + } + else if (i1_pel_pos_x >= 0 && i1_pel_pos_x < 16) + { /* all columns of a macroblock */ + if (i1_pel_pos_y < 0) + { /* row(-1) */ + u1_neighbor_avail = ps_ngbr_avbl->u1_mb_b; /* current mb top availability */ + } + else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16) + { /* all rows of a macroblock */ + u1_neighbor_avail = 1; /* current mb availability */ + /* availability of the partition is dependent on the position of the partition inside the mb */ + /* although the availability is declared as 1 in all cases these needs to be corrected somewhere else and this is not done in here */ + } + else /* if (i1_pel_pos_y >= 16) */ + { /* rows(+16) */ + 
u1_neighbor_avail = 0; /* current mb bottom availability */ + } + } + else if (i1_pel_pos_x >= 16) + { /* column(+16) */ + if (i1_pel_pos_y < 0) + { /* row(-1) */ + u1_neighbor_avail = ps_ngbr_avbl->u1_mb_c; /* current mb top right availability */ + } + else /* if (i1_pel_pos_y >= 0) */ + { /* all other rows */ + u1_neighbor_avail = 0; /* current mb right & bottom right availability */ + } + } + + return u1_neighbor_avail; +} + +/** +****************************************************************************** +* +* @brief +* evaluate best intra 16x16 mode (rate distortion opt off) +* +* @par Description +* This function evaluates all the possible intra 16x16 modes and finds the mode +* that best represents the macro-block (least distortion) and occupies fewer +* bits in the bit-stream. +* +* @param[in] ps_proc_ctxt +* pointer to process context (handle) +* +* @remarks +* Ideally the cost of encoding a macroblock is calculated as +* (distortion + lambda*rate). Where distortion is SAD/SATD,... between the +* input block and the reconstructed block and rate is the number of bits taken +* to place the macroblock in the bit-stream. In this routine the rate does not +* exactly point to the total number of bits it takes, rather it points to header +* bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits +* and residual bits fall in to texture bits the number of bits taken to encoding +* mbtype is considered as rate, we compute cost. Further we will approximate +* the distortion as the deviation b/w input and the predicted block as opposed +* to input and reconstructed block. +* +* NOTE: As per the Document JVT-O079, for intra 16x16 macroblock, +* the SAD and cost are one and the same. 
+* +* @return none +* +****************************************************************************** +*/ + +void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* SAD(distortion metric) of an 8x8 block */ + WORD32 i4_mb_distortion = INT_MAX, i4_mb_distortion_least = INT_MAX; + + /* lambda */ + UWORD32 u4_lambda = ps_proc->u4_lambda; + + /* cost = distortion + lambda*rate */ + WORD32 i4_mb_cost= INT_MAX, i4_mb_cost_least = INT_MAX; + + /* intra mode */ + UWORD32 u4_intra_mode, u4_best_intra_16x16_mode = DC_I16x16; + + /* neighbor pels for intra prediction */ + UWORD8 *pu1_ngbr_pels_i16 = ps_proc->au1_ngbr_pels; + + /* neighbor availability */ + WORD32 i4_ngbr_avbl; + + /* pointer to src macro block */ + UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma; + UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma; + + /* pointer to prediction macro block */ + UWORD8 *pu1_pred_mb_intra_16x16 = ps_proc->pu1_pred_mb_intra_16x16; + UWORD8 *pu1_pred_mb_intra_16x16_plane = ps_proc->pu1_pred_mb_intra_16x16_plane; + + /* strides */ + WORD32 i4_src_strd = ps_proc->i4_src_strd; + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + + /* pointer to neighbors left, top, topleft */ + UWORD8 *pu1_mb_a = pu1_ref_mb - 1; + UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd; + UWORD8 *pu1_mb_d = pu1_mb_b - 1; + + /* valid intra modes map */ + UWORD32 u4_valid_intra_modes; + + /* lut for valid intra modes */ + const UWORD8 u1_valid_intra_modes[8] = {4, 6, 12, 14, 5, 7, 13, 15}; + + /* temp var */ + UWORD32 i, u4_enable_fast_sad = 0, offset = 0; + + /* init temp var */ + if (ps_proc->i4_slice_type == PSLICE) + { + offset = 5; + u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad; + } + + /* locating neighbors that are available for prediction */ + /* TODO : update the neighbor availability information basing on constrained intra pred information */ + /* 
TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines + * basing on neighbors available and hence evade the computation of neighbor availability totally. */ + /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */ + i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1); + ps_proc->i4_ngbr_avbl_16x16_mb = i4_ngbr_avbl; + + /* gather prediction pels from the neighbors, if particular set is not available + * it is set to zero*/ + /* left pels */ + if (ps_proc->ps_ngbr_avbl->u1_mb_a) + { + for(i = 0; i < 16; i++) + pu1_ngbr_pels_i16[16-1-i] = pu1_mb_a[i * i4_rec_strd]; + } + else + { + ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16,0,MB_SIZE); + } + /* top pels */ + if (ps_proc->ps_ngbr_avbl->u1_mb_b) + { + ps_codec->pf_mem_cpy_mul8(pu1_ngbr_pels_i16+16+1,pu1_mb_b,16); + /*for(i = 0; i < 16; i++) + pu1_ngbr_pels_i16[16+1+i] = pu1_mb_b[i];*/ + } + else + { + ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16+16+1,0,MB_SIZE); + } + /* topleft pels */ + if (ps_proc->ps_ngbr_avbl->u1_mb_d) + pu1_ngbr_pels_i16[16] = *pu1_mb_d; + else + pu1_ngbr_pels_i16[16] = 0; + + /* set valid intra modes for evaluation */ +// u4_valid_intra_modes = 15; +//// ih264e_filter_intra16x16modes(pu1_mb_curr, i4_src_strd, &u4_valid_intra_modes); +// if (!ps_proc->ps_ngbr_avbl->u1_mb_a) +// u4_valid_intra_modes &= ~(1 << HORZ_I16x16); +// if (!ps_proc->ps_ngbr_avbl->u1_mb_b) +// u4_valid_intra_modes &= ~(1 << VERT_I16x16); +//// if (!ps_proc->ps_ngbr_avbl->u1_mb_a || !ps_proc->ps_ngbr_avbl->u1_mb_b || !ps_proc->ps_ngbr_avbl->u1_mb_d) +// if (i4_ngbr_avbl != 7) +// u4_valid_intra_modes &= ~(1 << PLANE_I16x16); + + u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl]; + + if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST) + u4_valid_intra_modes &= ~(1 << PLANE_I16x16); + + /* evaluate b/w HORZ_I16x16, VERT_I16x16 & 
DC_I16x16 */ + ps_codec->pf_ih264e_evaluate_intra16x16_modes(pu1_curr_mb, pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16, + i4_src_strd, i4_pred_strd, + i4_ngbr_avbl, &u4_intra_mode, &i4_mb_distortion_least, + u4_valid_intra_modes); + + /* cost = distortion + lambda*rate */ + i4_mb_cost_least = i4_mb_distortion_least; + + if (( (u4_valid_intra_modes >> 3) & 1) != 0 && (ps_codec->s_cfg.u4_enc_speed_preset != IVE_FASTEST || + ps_proc->i4_slice_type == ISLICE)) + { + /* intra prediction for PLANE mode*/ + (ps_codec->apf_intra_pred_16_l)[PLANE_I16x16](pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16_plane, 0, i4_pred_strd, i4_ngbr_avbl); + + /* evaluate distortion between the actual blk and the estimated blk for the given mode */ + ps_codec->apf_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_pred_mb_intra_16x16_plane, i4_src_strd, i4_pred_strd, i4_mb_cost_least, &i4_mb_distortion); + + /* cost = distortion + lambda*rate */ + i4_mb_cost = i4_mb_distortion; + + /* update the least cost information if necessary */ + if(i4_mb_cost < i4_mb_distortion_least) + { + u4_intra_mode = PLANE_I16x16; + + i4_mb_cost_least = i4_mb_cost; + i4_mb_distortion_least = i4_mb_distortion; + } + } + + u4_best_intra_16x16_mode = u4_intra_mode; + + DEBUG("%d partition cost, %d intra mode\n", i4_mb_cost_least * 32, u4_best_intra_16x16_mode); + + ps_proc->u1_l_i16_mode = u4_best_intra_16x16_mode; + + /* cost = distortion + lambda*rate */ + i4_mb_cost_least = i4_mb_distortion_least + u4_lambda*u1_uev_codelength[offset + u4_best_intra_16x16_mode]; + + + /* update the type of the mb if necessary */ + if (i4_mb_cost_least < ps_proc->i4_mb_cost) + { + ps_proc->i4_mb_cost = i4_mb_cost_least; + ps_proc->i4_mb_distortion = i4_mb_distortion_least; + ps_proc->u4_mb_type = I16x16; + } + + return ; +} + + +/** +****************************************************************************** +* +* @brief +* evaluate best intra 8x8 mode (rate distortion opt on) +* +* @par Description +* This function evaluates all 
*  the possible intra 8x8 modes and finds the mode
*  that best represents the macro-block (least distortion) and occupies fewer
*  bits in the bit-stream.
*
* @param[in]    ps_proc
*  pointer to proc ctxt
*
* @remarks Ideally the cost of encoding a macroblock is calculated as
*  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
*  input block and the reconstructed block and rate is the number of bits taken
*  to place the macroblock in the bit-stream. In this routine the rate does not
*  exactly point to the total number of bits it takes, rather it points to header
*  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
*  and residual bits fall in to texture bits the number of bits taken to encoding
*  mbtype is considered as rate, we compute cost. Further we will approximate
*  the distortion as the deviation b/w input and the predicted block as opposed
*  to input and reconstructed block.
*
*  NOTE: TODO: This function needs to be tested
*
*  @return      none
*
******************************************************************************
*/
void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
{
    /* Codec Context */
    codec_t *ps_codec = ps_proc->ps_codec;

    /* SAD(distortion metric) of an 8x8 partition, and its sum over the mb */
    WORD32 i4_partition_distortion, i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;

    /* lambda */
    UWORD32 u4_lambda = ps_proc->u4_lambda;

    /* cost = distortion + lambda*rate; the mb total starts at one lambda */
    WORD32 i4_partition_cost, i4_partition_cost_least, i4_total_cost = u4_lambda;

    /* cost of signalling the mode: one bit when it equals the estimated */
    /* mode, four bits otherwise                                          */
    UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;

    /* intra mode */
    UWORD32 u4_intra_mode, u4_best_intra_8x8_mode = DC_I8x8, u4_estimated_intra_8x8_mode;

    /* neighbor pels for intra prediction */
    UWORD8 *pu1_ngbr_pels_i8 = ps_proc->au1_ngbr_pels;

    /* pointer to curr partition */
    UWORD8 *pu1_mb_curr;

    /* pointer to prediction macro block */
    UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;

    /* strides */
    WORD32 i4_src_strd = ps_proc->i4_src_strd;
    WORD32 i4_pred_strd = ps_proc->i4_pred_strd;

    /* neighbors left, top, top left */
    UWORD8 *pu1_mb_a;
    UWORD8 *pu1_mb_b;
    UWORD8 *pu1_mb_d;

    /* neighbor availability */
    WORD32 i4_ngbr_avbl;
    block_neighbors_t s_ngbr_avbl;

    /* temp vars : partition index and its top left pel position in the mb */
    UWORD32  b8, u4_pix_x, u4_pix_y;

    /* ngbr mb syntax information */
    UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
    mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;

    /* valid intra modes map (bit n set => mode n is evaluated) */
    UWORD32 u4_valid_intra_modes;

    /* the four 8x8 partitions are visited in raster order */
    for(b8 = 0; b8 < 4; b8++)
    {
        u4_pix_x = (b8 & 0x01) << 3;
        u4_pix_y = (b8 >> 1) << 3;

        pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
        /* when rdopt is off, we use the input as reference for constructing prediction buffer */
        /* as opposed to using the recon pels. (open loop intra prediction) */
        pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
        pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
        pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */

        /* locating neighbors that are available for prediction */
        /* TODO : update the neighbor availability information basing on constrained intra pred information */
        /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
        /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
        /* the pel positions are unsigned here; "- 1" at position 0 wraps and */
        /* reaches the WORD8 parameters of the callee as -1                   */
        s_ngbr_avbl.u1_mb_a = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x - 1, u4_pix_y); /* xD = -1, yD = 0 */
        s_ngbr_avbl.u1_mb_b = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x, u4_pix_y - 1); /* xD = 0, yD = -1 */
        s_ngbr_avbl.u1_mb_c = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x + 8, u4_pix_y - 1); /* xD = BLK_8x8_SIZE, yD = -1 */
        s_ngbr_avbl.u1_mb_d = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x - 1, u4_pix_y - 1); /* xD = -1, yD = -1 */

        /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_c * TOP_RIGHT_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
        /* NOTE(review): bit 4 repeats the left availability; its meaning for */
        /* the consumers of i4_ngbr_avbl is not visible here - confirm        */
        i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  (s_ngbr_avbl.u1_mb_c << 3) +
                        (s_ngbr_avbl.u1_mb_a << 4);
        /* if top partition is available and top right is not available for intra prediction, then */
        /* padd top right samples using top sample and make top right also available */
        /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
        ps_proc->ai4_neighbor_avail_8x8_subblks[b8] = i4_ngbr_avbl;


        ih264_intra_pred_luma_8x8_mode_ref_filtering(pu1_mb_a, pu1_mb_b, pu1_mb_d, pu1_ngbr_pels_i8,
                                                     i4_src_strd, i4_ngbr_avbl);

        /* restart the search for this partition */
        i4_partition_cost_least = INT_MAX;
        /* set valid intra modes for evaluation : start with all nine modes */
        u4_valid_intra_modes = 0x1ff;

        /* the 4x4 mode names are used as bit positions for the 8x8 modes */
        if (!s_ngbr_avbl.u1_mb_b)
        {
            u4_valid_intra_modes &= ~(1 << VERT_I4x4);
            u4_valid_intra_modes &= ~(1 << DIAG_DL_I4x4);
            u4_valid_intra_modes &= ~(1 << VERT_L_I4x4);
        }
        if (!s_ngbr_avbl.u1_mb_a)
        {
            u4_valid_intra_modes &= ~(1 << HORZ_I4x4);
            u4_valid_intra_modes &= ~(1 << HORZ_U_I4x4);
        }
        if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b || !s_ngbr_avbl.u1_mb_d)
        {
            u4_valid_intra_modes &= ~(1 << DIAG_DR_I4x4);
            u4_valid_intra_modes &= ~(1 << VERT_R_I4x4);
            u4_valid_intra_modes &= ~(1 << HORZ_D_I4x4);
        }

        /* estimate the intra 8x8 mode for the current partition (for evaluating cost) */
        if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
        {
            u4_estimated_intra_8x8_mode = DC_I8x8;
        }
        else
        {
            /* both default to DC; they are replaced only when the neighbor */
            /* is coded as I8x8 / I4x4 or lies inside the current mb        */
            UWORD32 u4_left_intra_8x8_mode = DC_I8x8;
            UWORD32 u4_top_intra_8x8_mode = DC_I8x8;

            if (u4_pix_x == 0)
            {
                /* left neighbor belongs to the left macroblock */
                if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
                {
                    u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[b8+1];
                }
                else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
                {
                    u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[(b8+1)*4+2];
                }
            }
            else
            {
                /* left neighbor is the previous partition of this mb */
                u4_left_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-1];
            }

            if (u4_pix_y == 0)
            {
                /* top neighbor belongs to the top macroblock */
                if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
                {
                    u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[b8+2];
                }
                else if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
                {
                    u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[(b8+2)*4+2];
                }
            }
            else
            {
                /* top neighbor is the partition one row up in this mb */
                u4_top_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-2];
            }

            u4_estimated_intra_8x8_mode = MIN(u4_left_intra_8x8_mode, u4_top_intra_8x8_mode);
        }

        /* perform intra mode 8x8 evaluation : the map is shifted right once */
        /* per mode, so bit 0 always belongs to u4_intra_mode                 */
        for (u4_intra_mode = VERT_I8x8; u4_valid_intra_modes != 0; u4_intra_mode++, u4_valid_intra_modes >>= 1)
        {
            if ( (u4_valid_intra_modes & 1) == 0)
                continue;

            /* intra prediction */
            (ps_codec->apf_intra_pred_8_l)[u4_intra_mode](pu1_ngbr_pels_i8, pu1_pred_mb, 0, i4_pred_strd, i4_ngbr_avbl);

            /* evaluate distortion between the actual blk and the estimated blk for the given mode */
            /* NOTE(review): the least cost so far is passed in, presumably as */
            /* an early exit threshold - confirm against ime_compute_sad_8x8   */
            ime_compute_sad_8x8(pu1_mb_curr, pu1_pred_mb, i4_src_strd, i4_pred_strd, i4_partition_cost_least, &i4_partition_distortion);

            i4_partition_cost = i4_partition_distortion + ((u4_estimated_intra_8x8_mode == u4_intra_mode)?u4_cost_one_bit:u4_cost_four_bits);

            /* update the least cost information if necessary */
            if (i4_partition_cost < i4_partition_cost_least)
            {
                i4_partition_cost_least = i4_partition_cost;
                i4_partition_distortion_least = i4_partition_distortion;
                u4_best_intra_8x8_mode = u4_intra_mode;
            }
        }
        /* macroblock distortion */
        i4_total_cost += i4_partition_cost_least;
        i4_total_distortion += i4_partition_distortion_least;
        /* mb partition mode */
        ps_proc->au1_intra_luma_mb_8x8_modes[b8] = u4_best_intra_8x8_mode;

    }

    /* update the type of the mb if necessary */
    if (i4_total_cost < ps_proc->i4_mb_cost)
    {
        ps_proc->i4_mb_cost = i4_total_cost;
        ps_proc->i4_mb_distortion = i4_total_distortion;
        ps_proc->u4_mb_type = I8x8;
    }

    return ;
}


/**
******************************************************************************
*
* @brief
*  evaluate best intra 4x4 mode (rate distortion opt off)
*
* @par Description
*  This function evaluates all the possible intra 4x4 modes and finds the mode
*  that best represents the macro-block (least distortion) and occupies fewer
*  bits in the bit-stream.
*
* @param[in]    ps_proc_ctxt
*  pointer to proc ctxt
*
* @remarks
*  Ideally the cost of encoding a macroblock is calculated as
*  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
*  input block and the reconstructed block and rate is the number of bits taken
*  to place the macroblock in the bit-stream. In this routine the rate does not
*  exactly point to the total number of bits it takes, rather it points to header
*  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
*  and residual bits fall in to texture bits the number of bits taken to encoding
*  mbtype is considered as rate, we compute cost. Further we will approximate
*  the distortion as the deviation b/w input and the predicted block as opposed
*  to input and reconstructed block.
*
*  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
*  24*lambda is added to the SAD before comparison with the best SAD for
*  inter prediction.
This is an empirical value to prevent using too many intra +* blocks. +* +* @return none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* SAD(distortion metric) of an 4x4 block */ + WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0; + + /* lambda */ + UWORD32 u4_lambda = ps_proc->u4_lambda; + + /* cost = distortion + lambda*rate */ + WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda; + + /* cost due to mbtype */ + UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda; + + /* intra mode */ + UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode; + + /* neighbor pels for intra prediction */ + UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels; + + /* pointer to curr partition */ + UWORD8 *pu1_mb_curr; + + /* pointer to prediction macro block */ + UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb; + + /* strides */ + WORD32 i4_src_strd = ps_proc->i4_src_strd; + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + + /* neighbors left, top, top right, top left */ + UWORD8 *pu1_mb_a; + UWORD8 *pu1_mb_b; + UWORD8 *pu1_mb_c; + UWORD8 *pu1_mb_d; + + /* neighbor availability */ + WORD32 i4_ngbr_avbl; + block_neighbors_t s_ngbr_avbl; + + /* temp vars */ + UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y; + + /* scan order inside 4x4 block */ + const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15}; + + /* ngbr sub mb modes */ + UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4); + mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x; + + /* valid intra modes map */ + UWORD32 u4_valid_intra_modes; + UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511}; + + i4_ngbr_avbl = 
(ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_c << 3); + memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16); + + for (b8 = 0; b8 < 4; b8++) + { + u4_blk_x = (b8 & 0x01) << 3; + u4_blk_y = (b8 >> 1) << 3; + for (b4 = 0; b4 < 4; b4++) + { + u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2); + u4_pix_y = u4_blk_y + ((b4 >> 1) << 2); + + pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd); + /* when rdopt is off, we use the input as reference for constructing prediction buffer */ + /* as opposed to using the recon pels. (open loop intra prediction) */ + pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */ + pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */ + pu1_mb_c = pu1_mb_b + 4; /* pointer to top macro block */ + pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */ + + /* locating neighbors that are available for prediction */ + /* TODO : update the neighbor availability information basing on constrained intra pred information */ + /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */ + /* basing on neighbors available and hence evade the computation of neighbor availability totally. 
*/ + + i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4]; + s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1); + s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1; + s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2; + s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3; + /* set valid intra modes for evaluation */ + u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7]; + + /* if top partition is available and top right is not available for intra prediction, then */ + /* padd top right samples using top sample and make top right also available */ + /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */ + + /* gather prediction pels from the neighbors */ + if (s_ngbr_avbl.u1_mb_a) + { + for(i = 0; i < 4; i++) + pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_src_strd]; + } + else + { + memset(pu1_ngbr_pels_i4, 0, 4); + } + + if (s_ngbr_avbl.u1_mb_b) + { + memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4); + } + else + { + memset(pu1_ngbr_pels_i4 + 5, 0, 4); + } + + if (s_ngbr_avbl.u1_mb_d) + pu1_ngbr_pels_i4[4] = *pu1_mb_d; + else + pu1_ngbr_pels_i4[4] = 0; + + if (s_ngbr_avbl.u1_mb_c) + { + memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4); + } + else if (s_ngbr_avbl.u1_mb_b) + { + memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4); + s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b; + } + + i4_partition_cost_least = INT_MAX; + + /* predict the intra 4x4 mode for the current partition (for evaluating cost) */ + if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b) + { + u4_estimated_intra_4x4_mode = DC_I4x4; + } + else + { + UWORD32 u4_left_intra_4x4_mode = DC_I4x4; + UWORD32 u4_top_intra_4x4_mode = DC_I4x4; + + if (u4_pix_x == 0) + { + if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4) + { + u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]]; + } + else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8) + { + u4_left_intra_4x4_mode = 
ps_proc->au1_left_mb_intra_modes[b8 + 1]; + } + } + else + { + u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]]; + } + + if (u4_pix_y == 0) + { + if (ps_top_mb_syn_ele->u2_mb_type == I4x4) + { + u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]]; + } + else if (ps_top_mb_syn_ele->u2_mb_type == I8x8) + { + u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2]; + } + } + else + { + u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]]; + } + + u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode); + } + + ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode; + + /* mode evaluation and prediction */ + ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr, + pu1_ngbr_pels_i4, + pu1_pred_mb, i4_src_strd, + i4_pred_strd, i4_ngbr_avbl, + &u4_best_intra_4x4_mode, + &i4_partition_cost_least, + u4_valid_intra_modes, + u4_lambda, + u4_estimated_intra_4x4_mode); + + + i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode) ? 
u4_cost_one_bit : u4_cost_four_bits); + + DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode); + /* macroblock distortion */ + i4_total_distortion += i4_partition_distortion_least; + i4_total_cost += i4_partition_cost_least; + /* mb partition mode */ + ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode; + } + } + + /* update the type of the mb if necessary */ + if (i4_total_cost < ps_proc->i4_mb_cost) + { + ps_proc->i4_mb_cost = i4_total_cost; + ps_proc->i4_mb_distortion = i4_total_distortion; + ps_proc->u4_mb_type = I4x4; + } + + return ; +} + +/** +****************************************************************************** +* +* @brief evaluate best intra 4x4 mode (rate distortion opt on) +* +* @par Description +* This function evaluates all the possible intra 4x4 modes and finds the mode +* that best represents the macro-block (least distortion) and occupies fewer +* bits in the bit-stream. +* +* @param[in] ps_proc_ctxt +* pointer to proc ctxt +* +* @remarks +* Ideally the cost of encoding a macroblock is calculated as +* (distortion + lambda*rate). Where distortion is SAD/SATD,... between the +* input block and the reconstructed block and rate is the number of bits taken +* to place the macroblock in the bit-stream. In this routine the rate does not +* exactly point to the total number of bits it takes, rather it points to header +* bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits +* and residual bits fall in to texture bits the number of bits taken to encoding +* mbtype is considered as rate, we compute cost. Further we will approximate +* the distortion as the deviation b/w input and the predicted block as opposed +* to input and reconstructed block. +* +* NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock, +* 24*lambda is added to the SAD before comparison with the best SAD for +* inter prediction. 
This is an empirical value to prevent using too many intra +* blocks. +* +* @return none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* SAD(distortion metric) of an 4x4 block */ + WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0; + + /* lambda */ + UWORD32 u4_lambda = ps_proc->u4_lambda; + + /* cost = distortion + lambda*rate */ + WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda; + + /* cost due to mbtype */ + UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda; + + /* intra mode */ + UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode; + + /* neighbor pels for intra prediction */ + UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels; + + /* pointer to curr partition */ + UWORD8 *pu1_mb_curr; + UWORD8 *pu1_mb_ref_left, *pu1_mb_ref_top; + UWORD8 *pu1_ref_mb_intra_4x4; + + /* pointer to residual macro block */ + WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4; + + /* pointer to prediction macro block */ + UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb; + + /* strides */ + WORD32 i4_src_strd = ps_proc->i4_src_strd; + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + WORD32 i4_ref_strd_left, i4_ref_strd_top; + + /* neighbors left, top, top right, top left */ + UWORD8 *pu1_mb_a; + UWORD8 *pu1_mb_b; + UWORD8 *pu1_mb_c; + UWORD8 *pu1_mb_d; + + /* number of non zero coeffs*/ + UWORD8 *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4; + + /* quantization parameters */ + quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0]; + + /* neighbor availability */ + WORD32 i4_ngbr_avbl; + block_neighbors_t s_ngbr_avbl; + + /* temp vars */ + UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y; + + /* scan order inside 4x4 block */ + const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 
11, 14, 15}; + + /* ngbr sub mb modes */ + UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4); + mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x; + + /* valid intra modes map */ + UWORD32 u4_valid_intra_modes; + UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511}; + + /* Dummy variable for 4x4 trans function */ + WORD16 i2_dc_dummy; + + /* compute ngbr availability for sub blks */ + i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_c << 3); + memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16); + + for(b8 = 0; b8 < 4; b8++) + { + u4_blk_x = (b8 & 0x01) << 3; + u4_blk_y = (b8 >> 1) << 3; + for(b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE) + { + u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2); + u4_pix_y = u4_blk_y + ((b4 >> 1) << 2); + + pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4 + u4_pix_x + (u4_pix_y * i4_pred_strd); + pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd); + if (u4_pix_x == 0) + { + i4_ref_strd_left = ps_proc->i4_rec_strd; + pu1_mb_ref_left = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_left); + } + else + { + i4_ref_strd_left = i4_pred_strd; + pu1_mb_ref_left = pu1_ref_mb_intra_4x4; + } + if (u4_pix_y == 0) + { + i4_ref_strd_top = ps_proc->i4_rec_strd; + pu1_mb_ref_top = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_top); + } + else + { + i4_ref_strd_top = i4_pred_strd; + pu1_mb_ref_top = pu1_ref_mb_intra_4x4; + } + + pu1_mb_a = pu1_mb_ref_left - 1; /* pointer to left macro block */ + pu1_mb_b = pu1_mb_ref_top - i4_ref_strd_top; /* pointer to top macro block */ + pu1_mb_c = pu1_mb_b + 4; /* pointer to top right macro block */ + if (u4_pix_y == 0) + pu1_mb_d = pu1_mb_b - 1; + else + pu1_mb_d = pu1_mb_a - i4_ref_strd_left; /* pointer to top left 
macro block */ + + /* locating neighbors that are available for prediction */ + /* TODO : update the neighbor availability information basing on constrained intra pred information */ + /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */ + /* basing on neighbors available and hence evade the computation of neighbor availability totally. */ + + i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4]; + s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1); + s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1; + s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2; + s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3; + /* set valid intra modes for evaluation */ + u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7]; + + /* if top partition is available and top right is not available for intra prediction, then */ + /* padd top right samples using top sample and make top right also available */ + /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */ + + /* gather prediction pels from the neighbors */ + if (s_ngbr_avbl.u1_mb_a) + { + for(i = 0; i < 4; i++) + pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_ref_strd_left]; + } + else + { + memset(pu1_ngbr_pels_i4,0,4); + } + if(s_ngbr_avbl.u1_mb_b) + { + memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4); + } + else + { + memset(pu1_ngbr_pels_i4 + 4 + 1, 0, 4); + } + if (s_ngbr_avbl.u1_mb_d) + pu1_ngbr_pels_i4[4] = *pu1_mb_d; + else + pu1_ngbr_pels_i4[4] = 0; + if (s_ngbr_avbl.u1_mb_c) + { + memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4); + } + else if (s_ngbr_avbl.u1_mb_b) + { + memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4); + s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b; + } + + i4_partition_cost_least = INT_MAX; + + /* predict the intra 4x4 mode for the current partition (for evaluating cost) */ + if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b) + { + 
u4_estimated_intra_4x4_mode = DC_I4x4; + } + else + { + UWORD32 u4_left_intra_4x4_mode = DC_I4x4; + UWORD32 u4_top_intra_4x4_mode = DC_I4x4; + + if (u4_pix_x == 0) + { + if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4) + { + u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]]; + } + else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8) + { + u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1]; + } + } + else + { + u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]]; + } + + if (u4_pix_y == 0) + { + if (ps_top_mb_syn_ele->u2_mb_type == I4x4) + { + u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]]; + } + else if (ps_top_mb_syn_ele->u2_mb_type == I8x8) + { + u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2]; + } + } + else + { + u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]]; + } + + u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode); + } + + ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode; + + /*mode evaluation and prediction*/ + ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr, + pu1_ngbr_pels_i4, + pu1_pred_mb, i4_src_strd, + i4_pred_strd, i4_ngbr_avbl, + &u4_best_intra_4x4_mode, + &i4_partition_cost_least, + u4_valid_intra_modes, + u4_lambda, + u4_estimated_intra_4x4_mode); + + + i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode)?u4_cost_one_bit:u4_cost_four_bits); + + DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode); + + /* macroblock distortion */ + i4_total_distortion += i4_partition_distortion_least; + i4_total_cost += i4_partition_cost_least; + + /* mb partition mode */ + ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode; + + + 
/********************************************************/ + /* error estimation, */ + /* transform */ + /* quantization */ + /********************************************************/ + ps_codec->pf_resi_trans_quant_4x4(pu1_mb_curr, pu1_pred_mb, + pi2_res_mb, i4_src_strd, + i4_pred_strd, + /* No op stride, this implies a buff of lenght 1x16 */ + ps_qp_params->pu2_scale_mat, + ps_qp_params->pu2_thres_mat, + ps_qp_params->u1_qbits, + ps_qp_params->u4_dead_zone, + pu1_nnz, &i2_dc_dummy); + + /********************************************************/ + /* ierror estimation, */ + /* itransform */ + /* iquantization */ + /********************************************************/ + ps_codec->pf_iquant_itrans_recon_4x4(pi2_res_mb, pu1_pred_mb, + pu1_ref_mb_intra_4x4, + i4_pred_strd, i4_pred_strd, + ps_qp_params->pu2_iscale_mat, + ps_qp_params->pu2_weigh_mat, + ps_qp_params->u1_qp_div, + ps_proc->pv_scratch_buff, 0, + NULL); + } + } + + /* update the type of the mb if necessary */ + if (i4_total_cost < ps_proc->i4_mb_cost) + { + ps_proc->i4_mb_cost = i4_total_cost; + ps_proc->i4_mb_distortion = i4_total_distortion; + ps_proc->u4_mb_type = I4x4; + } + + return ; +} + +/** +****************************************************************************** +* +* @brief +* evaluate best chroma intra 8x8 mode (rate distortion opt off) +* +* @par Description +* This function evaluates all the possible chroma intra 8x8 modes and finds +* the mode that best represents the macroblock (least distortion) and occupies +* fewer bits in the bitstream. 
+* +* @param[in] ps_proc_ctxt +* pointer to macroblock context (handle) +* +* @remarks +* For chroma best intra pred mode is calculated based only on SAD +* +* @returns none +* +****************************************************************************** +*/ + +void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc) +{ + /* Codec Context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* SAD(distortion metric) of an 8x8 block */ + WORD32 i4_mb_distortion, i4_chroma_mb_distortion; + + /* intra mode */ + UWORD32 u4_best_chroma_intra_8x8_mode = DC_CH_I8x8; + + /* neighbor pels for intra prediction */ + UWORD8 *pu1_ngbr_pels_c_i8x8 = ps_proc->au1_ngbr_pels; + + /* pointer to curr macro block */ + UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma; + UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma; + + /* pointer to prediction macro block */ + UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma; + UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane; + + /* strides */ + WORD32 i4_src_strd_c = ps_proc->i4_src_strd; + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd; + + /* neighbors left, top, top left */ + UWORD8 *pu1_mb_a = pu1_ref_mb - 2; + UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd_c; + UWORD8 *pu1_mb_d = pu1_mb_b - 2; + + /* neighbor availability */ + const UWORD8 u1_valid_intra_modes[8] = {1, 3, 9, 11, 5, 7, 13, 15,}; + WORD32 i4_ngbr_avbl; + + /* valid intra modes map */ + UWORD32 u4_valid_intra_modes; + + /* temp var */ + UWORD8 i; + + /* locating neighbors that are available for prediction */ + /* TODO : update the neighbor availability information basing on constrained intra pred information */ + /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines + * basing on neighbors available and hence evade the computation of neighbor availability totally. 
*/ + /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */ + i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1); + ps_proc->i4_chroma_neighbor_avail_8x8_mb = i4_ngbr_avbl; + + /* gather prediction pels from the neighbors */ + /* left pels */ + if (ps_proc->ps_ngbr_avbl->u1_mb_a) + { + for (i = 0; i < 16; i += 2) + { + pu1_ngbr_pels_c_i8x8[16 - 2 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c]; + pu1_ngbr_pels_c_i8x8[16 - 1 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c + 1]; + } + } + else + { + ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_c_i8x8, 0, MB_SIZE); + } + + /* top pels */ + if (ps_proc->ps_ngbr_avbl->u1_mb_b) + { + ps_codec->pf_mem_cpy_mul8(&pu1_ngbr_pels_c_i8x8[18], pu1_mb_b, 16); + } + else + { + ps_codec->pf_mem_set_mul8((pu1_ngbr_pels_c_i8x8 + 18), 0, MB_SIZE); + } + + /* top left pels */ + if (ps_proc->ps_ngbr_avbl->u1_mb_d) + { + pu1_ngbr_pels_c_i8x8[16] = *pu1_mb_d; + pu1_ngbr_pels_c_i8x8[17] = *(pu1_mb_d + 1); + } + + u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl]; + + if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST) + u4_valid_intra_modes &= ~(1 << PLANE_CH_I8x8); + + i4_chroma_mb_distortion = INT_MAX; + + /* perform intra mode chroma 8x8 evaluation */ + /* intra prediction */ + ps_codec->pf_ih264e_evaluate_intra_chroma_modes(pu1_curr_mb, + pu1_ngbr_pels_c_i8x8, + pu1_pred_mb, + i4_src_strd_c, + i4_pred_strd, + i4_ngbr_avbl, + &u4_best_chroma_intra_8x8_mode, + &i4_chroma_mb_distortion, + u4_valid_intra_modes); + + if (u4_valid_intra_modes & 8)/* if Chroma PLANE is valid*/ + { + (ps_codec->apf_intra_pred_c)[PLANE_CH_I8x8](pu1_ngbr_pels_c_i8x8, pu1_pred_mb_plane, 0, i4_pred_strd, i4_ngbr_avbl); + + /* evaluate distortion(sad) */ + ps_codec->pf_compute_sad_16x8(pu1_curr_mb, pu1_pred_mb_plane, i4_src_strd_c, i4_pred_strd, i4_chroma_mb_distortion, &i4_mb_distortion); + + /* update the least distortion information 
if necessary */ + if(i4_mb_distortion < i4_chroma_mb_distortion) + { + i4_chroma_mb_distortion = i4_mb_distortion; + u4_best_chroma_intra_8x8_mode = PLANE_CH_I8x8; + } + } + + DEBUG("%d partition cost, %d intra mode\n", i4_chroma_mb_distortion, u4_best_chroma_intra_8x8_mode); + + ps_proc->u1_c_i8_mode = u4_best_chroma_intra_8x8_mode; + + return ; +} + + +/** +****************************************************************************** +* +* @brief +* Evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the +* prediction. +* +* @par Description +* This function evaluates first three 16x16 modes and compute corresponding sad +* and return the buffer predicted with best mode. +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] pu1_ngbr_pels_i16 +* UWORD8 pointer to neighbouring pels +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_n_avblty +* availability of neighbouring pixels +* +* @param[in] u4_intra_mode +* Pointer to the variable in which best mode is returned +* +* @param[in] pu4_sadmin +* Pointer to the variable in which minimum sad is returned +* +* @param[in] u4_valid_intra_modes +* Says what all modes are valid +* +* @returns none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src, + UWORD8 *pu1_ngbr_pels_i16, + UWORD8 *pu1_dst, + UWORD32 src_strd, + UWORD32 dst_strd, + WORD32 u4_n_avblty, + UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes) +{ + UWORD8 *pu1_neighbour; + UWORD8 *pu1_src_temp = pu1_src; + UWORD8 left = 0, top = 0; + WORD32 u4_dcval = 0; + WORD32 i, j; + WORD32 i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX, i4_sad_dc = INT_MAX, + i4_min_sad = INT_MAX; + UWORD8 val; + + left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK); + top = (u4_n_avblty & 
TOP_MB_AVAILABLE_MASK) >> 2; + + /* left available */ + if (left) + { + i4_sad_horz = 0; + + for (i = 0; i < 16; i++) + { + val = pu1_ngbr_pels_i16[15 - i]; + + u4_dcval += val; + + for (j = 0; j < 16; j++) + { + i4_sad_horz += ABS(val - pu1_src_temp[j]); + } + + pu1_src_temp += src_strd; + } + u4_dcval += 8; + } + + pu1_src_temp = pu1_src; + /* top available */ + if (top) + { + i4_sad_vert = 0; + + for (i = 0; i < 16; i++) + { + u4_dcval += pu1_ngbr_pels_i16[17 + i]; + + for (j = 0; j < 16; j++) + { + i4_sad_vert += ABS(pu1_ngbr_pels_i16[17 + j] - pu1_src_temp[j]); + } + pu1_src_temp += src_strd; + + } + u4_dcval += 8; + } + + u4_dcval = (u4_dcval) >> (3 + left + top); + + pu1_src_temp = pu1_src; + + /* none available */ + u4_dcval += (left == 0) * (top == 0) * 128; + + i4_sad_dc = 0; + + for (i = 0; i < 16; i++) + { + for (j = 0; j < 16; j++) + { + i4_sad_dc += ABS(u4_dcval - pu1_src_temp[j]); + } + pu1_src_temp += src_strd; + } + + if ((u4_valid_intra_modes & 04) == 0)/* If DC is disabled */ + i4_sad_dc = INT_MAX; + + if ((u4_valid_intra_modes & 01) == 0)/* If VERT is disabled */ + i4_sad_vert = INT_MAX; + + if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled */ + i4_sad_horz = INT_MAX; + + i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert); + + /* Finding Minimum sad and doing corresponding prediction */ + if (i4_min_sad < *pu4_sadmin) + { + *pu4_sadmin = i4_min_sad; + if (i4_min_sad == i4_sad_vert) + { + *u4_intra_mode = VERT_I16x16; + pu1_neighbour = pu1_ngbr_pels_i16 + 17; + for (j = 0; j < 16; j++) + { + memcpy(pu1_dst, pu1_neighbour, MB_SIZE); + pu1_dst += dst_strd; + } + } + else if (i4_min_sad == i4_sad_horz) + { + *u4_intra_mode = HORZ_I16x16; + for (j = 0; j < 16; j++) + { + val = pu1_ngbr_pels_i16[15 - j]; + memset(pu1_dst, val, MB_SIZE); + pu1_dst += dst_strd; + } + } + else + { + *u4_intra_mode = DC_I16x16; + for (j = 0; j < 16; j++) + { + memset(pu1_dst, u4_dcval, MB_SIZE); + pu1_dst += dst_strd; + } + } + } + return; +} + +/** 
+****************************************************************************** +* +* @brief +* Evaluate best intra 4x4 mode and perform prediction. +* +* @par Description +* This function evaluates 4x4 modes and compute corresponding sad +* and return the buffer predicted with best mode. +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] pu1_ngbr_pels +* UWORD8 pointer to neighbouring pels +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_n_avblty +* availability of neighbouring pixels +* +* @param[in] u4_intra_mode +* Pointer to the variable in which best mode is returned +* +* @param[in] pu4_sadmin +* Pointer to the variable in which minimum cost is returned +* +* @param[in] u4_valid_intra_modes +* Says what all modes are valid +* +* @param[in] u4_lambda +* Lamda value for computing cost from SAD +* +* @param[in] u4_predictd_mode +* Predicted mode for cost computation +* +* @returns none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src, + UWORD8 *pu1_ngbr_pels, + UWORD8 *pu1_dst, + UWORD32 src_strd, + UWORD32 dst_strd, + WORD32 u4_n_avblty, + UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes, + UWORD32 u4_lambda, + UWORD32 u4_predictd_mode) +{ + UWORD8 *pu1_src_temp = pu1_src; + UWORD8 *pu1_pred = pu1_ngbr_pels; + UWORD8 left = 0, top = 0; + UWORD8 u1_pred_val = 0; + UWORD8 u1_pred_vals[4] = {0}; + UWORD8 *pu1_pred_val = NULL; + /* To store FILT121 operated values*/ + UWORD8 u1_pred_vals_diag_121[15] = {0}; + /* To store FILT11 operated values*/ + UWORD8 u1_pred_vals_diag_11[15] = {0}; + UWORD8 u1_pred_vals_vert_r[8] = {0}; + UWORD8 u1_pred_vals_horz_d[10] = {0}; + UWORD8 u1_pred_vals_horz_u[10] = {0}; + WORD32 u4_dcval = 0; + WORD32 i4_sad[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, 
INT_MAX, INT_MAX, + INT_MAX, INT_MAX, INT_MAX, INT_MAX}; + + WORD32 i4_cost[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX, + INT_MAX, INT_MAX, INT_MAX, INT_MAX}; + WORD32 i, i4_min_cost = INT_MAX; + + left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK); + top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2; + + /* Computing SAD */ + + /* VERT mode valid */ + if (u4_valid_intra_modes & 1) + { + pu1_pred = pu1_ngbr_pels + 5; + i4_sad[VERT_I4x4] = 0; + i4_cost[VERT_I4x4] = 0; + + USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]); + + i4_cost[VERT_I4x4] = i4_sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ? + u4_lambda : 4 * u4_lambda); + } + + /* HORZ mode valid */ + if (u4_valid_intra_modes & 2) + { + i4_sad[HORZ_I4x4] = 0; + i4_cost[HORZ_I4x4] =0; + pu1_src_temp = pu1_src; + + u1_pred_val = pu1_ngbr_pels[3]; + + i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val) + + ABS(pu1_src_temp[1] - u1_pred_val) + + ABS(pu1_src_temp[2] - u1_pred_val) + + ABS(pu1_src_temp[3] - u1_pred_val); + pu1_src_temp += src_strd; + + u1_pred_val = pu1_ngbr_pels[2]; + + i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val) + + ABS(pu1_src_temp[1] - u1_pred_val) + + ABS(pu1_src_temp[2] - u1_pred_val) + + ABS(pu1_src_temp[3] - u1_pred_val); + pu1_src_temp += src_strd; + + u1_pred_val = pu1_ngbr_pels[1]; + + i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val) + + ABS(pu1_src_temp[1] - u1_pred_val) + + ABS(pu1_src_temp[2] - u1_pred_val) + + ABS(pu1_src_temp[3] - u1_pred_val); + pu1_src_temp += src_strd; + + u1_pred_val = pu1_ngbr_pels[0]; + + i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val) + + ABS(pu1_src_temp[1] - u1_pred_val) + + ABS(pu1_src_temp[2] - u1_pred_val) + + ABS(pu1_src_temp[3] - u1_pred_val); + + i4_cost[HORZ_I4x4] = 
i4_sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ? + u4_lambda : 4 * u4_lambda); + } + + /* DC mode valid */ + if (u4_valid_intra_modes & 4) + { + i4_sad[DC_I4x4] = 0; + i4_cost[DC_I4x4] = 0; + pu1_src_temp = pu1_src; + + if (left) + u4_dcval = pu1_ngbr_pels[0] + pu1_ngbr_pels[1] + pu1_ngbr_pels[2] + + pu1_ngbr_pels[3] + 2; + if (top) + u4_dcval += pu1_ngbr_pels[5] + pu1_ngbr_pels[6] + pu1_ngbr_pels[7] + + pu1_ngbr_pels[8] + 2; + + u4_dcval = (u4_dcval) ? (u4_dcval >> (1 + left + top)) : 128; + + /* none available */ + memset(u1_pred_vals, u4_dcval, 4); + USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]); + pu1_src_temp += src_strd; + + i4_cost[DC_I4x4] = i4_sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ? + u4_lambda : 4 * u4_lambda); + } + + /* if modes other than VERT, HORZ and DC are valid */ + if (u4_valid_intra_modes > 7) + { + pu1_pred = pu1_ngbr_pels; + pu1_pred[13] = pu1_pred[14] = pu1_pred[12]; + + /* Performing FILT121 and FILT11 operation for all neighbour values*/ + for (i = 0; i < 13; i++) + { + u1_pred_vals_diag_121[i] = FILT121(pu1_pred[0], pu1_pred[1], pu1_pred[2]); + u1_pred_vals_diag_11[i] = FILT11(pu1_pred[0], pu1_pred[1]); + + pu1_pred++; + } + + if (u4_valid_intra_modes & 8)/* DIAG_DL */ + { + i4_sad[DIAG_DL_I4x4] = 0; + i4_cost[DIAG_DL_I4x4] = 0; + pu1_src_temp = pu1_src; + pu1_pred_val = u1_pred_vals_diag_121 + 5; + + USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DL_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val + 1), i4_sad[DIAG_DL_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[DIAG_DL_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val + 3), i4_sad[DIAG_DL_I4x4]); + pu1_src_temp += src_strd; + 
i4_cost[DIAG_DL_I4x4] = i4_sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ? + u4_lambda : 4 * u4_lambda); + } + + if (u4_valid_intra_modes & 16)/* DIAG_DR */ + { + i4_sad[DIAG_DR_I4x4] = 0; + i4_cost[DIAG_DR_I4x4] = 0; + pu1_src_temp = pu1_src; + pu1_pred_val = u1_pred_vals_diag_121 + 3; + + USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DR_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val - 1), i4_sad[DIAG_DR_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val - 2), i4_sad[DIAG_DR_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val - 3), i4_sad[DIAG_DR_I4x4]); + pu1_src_temp += src_strd; + i4_cost[DIAG_DR_I4x4] = i4_sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ? + u4_lambda : 4 * u4_lambda); + + } + + if (u4_valid_intra_modes & 32)/* VERT_R mode valid ????*/ + { + i4_sad[VERT_R_I4x4] = 0; + + pu1_src_temp = pu1_src; + u1_pred_vals_vert_r[0] = u1_pred_vals_diag_121[2]; + memcpy((u1_pred_vals_vert_r + 1), (u1_pred_vals_diag_11 + 4), 3); + u1_pred_vals_vert_r[4] = u1_pred_vals_diag_121[1]; + memcpy((u1_pred_vals_vert_r + 5), (u1_pred_vals_diag_121 + 3), 3); + + pu1_pred_val = u1_pred_vals_diag_11 + 4; + USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]); + pu1_pred_val = u1_pred_vals_diag_121 + 3; + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (u1_pred_vals_vert_r), i4_sad[VERT_R_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (u1_pred_vals_vert_r + 4), + i4_sad[VERT_R_I4x4]); + + i4_cost[VERT_R_I4x4] = i4_sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ? 
+ u4_lambda : 4 * u4_lambda); + } + + if (u4_valid_intra_modes & 64)/* HORZ_D mode valid ????*/ + { + i4_sad[HORZ_D_I4x4] = 0; + + pu1_src_temp = pu1_src; + u1_pred_vals_horz_d[6] = u1_pred_vals_diag_11[3]; + memcpy((u1_pred_vals_horz_d + 7), (u1_pred_vals_diag_121 + 3), 3); + u1_pred_vals_horz_d[0] = u1_pred_vals_diag_11[0]; + u1_pred_vals_horz_d[1] = u1_pred_vals_diag_121[0]; + u1_pred_vals_horz_d[2] = u1_pred_vals_diag_11[1]; + u1_pred_vals_horz_d[3] = u1_pred_vals_diag_121[1]; + u1_pred_vals_horz_d[4] = u1_pred_vals_diag_11[2]; + u1_pred_vals_horz_d[5] = u1_pred_vals_diag_121[2]; + + pu1_pred_val = u1_pred_vals_horz_d; + USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_D_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_D_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_D_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_D_I4x4]); + + i4_cost[HORZ_D_I4x4] = i4_sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ? + u4_lambda : 4 * u4_lambda); + } + + if (u4_valid_intra_modes & 128)/* VERT_L mode valid ????*/ + { + i4_sad[VERT_L_I4x4] = 0; + pu1_src_temp = pu1_src; + pu1_pred_val = u1_pred_vals_diag_11 + 5; + USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]); + pu1_src_temp += src_strd; + pu1_pred_val = u1_pred_vals_diag_121 + 5; + USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]); + pu1_src_temp += src_strd; + pu1_pred_val = u1_pred_vals_diag_11 + 6; + USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]); + pu1_src_temp += src_strd; + pu1_pred_val = u1_pred_vals_diag_121 + 6; + USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]); + + i4_cost[VERT_L_I4x4] = i4_sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ? 
+ u4_lambda : 4 * u4_lambda); + } + + if (u4_valid_intra_modes & 256)/* HORZ_U mode valid ????*/ + { + i4_sad[HORZ_U_I4x4] = 0; + pu1_src_temp = pu1_src; + u1_pred_vals_horz_u[0] = u1_pred_vals_diag_11[2]; + u1_pred_vals_horz_u[1] = u1_pred_vals_diag_121[1]; + u1_pred_vals_horz_u[2] = u1_pred_vals_diag_11[1]; + u1_pred_vals_horz_u[3] = u1_pred_vals_diag_121[0]; + u1_pred_vals_horz_u[4] = u1_pred_vals_diag_11[0]; + u1_pred_vals_horz_u[5] = FILT121(pu1_ngbr_pels[0], pu1_ngbr_pels[0], pu1_ngbr_pels[1]); + + memset((u1_pred_vals_horz_u + 6), pu1_ngbr_pels[0], 4); + + pu1_pred_val = u1_pred_vals_horz_u; + USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_U_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_U_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_U_I4x4]); + pu1_src_temp += src_strd; + USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_U_I4x4]); + + i4_cost[HORZ_U_I4x4] = i4_sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ? 
+ u4_lambda : 4 * u4_lambda); + } + + i4_min_cost = MIN3(MIN3(i4_cost[0], i4_cost[1], i4_cost[2]), + MIN3(i4_cost[3], i4_cost[4], i4_cost[5]), + MIN3(i4_cost[6], i4_cost[7], i4_cost[8])); + + } + else + { + /* Only first three modes valid */ + i4_min_cost = MIN3(i4_cost[0], i4_cost[1], i4_cost[2]); + } + + *pu4_sadmin = i4_min_cost; + + if (i4_min_cost == i4_cost[0]) + { + *u4_intra_mode = VERT_I4x4; + pu1_pred_val = pu1_ngbr_pels + 5; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val), 4); + } + else if (i4_min_cost == i4_cost[1]) + { + *u4_intra_mode = HORZ_I4x4; + memset(pu1_dst, pu1_ngbr_pels[3], 4); + pu1_dst += dst_strd; + memset(pu1_dst, pu1_ngbr_pels[2], 4); + pu1_dst += dst_strd; + memset(pu1_dst, pu1_ngbr_pels[1], 4); + pu1_dst += dst_strd; + memset(pu1_dst, pu1_ngbr_pels[0], 4); + } + else if (i4_min_cost == i4_cost[2]) + { + *u4_intra_mode = DC_I4x4; + memset(pu1_dst, u4_dcval, 4); + pu1_dst += dst_strd; + memset(pu1_dst, u4_dcval, 4); + pu1_dst += dst_strd; + memset(pu1_dst, u4_dcval, 4); + pu1_dst += dst_strd; + memset(pu1_dst, u4_dcval, 4); + } + + else if (i4_min_cost == i4_cost[3]) + { + *u4_intra_mode = DIAG_DL_I4x4; + pu1_pred_val = u1_pred_vals_diag_121 + 5; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val + 1), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val + 2), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val + 3), 4); + } + else if (i4_min_cost == i4_cost[4]) + { + *u4_intra_mode = DIAG_DR_I4x4; + pu1_pred_val = u1_pred_vals_diag_121 + 3; + + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val - 1), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val - 2), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val - 3), 4); + } + + else if (i4_min_cost == i4_cost[5]) + 
{ + *u4_intra_mode = VERT_R_I4x4; + pu1_pred_val = u1_pred_vals_diag_11 + 4; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + pu1_pred_val = u1_pred_vals_diag_121 + 3; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (u1_pred_vals_vert_r), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (u1_pred_vals_vert_r + 4), 4); + } + else if (i4_min_cost == i4_cost[6]) + { + *u4_intra_mode = HORZ_D_I4x4; + pu1_pred_val = u1_pred_vals_horz_d; + memcpy(pu1_dst, (pu1_pred_val + 6), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val + 4), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val + 2), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + } + else if (i4_min_cost == i4_cost[7]) + { + *u4_intra_mode = VERT_L_I4x4; + pu1_pred_val = u1_pred_vals_diag_11 + 5; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + pu1_pred_val = u1_pred_vals_diag_121 + 5; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + pu1_pred_val = u1_pred_vals_diag_11 + 6; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + pu1_pred_val = u1_pred_vals_diag_121 + 6; + memcpy(pu1_dst, (pu1_pred_val), 4); + } + else if (i4_min_cost == i4_cost[8]) + { + *u4_intra_mode = HORZ_U_I4x4; + pu1_pred_val = u1_pred_vals_horz_u; + memcpy(pu1_dst, (pu1_pred_val), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val + 2), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val + 4), 4); + pu1_dst += dst_strd; + memcpy(pu1_dst, (pu1_pred_val + 6), 4); + pu1_dst += dst_strd; + } + + return; +} + +/** +****************************************************************************** +* +* @brief: +* Evaluate best intr chroma mode (among VERT, HORZ and DC ) and do the prediction. +* +* @par Description +* This function evaluates first three intra chroma modes and compute corresponding sad +* and return the buffer predicted with best mode. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] pu1_ngbr_pels +* UWORD8 pointer to neighbouring pels +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_n_avblty +* availability of neighbouring pixels +* +* @param[in] u4_intra_mode +* Pointer to the variable in which best mode is returned +* +* @param[in] pu4_sadmin +* Pointer to the variable in which minimum sad is returned +* +* @param[in] u4_valid_intra_modes +* Says what all modes are valid +* +* @return none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src, + UWORD8 *pu1_ngbr_pels, + UWORD8 *pu1_dst, + UWORD32 src_strd, + UWORD32 dst_strd, + WORD32 u4_n_avblty, + UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes) +{ /* Computes the SAD of the DC, HORZ and VERT chroma predictions over 8 rows of 8 interleaved U,V pairs (even byte = U, odd byte = V). Only when the smallest SAD is below the value already stored in *pu4_sadmin does it write that SAD to *pu4_sadmin, the chosen mode to *u4_intra_mode and the predicted samples to pu1_dst; otherwise none of the three is written. */ + UWORD8 *pu1_neighbour; + UWORD8 *pu1_src_temp = pu1_src; + UWORD8 left = 0, top = 0; + WORD32 u4_dcval_u_l[2] = { 0, 0 }, /* sums of the left neighbours for 'U': [0] = upper four rows, [1] = lower four rows */ + u4_dcval_u_t[2] = { 0, 0 }; /* sums of the top neighbours for 'U': [0] = left four columns, [1] = right four columns */ + + WORD32 u4_dcval_v_l[2] = { 0, 0 }, /* sums of the left neighbours for 'V', same split as for 'U' */ + u4_dcval_v_t[2] = { 0, 0 }; /* sums of the top neighbours for 'V', same split as for 'U' */ + + WORD32 i, j, row, col, i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX, + i4_sad_dc = INT_MAX, i4_min_sad = INT_MAX; /* SADs start at INT_MAX so that a mode which is never evaluated cannot be selected by MIN3 */ + UWORD8 val_u, val_v; + + WORD32 u4_dc_val[2][2][2];/* ----------- + | | | Chroma can have four + | 00 | 01 | separate dc value... 
+ ----------- u4_dc_val corresponds to this dc values + | | | with u4_dc_val[2][2][U] and u4_dc_val[2][2][V] + | 10 | 11 | + ----------- */ + left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK); + top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2; /* NOTE(review): left and top are also used as shift amounts below, which assumes each is 0 or 1, i.e. that the masks are 0x1 and 0x4 - confirm against the mask definitions */ + + /* Evaluating HORZ: SAD of each source row against its left neighbour pair */ + if (left)/* If left is available */ + { + i4_sad_horz = 0; + + for (i = 0; i < 8; i++) + { + val_v = pu1_ngbr_pels[15 - 2 * i]; + val_u = pu1_ngbr_pels[15 - 2 * i - 1]; /* left neighbours are read backwards from byte 15: row i uses byte 14 - 2*i for U and byte 15 - 2*i for V */ + row = i / 4; /* 0 for rows 0..3, 1 for rows 4..7 */ + u4_dcval_u_l[row] += val_u; + u4_dcval_v_l[row] += val_v; + for (j = 0; j < 8; j++) + { + i4_sad_horz += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for HORZ mode*/ + i4_sad_horz += ABS(val_v - pu1_src_temp[2 * j + 1]); + } + + pu1_src_temp += src_strd; + } + u4_dcval_u_l[0] += 2; /* rounding term for the right shifts in the DC computation below */ + u4_dcval_u_l[1] += 2; + u4_dcval_v_l[0] += 2; + u4_dcval_v_l[1] += 2; + } + + /* Evaluating VERT: SAD of each source row against the top neighbour row */ + pu1_src_temp = pu1_src; + if (top) /* top available*/ + { + i4_sad_vert = 0; + + for (i = 0; i < 8; i++) /* iteration i both adds top pair i to the DC sums and adds the SAD of source row i */ + { + col = i / 4; /* 0 for pairs 0..3, 1 for pairs 4..7 */ + + val_u = pu1_ngbr_pels[18 + i * 2]; /* top neighbours occupy bytes 18..33 as U,V pairs */ + val_v = pu1_ngbr_pels[18 + i * 2 + 1]; + u4_dcval_u_t[col] += val_u; + u4_dcval_v_t[col] += val_v; + + for (j = 0; j < 16; j++) + { + i4_sad_vert += ABS(pu1_ngbr_pels[18 + j] - pu1_src_temp[j]);/* Finding SAD for VERT mode*/ + } + pu1_src_temp += src_strd; + + } + u4_dcval_u_t[0] += 2; /* rounding term for the right shifts in the DC computation below */ + u4_dcval_u_t[1] += 2; + u4_dcval_v_t[0] += 2; + u4_dcval_v_t[1] += 2; + } + + /* computing DC value*/ + /* Equation 8-128 in spec*/ /* quadrants 00 and 11 use both sides: with left and top each 0 or 1 the shift is 2 for one available side and 3 for both, matching the +2 added per available side */ + u4_dc_val[0][0][0] = (u4_dcval_u_l[0] + u4_dcval_u_t[0]) >> (1 + left + top); + u4_dc_val[0][0][1] = (u4_dcval_v_l[0] + u4_dcval_v_t[0]) >> (1 + left + top); + u4_dc_val[1][1][0] = (u4_dcval_u_l[1] + u4_dcval_u_t[1]) >> (1 + left + top); + u4_dc_val[1][1][1] = (u4_dcval_v_l[1] + u4_dcval_v_t[1]) >> (1 + left + top); + + if (top) /* quadrant 01 prefers the top sums and falls back to the upper left sums */ + { + /* Equation 8-132 in spec*/ + u4_dc_val[0][1][0] = (u4_dcval_u_t[1]) >> (1 + top); + u4_dc_val[0][1][1] = (u4_dcval_v_t[1]) >> (1 + top); + } + else + { + u4_dc_val[0][1][0] = (u4_dcval_u_l[0]) >> (1 + left); + u4_dc_val[0][1][1] = 
(u4_dcval_v_l[0]) >> (1 + left); + } + + if (left) /* quadrant 10 prefers the lower left sums and falls back to the left half of the top sums */ + { + u4_dc_val[1][0][0] = (u4_dcval_u_l[1]) >> (1 + left); + u4_dc_val[1][0][1] = (u4_dcval_v_l[1]) >> (1 + left); + } + else + { + u4_dc_val[1][0][0] = (u4_dcval_u_t[0]) >> (1 + top); + u4_dc_val[1][0][1] = (u4_dcval_v_t[0]) >> (1 + top); + } + + if (!(left || top)) + { + /* no neighbour available: every quadrant is overwritten with 128 */ + u4_dc_val[0][0][0] = u4_dc_val[0][0][1] = + u4_dc_val[0][1][0] = u4_dc_val[0][1][1] = + u4_dc_val[1][0][0] = u4_dc_val[1][0][1] = + u4_dc_val[1][1][0] = u4_dc_val[1][1][1] = 128; + } + + /* Evaluating DC */ + pu1_src_temp = pu1_src; + i4_sad_dc = 0; + for (i = 0; i < 8; i++) + { + for (j = 0; j < 8; j++) + { + col = j / 4; + row = i / 4; + val_u = u4_dc_val[row][col][0]; + val_v = u4_dc_val[row][col][1]; + + i4_sad_dc += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for DC mode*/ + i4_sad_dc += ABS(val_v - pu1_src_temp[2 * j + 1]); + } + pu1_src_temp += src_strd; + } + + if ((u4_valid_intra_modes & 01) == 0)/* If DC is disabled*/ + i4_sad_dc = INT_MAX; + if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled*/ + i4_sad_horz = INT_MAX; + if ((u4_valid_intra_modes & 04) == 0)/* If VERT is disabled*/ + i4_sad_vert = INT_MAX; /* i.e. bit 0 of u4_valid_intra_modes enables DC, bit 1 HORZ and bit 2 VERT */ + + i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert); + + /* Finding Minimum sad and doing corresponding prediction*/ + if (i4_min_sad < *pu4_sadmin) /* strict comparison: the value in *pu4_sadmin on entry is the SAD to beat */ + { + *pu4_sadmin = i4_min_sad; + + if (i4_min_sad == i4_sad_dc) /* on equal SADs DC is chosen first, then HORZ, then VERT */ + { + *u4_intra_mode = DC_CH_I8x8; + for (i = 0; i < 8; i++) + { + for (j = 0; j < 8; j++) + { + col = j / 4; + row = i / 4; + + pu1_dst[2 * j] = u4_dc_val[row][col][0]; + pu1_dst[2 * j + 1] = u4_dc_val[row][col][1]; + } + pu1_dst += dst_strd; + } + } + else if (i4_min_sad == i4_sad_horz) + { + *u4_intra_mode = HORZ_CH_I8x8; + for (j = 0; j < 8; j++) /* here j is the row and i the pair within the row */ + { + val_v = pu1_ngbr_pels[15 - 2 * j]; + val_u = pu1_ngbr_pels[15 - 2 * j - 1]; + + for (i = 0; i < 8; i++) + { + pu1_dst[2 * i] = val_u; + pu1_dst[2 * i + 1] = val_v; + + } + pu1_dst += dst_strd; + } + } + else + { + *u4_intra_mode = 
VERT_CH_I8x8; + pu1_neighbour = pu1_ngbr_pels + 18; + for (j = 0; j < 8; j++) + { + memcpy(pu1_dst, pu1_neighbour, MB_SIZE); /* every output row is a copy of the top neighbour row; presumably MB_SIZE is 16 bytes (8 U,V pairs) - confirm against its definition */ + pu1_dst += dst_strd; + } + } + } + + return; +} diff --git a/encoder/ih264e_intra_modes_eval.h b/encoder/ih264e_intra_modes_eval.h new file mode 100755 index 0000000..c8402e5 --- /dev/null +++ b/encoder/ih264e_intra_modes_eval.h @@ -0,0 +1,418 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_intra_modes_eval.h +* +* @brief +* This file contains declarations of routines that perform rate distortion +* analysis on a macroblock if coded as intra. 
+* +* @author +* ittiam +* +* @remarks +* none +* +******************************************************************************* +*/ + +#ifndef IH264E_INTRA_MODES_EVAL_H_ +#define IH264E_INTRA_MODES_EVAL_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief +* derivation process for macroblock availability +* +* @par Description +* Calculates the availability of the left, top, topright and topleft macroblocks. +* +* @param[in] ps_proc_ctxt +* pointer to proc context (handle) +* +* @remarks Based on section 6.4.5 in H264 spec +* +* @return none +* +****************************************************************************** +*/ +void ih264e_derive_nghbr_avbl_of_mbs + ( + process_ctxt_t *ps_proc_ctxt + ); + +/** +****************************************************************************** +* +* @brief +* derivation process for subblock/partition availability +* +* @par Description +* Calculates the availability of the left, top, topright and topleft subblock +* or partitions. +* +* @param[in] s_ngbr_avbl +* pointer to the neighbor availability of the current macroblock +* +* @param[in] i1_pel_pos_x +* column position of the pel wrt the current block +* +* @param[in] i1_pel_pos_y +* row position of the pel wrt the current block +* +* @remarks Assumptions: before calling this function it is assumed that +* the neighbor availability of the current macroblock is already derived. 
+* Based on table 6-3 of H264 specification +* +* @return availability status (yes or no) +* +****************************************************************************** +*/ +UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions + ( + block_neighbors_t *s_ngbr_avbl, + WORD8 i1_pel_pos_x, + WORD8 i1_pel_pos_y + ); + +/** +****************************************************************************** +* +* @brief +* evaluate best intra 16x16 mode (rate distortion opt off) +* +* @par Description +* This function evaluates all the possible intra 16x16 modes and finds the mode +* that best represents the macro-block (least distortion) and occupies fewer +* bits in the bit-stream. +* +* @param[in] ps_proc_ctxt +* pointer to process context (handle) +* +* @remarks +* Ideally the cost of encoding a macroblock is calculated as +* (distortion + lambda*rate). Where distortion is SAD/SATD,... between the +* input block and the reconstructed block and rate is the number of bits taken +* to place the macroblock in the bit-stream. In this routine the rate does not +* exactly point to the total number of bits it takes, rather it points to header +* bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits +* and residual bits fall in to texture bits the number of bits taken to encoding +* mbtype is considered as rate, we compute cost. Further we will approximate +* the distortion as the deviation b/w input and the predicted block as opposed +* to input and reconstructed block. +* +* NOTE: As per the Document JVT-O079, for intra 16x16 macroblock, +* the SAD and cost are one and the same. 
+* +* @return none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff + ( + process_ctxt_t *ps_proc_ctxt + ); + +/** +****************************************************************************** +* +* @brief +* evaluate best intra 8x8 mode (rate distortion opt off) +* +* @par Description +* This function evaluates all the possible intra 8x8 modes and finds the mode +* that best represents the macro-block (least distortion) and occupies fewer +* bits in the bit-stream. +* +* @param[in] ps_proc_ctxt +* pointer to proc ctxt +* +* @remarks Ideally the cost of encoding a macroblock is calculated as +* (distortion + lambda*rate). Where distortion is SAD/SATD,... between the +* input block and the reconstructed block and rate is the number of bits taken +* to place the macroblock in the bit-stream. In this routine the rate does not +* exactly point to the total number of bits it takes, rather it points to header +* bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits +* and residual bits fall in to texture bits the number of bits taken to encode +* mbtype is considered as rate, we compute cost. Further we will approximate +* the distortion as the deviation b/w input and the predicted block as opposed +* to input and reconstructed block. +* +* NOTE: TODO: This function needs to be tested +* +* @return none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff + ( + process_ctxt_t *ps_proc_ctxt + ); + +/** +****************************************************************************** +* +* @brief +* evaluate best intra 4x4 mode (rate distortion opt on) +* +* @par Description +* This function evaluates all the possible intra 4x4 modes and finds the mode +* that best represents the macro-block (least distortion) and occupies fewer +* bits in the bit-stream. 
+* +* @param[in] ps_proc_ctxt +* pointer to proc ctxt +* +* @remarks +* Ideally the cost of encoding a macroblock is calculated as +* (distortion + lambda*rate). Where distortion is SAD/SATD,... between the +* input block and the reconstructed block and rate is the number of bits taken +* to place the macroblock in the bit-stream. In this routine the rate does not +* exactly point to the total number of bits it takes, rather it points to header +* bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits +* and residual bits fall in to texture bits the number of bits taken to encoding +* mbtype is considered as rate, we compute cost. Further we will approximate +* the distortion as the deviation b/w input and the predicted block as opposed +* to input and reconstructed block. +* +* NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock, +* 24*lambda is added to the SAD before comparison with the best SAD for +* inter prediction. This is an empirical value to prevent using too many intra +* blocks. +* +* @return none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton + ( + process_ctxt_t *ps_proc_ctxt + ); + +/** +****************************************************************************** +* +* @brief +* evaluate best intra 4x4 mode (rate distortion opt off) +* +* @par Description +* This function evaluates all the possible intra 4x4 modes and finds the mode +* that best represents the macro-block (least distortion) and occupies fewer +* bits in the bit-stream. +* +* @param[in] ps_proc_ctxt +* pointer to proc ctxt +* +* @remarks +* Ideally the cost of encoding a macroblock is calculated as +* (distortion + lambda*rate). Where distortion is SAD/SATD,... between the +* input block and the reconstructed block and rate is the number of bits taken +* to place the macroblock in the bit-stream. 
In this routine the rate does not +* exactly point to the total number of bits it takes, rather it points to header +* bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits +* and residual bits fall in to texture bits the number of bits taken to encoding +* mbtype is considered as rate, we compute cost. Further we will approximate +* the distortion as the deviation b/w input and the predicted block as opposed +* to input and reconstructed block. +* +* NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock, +* 24*lambda is added to the SAD before comparison with the best SAD for +* inter prediction. This is an empirical value to prevent using too many intra +* blocks. +* +* @return none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff + ( + process_ctxt_t *ps_proc_ctxt + ); + +/** +****************************************************************************** +* +* @brief +* evaluate best chroma intra 8x8 mode (rate distortion opt off) +* +* @par Description +* This function evaluates all the possible chroma intra 8x8 modes and finds +* the mode that best represents the macroblock (least distortion) and occupies +* fewer bits in the bitstream. +* +* @param[in] ps_proc_ctxt +* pointer to macroblock context (handle) +* +* @remarks +* For chroma best intra pred mode is calculated based only on SAD +* +* @returns none +* +****************************************************************************** +*/ +void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff + ( + process_ctxt_t *ps_proc_ctxt + ); + + +/** +****************************************************************************** +* +* @brief +* Evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the +* prediction. 
+* +* @par Description +* This function evaluates first three 16x16 modes and compute corresponding sad +* and return the buffer predicted with best mode. +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] pu1_ngbr_pels_i16 +* UWORD8 pointer to neighbouring pels +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_n_avblty +* availability of neighbouring pixels +* +* @param[in] u4_intra_mode +* Pointer to the variable in which best mode is returned +* +* @param[in] pu4_sadmin +* Pointer to the variable in which minimum sad is returned +* +* @param[in] u4_valid_intra_modes +* Says what all modes are valid +* +* @returns none +* +****************************************************************************** +*/ +typedef void ih264e_evaluate_intra_modes_ft(UWORD8 *pu1_src, + UWORD8 *pu1_ngbr_pels_i16, + UWORD8 *pu1_dst, + UWORD32 src_strd, + UWORD32 dst_strd, + WORD32 u4_n_avblty, + UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes); + +ih264e_evaluate_intra_modes_ft ih264e_evaluate_intra16x16_modes; +ih264e_evaluate_intra_modes_ft ih264e_evaluate_intra_chroma_modes; + +/* assembly */ +ih264e_evaluate_intra_modes_ft ih264e_evaluate_intra16x16_modes_a9q; +ih264e_evaluate_intra_modes_ft ih264e_evaluate_intra_chroma_modes_a9q; + +ih264e_evaluate_intra_modes_ft ih264e_evaluate_intra16x16_modes_av8; +ih264e_evaluate_intra_modes_ft ih264e_evaluate_intra_chroma_modes_av8; + +/* x86 intrinsics */ +ih264e_evaluate_intra_modes_ft ih264e_evaluate_intra16x16_modes_ssse3; +ih264e_evaluate_intra_modes_ft ih264e_evaluate_intra_chroma_modes_ssse3; + +/** +****************************************************************************** +* +* @brief +* Evaluate best intra 4x4 mode and perform prediction. 
+* +* @par Description +* This function evaluates 4x4 modes, computes the corresponding sad +* and returns the buffer predicted with best mode. +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] pu1_ngbr_pels +* UWORD8 pointer to neighbouring pels +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_n_avblty +* availability of neighbouring pixels +* +* @param[out] u4_intra_mode +* Pointer to the variable in which best mode is returned +* +* @param[out] pu4_sadmin +* Pointer to the variable in which minimum cost is returned +* +* @param[in] u4_valid_intra_modes +* Says what all modes are valid +* +* @param[in] u4_lambda +* Lambda value for computing cost from SAD +* +* @param[in] u4_predictd_mode +* Predicted mode for cost computation +* +* @returns none +* +****************************************************************************** +*/ +typedef void ih264e_evaluate_intra_4x4_modes_ft(UWORD8 *pu1_src, + UWORD8 *pu1_ngbr_pels, + UWORD8 *pu1_dst, + UWORD32 src_strd, + UWORD32 dst_strd, + WORD32 u4_n_avblty, + UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes, + UWORD32 u4_lambda, + UWORD32 u4_predictd_mode); + +ih264e_evaluate_intra_4x4_modes_ft ih264e_evaluate_intra_4x4_modes; + +/* x86 intrinsics */ +ih264e_evaluate_intra_4x4_modes_ft ih264e_evaluate_intra_4x4_modes_ssse3; + +/* assembly */ +ih264e_evaluate_intra_4x4_modes_ft ih264e_evaluate_intra_4x4_modes_a9q; +ih264e_evaluate_intra_4x4_modes_ft ih264e_evaluate_intra_4x4_modes_av8; + +#endif /* IH264E_INTRA_MODES_EVAL_H_ */ diff --git a/encoder/ih264e_list.h b/encoder/ih264e_list.h new file mode 100755 index 0000000..782c007 --- /dev/null +++ b/encoder/ih264e_list.h @@ -0,0 +1,42 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_list.h +* +* @brief +* The file contains declarations of functions for encoder queue management +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_LIST_H_ +#define IH264E_LIST_H_ + + +#endif /* IH264E_LIST_H_ */ diff --git a/encoder/ih264e_master.h b/encoder/ih264e_master.h new file mode 100755 index 0000000..6c7505a --- /dev/null +++ b/encoder/ih264e_master.h @@ -0,0 +1,132 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_master.h +* +* @brief +* Contains declarations of functions used by master thread +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_MASTER_H_ +#define IH264E_MASTER_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief +* This function joins all the spawned threads after successful completion of +* their tasks +* +* @par Description +* +* @param[in] ps_codec +* pointer to codec context +* +* @returns none +* +****************************************************************************** +*/ +void ih264e_join_threads(codec_t *ps_codec); + +/** +****************************************************************************** +* +* @brief This function puts the current thread to sleep for a duration +* of sleep_us microseconds +* +* @par Description +* ithread_yield() method causes the calling thread to yield execution to another +* thread that is ready to run on the current processor. The operating system +* selects the thread to yield to. ithread_usleep blocks the current thread for +* the specified number of microseconds. In other words, yield just says, +* end my timeslice prematurely, look around for other threads to run. If there +* is nothing better than me, continue. Sleep says I don't want to run for x +* microseconds. 
Even if no other thread wants to run, don't make me run. +* +* @param[in] sleep_us +* thread sleep duration +* +* @returns error_status +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_wait_for_thread(UWORD32 sleep_us); + +/** +****************************************************************************** +* +* @brief +* Encodes in synchronous api mode +* +* @par Description +* This routine processes input yuv, encodes it and outputs bitstream and recon +* +* @param[in] ps_codec_obj +* Pointer to codec object at API level +* +* @param[in] pv_api_ip +* Pointer to input argument structure +* +* @param[out] pv_api_op +* Pointer to output argument structure +* +* @returns Status +* +****************************************************************************** +*/ +WORD32 ih264e_encode(iv_obj_t *ps_codec_obj, void *pv_api_ip, void *pv_api_op); + +/** +******************************************************************************* +* +* @brief update encoder configuration parameters +* +* @par Description: +* updates encoder configuration parameters from the given config set. +* Initialize/reinitialize codec parameters according to new configurations. +* +* @param[in] ps_codec +* Pointer to codec context +* +* @param[in] ps_cfg +* Pointer to config param set +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_codec_update_config(codec_t *ps_codec, cfg_params_t *ps_cfg); + +#endif /* IH264E_MASTER_H_ */ diff --git a/encoder/ih264e_mc.c b/encoder/ih264e_mc.c new file mode 100755 index 0000000..2dd0974 --- /dev/null +++ b/encoder/ih264e_mc.c @@ -0,0 +1,320 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_mc.c +* +* @brief +* Contains definition of functions for motion compensation +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_motion_comp_luma() +* - ih264e_motion_comp_chroma() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "ih264_defs.h" +#include "iv2.h" +#include "ive2.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_structs.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" 
+#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_mc.h" +#include "ih264e_half_pel.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief +* performs motion compensation for a luma mb for the given mv. +* +* @par Description +* This routine performs motion compensation of an inter mb. When the inter +* mb mode is P16x16, there is no need to copy 16x16 unit from reference buffer +* to pred buffer. In this case the function returns pointer and stride of the +* ref. buffer and this info is used in place of pred buffer else where. +* In other cases, the pred buffer is populated via copy / filtering + copy +* (q pel cases) and returned. +* +* @param[in] ps_proc +* pointer to current proc ctxt +* +* @param[out] pu1_pseudo_pred +* pseudo prediction buffer +* +* @param[out] u4_pseudo_pred_strd +* pseudo pred buffer stride +* +* @return none +* +* @remarks Assumes half pel buffers for the entire frame are populated. 
+* +****************************************************************************** +*/ +void ih264e_motion_comp_luma(process_ctxt_t *ps_proc, + UWORD8 **pu1_pseudo_pred, + WORD32 *pi4_pseudo_pred_strd) +{ + /* codec context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* me ctxt */ + me_ctxt_t *ps_me_ctxt = &ps_proc->s_me_ctxt; + + /* Pointer to the structure having motion vectors, size and position of curr partitions */ + enc_pu_t *ps_curr_pu; + + /* pointers to full pel, half pel x, half pel y, half pel xy reference buffer */ + UWORD8 *pu1_ref[4]; + + /* pred buffer ptr */ + UWORD8 *pu1_pred; + + /* strides of full pel, half pel x, half pel y, half pel xy reference buffer */ + WORD32 i4_ref_strd[4]; + + /* pred buffer stride */ + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + + /* full pel motion vectors */ + WORD32 u4_mv_x_full, u4_mv_y_full; + + /* half pel motion vectors */ + WORD32 u4_mv_x_hpel, u4_mv_y_hpel; + + /* quarter pel motion vectors */ + WORD32 u4_mv_x_qpel, u4_mv_y_qpel; + + /* width & height of the partition */ + UWORD32 wd, ht; + + /* partition idx */ + UWORD32 u4_num_prtn; + + /* half / qpel coefficient */ + UWORD32 u4_subpel_factor; + + /* temp var */ + UWORD32 u4_lkup_idx1; + + /* Init */ + i4_ref_strd[0] = ps_proc->i4_rec_strd; + + i4_ref_strd[1] = i4_ref_strd[2] = i4_ref_strd[3] = ps_me_ctxt->u4_hp_buf_strd; + + for (u4_num_prtn = 0; u4_num_prtn < ps_proc->u4_num_sub_partitions; u4_num_prtn++) + { + /* update ptr to curr partition */ + ps_curr_pu = ps_proc->ps_pu + u4_num_prtn; + + + /* get full pel mv's (full pel units) */ + u4_mv_x_full = ps_curr_pu->s_l0_mv.i2_mvx >> 2; + u4_mv_y_full = ps_curr_pu->s_l0_mv.i2_mvy >> 2; + + /* get half pel mv's */ + u4_mv_x_hpel = (ps_curr_pu->s_l0_mv.i2_mvx & 0x2) >> 1; + u4_mv_y_hpel = (ps_curr_pu->s_l0_mv.i2_mvy & 0x2) >> 1; + + /* get quarter pel mv's */ + u4_mv_x_qpel = (ps_curr_pu->s_l0_mv.i2_mvx & 0x1); + u4_mv_y_qpel = (ps_curr_pu->s_l0_mv.i2_mvy & 0x1); + + /* width and height of partition */ + 
wd = (ps_curr_pu->b4_wd + 1) << 2; + ht = (ps_curr_pu->b4_ht + 1) << 2; + + /* decision ? qpel/hpel, fpel */ + u4_subpel_factor = (u4_mv_y_hpel << 3) + (u4_mv_x_hpel << 2) + (u4_mv_y_qpel << 1) + (u4_mv_x_qpel); + + /* update ref buffer ptrs */ + pu1_ref[0] = ps_proc->pu1_ref_buf_luma + (u4_mv_y_full * i4_ref_strd[0]) + u4_mv_x_full; + + pu1_ref[1] = ps_proc->pu1_best_subpel_buf; + i4_ref_strd[1] = ps_proc->u4_bst_spel_buf_strd; + + + /* update pred buff ptr */ + pu1_pred = ps_proc->pu1_pred_mb + 4 * ps_curr_pu->b4_pos_y * i4_pred_strd + 4 * ps_curr_pu->b4_pos_x; + + /*u4_lkup_idx1 will be non zero for half pel*/ + u4_lkup_idx1 = (u4_subpel_factor >> 2 ) != 0 ; + + { + /********************************************************************/ + /* if the block is P16x16 MB and mv are not quarter pel motion */ + /* vectors, there is no need to copy 16x16 unit from reference frame*/ + /* to pred buffer. We might as well send the reference frame buffer */ + /* pointer as pred buffer (ofc with updated stride) to fwd transform*/ + /* and inverse transform unit. 
*/ + /********************************************************************/ + if (ps_proc->u4_num_sub_partitions == 1) + { + *pu1_pseudo_pred = pu1_ref[u4_lkup_idx1]; + *pi4_pseudo_pred_strd = i4_ref_strd[u4_lkup_idx1]; + + } + /* + * Copying half pel or full pel to prediction buffer + * Currently ps_proc->u4_num_sub_partitions will always be 1 as we only support 16x16 in P mbs + */ + else + { + ps_codec->pf_inter_pred_luma_copy(pu1_ref[u4_lkup_idx1], pu1_pred, i4_ref_strd[u4_lkup_idx1], i4_pred_strd, ht, wd, NULL, 0); + } + + } + } +} + +/** +****************************************************************************** +* +* @brief +* performs motion compensation for chroma mb +* +* @par Description +* Copies a MB of data from the reference buffer (Full pel, half pel or q pel) +* according to the motion vectors given +* +* @param[in] ps_proc +* pointer to current proc ctxt +* +* @return none +* +* @remarks Assumes half pel and quarter pel buffers for the entire frame are +* populated. 
+****************************************************************************** +*/ +void ih264e_motion_comp_chroma(process_ctxt_t *ps_proc) +{ + /* codec context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* Pointer to the structure having motion vectors, size and position of curr partitions */ + enc_pu_t *ps_curr_pu; + + /* pointers to full pel, half pel x, half pel y, half pel xy reference buffer */ + UWORD8 *pu1_ref; + + /* pred buffer ptr */ + UWORD8 *pu1_pred; + + /* strides of full pel reference buffer */ + WORD32 i4_ref_strd = ps_proc->i4_rec_strd; + + /* pred buffer stride */ + WORD32 i4_pred_strd = ps_proc->i4_pred_strd; + + /* full pel motion vectors */ + WORD32 u4_mv_x_full, u4_mv_y_full; + + /* half pel motion vectors */ + WORD32 u4_mv_x_hpel, u4_mv_y_hpel; + + /* quarter pel motion vectors */ + WORD32 u4_mv_x_qpel, u4_mv_y_qpel; + + /* width & height of the partition */ + UWORD32 wd, ht; + + /* partition idx */ + UWORD32 u4_num_prtn; + + WORD32 u4_mv_x; + WORD32 u4_mv_y; + UWORD8 u1_dx, u1_dy; + + for (u4_num_prtn = 0; u4_num_prtn < ps_proc->u4_num_sub_partitions; u4_num_prtn++) + { + ps_curr_pu =ps_proc->ps_pu + u4_num_prtn; + + u4_mv_x = ps_curr_pu->s_l0_mv.i2_mvx >> 3; + u4_mv_y = ps_curr_pu->s_l0_mv.i2_mvy >> 3; + + /* corresponds to full pel motion vector in luma, but in chroma corresponds to pel formed with dx, dy =4*/ + u4_mv_x_full = (ps_curr_pu->s_l0_mv.i2_mvx & 0x4) >> 2; + u4_mv_y_full = (ps_curr_pu->s_l0_mv.i2_mvy & 0x4) >> 2; + + /* get half pel mv's */ + u4_mv_x_hpel = (ps_curr_pu->s_l0_mv.i2_mvx & 0x2) >> 1; + u4_mv_y_hpel = (ps_curr_pu->s_l0_mv.i2_mvy & 0x2) >> 1; + + /* get quarter pel mv's */ + u4_mv_x_qpel = (ps_curr_pu->s_l0_mv.i2_mvx & 0x1); + u4_mv_y_qpel = (ps_curr_pu->s_l0_mv.i2_mvy & 0x1); + + /* width and height of sub macro block */ + wd = (ps_curr_pu->b4_wd + 1) << 1; + ht = (ps_curr_pu->b4_ht + 1) << 1; + + /* move the pointers so that they point to the motion compensated locations */ + pu1_ref = 
ps_proc->pu1_ref_buf_chroma + (u4_mv_y * i4_ref_strd) + (u4_mv_x << 1); + + pu1_pred = ps_proc->pu1_pred_mb + 4 * ps_curr_pu->b4_pos_y * i4_pred_strd + 2 * ps_curr_pu->b4_pos_x; + + u1_dx = (u4_mv_x_full << 2) + (u4_mv_x_hpel << 1) + (u4_mv_x_qpel); + u1_dy = (u4_mv_y_full << 2) + (u4_mv_y_hpel << 1) + (u4_mv_y_qpel); + + ps_codec->pf_inter_pred_chroma(pu1_ref, pu1_pred, i4_ref_strd, i4_pred_strd, + u1_dx, u1_dy, ht, wd); + } +} diff --git a/encoder/ih264e_mc.h b/encoder/ih264e_mc.h new file mode 100755 index 0000000..965e1d1 --- /dev/null +++ b/encoder/ih264e_mc.h @@ -0,0 +1,104 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_mc.h +* +* @brief +* This file contains declarations of routines that perform motion compensation +* of luma and chroma macroblocks. 
+* +* @author +* ittiam +* +* @remarks +* none +* +******************************************************************************* +*/ + +#ifndef IH264E_MC_H_ +#define IH264E_MC_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief +* performs motion compensation for a luma mb for the given mv. +* +* @par Description +* This routine performs motion compensation of an inter mb. When the inter +* mb mode is P16x16, there is no need to copy 16x16 unit from reference buffer +* to pred buffer. In this case the function returns pointer and stride of the +* ref. buffer and this info is used in place of pred buffer else where. +* In other cases, the pred buffer is populated via copy / filtering + copy +* (q pel cases) and returned. +* +* @param[in] ps_proc +* pointer to current proc ctxt +* +* @param[out] pu1_pseudo_pred +* pseudo prediction buffer +* +* @param[out] u4_pseudo_pred_strd +* pseudo pred buffer stride +* +* @return none +* +* @remarks Assumes half pel buffers for the entire frame are populated. +* +****************************************************************************** +*/ +void ih264e_motion_comp_luma(process_ctxt_t *ps_proc, + UWORD8 **pu1_pseudo_pred, + WORD32 *pi4_pseudo_pred_strd); + +/** +****************************************************************************** +* +* @brief +* performs motion compensation for chroma mb +* +* @par Description +* Copies a MB of data from the reference buffer (Full pel, half pel or q pel) +* according to the motion vectors given +* +* @param[in] ps_proc +* pointer to current proc ctxt +* +* @return none +* +* @remarks Assumes half pel and quarter pel buffers for the entire frame are +* populated. 
+****************************************************************************** +*/ +void ih264e_motion_comp_chroma + ( + process_ctxt_t *ps_proc + ); + + +#endif // IH264E_MC_H_ diff --git a/encoder/ih264e_me.c b/encoder/ih264e_me.c new file mode 100755 index 0000000..9e8d7a3 --- /dev/null +++ b/encoder/ih264e_me.c @@ -0,0 +1,1153 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * ih264e_me.c + * + * @brief + * Contains definition of functions for motion estimation + * + * @author + * ittiam + * + * @par List of Functions: + * - ih264e_init_mv_bits() + * - ih264e_skip_analysis_chroma() + * - ih264e_skip_analysis_luma() + * - ih264e_analyse_skip() + * - ih264e_get_search_candidates() + * - ih264e_find_skip_motion_vector() + * - ih264e_get_mv_predictor() + * - ih264e_mv_pred() + * - ih264e_mv_pred_me() + * - ih264e_init_me() + * - ih264e_compute_me() + * - ih264e_compute_me_nmb() + * + * @remarks + * None + * + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <assert.h> +#include <limits.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ithread.h" +#include "ih264_platform_macros.h" +#include "ih264_defs.h" +#include "ime_defs.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_globals.h" +#include "ih264_macros.h" +#include "ih264e_me.h" +#include "ime.h" +#include "ime_distortion_metrics.h" +#include "ih264_debug.h" +#include "ithread.h" +#include "ih264e_intra_modes_eval.h" +#include 
"ih264e_core_coding.h" +#include "ih264e_mc.h" +#include "ih264e_debug.h" +#include "ih264e_half_pel.h" +#include "ime_statistics.h" +#include "ih264e_platform_macros.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* This function populates the length of the codewords for motion vectors in the +* range (-search range, search range) in pixels +* +* @param[in] ps_me +* Pointer to me ctxt +* +* @param[out] pu1_mv_bits +* length of the codeword for all mv's +* +* @remarks The length of the code words are derived from signed exponential +* goloumb codes. +* +******************************************************************************* +*/ +void ih264e_init_mv_bits(me_ctxt_t *ps_me_ctxt) +{ + /* temp var */ + WORD32 i, codesize = 3, diff, limit; + UWORD32 u4_code_num, u4_range; + UWORD32 u4_uev_min, u4_uev_max, u4_sev_min, u4_sev_max; + + /* max srch range */ + diff = MAX(DEFAULT_MAX_SRCH_RANGE_X, DEFAULT_MAX_SRCH_RANGE_Y); + /* sub pel */ + diff <<= 2; + /* delta mv */ + diff <<= 1; + + /* codeNum for positive integer = 2x-1 : Table9-3 */ + u4_code_num = (diff << 1); + + /* get range of the bit string and put using put_bits() */ + GETRANGE(u4_range, u4_code_num); + + limit = 2*u4_range - 1; + + /* init mv bits */ + ps_me_ctxt->pu1_mv_bits[0] = 1; + + while (codesize < limit) + { + u4_uev_min = (1 << (codesize >> 1)); + u4_uev_max = 2*u4_uev_min - 1; + + u4_sev_min = u4_uev_min >> 1; + u4_sev_max = u4_uev_max >> 1; + + DEBUG("\n%d min, %d max %d codesize", u4_sev_min, u4_sev_max, codesize); + + for (i = u4_sev_min; i <= (WORD32)u4_sev_max; i++) + { + ps_me_ctxt->pu1_mv_bits[-i] = ps_me_ctxt->pu1_mv_bits[i] = codesize; + } + + codesize += 2; + } +} + +/** 
+******************************************************************************* +* +* @brief Determines the valid candidates for which the initial search shall happen. +* The best of these candidates is used to center the diamond pixel search. +* +* @par Description: The function sends the skip, (0,0), left, top and top-right +* neighbouring MBs MVs. The left, top and top-right MBs MVs are used because +* these are the same MVs that are used to form the MV predictor. A neighbouring +* MV is added only when ps_proc->ps_ngbr_avbl marks that MB as available; the +* top-left MV is used when the top-right MB is not available. +* +* @param[in] ps_proc +* pointer to process context; supplies the neighbouring MVs, the neighbour +* availability flags and the skip motion vector of the curr mb +* +* @param[in,out] ps_me_ctxt +* pointer to me context; supplies the search range and receives the candidate +* list, the number of candidates and the predicted mv +* +* @returns The list of MVs to be used for priming the full pel search and the +* number of such MVs (written to ps_me_ctxt) +* +* @remarks +* Assumptions : 1. Assumes Single reference frame +* 2.
Assumes Only partition of size 16x16 +* +******************************************************************************* +*/ +static void ih264e_get_search_candidates(process_ctxt_t *ps_proc, + me_ctxt_t *ps_me_ctxt) +{ + /* curr mb indices */ + WORD32 i4_mb_x = ps_proc->i4_mb_x; + + /* left mb motion vector */ + mv_t *ps_left_mv; + + /* top left mb motion vector */ + mv_t *ps_top_mv; + + /* top left mb motion vector */ + mv_t *ps_top_left_mv; + + /* top left mb motion vector */ + mv_t *ps_top_right_mv; + + /* skip mv */ + mv_t *ps_skip_mv = ps_proc->ps_skip_mv; + + /* mb part info */ + mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part; + + /* num of candidate search candidates */ + UWORD32 u4_num_candidates = 0; + + /* mvs */ + WORD32 mvx, mvy; + + /* ngbr availability */ + block_neighbors_t *ps_ngbr_avbl = ps_proc->ps_ngbr_avbl; + + /* srch range*/ + WORD32 i4_srch_range_n = ps_me_ctxt->i4_srch_range_n; + WORD32 i4_srch_range_s = ps_me_ctxt->i4_srch_range_s; + WORD32 i4_srch_range_e = ps_me_ctxt->i4_srch_range_e; + WORD32 i4_srch_range_w = ps_me_ctxt->i4_srch_range_w; + + ps_left_mv = &ps_proc->s_left_mb_pu_ME.s_l0_mv; + ps_top_mv = &(ps_proc->ps_top_row_pu_ME + i4_mb_x)->s_l0_mv; + ps_top_left_mv = &ps_proc->s_top_left_mb_pu_ME.s_l0_mv; + ps_top_right_mv = &(ps_proc->ps_top_row_pu_ME + i4_mb_x + 1)->s_l0_mv; + + /************************************************************/ + /* Taking the Zero motion vector as one of the candidates */ + /************************************************************/ + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = 0; + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = 0; + + u4_num_candidates++; + + /************************************************************/ + /* Taking the Left MV Predictor as one of the candidates */ + /************************************************************/ + if (ps_ngbr_avbl->u1_mb_a) + { + mvx = (ps_left_mv->i2_mvx + 2) >> 2; + mvy = (ps_left_mv->i2_mvy + 2) >> 2; + + mvx = 
CLIP3(i4_srch_range_w, i4_srch_range_e, mvx); + mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy); + + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx; + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy; + + u4_num_candidates ++; + } + /*else + { + ps_me_ctxt->as_mv_init_search[LEFT_CAND].i2_mvx = 0; + ps_me_ctxt->as_mv_init_search[LEFT_CAND].i2_mvy = 0; + }*/ + + /************************************************************/ + /* Taking the Top MV Predictor as one of the candidates */ + /************************************************************/ + if (ps_ngbr_avbl->u1_mb_b) + { + mvx = (ps_top_mv->i2_mvx + 2) >> 2; + mvy = (ps_top_mv->i2_mvy + 2) >> 2; + + mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx); + mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy); + + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx; + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy; + + u4_num_candidates ++; + + /************************************************************/ + /* Taking the TopRt MV Predictor as one of the candidates */ + /************************************************************/ + if (ps_ngbr_avbl->u1_mb_c) + { + mvx = (ps_top_right_mv->i2_mvx + 2) >> 2; + mvy = (ps_top_right_mv->i2_mvy + 2)>> 2; + + mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx); + mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy); + + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx; + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy; + + u4_num_candidates ++; + } + /************************************************************/ + /* Taking the TopLt MV Predictor as one of the candidates */ + /************************************************************/ + else if (ps_ngbr_avbl->u1_mb_d) + { + mvx = (ps_top_left_mv->i2_mvx + 2) >> 2; + mvy = (ps_top_left_mv->i2_mvy + 2) >> 2; + + mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx); + mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy); + + 
ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx; + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy; + + u4_num_candidates ++; + } + /*else + { + ps_me_ctxt->as_mv_init_search[TOPR_CAND].i2_mvx = 0; + ps_me_ctxt->as_mv_init_search[TOPR_CAND].i2_mvy = 0; + }*/ + } + /*else + { + ps_me_ctxt->as_mv_init_search[TOP_CAND].i2_mvx = 0; + ps_me_ctxt->as_mv_init_search[TOP_CAND].i2_mvy = 0; + + ps_me_ctxt->as_mv_init_search[TOPR_CAND].i2_mvx = 0; + ps_me_ctxt->as_mv_init_search[TOPR_CAND].i2_mvy = 0; + }*/ + + + /********************************************************************/ + /* MV Prediction */ + /********************************************************************/ + ih264e_mv_pred_me(ps_proc); + + ps_mb_part->s_mv_pred.i2_mvx = ps_proc->ps_pred_mv->i2_mvx; + ps_mb_part->s_mv_pred.i2_mvy = ps_proc->ps_pred_mv->i2_mvy; + + /************************************************************/ + /* Get the skip motion vector */ + /************************************************************/ + ih264e_find_skip_motion_vector(ps_proc, 1); + + /************************************************************/ + /* Taking the Skip motion vector as one of the candidates */ + /************************************************************/ + mvx = (ps_skip_mv->i2_mvx + 2) >> 2; + mvy = (ps_skip_mv->i2_mvy + 2) >> 2; + + mvx = CLIP3(i4_srch_range_w, i4_srch_range_e, mvx); + mvy = CLIP3(i4_srch_range_n, i4_srch_range_s, mvy); + + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvx = mvx; + ps_me_ctxt->as_mv_init_search[u4_num_candidates].i2_mvy = mvy; + + u4_num_candidates++; + + ASSERT(u4_num_candidates <= 5); + + ps_me_ctxt->u4_num_candidates = u4_num_candidates; +} + +/** +******************************************************************************* +* +* @brief The function gives the skip motion vector +* +* @par Description: +* The function gives the skip motion vector +* +* @param[in] ps_left_mb_pu +* pointer to left mb motion vector info +* +* 
@param[in] ps_top_row_pu +* pointer to top & top right mb motion vector info +* +* @param[out] ps_pred_mv +* pointer to candidate predictors for the current block +* +* @returns The x & y components of the MV predictor. +* +* @remarks The code implements the logic as described in sec 8.4.1.1 in H264 +* specification. +* +******************************************************************************* +*/ +void ih264e_find_skip_motion_vector(process_ctxt_t *ps_proc, UWORD32 u4_for_me) +{ + /* left mb motion vector */ + enc_pu_t *ps_left_mb_pu ; + + /* top mb motion vector */ + enc_pu_t *ps_top_mb_pu ; + + /* skip mv */ + mv_t *ps_skip_mv = ps_proc->ps_skip_mv; + + if (u4_for_me == 1) + { + ps_left_mb_pu = &ps_proc->s_left_mb_pu_ME; + ps_top_mb_pu = ps_proc->ps_top_row_pu_ME + ps_proc->i4_mb_x; + } + else + { + ps_left_mb_pu = &ps_proc->s_left_mb_pu ; + ps_top_mb_pu = ps_proc->ps_top_row_pu + ps_proc->i4_mb_x; + } + + if ( (!ps_proc->ps_ngbr_avbl->u1_mb_a) || + (!ps_proc->ps_ngbr_avbl->u1_mb_b) || + ((ps_left_mb_pu->i1_l0_ref_idx | ps_left_mb_pu->s_l0_mv.i2_mvx | ps_left_mb_pu->s_l0_mv.i2_mvy) == 0) || + ((ps_top_mb_pu->i1_l0_ref_idx | ps_top_mb_pu->s_l0_mv.i2_mvx | ps_top_mb_pu->s_l0_mv.i2_mvy) == 0) ) + { + ps_skip_mv->i2_mvx = 0; + ps_skip_mv->i2_mvy = 0; + } + else + { + ps_skip_mv->i2_mvx = ps_proc->ps_pred_mv->i2_mvx; + ps_skip_mv->i2_mvy = ps_proc->ps_pred_mv->i2_mvy; + } +} + +/** +******************************************************************************* +* +* @brief motion vector predictor +* +* @par Description: +* The routine calculates the motion vector predictor for a given block, +* given the candidate MV predictors. +* +* @param[in] ps_left_mb_pu +* pointer to left mb motion vector info +* +* @param[in] ps_top_row_pu +* pointer to top & top right mb motion vector info +* +* @param[out] ps_pred_mv +* pointer to candidate predictors for the current block +* +* @returns The x & y components of the MV predictor. 
+* +* @remarks The code implements the logic as described in sec 8.4.1.3 in H264 +* specification. +* Assumptions : 1. Assumes Single reference frame +* 2. Assumes Only partition of size 16x16 +* +******************************************************************************* +*/ +void ih264e_get_mv_predictor(enc_pu_t *ps_left_mb_pu, + enc_pu_t *ps_top_row_pu, + mv_t *ps_pred_mv) +{ + /* curr frame ref idx */ + /* we are assuming that we are operating on single reference frame + * hence the ref idx is insignificant during mv prediction. + */ + WORD32 u4_ref_idx = 0; + + /* temp var */ + WORD32 pred_algo = 3, a, b, c; + + /* If only one of the candidate blocks has a reference frame equal to + * the current block then use the same block as the final predictor */ + a = (ps_left_mb_pu->i1_l0_ref_idx == u4_ref_idx)? 0:-1; + b = (ps_top_row_pu[0].i1_l0_ref_idx == u4_ref_idx)? 0:-1; + c = (ps_top_row_pu[1].i1_l0_ref_idx == u4_ref_idx)? 0:-1; + + if (a == 0 && b == -1 && c == -1) + pred_algo = 0; /* LEFT */ + else if (a == -1 && b == 0 && c == -1) + pred_algo = 1; /* TOP */ + else if (a == -1 && b == -1 && c == 0) + pred_algo = 2; /* TOP RIGHT */ + + switch (pred_algo) + { + case 0: + /* left */ + ps_pred_mv->i2_mvx = ps_left_mb_pu->s_l0_mv.i2_mvx; + ps_pred_mv->i2_mvy = ps_left_mb_pu->s_l0_mv.i2_mvy; + break; + case 1: + /* top */ + ps_pred_mv->i2_mvx = ps_top_row_pu[0].s_l0_mv.i2_mvx; + ps_pred_mv->i2_mvy = ps_top_row_pu[0].s_l0_mv.i2_mvy; + break; + case 2: + /* top right */ + ps_pred_mv->i2_mvx = ps_top_row_pu[1].s_l0_mv.i2_mvx; + ps_pred_mv->i2_mvy = ps_top_row_pu[1].s_l0_mv.i2_mvy; + break; + case 3: + /* median */ + MEDIAN(ps_left_mb_pu->s_l0_mv.i2_mvx, + ps_top_row_pu[0].s_l0_mv.i2_mvx, + ps_top_row_pu[1].s_l0_mv.i2_mvx, + ps_pred_mv->i2_mvx); + MEDIAN(ps_left_mb_pu->s_l0_mv.i2_mvy, + ps_top_row_pu[0].s_l0_mv.i2_mvy, + ps_top_row_pu[1].s_l0_mv.i2_mvy, + ps_pred_mv->i2_mvy); + + break; + default: + break; + } +} + +/** 
+******************************************************************************* +* +* @brief This function performs MV prediction +* +* @par Description: +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns none +* +* @remarks none +* This function will update the MB availability since intra inter decision +* should be done before the call +* +******************************************************************************* +*/ +void ih264e_mv_pred(process_ctxt_t *ps_proc) +{ + + /* left mb motion vector */ + enc_pu_t *ps_left_mb_pu ; + + /* top left mb motion vector */ + enc_pu_t *ps_top_left_mb_pu ; + + /* top row motion vector info */ + enc_pu_t *ps_top_row_pu; + + /* predicted motion vector */ + mv_t *ps_pred_mv = ps_proc->ps_pred_mv; + + /* zero mv */ + mv_t zero_mv = {0, 0}; + + /* mb neighbor availability */ + block_neighbors_t *ps_ngbr_avbl = ps_proc->ps_ngbr_avbl; + + /* mb syntax elements of neighbors */ + mb_info_t *ps_top_syn = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x; + mb_info_t *ps_top_left_syn; + UWORD32 u4_left_is_intra; + + ps_top_left_syn = &(ps_proc->s_top_left_mb_syntax_ele); + u4_left_is_intra = ps_proc->s_left_mb_syntax_ele.u2_is_intra; + ps_left_mb_pu = &ps_proc->s_left_mb_pu; + ps_top_left_mb_pu = &ps_proc->s_top_left_mb_pu; + ps_top_row_pu = (ps_proc->ps_top_row_pu + ps_proc->i4_mb_x); + + /* Before performing mv prediction prepare the ngbr information and + * reset motion vectors basing on their availability */ + if (!ps_ngbr_avbl->u1_mb_a || (u4_left_is_intra == 1) ) + { + /* left mv */ + ps_left_mb_pu->i1_l0_ref_idx = -1; + ps_left_mb_pu->s_l0_mv = zero_mv; + } + if (!ps_ngbr_avbl->u1_mb_b || ps_top_syn->u2_is_intra) + { + /* top mv */ + ps_top_row_pu[0].i1_l0_ref_idx = -1; + ps_top_row_pu[0].s_l0_mv = zero_mv; + } + if (!ps_ngbr_avbl->u1_mb_c) + { + /* top right mv - When top right partition is not available for + * prediction if top left is available use it for prediction else + * set the mv 
information to -1 and (0, 0) + * */ + if (!ps_ngbr_avbl->u1_mb_d || ps_top_left_syn->u2_is_intra) + { + ps_top_row_pu[1].i1_l0_ref_idx = -1; + ps_top_row_pu[1].s_l0_mv = zero_mv; + } + else + { + ps_top_row_pu[1].i1_l0_ref_idx = ps_top_left_mb_pu->i1_l0_ref_idx; + ps_top_row_pu[1].s_l0_mv = ps_top_left_mb_pu->s_l0_mv; + } + } + else if (ps_top_syn[1].u2_is_intra) + { + ps_top_row_pu[1].i1_l0_ref_idx = -1; + ps_top_row_pu[1].s_l0_mv = zero_mv; + } + + ih264e_get_mv_predictor(ps_left_mb_pu, ps_top_row_pu, ps_pred_mv); +} + +/** +******************************************************************************* +* +* @brief This function approximates Pred. MV +* +* @par Description: +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns none +* +* @remarks none +* Motion estimation happens at nmb level. For cost calculations, mv is appro +* ximated using this function +* +******************************************************************************* +*/ +void ih264e_mv_pred_me(process_ctxt_t *ps_proc) +{ + /* left mb motion vector */ + enc_pu_t *ps_left_mb_pu ; + + /* top left mb motion vector */ + enc_pu_t *ps_top_left_mb_pu ; + + /* top row motion vector info */ + enc_pu_t *ps_top_row_pu; + + enc_pu_t s_top_row_pu[2]; + + /* predicted motion vector */ + mv_t *ps_pred_mv = ps_proc->ps_pred_mv; + + /* zero mv */ + mv_t zero_mv = {0, 0}; + + /* mb neighbor availability */ + block_neighbors_t *ps_ngbr_avbl = ps_proc->ps_ngbr_avbl; + + ps_left_mb_pu = &ps_proc->s_left_mb_pu_ME; + ps_top_left_mb_pu = &ps_proc->s_top_left_mb_pu_ME; + ps_top_row_pu = (ps_proc->ps_top_row_pu_ME + ps_proc->i4_mb_x); + + s_top_row_pu[0] = ps_top_row_pu[0]; + s_top_row_pu[1] = ps_top_row_pu[1]; + + /* Before performing mv prediction prepare the ngbr information and + * reset motion vectors basing on their availability */ + if (!ps_ngbr_avbl->u1_mb_a ) + { + /* left mv */ + ps_left_mb_pu->i1_l0_ref_idx = -1; + ps_left_mb_pu->s_l0_mv = zero_mv; + } + if 
(!ps_ngbr_avbl->u1_mb_b ) + { + /* top mv */ + s_top_row_pu[0].i1_l0_ref_idx = -1; + s_top_row_pu[0].s_l0_mv = zero_mv; + } + if (!ps_ngbr_avbl->u1_mb_c) + { + /* top right mv - When top right partition is not available for + * prediction if top left is available use it for prediction else + * set the mv information to -1 and (0, 0) + * */ + if (!ps_ngbr_avbl->u1_mb_d) + { + s_top_row_pu[1].i1_l0_ref_idx = -1; + s_top_row_pu[1].s_l0_mv = zero_mv; + } + else + { + s_top_row_pu[1].i1_l0_ref_idx = ps_top_left_mb_pu->i1_l0_ref_idx; + s_top_row_pu[1].s_l0_mv = ps_top_left_mb_pu->s_l0_mv; + } + } + + ih264e_get_mv_predictor(ps_left_mb_pu, &(s_top_row_pu[0]), ps_pred_mv); +} + +/** +******************************************************************************* +* +* @brief This function initializes me ctxt +* +* @par Description: +* Before dispatching the current job to me thread, the me context associated +* with the job is initialized. +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_me(process_ctxt_t *ps_proc) +{ + /* me ctxt */ + me_ctxt_t *ps_me_ctxt = &ps_proc->s_me_ctxt; + + /* src ptr */ + ps_me_ctxt->pu1_src_buf_luma = ps_proc->pu1_src_buf_luma; + + /* ref ptr */ + ps_me_ctxt->pu1_ref_buf_luma = ps_proc->pu1_ref_buf_luma; + + /* lagrange param */ + ps_me_ctxt->u4_lambda_motion = gu1_qp0[ps_me_ctxt->u1_mb_qp]; +} + +/** +******************************************************************************* +* +* @brief This function performs motion estimation for the current mb +* +* @par Description: +* The current mb is compared with a list of mb's in the reference frame for +* least cost. The mb that offers least cost is chosen as predicted mb and the +* displacement of the predicted mb from index location of the current mb is +* signaled as mv. 
The list of the mb's that are chosen in the reference frame
*  are dependent on the speed of the ME configured.
*
* @param[in] ps_proc
*  Process context corresponding to the job
*
* @returns  none. The results are written into the process context:
*  ps_proc->ps_pu (partition geometry, ref idx and mv), *ps_proc->pu4_mb_pu_cnt,
*  ps_proc->u4_num_sub_partitions, and - when the ME cost beats the cost
*  already stored there - the cost, distortion and type of ps_proc->ps_cur_mb.
*
* @remarks
*  ps_proc->ps_cur_mb, ps_proc->ps_skip_mv and the best sub pel buffer
*  fields of ps_proc are read here, so they have to be set by the caller.
*
*******************************************************************************
*/
void ih264e_compute_me(process_ctxt_t *ps_proc)
{
    /* me ctxt */
    me_ctxt_t *ps_me_ctxt = &ps_proc->s_me_ctxt;

    /* codec context */
    codec_t *ps_codec = ps_proc->ps_codec;

//    /* mb syntax elements of neighbors */
//    mb_info_t *ps_top_syn = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
//    mb_info_t *ps_top_left_syn = &(ps_proc->s_top_left_mb_syntax_ME);

    /* mb part info: best candidate so far, and the skip candidate */
    mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part;
    mb_part_ctxt skip_mb_part_info;

    /* temp var */
    WORD32 rows_above, rows_below, columns_left, columns_right,u4_use_stat_sad;

    /* Motion vectors in full-pel units */
    WORD16 mv_x, mv_y;

    /* recon stride */
    WORD32 i4_rec_strd = ps_proc->i4_rec_strd;

    /* source buffer for half pel generation functions */
    UWORD8 *pu1_hpel_src;

    /* quantization parameters */
    quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];

    /* Sad thresholds */
    ps_me_ctxt->pu2_sad_thrsh = ps_qp_params->pu2_sad_thrsh;

    /* Best half pel buffer (destination of the copy at the end) */
    UWORD8 *pu1_best_subpel_buf = ps_proc->pu1_best_subpel_buf;
    UWORD32 u4_bst_spel_strd = ps_proc->u4_bst_spel_buf_strd;

    /* During evaluation for motion vectors do not search through padded regions */
    /* Obtain number of rows and columns that are effective for computing for me evaluation */
    rows_above = MB_SIZE + ps_proc->i4_mb_y * MB_SIZE;
    rows_below = (ps_proc->i4_ht_mbs - ps_proc->i4_mb_y) * MB_SIZE;
    columns_left = MB_SIZE + ps_proc->i4_mb_x * MB_SIZE;
    columns_right = (ps_proc->i4_wd_mbs - ps_proc->i4_mb_x) * MB_SIZE;

    /* init srch range */
    /* NOTE : For now, lets limit the search range by DEFAULT_MAX_SRCH_RANGE_X / 2
     * on all sides.
     */
//    ps_me_ctxt->i4_srch_range_w = -MIN(columns_left, ps_me_ctxt->ai2_srch_boundaries[0]);
//    ps_me_ctxt->i4_srch_range_e = MIN(columns_right, ps_me_ctxt->ai2_srch_boundaries[0]);
//    ps_me_ctxt->i4_srch_range_n = -MIN(rows_above, ps_me_ctxt->ai2_srch_boundaries[1]);
//    ps_me_ctxt->i4_srch_range_s = MIN(rows_below, ps_me_ctxt->ai2_srch_boundaries[1]);

    /* west / north limits are stored as negative offsets */
    ps_me_ctxt->i4_srch_range_w = -MIN(columns_left, DEFAULT_MAX_SRCH_RANGE_X >> 1);
    ps_me_ctxt->i4_srch_range_e = MIN(columns_right, DEFAULT_MAX_SRCH_RANGE_X >> 1);
    ps_me_ctxt->i4_srch_range_n = -MIN(rows_above, DEFAULT_MAX_SRCH_RANGE_Y >> 1);
    ps_me_ctxt->i4_srch_range_s = MIN(rows_below, DEFAULT_MAX_SRCH_RANGE_Y >> 1);

    /* this is to facilitate fast sub pel computation with minimal loads:
     * the window is shrunk by one pel on every side when half pel is on */
    if (ps_me_ctxt->u4_enable_hpel)
    {
        ps_me_ctxt->i4_srch_range_w += 1;
        ps_me_ctxt->i4_srch_range_e -= 1;
        ps_me_ctxt->i4_srch_range_n += 1;
        ps_me_ctxt->i4_srch_range_s -= 1;
    }

    /* Initialize the min sad option */
    ps_me_ctxt->u4_min_sad_reached = 0; /* Not yet found min sad */
    ps_me_ctxt->i4_min_sad = ps_proc->ps_cur_mb->u4_min_sad;

    /************************************************************/
    /* Get the seed motion vector candidates                    */
    /************************************************************/
    ih264e_get_search_candidates(ps_proc, ps_me_ctxt);

    /************************************************************/
    /* Init the MB part ctxt structure                          */
    /************************************************************/
    ps_mb_part->s_mv_curr.i2_mvx = 0;
    ps_mb_part->s_mv_curr.i2_mvy = 0;
    ps_mb_part->i4_mb_cost = INT_MAX;
    ps_mb_part->i4_mb_distortion = INT_MAX;

    /* With NMB changes this logic will not work as we cannot exit NME in between*/
    /********************************************************************/
    /*                  Analyse skip                                    */
    /********************************************************************/
//    if (ps_proc->ps_codec->s_cfg.u4_enable_satqd == 0
//                    && u4_frame_level_me == 0)
//    {
//        if ( (ps_proc->ps_ngbr_avbl->u1_mb_a && (ps_me_ctxt->u4_left_is_skip == 1)) ||
//                        (ps_proc->ps_ngbr_avbl->u1_mb_b && ps_top_syn->u2_mb_type == PSKIP) ||
//                        (ps_proc->ps_ngbr_avbl->u1_mb_d && ps_top_left_syn->u2_mb_type == PSKIP) )
//        {
//            if ( 0 == ih264e_analyse_skip(ps_proc, ps_me_ctxt) )
//            {
//                return;
//            }
//        }
//    }

    /********************************************************************/
    /*                  compute skip cost                               */
    /********************************************************************/
    /* See if we need to use modified sad */
    u4_use_stat_sad = (ps_proc->ps_codec->s_cfg.u4_enable_satqd == 1);

    /* init the cost of skip MB */
    skip_mb_part_info.i4_mb_cost = INT_MAX;
    ime_compute_skip_cost(ps_me_ctxt, ps_proc->ps_skip_mv, &skip_mb_part_info, u4_use_stat_sad);


    /* NOTE(review): u4_min_sad_reached was cleared above, so it can only be
     * non-zero here if ime_compute_skip_cost() set it - confirm in ime */
    if (ps_me_ctxt->u4_min_sad_reached == 0)
    {
        /************************************************************/
        /* Evaluate search candidates for initial mv pt.            */
        /************************************************************/
        ime_evaluate_init_srchposn_16x16(ps_me_ctxt);

        /********************************************************************/
        /*                  full pel motion estimation                      */
        /********************************************************************/
        ime_full_pel_motion_estimation_16x16(ps_me_ctxt);

        DEBUG_MV_HISTOGRAM_ADD((ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvx >> 2),
                               (ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvy >> 2));

        DEBUG_SAD_HISTOGRAM_ADD(ps_me_ctxt->s_mb_part.i4_mb_distortion, 1);
        /********************************************************************/
        /*                   sub pel motion estimation                      */
        /********************************************************************/
        if (ps_me_ctxt->u4_enable_hpel)
        {
            /* motion vectors in terms of full pel values
             * (presumably the stored mv is in quarter pel units - confirm) */
            mv_x = ps_mb_part->s_mv_curr.i2_mvx >> 2;
            mv_y = ps_mb_part->s_mv_curr.i2_mvy >> 2;

            /* moving src pointer to the converged motion vector location*/
            pu1_hpel_src = ps_me_ctxt->pu1_ref_buf_luma + mv_x + (mv_y * i4_rec_strd);

            ps_me_ctxt->pu1_half_x = ps_proc->pu1_half_x;
            ps_me_ctxt->pu1_half_y = ps_proc->pu1_half_y;
            ps_me_ctxt->pu1_half_xy = ps_proc->pu1_half_xy;
            ps_me_ctxt->u4_hp_buf_strd = HP_BUFF_WD;

            /* half pel search is done for both sides of full pel,
             * hence half_x of width x height = 17x16 is created
             * starting from left half_x of converged full pel */
            pu1_hpel_src -= 1;

            /* computing half_x */
            ps_codec->pf_ih264e_sixtapfilter_horz(pu1_hpel_src,
                                                  ps_proc->pu1_half_x,
                                                  i4_rec_strd,
                                                  ps_me_ctxt->u4_hp_buf_strd);

            /*
             * Halfpel search is done for both sides of full pel,
             * hence half_y of width x height = 16x17 is created
             * starting from top half_y of converged full pel
             * for half_xy top_left is required
             * hence it starts from pu1_hpel_src = full_pel_converged_point - i4_rec_strd - 1
             */

            /* the -1 applied before the horizontal filter is still in effect */
            pu1_hpel_src -= i4_rec_strd;

            /* computing half_y , and half_xy*/
            ps_codec->pf_ih264e_sixtap_filter_2dvh_vert(
                            pu1_hpel_src, ps_proc->pu1_half_y,
                            ps_proc->pu1_half_xy, i4_rec_strd,
                            ps_me_ctxt->u4_hp_buf_strd, ps_proc->ai16_pred1 + 3,
                            ps_me_ctxt->u4_hp_buf_strd);

            ime_sub_pel_motion_estimation_16x16(ps_me_ctxt);
        }
    }

    {

        /* if skip gives a better cost than other search, copy the cost accordingly*/
        if (skip_mb_part_info.i4_mb_cost < ps_mb_part->i4_mb_cost)
        {
            ps_mb_part->i4_mb_cost = skip_mb_part_info.i4_mb_cost;
            ps_mb_part->i4_mb_distortion = skip_mb_part_info.i4_mb_distortion;
            ps_mb_part->s_mv_curr.i2_mvx = skip_mb_part_info.s_mv_curr.i2_mvx;
            ps_mb_part->s_mv_curr.i2_mvy = skip_mb_part_info.s_mv_curr.i2_mvy;
        }
        else
        {
            /*
             * If the current MB has a sub pel component,
             * we need to copy that to the best subpel buffer
             */
            /* NOTE(review): pu1_best_hpel_buf is not reset in this function;
             * if the search above was bypassed it holds whatever an earlier
             * call left there - confirm it is cleared elsewhere */
            if (ps_me_ctxt->u4_enable_hpel && ps_mb_part->pu1_best_hpel_buf)
            {
                ps_codec->pf_inter_pred_luma_copy(ps_mb_part->pu1_best_hpel_buf,
                                                  pu1_best_subpel_buf,
                                                  ps_me_ctxt->u4_hp_buf_strd,
                                                  u4_bst_spel_strd, MB_SIZE,
                                                  MB_SIZE, NULL, 0);
            }
        }
    }

    DEBUG_SAD_HISTOGRAM_ADD(ps_me_ctxt->s_mb_part.i4_mb_distortion, 0);

    /* update the type of the mb if necessary */
    if (ps_me_ctxt->s_mb_part.i4_mb_cost < ps_proc->ps_cur_mb->i4_mb_cost)
    {
        /* mb cost */
        ps_proc->ps_cur_mb->i4_mb_cost = ps_me_ctxt->s_mb_part.i4_mb_cost;

        /* mb distortion */
        ps_proc->ps_cur_mb->i4_mb_distortion = ps_me_ctxt->s_mb_part.i4_mb_distortion;

        /* mb type */
        ps_proc->ps_cur_mb->u4_mb_type = P16x16;
    }

    /* The PU below is written unconditionally, even when the cost test
     * above did not select P16x16 */

    /* number of partitions */
    ps_proc->u4_num_sub_partitions = 1;
    *(ps_proc->pu4_mb_pu_cnt) = 1;

    /* position in-terms of PU */
    ps_proc->ps_pu->b4_pos_x = 0;
    ps_proc->ps_pu->b4_pos_y = 0;

    /* PU size */
    ps_proc->ps_pu->b4_wd = 3;
    ps_proc->ps_pu->b4_ht = 3;

    /* ref idx */
    ps_proc->ps_pu->i1_l0_ref_idx = 0;

    /* motion vector L0 */
    ps_proc->ps_pu->s_l0_mv.i2_mvx = ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvx;
    ps_proc->ps_pu->s_l0_mv.i2_mvy = ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvy;

    /* Update min sad conditions */
    if (ps_me_ctxt->u4_min_sad_reached == 1)
    {
        ps_proc->ps_cur_mb->u4_min_sad_reached = 1;
        ps_proc->ps_cur_mb->u4_min_sad = ps_me_ctxt->i4_min_sad;
    }
}

/**
*******************************************************************************
*
* @brief This function performs motion estimation for the current NMB
*
* @par Description:
*  Initializes input and output pointers required by the function ih264e_compute_me
*  and calls the function ih264e_compute_me in a loop to process NMBs.
+* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_compute_me_nmb(process_ctxt_t *ps_proc, UWORD32 u4_nmb_count) +{ + /* pic pu */ + enc_pu_t *ps_pu_begin = ps_proc->ps_pu; + + /* ME map */ + UWORD8 *pu1_me_map = ps_proc->pu1_me_map + (ps_proc->i4_mb_y * ps_proc->i4_wd_mbs); + + /* temp var */ + UWORD32 u4_i; + + ps_proc->s_me_ctxt.u4_left_is_intra = ps_proc->s_left_mb_syntax_ele.u2_is_intra; + ps_proc->s_me_ctxt.u4_left_is_skip = (ps_proc->s_left_mb_syntax_ele.u2_mb_type == PSKIP); + + for (u4_i = 0; u4_i < u4_nmb_count; u4_i++) + { + /* Wait for ME map */ + if (ps_proc->i4_mb_y > 0) + { + /* Wait for top right ME to be done */ + UWORD8 *pu1_me_map_tp_rw = ps_proc->pu1_me_map + (ps_proc->i4_mb_y - 1) * ps_proc->i4_wd_mbs; + + while (1) + { + volatile UWORD8 *pu1_buf; + WORD32 idx = ps_proc->i4_mb_x + u4_i + 1; + + idx = MIN(idx, (ps_proc->i4_wd_mbs - 1)); + pu1_buf = pu1_me_map_tp_rw + idx; + if(*pu1_buf) + break; + ithread_yield(); + } + } + + ps_proc->ps_skip_mv = &(ps_proc->ps_nmb_info[u4_i].s_skip_mv); + ps_proc->ps_ngbr_avbl = &(ps_proc->ps_nmb_info[u4_i].s_ngbr_avbl); + ps_proc->ps_pred_mv = &(ps_proc->ps_nmb_info[u4_i].s_pred_mv); + + ps_proc->ps_cur_mb = &(ps_proc->ps_nmb_info[u4_i]); + + ps_proc->ps_cur_mb->u4_min_sad = ps_proc->u4_min_sad; + ps_proc->ps_cur_mb->u4_min_sad_reached = 0; + + ps_proc->ps_cur_mb->i4_mb_cost = INT_MAX; + ps_proc->ps_cur_mb->i4_mb_distortion = SHRT_MAX; + + /* Set the best subpel buf to the correct mb so that the buffer can be copied */ + ps_proc->pu1_best_subpel_buf = ps_proc->ps_nmb_info[u4_i].pu1_best_sub_pel_buf; + ps_proc->u4_bst_spel_buf_strd = ps_proc->ps_nmb_info[u4_i].u4_bst_spel_buf_strd; + + /* Set the min sad conditions */ + ps_proc->ps_cur_mb->u4_min_sad = ps_proc->ps_codec->u4_min_sad; + ps_proc->ps_cur_mb->u4_min_sad_reached = 0; + + /* Derive neighbor 
availability for the current macroblock */ + ih264e_derive_nghbr_avbl_of_mbs(ps_proc); + + /* init me */ + ih264e_init_me(ps_proc); + + ih264e_compute_me(ps_proc); + + /* update top and left structs */ + { + mb_info_t *ps_top_syn = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x; + mb_info_t *ps_top_left_syn = &(ps_proc->s_top_left_mb_syntax_ME); + enc_pu_t *ps_left_mb_pu = &ps_proc->s_left_mb_pu_ME; + enc_pu_t *ps_top_left_mb_pu = &ps_proc->s_top_left_mb_pu_ME; + enc_pu_t *ps_top_mv = ps_proc->ps_top_row_pu_ME + ps_proc->i4_mb_x; + + *ps_top_left_syn = *ps_top_syn; + + *ps_top_left_mb_pu = *ps_top_mv; + *ps_left_mb_pu = *ps_proc->ps_pu; + } + + ps_proc->ps_pu += *ps_proc->pu4_mb_pu_cnt; + + /* Copy the min sad reached info */ + ps_proc->ps_nmb_info[u4_i].u4_min_sad_reached = ps_proc->ps_cur_mb->u4_min_sad_reached; + ps_proc->ps_nmb_info[u4_i].u4_min_sad = ps_proc->ps_cur_mb->u4_min_sad; + + /* + * To make sure that the MV map is properly sync to the + * cache we need to do a DDB + */ + { + DATA_SYNC(); + + pu1_me_map[ps_proc->i4_mb_x] = 1; + } + ps_proc->i4_mb_x++; + + ps_proc->s_me_ctxt.u4_left_is_intra = 0; + ps_proc->s_me_ctxt.u4_left_is_skip = (ps_proc->ps_cur_mb->u4_mb_type == PSKIP); + + /* update buffers pointers */ + ps_proc->pu1_src_buf_luma += MB_SIZE; + ps_proc->pu1_rec_buf_luma += MB_SIZE; + ps_proc->pu1_ref_buf_luma += MB_SIZE; + + /* + * Note: Although chroma mb size is 8, as the chroma buffers are interleaved, + * the stride per MB is MB_SIZE + */ + ps_proc->pu1_src_buf_chroma += MB_SIZE; + ps_proc->pu1_rec_buf_chroma += MB_SIZE; + ps_proc->pu1_ref_buf_chroma += MB_SIZE; + + ps_proc->pu4_mb_pu_cnt += 1; + } + + + ps_proc->ps_pu = ps_pu_begin; + ps_proc->i4_mb_x = ps_proc->i4_mb_x - u4_nmb_count; + + /* update buffers pointers */ + ps_proc->pu1_src_buf_luma -= MB_SIZE * u4_nmb_count; + ps_proc->pu1_rec_buf_luma -= MB_SIZE * u4_nmb_count; + ps_proc->pu1_ref_buf_luma -= MB_SIZE * u4_nmb_count; + + /* + * Note: Although chroma mb size is 8, as the 
chroma buffers are interleaved, + * the stride per MB is MB_SIZE + */ + ps_proc->pu1_src_buf_chroma -= MB_SIZE * u4_nmb_count; + ps_proc->pu1_rec_buf_chroma -= MB_SIZE * u4_nmb_count; + ps_proc->pu1_ref_buf_chroma -= MB_SIZE * u4_nmb_count; + + ps_proc->pu4_mb_pu_cnt -= u4_nmb_count; +} diff --git a/encoder/ih264e_me.h b/encoder/ih264e_me.h new file mode 100755 index 0000000..c4834a1 --- /dev/null +++ b/encoder/ih264e_me.h @@ -0,0 +1,278 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * ih264e_me.h + * + * @brief + * Contains declarations of global variables for H264 encoder + * + * @author + * ittiam + * + * @remarks + * + ******************************************************************************* + */ + +#ifndef IH264E_ME_H_ +#define IH264E_ME_H_ + +/*****************************************************************************/ +/* Function Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief compute median of 3 elements (a, b, c) and store the output + * in to result. This is used for mv prediction +****************************************************************************** + */ + +#define MEDIAN(a, b, c, result) if (a > b){\ + if (b > c)\ + result = b;\ + else {\ + if (a > c)\ + result = c;\ + else \ + result = a;\ + }\ + }\ + else {\ + if (c > b)\ + result = b;\ + else {\ + if (c > a)\ + result = c;\ + else \ + result = a;\ + }\ + } + + + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* This function populates the length of the codewords for motion vectors in the +* range (-search range, search range) in pixels +* +* @param[in] ps_me +* Pointer to me ctxt +* +* @param[out] pu1_mv_bits +* length of the codeword for all mv's +* +* @remarks The length of the code words are derived from signed exponential +* goloumb codes. 
+* +******************************************************************************* +*/ +void ih264e_init_mv_bits + ( + me_ctxt_t *ps_me + ); + +/** +******************************************************************************* +* +* @brief The function gives the skip motion vector +* +* @par Description: +* The function gives the skip motion vector +* +* @param[in] ps_left_mb_pu +* pointer to left mb motion vector info +* +* @param[in] ps_top_row_pu +* pointer to top & top right mb motion vector info +* +* @param[out] ps_pred_mv +* pointer to candidate predictors for the current block +* +* @returns The x & y components of the MV predictor. +* +* @remarks The code implements the logic as described in sec 8.4.1.1 in H264 +* specification. +* +******************************************************************************* +*/ +void ih264e_find_skip_motion_vector + ( + process_ctxt_t *ps_proc, + UWORD32 u4_for_me + ); + +/** +******************************************************************************* +* +* @brief motion vector predictor +* +* @par Description: +* The routine calculates the motion vector predictor for a given block, +* given the candidate MV predictors. +* +* @param[in] ps_left_mb_pu +* pointer to left mb motion vector info +* +* @param[in] ps_top_row_pu +* pointer to top & top right mb motion vector info +* +* @param[out] ps_pred_mv +* pointer to candidate predictors for the current block +* +* @returns The x & y components of the MV predictor. +* +* @remarks The code implements the logic as described in sec 8.4.1.3 in H264 +* specification. +* Assumptions : 1. Assumes Single reference frame +* 2. 
Assumes Only partition of size 16x16 +* +******************************************************************************* +*/ +void ih264e_get_mv_predictor + ( + enc_pu_t *ps_left_mb_pu, + enc_pu_t *ps_top_row_pu, + mv_t *ps_pred_mv + ); + +/** +******************************************************************************* +* +* @brief This function computes the best motion vector for the current mb +* +* @par Description: +* This function currently does nothing except set motion vectors from external +* source +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_compute_me + ( + process_ctxt_t *ps_proc + ); + +/** +******************************************************************************* +* +* @brief This function initializes me ctxt +* +* @par Description: +* Before dispatching the current job to me thread, the me context associated +* with the job is initialized. +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_me(process_ctxt_t *ps_proc); + +/** +******************************************************************************* +* +* @brief This function performs motion estimation for the current NMB +* +* @par Description: +* Intializes input and output pointers required by the function ih264e_compute_me +* and calls the function ih264e_compute_me in a loop to process NMBs. 
+* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_compute_me_nmb + ( + process_ctxt_t *ps_proc, + UWORD32 u4_nmb_count + ); + +/** +******************************************************************************* +* +* @brief This function performs MV prediction +* +* @par Description: +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns none +* +* @remarks none +* This function will update the MB availability since intra inter decision +* should be done before the call +* +******************************************************************************* +*/ +void ih264e_mv_pred + ( + process_ctxt_t *ps_proc + ); + +/** +******************************************************************************* +* +* @brief This function approximates Pred. MV +* +* @par Description: +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns none +* +* @remarks none +* Motion estimation happens at nmb level. For cost calculations, mv is appro +* ximated using this function +* +******************************************************************************* +*/ +void ih264e_mv_pred_me + ( + process_ctxt_t *ps_proc + ); + +#endif /* IH264E_ME_H_ */ diff --git a/encoder/ih264e_modify_frm_rate.c b/encoder/ih264e_modify_frm_rate.c new file mode 100755 index 0000000..bc0e873 --- /dev/null +++ b/encoder/ih264e_modify_frm_rate.c @@ -0,0 +1,240 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_modify_frm_rate.c +* +* @brief +* Functions used to modify frame rate +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_pd_frm_rate_get_init_free_memtab() +* - ih264e_init_pd_frm_rate() +* - ih264e_update_pd_frm_rate() +* - ih264e_get_pd_avg_frm_rate() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "irc_datatypes.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ih264e_defs.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_rc_mem_interface.h" +#include "ih264e_time_stamp.h" +#include 
"ih264e_modify_frm_rate.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief Function to init pd frame rate memtab +* +* @par Description +* Function to init pull down frame rate memtab +* +* @param[in] pps_pd_frm_rate +* pull down frame rate context +* +* @param[in] ps_memtab +* Handle to memtab +* +* @param[in] e_func_type +* Function type (get memtab/ update memtab) +* +* @returns Number of memtabs used +* +* @remarks None +* +******************************************************************************* +*/ +WORD32 ih264e_pd_frm_rate_get_init_free_memtab(pd_frm_rate_handle *pps_pd_frm_rate, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type) +{ + WORD32 i4_mem_tab_idx = 0; + static pd_frm_rate_t s_temp_pd_frm_rate_t; + + /* Hack for al alloc, during which we dont have any state memory. 
+ Dereferencing can cause issues */ + if (e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB) + (*pps_pd_frm_rate) = &s_temp_pd_frm_rate_t; + + /* for src rate control state structure */ + if (e_func_type != GET_NUM_MEMTAB) + { + fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(pd_frm_rate_t), + ALIGN_128_BYTE, PERSISTENT, DDR); + use_or_fill_base(&ps_memtab[0], (void**) pps_pd_frm_rate, e_func_type); + } + i4_mem_tab_idx++; + + return (i4_mem_tab_idx); +} + +/** +******************************************************************************* +* +* @brief Initializes the pull down frame rate state structure based on input +* frame rate +* +* @par Description +* Initializes the pull down frame rate state structure based on input frame rate +* +* @param[in] ps_pd_frm_rate +* Pull down frame rate context +* +* @param[in] u4_input_frm_rate +* Input frame rate in frame per 1000sec +* +* @returns none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_init_pd_frm_rate(pd_frm_rate_t *ps_pd_frm_rate, + UWORD32 u4_input_frm_rate) +{ + WORD32 i; + + ps_pd_frm_rate->u4_input_frm_rate = u4_input_frm_rate; + + for (i = 0; i < (WORD32) (u4_input_frm_rate / 1000); i++) + { + ps_pd_frm_rate->u4_cur_frm_rate[i] = u4_input_frm_rate; + } + + ps_pd_frm_rate->u4_frm_num = 0; + + ps_pd_frm_rate->u4_tot_frm_encoded = 0; +} + +/** +******************************************************************************* +* +* @brief Function to update pull down frame rate +* +* @par Description +* For each frame a run time frame rate value is sent based on whether a frame +* is skipped or not. If it is skipped for pull down then the current frame +* rate for the pull down period is signaled as 4/5th of the original frame +* rate. 
Thus when this is averaged the frame rate gradually switches from the +* input frame rate to 4/5th of input frame rate as and when more 3:2 pull +* down patterns are detected +* +* @param[in] ps_pd_frm_rate +* Pull down frame rate context +* +* @param[in] u4_input_frm_rate +* Input frame rate in frame per 1000sec +* +* @returns none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_update_pd_frm_rate(pd_frm_rate_t *ps_pd_frm_rate, + UWORD32 u4_cur_frm_rate) +{ + ps_pd_frm_rate->u4_cur_frm_rate[ps_pd_frm_rate->u4_frm_num] = u4_cur_frm_rate; + + ps_pd_frm_rate->u4_frm_num++; + + /* Increment the frame number */ + if (ps_pd_frm_rate->u4_tot_frm_encoded < (ps_pd_frm_rate->u4_input_frm_rate / 1000)) + { + ps_pd_frm_rate->u4_tot_frm_encoded++; + } + + /* Reset frm_num to zero */ + if (ps_pd_frm_rate->u4_frm_num >= (ps_pd_frm_rate->u4_input_frm_rate / 1000)) + { + ps_pd_frm_rate->u4_frm_num = 0; + } +} + +/** +******************************************************************************* +* +* @brief returns average frame rate in 1 sec duration +* +* @par Description +* Averages the last N frame in period(1 sec) and then gives that +* as the current frames frame rate. 
Thus this averages out the sudden +* variation in frame rate +* +* @param[in] ps_pd_frm_rate +* Handle to pull down frame rate context +* +* @returns average frame rate +* +* @remarks +* +******************************************************************************* +*/ +UWORD32 ih264e_get_pd_avg_frm_rate(pd_frm_rate_t *ps_pd_frm_rate) +{ + WORD32 i; + WORD32 i4_avg_frm_rate = 0; + + for (i = 0; i < (WORD32) ps_pd_frm_rate->u4_tot_frm_encoded; i++) + { + i4_avg_frm_rate += ps_pd_frm_rate->u4_cur_frm_rate[i]; + } + + i4_avg_frm_rate = i4_avg_frm_rate / ps_pd_frm_rate->u4_tot_frm_encoded; + + return i4_avg_frm_rate; +} diff --git a/encoder/ih264e_modify_frm_rate.h b/encoder/ih264e_modify_frm_rate.h new file mode 100755 index 0000000..c301e2c --- /dev/null +++ b/encoder/ih264e_modify_frm_rate.h @@ -0,0 +1,182 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+*  ih264e_modify_frm_rate.h
+*
+* @brief
+*  Context structure and function declarations used to modify (pull down)
+*  the frame rate
+*
+* @author
+*  ittiam
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef IH264E_MODIFY_FRM_RATE_H_
+#define IH264E_MODIFY_FRM_RATE_H_
+
+/*****************************************************************************/
+/* Constant Definitions                                                      */
+/*****************************************************************************/
+
+/* Number of entries in the per frame history (pd_frm_rate_t.u4_cur_frm_rate) */
+#define MAX_NUM_FRAME 120
+
+
+/*****************************************************************************/
+/* Structures                                                                */
+/*****************************************************************************/
+/* Pull down frame rate context: a history of per frame rates that is        */
+/* averaged by ih264e_get_pd_avg_frm_rate()                                  */
+typedef struct pd_frm_rate_t
+{
+    /*
+     * The input frame rate set in the encoder (in frames per 1000 sec)
+     */
+    UWORD32 u4_input_frm_rate;
+
+    /*
+     * Frame rate of each recorded frame due to pull down
+     * (history buffer with MAX_NUM_FRAME entries)
+     */
+    UWORD32 u4_cur_frm_rate[MAX_NUM_FRAME];
+
+    /*
+     * current frame num in the above buffer
+     */
+    UWORD32 u4_frm_num;
+
+    /*
+     * Total number of frames encoded.
+     * if greater than input frame rate stays at input frame rate
+     */
+    UWORD32 u4_tot_frm_encoded;
+
+}pd_frm_rate_t;
+
+/* Handle type through which the functions below receive the context */
+typedef struct pd_frm_rate_t *pd_frm_rate_handle;
+
+
+/*****************************************************************************/
+/* Function Declarations                                                     */
+/*****************************************************************************/
+
+/**
+*******************************************************************************
+*
+* @brief Function to init pd frame rate memtab
+*
+* @par Description
+*  Function to init pull down frame rate memtab
+*
+* @param[in] pps_pd_frm_rate
+*  pointer to the pull down frame rate context handle
+*
+* @param[in] ps_memtab
+*  Handle to memtab
+*
+* @param[in] e_func_type
+*  Function type (get memtab/ update memtab)
+*
+* @returns Number of memtabs used
+*
+* @remarks None
+*
+*******************************************************************************
+*/
+WORD32 ih264e_pd_frm_rate_get_init_free_memtab(pd_frm_rate_handle *pps_pd_frm_rate,
+                                               itt_memtab_t *ps_memtab,
+                                               ITT_FUNC_TYPE_E e_func_type);
+/**
+*******************************************************************************
+*
+* @brief Initializes the pull down frame rate state structure based on input
+* frame rate
+*
+* @par Description
+*  Initializes the pull down frame rate state structure based on input frame rate
+*
+* @param[in] ps_pd_frm_rate
+*  Pull down frame rate context
+*
+* @param[in] u4_input_frm_rate
+*  Input frame rate in frames per 1000 sec
+*
+* @returns none
+*
+* @remarks
+*
+*******************************************************************************
+*/
+void ih264e_init_pd_frm_rate(pd_frm_rate_handle ps_pd_frm_rate,
+                             UWORD32 u4_input_frm_rate);
+
+/**
+*******************************************************************************
+*
+* @brief Function to update pull down frame rate
+*
+* @par Description
+*  For each frame a run time frame rate value is sent based on whether a frame
+*  is skipped or not. If it is skipped for pull down then the current frame
+*  rate for the pull down period is signaled as 4/5th of the original frame
+*  rate. Thus when this is averaged the frame rate gradually switches from the
+*  input frame rate to 4/5th of input frame rate as and when more 3:2 pull
+*  down patterns are detected
+*
+* @param[in] ps_pd_frm_rate
+*  Pull down frame rate context
+*
+* @param[in] u4_cur_frm_rate
+*  Run time frame rate of the current frame (presumably in the same frames
+*  per 1000 sec unit as the input frame rate - TODO confirm)
+*
+* @returns none
+*
+* @remarks
+*
+*******************************************************************************
+*/
+void ih264e_update_pd_frm_rate(pd_frm_rate_handle ps_pd_frm_rate,
+                               UWORD32 u4_cur_frm_rate);
+
+/**
+*******************************************************************************
+*
+* @brief returns average frame rate in 1 sec duration
+*
+* @par Description
+*  Averages the last N frames in the period (1 sec) and then gives that
+*  as the current frame's frame rate. Thus this averages out the sudden
+*  variation in frame rate
+*
+* @param[in] ps_pd_frm_rate
+*  Handle to pull down frame rate context
+*
+* @returns average frame rate
+*
+* @remarks
+*
+*******************************************************************************
+*/
+UWORD32 ih264e_get_pd_avg_frm_rate(pd_frm_rate_handle ps_pd_frm_rate);
+
+#endif /* IH264E_MODIFY_FRM_RATE_H_ */
diff --git a/encoder/ih264e_process.c b/encoder/ih264e_process.c
new file mode 100755
index 0000000..9a468e9
--- /dev/null
+++ b/encoder/ih264e_process.c
@@ -0,0 +1,2369 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_process.c +* +* @brief +* Contains functions for codec thread +* +* @author +* Harish +* +* @par List of Functions: +* - ih264e_generate_sps_pps() +* - ih264e_init_entropy_ctxt() +* - ih264e_entropy() +* - ih264e_pack_header_data() +* - ih264e_update_proc_ctxt() +* - ih264e_init_proc_ctxt() +* - ih264e_pad_recon_buffer() +* - ih264e_dblk_pad_hpel_processing_n_mbs() +* - ih264e_process() +* - ih264e_set_rc_pic_params() +* - ih264e_update_rc_post_enc() +* - ih264e_process_thread() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <limits.h> +#include <assert.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_debug.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include 
"ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_platform_macros.h" +#include "ih264_macros.h" +#include "ih264_error.h" +#include "ih264_buf_mgr.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ih264_structs.h" +#include "ih264_common_tables.h" +#include "ih264_list.h" +#include "ih264e_defs.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_process.h" +#include "ithread.h" +#include "ih264e_intra_modes_eval.h" +#include "ih264e_encode_header.h" +#include "ih264e_globals.h" +#include "ih264e_config.h" +#include "ih264e_trace.h" +#include "ih264e_statistics.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264e_deblk.h" +#include "ih264e_me.h" +#include "ih264e_debug.h" +#include "ih264e_process.h" +#include "ih264e_master.h" +#include "ih264e_utils.h" +#include "irc_mem_req_and_acq.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "irc_rate_control_api.h" +#include "ih264e_platform_macros.h" +#include "ih264_padding.h" +#include "ime_statistics.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief This function generates sps, pps set on request +* +* @par Description +* When the encoder is set in header generation mode, the following function +* is called. This generates sps and pps headers and returns the control back +* to caller. 
+* +* @param[in] ps_codec +* pointer to codec context +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_generate_sps_pps(codec_t *ps_codec) +{ + /* choose between ping-pong process buffer set */ + WORD32 ctxt_sel = ps_codec->i4_encode_api_call_cnt & 1; + + /* entropy ctxt */ + entropy_ctxt_t *ps_entropy = &ps_codec->as_process[ctxt_sel * MAX_PROCESS_THREADS].s_entropy; + + /* Bitstream structure */ + bitstrm_t *ps_bitstrm = ps_entropy->ps_bitstrm; + + /* sps */ + sps_t *ps_sps = NULL; + + /* pps */ + pps_t *ps_pps = NULL; + + /* output buff */ + out_buf_t *ps_out_buf = &ps_codec->as_out_buf[ctxt_sel]; + + + /********************************************************************/ + /* initialize the bit stream buffer */ + /********************************************************************/ + ih264e_bitstrm_init(ps_bitstrm, ps_out_buf->s_bits_buf.pv_buf, ps_out_buf->s_bits_buf.u4_bufsize); + + /********************************************************************/ + /* BEGIN HEADER GENERATION */ + /********************************************************************/ + /*ps_codec->i4_pps_id ++;*/ + ps_codec->i4_pps_id %= MAX_PPS_CNT; + + /*ps_codec->i4_sps_id ++;*/ + ps_codec->i4_sps_id %= MAX_SPS_CNT; + + /* populate sps header */ + ps_sps = ps_codec->ps_sps_base + ps_codec->i4_sps_id; + ih264e_populate_sps(ps_codec, ps_sps); + + /* populate pps header */ + ps_pps = ps_codec->ps_pps_base + ps_codec->i4_pps_id; + ih264e_populate_pps(ps_codec, ps_pps); + + ps_entropy->i4_error_code = IH264E_SUCCESS; + + /* generate sps */ + ps_entropy->i4_error_code |= ih264e_generate_sps(ps_bitstrm, ps_sps); + + /* generate pps */ + ps_entropy->i4_error_code |= ih264e_generate_pps(ps_bitstrm, ps_pps, ps_sps); + + /* queue output buffer */ + ps_out_buf->s_bits_buf.u4_bytes = ps_bitstrm->u4_strm_buf_offset; + + return ps_entropy->i4_error_code; +} + +/** 
+******************************************************************************* +* +* @brief initialize entropy context. +* +* @par Description: +* Before invoking the call to perform to entropy coding the entropy context +* associated with the job needs to be initialized. This involves the start +* mb address, end mb address, slice index and the pointer to location at +* which the mb residue info and mb header info are packed. +* +* @param[in] ps_proc +* Pointer to the current process context +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_init_entropy_ctxt(process_ctxt_t *ps_proc) +{ + /* codec context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* entropy ctxt */ + entropy_ctxt_t *ps_entropy = &ps_proc->s_entropy; + + /* start address */ + ps_entropy->i4_mb_start_add = ps_entropy->i4_mb_y * ps_entropy->i4_wd_mbs + ps_entropy->i4_mb_x; + + /* end address */ + ps_entropy->i4_mb_end_add = ps_entropy->i4_mb_start_add + ps_entropy->i4_mb_cnt; + + /* slice index */ + ps_entropy->i4_cur_slice_idx = ps_proc->pu1_slice_idx[ps_entropy->i4_mb_start_add]; + + /* sof */ + /* @ start of frame or start of a new slice, set sof flag */ + if (ps_entropy->i4_mb_start_add == 0) + { + ps_entropy->i4_sof = 1; + } + + if (ps_entropy->i4_mb_x == 0) + { + /* packed mb coeff data */ + ps_entropy->pv_mb_coeff_data = ((UWORD8 *)ps_entropy->pv_pic_mb_coeff_data) + + ps_entropy->i4_mb_y * ps_codec->u4_size_coeff_data; + + /* packed mb header data */ + ps_entropy->pv_mb_header_data = ((UWORD8 *)ps_entropy->pv_pic_mb_header_data) + + ps_entropy->i4_mb_y * ps_codec->u4_size_header_data; + } + + return IH264E_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief entry point for entropy coding +* +* @par Description +* This function calls lower level functions to perform entropy coding for a +* group (n rows) of mb's. 
After encoding 1 row of mb's, the function takes
+* back the control, updates the ctxt and calls lower level functions again.
+* This process is repeated till all the rows or group of mb's (which ever is
+* minimum) are coded
+*
+* @param[in] ps_proc
+* process context
+*
+* @returns error status
+*
+* @remarks
+*
+*******************************************************************************
+*/
+/*
+ * Number of bits accounted for in a bitstream: the byte offset scaled to bits
+ * plus the part of the current code word already used (WORD_SIZE minus the
+ * bits left in it). The macro is not referenced by ih264e_entropy() below.
+ */
+#define GET_NUM_BITS(ps_bitstream) ((ps_bitstream->u4_strm_buf_offset << 3) + WORD_SIZE - ps_bitstream->i4_bits_left_in_cw)
+
+/*
+ * Summary of ih264e_entropy():
+ *  - at start of frame (i4_sof) the bitstream is attached to the output
+ *    buffer, sps/pps are emitted when i4_gen_header is 1 and the first slice
+ *    header is written
+ *  - every mb in [i4_mb_start_add, i4_mb_end_add) is written through
+ *    pf_write_mb_syntax_layer once its proc map entry is non zero
+ *  - at the end of a row a new slice is opened if the slice map changes
+ *  - at end of frame the pending skip run and trailing bits are written, rate
+ *    control is updated and the produced byte count is published
+ * The returned value is ps_entropy->i4_error_code, into which the status of
+ * the lower level writers is or'ed.
+ */
+IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc)
+{
+    /* codec context */
+    codec_t *ps_codec = ps_proc->ps_codec;
+
+    /* entropy context */
+    entropy_ctxt_t *ps_entropy = &ps_proc->s_entropy;
+
+    /* sps */
+    sps_t *ps_sps = ps_entropy->ps_sps_base + (ps_entropy->u4_sps_id % MAX_SPS_CNT);
+
+    /* pps */
+    pps_t *ps_pps = ps_entropy->ps_pps_base + (ps_entropy->u4_pps_id % MAX_PPS_CNT);
+
+    /* slice header */
+    slice_header_t *ps_slice_hdr = ps_entropy->ps_slice_hdr_base + (ps_entropy->i4_cur_slice_idx % MAX_SLICE_HDR_CNT);
+
+    /* slice type */
+    WORD32 i4_slice_type = ps_proc->i4_slice_type;
+
+    /* Bitstream structure */
+    bitstrm_t *ps_bitstrm = ps_entropy->ps_bitstrm;
+
+    /* output buff (local copy, only filled in at start of frame) */
+    out_buf_t s_out_buf;
+
+    /* proc map */
+    UWORD8 *pu1_proc_map;
+
+    /* entropy map */
+    UWORD8 *pu1_entropy_map_curr;
+
+    /* proc base idx (ping-pong selector: parity of the encode api call cnt) */
+    WORD32 ctxt_sel = ps_proc->i4_encode_api_call_cnt & 1;
+
+    /* temp var */
+    WORD32 i4_wd_mbs, i4_ht_mbs;
+    UWORD32 u4_mb_cnt, u4_mb_idx, u4_mb_end_idx;
+
+    /********************************************************************/
+    /*                            BEGIN INIT                            */
+    /********************************************************************/
+
+    /* entropy encode start address */
+    u4_mb_idx = ps_entropy->i4_mb_start_add;
+
+    /* entropy encode end address */
+    u4_mb_end_idx = ps_entropy->i4_mb_end_add;
+
+    /* width in mbs */
+    i4_wd_mbs = ps_entropy->i4_wd_mbs;
+
+    /* height in mbs */
+    i4_ht_mbs = ps_entropy->i4_ht_mbs;
+
+    /* total mb cnt */
+    u4_mb_cnt = i4_wd_mbs * i4_ht_mbs;
+
+    /* proc map (row of the first mb of the job) */
+    pu1_proc_map = ps_proc->pu1_proc_map + ps_entropy->i4_mb_y * i4_wd_mbs;
+
+    /* entropy map (row of the first mb of the job) */
+    pu1_entropy_map_curr = ps_entropy->pu1_entropy_map + ps_entropy->i4_mb_y * i4_wd_mbs;
+
+    /********************************************************************/
+    /* @ start of frame / slice,                                        */
+    /*      initialize the output buffer,                               */
+    /*      initialize the bit stream buffer,                           */
+    /*      check if sps and pps headers have to be generated,          */
+    /*      populate and generate slice header                          */
+    /********************************************************************/
+    if (ps_entropy->i4_sof)
+    {
+        /********************************************************************/
+        /*                  initialize the output buffer                    */
+        /********************************************************************/
+        s_out_buf = ps_codec->as_out_buf[ctxt_sel];
+
+        /* NOTE(review): the three stores below only modify the local copy; */
+        /* nothing in this function copies them back to                     */
+        /* ps_codec->as_out_buf - confirm that this is intended             */
+
+        /* is last frame to encode */
+        s_out_buf.u4_is_last = ps_entropy->u4_is_last;
+
+        /* frame idx */
+        s_out_buf.u4_timestamp_high = ps_entropy->u4_timestamp_high;
+        s_out_buf.u4_timestamp_low = ps_entropy->u4_timestamp_low;
+
+        /********************************************************************/
+        /*               initialize the bit stream buffer                   */
+        /********************************************************************/
+        ih264e_bitstrm_init(ps_bitstrm, s_out_buf.s_bits_buf.pv_buf, s_out_buf.s_bits_buf.u4_bufsize);
+
+        /********************************************************************/
+        /*                    BEGIN HEADER GENERATION                       */
+        /********************************************************************/
+        if (1 == ps_entropy->i4_gen_header)
+        {
+            /* generate sps */
+            ps_entropy->i4_error_code |= ih264e_generate_sps(ps_bitstrm, ps_sps);
+
+            /* generate pps */
+            ps_entropy->i4_error_code |= ih264e_generate_pps(ps_bitstrm, ps_pps, ps_sps);
+
+            /* reset i4_gen_header */
+            ps_entropy->i4_gen_header = 0;
+        }
+
+        /* populate slice header */
+        ih264e_populate_slice_header(ps_proc, ps_slice_hdr, ps_pps, ps_sps);
+
+        /* generate slice header */
+        ps_entropy->i4_error_code |= ih264e_generate_slice_header(ps_bitstrm, ps_slice_hdr,
+                                                                  ps_pps, ps_sps);
+
+        /* once start of frame / slice is done, you can reset it */
+        /* it is the responsibility of the caller to set this flag */
+        ps_entropy->i4_sof = 0;
+    }
+
+    /* begin entropy coding for the mb set */
+    while (u4_mb_idx < u4_mb_end_idx)
+    {
+        /* init ptrs/indices: the previous mb was the last of its row, so */
+        /* move every per row pointer to the next row                     */
+        if (ps_entropy->i4_mb_x == i4_wd_mbs)
+        {
+            ps_entropy->i4_mb_y ++;
+            ps_entropy->i4_mb_x = 0;
+
+            /* packed mb coeff data */
+            ps_entropy->pv_mb_coeff_data = ((UWORD8 *)ps_entropy->pv_pic_mb_coeff_data)
+                            + ps_entropy->i4_mb_y * ps_codec->u4_size_coeff_data;
+
+            /* packed mb header data */
+            ps_entropy->pv_mb_header_data = ((UWORD8 *)ps_entropy->pv_pic_mb_header_data)
+                            + ps_entropy->i4_mb_y * ps_codec->u4_size_header_data;
+
+            /* proc map */
+            pu1_proc_map = ps_proc->pu1_proc_map + ps_entropy->i4_mb_y * i4_wd_mbs;
+
+            /* entropy map */
+            pu1_entropy_map_curr = ps_entropy->pu1_entropy_map + ps_entropy->i4_mb_y * i4_wd_mbs;
+        }
+
+        DEBUG("\nmb indices x, y %d, %d", ps_entropy->i4_mb_x, ps_entropy->i4_mb_y);
+        ENTROPY_TRACE("mb index x %d", ps_entropy->i4_mb_x);
+        ENTROPY_TRACE("mb index y %d", ps_entropy->i4_mb_y);
+
+        /* wait until the curr mb is core coded */
+        /* The wait for curr mb to be core coded is essential when entropy is launched
+         * as a separate job
+         */
+        /* The proc map entry is read through a volatile pointer so that it */
+        /* is re-read on every pass; the thread yields between two polls    */
+        while (1)
+        {
+            volatile UWORD8 *pu1_buf1;
+            WORD32 idx = ps_entropy->i4_mb_x;
+
+            pu1_buf1 = pu1_proc_map + idx;
+            if(*pu1_buf1)
+                break;
+            ithread_yield();
+        }
+
+        /* write mb layer (writer selected by slice type) */
+        ps_codec->pf_write_mb_syntax_layer[i4_slice_type](ps_entropy);
+
+        /* set entropy map */
+        pu1_entropy_map_curr[ps_entropy->i4_mb_x] = 1;
+
+        u4_mb_idx ++;
+        ps_entropy->i4_mb_x ++;
+
+        /* end of the current row reached */
+        if (ps_entropy->i4_mb_x == i4_wd_mbs)
+        {
+            /* if slices are enabled */
+            if (ps_codec->s_cfg.e_slice_mode == IVE_SLICE_MODE_BLOCKS)
+            {
+                /* current slice index */
+                WORD32 i4_curr_slice_idx = ps_entropy->i4_cur_slice_idx;
+
+                /* slice map */
+                UWORD8 *pu1_slice_idx = ps_entropy->pu1_slice_idx;
+
+                /* No need to open a slice at end of frame. The current slice can be closed at the time
+                 * of signaling eof flag.
+                 */
+                if ( (u4_mb_idx != u4_mb_cnt) && (i4_curr_slice_idx != pu1_slice_idx[u4_mb_idx]))
+                {
+                    /* mb skip run (the inner test repeats the second half */
+                    /* of the outer condition)                             */
+                    if ((i4_slice_type != ISLICE) && *ps_entropy->pi4_mb_skip_run)
+                    {
+                        if (*ps_entropy->pi4_mb_skip_run)
+                        {
+                            PUT_BITS_UEV(ps_bitstrm, *ps_entropy->pi4_mb_skip_run, ps_entropy->i4_error_code, "mb skip run");
+                            *ps_entropy->pi4_mb_skip_run = 0;
+                        }
+                    }
+
+                    /* put rbsp trailing bits for the previous slice */
+                    ps_entropy->i4_error_code |= ih264e_put_rbsp_trailing_bits(ps_bitstrm);
+
+                    /* update slice header pointer */
+                    i4_curr_slice_idx = pu1_slice_idx[u4_mb_idx];
+                    ps_entropy->i4_cur_slice_idx = i4_curr_slice_idx;
+                    ps_slice_hdr = ps_entropy->ps_slice_hdr_base + (i4_curr_slice_idx % MAX_SLICE_HDR_CNT);
+
+                    /* populate slice header */
+                    ps_entropy->i4_mb_start_add = u4_mb_idx;
+                    ih264e_populate_slice_header(ps_proc, ps_slice_hdr, ps_pps, ps_sps);
+
+                    /* generate slice header */
+                    ps_entropy->i4_error_code |= ih264e_generate_slice_header(ps_bitstrm, ps_slice_hdr,
+                                                                              ps_pps, ps_sps);
+                }
+            }
+
+            /* Don't execute any further instructions until store synchronization took place */
+            DATA_SYNC();
+        }
+    }
+
+    /* check for eof */
+    if (u4_mb_idx == u4_mb_cnt)
+    {
+        /* set end of frame flag */
+        ps_entropy->i4_eof = 1;
+    }
+
+    if (ps_entropy->i4_eof)
+    {
+        /* mb skip run (the inner test repeats the second half of the */
+        /* outer condition)                                           */
+        if ((i4_slice_type != ISLICE) && *ps_entropy->pi4_mb_skip_run)
+        {
+            if (*ps_entropy->pi4_mb_skip_run)
+            {
+                PUT_BITS_UEV(ps_bitstrm, *ps_entropy->pi4_mb_skip_run, ps_entropy->i4_error_code, "mb skip run");
+                *ps_entropy->pi4_mb_skip_run = 0;
+            }
+        }
+
+        /* put rbsp trailing bits */
+        ps_entropy->i4_error_code |= ih264e_put_rbsp_trailing_bits(ps_bitstrm);
+
+        /* update current frame stats to rc library */
+        if (IVE_RC_NONE != ps_codec->s_cfg.e_rc_mode)
+        {
+            /* number of bytes to stuff */
+            WORD32 i4_stuff_bytes;
+
+            /* update */
+            i4_stuff_bytes = ih264e_update_rc_post_enc(ps_codec, ctxt_sel, ps_proc->i4_pic_cnt);
+
+            /* cbr rc - house keeping */
+            if (ps_codec->s_rate_control.post_encode_skip[ctxt_sel])
+            {
+                /* frame skipped after encoding: rewind the stream offset */
+                /* so that zero bytes are reported for it                 */
+                ps_entropy->ps_bitstrm->u4_strm_buf_offset = 0;
+            }
+            else if (i4_stuff_bytes)
+            {
+                /* add filler nal units */
+                ps_entropy->i4_error_code |= ih264e_add_filler_nal_unit(ps_bitstrm, i4_stuff_bytes);
+            }
+        }
+
+        /********************************************************************/
+        /*                      signal the output                           */
+        /********************************************************************/
+        ps_codec->as_out_buf[ctxt_sel].s_bits_buf.u4_bytes = ps_entropy->ps_bitstrm->u4_strm_buf_offset;
+
+        DEBUG("entropy status %x", ps_entropy->i4_error_code);
+    }
+
+    /* allow threads to dequeue entropy jobs */
+    ps_codec->au4_entropy_thread_active[ctxt_sel] = 0;
+
+    return ps_entropy->i4_error_code;
+}
+
+/**
+*******************************************************************************
+*
+* @brief Packs header information of a mb in to a buffer
+*
+* @par Description:
+* After the deciding the mode info of a macroblock, the syntax elements
+* associated with the mb are packed and stored. The entropy thread unpacks
+* this buffer and generates the end bit stream.
+* +* @param[in] ps_proc +* Pointer to the current process context +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_pack_header_data(process_ctxt_t *ps_proc) +{ + /* curr mb type */ + UWORD32 u4_mb_type = ps_proc->u4_mb_type; + + /* pack mb syntax layer of curr mb (used for entropy coding) */ + if (u4_mb_type == I4x4) + { + /* pointer to mb header storage space */ + UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + + /* temp var */ + WORD32 i4, byte; + + /* mb type plus mode */ + *pu1_ptr++ = (ps_proc->u1_c_i8_mode << 6) + u4_mb_type; + + /* cbp */ + *pu1_ptr++ = ps_proc->u4_cbp; + + /* mb qp delta */ + *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; + + /* sub mb modes */ + for (i4 = 0; i4 < 16; i4 ++) + { + byte = 0; + + if (ps_proc->au1_predicted_intra_luma_mb_4x4_modes[i4] == + ps_proc->au1_intra_luma_mb_4x4_modes[i4]) + { + byte |= 1; + } + else + { + + if (ps_proc->au1_intra_luma_mb_4x4_modes[i4] < + ps_proc->au1_predicted_intra_luma_mb_4x4_modes[i4]) + { + byte |= (ps_proc->au1_intra_luma_mb_4x4_modes[i4] << 1); + } + else + { + byte |= (ps_proc->au1_intra_luma_mb_4x4_modes[i4] - 1) << 1; + } + } + + i4++; + + if (ps_proc->au1_predicted_intra_luma_mb_4x4_modes[i4] == + ps_proc->au1_intra_luma_mb_4x4_modes[i4]) + { + byte |= 16; + } + else + { + + if (ps_proc->au1_intra_luma_mb_4x4_modes[i4] < + ps_proc->au1_predicted_intra_luma_mb_4x4_modes[i4]) + { + byte |= (ps_proc->au1_intra_luma_mb_4x4_modes[i4] << 5); + } + else + { + byte |= (ps_proc->au1_intra_luma_mb_4x4_modes[i4] - 1) << 5; + } + } + + *pu1_ptr++ = byte; + } + + /* end of mb layer */ + ps_proc->pv_mb_header_data = pu1_ptr; + } + else if (u4_mb_type == I16x16) + { + /* pointer to mb header storage space */ + UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + + /* mb type plus mode */ + *pu1_ptr++ = (ps_proc->u1_c_i8_mode << 6) + (ps_proc->u1_l_i16_mode << 4) + u4_mb_type; + + /* cbp */ + 
*pu1_ptr++ = ps_proc->u4_cbp; + + /* mb qp delta */ + *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; + + /* end of mb layer */ + ps_proc->pv_mb_header_data = pu1_ptr; + } + else if (u4_mb_type == P16x16) + { + /* pointer to mb header storage space */ + UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + + WORD16 *i2_mv_ptr; + + /* mb type plus mode */ + *pu1_ptr++ = u4_mb_type; + + /* cbp */ + *pu1_ptr++ = ps_proc->u4_cbp; + + /* mb qp delta */ + *pu1_ptr++ = ps_proc->u4_mb_qp - ps_proc->u4_mb_qp_prev; + + i2_mv_ptr = (WORD16 *)pu1_ptr; + + *i2_mv_ptr++ = ps_proc->ps_pu->s_l0_mv.i2_mvx - ps_proc->ps_pred_mv->i2_mvx; + + *i2_mv_ptr++ = ps_proc->ps_pu->s_l0_mv.i2_mvy - ps_proc->ps_pred_mv->i2_mvy; + + /* end of mb layer */ + ps_proc->pv_mb_header_data = i2_mv_ptr; + } + else if (u4_mb_type == PSKIP) + { + /* pointer to mb header storage space */ + UWORD8 *pu1_ptr = ps_proc->pv_mb_header_data; + + /* mb type plus mode */ + *pu1_ptr++ = u4_mb_type; + + /* end of mb layer */ + ps_proc->pv_mb_header_data = pu1_ptr; + } + + return IH264E_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief update process context after encoding an mb. This involves preserving +* the current mb information for later use, initialize the proc ctxt elements to +* encode next mb. +* +* @par Description: +* This function performs house keeping tasks after encoding an mb. +* After encoding an mb, various elements of the process context needs to be +* updated to encode the next mb. For instance, the source, recon and reference +* pointers, mb indices have to be adjusted to the next mb. The slice index of +* the current mb needs to be updated. If mb qp modulation is enabled, then if +* the qp changes the quant param structure needs to be updated. Also to encoding +* the next mb, the current mb info is used as part of mode prediction or mv +* prediction. Hence the current mb info has to preserved at top/top left/left +* locations. 
+*
+* @param[in] ps_proc
+*  Pointer to the current process context
+*
+* @returns error status: IH264_SUCCESS or'ed with the status returned when
+*  the entropy job of a completed row is queued
+*
+* @remarks none
+*
+*******************************************************************************
+*/
+WORD32 ih264e_update_proc_ctxt(process_ctxt_t *ps_proc)
+{
+    /* error status */
+    WORD32 error_status = IH264_SUCCESS;
+
+    /* codec context */
+    codec_t *ps_codec = ps_proc->ps_codec;
+
+    /* curr mb indices */
+    WORD32 i4_mb_x = ps_proc->i4_mb_x;
+    WORD32 i4_mb_y = ps_proc->i4_mb_y;
+
+    /* mb syntax elements of neighbors */
+    mb_info_t *ps_left_syn = &ps_proc->s_left_mb_syntax_ele;
+    mb_info_t *ps_top_syn = ps_proc->ps_top_row_mb_syntax_ele + i4_mb_x;
+    mb_info_t *ps_top_left_syn = &ps_proc->s_top_left_mb_syntax_ele;
+
+    /* curr mb type */
+    UWORD32 u4_mb_type = ps_proc->u4_mb_type;
+
+    /* curr mb class (intra or not) */
+    UWORD32 u4_is_intra = ps_proc->u4_is_intra;
+
+    /* width in mbs */
+    WORD32 i4_wd_mbs = ps_proc->i4_wd_mbs;
+
+    /* height in mbs */
+    WORD32 i4_ht_mbs = ps_proc->i4_ht_mbs;
+
+    /* proc map (row of the current mb) */
+    UWORD8 *pu1_proc_map = ps_proc->pu1_proc_map + (i4_mb_y * i4_wd_mbs);
+
+    /* deblk context */
+    deblk_ctxt_t *ps_deblk = &ps_proc->s_deblk_ctxt;
+
+    /* deblk bs context */
+    bs_ctxt_t *ps_bs = &(ps_deblk->s_bs_ctxt);
+
+    /* top row motion vector info */
+    enc_pu_t *ps_top_row_pu = ps_proc->ps_top_row_pu + i4_mb_x;
+
+    /* top left mb motion vector */
+    enc_pu_t *ps_top_left_mb_pu = &ps_proc->s_top_left_mb_pu;
+
+    /* left mb motion vector */
+    enc_pu_t *ps_left_mb_pu = &ps_proc->s_left_mb_pu;
+
+    /* sub mb modes (16 entries per mb in the top row buffer) */
+    UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (i4_mb_x << 4);
+
+//    /* zero mv */
+//    mv_t zero_mv = {0, 0};
+
+    /* Pad the MB to support non standard sizes */
+    UWORD32 u4_pad_right_sz = ps_codec->s_cfg.u4_wd - ps_codec->s_cfg.u4_disp_wd;
+    UWORD32 u4_pad_bottom_sz = ps_codec->s_cfg.u4_ht - ps_codec->s_cfg.u4_disp_ht;
+
+    /*************************************************************/
+    /* During MV prediction, when top right mb is not available, */
+    /* top left mb info. is used for prediction. Hence the curr  */
+    /* top, which will be top left for the next mb needs to be   */
+    /* preserved before updating it with curr mb info.           */
+    /*************************************************************/
+
+    /* mb type, mb class, csbp */
+    *ps_top_left_syn = *ps_top_syn;
+
+    if (ps_proc->i4_slice_type == PSLICE)
+    {
+        /*****************************************/
+        /* update top left with top info results */
+        /*****************************************/
+
+        /* mv */
+        *ps_top_left_mb_pu = *ps_top_row_pu;
+    }
+
+    /*************************************************/
+    /* update top and left with curr mb info results */
+    /*************************************************/
+
+    /* mb type */
+    ps_left_syn->u2_mb_type = ps_top_syn->u2_mb_type = u4_mb_type;
+
+    /* mb class */
+    ps_left_syn->u2_is_intra = ps_top_syn->u2_is_intra = u4_is_intra;
+
+    /* csbp */
+    ps_left_syn->u4_csbp = ps_top_syn->u4_csbp = ps_proc->u4_csbp;
+
+    /* distortion */
+    ps_left_syn->i4_mb_distortion = ps_top_syn->i4_mb_distortion = ps_proc->i4_mb_distortion;
+
+    if (u4_is_intra)
+    {
+        /* mb / sub mb modes */
+        if (I16x16 == u4_mb_type)
+        {
+            pu1_top_mb_intra_modes[0] = ps_proc->au1_left_mb_intra_modes[0] = ps_proc->u1_l_i16_mode;
+        }
+        else if (I4x4 == u4_mb_type)
+        {
+            ps_codec->pf_mem_cpy_mul8(ps_proc->au1_left_mb_intra_modes, ps_proc->au1_intra_luma_mb_4x4_modes, 16);
+            ps_codec->pf_mem_cpy_mul8(pu1_top_mb_intra_modes, ps_proc->au1_intra_luma_mb_4x4_modes, 16);
+        }
+        else if (I8x8 == u4_mb_type)
+        {
+            memcpy(ps_proc->au1_left_mb_intra_modes, ps_proc->au1_intra_luma_mb_8x8_modes, 4);
+            memcpy(pu1_top_mb_intra_modes, ps_proc->au1_intra_luma_mb_8x8_modes, 4);
+        }
+
+        if (ps_proc->i4_slice_type == PSLICE)
+        {
+            /* mv */
+            *ps_left_mb_pu = *ps_top_row_pu = *(ps_proc->ps_pu);
+
+//            /* reset ngbr mv's */
+//            ps_top_row_pu->i1_l0_ref_idx = -1;
+//            ps_top_row_pu->s_l0_mv = zero_mv;
+//
+//            *ps_left_mb_pu = *ps_top_row_pu;
+        }
+    }
+    else
+    {
+        /* mv */
+        *ps_left_mb_pu = *ps_top_row_pu = *(ps_proc->ps_pu);
+    }
+
+    /*
+     * Mark that the MB has been coded intra
+     * So that future AIRs can skip it
+     */
+    ps_proc->pu1_is_intra_coded[i4_mb_x + (i4_mb_y * i4_wd_mbs)] = u4_is_intra;
+
+    /**************************************************/
+    /* pack mb header info. for entropy coding        */
+    /**************************************************/
+    ih264e_pack_header_data(ps_proc);
+
+    /* update previous mb qp */
+    ps_proc->u4_mb_qp_prev = ps_proc->u4_mb_qp;
+
+    /* store qp */
+    ps_proc->s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp[(i4_mb_y * i4_wd_mbs) + i4_mb_x] = ps_proc->u4_mb_qp;
+
+    /*
+     * We need to sync the cache to make sure that the nmv content of proc
+     * is updated to cache properly
+     */
+    DATA_SYNC();
+
+    /* Just before finishing the row, enqueue the job in to entropy queue.
+     * The master thread depending on its convenience shall dequeue it and
+     * performs entropy.
+     *
+     * WARN !! Placing this block post proc map update can cause queuing of
+     * entropy jobs in out of order.
+     */
+    if (i4_mb_x == i4_wd_mbs - 1)
+    {
+        /* job structures */
+        job_t s_job;
+
+        /* job class */
+        s_job.i4_cmd = CMD_ENTROPY;
+
+        /* number of mbs to be processed in the current job */
+        s_job.i2_mb_cnt = ps_codec->s_cfg.i4_wd_mbs;
+
+        /* job start index x */
+        s_job.i2_mb_x = 0;
+
+        /* job start index y */
+        s_job.i2_mb_y = ps_proc->i4_mb_y;
+
+        /* proc base idx */
+        s_job.i2_proc_base_idx = (ps_codec->i4_encode_api_call_cnt & 1) ? (MAX_PROCESS_CTXT / 2): 0 ;
+
+        /* queue the job */
+        /* NOTE(review): the job is queued through ps_proc->pv_entropy_jobq  */
+        /* while the queue is terminated through ps_codec->pv_entropy_jobq - */
+        /* presumably both refer to the same queue; confirm                  */
+        error_status |= ih264_list_queue(ps_proc->pv_entropy_jobq, &s_job, 1);
+
+        /* last row of the frame: no further entropy jobs will follow */
+        if(ps_proc->i4_mb_y == (i4_ht_mbs - 1))
+            ih264_list_terminate(ps_codec->pv_entropy_jobq);
+    }
+
+    /* update proc map (ih264e_entropy polls this entry before coding the mb) */
+    pu1_proc_map[i4_mb_x] = 1;
+
+    /**************************************************/
+    /* update proc ctxt elements for encoding next mb */
+    /**************************************************/
+    /* update indices */
+    i4_mb_x ++;
+    ps_proc->i4_mb_x = i4_mb_x;
+
+    if (ps_proc->i4_mb_x == i4_wd_mbs)
+    {
+        ps_proc->i4_mb_y++;
+        ps_proc->i4_mb_x = 0;
+    }
+
+    /* update slice index */
+    /* NOTE(review): after the last mb of the frame i4_mb_y equals i4_ht_mbs */
+    /* here, so the entry at index i4_wd_mbs * i4_ht_mbs is read - confirm   */
+    /* that pu1_slice_idx has room for it                                    */
+    ps_proc->i4_cur_slice_idx = ps_proc->pu1_slice_idx[ps_proc->i4_mb_y * i4_wd_mbs + ps_proc->i4_mb_x];
+
+    /* update buffers pointers */
+    ps_proc->pu1_src_buf_luma += MB_SIZE;
+    ps_proc->pu1_rec_buf_luma += MB_SIZE;
+    ps_proc->pu1_ref_buf_luma += MB_SIZE;
+
+    /*
+     * Note: Although chroma mb size is 8, as the chroma buffers are interleaved,
+     * the stride per MB is MB_SIZE
+     */
+    ps_proc->pu1_src_buf_chroma += MB_SIZE;
+    ps_proc->pu1_rec_buf_chroma += MB_SIZE;
+    ps_proc->pu1_ref_buf_chroma += MB_SIZE;
+
+    /* pad right edge (done when the next mb is the last one of its row) */
+    if (u4_pad_right_sz && (ps_proc->i4_mb_x == i4_wd_mbs - 1))
+    {
+        ih264_pad_right_luma(
+                        ps_proc->pu1_src_buf_luma + MB_SIZE - u4_pad_right_sz,
+                        ps_proc->i4_src_strd, MB_SIZE, u4_pad_right_sz);
+
+        ih264_pad_right_chroma(
+                        ps_proc->pu1_src_buf_chroma + MB_SIZE - u4_pad_right_sz,
+                        ps_proc->i4_src_strd, BLK8x8SIZE, u4_pad_right_sz);
+    }
+
+    /* pad bottom edge (last mb row, for every next mb but the first of the row) */
+    if (u4_pad_bottom_sz && (ps_proc->i4_mb_y == i4_ht_mbs - 1) &&
+                    ps_proc->i4_mb_x != 0)
+    {
+        ih264_pad_bottom(ps_proc->pu1_src_buf_luma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd,
+                         ps_proc->i4_src_strd, MB_SIZE, u4_pad_bottom_sz);
+
+        ih264_pad_bottom(ps_proc->pu1_src_buf_chroma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd / 2,
+                         ps_proc->i4_src_strd, MB_SIZE, (u4_pad_bottom_sz / 2));
+    }
+
+    /* Reset cost, distortion params */
+    ps_proc->i4_mb_cost = INT_MAX;
+    ps_proc->i4_mb_distortion = SHRT_MAX;
+
+    /* skip the pu entries of the current mb, then move to the next pu count */
+    ps_proc->ps_pu += *ps_proc->pu4_mb_pu_cnt;
+
+    ps_proc->pu4_mb_pu_cnt += 1;
+
+    /* deblk ctxts */
+    if (ps_proc->u4_disable_deblock_level != 1)
+    {
+        /* indices */
+        ps_bs->i4_mb_x = ps_proc->i4_mb_x;
+        ps_bs->i4_mb_y = ps_proc->i4_mb_y;
+
+#ifndef N_MB_ENABLE /* For N MB processing update take place inside deblocking function */
+        ps_deblk->i4_mb_x ++;
+
+        ps_deblk->pu1_cur_pic_luma += MB_SIZE;
+        /*
+         * Note: Although chroma mb size is 8, as the chroma buffers are interleaved,
+         * the stride per MB is MB_SIZE
+         */
+        ps_deblk->pu1_cur_pic_chroma += MB_SIZE;
+#endif
+    }
+
+    return error_status;
+}
+
+/**
+*******************************************************************************
+*
+* @brief initialize process context.
+*
+* @par Description:
+* Before dispatching the current job to process thread, the process context
+* associated with the job is initialized. Usually every job aims to encode one
+* row of mb's. Basing on the row indices provided by the job, the process
+* context's buffer ptrs, slice indices and other elements that are necessary
+* during core-coding are initialized.
+* +* @param[in] ps_proc +* Pointer to the current process context +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc) +{ + /* codec context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* nmb processing context*/ + n_mb_process_ctxt_t *ps_n_mb_ctxt = &ps_proc->s_n_mb_ctxt; + + /* indices */ + WORD32 i4_mb_x, i4_mb_y; + + /* strides */ + WORD32 i4_src_strd = ps_proc->i4_src_strd; + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + + /* quant params */ + quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0]; + + /* deblk ctxt */ + deblk_ctxt_t *ps_deblk = &ps_proc->s_deblk_ctxt; + + /* deblk bs context */ + bs_ctxt_t *ps_bs = &(ps_deblk->s_bs_ctxt); + + /* Pointer to mv_buffer of current frame */ + mv_buf_t *ps_cur_mv_buf = ps_proc->ps_cur_mv_buf; + + /* Pointers for color space conversion */ + UWORD8 *pu1_y_buf_base, *pu1_u_buf_base, *pu1_v_buf_base; + + /* Pad the MB to support non standard sizes */ + UWORD32 u4_pad_bottom_sz = ps_codec->s_cfg.u4_ht - ps_codec->s_cfg.u4_disp_ht; + + /********************************************************************/ + /* BEGIN INIT */ + /********************************************************************/ + + i4_mb_x = ps_proc->i4_mb_x; + i4_mb_y = ps_proc->i4_mb_y; + + /* Number of mbs processed in one loop of process function */ + ps_proc->i4_nmb_ntrpy = (ps_proc->i4_wd_mbs > MAX_NMB) ? MAX_NMB : ps_proc->i4_wd_mbs; + ps_proc->u4_nmb_me = (ps_proc->i4_wd_mbs > MAX_NMB)? 
MAX_NMB : ps_proc->i4_wd_mbs; + + /* init buffer pointers */ + ps_proc->pu1_src_buf_luma = ps_proc->pu1_src_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_src_strd * (i4_mb_y * MB_SIZE); + ps_proc->pu1_src_buf_chroma = ps_proc->pu1_src_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_src_strd * (i4_mb_y * BLK8x8SIZE); + ps_proc->pu1_rec_buf_luma = ps_proc->pu1_rec_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * MB_SIZE); + ps_proc->pu1_rec_buf_chroma = ps_proc->pu1_rec_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * BLK8x8SIZE); + ps_proc->pu1_ref_buf_luma = ps_proc->pu1_ref_buf_luma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * MB_SIZE); + ps_proc->pu1_ref_buf_chroma = ps_proc->pu1_ref_buf_chroma_base + (i4_mb_x * MB_SIZE) + i4_rec_strd * (i4_mb_y * BLK8x8SIZE); + + /* + * Do color space conversion + * NOTE : We assume there that the number of MB's to process will not span multiple rows + */ + switch (ps_codec->s_cfg.e_inp_color_fmt) + { + case IV_YUV_420SP_UV: + case IV_YUV_420SP_VU: + break; + + case IV_YUV_420P : + pu1_y_buf_base = (UWORD8 *)ps_proc->s_inp_buf.s_raw_buf.apv_bufs[0] + (i4_mb_x * MB_SIZE) + + ps_proc->s_inp_buf.s_raw_buf.au4_strd[0] * (i4_mb_y * MB_SIZE); + + pu1_u_buf_base = (UWORD8 *)ps_proc->s_inp_buf.s_raw_buf.apv_bufs[1] + (i4_mb_x * BLK8x8SIZE) + + ps_proc->s_inp_buf.s_raw_buf.au4_strd[1] * (i4_mb_y * BLK8x8SIZE); + + pu1_v_buf_base = (UWORD8 *)ps_proc->s_inp_buf.s_raw_buf.apv_bufs[2] + (i4_mb_x * BLK8x8SIZE) + + ps_proc->s_inp_buf.s_raw_buf.au4_strd[2] * (i4_mb_y * BLK8x8SIZE); + + ps_codec->pf_ih264e_conv_420p_to_420sp( + pu1_y_buf_base, pu1_u_buf_base, pu1_v_buf_base, + ps_proc->pu1_src_buf_luma, + ps_proc->pu1_src_buf_chroma, MB_SIZE, + ps_proc->i4_wd_mbs * MB_SIZE, + ps_proc->s_inp_buf.s_raw_buf.au4_strd[0], + ps_proc->s_inp_buf.s_raw_buf.au4_strd[1], + ps_proc->s_inp_buf.s_raw_buf.au4_strd[2], + ps_proc->i4_src_strd, ps_proc->i4_src_strd, 1); + break; + + case IV_YUV_422ILE : + pu1_y_buf_base = (UWORD8 
*)ps_proc->s_inp_buf.s_raw_buf.apv_bufs[0] + (i4_mb_x * MB_SIZE * 2) + + ps_proc->s_inp_buf.s_raw_buf.au4_strd[0] * (i4_mb_y * MB_SIZE); + + ps_codec->pf_ih264e_fmt_conv_422i_to_420sp( + ps_proc->pu1_src_buf_luma, + ps_proc->pu1_src_buf_chroma, + ps_proc->pu1_src_buf_chroma + 1, pu1_y_buf_base, + ps_proc->i4_wd_mbs * MB_SIZE, MB_SIZE, + ps_proc->i4_src_strd, ps_proc->i4_src_strd, + ps_proc->i4_src_strd, + ps_proc->s_inp_buf.s_raw_buf.au4_strd[0] >> 1); + break; + + default: + break; + } + + /* pad bottom edge */ + if (u4_pad_bottom_sz && (ps_proc->i4_mb_y == ps_proc->i4_ht_mbs - 1) && ps_proc->i4_mb_x == 0) + { + ih264_pad_bottom(ps_proc->pu1_src_buf_luma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd, + ps_proc->i4_src_strd, MB_SIZE, u4_pad_bottom_sz); + + ih264_pad_bottom(ps_proc->pu1_src_buf_chroma + (MB_SIZE - u4_pad_bottom_sz) * ps_proc->i4_src_strd / 2, + ps_proc->i4_src_strd, MB_SIZE, (u4_pad_bottom_sz / 2)); + } + + /* packed mb coeff data */ + ps_proc->pv_mb_coeff_data = ((UWORD8 *)ps_proc->pv_pic_mb_coeff_data) + i4_mb_y * ps_codec->u4_size_coeff_data; + + /* packed mb header data */ + ps_proc->pv_mb_header_data = ((UWORD8 *)ps_proc->pv_pic_mb_header_data) + i4_mb_y * ps_codec->u4_size_header_data; + + /* slice index */ + ps_proc->i4_cur_slice_idx = ps_proc->pu1_slice_idx[i4_mb_y * ps_proc->i4_wd_mbs + i4_mb_x]; + + /*********************************************************************/ + /* ih264e_init_quant_params() routine is called at the pic init level*/ + /* this would have initialized the qp. */ + /* TODO_LATER: currently it is assumed that quant params donot change*/ + /* across mb's. 
When they do calculate update ps_qp_params accordingly*/ + /*********************************************************************/ + + /* init mv buffer ptr */ + ps_proc->ps_pu = ps_cur_mv_buf->ps_pic_pu + (i4_mb_y * ps_proc->i4_wd_mbs * (MIN_PU_SIZE * MIN_PU_SIZE)); + + if (i4_mb_y == 0) + { + ps_proc->ps_top_row_pu_ME = ps_cur_mv_buf->ps_pic_pu; + } + else + { + ps_proc->ps_top_row_pu_ME = ps_cur_mv_buf->ps_pic_pu + ((i4_mb_y - 1) * ps_proc->i4_wd_mbs * (MIN_PU_SIZE * MIN_PU_SIZE)); + } + + ps_proc->pu4_mb_pu_cnt = ps_cur_mv_buf->pu4_mb_pu_cnt + (i4_mb_y * ps_proc->i4_wd_mbs); + + /* mb type */ + ps_proc->u4_mb_type = I16x16; + + /* lambda */ + ps_proc->u4_lambda = gu1_qp0[ps_qp_params->u1_mb_qp]; + + /* mb distortion */ + ps_proc->i4_mb_distortion = SHRT_MAX; + + if (i4_mb_x == 0) + { + ps_proc->s_left_mb_syntax_ele.i4_mb_distortion = 0; + + ps_proc->s_top_left_mb_syntax_ele.i4_mb_distortion = 0; + + ps_proc->s_top_left_mb_syntax_ME.i4_mb_distortion = 0; + + if (i4_mb_y == 0) + { + memset(ps_proc->ps_top_row_mb_syntax_ele, 0, (ps_proc->i4_wd_mbs + 1)*sizeof(mb_info_t)); + } + } + + /* mb cost */ + ps_proc->i4_mb_cost = INT_MAX; + + /**********************/ + /* init deblk context */ + /**********************/ + ps_deblk->i4_mb_x = ps_proc->i4_mb_x; + /* deblk lags the current mb proc by 1 row */ + /* NOTE: Intra prediction has to happen with non deblocked samples used as reference */ + /* Hence to deblk MB 0 of row 0, you have wait till MB 0 of row 1 is encoded. 
*/ + /* For simplicity, we chose to lag deblking by 1 Row wrt to proc */ + ps_deblk->i4_mb_y = ps_proc->i4_mb_y - 1; + + /* buffer ptrs */ + ps_deblk->pu1_cur_pic_luma = ps_proc->pu1_rec_buf_luma_base + i4_rec_strd * (ps_deblk->i4_mb_y * MB_SIZE); + ps_deblk->pu1_cur_pic_chroma = ps_proc->pu1_rec_buf_chroma_base + i4_rec_strd * (ps_deblk->i4_mb_y * BLK8x8SIZE); + + /* init deblk bs context */ + /* mb indices */ + ps_bs->i4_mb_x = ps_proc->i4_mb_x; + ps_bs->i4_mb_y = ps_proc->i4_mb_y; + + /* init n_mb_process context */ + ps_n_mb_ctxt->i4_mb_x = 0; + ps_n_mb_ctxt->i4_mb_y = ps_deblk->i4_mb_y; + ps_n_mb_ctxt->i4_n_mbs = ps_proc->i4_nmb_ntrpy; + + return IH264E_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief This function performs luma & chroma padding +* +* @par Description: +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @param[in] pu1_curr_pic_luma +* Pointer to luma buffer +* +* @param[in] pu1_curr_pic_chroma +* Pointer to chroma buffer +* +* @param[in] i4_mb_x +* mb index x +* +* @param[in] i4_mb_y +* mb index y +* +* @param[in] i4_pad_ht +* number of rows to be padded +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_pad_recon_buffer(process_ctxt_t *ps_proc, + UWORD8 *pu1_curr_pic_luma, + UWORD8 *pu1_curr_pic_chroma, + WORD32 i4_mb_x, + WORD32 i4_mb_y, + WORD32 i4_pad_ht) +{ + /* codec context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* strides */ + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + + if (i4_mb_x == 0) + { + /* padding left luma */ + ps_codec->pf_pad_left_luma(pu1_curr_pic_luma, i4_rec_strd, i4_pad_ht, PAD_LEFT); + + /* padding left chroma */ + ps_codec->pf_pad_left_chroma(pu1_curr_pic_chroma, i4_rec_strd, i4_pad_ht >> 1, PAD_LEFT); + } + else if (i4_mb_x == ps_proc->i4_wd_mbs - 1) + { + /* padding right luma */ + 
ps_codec->pf_pad_right_luma(pu1_curr_pic_luma + MB_SIZE, i4_rec_strd, i4_pad_ht, PAD_RIGHT); + + /* padding right chroma */ + ps_codec->pf_pad_right_chroma(pu1_curr_pic_chroma + MB_SIZE, i4_rec_strd, i4_pad_ht >> 1, PAD_RIGHT); + + if (i4_mb_y == ps_proc->i4_ht_mbs - 1) + { + UWORD8 *pu1_rec_luma = pu1_curr_pic_luma + MB_SIZE + PAD_RIGHT + ((i4_pad_ht - 1) * i4_rec_strd); + UWORD8 *pu1_rec_chroma = pu1_curr_pic_chroma + MB_SIZE + PAD_RIGHT + (((i4_pad_ht >> 1) - 1) * i4_rec_strd); + + /* padding bottom luma */ + ps_codec->pf_pad_bottom(pu1_rec_luma, i4_rec_strd, i4_rec_strd, PAD_BOT); + + /* padding bottom chroma */ + ps_codec->pf_pad_bottom(pu1_rec_chroma, i4_rec_strd, i4_rec_strd, (PAD_BOT >> 1)); + } + } + + if (i4_mb_y == 0) + { + UWORD8 *pu1_rec_luma = pu1_curr_pic_luma; + UWORD8 *pu1_rec_chroma = pu1_curr_pic_chroma; + WORD32 wd = MB_SIZE; + + if (i4_mb_x == 0) + { + pu1_rec_luma -= PAD_LEFT; + pu1_rec_chroma -= PAD_LEFT; + + wd += PAD_LEFT; + } + else if (i4_mb_x == ps_proc->i4_wd_mbs - 1) + { + wd += PAD_RIGHT; + } + + /* padding top luma */ + ps_codec->pf_pad_top(pu1_rec_luma, i4_rec_strd, wd, PAD_TOP); + + /* padding top chroma */ + ps_codec->pf_pad_top(pu1_rec_chroma, i4_rec_strd, wd, (PAD_TOP >> 1)); + } + + return IH264E_SUCCESS; +} + + + + +/** +******************************************************************************* +* +* @brief This function performs deblocking, padding and halfpel generation for +* 'n' MBs +* +* @par Description: +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @param[in] pu1_curr_pic_luma +* Current MB being processed(Luma) +* +* @param[in] pu1_curr_pic_chroma +* Current MB being processed(Chroma) +* +* @param[in] i4_mb_x +* Column value of current MB processed +* +* @param[in] i4_mb_y +* Curent row processed +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T 
ih264e_dblk_pad_hpel_processing_n_mbs(process_ctxt_t *ps_proc, + UWORD8 *pu1_curr_pic_luma, + UWORD8 *pu1_curr_pic_chroma, + WORD32 i4_mb_x, + WORD32 i4_mb_y) +{ + /* codec context */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* n_mb processing context */ + n_mb_process_ctxt_t *ps_n_mb_ctxt = &ps_proc->s_n_mb_ctxt; + + /* deblk context */ + deblk_ctxt_t *ps_deblk = &ps_proc->s_deblk_ctxt; + + /* strides */ + WORD32 i4_rec_strd = ps_proc->i4_rec_strd; + + /* loop variables */ + WORD32 row, i, j, col; + + /* Padding Width */ + UWORD32 u4_pad_wd; + + /* deblk_map of the row being deblocked */ + UWORD8 *pu1_deblk_map = ps_proc->pu1_deblk_map + ps_deblk->i4_mb_y * ps_proc->i4_wd_mbs; + + /* deblk_map_previous row */ + UWORD8 *pu1_deblk_map_prev_row = pu1_deblk_map - ps_proc->i4_wd_mbs; + + WORD32 u4_pad_top = 0; + + WORD32 u4_deblk_prev_row = 0; + + /* Number of mbs to be processed */ + WORD32 i4_n_mbs = ps_n_mb_ctxt->i4_n_mbs; + + /* Number of mbs actually processed + * (at the end of a row, when remaining number of MBs are less than i4_n_mbs) */ + WORD32 i4_n_mb_process_count = 0; + + UWORD8 *pu1_pad_bottom_src = NULL; + + UWORD8 *pu1_pad_src_luma = NULL; + UWORD8 *pu1_pad_src_chroma = NULL; + + if (ps_proc->u4_disable_deblock_level == 1) + { + /* If left most MB is processed, then pad left */ + if (i4_mb_x == 0) + { + /* padding left luma */ + ps_codec->pf_pad_left_luma(pu1_curr_pic_luma, i4_rec_strd, MB_SIZE, PAD_LEFT); + + /* padding left chroma */ + ps_codec->pf_pad_left_chroma(pu1_curr_pic_chroma, i4_rec_strd, MB_SIZE >> 1, PAD_LEFT); + } + /*last col*/ + if (i4_mb_x == (ps_proc->i4_wd_mbs - 1)) + { + /* padding right luma */ + ps_codec->pf_pad_right_luma(pu1_curr_pic_luma + MB_SIZE, i4_rec_strd, MB_SIZE, PAD_RIGHT); + + /* padding right chroma */ + ps_codec->pf_pad_right_chroma(pu1_curr_pic_chroma + MB_SIZE, i4_rec_strd, MB_SIZE >> 1, PAD_RIGHT); + } + } + + if (i4_mb_y > 0) + { + /* if number of mb's to be processed are less than 'N', go back. 
+ * exception to the above clause is end of row */ + if ( ((i4_mb_x - (ps_n_mb_ctxt->i4_mb_x - 1)) < i4_n_mbs) && (i4_mb_x < (ps_proc->i4_wd_mbs - 1)) ) + { + return IH264E_SUCCESS; + } + else + { + i4_n_mb_process_count = MIN(i4_mb_x - (ps_n_mb_ctxt->i4_mb_x - 1), i4_n_mbs); + + u4_deblk_prev_row = 1; + + /* checking whether the top rows are deblocked */ + for (col = 0; col < i4_n_mb_process_count; col++) + { + u4_deblk_prev_row &= pu1_deblk_map_prev_row[ps_deblk->i4_mb_x + col]; + } + + /* checking whether the top right MB is deblocked */ + if ((ps_deblk->i4_mb_x + i4_n_mb_process_count) != ps_proc->i4_wd_mbs) + { + u4_deblk_prev_row &= pu1_deblk_map_prev_row[ps_deblk->i4_mb_x + i4_n_mb_process_count]; + } + + /* performing deblocking for required number of MBs */ + if (ps_proc->u4_disable_deblock_level != 1) + { + /* Top or Top right MBs not deblocked */ + if (u4_deblk_prev_row != 1) + { + return IH264E_SUCCESS; + } + + for (row = 0; row < i4_n_mb_process_count; row++) + { + ih264e_deblock_mb(ps_proc, ps_deblk); + + pu1_deblk_map[ps_deblk->i4_mb_x] = 1; + + if (ps_deblk->i4_mb_y > 0) + { + if (ps_deblk->i4_mb_x == 0)/* If left most MB is processed, then pad left*/ + { + /* padding left luma */ + ps_codec->pf_pad_left_luma(ps_deblk->pu1_cur_pic_luma - i4_rec_strd * MB_SIZE, i4_rec_strd, MB_SIZE, PAD_LEFT); + + /* padding left chroma */ + ps_codec->pf_pad_left_chroma(ps_deblk->pu1_cur_pic_chroma - i4_rec_strd * BLK8x8SIZE, i4_rec_strd, MB_SIZE >> 1, PAD_LEFT); + } + + if (ps_deblk->i4_mb_x == (ps_proc->i4_wd_mbs - 1))/*last column*/ + { + /* padding right luma */ + ps_codec->pf_pad_right_luma(ps_deblk->pu1_cur_pic_luma - i4_rec_strd * MB_SIZE + MB_SIZE, i4_rec_strd, MB_SIZE, PAD_RIGHT); + + /* padding right chroma */ + ps_codec->pf_pad_right_chroma(ps_deblk->pu1_cur_pic_chroma - i4_rec_strd * BLK8x8SIZE + MB_SIZE, i4_rec_strd, MB_SIZE >> 1, PAD_RIGHT); + } + } + ps_deblk->i4_mb_x++; + + ps_deblk->pu1_cur_pic_luma += MB_SIZE; + ps_deblk->pu1_cur_pic_chroma += 
MB_SIZE; + + } + } + else + { + ps_deblk->i4_mb_x += i4_n_mb_process_count; + + ps_deblk->pu1_cur_pic_luma += i4_n_mb_process_count * MB_SIZE; + ps_deblk->pu1_cur_pic_chroma += i4_n_mb_process_count * MB_SIZE; + } + + if (i4_mb_y == 2) + { + u4_pad_wd = i4_n_mb_process_count * MB_SIZE; + u4_pad_top = ps_n_mb_ctxt->i4_mb_x * MB_SIZE; + + if (ps_n_mb_ctxt->i4_mb_x == 0) + { + u4_pad_wd += PAD_LEFT; + u4_pad_top = -PAD_LEFT; + } + + if (i4_mb_x == ps_proc->i4_wd_mbs - 1) + { + u4_pad_wd += PAD_RIGHT; + } + + /* padding top luma */ + ps_codec->pf_pad_top(ps_proc->pu1_rec_buf_luma_base + u4_pad_top, i4_rec_strd, u4_pad_wd, PAD_TOP); + + /* padding top chroma */ + ps_codec->pf_pad_top(ps_proc->pu1_rec_buf_chroma_base + u4_pad_top, i4_rec_strd, u4_pad_wd, (PAD_TOP >> 1)); + } + + ps_n_mb_ctxt->i4_mb_x += i4_n_mb_process_count; + + if (i4_mb_x == ps_proc->i4_wd_mbs - 1) + { + if (ps_proc->i4_mb_y == ps_proc->i4_ht_mbs - 1) + { + /* Bottom Padding is done in one stretch for the entire width */ + if (ps_proc->u4_disable_deblock_level != 1) + { + ps_deblk->pu1_cur_pic_luma = ps_proc->pu1_rec_buf_luma_base + (ps_proc->i4_ht_mbs - 1) * i4_rec_strd * MB_SIZE; + + ps_deblk->pu1_cur_pic_chroma = ps_proc->pu1_rec_buf_chroma_base + (ps_proc->i4_ht_mbs - 1) * i4_rec_strd * BLK8x8SIZE; + + ps_n_mb_ctxt->i4_mb_x = 0; + ps_n_mb_ctxt->i4_mb_y = ps_proc->i4_mb_y; + ps_deblk->i4_mb_x = 0; + ps_deblk->i4_mb_y = ps_proc->i4_mb_y; + + /* update pic qp map (as update_proc_ctxt is still not called for the last MB) */ + ps_proc->s_deblk_ctxt.s_bs_ctxt.pu1_pic_qp[(i4_mb_y * ps_proc->i4_wd_mbs) + i4_mb_x] = ps_proc->u4_mb_qp; + + i4_n_mb_process_count = (ps_proc->i4_wd_mbs) % i4_n_mbs; + + j = (ps_proc->i4_wd_mbs) / i4_n_mbs; + + for (i = 0; i < j; i++) + { + for (col = 0; col < i4_n_mbs; col++) + { + ih264e_deblock_mb(ps_proc, ps_deblk); + + pu1_deblk_map[ps_deblk->i4_mb_x] = 1; + + ps_deblk->i4_mb_x++; + ps_deblk->pu1_cur_pic_luma += MB_SIZE; + ps_deblk->pu1_cur_pic_chroma += MB_SIZE; + 
ps_n_mb_ctxt->i4_mb_x++; + } + } + + for (col = 0; col < i4_n_mb_process_count; col++) + { + ih264e_deblock_mb(ps_proc, ps_deblk); + + pu1_deblk_map[ps_deblk->i4_mb_x] = 1; + + ps_deblk->i4_mb_x++; + ps_deblk->pu1_cur_pic_luma += MB_SIZE; + ps_deblk->pu1_cur_pic_chroma += MB_SIZE; + ps_n_mb_ctxt->i4_mb_x++; + } + + pu1_pad_src_luma = ps_proc->pu1_rec_buf_luma_base + (ps_proc->i4_ht_mbs - 2) * MB_SIZE * i4_rec_strd; + + pu1_pad_src_chroma = ps_proc->pu1_rec_buf_chroma_base + (ps_proc->i4_ht_mbs - 2) * BLK8x8SIZE * i4_rec_strd; + + /* padding left luma */ + ps_codec->pf_pad_left_luma(pu1_pad_src_luma, i4_rec_strd, MB_SIZE, PAD_LEFT); + + /* padding left chroma */ + ps_codec->pf_pad_left_chroma(pu1_pad_src_chroma, i4_rec_strd, BLK8x8SIZE, PAD_LEFT); + + pu1_pad_src_luma += i4_rec_strd * MB_SIZE; + pu1_pad_src_chroma += i4_rec_strd * BLK8x8SIZE; + + /* padding left luma */ + ps_codec->pf_pad_left_luma(pu1_pad_src_luma, i4_rec_strd, MB_SIZE, PAD_LEFT); + + /* padding left chroma */ + ps_codec->pf_pad_left_chroma(pu1_pad_src_chroma, i4_rec_strd, BLK8x8SIZE, PAD_LEFT); + + pu1_pad_src_luma = ps_proc->pu1_rec_buf_luma_base + (ps_proc->i4_ht_mbs - 2) * MB_SIZE * i4_rec_strd + (ps_proc->i4_wd_mbs) * MB_SIZE; + + pu1_pad_src_chroma = ps_proc->pu1_rec_buf_chroma_base + (ps_proc->i4_ht_mbs - 2) * BLK8x8SIZE * i4_rec_strd + (ps_proc->i4_wd_mbs) * MB_SIZE; + + /* padding right luma */ + ps_codec->pf_pad_right_luma(pu1_pad_src_luma, i4_rec_strd, MB_SIZE, PAD_RIGHT); + + /* padding right chroma */ + ps_codec->pf_pad_right_chroma(pu1_pad_src_chroma, i4_rec_strd, BLK8x8SIZE, PAD_RIGHT); + + pu1_pad_src_luma += i4_rec_strd * MB_SIZE; + pu1_pad_src_chroma += i4_rec_strd * BLK8x8SIZE; + + /* padding right luma */ + ps_codec->pf_pad_right_luma(pu1_pad_src_luma, i4_rec_strd, MB_SIZE, PAD_RIGHT); + + /* padding right chroma */ + ps_codec->pf_pad_right_chroma(pu1_pad_src_chroma, i4_rec_strd, BLK8x8SIZE, PAD_RIGHT); + + } + + /* padding bottom luma */ + pu1_pad_bottom_src = 
ps_proc->pu1_rec_buf_luma_base + ps_proc->i4_ht_mbs * MB_SIZE * i4_rec_strd - PAD_LEFT; + ps_codec->pf_pad_bottom(pu1_pad_bottom_src, i4_rec_strd, i4_rec_strd, PAD_BOT); + + /* padding bottom chroma */ + pu1_pad_bottom_src = ps_proc->pu1_rec_buf_chroma_base + ps_proc->i4_ht_mbs * (MB_SIZE >> 1) * i4_rec_strd - PAD_LEFT; + ps_codec->pf_pad_bottom(pu1_pad_bottom_src, i4_rec_strd, i4_rec_strd, (PAD_BOT >> 1)); + } + } + } + } + + return IH264E_SUCCESS; +} + + +/** +******************************************************************************* +* +* @brief This function performs luma & chroma core coding for a set of mb's. +* +* @par Description: +* The mb to be coded is taken and is evaluated over a predefined set of modes +* (intra (i16, i4, i8)/inter (mv, skip)) for best cost. The mode with least cost +* is selected and using intra/inter prediction filters, prediction is carried out. +* The deviation between src and pred signal constitutes error signal. This error +* signal is transformed (hierarchical transform if necessary) and quantized. The +* quantized residue is packed in to entropy buffer for entropy coding. This is +* repeated for all the mb's enlisted under the job. 
*
* @param[in] ps_proc
*  Process context corresponding to the job
*
* @returns  error status (OR of the statuses returned by
*  ih264e_update_proc_ctxt() and ih264_buf_mgr_release())
*
* @remarks none
*
*******************************************************************************
*/
WORD32 ih264e_process(process_ctxt_t *ps_proc)
{
    /* error status */
    WORD32 error_status = IH264_SUCCESS;

    /* codec context */
    codec_t *ps_codec = ps_proc->ps_codec;

    /* cbp luma, chroma */
    UWORD32 u4_cbp_l, u4_cbp_c;

    /* width in mbs */
    WORD32 i4_wd_mbs = ps_proc->i4_wd_mbs;

    /* loop var */
    WORD32 i4_mb_idx, i4_mb_cnt = ps_proc->i4_mb_cnt;

    /* valid modes (bit mask, one bit per mb type) */
    UWORD32 u4_valid_modes = 0;

    /* gate threshold */
    WORD32 i4_gate_threshold = 0;

    /* is intra */
    WORD32 luma_idx, chroma_idx, is_intra;

    /* temp variables */
    /* selects which of the two as_rec_buf[] entries this call writes */
    WORD32 ctxt_sel = ps_proc->i4_encode_api_call_cnt & 1;

    /* list of modes for evaluation */
    if (ps_proc->i4_slice_type == ISLICE)
    {
        /* enable intra 16x16 */
        u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_16x16 ? (1 << I16x16) : 0;

        /* enable intra 8x8 */
        u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_8x8 ? (1 << I8x8) : 0;

        /* enable intra 4x4 */
        u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_4x4 ? (1 << I4x4) : 0;
    }
    else if (ps_proc->i4_slice_type == PSLICE)
    {
        /* enable intra 16x16 */
        u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_16x16 ? (1 << I16x16) : 0;

        /* enable intra 4x4 (in P slices only for the slowest preset) */
        if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_SLOWEST)
        {
            u4_valid_modes |= ps_codec->s_cfg.u4_enable_intra_4x4 ? (1 << I4x4) : 0;
        }

        /* enable inter 16x16 */
        u4_valid_modes |= (1 << P16x16);
    }


    /* init entropy */
    ps_proc->s_entropy.i4_mb_x = ps_proc->i4_mb_x;
    ps_proc->s_entropy.i4_mb_y = ps_proc->i4_mb_y;
    ps_proc->s_entropy.i4_mb_cnt = MIN(ps_proc->i4_nmb_ntrpy, i4_wd_mbs - ps_proc->i4_mb_x);

    /* compute recon when :
     *   1. current frame is to be used as a reference
     *   2. dump recon for bit stream sanity check
     */
    ps_proc->u4_compute_recon = ps_codec->u4_is_curr_frm_ref ||
                                ps_codec->s_cfg.u4_enable_recon;

    /* Encode 'n' macroblocks,
     * 'n' being the number of mbs dictated by current proc ctxt */
    for (i4_mb_idx = 0; i4_mb_idx < i4_mb_cnt; i4_mb_idx ++)
    {
        /* since we have not yet found sad, we have not yet got min sad */
        /* we need to initialize these variables for each MB */
        /* TODO how to get the min sad into the codec */
        ps_proc->u4_min_sad = ps_codec->s_cfg.i4_min_sad;
        ps_proc->u4_min_sad_reached = 0;

        /* mb analysis */
        {
            /* temp var */
            WORD32 i4_mb_id = ps_proc->i4_mb_x + ps_proc->i4_mb_y * i4_wd_mbs;

            /* force intra refresh ? */
            /* non-zero when inter coding is permitted for this mb, zero when */
            /* adaptive intra refresh demands an intra mb                     */
            WORD32 i4_air_enable_inter = (ps_codec->s_cfg.e_air_mode == IVE_AIR_MODE_NONE) ||
                            (ps_proc->pu1_is_intra_coded[i4_mb_id] != 0) ||
                            (ps_codec->pu2_intr_rfrsh_map[i4_mb_id] != ps_codec->i4_air_pic_cnt);

            /* evaluate inter 16x16 modes */
            if (u4_valid_modes & (1 << P16x16))
            {
                /* compute nmb me */
                /* ME is run once per group of u4_nmb_me mbs, at the first mb of the group */
                if (ps_proc->i4_mb_x % ps_proc->u4_nmb_me == 0)
                {
                    ih264e_compute_me_nmb(ps_proc, MIN((WORD32)ps_proc->u4_nmb_me,
                                                       i4_wd_mbs - ps_proc->i4_mb_x));
                }

                /* set pointers to ME data appropriately for other modules to use */
                {
                    /* position of the current mb inside its ME group */
                    UWORD32 u4_mb_index = ps_proc->i4_mb_x % ps_proc->u4_nmb_me ;

                    /* get the min sad condition for current mb */
                    ps_proc->u4_min_sad_reached = ps_proc->ps_nmb_info[u4_mb_index].u4_min_sad_reached;
                    ps_proc->u4_min_sad = ps_proc->ps_nmb_info[u4_mb_index].u4_min_sad;

                    ps_proc->ps_skip_mv = &(ps_proc->ps_nmb_info[u4_mb_index].s_skip_mv);
                    ps_proc->ps_ngbr_avbl = &(ps_proc->ps_nmb_info[u4_mb_index].s_ngbr_avbl);
                    ps_proc->ps_pred_mv = &(ps_proc->ps_nmb_info[u4_mb_index].s_pred_mv);

                    ps_proc->i4_mb_distortion = ps_proc->ps_nmb_info[u4_mb_index].i4_mb_distortion;
                    ps_proc->i4_mb_cost = ps_proc->ps_nmb_info[u4_mb_index].i4_mb_cost;
                    /* NOTE: the two assignments below repeat the ones at the top of this block */
                    ps_proc->u4_min_sad = ps_proc->ps_nmb_info[u4_mb_index].u4_min_sad;
                    ps_proc->u4_min_sad_reached = ps_proc->ps_nmb_info[u4_mb_index].u4_min_sad_reached;
                    ps_proc->u4_mb_type = ps_proc->ps_nmb_info[u4_mb_index].u4_mb_type;

                    /* get the best sub pel buffer */
                    ps_proc->pu1_best_subpel_buf = ps_proc->ps_nmb_info[u4_mb_index].pu1_best_sub_pel_buf;
                    ps_proc->u4_bst_spel_buf_strd = ps_proc->ps_nmb_info[u4_mb_index].u4_bst_spel_buf_strd;
                }
                ih264e_derive_nghbr_avbl_of_mbs(ps_proc);
            }
            else
            {
                /* Derive neighbor availability for the current macroblock */
                ps_proc->ps_ngbr_avbl = &ps_proc->s_ngbr_avbl;

                ih264e_derive_nghbr_avbl_of_mbs(ps_proc);
            }

            /*
             * If air says intra, we need to force the following code path to evaluate intra
             * The easy way is just to say that the inter cost is too much
             */
            if (!i4_air_enable_inter)
            {
                ps_proc->u4_min_sad_reached = 0;
                ps_proc->i4_mb_cost = INT_MAX;
                ps_proc->i4_mb_distortion = INT_MAX;
            }
            else if (ps_proc->u4_mb_type == PSKIP)
            {
                /* skips intra evaluation and core coding; ps_proc->u4_is_intra */
                /* and ps_proc->u4_cbp are not written on this path             */
                goto UPDATE_MB_INFO;
            }

            /* wait until the proc of [top + 1] mb is computed.
             * We wait till the proc dependencies are satisfied */
            if(ps_proc->i4_mb_y > 0)
            {
                /* proc map */
                UWORD8  *pu1_proc_map_top;

                pu1_proc_map_top = ps_proc->pu1_proc_map + ((ps_proc->i4_mb_y - 1) * i4_wd_mbs);

                /* busy wait, yielding the thread between polls */
                while (1)
                {
                    volatile UWORD8 *pu1_buf;
                    /* NOTE(review): the column polled is derived from the loop   */
                    /* counter i4_mb_idx, not from ps_proc->i4_mb_x; the two agree */
                    /* only if the job starts at column 0 - confirm with callers  */
                    WORD32 idx = i4_mb_idx + 1;

                    idx = MIN(idx, ((WORD32)ps_codec->s_cfg.i4_wd_mbs - 1));
                    pu1_buf =  pu1_proc_map_top + idx;
                    if(*pu1_buf)
                        break;
                    ithread_yield();
                }
            }

            /* If we already have the minimum sad, there is no point in searching for sad again */
            if (ps_proc->u4_min_sad_reached == 0)
            {
                /* intra gating in inter slices */
                /* No need of gating if we want to force intra, we need to find the threshold only if inter is enabled by AIR*/
                if (i4_air_enable_inter && ps_proc->i4_slice_type == PSLICE && ps_codec->u4_inter_gate)
                {
                    /* distortion of neighboring blocks */
                    WORD32 i4_distortion[4];

                    /* left */
                    i4_distortion[0] = ps_proc->s_left_mb_syntax_ele.i4_mb_distortion;

                    /* top */
                    i4_distortion[1] = ps_proc->ps_top_row_mb_syntax_ele[ps_proc->i4_mb_x].i4_mb_distortion;

                    /* top right */
                    i4_distortion[2] = ps_proc->ps_top_row_mb_syntax_ele[ps_proc->i4_mb_x + 1].i4_mb_distortion;

                    /* top left */
                    i4_distortion[3] = ps_proc->s_top_left_mb_syntax_ele.i4_mb_distortion;

                    /* threshold is the average of the four neighbour distortions */
                    i4_gate_threshold = (i4_distortion[0] + i4_distortion[1] + i4_distortion[2] + i4_distortion[3]) >> 2;

                }

                /* If we are going to force intra we need to evaluate intra irrespective of gating */
                if ( (!i4_air_enable_inter) || ((i4_gate_threshold + 16 *((WORD32) ps_proc->u4_lambda)) < ps_proc->i4_mb_distortion))
                {
                    /* evaluate intra 4x4 modes */
                    if (u4_valid_modes & (1 << I4x4))
                    {
                        if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_SLOWEST)
                        {
                            ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(ps_proc);
                        }
                        else
                        {
                            ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(ps_proc);
                        }
                    }

                    /* evaluate intra 16x16 modes */
                    if (u4_valid_modes & (1 << I16x16))
                    {
                        ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(ps_proc);
                    }

                    /* evaluate intra 8x8 modes */
                    if (u4_valid_modes & (1 << I8x8))
                    {
                        ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(ps_proc);
                    }
                }

            }
        }

        /* is intra */
        if (ps_proc->u4_mb_type == I4x4 || ps_proc->u4_mb_type == I16x16 || ps_proc->u4_mb_type == I8x8)
        {
            /* the intra mb type value doubles as the index into luma_energy_compaction[] */
            luma_idx = ps_proc->u4_mb_type;
            chroma_idx = 0;
            is_intra = 1;

            /* evaluate chroma blocks for intra */
            ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(ps_proc);
        }
        else
        {
            /* inter mb: fixed indices into the energy compaction tables */
            luma_idx = 3;
            chroma_idx = 1;
            is_intra = 0;
        }
        ps_proc->u4_is_intra = is_intra;

        /* redo MV pred of neighbors in the case intra mb */
        /* TODO : currently called unconditionally, needs to be called only in the case of intra
         * to modify neighbors */
        if (ps_proc->i4_slice_type != ISLICE)
        {
            ih264e_mv_pred(ps_proc);
        }

        /* Perform luma mb core coding */
        u4_cbp_l = (ps_codec->luma_energy_compaction)[luma_idx](ps_proc);

        /* Perform chroma mb core coding */
        u4_cbp_c = (ps_codec->chroma_energy_compaction)[chroma_idx](ps_proc);

        /* coded block pattern (chroma in bits 4 and up, luma in the low 4 bits) */
        ps_proc->u4_cbp = (u4_cbp_c << 4) | u4_cbp_l;

        /* mb skip */
        if (is_intra == 0)
        {
            if (ps_proc->u4_cbp == 0)
            {
                /* get skip mv */
                UWORD32 u4_for_me = 0;
                ih264e_find_skip_motion_vector(ps_proc,u4_for_me);

                /* skip ? (no residue and the mv equals the skip mv) */
                if (ps_proc->ps_skip_mv->i2_mvx == ps_proc->ps_pu->s_l0_mv.i2_mvx &&
                                ps_proc->ps_skip_mv->i2_mvy == ps_proc->ps_pu->s_l0_mv.i2_mvy)
                {
                    ps_proc->u4_mb_type = PSKIP;
                }
            }
        }

UPDATE_MB_INFO:

        /* Update mb sad, mb qp and intra mb cost. Will be used by rate control */
        ih264e_update_rc_mb_info(&ps_proc->s_frame_info, ps_proc);

        /**********************************************************************/
        /* if disable deblock level is '0' this implies enable deblocking for */
        /* all edges of all macroblocks with out any restrictions             */
        /*                                                                    */
        /* if disable deblock level is '1' this implies disable deblocking for*/
        /* all edges of all macroblocks with out any restrictions             */
        /*                                                                    */
        /* if disable deblock level is '2' this implies enable deblocking for */
        /* all edges of all macroblocks except edges overlapping with slice   */
        /* boundaries. This option is not currently supported by the encoder  */
        /* hence the slice map should be of no significance to perform        */
        /* deblocking                                                         */
        /**********************************************************************/

        if (ps_proc->u4_compute_recon)
        {
            /* deblk context */
            /* src pointers */
            UWORD8 *pu1_cur_pic_luma = ps_proc->pu1_rec_buf_luma;
            UWORD8 *pu1_cur_pic_chroma = ps_proc->pu1_rec_buf_chroma;

            /* src indices */
            UWORD32 i4_mb_x = ps_proc->i4_mb_x;
            UWORD32 i4_mb_y = ps_proc->i4_mb_y;

            /* compute blocking strength */
            if (ps_proc->u4_disable_deblock_level != 1)
            {
                ih264e_compute_bs(ps_proc);
            }

            /* nmb deblocking and hpel and padding */
            ih264e_dblk_pad_hpel_processing_n_mbs(ps_proc, pu1_cur_pic_luma,
                                                  pu1_cur_pic_chroma, i4_mb_x,
                                                  i4_mb_y);
        }

        /* update the context after for coding next mb */
        error_status |= ih264e_update_proc_ctxt(ps_proc);

        /* Once the last row is processed, mark the buffer status appropriately */
        /* (i4_mb_y has been advanced past the last row by the update above) */
        if (ps_proc->i4_ht_mbs == ps_proc->i4_mb_y)
        {
            /* Pointer to current picture buffer structure */
            pic_buf_t *ps_cur_pic = ps_proc->ps_cur_pic;

            /* Pointer to current picture's mv buffer structure */
            mv_buf_t *ps_cur_mv_buf = ps_proc->ps_cur_mv_buf;

            /* release the codec's hold on the mv and reference buffers of this picture */
            error_status |= ih264_buf_mgr_release(ps_codec->pv_mv_buf_mgr, ps_cur_mv_buf->i4_buf_id , BUF_MGR_CODEC);

            error_status |= ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr, ps_cur_pic->i4_buf_id , BUF_MGR_CODEC);

            if (ps_codec->s_cfg.u4_enable_recon)
            {
                /* pic cnt */
                ps_codec->as_rec_buf[ctxt_sel].i4_pic_cnt = ps_proc->i4_pic_cnt;

                /* rec buffers */
                ps_codec->as_rec_buf[ctxt_sel].s_pic_buf  = *ps_proc->ps_cur_pic;

                /* is last? */
                ps_codec->as_rec_buf[ctxt_sel].u4_is_last = ps_proc->s_entropy.u4_is_last;

                /* frame time stamp */
                ps_codec->as_rec_buf[ctxt_sel].u4_timestamp_high = ps_proc->s_entropy.u4_timestamp_high;
                ps_codec->as_rec_buf[ctxt_sel].u4_timestamp_low = ps_proc->s_entropy.u4_timestamp_low;
            }

        }
    }

    DEBUG_HISTOGRAM_DUMP(ps_codec->s_cfg.i4_ht_mbs == ps_proc->i4_mb_y);

    return error_status;
}

/**
*******************************************************************************
*
* @brief
*  function to receive frame qp and pic type before encoding
*
* @par Description:
*  Before encoding the frame, this function calls the rc library for frame qp
*  and picture type
*
* @param[in] ps_codec
*  Pointer to codec context
*
* @param[in] cur_pic_cnt
*  pic count
*
* @param[out] pi4_pic_type
*  pic type
*
* @returns skip_src
*  if the source frame rate and target frame rate are not identical, the encoder
*  skips few source frames. skip_src is set when the source need not be encoded.
+* +* @remarks none +* +******************************************************************************* +*/ +WORD32 ih264e_set_rc_pic_params(codec_t *ps_codec, WORD32 cur_pic_cnt, WORD32 *pi4_pic_type) +{ + /* rate control context */ + rate_control_ctxt_t *ps_rate_control = &ps_codec->s_rate_control; + + /* frame qp */ + UWORD8 u1_frame_qp; + + /* pic type */ + PIC_TYPE_T pic_type = PIC_NA; + + /* should src be skipped */ + WORD32 skip_src = 0; + + /* temp var */ + WORD32 delta_time_stamp = 1; + + /* see if the app requires any specific frame */ + if (ps_codec->force_curr_frame_type == IV_IDR_FRAME || ps_codec->force_curr_frame_type == IV_I_FRAME) + { + irc_force_I_frame(ps_codec->s_rate_control.pps_rate_control_api); + } + + /* call rate control lib to get curr pic type and qp to be used */ + skip_src = ih264e_rc_pre_enc(ps_rate_control->pps_rate_control_api, + ps_rate_control->pps_pd_frm_rate, + ps_rate_control->pps_time_stamp, + ps_rate_control->pps_frame_time, + delta_time_stamp, + (ps_codec->s_cfg.i4_wd_mbs * ps_codec->s_cfg.i4_ht_mbs), + &ps_rate_control->e_pic_type, + &u1_frame_qp); + + switch (ps_rate_control->e_pic_type) + { + case I_PIC: + pic_type = PIC_I; + break; + + case P_PIC: + pic_type = PIC_P; + break; + + case B_PIC: + pic_type = PIC_B; + break; + + default: + break; + } + + /* is idr? 
*/ + if ((0 == cur_pic_cnt % ps_codec->s_cfg.u4_idr_frm_interval) || + ps_codec->force_curr_frame_type == IV_IDR_FRAME) + { + pic_type = PIC_IDR; + } + + /* force frame tag is not sticky */ + if (ps_codec->force_curr_frame_type == IV_IDR_FRAME || ps_codec->force_curr_frame_type == IV_I_FRAME) + { + ps_codec->force_curr_frame_type = IV_NA_FRAME; + } + + /* qp */ + ps_codec->u4_frame_qp = gau1_mpeg2_to_h264_qmap[u1_frame_qp]; + + /* pic type */ + *pi4_pic_type = pic_type; + + return skip_src; +} + +/** +******************************************************************************* +* +* @brief +* Function to update rc context after encoding +* +* @par Description +* This function updates the rate control context after the frame is encoded. +* Number of bits consumed by the current frame, frame distortion, frame cost, +* number of intra/inter mb's, ... are passed on to rate control context for +* updating the rc model. +* +* @param[in] ps_codec +* Handle to codec context +* +* @param[in] ctxt_sel +* frame context selector +* +* @param[in] pic_cnt +* pic count +* +* @returns i4_stuffing_byte +* number of stuffing bytes (if necessary) +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_update_rc_post_enc(codec_t *ps_codec, WORD32 ctxt_sel, WORD32 pic_cnt) +{ + /* proc set base idx */ + WORD32 i4_proc_ctxt_sel_base = ctxt_sel ? 
(MAX_PROCESS_CTXT / 2) : 0; + + /* proc ctxt */ + process_ctxt_t *ps_proc = &ps_codec->as_process[i4_proc_ctxt_sel_base]; + + /* frame qp */ + UWORD8 u1_frame_qp = ps_codec->u4_frame_qp; + + /* cbr rc return status */ + WORD32 i4_stuffing_byte = 0; + + /* current frame stats */ + frame_info_t s_frame_info; + picture_type_e rc_pic_type; + + /* temp var */ + WORD32 i, j; + + /********************************************************************/ + /* BEGIN INIT */ + /********************************************************************/ + + /* init frame info */ + irc_init_frame_info(&s_frame_info); + + /* get frame info */ + for (i = 0; i < (WORD32)ps_codec->s_cfg.u4_num_cores; i++) + { + /*****************************************************************/ + /* One frame can be encoded by max of u4_num_cores threads */ + /* Accumulating the num mbs, sad, qp and intra_mb_cost from */ + /* u4_num_cores threads */ + /*****************************************************************/ + for (j = 0; j< MAX_MB_TYPE; j++) + { + s_frame_info.num_mbs[j] += ps_proc[i].s_frame_info.num_mbs[j]; + + s_frame_info.tot_mb_sad[j] += ps_proc[i].s_frame_info.tot_mb_sad[j]; + + s_frame_info.qp_sum[j] += ps_proc[i].s_frame_info.qp_sum[j]; + } + + s_frame_info.intra_mb_cost_sum += ps_proc[i].s_frame_info.intra_mb_cost_sum; + + s_frame_info.activity_sum += ps_proc[i].s_frame_info.activity_sum; + + /*****************************************************************/ + /* gather number of residue and header bits consumed by the frame*/ + /*****************************************************************/ + ih264e_update_rc_bits_info(&s_frame_info, &ps_proc[i].s_entropy); + } + + /* get pic type */ + switch (ps_codec->pic_type) + { + case PIC_I: + case PIC_IDR: + rc_pic_type = I_PIC; + break; + case PIC_P: + rc_pic_type = P_PIC; + break; + case PIC_B: + rc_pic_type = B_PIC; + break; + default: + assert(0); + break; + } + + /* update rc lib with current frame stats */ + i4_stuffing_byte = 
ih264e_rc_post_enc(ps_codec->s_rate_control.pps_rate_control_api, + &(s_frame_info), + ps_codec->s_rate_control.pps_pd_frm_rate, + ps_codec->s_rate_control.pps_time_stamp, + ps_codec->s_rate_control.pps_frame_time, + (ps_proc->i4_wd_mbs * ps_proc->i4_ht_mbs), + &rc_pic_type, + pic_cnt, + &ps_codec->s_rate_control.post_encode_skip[ctxt_sel], + u1_frame_qp, + &ps_codec->s_rate_control.num_intra_in_prev_frame, + &ps_codec->s_rate_control.i4_avg_activity); + + /* in case the frame needs to be skipped, the frame num should not be incremented */ + if (ps_codec->s_rate_control.post_encode_skip[ctxt_sel]) + { + ps_codec->i4_frame_num --; + } + + return i4_stuffing_byte; +} + +/** +******************************************************************************* +* +* @brief +* entry point of a spawned encoder thread +* +* @par Description: +* The encoder thread dequeues a proc/entropy job from the encoder queue and +* calls necessary routines. +* +* @param[in] pv_proc +* Process context corresponding to the thread +* +* @returns error status +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_process_thread(void *pv_proc) +{ + /* error status */ + IH264_ERROR_T ret = IH264_SUCCESS; + WORD32 error_status = IH264_SUCCESS; + + /* proc ctxt */ + process_ctxt_t *ps_proc = pv_proc; + + /* codec ctxt */ + codec_t *ps_codec = ps_proc->ps_codec; + + /* structure to represent a processing job entry */ + job_t s_job; + + /* blocking call : entropy dequeue is non-blocking till all + * the proc jobs are processed */ + WORD32 is_blocking = 0; + + /* set affinity */ + ithread_set_affinity(ps_proc->i4_id); + + while(1) + { + /* dequeue a job from the entropy queue */ + { + int error = ithread_mutex_lock(ps_codec->pv_entropy_mutex); + + /* codec context selector */ + WORD32 ctxt_sel = ps_codec->i4_encode_api_call_cnt & 1; + + volatile UWORD32 *pu4_buf = &ps_codec->au4_entropy_thread_active[ctxt_sel]; + + /* have the lock */ + 
if (error == 0) + { + if (*pu4_buf == 0) + { + /* no entropy threads are active, try dequeuing a job from the entropy queue */ + ret = ih264_list_dequeue(ps_proc->pv_entropy_jobq, &s_job, is_blocking); + if (IH264_SUCCESS == ret) + { + *pu4_buf = 1; + ithread_mutex_unlock(ps_codec->pv_entropy_mutex); + goto WORKER; + } + else if(is_blocking) + { + ithread_mutex_unlock(ps_codec->pv_entropy_mutex); + break; + } + } + ithread_mutex_unlock(ps_codec->pv_entropy_mutex); + } + } + + /* dequeue a job from the process queue */ + ret = ih264_list_dequeue(ps_proc->pv_proc_jobq, &s_job, 1); + if (IH264_SUCCESS != ret) + { + if(ps_proc->i4_id) + break; + else + { + is_blocking = 1; + continue; + } + } + +WORKER: + /* choose appropriate proc context based on proc_base_idx */ + ps_proc = &ps_codec->as_process[ps_proc->i4_id + s_job.i2_proc_base_idx]; + + switch (s_job.i4_cmd) + { + case CMD_PROCESS: + ps_proc->i4_mb_cnt = s_job.i2_mb_cnt; + ps_proc->i4_mb_x = s_job.i2_mb_x; + ps_proc->i4_mb_y = s_job.i2_mb_y; + + /* init process context */ + ih264e_init_proc_ctxt(ps_proc); + + /* core code all mbs enlisted under the current job */ + error_status |= ih264e_process(ps_proc); + break; + + case CMD_ENTROPY: + ps_proc->s_entropy.i4_mb_x = s_job.i2_mb_x; + ps_proc->s_entropy.i4_mb_y = s_job.i2_mb_y; + ps_proc->s_entropy.i4_mb_cnt = s_job.i2_mb_cnt; + + /* init entropy */ + ih264e_init_entropy_ctxt(ps_proc); + + /* entropy code all mbs enlisted under the current job */ + error_status |= ih264e_entropy(ps_proc); + break; + + default: + error_status |= IH264_FAIL; + break; + } + } + + /* send error code */ + ps_proc->i4_error_code = error_status; + return ret; +} diff --git a/encoder/ih264e_process.h b/encoder/ih264e_process.h new file mode 100755 index 0000000..9715434 --- /dev/null +++ b/encoder/ih264e_process.h @@ -0,0 +1,364 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_process.h +* +* @brief +* Contains functions for codec thread +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_PROCESS_H_ +#define IH264E_PROCESS_H_ + +/*****************************************************************************/ +/* Function Declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief This function generates sps, pps set on request +* +* @par Description +* When the encoder is set in header generation mode, the following function +* is called. This generates sps and pps headers and returns the control back +* to caller. +* +* @param[in] ps_codec +* pointer to codec context +* +* @return success or failure error code +* +****************************************************************************** +*/ +IH264E_ERROR_T ih264e_generate_sps_pps + ( + codec_t *ps_codec + ); + +/** +******************************************************************************* +* +* @brief initialize entropy context. 
+* +* @par Description: +* Before invoking the call to perform to entropy coding the entropy context +* associated with the job needs to be initialized. This involves the start +* mb address, end mb address, slice index and the pointer to location at +* which the mb residue info and mb header info are packed. +* +* @param[in] ps_proc +* Pointer to the current process context +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_init_entropy_ctxt(process_ctxt_t *ps_proc); + +/** +******************************************************************************* +* +* @brief entry point for entropy coding +* +* @par Description +* This function calls lower level functions to perform entropy coding for a +* group (n rows) of mb's. After encoding 1 row of mb's, the function takes +* back the control, updates the ctxt and calls lower level functions again. +* This process is repeated till all the rows or group of mb's (which ever is +* minimum) are coded +* +* @param[in] ps_proc +* process context +* +* @returns error status +* +* @remarks +* NOTE : It is assumed that this routine is invoked at the start of a slice, +* so the slice header is generated by default. +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_entropy(process_ctxt_t *ps_proc); + +/** +******************************************************************************* +* +* @brief Packs header information of a mb in to a buffer +* +* @par Description: +* After the deciding the mode info of a macroblock, the syntax elements +* associated with the mb are packed and stored. The entropy thread unpacks +* this buffer and generates the end bit stream. 
+* +* @param[in] ps_proc +* Pointer to the current process context +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_pack_header_data + ( + process_ctxt_t *ps_proc + ); + +/** +******************************************************************************* +* +* @brief update process context after encoding an mb. This involves preserving +* the current mb information for later use and initializing the proc ctxt +* elements to encode the next mb. +* +* @par Description: +* This function performs housekeeping tasks after encoding an mb. +* After encoding an mb, various elements of the process context need to be +* updated to encode the next mb. For instance, the source, recon and reference +* pointers, mb indices have to be adjusted to the next mb. The slice index of +* the current mb needs to be updated. If mb qp modulation is enabled, then if +* the qp changes the quant param structure needs to be updated. Also, while +* encoding the next mb, the current mb info is used as part of mode prediction +* or mv prediction. Hence the current mb info has to be preserved at +* top/top left/left locations. +* +* @param[in] ps_proc +* Pointer to the current process context +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +WORD32 ih264e_update_proc_ctxt + ( + process_ctxt_t *ps_proc + ); + +/** +******************************************************************************* +* +* @brief initialize process context. +* +* @par Description: +* Before dispatching the current job to process thread, the process context +* associated with the job is initialized. Usually every job aims to encode one +* row of mb's. Based on the row indices provided by the job, the process +* context's buffer ptrs, slice indices and other elements that are necessary +* during core-coding are initialized.
+* +* @param[in] ps_proc +* Pointer to the current process context +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_init_proc_ctxt(process_ctxt_t *ps_proc); + +/** +******************************************************************************* +* +* @brief This function performs luma & chroma padding +* +* @par Description: +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @param[in] pu1_curr_pic_luma +* Pointer to luma buffer +* +* @param[in] pu1_curr_pic_chroma +* Pointer to chroma buffer +* +* @param[in] i4_mb_x +* mb index x +* +* @param[in] i4_mb_y +* mb index y +* +* @param[in] i4_pad_ht +* number of rows to be padded +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_pad_recon_buffer + ( + process_ctxt_t *ps_proc, + UWORD8 *pu1_curr_pic_luma, + UWORD8 *pu1_curr_pic_chroma, + WORD32 i4_mb_x, + WORD32 i4_mb_y, + WORD32 i4_pad_ht + ); + +/** +******************************************************************************* +* +* @brief This function performs luma half pel planes generation +* +* @par Description: +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_halfpel_generation + ( + process_ctxt_t *ps_proc, + UWORD8 *pu1_curr_pic_luma, + WORD32 i4_mb_x, + WORD32 i4_mb_y + ); + +/** +******************************************************************************* +* +* @brief This function performs luma & chroma core coding for a set of mb's. +* +* @par Description: +* The mb to be coded is taken and is evaluated over a predefined set of modes +* (intra (i16, i4, i8)/inter (mv, skip)) for best cost. 
The mode with least cost +* is selected and using intra/inter prediction filters, prediction is carried out. +* The deviation between src and pred signal constitutes error signal. This error +* signal is transformed (hierarchical transform if necessary) and quantized. The +* quantized residue is packed in to entropy buffer for entropy coding. This is +* repeated for all the mb's enlisted under the job. +* +* @param[in] ps_proc +* Process context corresponding to the job +* +* @returns error status +* +* @remarks none +* +******************************************************************************* +*/ +WORD32 ih264e_process(process_ctxt_t *ps_proc); + +/** +******************************************************************************* +* +* @brief +* function to receive frame qp and pic type before encoding +* +* @par Description: +* Before encoding the frame, this function calls the rc library for frame qp +* and picture type +* +* @param[in] ps_codec +* Pointer to codec context +* +* @param[in] pic_cnt +* pic count +* +* @param[out] pi4_pic_type +* pic type + +* @returns skip_src +* if the source frame rate and target frame rate are not identical, the encoder +* skips few source frames. skip_src is set when the source need not be encoded. +* +* @remarks none +* +******************************************************************************* +*/ +WORD32 ih264e_set_rc_pic_params(codec_t *ps_codec, WORD32 cur_pic_cnt, WORD32 *pi4_pic_type); + + +/** +******************************************************************************* +* +* @brief +* Function to update rc context after encoding +* +* @par Description +* This function updates the rate control context after the frame is encoded. +* Number of bits consumed by the current frame, frame distortion, frame cost, +* number of intra/inter mb's, ... are passed on to rate control context for +* updating the rc model. 
+* +* @param[in] ps_codec +* Handle to codec context +* +* @param[in] ctxt_sel +* frame context selector +* +* @param[in] pic_cnt +* pic count +* +* @returns i4_stuffing_byte +* number of stuffing bytes (if necessary) +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_update_rc_post_enc(codec_t *ps_codec, WORD32 ctxt_sel, WORD32 pic_cnt); + +/** +******************************************************************************* +* +* @brief +* entry point of a spawned encoder thread +* +* @par Description: +* The encoder thread dequeues a proc/entropy job from the encoder queue and +* calls necessary routines. +* +* @param[in] pv_proc +* Process context corresponding to the thread +* +* @returns error status +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_process_thread(void *pv_proc); + +#endif /* IH264E_PROCESS_H_ */ diff --git a/encoder/ih264e_rate_control.c b/encoder/ih264e_rate_control.c new file mode 100755 index 0000000..1e2fe4f --- /dev/null +++ b/encoder/ih264e_rate_control.c @@ -0,0 +1,801 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_rate_control.c +* +* @brief +* Contains api function definitions for h264 rate control +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_rc_init() +* - ih264e_rc_get_picture_details() +* - ih264e_rc_pre_enc() +* - ih264e_update_rc_mb_info() +* - ih264e_rc_get_buffer_status() +* - ih264e_rc_post_enc() +* - ih264e_update_rc_bits_info() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "irc_datatypes.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264e.h" +#include "ih264_defs.h" +#include "ih264_macros.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_common_tables.h" +#include "ih264e_defs.h" +#include "ih264e_globals.h" +#include "irc_mem_req_and_acq.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "irc_rate_control_api.h" +#include "ih264e_time_stamp.h" +#include "ih264e_modify_frm_rate.h" +#include "ih264e_rate_control.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264e_structs.h" +#include "ih264e_utils.h" +#include "irc_trace_support.h" + + +/*****************************************************************************/ +/* Function Definitions */ 
+/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief This function does nothing +* +* @par Description +* This function does nothing +* +* @param[in] variadic function + +* @returns none +* +* @remarks This function is used by the rc library for debugging purposes. +* However this function was not part of rc library. So this is defined here +* to resolve link issues. +* +******************************************************************************* +*/ +int trace_printf(const WORD8 *format, ...) +{ + UNUSED(format); + return(0); +}; + +/** +******************************************************************************* +* +* @brief +* This function initializes rate control context and variables +* +* @par Description +* This function initializes rate control type, source and target frame rate, +* average and peak bitrate, intra-inter frame interval and initial +* quantization parameter +* +* @param[in] pv_rc_api +* Handle to rate control api +* +* @param[in] pv_frame_time +* Handle to frame time context +* +* @param[in] pv_time_stamp +* Handle to time stamp context +* +* @param[in] pv_pd_frm_rate +* Handle to pull down frame time context +* +* @param[in] u4_max_frm_rate +* Maximum frame rate +* +* @param[in] u4_src_frm_rate +* Source frame rate +* +* @param[in] u4_tgt_frm_rate +* Target frame rate +* +* @param[in] e_rate_control_type +* Rate control type +* +* @param[in] u4_avg_bit_rate +* Average bit rate +* +* @param[in] u4_peak_bit_rate +* Peak bit rate +* +* @param[in] u4_max_delay +* Maximum delay between frames +* +* @param[in] u4_intra_frame_interval +* Intra frame interval +* +* @param[in] pu1_init_qp +* Initial qp +* +* @param[in] i4_max_inter_frm_int +* Maximum inter frame interval +* +* @param[in] pu1_min_max_qp +* Array of min/max qp +* +* @param[in] u1_profile_level +* Encoder profile level +* +* @returns none +* +* @remarks 
+* +******************************************************************************* +*/ +void ih264e_rc_init(void *pv_rc_api, + void *pv_frame_time, + void *pv_time_stamp, + void *pv_pd_frm_rate, + UWORD32 u4_max_frm_rate, + UWORD32 u4_src_frm_rate, + UWORD32 u4_tgt_frm_rate, + rc_type_e e_rate_control_type, + UWORD32 u4_avg_bit_rate, + UWORD32 u4_peak_bit_rate, + UWORD32 u4_max_delay, + UWORD32 u4_intra_frame_interval, + UWORD8 *pu1_init_qp, + WORD32 i4_max_inter_frm_int, + UWORD8 *pu1_min_max_qp, + UWORD8 u1_profile_level) +{ +// UWORD8 u1_is_mb_level_rc_on = 0; + UWORD32 au4_peak_bit_rate[2] = {0,0}; + UWORD32 u4_min_bit_rate = 0; + WORD32 i4_is_gop_closed = 0; +// WORD32 i4_use_est_intra_sad = 1; + UWORD32 u4_src_ticks = 0; + UWORD32 u4_tgt_ticks = 0; + UWORD8 u1_level_idx = ih264e_get_lvl_idx(u1_profile_level); + UWORD32 u4_max_cpb_size = 1200 * gas_ih264_lvl_tbl[u1_level_idx].u4_max_cpb_size; + + /* Fill the params needed for the RC init */ + if (e_rate_control_type == CBR_NLDRC) + { + au4_peak_bit_rate[0] = u4_avg_bit_rate; + au4_peak_bit_rate[1] = u4_avg_bit_rate; + } + else + { + au4_peak_bit_rate[0] = u4_peak_bit_rate; + au4_peak_bit_rate[1] = u4_peak_bit_rate; + } + + /* Initialize frame time computation module*/ + ih264e_init_frame_time(pv_frame_time, + u4_src_frm_rate, /* u4_src_frm_rate */ + u4_tgt_frm_rate); /* u4_tgt_frm_rate */ + + /* Initialize the pull_down frame rate */ + ih264e_init_pd_frm_rate(pv_pd_frm_rate, + u4_src_frm_rate); /* u4_input_frm_rate */ + + /* Initialize time stamp structure */ + ih264e_init_time_stamp(pv_time_stamp, + u4_max_frm_rate, /* u4_max_frm_rate */ + u4_src_frm_rate); /* u4_src_frm_rate */ + + u4_src_ticks = ih264e_frame_time_get_src_ticks(pv_frame_time); + u4_tgt_ticks = ih264e_frame_time_get_tgt_ticks(pv_frame_time); + + /* Initialize the rate control */ + irc_initialise_rate_control(pv_rc_api, /* RC handle */ + e_rate_control_type, /* RC algo type */ + 0, /* MB activity on/off */ + u4_avg_bit_rate, /* Avg Bitrate */ 
+ au4_peak_bit_rate, /* Peak bitrate array[2]:[I][P] */ + u4_min_bit_rate, /* Min Bitrate */ + u4_src_frm_rate, /* Src frame_rate */ + u4_max_delay, /* Max buffer delay */ + u4_intra_frame_interval, /* Intra frm_interval */ + pu1_init_qp, /* Init QP array[3]:[I][P][B] */ + u4_max_cpb_size, /* Max VBV/CPB Buffer Size */ + i4_max_inter_frm_int, /* Max inter frm_interval */ + i4_is_gop_closed, /* Open/Closed GOP */ + pu1_min_max_qp, /* Min-max QP array[6]:[Imax][Imin][Pmax][Pmin][Bmax][Bmin] */ + 0, /* How to calc the I-frame estimated_sad */ + u4_src_ticks, /* Src_ticks = LCM(src_frm_rate,tgt_frm_rate)/src_frm_rate */ + u4_tgt_ticks); /* Tgt_ticks = LCM(src_frm_rate,tgt_frm_rate)/tgt_frm_rate */ +} + +/** +******************************************************************************* +* +* @brief Function to get picture details +* +* @par Description +* This function returns the Picture type(I/P/B) +* +* @param[in] pv_rc_api +* Handle to Rate control api +* +* @returns +* Picture type +* +* @remarks none +* +******************************************************************************* +*/ +picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api) +{ + WORD32 i4_pic_id = 0; + WORD32 i4_pic_disp_order_no = 0; + picture_type_e e_rc_pic_type = P_PIC; + + irc_get_picture_details(pv_rc_api, &i4_pic_id, &i4_pic_disp_order_no, + &e_rc_pic_type); + + return (e_rc_pic_type); +} + +/** +******************************************************************************* +* +* @brief Function to get rate control output before encoding +* +* @par Description +* This function is called before encoding the current frame and gets the qp +* for the current frame from rate control module +* +* @param[in] ps_rate_control_api +* Handle to rate control api +* +* @param[in] ps_pd_frm_rate +* Handle to pull down frm rate context +* +* @param[in] ps_time_stamp +* Handle to time stamp context +* +* @param[in] ps_frame_time +* Handle to frame time context +* +* @param[in] 
i4_delta_time_stamp +* Time stamp difference between frames +* +* @param[in] i4_total_mb_in_frame +* Total Macro Blocks in frame +* +* @param[in/out] pe_vop_coding_type +* Picture coding type(I/P/B) +* +* @param[in/out] pu1_frame_qp +* QP for current frame +* +* @returns +* Skip or encode the current frame +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_rc_pre_enc(void * ps_rate_control_api, + void * ps_pd_frm_rate, + void * ps_time_stamp, + void * ps_frame_time, + WORD32 i4_delta_time_stamp, + WORD32 i4_total_mb_in_frame, + picture_type_e *pe_vop_coding_type, + UWORD8 *pu1_frame_qp) +{ + WORD8 i4_skip_src = 0, i4_num_app_skips = 0; + UWORD32 u4_src_not_skipped_for_dts = 0; + + /* Variables for the update_frm_level_info */ + WORD32 ai4_tot_mb_in_type[MAX_MB_TYPE]; + WORD32 ai4_tot_mb_type_qp[MAX_MB_TYPE] = {0, 0}; + WORD32 ai4_mb_type_sad[MAX_MB_TYPE] = {0, 0}; + WORD32 ai4_mb_type_tex_bits[MAX_MB_TYPE] = {0, 0}; + WORD32 i4_total_frame_bits = 0; + WORD32 i4_total_hdr_bits = 0; + WORD32 i4_avg_mb_activity = 0; + WORD32 i4_intra_frm_cost = 0; + UWORD8 u1_is_scd = 0; + + /* Set all the MBs to Intra */ + ai4_tot_mb_in_type[0] = i4_total_mb_in_frame; + ai4_tot_mb_in_type[1] = 0; + + /* If delta time stamp is greater than 1, do rcupdate that many times */ + for (i4_num_app_skips = 0; (i4_num_app_skips < i4_delta_time_stamp - 1); i4_num_app_skips++) + { + /*update the missing frames frm_rate with 0 */ + ih264e_update_pd_frm_rate(ps_pd_frm_rate,0); + + /* Update the time stamp */ + ih264e_update_time_stamp(ps_time_stamp); + + /* Do a pre encode skip update */ + + irc_update_frame_level_info(ps_rate_control_api, + (*pe_vop_coding_type), + ai4_mb_type_sad, /* Frame level SAD for each type of MB[Intra/Inter] */ + i4_total_frame_bits, /* Total frame bits actually consumed */ + i4_total_hdr_bits, /*header bits for model updation*/ + ai4_mb_type_tex_bits, /* Total texture bits consumed for each type of 
MB[Intra/Inter] used for model */ + ai4_tot_mb_type_qp, /* Total qp of all MBs based on mb type */ + ai4_tot_mb_in_type, /* total number of mbs in each mb type */ + i4_avg_mb_activity, /* Average mb activity in frame */ + u1_is_scd, /* Is a scene change detected at the current frame */ + 1, /* If it's a pre-encode skip */ + i4_intra_frm_cost, /* Sum of Intra cost for each frame */ + 0); /* Is pic handling [irc_update_pic_handling_state] done before update */ + } + + /* Update the time stamp for the current frame */ + ih264e_update_time_stamp(ps_time_stamp); + + /* Check if a src not needs to be skipped */ + i4_skip_src = ih264e_should_src_be_skipped(ps_frame_time, + i4_delta_time_stamp, + &u4_src_not_skipped_for_dts); + + /*********************************************************************** + Based on difference in source and target frame rate frames are skipped + ***********************************************************************/ + if (i4_skip_src) + { + /*update the missing frames frm_rate with 0 */ + ih264e_update_pd_frm_rate(ps_pd_frm_rate,0); + + /* Do a pre encode skip update */ + irc_update_frame_level_info(ps_rate_control_api, + (*pe_vop_coding_type), + ai4_mb_type_sad, /* Frame level SAD for each type of MB[Intra/Inter] */ + i4_total_frame_bits, /* Total frame bits actually consumed */ + i4_total_hdr_bits, /*header bits for model updation*/ + ai4_mb_type_tex_bits, /* Total texture bits consumed for each type of MB[Intra/Inter] used for model */ + ai4_tot_mb_type_qp, /* Total qp of all MBs based on mb type */ + ai4_tot_mb_in_type, /* total number of mbs in each mb type */ + i4_avg_mb_activity, /* Average mb activity in frame */ + u1_is_scd, /* Is a scene change detected at the current frame */ + 1, /* If it's a pre-encode skip */ + i4_intra_frm_cost, /* Sum of Intra cost for each frame */ + 0); /* Is pic handling [irc_update_pic_handling_state] done before update */ + + /* Set the current frame type to NA */ + *pe_vop_coding_type = BUF_PIC; + } + 
else + { +#define MAX_FRAME_BITS 0x7FFFFFFF +// WORD32 i4_pic_id; +// WORD32 i4_pic_disp_order_no; + WORD32 i4_avg_frm_rate, i4_source_frame_rate; + + i4_source_frame_rate = ih264e_frame_time_get_src_frame_rate(ps_frame_time); + + /* Update the frame rate of the frame present with the tgt_frm_rate */ + /* If the frm was not skipped due to delta_time_stamp, update the + frame_rate with double the tgt_frame_rate value, so that it makes + up for one of the frames skipped by the application */ + ih264e_update_pd_frm_rate(ps_pd_frm_rate, + i4_source_frame_rate); + + /* Based on the update get the average frame rate */ + i4_avg_frm_rate = ih264e_get_pd_avg_frm_rate(ps_pd_frm_rate); + + /* Call the RC library function to change the frame_rate to the + actually achieved frm_rate */ + irc_change_frm_rate_for_bit_alloc(ps_rate_control_api, i4_avg_frm_rate); + + /* --------Rate control related things. Get pic type and frame Qp---------*/ + /* Add picture to the stack. For IPP encoder we push the variable + into the stack and get back the variables by requesting RC. + This interface is designed for IPB encoder */ + irc_add_picture_to_stack(ps_rate_control_api, 1); + + /* Query the picture_type */ + *pe_vop_coding_type = ih264e_rc_get_picture_details(ps_rate_control_api); + + /* Get current frame Qp */ + pu1_frame_qp[0] = (UWORD8)irc_get_frame_level_qp(ps_rate_control_api, + (picture_type_e)(pe_vop_coding_type[0]), + MAX_FRAME_BITS); + } + + return(i4_skip_src); +} + +/** +******************************************************************************* +* +* @brief Function to update mb info for rate control context +* +* @par Description +* After encoding a mb, information such as mb type, qp used, mb distortion +* resulted in encoding the block and so on needs to be preserved for modeling +* RC. This is preserved via this function call. 
+* +* @param[in] ps_frame_info +* Handle Frame info context +* +* @param[in] ps_proc +* Process context +* +* @returns +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_update_rc_mb_info(frame_info_t *ps_frame_info, void *pv_proc) +{ + /* proc ctxt */ + process_ctxt_t *ps_proc = pv_proc; + + /* is intra or inter */ + WORD32 mb_type = !ps_proc->u4_is_intra; + + /* distortion */ + ps_frame_info->tot_mb_sad[mb_type] += ps_proc->i4_mb_distortion; + + /* qp */ + ps_frame_info->qp_sum[mb_type] += gau1_h264_to_mpeg2_qmap[ps_proc->u4_mb_qp]; + + /* mb cnt */ + ps_frame_info->num_mbs[mb_type]++; + + /* cost */ + if (ps_proc->u4_is_intra) + { + ps_frame_info->intra_mb_cost_sum += ps_proc->i4_mb_cost; + } +} + +/** +******************************************************************************* +* +* @brief Function to get rate control buffer status +* +* @par Description +* This function is used to get buffer status(underflow/overflow) by rate +* control module +* +* @param[in] pv_rc_api +* Handle to rate control api context +* +* @param[in] i4_total_frame_bits +* Total frame bits +* +* @param[in] u1_pic_type +* Picture type +* +* @param[in] pi4_num_bits_to_prevent_vbv_underflow +* Number of bits to prevent underflow +* +* @param[out] pu1_is_enc_buf_overflow +* Buffer overflow indication flag +* +* @param[out] pu1_is_enc_buf_underflow +* Buffer underflow indication flag +* +* @returns +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_rc_get_buffer_status(void *pv_rc_api, + WORD32 i4_total_frame_bits, + picture_type_e e_pic_type, + WORD32 *pi4_num_bits_to_prevent_vbv_underflow, + UWORD8 *pu1_is_enc_buf_overflow, + UWORD8 *pu1_is_enc_buf_underflow) +{ + vbv_buf_status_e e_vbv_buf_status = VBV_NORMAL; + + e_vbv_buf_status = irc_get_buffer_status(pv_rc_api, + i4_total_frame_bits, + e_pic_type, + pi4_num_bits_to_prevent_vbv_underflow); + + if 
(e_vbv_buf_status == VBV_OVERFLOW) + { + *pu1_is_enc_buf_underflow = 1; + *pu1_is_enc_buf_overflow = 0; + } + else if (e_vbv_buf_status == VBV_UNDERFLOW) + { + *pu1_is_enc_buf_underflow = 0; + *pu1_is_enc_buf_overflow = 1; + } + else + { + *pu1_is_enc_buf_underflow = 0; + *pu1_is_enc_buf_overflow = 0; + } +} + +/** +******************************************************************************* +* +* @brief Function to update rate control module after encoding +* +* @par Description +* This function is used to update the rate control module after the current +* frame encoding is done with details such as bits consumed, SAD for I/P/B, +* intra cost ,mb type and other +* +* @param[in] ps_rate_control_api +* Handle to rate control api context +* +* @param[in] ps_frame_info +* Handle to frame info context +* +* @param[in] ps_pd_frm_rate +* Handle to pull down frame rate context +* +* @param[in] ps_time_stamp +* Handle to time stamp context +* +* @param[in] ps_frame_time +* Handle to frame time context +* +* @param[in] i4_total_mb_in_frame +* Total mb in frame +* +* @param[in] pe_vop_coding_type +* Picture coding type +* +* @param[in] i4_is_first_frame +* Is first frame +* +* @param[in] pi4_is_post_encode_skip +* Post encoding skip flag +* +* @param[in] u1_frame_qp +* Frame qp +* +* @param[in] pi4_num_intra_in_prev_frame +* Numberf of intra mbs in previous frame +* +* @param[in] pi4_avg_activity +* Average activity +* +* @returns +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_rc_post_enc(void * ps_rate_control_api, + frame_info_t *ps_frame_info, + void * ps_pd_frm_rate, + void * ps_time_stamp, + void * ps_frame_time, + WORD32 i4_total_mb_in_frame, + picture_type_e *pe_vop_coding_type, + WORD32 i4_is_first_frame, + WORD32 *pi4_is_post_encode_skip, + UWORD8 u1_frame_qp, + WORD32 *pi4_num_intra_in_prev_frame, + WORD32 *pi4_avg_activity) +{ + /* Variables for the update_frm_level_info */ + WORD32 
ai4_tot_mb_in_type[MAX_MB_TYPE]; + WORD32 ai4_tot_mb_type_qp[MAX_MB_TYPE] = {0, 0}; + WORD32 ai4_mb_type_sad[MAX_MB_TYPE] = {0, 0}; + WORD32 ai4_mb_type_tex_bits[MAX_MB_TYPE] = {0, 0}; + WORD32 i4_total_frame_bits = 0; + WORD32 i4_total_hdr_bits = 0; + WORD32 i4_total_texturebits; + WORD32 i4_avg_mb_activity = 0; + WORD32 i4_intra_frm_cost = 0; + UWORD8 u1_is_scd = 0; + WORD32 i4_cbr_bits_to_stuff = 0; + UWORD32 u4_num_intra_in_prev_frame = *pi4_num_intra_in_prev_frame; + UNUSED(ps_pd_frm_rate); + UNUSED(ps_time_stamp); + UNUSED(ps_frame_time); + UNUSED(u1_frame_qp); + /* Accumulate RC stats */ + ai4_tot_mb_in_type[MB_TYPE_INTRA] = irc_fi_get_total_mb(ps_frame_info,MB_TYPE_INTRA); + ai4_tot_mb_in_type[MB_TYPE_INTER] = irc_fi_get_total_mb(ps_frame_info,MB_TYPE_INTER); + /* ai4_tot_mb_type_qp[MB_TYPE_INTRA] = 0; + ai4_tot_mb_type_qp[MB_TYPE_INTER] = ps_enc->pu1_h264_mpg2quant[u1_frame_qp] * i4_total_mb_in_frame;*/ + ai4_tot_mb_type_qp[MB_TYPE_INTRA] = irc_fi_get_total_mb_qp(ps_frame_info,MB_TYPE_INTRA); + ai4_tot_mb_type_qp[MB_TYPE_INTER] = irc_fi_get_total_mb_qp(ps_frame_info,MB_TYPE_INTER); + ai4_mb_type_sad[MB_TYPE_INTRA] = irc_fi_get_total_mb_sad(ps_frame_info,MB_TYPE_INTRA); + ai4_mb_type_sad[MB_TYPE_INTER] = irc_fi_get_total_mb_sad(ps_frame_info,MB_TYPE_INTER); + i4_intra_frm_cost = irc_fi_get_total_intra_mb_cost(ps_frame_info); + i4_avg_mb_activity = irc_fi_get_avg_activity(ps_frame_info); + i4_total_hdr_bits = irc_fi_get_total_header_bits(ps_frame_info); + i4_total_texturebits = irc_fi_get_total_mb_texture_bits(ps_frame_info,MB_TYPE_INTRA); + i4_total_texturebits += irc_fi_get_total_mb_texture_bits(ps_frame_info,MB_TYPE_INTER); + i4_total_frame_bits = i4_total_hdr_bits + i4_total_texturebits ; + + *pi4_avg_activity = i4_avg_mb_activity; + + + /* Texture bits are not accumulated. 
Hence subtracting hdr bits from total bits */ + ai4_mb_type_tex_bits[MB_TYPE_INTRA] = 0; + ai4_mb_type_tex_bits[MB_TYPE_INTER] = i4_total_frame_bits - i4_total_hdr_bits; + + /* Set post encode skip to zero */ + pi4_is_post_encode_skip[0]= 0; + + /* For NLDRC, get the buffer status for stuffing or skipping */ + if (irc_get_rc_type(ps_rate_control_api) == CBR_NLDRC) + { + WORD32 i4_get_num_bit_to_prevent_vbv_overflow; + UWORD8 u1_enc_buf_overflow,u1_enc_buf_underflow; + + /* Getting the buffer status */ + ih264e_rc_get_buffer_status(ps_rate_control_api, i4_total_frame_bits, + pe_vop_coding_type[0], &i4_get_num_bit_to_prevent_vbv_overflow, + &u1_enc_buf_overflow,&u1_enc_buf_underflow); + + /* We skip the frame if decoder buffer is underflowing. But we never skip first I frame */ + // if((u1_enc_buf_overflow == 1) && (i4_is_first_frame != 1)) + if ((u1_enc_buf_overflow == 1) && (i4_is_first_frame != 0)) + { + irc_post_encode_frame_skip(ps_rate_control_api, (picture_type_e)pe_vop_coding_type[0]); + // i4_total_frame_bits = imp4_write_skip_frame_header(ps_enc); + i4_total_frame_bits = 0; + + *pi4_is_post_encode_skip = 1; + + /* Adjust the GOP if in case we skipped an I-frame */ + if (*pe_vop_coding_type == I_PIC) + irc_force_I_frame(ps_rate_control_api); + + /* Since this frame is skipped by writing 7 bytes header, we say this is a P frame */ + // *pe_vop_coding_type = P; + + /* Getting the buffer status again,to check if it underflows */ + irc_get_buffer_status(ps_rate_control_api, i4_total_frame_bits, + (picture_type_e)pe_vop_coding_type[0], &i4_get_num_bit_to_prevent_vbv_overflow); + + } + + /* In this case we stuff bytes as buffer is overflowing */ + if (u1_enc_buf_underflow == 1) + { + /* The stuffing function is directly pulled out from split controller workspace. + encode_vop_data() function makes sure alignment data is dumped at the end of a + frame. 
Split controller was identifying this alignment byte, overwriting it with + the stuff data and then finally aligning the buffer. Here every thing is inside + the DSP. So, ideally encode_vop_data needn't align, and we can start stuffing directly. + But in that case, it'll break the logic for a normal frame. + Hence for simplicity, not changing this part since it is ok to align and + then overwrite since stuffing is not done for every frame */ + i4_cbr_bits_to_stuff = irc_get_bits_to_stuff(ps_rate_control_api, i4_total_frame_bits, pe_vop_coding_type[0]); + + /* Just add extra 32 bits to make sure we don't stuff lesser */ + i4_cbr_bits_to_stuff += 32; + + /* We can not stuff more than the outbuf size. So have a check here */ + /* Add stuffed bits to total bits */ + i4_total_frame_bits += i4_cbr_bits_to_stuff; + } + } + +#define ENABLE_SCD 1 +#if ENABLE_SCD + /* If number of intra MBs are more than 2/3rd of total MBs, assume it as a scene change */ + if ((ai4_tot_mb_in_type[MB_TYPE_INTRA] > ((2 * i4_total_mb_in_frame) / 3)) && + (*pe_vop_coding_type == P_PIC) && + (ai4_tot_mb_in_type[MB_TYPE_INTRA] > ((11 * (WORD32)u4_num_intra_in_prev_frame) / 10))) + { + u1_is_scd = 1; + } +#endif + + /* Update num intra mbs of this frame */ + if (pi4_is_post_encode_skip[0] == 0) + { + *pi4_num_intra_in_prev_frame = ai4_tot_mb_in_type[MB_TYPE_INTRA]; + } + + /* Reset intra count to zero, if u encounter an I frame */ + if (*pe_vop_coding_type == I_PIC) + { + *pi4_num_intra_in_prev_frame = 0; + } + + /* Do an update of rate control after post encode */ + irc_update_frame_level_info(ps_rate_control_api, /* RC state */ + pe_vop_coding_type[0], /* PIC type */ + ai4_mb_type_sad, /* SAD for [Intra/Inter] */ + i4_total_frame_bits, /* Total frame bits */ + i4_total_hdr_bits, /* header bits for */ + ai4_mb_type_tex_bits, /* for MB[Intra/Inter] */ + ai4_tot_mb_type_qp, /* for MB[Intra/Inter] */ + ai4_tot_mb_in_type, /* for MB[Intra/Inter] */ + i4_avg_mb_activity, /* Average mb activity in frame 
*/ + u1_is_scd, /* Is a scene change detected */ + 0, /* Pre encode skip */ + (WORD32)i4_intra_frm_cost, /* Intra cost for frame */ + 0); /* Not done outside */ + + return (i4_cbr_bits_to_stuff >> 3); +} + +/** +******************************************************************************* +* +* @brief Function to update bits consumed info to rate control context +* +* @par Description +* Function to update bits consume info to rate control context +* +* @param[in] ps_frame_info +* Frame info context +* +* @param[in] ps_entropy +* Entropy context +* +* @returns +* total bits consumed by the frame +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_update_rc_bits_info(frame_info_t *ps_frame_info, void *pv_entropy) +{ + entropy_ctxt_t *ps_entropy = pv_entropy; + + ps_frame_info->mb_header_bits[MB_TYPE_INTRA] += ps_entropy->u4_header_bits[MB_TYPE_INTRA]; + + ps_frame_info->mb_texture_bits[MB_TYPE_INTRA] += ps_entropy->u4_residue_bits[MB_TYPE_INTRA]; + + ps_frame_info->mb_header_bits[MB_TYPE_INTER] += ps_entropy->u4_header_bits[MB_TYPE_INTER]; + + ps_frame_info->mb_texture_bits[MB_TYPE_INTER] += ps_entropy->u4_residue_bits[MB_TYPE_INTER]; + + return; +} + diff --git a/encoder/ih264e_rate_control.h b/encoder/ih264e_rate_control.h new file mode 100755 index 0000000..de9466a --- /dev/null +++ b/encoder/ih264e_rate_control.h @@ -0,0 +1,351 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_rate_control.h +* +* @brief +* This file contains function declarations of api functions for h264 rate +* control +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_RATE_CONTROL_H_ +#define IH264E_RATE_CONTROL_H_ + +/*****************************************************************************/ +/* Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* This function initializes rate control context and variables +* +* @par Description +* This function initializes rate control type, source and target frame rate, +* average and peak bitrate, intra-inter frame interval and initial +* quantization parameter +* +* @param[in] pv_rc_api +* Handle to rate control api +* +* @param[in] pv_frame_time +* Handle to frame time context +* +* @param[in] pv_time_stamp +* Handle to time stamp context +* +* @param[in] pv_pd_frm_rate +* Handle to pull down frame time context +* +* @param[in] u4_max_frm_rate +* Maximum frame rate +* +* @param[in] u4_src_frm_rate +* Source frame rate +* +* @param[in] u4_tgt_frm_rate +* Target frame rate +* +* 
@param[in] e_rate_control_type +* Rate control type +* +* @param[in] u4_avg_bit_rate +* Average bit rate +* +* @param[in] u4_peak_bit_rate +* Peak bit rate +* +* @param[in] u4_max_delay +* Maximum delay between frames +* +* @param[in] u4_intra_frame_interval +* Intra frame interval +* +* @param[in] pu1_init_qp +* Initial qp +* +* @param[in] i4_max_inter_frm_int +* Maximum inter frame interval +* +* @param[in] pu1_min_max_qp +* Array of min/max qp +* +* @param[in] u1_profile_level +* Encoder profile level +* +* @returns none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_rc_init(void *pv_rc_api, + void *pv_frame_time, + void *pv_time_stamp, + void *pv_pd_frm_rate, + UWORD32 u4_max_frm_rate, + UWORD32 u4_src_frm_rate, + UWORD32 u4_tgt_frm_rate, + rc_type_e e_rate_control_type, + UWORD32 u4_avg_bit_rate, + UWORD32 u4_peak_bit_rate, + UWORD32 u4_max_delay, + UWORD32 u4_intra_frame_interval, + UWORD8 *pu1_init_qp, + WORD32 i4_max_inter_frm_int, + UWORD8 *pu1_min_max_qp, + UWORD8 u1_profile_level); + +/** +******************************************************************************* +* +* @brief Function to get picture details +* +* @par Description +* This function returns the Picture type(I/P/B) +* +* @param[in] pv_rc_api +* Handle to Rate control api +* +* @returns +* Picture type +* +* @remarks none +* +******************************************************************************* +*/ +picture_type_e ih264e_rc_get_picture_details(void *pv_rc_api); + + +/** +******************************************************************************* +* +* @brief Function to get rate control output before encoding +* +* @par Description +* This function is called before encoding the current frame and gets the qp +* for the current frame from rate control module +* +* @param[in] ps_rate_control_api +* Handle to rate control api +* +* @param[in] ps_pd_frm_rate +* Handle to pull down frm rate context +* +* 
@param[in] ps_time_stamp +* Handle to time stamp context +* +* @param[in] ps_frame_time +* Handle to frame time context +* +* @param[in] i4_delta_time_stamp +* Time stamp difference between frames +* +* @param[in] i4_total_mb_in_frame +* Total Macro Blocks in frame +* +* @param[in/out] pe_vop_coding_type +* Picture coding type(I/P/B) +* +* @param[in/out] pu1_frame_qp +* QP for current frame +* +* @returns +* Skip or encode the current frame +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_rc_pre_enc(void * ps_rate_control_api, + void * ps_pd_frm_rate, + void * ps_time_stamp, + void * ps_frame_time, + WORD32 i4_delta_time_stamp, + WORD32 i4_total_mb_in_frame, + picture_type_e *pe_vop_coding_type, + UWORD8 *pu1_frame_qp); + +/** +******************************************************************************* +* +* @brief Function to update mb info for rate control context +* +* @par Description +* After encoding a mb, information such as mb type, qp used, mb distortion +* resulted in encoding the block and so on needs to be preserved for modelling +* RC. This is preserved via this function call. 
+* +* @param[in] ps_frame_info +* Handle Frame info context +* +* @param[in] ps_proc +* Process context +* +* @returns +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_update_rc_mb_info(frame_info_t *ps_frame_info, void *pv_proc); + +/** +******************************************************************************* +* +* @brief Function to get rate control buffer status +* +* @par Description +* This function is used to get buffer status(underflow/overflow) by rate +* control module +* +* @param[in] pv_rc_api +* Handle to rate control api context +* +* @param[in] i4_total_frame_bits +* Total frame bits +* +* @param[in] u1_pic_type +* Picture type +* +* @param[in] pi4_num_bits_to_prevent_vbv_underflow +* Number of bits to prevent underflow +* +* @param[out] pu1_is_enc_buf_overflow +* Buffer overflow indication flag +* +* @param[out] pu1_is_enc_buf_underflow +* Buffer underflow indication flag +* +* @returns +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_rc_get_buffer_status(void *pv_rc_api, + WORD32 i4_total_frame_bits, + picture_type_e e_pic_type, + WORD32 *pi4_num_bits_to_prevent_vbv_underflow, + UWORD8 *pu1_is_enc_buf_overflow, + UWORD8 *pu1_is_enc_buf_underflow); + +/** +******************************************************************************* +* +* @brief Function to update rate control module after encoding +* +* @par Description +* This function is used to update the rate control module after the current +* frame encoding is done with details such as bits consumed, SAD for I/P/B, +* intra cost ,mb type and other +* +* @param[in] ps_rate_control_api +* Handle to rate control api context +* +* @param[in] ps_frame_info +* Handle to frame info context +* +* @param[in] ps_pd_frm_rate +* Handle to pull down frame rate context +* +* @param[in] ps_time_stamp +* Handle to time stamp context +* +* @param[in] ps_frame_time +* 
Handle to frame time context +* +* @param[in] i4_total_mb_in_frame +* Total mb in frame +* +* @param[in] pe_vop_coding_type +* Picture coding type +* +* @param[in] i4_is_first_frame +* Is first frame +* +* @param[in] pi4_is_post_encode_skip +* Post encoding skip flag +* +* @param[in] u1_frame_qp +* Frame qp +* +* @param[in] pi4_num_intra_in_prev_frame +* Number of intra mbs in previous frame +* +* @param[in] pi4_avg_activity +* Average activity +* +* @returns +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_rc_post_enc(void *ps_rate_control_api, + frame_info_t *ps_frame_info, + void *ps_pd_frm_rate, + void *ps_time_stamp, + void *ps_frame_time, + WORD32 i4_total_mb_in_frame, + picture_type_e *pe_vop_coding_type, + WORD32 i4_is_first_frame, + WORD32 *pi4_is_post_encode_skip, + UWORD8 u1_frame_qp, + WORD32 *pi4_num_intra_in_prev_frame, + WORD32 *pi4_avg_activity); + +/** +******************************************************************************* +* +* @brief Function to update bits consumed info to rate control context +* +* @par Description +* Function to update bits consume info to rate control context +* +* @param[in] ps_frame_info +* Frame info context +* +* @param[in] ps_entropy +* Entropy context +* +* @returns +* total bits consumed by the frame +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_update_rc_bits_info(frame_info_t *ps_frame_info, void *pv_entropy); + +#endif /* IH264E_RATE_CONTROL_H */ + diff --git a/encoder/ih264e_rc_mem_interface.c b/encoder/ih264e_rc_mem_interface.c new file mode 100755 index 0000000..e4d5781 --- /dev/null +++ b/encoder/ih264e_rc_mem_interface.c @@ -0,0 +1,395 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_rc_mem_interface.c +* +* @brief +* This file contains api function definitions for rate control memtabs +* +* @author +* ittiam +* +* List of Functions +* - fill_memtab() +* - use_or_fill_base() +* - ih264e_map_rc_mem_recs_to_itt_api() +* - ih264e_map_itt_mem_rec_to_rc_mem_rec() +* - ih264e_get_rate_control_mem_tab() +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <assert.h> +#include <stdarg.h> +#include <math.h> + +/* User Include Files */ +#include "ih264e_config.h" +#include "ih264_typedefs.h" +#include "ih264_size_defs.h" +#include "iv2.h" +#include "ive2.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264e.h" +#include "ithread.h" +#include "ih264e.h" +#include "ih264_defs.h" +#include "ih264_debug.h" +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include 
"ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_common_tables.h" +#include "ih264_list.h" +#include "ih264e_error.h" +#include "ih264e_defs.h" +#include "ih264e_bitstream.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_master.h" +#include "ih264_buf_mgr.h" +#include "ih264_dpb_mgr.h" +#include "ih264e_utils.h" +#include "ih264e_platform_macros.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_config.h" +#include "ih264e_statistics.h" +#include "ih264e_trace.h" +#include "ih264e_statistics.h" +#include "ih264e_error.h" +#include "ih264e_utils.h" +#include "ih264e_fmt_conv.h" +#include "ih264e_cavlc.h" +#include "ih264e_rc_mem_interface.h" +#include "ih264e_time_stamp.h" +#include "irc_common.h" +#include "irc_rd_model.h" +#include "irc_est_sad.h" +#include "irc_fixed_point_error_bits.h" +#include "irc_vbr_storage_vbv.h" +#include "irc_picture_type.h" +#include "irc_bit_allocation.h" +#include "irc_mb_model_based.h" +#include "irc_cbr_buffer_control.h" +#include "irc_vbr_str_prms.h" +#include "irc_rate_control_api.h" +#include "irc_rate_control_api_structs.h" +#include "ih264e_modify_frm_rate.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief This function fills memory record attributes +* +* @par Description +* This function fills memory record attributes +* +* @param[in] ps_mem_tab +* pointer to mem records +* +* @param[in] u4_size +* size of the record +* +* @param[in] i4_alignment +* memory alignment size +* +* @param[in] e_usage +* usage 
+* +* @param[in] e_mem_region +* mem region +* +* @return void +* +****************************************************************************** +*/ +void fill_memtab(itt_memtab_t *ps_mem_tab, + WORD32 u4_size, + WORD32 i4_alignment, + ITT_MEM_USAGE_TYPE_E e_usage, + ITT_MEM_REGION_E e_mem_region) +{ + /* Round the size up to the next multiple of the alignment. The mask form + used here is only exact when i4_alignment is a power of two */ + WORD32 i4_aligned_size = (((u4_size) + (i4_alignment-1)) & (~(i4_alignment-1))); + + /* Fill the memtab with the rounded size and the attributes passed in */ + ps_mem_tab->u4_size = i4_aligned_size; + ps_mem_tab->i4_alignment = i4_alignment; + ps_mem_tab->e_usage = e_usage; + ps_mem_tab->e_mem_region = e_mem_region; +} + +/** +****************************************************************************** +* +* @brief This function stores a base pointer into a memory record, or fetches +* the base pointer back out of it +* +* @par Description +* When e_func_type is FILL_BASE, ptr_to_be_filled[0] is copied into +* ps_mem_tab->pv_base. When e_func_type is USE_BASE, ps_mem_tab->pv_base is +* copied into ptr_to_be_filled[0]. For any other e_func_type nothing is read +* or written. +* +* @param[in,out] ps_mem_tab +* pointer to the mem record (written for FILL_BASE, read for USE_BASE) +* +* @param[in,out] ptr_to_be_filled +* handle to the memory record storage space (read for FILL_BASE, written +* for USE_BASE) +* +* @param[in] e_func_type +* enum that dictates fill memory records or use memory records +* +* @return 0 on success, or when e_func_type is neither FILL_BASE nor USE_BASE; +* -1 when the pointer that has to be copied is null +* +****************************************************************************** +*/ +WORD32 use_or_fill_base(itt_memtab_t *ps_mem_tab, + void **ptr_to_be_filled, + ITT_FUNC_TYPE_E e_func_type) +{ + /* FILL_BASE: record the caller's pointer in the memtab (used later for + freeing the allocated memory); a null pointer is reported as -1 */ + if (e_func_type == FILL_BASE) + { + if (ptr_to_be_filled[0] != 0) + { + ps_mem_tab->pv_base = ptr_to_be_filled[0]; + return (0); + } + else + { + return (-1); + } + } + /* USE_BASE: obtain the allocated memory from the memtab's base pointer; + a null base pointer is reported as -1 */ + if (e_func_type == USE_BASE) + { + if (ps_mem_tab->pv_base != 0) + { + ptr_to_be_filled[0] = ps_mem_tab->pv_base; + return (0); + } + else + { + return (-1); + } + } + /* any other function type: nothing to do */ + return (0); +} + +/** +****************************************************************************** +* +* @brief This function maps rc mem records structure to encoder lib mem records +* structure +* +* @par Description +* This function maps rc mem records structure to 
encoder lib mem records +* structure +* +* @param[in] ps_mem +* pointer to encoder lib mem records +* +* @param[in] rc_memtab +* pointer to rc mem records +* +* @param[in] num_mem_recs +* number of memory records +* +* @return void +* +****************************************************************************** +*/ +void ih264e_map_rc_mem_recs_to_itt_api(iv_mem_rec_t *ps_mem, + itt_memtab_t *rc_memtab, + UWORD32 num_mem_recs) +{ + UWORD32 j; + UWORD32 Size, align; + + /* ps_mem is indexed by j, while rc_memtab is walked with a pointer */ + for (j = 0; j < num_mem_recs; j++) + { + Size = rc_memtab->u4_size; + align = rc_memtab->i4_alignment; + + /* we always ask for external persistent cacheable memory */ + FILL_MEMTAB(ps_mem, j, Size, align, IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM); + + rc_memtab++; + } +} + +/** +******************************************************************************* +* +* @brief This function maps encoder lib mem records structure to RC memory +* records structure +* +* @par Description +* For each of the num_mem_recs records, the alignment, size and base pointer +* are copied from the encoder lib record into the RC record, and the RC +* record is marked as PERSISTENT memory in the DDR region +* +* @param[in] ps_mem +* pointer to encoder lib mem records +* +* @param[out] rc_memtab +* pointer to rc mem records +* +* @param[in] num_mem_recs +* Number of memory records +* +* @returns none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_map_itt_mem_rec_to_rc_mem_rec(iv_mem_rec_t *ps_mem, + itt_memtab_t *rc_memtab, + UWORD32 num_mem_recs) +{ + UWORD32 i; + + /* both record arrays are walked in step, one entry per iteration */ + for (i = 0; i < num_mem_recs; i++) + { + rc_memtab->i4_alignment = ps_mem->u4_mem_alignment; + rc_memtab->u4_size = ps_mem->u4_mem_size; + rc_memtab->pv_base = ps_mem->pv_base; + + /* only DDR memory is available */ + rc_memtab->e_mem_region = DDR; + rc_memtab->e_usage = PERSISTENT; + + rc_memtab++; + ps_mem++; + } +} + +/** +****************************************************************************** +* +* @brief Get memtabs for rate control +* +* @par Description +* This routine is used to Get/init memtabs for 
rate control +* +* @param[in] pv_rate_control +* pointer to rate control context (handle) +* +* @param[in] ps_mem +* pointer to encoder lib mem records +* +* @param[in] e_func_type +* enum that dictates fill memory records or Init memory records +* +* @return total number of mem records +* +****************************************************************************** +*/ +WORD32 ih264e_get_rate_control_mem_tab(void *pv_rate_control, + iv_mem_rec_t *ps_mem, + ITT_FUNC_TYPE_E e_func_type) +{ + static itt_memtab_t as_itt_memtab[NUM_RC_MEMTABS]; + WORD32 i4_num_memtab = 0, j = 0; + void *refptr2[4]; + void **refptr1[4]; + rate_control_ctxt_t *ps_rate_control = pv_rate_control; + + for (j = 0; j < 4; j++) + refptr1[j] = &(refptr2[j]); + + j = 0; + + if (e_func_type == USE_BASE || e_func_type == FILL_BASE) + { + refptr1[1] = &ps_rate_control->pps_frame_time; + refptr1[2] = &ps_rate_control->pps_time_stamp; + refptr1[3] = &ps_rate_control->pps_pd_frm_rate; + refptr1[0] = &ps_rate_control->pps_rate_control_api; + } + + /* Get the total number of memtabs used by Rate Controller */ + i4_num_memtab = irc_rate_control_num_fill_use_free_memtab((rate_control_api_t **)refptr1[0], NULL, GET_NUM_MEMTAB); + /* Few extra steps during init */ + ih264e_map_itt_mem_rec_to_rc_mem_rec((&ps_mem[j]), as_itt_memtab+j, i4_num_memtab); + /* Fill the memtabs used by Rate Controller */ + i4_num_memtab = irc_rate_control_num_fill_use_free_memtab((rate_control_api_t **)refptr1[0],as_itt_memtab+j,e_func_type); + /* Mapping ittiam memtabs to App. 
memtabs */ + ih264e_map_rc_mem_recs_to_itt_api((&ps_mem[j]), as_itt_memtab+j, i4_num_memtab); + j += i4_num_memtab; + + /* Get the total number of memtabs used by Frame time Module */ + i4_num_memtab = ih264e_frame_time_get_init_free_memtab((frame_time_t **)refptr1[1], NULL, GET_NUM_MEMTAB); + /* Few extra steps during init */ + ih264e_map_itt_mem_rec_to_rc_mem_rec((&ps_mem[j]), as_itt_memtab+j, i4_num_memtab); + /* Fill the memtabs used by Frame time Module */ + i4_num_memtab = ih264e_frame_time_get_init_free_memtab((frame_time_t **)refptr1[1], as_itt_memtab+j, e_func_type); + /* Mapping ittiam memtabs to App. memtabs */ + ih264e_map_rc_mem_recs_to_itt_api((&ps_mem[j]), as_itt_memtab+j, i4_num_memtab); + j += i4_num_memtab; + + /* Get the total number of memtabs used by Time stamp Module */ + i4_num_memtab = ih264e_time_stamp_get_init_free_memtab((time_stamp_t **)refptr1[2], NULL, GET_NUM_MEMTAB); + /* Few extra steps during init */ + ih264e_map_itt_mem_rec_to_rc_mem_rec((&ps_mem[j]), as_itt_memtab+j, i4_num_memtab); + /* Fill the memtabs used by Time Stamp Module */ + i4_num_memtab = ih264e_time_stamp_get_init_free_memtab((time_stamp_t **)refptr1[2], as_itt_memtab+j, e_func_type); + /* Mapping ittiam memtabs to App. memtabs */ + ih264e_map_rc_mem_recs_to_itt_api((&ps_mem[j]), as_itt_memtab+j, i4_num_memtab); + j += i4_num_memtab; + + /* Get the total number of memtabs used by Frame rate Module */ + i4_num_memtab = ih264e_pd_frm_rate_get_init_free_memtab((pd_frm_rate_t **)refptr1[3], NULL, GET_NUM_MEMTAB); + /* Few extra steps during init */ + ih264e_map_itt_mem_rec_to_rc_mem_rec((&ps_mem[j]), as_itt_memtab+j, i4_num_memtab); + /* Fill the memtabs used by Frame Rate Module */ + i4_num_memtab = ih264e_pd_frm_rate_get_init_free_memtab((pd_frm_rate_t **)refptr1[3], as_itt_memtab+j, e_func_type); + /* Mapping ittiam memtabs to App. 
memtabs */ + ih264e_map_rc_mem_recs_to_itt_api((&ps_mem[j]), as_itt_memtab+j, i4_num_memtab); + j += i4_num_memtab; + + return j; /* Total MemTabs Needed by Rate Control Module */ +} diff --git a/encoder/ih264e_rc_mem_interface.h b/encoder/ih264e_rc_mem_interface.h new file mode 100755 index 0000000..a2946a7 --- /dev/null +++ b/encoder/ih264e_rc_mem_interface.h @@ -0,0 +1,179 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_rc_mem_interface.h +* +* @brief +* This file contains function declaration and structures for rate control +* memtabs +* +* @author +* ittiam +* +* @remarks +* The rate control library is a global library across various codecs. It +* anticipates certain structures definitions. Those definitions are to be +* imported from global workspace. Instead of that, the structures needed for +* rc library are copied in to this file and exported to rc library. If the +* structures / enums / ... in the global workspace change, this file also needs +* to be modified accordingly. 
+* +****************************************************************************** +*/ +#ifndef IH264E_RC_MEM_INTERFACE_H_ +#define IH264E_RC_MEM_INTERFACE_H_ + + +/*****************************************************************************/ +/* Function Macros + * NOTE(review): FILL_MEMTAB expands to a bare brace block with unparenthesised + * arguments, and the same name is an enumerator of ITT_FUNC_TYPE_E below; the + * enumerator is left alone only because a function-like macro is not expanded + * unless its name is followed by an opening parenthesis. + */ +/*****************************************************************************/ + +#define FILL_MEMTAB(m_pv_mem_rec, m_j, m_mem_size, m_align, m_type) \ +{ \ + m_pv_mem_rec[m_j].u4_size = sizeof(iv_mem_rec_t); \ + m_pv_mem_rec[m_j].u4_mem_size = m_mem_size; \ + m_pv_mem_rec[m_j].u4_mem_alignment = m_align; \ + m_pv_mem_rec[m_j].e_mem_type = m_type; \ +} + +/*****************************************************************************/ +/* Enums */ +/*****************************************************************************/ +typedef enum +{ + ALIGN_BYTE = 1, + ALIGN_WORD16 = 2, + ALIGN_WORD32 = 4, + ALIGN_WORD64 = 8, + ALIGN_128_BYTE = 128 +}ITT_MEM_ALIGNMENT_TYPE_E; + +typedef enum +{ + SCRATCH = 0, + PERSISTENT = 1, + WRITEONCE = 2 +}ITT_MEM_USAGE_TYPE_E; + +typedef enum +{ + L1D = 0, + SL2 = 1, + DDR = 3 +}ITT_MEM_REGION_E; + +typedef enum +{ + GET_NUM_MEMTAB = 0, + FILL_MEMTAB = 1, + USE_BASE = 2, + FILL_BASE =3 +}ITT_FUNC_TYPE_E; + + +/*****************************************************************************/ +/* Structures */ +/*****************************************************************************/ + +/* NOTE: This should be an exact replica of IALG_MemRec; any change in IALG_MemRec + must be replicated here. */ +typedef struct +{ + /* Size in bytes */ + UWORD32 u4_size; + + /* Alignment in bytes */ + WORD32 i4_alignment; + + /* Memory region in which the buffer is to be placed (ITT_MEM_REGION_E) */ + ITT_MEM_REGION_E e_mem_region; + + /* Usage type of the memory: scratch, persistent or write-once (ITT_MEM_USAGE_TYPE_E) */ + ITT_MEM_USAGE_TYPE_E e_usage; + + /* Base pointer for allocated memory */ + void *pv_base; +} itt_memtab_t; + + +/*****************************************************************************/ +/* Extern Function Declarations */ 
+/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief This function fills memory record attributes +* +* @par Description +* This function fills memory record attributes +* +* @param[in] ps_mem_tab +* pointer to mem records +* +* @param[in] u4_size +* size of the record (note: declared as signed WORD32 despite the u4_ prefix) +* +* @param[in] i4_alignment +* memory alignment size +* +* @param[in] e_usage +* usage (one of ITT_MEM_USAGE_TYPE_E) +* +* @param[in] e_mem_region +* mem region (one of ITT_MEM_REGION_E) +* +* @return void +* +****************************************************************************** +*/ +void fill_memtab(itt_memtab_t *ps_mem_tab, WORD32 u4_size, WORD32 i4_alignment, + ITT_MEM_USAGE_TYPE_E e_usage, ITT_MEM_REGION_E e_mem_region); + +/** +****************************************************************************** +* +* @brief Uses or fills the base pointer of a memory record +* +* @par Description +* NOTE(review): the original text here was a copy of the fill_memtab +* description. Judging by the name and by e_func_type, this presumably either +* stores a base pointer into the record or reads it back out - confirm against +* the definition, which is not visible in this header. +* +* @param[in] ps_mem_tab +* pointer to mem records +* +* @param[in] ptr_to_be_filled +* handle to the memory record storage space +* +* @param[in] e_func_type +* enum that dictates fill memory records or use memory records +* +* @return WORD32; the meaning of the value is not documented in this header +* (the original comment said void, which contradicts the prototype) +* +****************************************************************************** +*/ +WORD32 use_or_fill_base(itt_memtab_t *ps_mem_tab, void **ptr_to_be_filled, + ITT_FUNC_TYPE_E e_func_type); + + +#endif // IH264E_RC_MEM_INTERFACE_H_ + diff --git a/encoder/ih264e_statistics.h b/encoder/ih264e_statistics.h new file mode 100755 index 0000000..0ab33ca --- /dev/null +++ b/encoder/ih264e_statistics.h @@ -0,0 +1,141 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_statistics.h +* +* @brief +* Contains macros for generating stats about h264 encoder +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_STATISTICS_H_ +#define IH264E_STATISTICS_H_ + +#if CAVLC_LEVEL_STATS + +/*****************************************************************************/ +/* Extern global declarations */ +/*****************************************************************************/ + +/** + ****************************************************************************** + * @brief In cavlc encoding, a lut is used for encoding levels. It is not possible + * to use look up for all possible levels. The extent to which look up is generated + * is based on the statistics that were collected in the following global variables. + * + * gu4_cavlc_level_bin_lt_4 represents the number of coefficients with abs(level) < 4 + * gu4_cavlc_level_bin_lt_16 represents the number of coefficients with 4 <= abs(level) < 16 + * gu4_cavlc_level_bin_lt_32 represents the number of coefficients with 16 <= abs(level) < 32 + * and so on ... 
+ * ****************************************************************************** + */ +extern UWORD32 gu4_cavlc_level_bin_lt_4; +extern UWORD32 gu4_cavlc_level_bin_lt_16; +extern UWORD32 gu4_cavlc_level_bin_lt_32; +extern UWORD32 gu4_cavlc_level_bin_lt_64; +extern UWORD32 gu4_cavlc_level_bin_lt_128; +extern UWORD32 gu4_cavlc_level_bin_else_where; +extern UWORD32 gu4_cavlc_level_lut_hit_rate; + +/*****************************************************************************/ +/* Extern function declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* @brief print cavlc stats +****************************************************************************** +*/ +void print_cavlc_level_stats(void); + +#define GATHER_CAVLC_STATS1() \ + if (u4_abs_level < 4)\ + gu4_cavlc_level_bin_lt_4 ++; \ + else if (u4_abs_level < 16) \ + gu4_cavlc_level_bin_lt_16 ++; \ + else if (u4_abs_level < 32) \ + gu4_cavlc_level_bin_lt_32 ++; \ + else if (u4_abs_level < 64) \ + gu4_cavlc_level_bin_lt_64 ++; \ + else if (u4_abs_level < 128) \ + gu4_cavlc_level_bin_lt_128 ++; \ + else \ + gu4_cavlc_level_bin_else_where ++; + +#define GATHER_CAVLC_STATS2() \ + gu4_cavlc_level_lut_hit_rate ++; + +#else + +#define GATHER_CAVLC_STATS1() + +#define GATHER_CAVLC_STATS2() + +#endif + + +#if GATING_STATS + +/*****************************************************************************/ +/* Extern global declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* @brief During encoding at fastest preset, some times if the inter threshold +* is lesser than the predefined threshold, intra analysis is not done. 
The +* below variable keeps track of the number of mb for which intra analysis is not +* done +* ****************************************************************************** +*/ +extern UWORD32 gu4_mb_gated_cnt; + +/*****************************************************************************/ +/* Extern function declarations */ +/*****************************************************************************/ + +/** +****************************************************************************** +* @brief print gating stats +****************************************************************************** +*/ +void print_gating_stats(void); + +#define GATHER_GATING_STATS() \ + gu4_mb_gated_cnt ++; + +#else + +#define GATHER_GATING_STATS() + +#endif + + +#endif /* IH264E_STATISTICS_H_ */ diff --git a/encoder/ih264e_structs.h b/encoder/ih264e_structs.h new file mode 100755 index 0000000..1043a53 --- /dev/null +++ b/encoder/ih264e_structs.h @@ -0,0 +1,2566 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_structs.h +* +* @brief +* Structure definitions used in the encoder +* +* @author +* Harish +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_STRUCTS_H_ +#define IH264E_STRUCTS_H_ + +/*****************************************************************************/ +/* Extern Function type definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief intra prediction filters leaf level +****************************************************************************** + */ +typedef void (*pf_intra_pred)(UWORD8 *pu1_src, UWORD8 *pu1_dst, + WORD32 src_strd, WORD32 dst_strd, + WORD32 ui_neighboravailability); + +/** +****************************************************************************** + * @brief inter prediction filters leaf level +****************************************************************************** + */ + +typedef void (*pf_inter_pred_luma_bilinear)(UWORD8 *pu1_src1, UWORD8 *pu1_src2, UWORD8 *pu1_dst, + WORD32 src_strd1, WORD32 src_strd2, WORD32 dst_strd, + WORD32 height, WORD32 width); + +/** +****************************************************************************** + * @brief fwd transform leaf level +****************************************************************************** + */ +typedef void (*pf_trans_quant)(UWORD8*pu1_src, UWORD8 *pu1_pred, WORD16 *pi2_out, + WORD32 i4_src_stride, UWORD32 u4_pred_stride, UWORD32 u4_dst_stride, + const UWORD16 *pu2_scale_mat, const UWORD16 *pu2_thresh_mat, + UWORD32 u4_qbit, UWORD32 u4_round_fact, UWORD8 *pu1_nnz); + +typedef void (*pf_iquant_itrans)(WORD16 *pi2_src, UWORD8 *pu1_pred, UWORD8 *pu1_out, + WORD32 i4_src_stride, UWORD32 u4_pred_stride, UWORD32 u4_out_stride, + 
const UWORD16 *pu2_iscale_mat, const UWORD16 *pu2_weigh_mat, + UWORD32 qp_div, WORD32 *pi4_tmp); + +/** +****************************************************************************** + * @brief Padding leaf level +****************************************************************************** + */ +typedef void (*pf_pad)(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 pad_size); + +/** +****************************************************************************** + * @brief memory handling leaf level +****************************************************************************** + */ +typedef void (*pf_memcpy)(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes); + +typedef void (*pf_memset)(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes); + +typedef void (*pf_memcpy_mul8)(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes); + +typedef void (*pf_memset_mul8)(UWORD8 *pu1_dst, UWORD8 value, UWORD32 num_bytes); + +/** +****************************************************************************** + * @brief Sad computation +****************************************************************************** + */ +typedef void (*pf_compute_sad)(UWORD8 *pu1_src, UWORD8 *pu1_est, + UWORD32 src_strd, UWORD32 est_strd, + WORD32 i4_max_sad, WORD32 *pi4_mb_distortion); + +/** +****************************************************************************** + * @brief Intra mode eval:encoder level +****************************************************************************** + */ +typedef void (*pf_evaluate_intra_modes)(UWORD8 *pu1_src, UWORD8 *pu1_ngbr_pels_i16, UWORD8 *pu1_dst, + UWORD32 src_strd, UWORD32 dst_strd, + WORD32 u4_n_avblty, UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes); + +typedef void (*pf_evaluate_intra_4x4_modes)(UWORD8 *pu1_src, UWORD8 *pu1_ngbr_pels, UWORD8 *pu1_dst, + UWORD32 src_strd, UWORD32 dst_strd, + WORD32 u4_n_avblty, UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes, UWORD32 u4_lambda, + 
UWORD32 u4_predictd_mode); + +/** +****************************************************************************** + * @brief half_pel generation :encoder level +****************************************************************************** + */ +typedef void (*pf_sixtapfilter_horz)(UWORD8 *pu1_src, UWORD8 *pu1_dst, + WORD32 src_strd, WORD32 dst_strd); + +typedef void (*pf_sixtap_filter_2dvh_vert)(UWORD8 *pu1_src, UWORD8 *pu1_dst1, UWORD8 *pu1_dst2, + WORD32 src_strd, WORD32 dst_strd, + WORD32 *pi16_pred1, + WORD32 pi16_pred1_strd); +/** +****************************************************************************** + * @brief color space conversion +****************************************************************************** + */ +typedef void (*pf_fmt_conv_420p_to_420sp)(UWORD8 *pu1_y_src, UWORD8 *pu1_u_src, UWORD8 *pu1_v_src, + UWORD8 *pu1_y_dst, UWORD8 *pu1_uv_dst, + UWORD16 u2_height, UWORD16 u2_width, + UWORD16 src_y_strd, UWORD16 src_u_strd, UWORD16 src_v_strd, + UWORD16 dst_y_strd, UWORD16 dst_uv_strd, + UWORD32 convert_uv_only); + +typedef void (*pf_fmt_conv_422ile_to_420sp)(UWORD8 *pu1_y_buf, UWORD8 *pu1_u_buf, UWORD8 *pu1_v_buf, + UWORD8 *pu1_422i_buf, + WORD32 u4_y_width, WORD32 u4_y_height, WORD32 u4_y_stride, + WORD32 u4_u_stride, WORD32 u4_v_stride, + WORD32 u4_422i_stride); + + +/*****************************************************************************/ +/* Enums */ +/*****************************************************************************/ + +/** + ****************************************************************************** + * @enum CODEC_STATE_T + * @brief codec state + ****************************************************************************** + */ +typedef enum +{ + INIT_DONE, + HEADER_DONE, + FIRST_FRAME_DONE, +} CODEC_STATE_T; + + +/** + ****************************************************************************** + * @enum JOBQ_CMD_T + * @brief list of job commands (used during job instantiation) + 
****************************************************************************** + */ +typedef enum +{ + CMD_PROCESS, + CMD_ENTROPY, + CMD_FMTCONV, + CMD_ME, +}JOBQ_CMD_T; + + +/*****************************************************************************/ +/* Structures */ +/*****************************************************************************/ + +/** + * PU information + */ +typedef struct +{ + + /** + * L0 Motion Vector + */ + mv_t s_l0_mv; + + /** + * PU X position in terms of min PU (4x4) units + */ + UWORD32 b4_pos_x : 4; + + /** + * PU Y position in terms of min PU (4x4) units + */ + UWORD32 b4_pos_y : 4; + + /** + * PU width in pixels = (b4_wd + 1) << 2 + */ + UWORD32 b4_wd : 2; + + /** + * PU height in pixels = (b4_ht + 1) << 2 + */ + UWORD32 b4_ht : 2; + + /** + * L0 Ref index + */ + WORD8 i1_l0_ref_idx; + +} enc_pu_t; + +typedef struct _codec_t codec_t; + +typedef struct +{ + /** Descriptor of raw buffer */ + iv_raw_buf_t s_raw_buf; + + /** Lower 32bits of time stamp corresponding to the above buffer */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to the above buffer */ + UWORD32 u4_timestamp_high; + + /** Flag to indicate if the current buffer is last buffer */ + UWORD32 u4_is_last; + + /** Type of mb info sent along with input buffer; non-zero means pv_mb_info carries data */ + UWORD32 u4_mb_info_type; + + /** Size of the mb info structure */ + UWORD32 u4_mb_info_size; + + /** Buffer containing mb info if mb_info_type is non-zero */ + void *pv_mb_info; + + /** Type of pic info sent along with input buffer */ + UWORD32 u4_pic_info_type; + + /** Buffer containing pic info if pic_info_type is non-zero (original text said mb_info_type, apparently a copy-paste slip - confirm) */ + void *pv_pic_info; + +}inp_buf_t; + +typedef struct +{ + /** Descriptor of bitstream buffer */ + iv_bits_buf_t s_bits_buf; + + /** Lower 32bits of time stamp corresponding to the above buffer */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to the above buffer */ + UWORD32 
u4_timestamp_high; + + /** Flag to indicate if the current buffer is last buffer */ + UWORD32 u4_is_last; + +}out_buf_t; + +typedef struct +{ + /** Descriptor of picture buffer */ + pic_buf_t s_pic_buf; + + /** Lower 32bits of time stamp corresponding to the above buffer */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to the above buffer */ + UWORD32 u4_timestamp_high; + + /** Flag to indicate if the current buffer is last buffer */ + UWORD32 u4_is_last; + + /** Picture count corresponding to current picture */ + WORD32 i4_pic_cnt; + +}rec_buf_t; + +typedef struct +{ + /** maximum width for which codec should request memory requirements */ + UWORD32 u4_max_wd; + + /** maximum height for which codec should request memory requirements */ + UWORD32 u4_max_ht; + + /** Maximum number of reference frames */ + UWORD32 u4_max_ref_cnt; + + /** Maximum number of reorder frames */ + UWORD32 u4_max_reorder_cnt; + + /** Maximum level supported */ + UWORD32 u4_max_level; + + /** Input color format */ + IV_COLOR_FORMAT_T e_inp_color_fmt; + + /** Flag to enable/disable - To be used only for debugging/testing */ + UWORD32 u4_enable_recon; + + /** Recon color format */ + IV_COLOR_FORMAT_T e_recon_color_fmt; + + /** Encoder Speed preset - Value between 0 (slowest) and 100 (fastest) */ + IVE_SPEED_CONFIG u4_enc_speed_preset; + + /** Rate control mode */ + IVE_RC_MODE_T e_rc_mode; + + /** Maximum frame rate to be supported */ + UWORD32 u4_max_framerate; + + /** Maximum bitrate to be supported */ + UWORD32 u4_max_bitrate; + + /** Maximum number of consecutive B frames */ + UWORD32 u4_max_num_bframes; + + /** Content type Interlaced/Progressive */ + IV_CONTENT_TYPE_T e_content_type; + + /** Maximum search range to be used in X direction */ + UWORD32 u4_max_srch_rng_x; + + /** Maximum search range to be used in Y direction */ + UWORD32 u4_max_srch_rng_y; + + /** Slice Mode */ + IVE_SLICE_MODE_T e_slice_mode; + + /** Slice parameter */ + UWORD32 
u4_slice_param; + + /** Processor architecture */ + IV_ARCH_T e_arch; + + /** SOC details */ + IV_SOC_T e_soc; + + /** Input width to be sent in bitstream */ + UWORD32 u4_disp_wd; + + /** Input height to be sent in bitstream */ + UWORD32 u4_disp_ht; + + /** Input width */ + UWORD32 u4_wd; + + /** Input height */ + UWORD32 u4_ht; + + /** Input stride */ + UWORD32 u4_strd; + + /** Source frame rate */ + UWORD32 u4_src_frame_rate; + + /** Target frame rate */ + UWORD32 u4_tgt_frame_rate; + + /** Target bitrate in kilobits per second */ + UWORD32 u4_target_bitrate; + + /** Force current frame type */ + IV_PICTURE_CODING_TYPE_T e_frame_type; + + /** Encoder mode */ + IVE_ENC_MODE_T e_enc_mode; + + /** Set initial Qp for I pictures */ + UWORD32 u4_i_qp; + + /** Set initial Qp for P pictures */ + UWORD32 u4_p_qp; + + /** Set initial Qp for B pictures */ + UWORD32 u4_b_qp; + + /** Set minimum Qp for I pictures */ + UWORD32 u4_i_qp_min; + + /** Set maximum Qp for I pictures */ + UWORD32 u4_i_qp_max; + + /** Set minimum Qp for P pictures */ + UWORD32 u4_p_qp_min; + + /** Set maximum Qp for P pictures */ + UWORD32 u4_p_qp_max; + + /** Set minimum Qp for B pictures */ + UWORD32 u4_b_qp_min; + + /** Set maximum Qp for B pictures */ + UWORD32 u4_b_qp_max; + + /** Adaptive intra refresh mode */ + IVE_AIR_MODE_T e_air_mode; + + /** Adaptive intra refresh period in frames */ + UWORD32 u4_air_refresh_period; + + /** VBV buffer delay */ + UWORD32 u4_vbv_buffer_delay; + + /** VBV buffer size */ + UWORD32 u4_vbv_buf_size; + + /** Number of cores to be used */ + UWORD32 u4_num_cores; + + /** ME speed preset - Value between 0 (slowest) and 100 (fastest) */ + UWORD32 u4_me_speed_preset; + + /** Flag to enable/disable half pel motion estimation */ + UWORD32 u4_enable_hpel; + + /** Flag to enable/disable quarter pel motion estimation */ + UWORD32 u4_enable_qpel; + + /** Flag to enable/disable intra 4x4 analysis */ + UWORD32 u4_enable_intra_4x4; + + /** Flag to enable/disable intra 8x8 
analysis */ + UWORD32 u4_enable_intra_8x8; + + /** Flag to enable/disable intra 16x16 analysis */ + UWORD32 u4_enable_intra_16x16; + + /** Flag to enable/disable fast SAD approximation */ + UWORD32 u4_enable_fast_sad; + + /*flag to enable/disable alternate reference frames */ + UWORD32 u4_enable_alt_ref; + + /*Flag to enable/disable computation of SATDQ in ME*/ + UWORD32 u4_enable_satqd; + + /*Minimum SAD to search for*/ + WORD32 i4_min_sad; + + /** Maximum search range in X direction for farthest reference */ + UWORD32 u4_srch_rng_x; + + /** Maximum search range in Y direction for farthest reference */ + UWORD32 u4_srch_rng_y; + + /** I frame interval */ + UWORD32 u4_i_frm_interval; + + /** IDR frame interval */ + UWORD32 u4_idr_frm_interval; + + /** consecutive B frames */ + UWORD32 u4_num_b_frames; + + /** Disable deblock level (0: Enable completely, 3: Disable completely */ + UWORD32 u4_disable_deblock_level; + + /** Profile */ + IV_PROFILE_T e_profile; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + + /** Flag to say if the current config parameter set is valid + * Will be zero to start with and will be set to 1, when configured + * Once encoder uses the parameter set, this will be set to zero */ + UWORD32 u4_is_valid; + + /** Command associated with this config param set */ + IVE_CONTROL_API_COMMAND_TYPE_T e_cmd; + + /** Input width in mbs */ + UWORD32 i4_wd_mbs; + + /** Input height in mbs */ + UWORD32 i4_ht_mbs; + + /** entropy coding mode flag */ + UWORD32 u4_entropy_coding_mode; + + /** enable weighted prediction */ + UWORD32 u4_weighted_prediction; + + /** enable constrained intra prediction */ + UWORD32 u4_constrained_intra_pred; + + /** Pic info type */ + UWORD32 u4_pic_info_type; + /** + * MB info type + */ + UWORD32 
u4_mb_info_type; + +}cfg_params_t; + + + +/** Structure to hold format conversion context */ +typedef struct +{ + /** Current row for which format conversion should be done */ + WORD32 i4_cur_row; + + /** Number of rows for which format conversion should be done */ + WORD32 i4_num_rows; + +}fmt_conv_t; + + +/** + * Structure to represent a processing job entry + */ +typedef struct +{ + /** + * Command + */ + WORD32 i4_cmd; + + /** + * MB x of the starting MB + */ + WORD16 i2_mb_x; + + /** + * MB y of the starting MB + */ + + WORD16 i2_mb_y; + + /** + * Number of MBs that need to be processed in this job + */ + WORD16 i2_mb_cnt; + + /** + * Process contexts base index + * Will toggle between 0 and MAX_PROCESS_THREADS + */ + WORD16 i2_proc_base_idx; + +} job_t; + + +/** + * Structure to represent a MV Bank buffer + */ +typedef struct +{ + /** + * Pointer to hold num PUs each MB in a picture + */ + UWORD32 *pu4_mb_pu_cnt; + + /** + * Pointer to hold enc_pu_t for each PU in a picture + */ + enc_pu_t *ps_pic_pu; + + /** + * Pointer to hold PU map for each MB in a picture + */ + UWORD8 *pu1_pic_pu_map; + + /** + * Pointer to hold the Slice map + */ + UWORD16 *pu1_pic_slice_map; + + /** + * Absolute POC for the current MV Bank + */ + WORD32 i4_abs_poc; + + /** + * Buffer Id + */ + WORD32 i4_buf_id; + +} mv_buf_t; + + +/** + * Reference set containing pointers to MV buf and pic buf + */ +typedef struct +{ + /** Picture count */ + WORD32 i4_pic_cnt; + + /** POC */ + WORD32 i4_poc; + + /** picture buffer */ + pic_buf_t *ps_pic_buf; + + /** mv buffer */ + mv_buf_t *ps_mv_buf; + +}ref_set_t; + +typedef struct +{ + + /** + * Pointer to current PPS + */ + pps_t *ps_pps; + + /** + * Pointer to current SPS + */ + sps_t *ps_sps; + + /** + * Pointer to current slice header structure + */ + slice_header_t *ps_slice_hdr; + + /** + * MB's x position within a picture in raster scan in MB units + */ + WORD32 i4_mb_x; + + /** + * MB's y position within a picture in raster scan in MB units 
+ */ + + WORD32 i4_mb_y; + + /** + * Current PU structure - set to MB enc_pu_t pointer at the start of MB processing and incremented + * for every TU + */ + enc_pu_t *ps_pu; + + /** + * Pointer to frame level enc_pu_t for the current frame being parsed + * where MVs and Intra pred modes will be updated + */ + enc_pu_t *ps_pic_pu; + + /** + * Pointer to hold the number of PUs in each MB of a picture + */ + UWORD32 *pu4_mb_pu_cnt; + + /** PU Index map per MB. The indices in this map are w.r.t picture pu array and not + * w.r.t MB pu array. + * This will be used during mv prediction and since neighbors will have different MB pu map + * it will be easier if they all have indices w.r.t picture level PU array rather than MB level + * PU array. + * pu1_pic_pu_map is map w.r.t MB's enc_pu_t array + */ + UWORD32 *pu4_pic_pu_idx_map; + + /** + * Pointer to pu_map for the current frame being parsed + * where MVs and Intra pred modes will be updated + */ + UWORD8 *pu1_pic_pu_map; + + /** + * PU count in current MB + */ + WORD32 i4_mb_pu_cnt; + + /** + * Index of the first PU of the current MB (inferred from the field name; + * the original comment repeated the PU count text - confirm against usage) + */ + WORD32 i4_mb_start_pu_idx; + + /** + * Top availability for current MB level + */ + UWORD8 u1_top_mb_avail; + + /** + * Top right availability for current MB level + */ + UWORD8 u1_top_rt_mb_avail; + /** + * Top left availability for current MB level + */ + UWORD8 u1_top_lt_mb_avail; + /** + * Left availability for current MB level + */ + UWORD8 u1_left_mb_avail; + +}mv_ctxt_t; + +typedef struct +{ + /** + * MB's x position within a picture in raster scan in MB units + */ + WORD32 i4_mb_x; + + /** + * MB's y position within a picture in raster scan in MB units + */ + WORD32 i4_mb_y; + + /** + * MB's x position within a Slice in raster scan in MB units + */ + WORD32 i4_mb_slice_x; + + /** + * MB's y position within a Slice in raster scan in MB units + */ + WORD32 i4_mb_slice_y; + + /** + * Vertical strength, Two bits per edge. + * Stored in format. BS[15] | BS[14] | .. 
|BS[0] + */ + UWORD32 *pu4_pic_vert_bs; + + /** + * Horizontal strength, Two bits per edge. + * Stored in format. BS[15] | BS[14] | .. |BS[0] + */ + UWORD32 *pu4_pic_horz_bs; + + /** + * Qp array stored for each mb + */ + UWORD8 *pu1_pic_qp; + +}bs_ctxt_t; + +typedef struct +{ + /** + * MB's x position within a picture in raster scan in MB units + */ + WORD32 i4_mb_x; + + /** + * MB's y position within a picture in raster scan in MB units + */ + WORD32 i4_mb_y; + + /** + * structure that contains BS and QP frame level arrays + */ + bs_ctxt_t s_bs_ctxt; + + /** + * Pointer to 0th luma pixel in current pic + */ + UWORD8 *pu1_cur_pic_luma; + + /** + * Pointer to 0th chroma pixel in current pic + */ + UWORD8 *pu1_cur_pic_chroma; + + /** + * Points to the array of slice indices which is used to identify the slice + * to which each MB in a frame belongs. + */ + UWORD8 *pu1_slice_idx; + +}deblk_ctxt_t; + + +/** + ****************************************************************************** + * @brief Structure to hold data and flags for 'n' mb processing for + * deblocking, padding and half pel generation. + ****************************************************************************** + */ +typedef struct +{ + /** + * MB's x position last processed + 1 + */ + WORD32 i4_mb_x; + + /** + * MB's y position currently being processed. + */ + WORD32 i4_mb_y; + + /** + * Number of MBs processed in a stretch + */ + WORD32 i4_n_mbs; + +}n_mb_process_ctxt_t; + + +/** +****************************************************************************** + * @brief Structure to hold coefficient info for a 4x4 subblock. + * The following can be used to type-cast coefficient data that is stored + * per subblock. Note that though ai2_residue is shown as an array that + * holds 16 coefficients, only the first few entries will be valid. Next + * subblocks data starts after the valid number of coefficients. 
Number + * of non-zero coefficients will be derived using number of non-zero bits + * in sig coeff map +****************************************************************************** + */ +typedef struct +{ + /** + * significant coefficient map and nnz are packed in + * to msb (2 bytes) and lsb (2 bytes) respectively + */ + WORD32 i4_sig_map_nnz; + + /** + * array of non zero residue coefficients + */ + WORD16 ai2_residue[16]; + +}tu_sblk_coeff_data_t; + +/** +****************************************************************************** + * @brief Structure contains few common state variables such as MB indices, + * current SPS, PPS etc which are to be used in the entropy thread. By keeping + * it a different structure it is being explicitly signaled that these + * variables are specific to entropy threads context and other threads should + * not update these elements +****************************************************************************** + */ +typedef struct +{ + + /** + * start of frame / start of slice flag + */ + WORD32 i4_sof; + + /** + * end of frame / end of slice flag + */ + WORD32 i4_eof; + + /** + * generate header upon request + */ + WORD32 i4_gen_header; + + /** + * seq_parameter_set_id + */ + UWORD32 u4_sps_id; + + /** + * Pointer to base of sequence parameter set structure array + */ + sps_t *ps_sps_base; + + /** + * pic_parameter_set_id + */ + UWORD32 u4_pps_id; + + /** + * Pointer to base of Picture parameter set structure array + */ + pps_t *ps_pps_base; + + /** + * Current slice idx + */ + WORD32 i4_cur_slice_idx; + + /** + * Points to the array of slice indices which is used to identify the independent slice + * to which each MB in a frame belongs. 
+ */ + UWORD8 *pu1_slice_idx; + + /** + * Pointer to base of slice header structure array + */ + slice_header_t *ps_slice_hdr_base; + + /** + * entropy status + */ + UWORD8 *pu1_entropy_map; + + /** + * MB's x position within a picture in raster scan in MB units + */ + WORD32 i4_mb_x; + + /** + * MB's y position within a picture in raster scan in MB units + */ + WORD32 i4_mb_y; + + /** + * MB count + * NOTE(review): this comment previously read MB start address, identical to + * the next field; count is taken from the field name only - confirm + */ + WORD32 i4_mb_cnt; + + /** + * MB start address + */ + WORD32 i4_mb_start_add; + + /** + * MB end address + */ + WORD32 i4_mb_end_add; + + /** + * Input width in mbs + */ + WORD32 i4_wd_mbs; + + /** + * Input height in mbs + */ + WORD32 i4_ht_mbs; + + /** + * Bitstream structure + */ + bitstrm_t *ps_bitstrm; + + /** + * transform_8x8_mode_flag + */ + WORD8 i1_transform_8x8_mode_flag; + + /** + * entropy_coding_mode_flag + * NOTE(review): declared WORD8 although the name carries the u1 prefix + */ + WORD8 u1_entropy_coding_mode_flag; + + /** + * Pointer to the top row nnz for luma + */ + UWORD8 (*pu1_top_nnz_luma)[4]; + + /** + * left nnz for luma + */ + UWORD32 u4_left_nnz_luma; + + /** + * Zero runs before for the mb + * (16 entry array stored in the context, not a pointer) + */ + UWORD8 au1_zero_run[16]; + + /** + * Pointer to the top row nnz for chroma + */ + UWORD8 (*pu1_top_nnz_cbcr)[4]; + + /** + * left nnz for chroma + * NOTE(review): declared UWORD8 although the name carries the u4 prefix and + * the luma counterpart u4_left_nnz_luma is UWORD32 - confirm intended width + */ + UWORD8 u4_left_nnz_cbcr; + + /** + * Pointer to frame level mb subblock coeff data + */ + void *pv_pic_mb_coeff_data; + + /** + * Pointer to mb subblock coeff data and number of subblocks and scan idx + * Incremented each time a coded subblock is processed + */ + void *pv_mb_coeff_data; + + /** + * Pointer to frame level mb header data + */ + void *pv_pic_mb_header_data; + + /** + * Pointer to mb header data and + * incremented each time a coded mb is encoded + */ + void *pv_mb_header_data; + + /** + * Error code during parse stage + */ + IH264E_ERROR_T i4_error_code; + + /** + * Void pointer to job context + */ + void *pv_proc_jobq, *pv_entropy_jobq; + + /** + * Flag to signal end of frame + */ + WORD32 i4_end_of_frame; + + /** + * Abs POC count of the
frame + */ + WORD32 i4_abs_pic_order_cnt; + + /** + * mb skip run + */ + WORD32 *pi4_mb_skip_run; + + /** + * Flag to signal end of sequence + */ + UWORD32 u4_is_last; + + /** + * Lower 32bits of time-stamp corresponding to the buffer being encoded + */ + UWORD32 u4_timestamp_low; + + /** + * Upper 32bits of time-stamp corresponding to the buffer being encoded + */ + UWORD32 u4_timestamp_high; + + /** + * Current Picture count - used for synchronization + */ + WORD32 i4_pic_cnt; + + /** + * Number of bits consumed by header for I and P mb types + */ + UWORD32 u4_header_bits[MAX_MB_TYPE]; + + /** + * Number of bits consumed by residue for I and P mb types + */ + UWORD32 u4_residue_bits[MAX_MB_TYPE]; + +} entropy_ctxt_t; + +/** +****************************************************************************** +* @brief macro block info. +****************************************************************************** +*/ +typedef struct +{ + /** + * is intra + * (this comment previously read mb type, duplicating the next field) + */ + UWORD16 u2_is_intra; + + /** + * mb type + */ + UWORD16 u2_mb_type; + + /** + * csbp + */ + UWORD32 u4_csbp; + + /** + * mb distortion + */ + WORD32 i4_mb_distortion; + +}mb_info_t; + +/** +****************************************************************************** +* @brief structure representing the neighbor availability of a mb +* or subblk or any other partition +****************************************************************************** +*/ +typedef struct +{ + /** + * left blk/subblk/partition + */ + UWORD8 u1_mb_a; + + /** + * top blk/subblk/partition + */ + UWORD8 u1_mb_b; + + /** + * topright blk/subblk/partition + */ + UWORD8 u1_mb_c; + + /** + * topleft blk/subblk/partition + */ + UWORD8 u1_mb_d; + +}block_neighbors_t; + +/** + ****************************************************************************** + * @brief MB info related variables used during NMB processing + ****************************************************************************** + */ +typedef struct +{ + UWORD32 u4_mb_type; +
UWORD32 u4_min_sad; + UWORD32 u4_min_sad_reached; + WORD32 i4_mb_cost; + WORD32 i4_mb_distortion; + + + mv_t s_skip_mv; + mv_t s_pred_mv; + + block_neighbors_t s_ngbr_avbl; + + /* + * Buffer to hold best subpel buffer in each MB of NMB + */ + UWORD8 *pu1_best_sub_pel_buf; + + /* + * Stride for subpel buffer + */ + UWORD32 u4_bst_spel_buf_strd; + +}mb_info_nmb_t; + +/** + ****************************************************************************** + * @brief Pixel processing thread context + ****************************************************************************** + */ +typedef struct +{ + /** + * entropy context + */ + entropy_ctxt_t s_entropy; + + /** + * me context + */ + me_ctxt_t s_me_ctxt; + + /** + * Pointer to codec context + */ + codec_t *ps_codec; + + /** + * N mb process context + */ + n_mb_process_ctxt_t s_n_mb_ctxt; + + /** + * Source pointer to current MB luma + */ + UWORD8 *pu1_src_buf_luma; + + /** + * Source pointer to current MB chroma + */ + UWORD8 *pu1_src_buf_chroma; + + /** + * Recon pointer to current MB luma + */ + UWORD8 *pu1_rec_buf_luma; + + /** + * Recon pointer to current MB chroma + */ + UWORD8 *pu1_rec_buf_chroma; + + /** + * Ref pointer to current MB luma + */ + UWORD8 *pu1_ref_buf_luma; + + /** + * Ref pointer to current MB chroma + */ + UWORD8 *pu1_ref_buf_chroma; + + /** + * pointer to luma plane of input buffer (base :: mb (0,0)) + */ + UWORD8 *pu1_src_buf_luma_base; + + /** + * pointer to luma plane of reconstructed buffer (base :: mb (0,0)) + */ + UWORD8 *pu1_rec_buf_luma_base; + + /** + * pointer to luma plane of ref buffer (base :: mb (0,0)) + */ + UWORD8 *pu1_ref_buf_luma_base; + + /** + * pointer to chroma plane of input buffer (base :: mb (0,0)) + */ + UWORD8 *pu1_src_buf_chroma_base; + + /* + * Buffer for color space conversion of luma + */ + UWORD8 *pu1_y_csc_buf; + + /* + * Buffer for color space conversion of chroma (uv) + */ + + UWORD8 *pu1_uv_csc_buf; + + /** + * pointer to chroma plane of reconstructed buffer (base
:: mb (0,0)) + */ + UWORD8 *pu1_rec_buf_chroma_base; + + /** + * pointer to chroma plane of ref buffer (base :: mb (0,0)) + */ + UWORD8 *pu1_ref_buf_chroma_base; + + /** + * Pointer to ME NMB info + */ + mb_info_nmb_t *ps_nmb_info; + + mb_info_nmb_t *ps_cur_mb; + + /** + * source stride + * (strides for luma and chroma are the same) + */ + WORD32 i4_src_strd; + + /** + * recon stride & ref stride + * (strides for luma and chroma are the same) + */ + WORD32 i4_rec_strd; + + /** + * Offset for half pel x plane from the pic buf + */ + UWORD32 u4_half_x_offset; + + /** + * Offset for half pel y plane from half x plane + */ + UWORD32 u4_half_y_offset; + + /** + * Offset for half pel xy plane from half y plane + */ + UWORD32 u4_half_xy_offset; + + /** + * pred buffer pointer (temp buffer 1) + */ + UWORD8 *pu1_pred_mb; + + /** + * pred buffer pointer (prediction buffer for intra 16x16) + */ + UWORD8 *pu1_pred_mb_intra_16x16; + + /** + * pred buffer pointer (prediction buffer for intra 16x16_plane) + */ + UWORD8 *pu1_pred_mb_intra_16x16_plane; + + /** + * pred buffer pointer (prediction buffer for intra chroma) + */ + UWORD8 *pu1_pred_mb_intra_chroma; + + /** + * pred buffer pointer (prediction buffer for intra chroma plane) + */ + UWORD8 *pu1_pred_mb_intra_chroma_plane; + + /** + * temp. reference buffer ptr for intra 4x4 when rdopt is on + */ + UWORD8 *pu1_ref_mb_intra_4x4; + + /** + * prediction buffer stride + */ + WORD32 i4_pred_strd; + + /** + * transform buffer pointer (temp buffer 2) + */ + WORD16 *pi2_res_buf; + + /** + * temp.
transform buffer ptr for intra 4x4 when rdopt is on + */ + WORD16 *pi2_res_buf_intra_4x4; + + /** + * transform buffer stride + */ + WORD32 i4_res_strd; + + /** + * scratch buffer for inverse transform (temp buffer 3) + */ + void *pv_scratch_buff; + + /** + * frame num + */ + WORD32 i4_frame_num; + + /** + * start address of frame / sub-frame + */ + WORD32 i4_frame_strt_add; + + /** + * IDR pic + */ + UWORD32 u4_is_idr; + + /** + * idr_pic_id + */ + UWORD32 u4_idr_pic_id; + + /** + * Input width in mbs + */ + WORD32 i4_wd_mbs; + + /** + * Input height in mbs + */ + WORD32 i4_ht_mbs; + + /** + * slice_type + */ + WORD32 i4_slice_type; + + /** + * Current slice idx + */ + WORD32 i4_cur_slice_idx; + + /** + * MB's x position within a picture in raster scan in MB units + */ + WORD32 i4_mb_x; + + /** + * MB's y position within a picture in raster scan in MB units + */ + WORD32 i4_mb_y; + + /** + * MB's x position within a Slice in raster scan in MB units + */ + WORD32 i4_mb_slice_x; + + /** + * MB's y position within a Slice in raster scan in MB units + */ + WORD32 i4_mb_slice_y; + + /** + * mb type + */ + UWORD32 u4_mb_type; + + /** + * is intra + */ + UWORD32 u4_is_intra; + + /** + * mb neighbor availability pointer + */ + block_neighbors_t *ps_ngbr_avbl; + + /** + * lambda (lagrange multiplier for cost computation) + */ + UWORD32 u4_lambda; + + /** + * mb distortion + */ + WORD32 i4_mb_distortion; + + /** + * mb cost + */ + WORD32 i4_mb_cost; + + /********************************************************************/ + /* i4_ngbr_avbl_16x16_mb - ngbr avbl of curr mb */ + /* ai4_neighbor_avail_8x8_subblks - ngbr avbl of all 8x8 sub blocks of curr mb */ + /* au1_ngbr_avbl_4x4_subblks - ngbr avbl of all 4x4 sub blocks of curr mb */ + /* i4_chroma_neighbor_avail_8x8_mb - chroma ngbr avbl of curr mb */ + /********************************************************************/ + WORD32 i4_ngbr_avbl_16x16_mb; + WORD32 ai4_neighbor_avail_8x8_subblks[4]; + UWORD8 au1_ngbr_avbl_4x4_subblks[16]; + WORD32
i4_chroma_neighbor_avail_8x8_mb; + + /** + * array to store the mode of mb sub blocks + */ + UWORD8 au1_intra_luma_mb_4x4_modes[16]; + + /** + * array to store the predicted mode of mb sub blks + */ + UWORD8 au1_predicted_intra_luma_mb_4x4_modes[16]; + + /** + * macro block intra 16x16 mode + */ + UWORD8 u1_l_i16_mode; + + /** + * array to store the mode of the macro block intra 8x8 4 modes + */ + UWORD8 au1_intra_luma_mb_8x8_modes[4]; + + /** + * intra chroma mb mode + */ + UWORD8 u1_c_i8_mode; + + /********************************************************************/ + /* array to store pixels from the neighborhood for intra prediction */ + /* i16 - 16 left pels + 1 top left pel + 16 top pels = 33 pels */ + /* i8 - 8 lpels + 1 tlpels + 8 tpels + 8 tr pels = 25 pels */ + /* i4 - 4 lpels + 1 tlpels + 4 tpels + 4 tr pels = 13 pels */ + /* ic - (8 left pels + 1 top left pel + 8 top pels) * 2 = 34 pels */ + /********************************************************************/ + UWORD8 au1_ngbr_pels[34]; + + /** + * array for 8x8 intra pels filtering (temp buff 4) + */ + UWORD8 au1_neighbor_pels_i8x8_unfiltered[25]; + + /** + * Number of sub partitions in the inter pred MB + */ + UWORD32 u4_num_sub_partitions; + + /** + * Pointer to hold num PUs in each MB in a picture + */ + UWORD32 *pu4_mb_pu_cnt; + + /** + * Pointer to the array of structures having motion vectors, size + * and position of sub partitions + */ + enc_pu_t *ps_pu; + + /** + * predicted motion vector + */ + mv_t *ps_pred_mv; + + /** + * top row mb syntax information base + * In normal working scenarios, for a given context set, + * the mb syntax info pointer is identical across all process threads. + * But when the hard bound on slices are enabled, in multi core, frame + * is partitioned in to sections equal to set number of cores and each + * partition is run independently. In this scenario, a ctxt set will alone + * appear to run multiple frames at a time.
For this to occur, the common + * pointers across the proc ctxt should disappear. + * + * This is done by allocating MAX_PROCESS_THREADS memory and distributing + * across individual ctxts when byte bnd per slice is enabled. + */ + mb_info_t *ps_top_row_mb_syntax_ele_base; + + /** + * top row mb syntax information + */ + mb_info_t *ps_top_row_mb_syntax_ele; + + /** + * left mb syntax information + */ + mb_info_t s_left_mb_syntax_ele; + + /** + * top left mb syntax information + */ + mb_info_t s_top_left_mb_syntax_ele; + + /** + * top left mb syntax information, separate ME copy + * (presumably the copy consumed by motion estimation - TODO confirm) + */ + + mb_info_t s_top_left_mb_syntax_ME; + + /** + * left mb motion vector, separate ME copy + */ + enc_pu_t s_left_mb_pu_ME; + + /** + * top left mb motion vector, separate ME copy + */ + enc_pu_t s_top_left_mb_pu_ME; + + + /** + * mb neighbor availability + * (structure stored by value; ps_ngbr_avbl is the pointer member) + */ + block_neighbors_t s_ngbr_avbl; + + /** + * In case the macroblock type is intra, the intra modes of all + * partitions for the left mb are stored in the array below + */ + UWORD8 au1_left_mb_intra_modes[16]; + + /** + * In case the macroblock type is intra, the intra modes of all + * partitions for the top mb are stored in the array below + * + * In normal working scenarios, for a given context set, + * the mb syntax info pointer is identical across all process threads. + * But when the hard bound on slices are enabled, in multi core, frame + * is partitioned in to sections equal to set number of cores and each + * partition is run independently. In this scenario, a ctxt set will alone + * appear to run multiple frames at a time. For this to occur, the common + * pointers across the proc ctxt should disappear. + * + * This is done by allocating MAX_PROCESS_THREADS memory and distributing + * across individual ctxts when byte bnd per slice is enabled.
+ */ + UWORD8 *pu1_top_mb_intra_modes_base; + + /** + * In case the macroblock type is intra, the intra modes of all + * partitions for the top mb are stored in the array below + */ + UWORD8 *pu1_top_mb_intra_modes; + + /** + * skip motion vector info + */ + mv_t *ps_skip_mv; + + /** + * left mb motion vector + */ + enc_pu_t s_left_mb_pu; + + /** + * top left mb motion vector + */ + enc_pu_t s_top_left_mb_pu; + + /** + * top row motion vector info + * + * In normal working scenarios, for a given context set, + * the top row pu pointer is identical across all process threads. + * But when the hard bound on slices are enabled, in multi core, frame + * is partitioned in to sections equal to set number of cores and each + * partition is run independently. In this scenario, a ctxt set will alone + * appear to run multiple frames at a time. For this to occur, the common + * pointers across the proc ctxt should disappear. + * + * This is done by allocating MAX_PROCESS_THREADS memory and distributing + * across individual ctxts when byte bnd per slice is enabled. 
+ */ + enc_pu_t *ps_top_row_pu_base; + + /** + * top row motion vector info + */ + enc_pu_t *ps_top_row_pu; + + enc_pu_t *ps_top_row_pu_ME; + + /** + * coded block pattern + */ + UWORD32 u4_cbp; + + /** + * csbp + */ + UWORD32 u4_csbp; + + /** + * number of non zero coeffs + */ + UWORD32 au4_nnz[5]; + + /** + * number of non zero coeffs for intra 4x4 when rdopt is on + */ + UWORD32 au4_nnz_intra_4x4[4]; + + /** + * frame qp & mb qp + */ + UWORD32 u4_frame_qp, u4_mb_qp; + + /** + * mb qp previous + */ + UWORD32 u4_mb_qp_prev; + + /** + * quantization parameters for luma & chroma planes + */ + quant_params_t *ps_qp_params[3]; + + /** + * Pointer frame level mb subblock coeff data + */ + void *pv_pic_mb_coeff_data; + + /** + * Pointer to mb subblock coeff data and number of subblocks and scan idx + * Incremented each time a coded subblock is processed + */ + void *pv_mb_coeff_data; + + /** + * Pointer frame level mb header data + */ + void *pv_pic_mb_header_data; + + /** + * Pointer to mb header data and + * incremented each time a coded mb is encoded + */ + void *pv_mb_header_data; + + /** + * Signal that pic_init is called first time + */ + WORD32 i4_first_pic_init; + + /** + * Current MV Bank's buffer ID + */ + WORD32 i4_cur_mv_bank_buf_id; + + /** + * Void pointer to job context + */ + void *pv_proc_jobq, *pv_entropy_jobq; + + /** + * Number of MBs to be processed in the current Job + */ + WORD32 i4_mb_cnt; + + /** + * ID for the current context - Used for debugging + */ + WORD32 i4_id; + + /** + * Pointer to current picture buffer structure + */ + pic_buf_t *ps_cur_pic; + + /** + * Pointer to current picture's mv buffer structure + */ + mv_buf_t *ps_cur_mv_buf; + + /** + * Flag to indicate if ps_proc was initialized at least once in a frame. 
+ * This is needed to handle cases where a core starts to handle format + * conversion jobs directly + */ + WORD32 i4_init_done; + + /** + * Process status: one byte per MB + */ + UWORD8 *pu1_proc_map; + + /** + * Deblk status: one byte per MB + */ + UWORD8 *pu1_deblk_map; + + /** + * ME status: one byte per MB + * NOTE(review): comment previously duplicated the pu1_proc_map text; + * ME is taken from the field name only - confirm + */ + UWORD8 *pu1_me_map; + + /* + * Intra refresh mask. + * Indicates if an Mb is coded in intra mode within the current AIR interval + * NOTE Refreshes after each AIR period + * NOTE The map is shared between process + */ + UWORD8 *pu1_is_intra_coded; + + /** + * Disable deblock level (0: Enable completely, 3: Disable completely) + */ + UWORD32 u4_disable_deblock_level; + + /** + * Structure that contains deblock context (stored by value) + */ + deblk_ctxt_t s_deblk_ctxt; + + /** + * Points to the array of slice indices which is used to identify the independent + * slice to which each MB in a frame belongs. + */ + UWORD8 *pu1_slice_idx; + + /** + * Pointer to base of slice header structure array + */ + slice_header_t *ps_slice_hdr_base; + + /** + * Number of mb's to process in one loop + * (presumably for entropy coding, going by the field name - TODO confirm) + */ + WORD32 i4_nmb_ntrpy; + + /** + * Number of mb's to process in one loop + * (presumably for motion estimation, going by the field name - TODO confirm) + */ + UWORD32 u4_nmb_me; + + /** + * Structure for current input buffer + */ + inp_buf_t s_inp_buf; + + /** + * api call cnt + */ + WORD32 i4_encode_api_call_cnt; + + /** + * Current Picture count - used for synchronization + */ + WORD32 i4_pic_cnt; + + /** + * Intermediate buffer for interpred leaf level functions + * NOTE(review): elements are WORD32 although the name carries the ai16 + * prefix - confirm the intended element width + */ + WORD32 ai16_pred1[HP_BUFF_WD * HP_BUFF_HT]; + + /** + * Reference picture for the current picture + * TODO: Only 1 reference assumed currently + */ + pic_buf_t *ps_ref_pic; + + /** + * frame info used by RC + */ + frame_info_t s_frame_info; + + /* + * NOTE NOT PERSISTENT INSIDE FUNCTIONS + * Min sad for current MB + * will be populated initially + * Once a sad less than or equal to u4_min_sad is reached, the value will be copied to the variable + */ + UWORD32 u4_min_sad; + + /* + *
indicates whether we have reached minimum sad or not + */ + UWORD32 u4_min_sad_reached; + + /** + * Current error code + */ + WORD32 i4_error_code; + + /* + * Enables or disables computation of recon + */ + UWORD32 u4_compute_recon; + + /* + * Buffer for holding half_x (1/2,1 - interpolated) + * values when halfpel generation + * for the entire plane is not enabled + */ + UWORD8 *pu1_half_x; + + /* + * Buffer for holding half_y (1,1/2 - interpolated) + * values when halfpel generation + * for the entire plane is not enabled + */ + UWORD8 *pu1_half_y; + + /* + * Buffer for holding half_xy (1/2,1/2 - interpolated) + * values when halfpel generation + * for the entire plane is not enabled + * + */ + UWORD8 *pu1_half_xy; + + /* + * Buffer holding best sub pel values + */ + UWORD8 *pu1_best_subpel_buf; + + /* + * Stride for buffer holding best sub pel + */ + UWORD32 u4_bst_spel_buf_strd; + +} process_ctxt_t; + +/** + ****************************************************************************** + * @brief Rate control related variables + ****************************************************************************** + */ +typedef struct +{ + void *pps_rate_control_api; + + void *pps_frame_time; + + void *pps_time_stamp; + + void *pps_pd_frm_rate; + + /** + * frame rate pull down + */ + WORD32 pre_encode_skip[MAX_CTXT_SETS]; + + /** + * skip frame (cbr) + */ + WORD32 post_encode_skip[MAX_CTXT_SETS]; + + /** + * rate control type + */ + rc_type_e e_rc_type; + + /** + * pic type + */ + picture_type_e e_pic_type; + + /** + * intra cnt in previous frame + */ + WORD32 num_intra_in_prev_frame; + + /** + * avg activity of prev frame + */ + WORD32 i4_avg_activity; + +}rate_control_ctxt_t; + +/** + * Codec context + */ +struct _codec_t +{ + /** + * Number of coded pictures + */ + WORD32 i4_coded_pic_cnt; + + /** + * Number of encode frame API calls made + */ + WORD32 i4_encode_api_call_cnt; + + /** + * Number of pictures encoded + */ + WORD32 i4_pic_cnt; + + /** + * Number of threads
created + */ + WORD32 i4_proc_thread_cnt; + + /** + * Mutex used to keep the control calls thread-safe + */ + void *pv_ctl_mutex; + + /** + * Current active config parameters + */ + cfg_params_t s_cfg; + + /** + * Array containing the config parameter sets + */ + cfg_params_t as_cfg[MAX_ACTIVE_CONFIG_PARAMS]; + + /** + * Color format used by encoder internally + */ + IV_COLOR_FORMAT_T e_codec_color_format; + + /** + * source stride + * (strides for luma and chroma are the same) + */ + WORD32 i4_src_strd; + + /** + * recon stride + * (strides for luma and chroma are the same) + */ + WORD32 i4_rec_strd; + + /** + * Flag to enable/disable deblocking of a frame + */ + WORD32 i4_disable_deblk_pic; + + /** + * Number of continuous frames where deblocking was disabled + */ + WORD32 i4_disable_deblk_pic_cnt; + + /** + * frame type + */ + PIC_TYPE_T pic_type; + + /** + * frame qp + */ + UWORD32 u4_frame_qp; + + /** + * frame num + */ + WORD32 i4_frame_num; + + /** + * slice_type + */ + WORD32 i4_slice_type; + + /* + * Force current frame to specific type + */ + IV_PICTURE_CODING_TYPE_T force_curr_frame_type; + + /** + * IDR pic + */ + UWORD32 u4_is_idr; + + /** + * idr_pic_id + */ + WORD32 i4_idr_pic_id; + + /** + * Flush mode + */ + WORD32 i4_flush_mode; + + /** + * Encode header mode + */ + WORD32 i4_header_mode; + + /** + * Flag to indicate if header has already + * been generated when i4_api_call_cnt 0 + */ + UWORD32 u4_header_generated; + + /** + * Encode generate header + */ + WORD32 i4_gen_header; + + /** + * To signal successful completion of init + */ + WORD32 i4_init_done; + + /** + * To signal that at least one picture was decoded + */ + WORD32 i4_first_pic_done; + + /** + * Reset flag - Codec is reset if this flag is set + */ + WORD32 i4_reset_flag; + + /** + * Current error code + */ + WORD32 i4_error_code; + + /** + * threshold residue + */ + WORD32 u4_thres_resi; + + /** + * disable intra inter gating + */ + UWORD32 u4_inter_gate; + + /** + * Holds mem 
records passed during init. + * This will be used to return the mem records during retrieve call + */ + iv_mem_rec_t *ps_mem_rec_backup; + + /** + * Flag to determine if the entropy thread is active + */ + volatile UWORD32 au4_entropy_thread_active[MAX_CTXT_SETS]; + + /** + * Mutex used to keep the entropy calls thread-safe + */ + void *pv_entropy_mutex; + + /** + * Job queue buffer base + */ + void *pv_proc_jobq_buf, *pv_entropy_jobq_buf; + + /** + * Job Queue mem tab size + */ + WORD32 i4_proc_jobq_buf_size, i4_entropy_jobq_buf_size; + + /** + * Memory for MV Bank buffer manager + */ + void *pv_mv_buf_mgr_base; + + /** + * MV Bank buffer manager + */ + void *pv_mv_buf_mgr; + + /** + * Pointer to MV Buf structure array + */ + void *ps_mv_buf; + + /** + * Base address for Motion Vector bank buffer + */ + void *pv_mv_bank_buf_base; + + /** + * MV Bank size allocated + */ + WORD32 i4_total_mv_bank_size; + + /** + * Memory for Picture buffer manager for reference pictures + */ + void *pv_ref_buf_mgr_base; + + /** + * Picture buffer manager for reference pictures + */ + void *pv_ref_buf_mgr; + + /** + * Number of reference buffers added to the buffer manager + */ + WORD32 i4_ref_buf_cnt; + + /** + * Pointer to Pic Buf structure array + */ + void *ps_pic_buf; + + /** + * Base address for Picture buffer + */ + void *pv_pic_buf_base; + + /** + * Total pic buffer size allocated + */ + WORD32 i4_total_pic_buf_size; + + /** + * Memory for Buffer manager for output buffers + */ + void *pv_out_buf_mgr_base; + + /** + * Buffer manager for output buffers + */ + void *pv_out_buf_mgr; + + /** + * Current output buffer's buffer ID + */ + WORD32 i4_out_buf_id; + + /** + * Number of output buffers added to the buffer manager + */ + WORD32 i4_out_buf_cnt; + + /** + * Memory for Picture buffer manager for input buffers + */ + void *pv_inp_buf_mgr_base; + + /** + * Picture buffer manager for input buffers + */ + void *pv_inp_buf_mgr; + + /** + * Current input buffer's buffer ID + */ + 
WORD32 i4_inp_buf_id; + + /** + * Number of input buffers added to the buffer manager + */ + WORD32 i4_inp_buf_cnt; + + /** + * Current input buffer + */ + pic_buf_t *ps_inp_buf; + + /** + * Pointer to dpb manager structure + */ + void *pv_dpb_mgr; + + /** + * Pointer to base of Sequence parameter set structure array + */ + sps_t *ps_sps_base; + + /** + * Pointer to base of Picture parameter set structure array + */ + pps_t *ps_pps_base; + + /** + * seq_parameter_set_id + */ + WORD32 i4_sps_id; + + /** + * pic_parameter_set_id + */ + WORD32 i4_pps_id; + + /** + * Pointer to base of slice header structure array + */ + slice_header_t *ps_slice_hdr_base; + + /** + * packed residue coeff data size for 1 row of mbs + */ + UWORD32 u4_size_coeff_data; + + /** + * packed header data size for 1 row of mbs + */ + UWORD32 u4_size_header_data; + + /** + * Processing context - One for each processing thread + * Create two sets, each set used for alternate frames + */ + process_ctxt_t as_process[MAX_PROCESS_CTXT]; + + /** + * Thread handle for each of the processing threads + */ + void *apv_proc_thread_handle[MAX_PROCESS_THREADS]; + + /** + * Thread created flag for each of the processing threads + */ + WORD32 ai4_process_thread_created[MAX_PROCESS_THREADS]; + + /** + * Void pointer to process job context + */ + void *pv_proc_jobq, *pv_entropy_jobq; + + /** + * Number of MBs processed together for better instruction cache handling + */ + WORD32 i4_proc_nmb; + + /** + * Previous POC lsb + */ + WORD32 i4_prev_poc_lsb; + + /** + * Previous POC msb + */ + WORD32 i4_prev_poc_msb; + + /** + * Max POC lsb that has arrived till now + */ + WORD32 i4_max_prev_poc_lsb; + + /** + * Context for format conversion + */ + fmt_conv_t s_fmt_conv; + + /** + * Absolute pic order count + */ + WORD32 i4_abs_pic_order_cnt; + + /** + * Pic order count of lsb + */ + WORD32 i4_pic_order_cnt_lsb; + + /** + * Array giving current picture being processed in each context set + */ + WORD32 
ai4_pic_cnt[MAX_CTXT_SETS]; + + /* + * Min sad to search for + */ + UWORD32 u4_min_sad; + + /** + * Reference picture set + */ + ref_set_t as_ref_set[MAX_DPB_SIZE + MAX_CTXT_SETS]; + + /* + * Air pic cnt + * Contains the number of pictures that have been encoded with air + * This value is modulo air refresh period + */ + WORD32 i4_air_pic_cnt; + + /* + * Intra refresh map + * Stores the frames at which intra refresh should occur for a MB + */ + UWORD16 *pu2_intr_rfrsh_map; + + /* + * Alternate reference frames + * Indicates if the current frame is used as a reference frame + */ + UWORD32 u4_is_curr_frm_ref; + + /* + * Memory for color space conversion for luma plane + */ + UWORD8 *pu1_y_csc_buf_base; + + /* + * Memory for color space conversion for chroma plane + */ + UWORD8 *pu1_uv_csc_buf_base; + + /** + * Function pointers for intra pred leaf level functions luma + */ + pf_intra_pred apf_intra_pred_16_l[MAX_I16x16]; + pf_intra_pred apf_intra_pred_8_l[MAX_I8x8]; + pf_intra_pred apf_intra_pred_4_l[MAX_I4x4]; + + /** + * Function pointers for intra pred leaf level functions chroma + */ + pf_intra_pred apf_intra_pred_c[MAX_CH_I8x8]; + + /** + * luma core coding function pointers (array of 4) + */ + UWORD8 (*luma_energy_compaction[4])(process_ctxt_t *ps_proc); + + /** + * chroma core coding function pointers (array of 2) + */ + UWORD8 (*chroma_energy_compaction[2])(process_ctxt_t *ps_proc); + + /** + * forward transform for intra blk of mb type 16x16 + */ + ih264_luma_16x16_resi_trans_dctrans_quant_ft *pf_resi_trans_dctrans_quant_16x16; + + /** + * inverse transform for intra blk of mb type 16x16 + */ + ih264_luma_16x16_idctrans_iquant_itrans_recon_ft *pf_idctrans_iquant_itrans_recon_16x16; + + /** + * forward transform for 4x4 blk luma + */ + ih264_resi_trans_quant_ft *pf_resi_trans_quant_4x4; + + /** + * forward transform for 4x4 blk chroma + */ + ih264_resi_trans_quant_ft *pf_resi_trans_quant_chroma_4x4; + + /* + * hadamard transform and quant for a 4x4 block + */ + ih264_hadamard_quant_ft
*pf_hadamard_quant_4x4; + + /* + * hadamard transform and quant for a 2x2 chroma (uv) block + */ + ih264_hadamard_quant_ft *pf_hadamard_quant_2x2_uv; + + /** + * inverse transform for 4x4 blk + */ + ih264_iquant_itrans_recon_ft *pf_iquant_itrans_recon_4x4; + + /** + * inverse transform for chroma 4x4 blk + */ + ih264_iquant_itrans_recon_chroma_ft *pf_iquant_itrans_recon_chroma_4x4; + + /** + * inverse transform for 4x4 blk with only single dc coeff + */ + ih264_iquant_itrans_recon_ft *pf_iquant_itrans_recon_4x4_dc; + + /** + * inverse transform for chroma 4x4 blk with only single dc coeff + */ + ih264_iquant_itrans_recon_chroma_ft *pf_iquant_itrans_recon_chroma_4x4_dc; + + /* + * Inverse hadamard transform and iquant for a 4x4 block + */ + ih264_ihadamard_scaling_ft *pf_ihadamard_scaling_4x4; + + /* + * Inverse hadamard transform and iquant for a 2x2 chroma (uv) block + */ + ih264_ihadamard_scaling_ft *pf_ihadamard_scaling_2x2_uv; + + /* + * Function for interleave copy + */ + ih264_interleave_copy_ft *pf_interleave_copy; + + /** + * forward transform for 8x8 blk + */ + ih264_resi_trans_quant_ft *pf_resi_trans_quant_8x8; + + /** + * inverse transform for 8x8 blk + */ + /** + * (8x8 blk; pf_iquant_itrans_recon_4x4 above is the 4x4 variant) + */ + ih264_iquant_itrans_recon_ft *pf_iquant_itrans_recon_8x8; + + /** + * forward transform for chroma MB + */ + ih264_chroma_8x8_resi_trans_dctrans_quant_ft *pf_resi_trans_dctrans_quant_8x8_chroma; + + /** + * inverse transform for chroma MB + */ + ih264_idctrans_iquant_itrans_recon_ft *pf_idctrans_iquant_itrans_recon_8x8_chroma; + + /** + * deblock vertical luma edge with blocking strength 4 + */ + ih264_deblk_edge_bs4_ft *pf_deblk_luma_vert_bs4; + + /** + * deblock vertical chroma edge with blocking strength 4 + */ + ih264_deblk_chroma_edge_bs4_ft *pf_deblk_chroma_vert_bs4; + + /** + * deblock vertical luma edge with blocking strength less than 4 + */ + ih264_deblk_edge_bslt4_ft *pf_deblk_luma_vert_bslt4; + + /** + * deblock vertical chroma edge with blocking strength less than 4
+ */ + ih264_deblk_chroma_edge_bslt4_ft *pf_deblk_chroma_vert_bslt4; + + /** + * deblock horizontal luma edge with blocking strength 4 + */ + ih264_deblk_edge_bs4_ft *pf_deblk_luma_horz_bs4; + + /** + * deblock horizontal chroma edge with blocking strength 4 + */ + ih264_deblk_chroma_edge_bs4_ft *pf_deblk_chroma_horz_bs4; + + /** + * deblock horizontal luma edge with blocking strength less than 4 + */ + ih264_deblk_edge_bslt4_ft *pf_deblk_luma_horz_bslt4; + + /** + * deblock horizontal chroma edge with blocking strength less than 4 + */ + ih264_deblk_chroma_edge_bslt4_ft *pf_deblk_chroma_horz_bslt4; + + + /** + * functions for padding + */ + pf_pad pf_pad_top; + pf_pad pf_pad_bottom; + pf_pad pf_pad_left_luma; + pf_pad pf_pad_left_chroma; + pf_pad pf_pad_right_luma; + pf_pad pf_pad_right_chroma; + + /** + * Inter pred leaf level functions + */ + ih264_inter_pred_luma_ft *pf_inter_pred_luma_copy; + ih264_inter_pred_luma_ft *pf_inter_pred_luma_horz; + ih264_inter_pred_luma_ft *pf_inter_pred_luma_vert; + pf_inter_pred_luma_bilinear pf_inter_pred_luma_bilinear; + ih264_inter_pred_chroma_ft *pf_inter_pred_chroma; + + /** + * fn ptrs for compute sad routines + */ + ime_compute_sad_ft *apf_compute_sad_16x16[2]; + ime_compute_sad_ft *pf_compute_sad_16x8; + + /** + * fn ptrs for memory handling operations + */ + pf_memcpy pf_mem_cpy; + pf_memset pf_mem_set; + pf_memcpy_mul8 pf_mem_cpy_mul8; + pf_memset_mul8 pf_mem_set_mul8; + + /** + * intra mode eval -encoder level function + */ + pf_evaluate_intra_modes pf_ih264e_evaluate_intra16x16_modes; + pf_evaluate_intra_modes pf_ih264e_evaluate_intra_chroma_modes; + pf_evaluate_intra_4x4_modes pf_ih264e_evaluate_intra_4x4_modes; + + /* Half pel generation function - encoder level + * + */ + pf_sixtapfilter_horz pf_ih264e_sixtapfilter_horz; + pf_sixtap_filter_2dvh_vert pf_ih264e_sixtap_filter_2dvh_vert; + + /** + * color space conversion form YUV 420P to YUV 420Sp + */ + pf_fmt_conv_420p_to_420sp pf_ih264e_conv_420p_to_420sp; + + + 
/** + * color space conversion form YUV 420P to YUV 420Sp + */ + pf_fmt_conv_422ile_to_420sp pf_ih264e_fmt_conv_422i_to_420sp; + + /** + * write mb layer for a given slice I, P, B + */ + IH264E_ERROR_T (*pf_write_mb_syntax_layer[3]) ( entropy_ctxt_t *ps_ent_ctxt ); + + + /** + * Output buffer + */ + out_buf_t as_out_buf[MAX_CTXT_SETS]; + + /** + * recon buffer + */ + rec_buf_t as_rec_buf[MAX_CTXT_SETS]; + + /** + * rate control context + */ + rate_control_ctxt_t s_rate_control; +}; +#endif /* IH264E_STRUCTS_H_ */ diff --git a/encoder/ih264e_time_stamp.c b/encoder/ih264e_time_stamp.c new file mode 100755 index 0000000..a6a7f3c --- /dev/null +++ b/encoder/ih264e_time_stamp.c @@ -0,0 +1,748 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_time_stamp.c +* +* @brief +* This file contains functions used for source and target time stamp management +* +* @author +* ittiam +* +* @par List of Functions: +* - gcd() +* - ih264e_get_range() +* - ih264e_frame_time_get_init_free_memtab() +* - ih264e_init_frame_time() +* - ih264e_should_src_be_skipped() +* - ih264e_time_stamp_get_init_free_memtab() +* - ih264e_init_time_stamp() +* - ih264e_update_time_stamp() +* - ih264e_frame_time_get_src_frame_rate() +* - ih264e_frame_time_get_tgt_frame_rate() +* - ih264e_frame_time_get_src_ticks() +* - ih264e_frame_time_get_tgt_ticks() +* - ih264e_frame_time_get_src_time() +* - ih264e_frame_time_get_tgt_time() +* - ih264e_frame_time_update_src_frame_rate() +* - ih264e_frame_time_update_tgt_frame_rate() +* - ih264_time_stamp_update_frame_rate() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* user include files */ +#include "irc_datatypes.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ih264_defs.h" +#include "ih264e_defs.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_structs.h" +#include "ih264e_rc_mem_interface.h" +#include "ih264e_time_stamp.h" +#include "irc_rate_control_api.h" + + 
+/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief Function to compute gcd of two numbers +* +* @par Description +* Function to compute gcd of two numbers +* +* @param[in] i4_x +* value 1 +* +* @param[in] i4_y +* value 2 +* +* @returns +* GCD(value 1, value 2) +* +* @remarks none +* +******************************************************************************* +*/ +static WORD32 gcd(WORD32 i4_x, WORD32 i4_y) +{ + if (i4_x > i4_y) + { + i4_x = i4_y + i4_x; + i4_y = i4_x - i4_y; + i4_x = i4_x - i4_y; + } + while (i4_y != 0) + { + WORD32 temp; + i4_x = i4_x % i4_y; + temp = i4_x; + i4_x = i4_y; + i4_y = temp; + } + return (i4_x); +} + +/** +******************************************************************************* +* +* @brief Function to determine number of bits required to represent a given +* value +* +* @par Description +* This function determines the number of bits required to represent the given +* value. It is used to find out number of bits to read when the data size is +* not fixed (e.g. vop_time_increment_resolution). 
+* +* @param[in] u4_value +* Value for which the number of bits required to represent is to be determined +* +* @param[in] u1_no_of_bits +* Represents the value's word type = 8/16/32 +* +* @returns +* The number of bits required to represent the given number +* +* @remarks none +* +******************************************************************************* +*/ +static UWORD8 ih264e_get_range(UWORD32 u4_value, UWORD8 u1_no_of_bits) +{ + UWORD8 count; + UWORD32 temp; + + if (u4_value > (UWORD32) ((1 << (u1_no_of_bits >> 1)) - 1)) + { + temp = (1 << (u1_no_of_bits - 1)); + for (count = 0; count < (u1_no_of_bits >> 1); count++) + { + if ((temp & u4_value) != 0) + { + return (UWORD8) (u1_no_of_bits - count); + } + else + { + temp >>= 1; + } + } + return 0; + } + else + { + temp = (1 << ((u1_no_of_bits >> 1) - 1)); + for (count = 0; count < ((u1_no_of_bits >> 1) - 1); count++) + { + if ((temp & u4_value) != 0) + { + return (UWORD8) ((u1_no_of_bits >> 1) - count); + } + else + { + temp >>= 1; + } + } + return 1; + } +} + +/** +******************************************************************************* +* +* @brief +* Function to init frame time memtabs +* +* @par Description +* Function to init frame time memtabs +* +* @param[in] pps_frame_time +* Pointer to frame time contexts +* +* @param[in] ps_memtab +* Pointer to memtab +* +* @param[in] e_func_type +* Function type (get memtabs/init memtabs) +* +* @returns +* none +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_init_free_memtab(frame_time_handle *pps_frame_time, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type) +{ + WORD32 i4_mem_tab_idx = 0; + static frame_time_t s_temp_frame_time_t; + + /* Hack for al alloc, during which we dont have any state memory. 
+ Dereferencing can cause issues */ + if (e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB) + (*pps_frame_time) = &s_temp_frame_time_t; + + /* for src rate control state structure */ + if (e_func_type != GET_NUM_MEMTAB) + { + fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(frame_time_t), + ALIGN_128_BYTE, PERSISTENT, DDR); + use_or_fill_base(&ps_memtab[0], (void**) pps_frame_time, e_func_type); + } + i4_mem_tab_idx++; + + return (i4_mem_tab_idx); +} + +/** +******************************************************************************* +* +* @brief +* Function to init frame time context +* +* @par Description +* Frame time structure stores the time of the source and the target frames to +* be encoded. Based on the time we decide whether or not to encode the source +* frame +* +* @param[in] ps_frame_time +* Pointer Frame time context +* +* @param[in] u4_src_frm_rate +* Source frame rate +* +* @param[in] u4_tgt_frm_rate +* Target frame rate +* +* @returns +* none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_init_frame_time(frame_time_t *ps_frame_time, + UWORD32 u4_src_frm_rate, + UWORD32 u4_tgt_frm_rate) +{ + /* Initialise the common time base based on which the source and target + * frame times increase */ + WORD32 i4_gcd = gcd(u4_src_frm_rate, u4_tgt_frm_rate); + + ps_frame_time->common_time_base = (u4_src_frm_rate * u4_tgt_frm_rate) + / i4_gcd; + + /* The source and target increment per vop is initialized */ + ps_frame_time->u4_src_frm_time_incr = ps_frame_time->common_time_base + / u4_src_frm_rate; + ps_frame_time->u4_tgt_frm_time_incr = ps_frame_time->common_time_base + / u4_tgt_frm_rate; + + /* Initialise the source and target times to 0 (RESET) */ + ps_frame_time->u4_src_frm_time = 0; + ps_frame_time->u4_tgt_frm_time = 0; + + /* Initialize the number of frms not to be skipped to 0 */ + ps_frame_time->u4_num_frms_dont_skip = 0; +} + +/** 
+******************************************************************************* +* +* @brief +* Function to check if frame can be skipped +* +* @par Description +* Based on the source and target frame time and the delta time stamp +* we decide whether to code the source or not. +* This is based on the assumption +* that the source frame rate is greater that target frame rate. +* Updates the time_stamp structure +* +* @param[in] ps_frame_time +* Handle to frame time context +* +* @param[in] u4_delta_time_stamp +* Time stamp difference between frames +* +* @param[out] pu4_frm_not_skipped_for_dts +* Flag to indicate if frame is already skipped by application +* +* @returns +* Flag to skip frame +* +* @remarks +* +******************************************************************************* +*/ +UWORD8 ih264e_should_src_be_skipped(frame_time_t *ps_frame_time, + UWORD32 u4_delta_time_stamp, + UWORD32 *pu4_frm_not_skipped_for_dts) +{ + UWORD8 skip_src = 0; + + if (ps_frame_time->u4_tgt_frm_time > ps_frame_time->u4_src_frm_time && + ps_frame_time->u4_tgt_frm_time >= (ps_frame_time->u4_src_frm_time + + ps_frame_time->u4_src_frm_time_incr)) + { + skip_src = 1; + } + + /* source time gets updated every frame */ + ps_frame_time->u4_src_frm_time += ps_frame_time->u4_src_frm_time_incr; + + /* target time gets updated only when the source is coded */ + if (!skip_src) + { + ps_frame_time->u4_tgt_frm_time += ps_frame_time->u4_tgt_frm_time_incr; + } + + /* If the source and target frame times get incremented properly + both should be equal to the common time base at the same time. 
If + that happens we reset the time to zero*/ + if (( ps_frame_time->common_time_base ==(WORD32)ps_frame_time->u4_src_frm_time) + && (ps_frame_time->common_time_base ==(WORD32) ps_frame_time->u4_tgt_frm_time )) + { + ps_frame_time->u4_src_frm_time = 0; + ps_frame_time->u4_tgt_frm_time = 0; + } + + /* This keeps a count of how many frames need not be skipped in order + to take care of the delta time stamp */ + ps_frame_time->u4_num_frms_dont_skip += (u4_delta_time_stamp - 1); + + /** If this frame is to be skipped in order to maintain the tgt_frm_rate + check if already a frame has been skipped by the application. + In that case, do not skip this frame **/ + if (ps_frame_time->u4_num_frms_dont_skip && skip_src) + { + skip_src = 0; + *pu4_frm_not_skipped_for_dts = 1; + ps_frame_time->u4_num_frms_dont_skip -= 1; + } + else + { + pu4_frm_not_skipped_for_dts[0] = 0; + } + + return (skip_src); +} + +/** +******************************************************************************* +* +* @brief +* Function to inititialize time stamp memtabs +* +* @par Description +* Function to initialize time stamp memtabs +* +* @param[in] pps_time_stamp +* Pointer to time stamp context +* +* @param[in] ps_memtab +* Pointer to memtab +* +* @param[in] e_func_type +* Funcion type (Get memtab/ init memtab) +* +* @returns +* number of memtabs used +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_time_stamp_get_init_free_memtab(time_stamp_handle *pps_time_stamp, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type) +{ + WORD32 i4_mem_tab_idx = 0; + static time_stamp_t s_temp_time_stamp_t; + + /* Hack for al alloc, during which we dont have any state memory. 
+ Dereferencing can cause issues */ + if (e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB) + (*pps_time_stamp) = &s_temp_time_stamp_t; + + /* for src rate control state structure */ + if (e_func_type != GET_NUM_MEMTAB) + { + fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(time_stamp_t), + ALIGN_128_BYTE, PERSISTENT, DDR); + use_or_fill_base(&ps_memtab[0], (void**) pps_time_stamp, e_func_type); + } + i4_mem_tab_idx++; + + return (i4_mem_tab_idx); +} + +/** +******************************************************************************* +* +* @brief +* Function to initialize time stamp context +* +* @par Description +* Time stamp structure stores the time stamp data that +* needs to be sent in to the header of MPEG4. Based on the +* max target frame rate the vop_time increment resolution is set +* so as to support all the frame rates below max frame rate. +* A support till the third decimal point is assumed. +* +* @param[in] ps_time_stamp +* Pointer to time stamp structure +* +* @param[in] u4_max_frm_rate +* Maximum frame rate +* +* @param[in] u4_src_frm_rate +* Source frame rate +* +* @returns +* none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_init_time_stamp(time_stamp_t *ps_time_stamp, + UWORD32 u4_max_frm_rate, + UWORD32 u4_src_frm_rate) +{ + /* We expect the max frame rate to be less than 60000, + * if not we divide it by zero and work with it */ + if (u4_max_frm_rate > 60000) + { + u4_max_frm_rate >>= 1; + ps_time_stamp->is_max_frame_rate_scaled = 1; + } + else + { + ps_time_stamp->is_max_frame_rate_scaled = 0; + } + + ps_time_stamp->u4_vop_time_incr_res = u4_max_frm_rate; + ps_time_stamp->u4_vop_time_incr_range = ih264e_get_range(u4_max_frm_rate, 32); + ps_time_stamp->u4_vop_time_incr = (ps_time_stamp->u4_vop_time_incr_res * 1000) / u4_src_frm_rate;/* Since frm rate is in millisec */ + ps_time_stamp->u4_vop_time = 0; + ps_time_stamp->u4_cur_tgt_vop_time = 0; + 
ps_time_stamp->u4_prev_tgt_vop_time = 0; +} + +/** +******************************************************************************* +* +* @brief Function to update time stamp context +* +* @par Description +* Vop time is incremented by increment value. When vop time goes +* more than the vop time resolution set the modulo time base to +* 1 and reduce the vop time by vop time resolution so that the +* excess value is present in vop time and get accumulated over time +* so that the corresponding frame rate is achieved at a average of +* 1000 seconds +* +* @param[in] ps_time_stamp +* Pointer to time stamp structure +* +* @returns +* none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_update_time_stamp(time_stamp_t *ps_time_stamp) +{ + /* Since get time stamp is called after the update + A copy of the vop time and the modulo time is stored */ + ps_time_stamp->u4_cur_tgt_vop_time = ps_time_stamp->u4_vop_time; + + ps_time_stamp->u4_vop_time += ps_time_stamp->u4_vop_time_incr; + if (ps_time_stamp->u4_vop_time >= ps_time_stamp->u4_vop_time_incr_res) + { + ps_time_stamp->u4_vop_time -= ps_time_stamp->u4_vop_time_incr_res; + } +} + +/**************************************************************************** + Run-Time Modifying functions +****************************************************************************/ + +/** +******************************************************************************* +* +* @brief Function to get source frame rate +* +* @par Description +* Function to get source frame rate +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* source frame rate +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_src_frame_rate(frame_time_t *ps_frame_time) +{ + return (ps_frame_time->common_time_base / ps_frame_time->u4_src_frm_time_incr); +} + +/** 
+******************************************************************************* +* +* @brief Function to get target frame rate +* +* @par Description +* Function to get target frame rate +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* target frame rate +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_tgt_frame_rate(frame_time_t *ps_frame_time) +{ + return (ps_frame_time->common_time_base / ps_frame_time->u4_tgt_frm_time_incr); +} + +/** +******************************************************************************* +* +* @brief Function to get source time increment +* +* @par Description +* Returns u4_src_frm_time_incr of the frame time context, i.e. the source +* time increment applied for every source frame +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* source time increment +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_src_ticks(frame_time_t *ps_frame_time) +{ + return (ps_frame_time->u4_src_frm_time_incr); +} + +/** +******************************************************************************* +* +* @brief Function to get target time increment +* +* @par Description +* Returns u4_tgt_frm_time_incr of the frame time context, i.e. the target +* time increment applied for every coded frame +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* target time increment +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_tgt_ticks(frame_time_t *ps_frame_time) +{ + return (ps_frame_time->u4_tgt_frm_time_incr); +} + +/** +******************************************************************************* +* +* @brief Function to get src frame time +* +* @par Description +* Returns u4_src_frm_time, the current source frame time of the context +* +* @param[in] frame_time +* Pointer to frame time context +* +* @returns +* src frame time +* +* @remarks +* 
+******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_src_time(frame_time_t *frame_time) +{ + return (frame_time->u4_src_frm_time); +} + +/** +******************************************************************************* +* +* @brief Function to get tgt frame time +* +* @par Description +* Returns u4_tgt_frm_time, the current target frame time of the context +* +* @param[in] frame_time +* Pointer to frame time context +* +* @returns +* tgt frame time +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_tgt_time(frame_time_t *frame_time) +{ + return (frame_time->u4_tgt_frm_time); +} + +/** +******************************************************************************* +* +* @brief Function to update source frame time with a new source frame rate +* +* @par Description +* Derives the current target frame rate from the context and re-initialises +* the context with the new source frame rate and that target frame rate +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @param[in] src_frm_rate +* source frame rate +* +* @returns +* None +* +* @remarks +* The context is re-initialised through ih264e_init_frame_time(), so the +* source and target frame times and the dont-skip count restart from zero +* +******************************************************************************* +*/ +void ih264e_frame_time_update_src_frame_rate(frame_time_t *ps_frame_time, + WORD32 src_frm_rate) +{ + /* Since tgt frame rate does not change deriving the tgt_frm rate from + * common_time_base */ + WORD32 tgt_frm_rate = ps_frame_time->common_time_base / ps_frame_time->u4_tgt_frm_time_incr; + + /* Re-initialise frame_time based on the new src_frame_rate and + * old tgt_frame_rate */ + ih264e_init_frame_time(ps_frame_time, src_frm_rate, tgt_frm_rate); +} + +/** +******************************************************************************* +* +* @brief Function to update target frame time with a new target frame rate +* +* @par Description +* Derives the current source frame rate from the context and re-initialises +* the context with that source frame rate and the new target frame rate +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @param[in] 
tgt_frm_rate +* target frame rate +* +* @returns +* None +* +* @remarks +* The context is re-initialised through ih264e_init_frame_time(), so the +* source and target frame times and the dont-skip count restart from zero +* +******************************************************************************* +*/ +void ih264e_frame_time_update_tgt_frame_rate(frame_time_t *ps_frame_time, + WORD32 tgt_frm_rate) +{ + /* Since src frame rate does not change deriving the src_frm rate from + * common_time_base */ + WORD32 src_frm_rate = ps_frame_time->common_time_base / ps_frame_time->u4_src_frm_time_incr; + + /* Re-initialise frame_time based on the new tgt_frame_rate and + * old src_frame_rate */ + ih264e_init_frame_time(ps_frame_time, src_frm_rate, tgt_frm_rate); +} + +/** +******************************************************************************* +* +* @brief Function to update the vop time increment for a new source frame rate +* +* @par Description +* When the frame rate changes the time increment is modified by appropriate ticks: +* u4_vop_time_incr is recomputed as (u4_vop_time_incr_res * 1000) / src_frm_rate +* +* @param[in] ps_time_stamp +* Pointer to time stamp structure +* +* @param[in] src_frm_rate +* source frame rate (used as a divisor) +* +* @returns +* None +* +* @remarks +* +******************************************************************************* +*/ +void ih264_time_stamp_update_frame_rate(time_stamp_t *ps_time_stamp, + UWORD32 src_frm_rate) +{ + ps_time_stamp->u4_vop_time_incr = (ps_time_stamp->u4_vop_time_incr_res * 1000) / src_frm_rate;/* Since frm rate is in millisec */ +} diff --git a/encoder/ih264e_time_stamp.h b/encoder/ih264e_time_stamp.h new file mode 100755 index 0000000..1ee559d --- /dev/null +++ b/encoder/ih264e_time_stamp.h @@ -0,0 +1,498 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_time_stamp.h +* +* @brief +* This file contains function declarations used for managing input and output +* frame time stamps +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_TIME_STAMP_H_ +#define IH264E_TIME_STAMP_H_ + +/*****************************************************************************/ +/* Structures */ +/*****************************************************************************/ + +/** + * Parameters for Src/Tgt frames that are encoded. + * Holds the source and target frame times used to decide whether a source + * frame is coded or skipped; set up by ih264e_init_frame_time() + */ +typedef struct frame_time_t +{ + /* common time base (= LCM of the source and target frame rates), in ticks */ + WORD32 common_time_base; + + /* number of ticks between two source frames */ + UWORD32 u4_src_frm_time_incr; + + /* number of ticks between two target frames */ + UWORD32 u4_tgt_frm_time_incr; + + /* Source frame time - measured as modulo of common time base + and incremented by src_frm_time_incr for every source frame */ + UWORD32 u4_src_frm_time; + + /* Target frame time - measured as modulo of common time base + and incremented by tgt_frm_time_incr only when a source frame is coded */ + UWORD32 u4_tgt_frm_time; + + /* Number of frames not to be skipped while maintaining + tgt_frm_rate due to delta_time_stamp; grows by (delta_time_stamp - 1) on + every skip check and drops by one each time a skip is cancelled */ + UWORD32 u4_num_frms_dont_skip; +}frame_time_t; + 
+typedef struct frame_time_t *frame_time_handle; + +/** + * Parameters that go in the bitstream based on tgt_frm_rate + * 1) Initialize the vop_time_incr_res with the max_frame_rate (in frames per 1000 seconds; + * NOTE(review): the original text said "per 1000 bits" - confirm the unit) + * - To represent all kinds of frame rates + * 2) Decide the vop_time_incr based on the source frame rate + * - The decoder would like to know which source frame is encoded i.e. the source time + * id of the target frame encoded and there by adjusting its time of delay + * 3) vop_time increments every source frame and whenever a frame is encoded (target frame), + * the encoder queries the vop time of the source frame and sends it in the bit stream. + * 4) Since the Source frame skip logic is taken care by the frame_time module, whenever the + * encoder queries the time stamp module (which gets updated outside the encoder) the + * time stamp module would have the source time + */ +typedef struct time_stamp_t +{ + /* vop_time_incr_res is an integer that indicates + the number of evenly spaced subintervals, called ticks, + within one modulo time. */ + UWORD32 u4_vop_time_incr_res; + + /* number of bits to represent vop_time_incr_res */ + UWORD32 u4_vop_time_incr_range; + + /* The number of ticks elapsed between two source vops */ + UWORD32 u4_vop_time_incr; + + /* incremented by vop_time_incr for every source frame and wrapped + when it reaches vop_time_incr_res. + Represents the time offset after a modulo_time_base = 1 is sent + in bit stream*/ + UWORD32 u4_vop_time; + + /* Copy of the vop time taken by ih264e_update_time_stamp() just before + it advances u4_vop_time. It is kept because update is called before + query (get time stamp) */ + UWORD32 u4_cur_tgt_vop_time; + + /* NOTE(review): only reset to 0 by ih264e_init_time_stamp() in ih264e_time_stamp.c - confirm it is still used */ UWORD32 u4_prev_tgt_vop_time; + + /* This variable is set to 1 if we scale max frame rate by a factor of 2. + For mpeg4 standard, we just have 16bits and we can't accommodate more than 60000 as frame rate. 
+ So we scale it and work with it */ + WORD32 is_max_frame_rate_scaled; +} time_stamp_t; + +typedef struct time_stamp_t *time_stamp_handle; + +/*****************************************************************************/ +/* Extern function declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* Function to init frame time context +* +* @par Description +* Frame time structure stores the time of the source and the target frames to +* be encoded. Based on the time we decide whether or not to encode the source +* frame +* +* @param[in] ps_frame_time +* Pointer Frame time context +* +* @param[in] u4_src_frm_rate +* Source frame rate +* +* @param[in] u4_tgt_frm_rate +* Target frame rate +* +* @returns +* none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_init_frame_time(frame_time_t *ps_frame_time, + UWORD32 u4_src_frm_rate, + UWORD32 u4_tgt_frm_rate); + +/** +******************************************************************************* +* +* @brief +* Function to check if frame can be skipped +* +* @par Description +* Based on the source and target frame time and the delta time stamp +* we decide whether to code the source or not. +* This is based on the assumption +* that the source frame rate is greater that target frame rate. 
+* Updates the time_stamp structure +* +* @param[in] ps_frame_time +* Handle to frame time context +* +* @param[in] u4_delta_time_stamp +* Time stamp difference between frames +* +* @param[out] pu4_frm_not_skipped_for_dts +* Flag to indicate if frame is already skipped by application +* +* @returns +* Flag to skip frame +* +* @remarks +* +******************************************************************************* +*/ +UWORD8 ih264e_should_src_be_skipped(frame_time_t *ps_frame_time, + UWORD32 u4_delta_time_stamp, + UWORD32 *pu4_frm_not_skipped_for_dts); + +/** +******************************************************************************* +* +* @brief +* Function to initialize time stamp context +* +* @par Description +* Time stamp structure stores the time stamp data that +* needs to be sent in to the header of MPEG4. Based on the +* max target frame rate the vop_time increment resolution is set +* so as to support all the frame rates below max frame rate. +* A support till the third decimal point is assumed. +* +* @param[in] ps_time_stamp +* Pointer to time stamp structure +* +* @param[in] u4_max_frm_rate +* Maximum frame rate +* +* @param[in] u4_src_frm_rate +* Source frame rate +* +* @returns +* none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_init_time_stamp(time_stamp_handle time_stamp, + UWORD32 max_frm_rate, + UWORD32 src_frm_rate); + +/** +******************************************************************************* +* +* @brief Function to update time stamp context +* +* @par Description +* Vop time is incremented by increment value. 
When vop time goes +* more than the vop time resolution set the modulo time base to +* 1 and reduce the vop time by vop time resolution so that the +* excess value is present in vop time and get accumulated over time +* so that the corresponding frame rate is achieved at a average of +* 1000 seconds +* +* @param[in] ps_time_stamp +* Pointer to time stamp structure +* +* @returns +* none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_update_time_stamp(time_stamp_handle time_stamp); + +/** +******************************************************************************* +* +* @brief +* Function to init frame time memtabs +* +* @par Description +* Function to init frame time memtabs +* +* @param[in] pps_frame_time +* Pointer to frame time contexts +* +* @param[in] ps_memtab +* Pointer to memtab +* +* @param[in] e_func_type +* Function type (get memtabs/init memtabs) +* +* @returns +* none +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_init_free_memtab(frame_time_handle *pps_frame_time, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type); + +/** +******************************************************************************* +* +* @brief +* Function to initialize time stamp memtabs +* +* @par Description +* Function to initialize time stamp memtabs +* +* @param[in] pps_time_stamp +* Pointer to time stamp context +* +* @param[in] ps_memtab +* Pointer to memtab +* +* @param[in] e_func_type +* Funcion type (Get memtab/ init memtab) +* +* @returns +* number of memtabs used +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_time_stamp_get_init_free_memtab(time_stamp_handle *pps_time_stamp, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type); + +/**************************************************************************** + Run-Time Modifying 
functions +****************************************************************************/ +/** +******************************************************************************* +* +* @brief Function to get source frame rate +* +* @par Description +* Function to get source frame rate +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* source frame rate +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_src_frame_rate(frame_time_t *ps_frame_time); + +/** +******************************************************************************* +* +* @brief Function to get target frame rate +* +* @par Description +* Function to get target frame rate +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* target frame rate +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_tgt_frame_rate(frame_time_t *ps_frame_time); + +/** +******************************************************************************* +* +* @brief Function to get source time increment +* +* @par Description +* Function to get source time increment +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* source time increment +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_src_ticks(frame_time_t *ps_frame_time); + +/** +******************************************************************************* +* +* @brief Function to get target time increment +* +* @par Description +* Function to get target time increment +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* target time increment +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_tgt_ticks(frame_time_t *ps_frame_time); + +/** 
+******************************************************************************* +* +* @brief Function to get src frame time +* +* @par Description +* Function to get src frame time +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* src frame time +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_src_time(frame_time_t *frame_time); + +/** +******************************************************************************* +* +* @brief Function to get tgt frame time +* +* @par Description +* Function to get tgt frame time +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @returns +* tgt frame time +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_frame_time_get_tgt_time(frame_time_t *frame_time); + +/** +******************************************************************************* +* +* @brief Function to update source frame time with a new source frame rate +* +* @par Description +* Function to update source frame time with a new source frame rate +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @param[in] src_frm_rate +* source frame rate +* +* @returns +* None +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_frame_time_update_src_frame_rate(frame_time_t *ps_frame_time, WORD32 src_frm_rate); + +/** +******************************************************************************* +* +* @brief Function to update target frame time with a new source frame rate +* +* @par Description +* Function to update target frame time with a new source frame rate +* +* @param[in] ps_frame_time +* Pointer to frame time context +* +* @param[in] tgt_frm_rate +* target frame rate +* +* @returns +* None +* +* @remarks +* +******************************************************************************* +*/ 
+void ih264e_frame_time_update_tgt_frame_rate(frame_time_t *ps_frame_time, WORD32 tgt_frm_rate); + +/** +******************************************************************************* +* +* @brief Function to update target frame time with a new source frame rate +* +* @par Description +* When the frame rate changes the time increment is modified by appropriate ticks +* +* @param[in] ps_time_stamp +* Pointer to time stamp structure +* +* @param[in] src_frm_rate +* source frame rate +* +* @returns +* None +* +* @remarks +* +******************************************************************************* +*/ +void ih264_time_stamp_update_frame_rate(time_stamp_t *ps_time_stamp, UWORD32 src_frm_rate); + +#endif /*IH264E_TIME_STAMP_H_*/ + diff --git a/encoder/ih264e_trace.h b/encoder/ih264e_trace.h new file mode 100755 index 0000000..8134524 --- /dev/null +++ b/encoder/ih264e_trace.h @@ -0,0 +1,161 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file +* ih264e_trace.h +* +* @brief +* This file contains extern declarations of routines that could be helpful +* for debugging purposes. +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_TRACE_H_ +#define IH264E_TRACE_H_ + +#if ENABLE_TRACE +/*****************************************************************************/ +/* Structures */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief Data for the trace functionality +****************************************************************************** + */ +typedef struct +{ + /** + * fp + */ + FILE *fp; +}enc_trace_t; + +/*****************************************************************************/ +/* Extern variable declarations */ +/*****************************************************************************/ +extern enc_trace_t g_enc_trace; + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief defines flag used for enabling trace +****************************************************************************** + */ + + +/*****************************************************************************/ +/* Function Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief Macro to print trace messages +****************************************************************************** + */ + +#define ENTROPY_TRACE(syntax_string, 
value) \ + { \ + if(g_enc_trace.fp) \ + { \ + fprintf( g_enc_trace.fp, "%-40s : %d\n", syntax_string, value ); \ + fflush ( g_enc_trace.fp); \ + } \ + } + + +/** +****************************************************************************** + * @brief Macro to print CABAC trace messages +****************************************************************************** + */ + +#define AEV_TRACE(string, value, range) \ + if(range && g_enc_trace.fp) \ + { \ + fprintf( g_enc_trace.fp, "%-40s:%8d R:%d\n", string, value, range); \ + fflush ( g_enc_trace.fp); \ + } + +#else + +/* Dummy macros when trace is disabled */ +#define ENTROPY_TRACE(syntax_string, value) + +#define AEV_TRACE(string, value, range) + +#endif + + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + + +/** +****************************************************************************** +* +* @brief Dummy trace init when trace is disabled in encoder +* +* @par Description +* This routine needs to be called at start of trace +* +* @param[in] pu1_file_name +* Name of file where trace outputs need to be stores (handle) +* +* @return success or failure error code +* +****************************************************************************** +*/ +extern WORD32 ih264e_trace_init + ( + const char *pu1_file_name + ); + +/** +****************************************************************************** +* +* @brief Dummy trace de-init function when trace is disabled +* +* @par Description +* This routine needs to be called at end of trace +* +* @return success or failure error code +* +****************************************************************************** +*/ +extern WORD32 ih264e_trace_deinit + ( + void + ); + +#endif // IH264E_TRACE_H_ diff --git a/encoder/ih264e_trace_support.h b/encoder/ih264e_trace_support.h new file mode 100755 index 0000000..c35bd4f 
--- /dev/null +++ b/encoder/ih264e_trace_support.h @@ -0,0 +1,61 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_trace_support.h +* +* @brief +* This file contains extern declarations of routines that could be helpful +* for debugging purposes. 
+* +* @author +* Harish +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef TRACE_SUPPORT_H_ +#define TRACE_SUPPORT_H_ + +/*****************************************************************************/ +/* Structures */ +/*****************************************************************************/ + +typedef struct +{ + WORD8 * pu1_buf; + WORD32 i4_offset; + WORD32 i4_max_size; +}trace_support_t; + +/*****************************************************************************/ +/* Extern function declarations */ +/*****************************************************************************/ + +void init_trace_support(WORD8 *pu1_buf, WORD32 i4_size); + +int trace_printf(const WORD8 *format, ...); + +#endif // TRACE_SUPPORT_H_ diff --git a/encoder/ih264e_utils.c b/encoder/ih264e_utils.c new file mode 100755 index 0000000..f0086cb --- /dev/null +++ b/encoder/ih264e_utils.c @@ -0,0 +1,1804 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_utils.c +* +* @brief +* Contains miscellaneous utility functions used by the encoder +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_get_min_level() +* - ih264e_get_lvl_idx() +* - ih264e_get_dpb_size() +* - ih264e_get_total_pic_buf_size() +* - ih264e_get_pic_mv_bank_size() +* - ih264e_pic_buf_mgr_add_bufs() +* - ih264e_mv_buf_mgr_add_bufs() +* - ih264e_init_quant_params() +* - ih264e_init_air_map() +* - ih264e_codec_init() +* - ih264e_pic_init() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* system include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +/* user include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264e.h" +#include "ithread.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264_macros.h" +#include "ih264_common_tables.h" +#include "ih264_debug.h" +#include "ih264_trans_data.h" +#include "ih264e_defs.h" +#include "ih264e_globals.h" +#include "ih264_buf_mgr.h" +#include "ih264_dpb_mgr.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" 
+#include "ih264e_utils.h" +#include "ih264e_config.h" +#include "ih264e_statistics.h" +#include "ih264e_trace.h" +#include "ih264_list.h" +#include "ih264e_encode_header.h" +#include "ih264e_me.h" +#include "ime_defs.h" +#include "ime.h" +#include "ih264e_rate_control.h" +#include "ih264e_core_coding.h" +#include "ih264e_rc_mem_interface.h" +#include "ih264e_time_stamp.h" +#include "ih264e_debug.h" +#include "ih264e_process.h" +#include "ih264e_master.h" +#include "irc_rate_control_api.h" +#include "ime_statistics.h" + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* Used to get minimum level index for a given picture size +* +* @par Description: +* Gets the minimum level index and then gets corresponding level. +* Also used to ignore invalid levels like 2.3, 3.3 etc +* +* @param[in] level +* Level of the stream +* +* @returns Level index for a given level +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_get_min_level(WORD32 pic_size) +{ + WORD32 lvl_idx = MAX_LEVEL, i; + + for (i = 0; i < MAX_LEVEL; i++) + { + if (pic_size <= gai4_ih264_max_luma_pic_size[i]) + { + lvl_idx = i; + break; + } + } + + return gai4_ih264_levels[lvl_idx]; +} + +/** +******************************************************************************* +* +* @brief +* Used to get level index for a given level +* +* @par Description: +* Converts from level_idc (which is multiplied by 30) to an index that can be +* used as a lookup. 
Also used to ignore invalid levels like 2.2 , 3.2 etc +* +* @param[in] level +* Level of the stream +* +* @returns Level index for a given level +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_get_lvl_idx(WORD32 level) +{ + WORD32 lvl_idx = 0; + + if (level < IH264_LEVEL_11) + { + lvl_idx = 0; + } + else if (level < IH264_LEVEL_12) + { + lvl_idx = 1; + } + else if (level < IH264_LEVEL_13) + { + lvl_idx = 2; + } + else if (level < IH264_LEVEL_20) + { + lvl_idx = 3; + } + else if (level < IH264_LEVEL_21) + { + lvl_idx = 4; + } + else if (level < IH264_LEVEL_22) + { + lvl_idx = 5; + } + else if (level < IH264_LEVEL_30) + { + lvl_idx = 6; + } + else if (level < IH264_LEVEL_31) + { + lvl_idx = 7; + } + else if (level < IH264_LEVEL_32) + { + lvl_idx = 8; + } + else if (level < IH264_LEVEL_40) + { + lvl_idx = 9; + } + else if (level < IH264_LEVEL_41) + { + lvl_idx = 10; + } + else if (level < IH264_LEVEL_42) + { + lvl_idx = 11; + } + else if (level < IH264_LEVEL_50) + { + lvl_idx = 12; + } + + return (lvl_idx); +} + +/** +******************************************************************************* +* +* @brief returns maximum number of pictures allowed in dpb for a given level +* +* @par Description: +* For given width, height and level, number of pictures allowed in decoder +* picture buffer is computed as per Annex A.3.1 +* +* @param[in] level +* level of the bit-stream +* +* @param[in] pic_size +* width * height +* +* @returns Number of buffers in DPB +* +* @remarks +* From annexure A.3.1 of H264 specification, +* max_dec_frame_buffering <= MaxDpbSize, where MaxDpbSize is equal to +* Min( 1024 * MaxDPB / ( PicWidthInMbs * FrameHeightInMbs * 384 ), 16 ) and +* MaxDPB is given in Table A-1 in units of 1024 bytes. However the MaxDPB size +* presented in the look up table gas_ih264_lvl_tbl is in units of 512 +* bytes. Hence the expression is modified accordingly. 
+* +******************************************************************************* +*/ +WORD32 ih264e_get_dpb_size(WORD32 level, WORD32 pic_size) +{ + /* dpb size */ + WORD32 max_dpb_size_bytes = 0; + + /* dec frame buffering */ + WORD32 max_dpb_size_frames = 0; + + /* temp var */ + WORD32 i; + + /* determine max luma samples */ + for (i = 0; i < 16; i++) + if (level == (WORD32)gas_ih264_lvl_tbl[i].u4_level_idc) + max_dpb_size_bytes = gas_ih264_lvl_tbl[i].u4_max_dpb_size; + + /* from Annexure A.3.1 h264 specification */ + max_dpb_size_frames = + MIN( 1024 * max_dpb_size_bytes / ( pic_size * 3 ), MAX_DPB_SIZE ); + + return max_dpb_size_frames; +} + +/** +******************************************************************************* +* +* @brief +* Used to get reference picture buffer size for a given level and +* and padding used +* +* @par Description: +* Used to get reference picture buffer size for a given level and padding used +* Each picture is padded on all four sides +* +* @param[in] pic_size +* Number of luma samples (Width * Height) +* +* @param[in] level +* Level +* +* @param[in] horz_pad +* Total padding used in horizontal direction +* +* @param[in] vert_pad +* Total padding used in vertical direction +* +* @returns Total picture buffer size +* +* @remarks +* +* +******************************************************************************* +*/ +WORD32 ih264e_get_total_pic_buf_size(WORD32 pic_size, + WORD32 level, + WORD32 horz_pad, + WORD32 vert_pad, + WORD32 num_ref_frames, + WORD32 num_reorder_frames) +{ + WORD32 size; + WORD32 num_luma_samples; + WORD32 lvl_idx; + WORD32 max_wd, min_ht; + WORD32 num_samples; + WORD32 max_num_bufs; + WORD32 pad = MAX(horz_pad, vert_pad); + UNUSED(pic_size); + /* + * If num_ref_frames and num_reorder_frmaes is specified + * Use minimum value + */ + max_num_bufs = (num_ref_frames + num_reorder_frames + MAX_CTXT_SETS); + + /* Get level index */ + lvl_idx = ih264e_get_lvl_idx(level); + + /* Maximum number of luma 
samples in a picture at given level */ + num_luma_samples = gai4_ih264_max_luma_pic_size[lvl_idx]; + + /* Account for chroma */ + num_samples = num_luma_samples * 3 / 2; + + /* Maximum width of luma samples in a picture at given level */ + max_wd = gai4_ih264_max_wd_ht[lvl_idx]; + + /* Minimum height of luma samples in a picture at given level */ + min_ht = gai4_ih264_min_wd_ht[lvl_idx]; + + /* Allocation is required for + * (Wd + horz_pad) * (Ht + vert_pad) * (2 * max_dpb_size + 1) + * + * Above expanded as + * ((Wd * Ht) + (horz_pad * vert_pad) + Wd * vert_pad + Ht * horz_pad) * (2 * max_dpb_size + 1) + * (Wd * Ht) * (2 * max_dpb_size + 1) + ((horz_pad * vert_pad) + Wd * vert_pad + Ht * horz_pad) * (2 * max_dpb_size + 1) + * Now max_dpb_size increases with smaller Wd and Ht, but Wd * ht * max_dpb_size will still be lesser or equal to max_wd * max_ht * dpb_size + * + * In the above equation (Wd * Ht) * (2 * max_dpb_size + 1) is accounted by using num_samples * (2 * max_dpb_size + 1) below + * + * For the padded area use MAX(horz_pad, vert_pad) as pad + * ((pad * pad) + pad * (Wd + Ht)) * (2 * max_dpb_size + 1) has to accounted from the above for padding + * + * Since Width and Height can change worst Wd + Ht is when One of the dimensions is max and other is min + * So use max_wd and min_ht + */ + + /* Number of bytes in reference pictures */ + size = num_samples * max_num_bufs; + + /* Account for padding area */ + size += ((pad * pad) + pad * (max_wd + min_ht)) * max_num_bufs; + + return size; +} + +/** +******************************************************************************* +* +* @brief Returns MV bank buffer size for a given number of luma samples +* +* @par Description: +* For given number of luma samples one MV bank size is computed. 
+* Each MV bank includes pu_map and enc_pu_t for all the min PUs(4x4) in a picture +* +* @param[in] num_luma_samples +* Max number of luma pixels in the frame +* +* @returns Total MV Bank size +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_get_pic_mv_bank_size(WORD32 num_luma_samples) +{ + /* mv bank buffer size */ + WORD32 mv_bank_size = 0; + + /* number of sub mb partitions possible */ + WORD32 num_pu = num_luma_samples / (MIN_PU_SIZE * MIN_PU_SIZE); + + /* number of mbs */ + WORD32 num_mb = num_luma_samples / (MB_SIZE * MB_SIZE); + + /* Size for storing enc_pu_t start index each MB */ + /* One extra entry is needed to compute number of PUs in the last MB */ + mv_bank_size += num_mb * sizeof(WORD32); + + /* Size for pu_map */ + mv_bank_size += num_pu; + + /* Size for storing enc_pu_t for each PU */ + mv_bank_size += num_pu * sizeof(enc_pu_t); + + return mv_bank_size; +} + +/** +******************************************************************************* +* +* @brief +* Function to initialize ps_pic_buf structs add pic buffers to +* buffer manager in case of non-shared mode +* +* @par Description: +* Function to initialize ps_pic_buf structs add pic buffers to +* buffer manager in case of non-shared mode +* To be called once per stream or for every reset +* +* @param[in] ps_codec +* Pointer to codec context +* +* @returns error status +* +* @remarks +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_pic_buf_mgr_add_bufs(codec_t *ps_codec) +{ + /* error status */ + IH264E_ERROR_T ret = IH264E_SUCCESS; + + /* max ref buffer cnt */ + WORD32 max_num_bufs = ps_codec->i4_ref_buf_cnt; + + /* total size for pic buffers */ + WORD32 pic_buf_size_allocated = ps_codec->i4_total_pic_buf_size + - BUF_MGR_MAX_CNT * sizeof(pic_buf_t); + + /* temp var */ + UWORD8 *pu1_buf = (UWORD8 *) ps_codec->ps_pic_buf; + pic_buf_t *ps_pic_buf = (pic_buf_t *) 
ps_codec->ps_pic_buf; + WORD32 i; + + pu1_buf += BUF_MGR_MAX_CNT * sizeof(pic_buf_t); + + /* In case of non-shared mode, add picture buffers to buffer manager + * In case of shared mode, buffers are added in the run-time + */ + { + WORD32 buf_ret; + + WORD32 luma_samples = (ps_codec->i4_rec_strd) + * (ps_codec->s_cfg.u4_ht + PAD_HT); + + WORD32 chroma_samples = luma_samples >> 1; + + /* Try and add as many buffers as possible for the memory that is allocated */ + /* If the number of buffers that can be added is less than max_num_bufs + * return with an error */ + for (i = 0; i < max_num_bufs; i++) + { + pic_buf_size_allocated -= (luma_samples + chroma_samples); + + if (pic_buf_size_allocated < 0) + { + ps_codec->i4_error_code = IH264E_INSUFFICIENT_MEM_PICBUF; + return IH264E_INSUFFICIENT_MEM_PICBUF; + } + + ps_pic_buf->pu1_luma = pu1_buf + ps_codec->i4_rec_strd * PAD_TOP + + PAD_LEFT; + pu1_buf += luma_samples; + + ps_pic_buf->pu1_chroma = pu1_buf + + ps_codec->i4_rec_strd * (PAD_TOP / 2)+ PAD_LEFT; + pu1_buf += chroma_samples; + + buf_ret = ih264_buf_mgr_add((buf_mgr_t *) ps_codec->pv_ref_buf_mgr, + ps_pic_buf, i); + + if (0 != buf_ret) + { + ps_codec->i4_error_code = IH264E_BUF_MGR_ERROR; + return IH264E_BUF_MGR_ERROR; + } + pu1_buf += (HPEL_PLANES_CNT - 1) * (chroma_samples + luma_samples); + ps_pic_buf++; + } + } + + return ret; +} + +/** +******************************************************************************* +* +* @brief Function to add buffers to MV Bank buffer manager +* +* @par Description: +* Function to add buffers to MV Bank buffer manager. 
To be called once per +* stream or for every reset +* +* @param[in] ps_codec +* Pointer to codec context +* +* @returns error status +* +* @remarks +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_mv_buf_mgr_add_bufs(codec_t *ps_codec) +{ + /* error status */ + IH264E_ERROR_T error_status = IH264E_SUCCESS; + IH264_ERROR_T ret; + + /* max dpb size in frames */ + WORD32 max_dpb_size = 0; + + /* mv bank size for the entire dpb */ + WORD32 mv_bank_size_allocated = 0; + + /* mv bank size per pic */ + WORD32 pic_mv_bank_size = 0; + + /* mv buffer ptr */ + mv_buf_t *ps_mv_buf = NULL; + + /* num of luma samples */ + WORD32 num_luma_samples = ALIGN16(ps_codec->s_cfg.u4_wd) + * ALIGN16(ps_codec->s_cfg.u4_ht); + + /* number of mb's & frame partitions */ + WORD32 num_pu, num_mb; + + /* temp var */ + UWORD8 *pu1_buf = NULL; + WORD32 i; + + /* Compute the number of MB Bank buffers needed */ + max_dpb_size = ps_codec->i4_ref_buf_cnt; + + /* allocate memory for mv buffer array */ + ps_codec->ps_mv_buf = ps_codec->pv_mv_bank_buf_base; + pu1_buf = ps_codec->pv_mv_bank_buf_base; + pu1_buf += BUF_MGR_MAX_CNT * sizeof(mv_buf_t); + + /********************************************************************/ + /* allocate memory for individual elements of mv buffer ptr */ + /********************************************************************/ + mv_bank_size_allocated = ps_codec->i4_total_mv_bank_size + - (BUF_MGR_MAX_CNT * sizeof(mv_buf_t)); + + /* compute MV bank size per picture */ + pic_mv_bank_size = ih264e_get_pic_mv_bank_size(num_luma_samples); + + num_pu = num_luma_samples / (MIN_PU_SIZE * MIN_PU_SIZE); + num_mb = num_luma_samples / (MB_SIZE * MB_SIZE); + i = 0; + ps_mv_buf = ps_codec->pv_mv_bank_buf_base; + + while (i < max_dpb_size) + { + mv_bank_size_allocated -= pic_mv_bank_size; + + if (mv_bank_size_allocated < 0) + { + ps_codec->i4_error_code = IH264E_INSUFFICIENT_MEM_MVBANK; + + error_status = 
IH264E_INSUFFICIENT_MEM_MVBANK; + + return error_status; + } + + ps_mv_buf->pu4_mb_pu_cnt = (UWORD32 *) pu1_buf; + + ps_mv_buf->pu1_pic_pu_map = (pu1_buf + num_mb * sizeof(WORD32)); + + ps_mv_buf->ps_pic_pu = (enc_pu_t *) (pu1_buf + num_mb * sizeof(WORD32) + + num_pu); + + ret = ih264_buf_mgr_add((buf_mgr_t *) ps_codec->pv_mv_buf_mgr, + ps_mv_buf, i); + + if (IH264_SUCCESS != ret) + { + ps_codec->i4_error_code = IH264E_BUF_MGR_ERROR; + error_status = IH264E_BUF_MGR_ERROR; + return error_status; + } + + pu1_buf += pic_mv_bank_size; + ps_mv_buf++; + i++; + } + + return error_status; +} + +/** +******************************************************************************* +* +* @brief Function to initialize quant params structure +* +* @par Description: +* The forward quantization modules depends on qp/6, qp mod 6, forward scale +* matrix, forward threshold matrix, weight list. The inverse quantization +* modules depends on qp/6, qp mod 6, inverse scale matrix, weight list. +* These params are initialized in this function. 
+* +* @param[in] ps_proc +* pointer to process context +* +* @param[in] qp +* quantization parameter +* +* @returns none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_init_quant_params(process_ctxt_t *ps_proc, int qp) +{ + /* quant params */ + quant_params_t *ps_qp_params; + + /* ptr to forward quant threshold matrix */ + const UWORD16 *pu2_thres_mat = NULL; + + /* ptr to forward scale matrix */ + const UWORD16 *pu2_scale_mat = gu2_quant_scale_matrix_4x4; + + /* ptr to inverse scale matrix */ + const UWORD16 *pu2_iscale_mat = gau2_ih264_iquant_scale_matrix_4x4; + + /* temp var */ + UWORD32 u4_qp[3], u4_qp_div6, u4_qp_mod6; + COMPONENT_TYPE plane; + WORD32 i; + UWORD32 u4_satdq_t; + const UWORD16 *pu2_smat; + + /********************************************************************/ + /* init quant params for all planes Y, U and V */ + /********************************************************************/ + /* luma qp */ + u4_qp[Y] = qp; + + /* chroma qp + * TODO_LATER : just in case if the chroma planes use different qp's this + * needs to be corrected accordingly. 
+ */ + u4_qp[U] = gu1_qpc_fqpi[qp]; + u4_qp[V] = gu1_qpc_fqpi[qp]; + + plane = Y; + while (plane <= V) + { + u4_qp_div6 = (u4_qp[plane] / 6); + u4_qp_mod6 = (u4_qp[plane] % 6); + + ps_qp_params = ps_proc->ps_qp_params[plane]; + + /* mb qp */ + ps_qp_params->u1_mb_qp = u4_qp[plane]; + + /* mb qp / 6 */ + ps_qp_params->u1_qp_div = u4_qp_div6; + + /* mb qp % 6 */ + ps_qp_params->u1_qp_rem = u4_qp_mod6; + + /* QP bits */ + ps_qp_params->u1_qbits = QP_BITS_h264_4x4 + u4_qp_div6; + + /* forward scale matrix */ + ps_qp_params->pu2_scale_mat = pu2_scale_mat + (u4_qp_mod6 * 16); + + /* threshold matrix & weight for quantization */ + pu2_thres_mat = gu2_forward_quant_threshold_4x4 + (u4_qp_mod6 * 16); + for (i = 0; i < 16; i++) + { + ps_qp_params->pu2_thres_mat[i] = pu2_thres_mat[i] + >> (8 - u4_qp_div6); + ps_qp_params->pu2_weigh_mat[i] = 16; + } + + /* qp dependent rounding constant */ + ps_qp_params->u4_dead_zone = + gu4_forward_quant_round_factor_4x4[u4_qp_div6]; + + /* slice dependent rounding constant */ + if (ps_proc->i4_slice_type != ISLICE + && ps_proc->i4_slice_type != SISLICE) + { + ps_qp_params->u4_dead_zone >>= 1; + } + + /* SATQD threshold for zero block prediction */ + if (ps_proc->ps_codec->s_cfg.u4_enable_satqd) + { + pu2_smat = ps_qp_params->pu2_scale_mat; + + u4_satdq_t = ((1 << (ps_qp_params->u1_qbits)) - ps_qp_params->u4_dead_zone); + + ps_qp_params->pu2_sad_thrsh[0] = u4_satdq_t / MAX(pu2_smat[3], pu2_smat[11]); + ps_qp_params->pu2_sad_thrsh[1] = u4_satdq_t / MAX(pu2_smat[1], pu2_smat[9]); + ps_qp_params->pu2_sad_thrsh[2] = u4_satdq_t / pu2_smat[15]; + ps_qp_params->pu2_sad_thrsh[3] = u4_satdq_t / pu2_smat[7]; + ps_qp_params->pu2_sad_thrsh[4] = u4_satdq_t / MAX(pu2_smat[12], pu2_smat[14]); + ps_qp_params->pu2_sad_thrsh[5] = u4_satdq_t / MAX(pu2_smat[4], pu2_smat[6]); + ps_qp_params->pu2_sad_thrsh[6] = u4_satdq_t / pu2_smat[13]; + ps_qp_params->pu2_sad_thrsh[7] = u4_satdq_t / pu2_smat[5]; + ps_qp_params->pu2_sad_thrsh[8] = u4_satdq_t / 
MAX(MAX3(pu2_smat[0], pu2_smat[2], pu2_smat[8]), pu2_smat[10]); + } + + /* inverse scale matrix */ + ps_qp_params->pu2_iscale_mat = pu2_iscale_mat + (u4_qp_mod6 * 16); + + plane += 1; + } + return ; +} + +/** +******************************************************************************* +* +* @brief +* Initialize AIR mb frame Map +* +* @par Description: +* Initialize AIR mb frame map +* MB frame map indicates which frame an Mb should be coded as intra according to AIR +* +* @param[in] ps_codec +* Pointer to codec context +* +* @returns error_status +* +* @remarks +* +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_init_air_map(codec_t *ps_codec) +{ + /* intra refresh map */ + UWORD16 *pu2_intr_rfrsh_map = ps_codec->pu2_intr_rfrsh_map; + + /* air mode */ + IVE_AIR_MODE_T air_mode = ps_codec->s_cfg.e_air_mode; + + /* refresh period */ + UWORD32 air_period = ps_codec->s_cfg.u4_air_refresh_period; + + /* mb cnt */ + UWORD32 u4_mb_cnt = ps_codec->s_cfg.i4_wd_mbs * ps_codec->s_cfg.i4_ht_mbs; + + /* temp var */ + UWORD32 curr_mb, seed_rand = 1; + + switch (air_mode) + { + case IVE_AIR_MODE_CYCLIC: + + for (curr_mb = 0; curr_mb < u4_mb_cnt; curr_mb++) + { + pu2_intr_rfrsh_map[curr_mb] = curr_mb % air_period; + } + break; + + case IVE_AIR_MODE_RANDOM: + + for (curr_mb = 0; curr_mb < u4_mb_cnt; curr_mb++) + { + seed_rand = (seed_rand * 32719 + 3) % 32749; + pu2_intr_rfrsh_map[curr_mb] = seed_rand % air_period; + } + break; + + default: + + break; + } + + return IH264E_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Codec level initializations +* +* @par Description: +* Initializes the codec with parameters that needs to be set before encoding +* first frame +* +* @param[in] ps_codec +* Pointer to codec context +* +* @param[in] ps_inp_buf +* Pointer to input buffer context +* +* @returns error_status +* +* @remarks +* +* 
+******************************************************************************* +*/ +IH264E_ERROR_T ih264e_codec_init(codec_t *ps_codec) +{ + /******************************************************************** + * INITIALIZE CODEC CONTEXT * + ********************************************************************/ + /* encoder presets */ + if (ps_codec->s_cfg.u4_enc_speed_preset != IVE_CONFIG) + { + if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_SLOWEST) + {/* high quality */ + /* enable diamond search */ + ps_codec->s_cfg.u4_me_speed_preset = DMND_SRCH; + ps_codec->s_cfg.u4_enable_fast_sad = 0; + + /* disable intra 4x4 */ + ps_codec->s_cfg.u4_enable_intra_4x4 = 1; + ps_codec->luma_energy_compaction[1] = + ih264e_code_luma_intra_macroblock_4x4_rdopt_on; + + /* sub pel off */ + ps_codec->s_cfg.u4_enable_hpel = 1; + + /* deblocking off */ + ps_codec->s_cfg.u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_0; + + /* disabled intra inter gating in Inter slices */ + ps_codec->u4_inter_gate = 0; + } + else if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_NORMAL) + {/* normal */ + /* enable diamond search */ + ps_codec->s_cfg.u4_me_speed_preset = DMND_SRCH; + ps_codec->s_cfg.u4_enable_fast_sad = 0; + + /* disable intra 4x4 */ + ps_codec->s_cfg.u4_enable_intra_4x4 = 1; + + /* sub pel off */ + ps_codec->s_cfg.u4_enable_hpel = 1; + + /* deblocking off */ + ps_codec->s_cfg.u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_0; + + /* disabled intra inter gating in Inter slices */ + ps_codec->u4_inter_gate = 0; + } + else if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST) + {/* normal */ + /* enable diamond search */ + ps_codec->s_cfg.u4_me_speed_preset = DMND_SRCH; + ps_codec->s_cfg.u4_enable_fast_sad = 0; + + /* disable intra 4x4 */ + ps_codec->s_cfg.u4_enable_intra_4x4 = 0; + + /* sub pel off */ + ps_codec->s_cfg.u4_enable_hpel = 1; + + /* deblocking off */ + ps_codec->s_cfg.u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_0; + + /* disabled intra inter gating in Inter slices */ + 
ps_codec->u4_inter_gate = 1; + } + else if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_HIGH_SPEED) + {/* fast */ + /* enable diamond search */ + ps_codec->s_cfg.u4_me_speed_preset = DMND_SRCH; + ps_codec->s_cfg.u4_enable_fast_sad = 0; + + /* disable intra 4x4 */ + ps_codec->s_cfg.u4_enable_intra_4x4 = 0; + + /* sub pel off */ + ps_codec->s_cfg.u4_enable_hpel = 0; + + /* deblocking off */ + ps_codec->s_cfg.u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_4; + + /* disabled intra inter gating in Inter slices */ + ps_codec->u4_inter_gate = 0; + } + else if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FASTEST) + {/* fastest */ + /* enable diamond search */ + ps_codec->s_cfg.u4_me_speed_preset = DMND_SRCH; + + /* disable intra 4x4 */ + ps_codec->s_cfg.u4_enable_intra_4x4 = 0; + + /* sub pel off */ + ps_codec->s_cfg.u4_enable_hpel = 0; + + /* deblocking off */ + ps_codec->s_cfg.u4_disable_deblock_level = DISABLE_DEBLK_LEVEL_4; + + /* disabled intra inter gating in Inter slices */ + ps_codec->u4_inter_gate = 1; + } + } + + /***************************************************************** + * Initialize AIR inside codec + *****************************************************************/ + if (IVE_AIR_MODE_NONE != ps_codec->s_cfg.e_air_mode) + { + ih264e_init_air_map(ps_codec); + + ps_codec->i4_air_pic_cnt = -1; + } + + /****************************************************/ + /* INITIALIZE RATE CONTROL */ + /****************************************************/ + { + /* init qp */ + UWORD8 au1_init_qp[MAX_PIC_TYPE]; + + /* min max qp */ + UWORD8 au1_min_max_qp[2 * MAX_PIC_TYPE]; + + /* init i,p,b qp */ + au1_init_qp[0] = gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_i_qp]; + au1_init_qp[1] = gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_p_qp]; + au1_init_qp[2] = gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_b_qp]; + + /* init min max qp */ + au1_min_max_qp[2 * I_PIC] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_i_qp_min]; + au1_min_max_qp[2 * I_PIC + 1] = + 
gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_i_qp_max]; + + au1_min_max_qp[2 * P_PIC] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_p_qp_min]; + au1_min_max_qp[2 * P_PIC + 1] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_p_qp_max]; + + au1_min_max_qp[2 * B_PIC] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_b_qp_min]; + au1_min_max_qp[2 * B_PIC + 1] = + gau1_h264_to_mpeg2_qmap[ps_codec->s_cfg.u4_b_qp_max]; + + /* get rc mode */ + switch (ps_codec->s_cfg.e_rc_mode) + { + case IVE_RC_STORAGE: + ps_codec->s_rate_control.e_rc_type = VBR_STORAGE; + break; + case IVE_RC_CBR_NON_LOW_DELAY: + ps_codec->s_rate_control.e_rc_type = CBR_NLDRC; + break; + case IVE_RC_CBR_LOW_DELAY: + ps_codec->s_rate_control.e_rc_type = CBR_LDRC; + break; + case IVE_RC_NONE: + ps_codec->s_rate_control.e_rc_type = CONST_QP; + break; + default: + break; + } + + /* init rate control */ + ih264e_rc_init(ps_codec->s_rate_control.pps_rate_control_api, + ps_codec->s_rate_control.pps_frame_time, + ps_codec->s_rate_control.pps_time_stamp, + ps_codec->s_rate_control.pps_pd_frm_rate, + ps_codec->s_cfg.u4_max_framerate, + ps_codec->s_cfg.u4_src_frame_rate, + ps_codec->s_cfg.u4_tgt_frame_rate, + ps_codec->s_rate_control.e_rc_type, + ps_codec->s_cfg.u4_target_bitrate, + ps_codec->s_cfg.u4_max_bitrate, + ps_codec->s_cfg.u4_vbv_buffer_delay, + ps_codec->s_cfg.u4_i_frm_interval, au1_init_qp, + H264_ALLOC_INTER_FRM_INTV, au1_min_max_qp, + ps_codec->s_cfg.u4_max_level); + } + + /* src stride */ + ps_codec->i4_src_strd = ps_codec->s_cfg.u4_strd; + + /* recon stride */ + ps_codec->i4_rec_strd = ALIGN16(ps_codec->s_cfg.u4_max_wd) + PAD_WD; + + /* max ref and reorder cnt */ + ps_codec->i4_ref_buf_cnt = ps_codec->s_cfg.u4_max_ref_cnt + + ps_codec->s_cfg.u4_max_reorder_cnt; + ps_codec->i4_ref_buf_cnt += MAX_CTXT_SETS; + + DEBUG_HISTOGRAM_INIT(); + + return IH264E_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Picture level initializations +* +* @par 
Description: +* Before beginning to encode the frame, the current function initializes all +* the ctxts (proc, entropy, me, ...) basing on the input configured params. +* It locates space for storing recon in the encoder picture buffer set, fetches +* reference frame from encoder picture buffer set. Calls RC pre-enc to get +* qp and pic type for the current frame. Queues proc jobs so that +* the other threads can begin encoding. In brief, this function sets up the +* tone for the entire encoder. +* +* @param[in] ps_codec +* Pointer to codec context +* +* @param[in] ps_inp_buf +* Pointer to input buffer context +* +* @returns error_status +* +* @remarks +* +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf) +{ + /* error status */ + IH264E_ERROR_T error_status = IH264E_SUCCESS; + IH264_ERROR_T ret = IH264_SUCCESS; + + /* mv buff bank */ + mv_buf_t *ps_mv_buf = NULL; + WORD32 cur_mv_bank_buf_id; + + /* recon buffer set */ + pic_buf_t *ps_cur_pic; + WORD32 cur_pic_buf_id; + UWORD8 *pu1_cur_pic_luma, *pu1_cur_pic_chroma; + + /* ref buffer set */ + pic_buf_t *ps_ref_pic; + WORD32 ref_set_id; + + /* pic time stamp */ + UWORD32 u4_timestamp_high = ps_inp_buf->u4_timestamp_high; + UWORD32 u4_timestamp_low = ps_inp_buf->u4_timestamp_low; + + /* indices to access curr/prev frame info */ + WORD32 ctxt_sel = ps_codec->i4_encode_api_call_cnt & 1; + + /* curr pic type */ + PIC_TYPE_T *pic_type = &ps_codec->pic_type; + + /* should src be skipped */ + WORD32 *skip_src = &ps_codec->s_rate_control.pre_encode_skip[ctxt_sel]; + + /* Diamond search Iteration Max Cnt */ + UWORD32 u4_num_layers = + (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FASTEST) ? 
+ (NUM_LAYERS >> 2) : NUM_LAYERS; + + /* enable fast sad */ + UWORD32 u4_enable_fast_sad = ps_codec->s_cfg.u4_enable_fast_sad; + + /********************************************************************/ + /* INITIALIZE CODEC CONTEXT */ + /********************************************************************/ + + /* pre enc rc call */ + *skip_src = ih264e_set_rc_pic_params(ps_codec, + ps_codec->i4_encode_api_call_cnt, + (WORD32 *) pic_type); + if (*skip_src == 1) + { + ps_codec->as_process[ctxt_sel * MAX_PROCESS_THREADS].s_inp_buf = + *ps_inp_buf; + + /* inform output bytes generated as zero */ + ps_codec->as_out_buf[ctxt_sel].s_bits_buf.u4_bytes = 0; + + return error_status; + } + + /********************************************************************/ + /* Alternate reference frame */ + /********************************************************************/ + if (ps_codec->s_cfg.u4_enable_alt_ref) + { + if (PIC_IDR == *pic_type || PIC_I == *pic_type) + { + ps_codec->u4_is_curr_frm_ref = 1; + } + else + { + ps_codec->u4_is_curr_frm_ref = 1; + if(ps_codec->i4_encode_api_call_cnt % (ps_codec->s_cfg.u4_enable_alt_ref + 1)) + ps_codec->u4_is_curr_frm_ref = 0; + } + + if ((ps_codec->u4_is_curr_frm_ref == 1) || (ps_codec->i4_frame_num < 0)) + { + ps_codec->i4_frame_num++; + } + } + else + { + ps_codec->u4_is_curr_frm_ref = 1; + + ps_codec->i4_frame_num++; + } + + /* slice_type */ + ps_codec->i4_slice_type = PSLICE; + + if ((PIC_I == *pic_type) || (PIC_IDR == *pic_type)) + { + ps_codec->i4_slice_type = ISLICE; + } + else if (PIC_P == *pic_type) + { + ps_codec->i4_slice_type = PSLICE; + } + + /* is this an IDR pic */ + ps_codec->u4_is_idr = 0; + + if (PIC_IDR == *pic_type) + { + /* set idr flag */ + ps_codec->u4_is_idr = 1; + + /* reset frame num */ + ps_codec->i4_frame_num = 0; + + /* idr_pic_id */ + ps_codec->i4_idr_pic_id++; + } + + /* set deblock disable flags based on disable deblock level */ + ps_codec->i4_disable_deblk_pic = 1; + + if 
(ps_codec->s_cfg.u4_disable_deblock_level == DISABLE_DEBLK_LEVEL_0) + { + /* enable deblocking */ + ps_codec->i4_disable_deblk_pic = 0; + } + else if (ps_codec->s_cfg.u4_disable_deblock_level == DISABLE_DEBLK_LEVEL_2) + { + /* enable deblocking after a period of frames */ + if (ps_codec->i4_disable_deblk_pic_cnt == DISABLE_DEBLOCK_INTERVAL + || ps_codec->i4_slice_type == ISLICE) + { + ps_codec->i4_disable_deblk_pic = 0; + } + } + else if (ps_codec->s_cfg.u4_disable_deblock_level == DISABLE_DEBLK_LEVEL_3) + { + if (ps_codec->i4_slice_type == ISLICE) + { + ps_codec->i4_disable_deblk_pic = 0; + } + } + + if (ps_codec->i4_disable_deblk_pic) + { + ps_codec->i4_disable_deblk_pic_cnt++; + } + else + { + ps_codec->i4_disable_deblk_pic_cnt = 0; + } + + /* In slice mode - lets not deblk mb edges that lie along slice boundaries */ + if (ps_codec->i4_disable_deblk_pic == 0) + { + if (ps_codec->s_cfg.e_slice_mode != IVE_SLICE_MODE_NONE) + { + ps_codec->i4_disable_deblk_pic = 2; + } + } + + /* error status */ + ps_codec->i4_error_code = IH264E_SUCCESS; + + /* populate header */ + if (ps_codec->i4_gen_header) + { + /* sps */ + sps_t *ps_sps = NULL; + + /* pps */ + pps_t *ps_pps = NULL; + + /*ps_codec->i4_pps_id ++;*/ + ps_codec->i4_pps_id %= MAX_PPS_CNT; + + /*ps_codec->i4_sps_id ++;*/ + ps_codec->i4_sps_id %= MAX_SPS_CNT; + + /* populate sps header */ + ps_sps = ps_codec->ps_sps_base + ps_codec->i4_sps_id; + ih264e_populate_sps(ps_codec, ps_sps); + + /* populate pps header */ + ps_pps = ps_codec->ps_pps_base + ps_codec->i4_pps_id; + ih264e_populate_pps(ps_codec, ps_pps); + } + + /* Reference and MV bank Buffer Manager */ + { + /* min pic cnt among the list of pics stored in ref list */ + WORD32 min_pic_cnt; + + /* max pic cnt among the list of pics stored in ref list */ + WORD32 max_pic_cnt; + + /* temp var */ + WORD32 i; + + ps_ref_pic = NULL; + + /* get reference picture when necessary */ + /* Only nearest picture encoded (max pic cnt) is used as reference */ + if ((*pic_type 
!= PIC_IDR) && (*pic_type != PIC_I)) + { + max_pic_cnt = ps_codec->as_ref_set[0].i4_pic_cnt; + + ps_ref_pic = ps_codec->as_ref_set[0].ps_pic_buf; + + /* loop through to get the max pic cnt among the list of pics stored in ref list */ + for (i = 1; i < ps_codec->i4_ref_buf_cnt; i++) + { + if (max_pic_cnt < ps_codec->as_ref_set[i].i4_pic_cnt) + { + max_pic_cnt = ps_codec->as_ref_set[i].i4_pic_cnt; + ps_ref_pic = ps_codec->as_ref_set[i].ps_pic_buf; + } + } + } + + /* get a location at which the curr pic info can be stored for future reference */ + ref_set_id = -1; + + for (i = 0; i < ps_codec->i4_ref_buf_cnt; i++) + { + if (-1 == ps_codec->as_ref_set[i].i4_pic_cnt) + { + ref_set_id = i; + break; + } + } + + /* If all the entries in the ref_set array are filled, then remove the entry with least pic_cnt */ + if (ref_set_id == -1) + { + /* pic info */ + pic_buf_t *ps_cur_pic; + + /* mv info */ + mv_buf_t *ps_cur_mv_buf; + + ref_set_id = 0; + min_pic_cnt = ps_codec->as_ref_set[0].i4_pic_cnt; + + /* loop through to get the min pic cnt among the list of pics stored in ref list */ + for (i = 1; i < ps_codec->i4_ref_buf_cnt; i++) + { + if (min_pic_cnt > ps_codec->as_ref_set[i].i4_pic_cnt) + { + min_pic_cnt = ps_codec->as_ref_set[i].i4_pic_cnt; + ref_set_id = i; + } + } + + ps_cur_pic = ps_codec->as_ref_set[ref_set_id].ps_pic_buf; + + ps_cur_mv_buf = ps_codec->as_ref_set[ref_set_id].ps_mv_buf; + + /* release this frame from reference list */ + ih264_buf_mgr_release(ps_codec->pv_mv_buf_mgr, + ps_cur_mv_buf->i4_buf_id, BUF_MGR_REF); + + ih264_buf_mgr_release(ps_codec->pv_ref_buf_mgr, + ps_cur_pic->i4_buf_id, BUF_MGR_REF); + } + + if (ps_codec->s_cfg.u4_enable_recon) + { + ret = ih264_buf_mgr_check_free((buf_mgr_t *)ps_codec->pv_ref_buf_mgr); + + if (ret != IH264_SUCCESS) + { + return IH264E_NO_FREE_RECONBUF; + } + } + } + + { + /*****************************************************************/ + /* Get free MV Bank to hold current picture's motion vector data */ + /* If there 
are no free buffers then return with an error code. */ + /* If the buffer is to be freed by another thread, change the */ + /* following to call thread yield and wait for buffer to be freed*/ + /*****************************************************************/ + ps_mv_buf = (mv_buf_t *) ih264_buf_mgr_get_next_free( + (buf_mgr_t *) ps_codec->pv_mv_buf_mgr, + &cur_mv_bank_buf_id); + + if (NULL == ps_mv_buf) + { + ps_codec->i4_error_code = IH264E_NO_FREE_MVBANK; + return IH264E_NO_FREE_MVBANK; + } + + /* mark the buffer as needed for reference if the curr pic is available for ref */ + if (ps_codec->u4_is_curr_frm_ref) + { + ih264_buf_mgr_set_status(ps_codec->pv_mv_buf_mgr, + cur_mv_bank_buf_id, BUF_MGR_REF); + } + + /* Set current ABS poc to ps_mv_buf, so that while freeing a reference buffer + * corresponding mv buffer can be found by looping through ps_codec->ps_mv_buf array + * and getting a buffer id to free + */ + ps_mv_buf->i4_abs_poc = ps_codec->i4_abs_pic_order_cnt; + + ps_mv_buf->i4_buf_id = cur_mv_bank_buf_id; + } + + { + /*****************************************************************/ + /* Get free pic buf to hold current picture's recon data */ + /* If there are no free buffers then return with an error code. 
*/ + /* If the buffer is to be freed by another thread, change the */ + /* following to call thread yield and wait for buffer to be freed*/ + /*****************************************************************/ + ps_cur_pic = (pic_buf_t *) ih264_buf_mgr_get_next_free( + (buf_mgr_t *) ps_codec->pv_ref_buf_mgr, + &cur_pic_buf_id); + + if (NULL == ps_cur_pic) + { + ps_codec->i4_error_code = IH264E_NO_FREE_PICBUF; + return IH264E_NO_FREE_PICBUF; + } + + /* mark the buffer as needed for reference if the curr pic is available for ref */ + if (1 == ps_codec->u4_is_curr_frm_ref) + { + ih264_buf_mgr_set_status(ps_codec->pv_ref_buf_mgr, cur_pic_buf_id, + BUF_MGR_REF); + } + + /* Mark the current buffer as needed for IO if recon is enabled */ + if (1 == ps_codec->s_cfg.u4_enable_recon) + { + ih264_buf_mgr_set_status(ps_codec->pv_ref_buf_mgr, cur_pic_buf_id, + BUF_MGR_IO); + } + + /* Associate input timestamp with current buffer */ + ps_cur_pic->u4_timestamp_high = ps_inp_buf->u4_timestamp_high; + ps_cur_pic->u4_timestamp_low = ps_inp_buf->u4_timestamp_low; + + ps_cur_pic->i4_abs_poc = ps_codec->i4_abs_pic_order_cnt; + ps_cur_pic->i4_poc_lsb = ps_codec->i4_pic_order_cnt_lsb; + + ps_cur_pic->i4_buf_id = cur_pic_buf_id; + + pu1_cur_pic_luma = ps_cur_pic->pu1_luma; + pu1_cur_pic_chroma = ps_cur_pic->pu1_chroma; + } + + /* in case the current picture is used for reference then add it to the reference set */ + if (ps_codec->u4_is_curr_frm_ref + && ((*pic_type == PIC_IDR) || (*pic_type == PIC_I) + || (*pic_type == PIC_P))) + { + ps_codec->as_ref_set[ref_set_id].i4_pic_cnt = ps_codec->i4_pic_cnt; + + /* TODO: Currently pic_cnt and poc are same - Once frame drops are introduced change appropriately */ + ps_codec->as_ref_set[ref_set_id].i4_poc = ps_codec->i4_pic_cnt; + + ps_codec->as_ref_set[ref_set_id].ps_mv_buf = ps_mv_buf; + + ps_codec->as_ref_set[ref_set_id].ps_pic_buf = ps_cur_pic; + } + + /********************************************************************/ + /* INITIALIZE 
PROCESS CONTEXT */ + /********************************************************************/ + { + /* temp var */ + WORD32 i, j = 0; + + /* curr proc ctxt */ + process_ctxt_t *ps_proc = NULL; + + j = ctxt_sel * MAX_PROCESS_THREADS; + + /* begin init */ + for (i = j; i < (j + MAX_PROCESS_THREADS); i++) + { + ps_proc = &ps_codec->as_process[i]; + + /* luma src buffer */ + if (ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_422ILE) + { + ps_proc->pu1_src_buf_luma_base = ps_codec->pu1_y_csc_buf_base; + } + else + { + ps_proc->pu1_src_buf_luma_base = + ps_inp_buf->s_raw_buf.apv_bufs[0]; + } + + /* chroma src buffer */ + if (ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_422ILE + || ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420P) + { + ps_proc->pu1_src_buf_chroma_base = + ps_codec->pu1_uv_csc_buf_base; + } + else + { + ps_proc->pu1_src_buf_chroma_base = + ps_inp_buf->s_raw_buf.apv_bufs[1]; + } + + /* luma rec buffer */ + ps_proc->pu1_rec_buf_luma_base = pu1_cur_pic_luma; + + /* chroma rec buffer */ + ps_proc->pu1_rec_buf_chroma_base = pu1_cur_pic_chroma; + + /* src stride */ + ps_proc->i4_src_strd = ps_codec->i4_src_strd; + + /* rec stride */ + ps_proc->i4_rec_strd = ps_codec->i4_rec_strd; + + /* frame num */ + ps_proc->i4_frame_num = ps_codec->i4_frame_num; + + /* is idr */ + ps_proc->u4_is_idr = ps_codec->u4_is_idr; + + /* idr pic id */ + ps_proc->u4_idr_pic_id = ps_codec->i4_idr_pic_id; + + /* slice_type */ + ps_proc->i4_slice_type = ps_codec->i4_slice_type; + + /* Input width in mbs */ + ps_proc->i4_wd_mbs = ps_codec->s_cfg.i4_wd_mbs; + + /* Input height in mbs */ + ps_proc->i4_ht_mbs = ps_codec->s_cfg.i4_ht_mbs; + + /* Half x plane offset from pic buf */ + ps_proc->u4_half_x_offset = 0; + + /* Half y plane offset from half x plane */ + ps_proc->u4_half_y_offset = 0; + + /* Half x plane offset from half y plane */ + ps_proc->u4_half_xy_offset = 0; + + /* top row syntax elements */ + ps_proc->ps_top_row_mb_syntax_ele = + ps_proc->ps_top_row_mb_syntax_ele_base; + + 
ps_proc->pu1_top_mb_intra_modes = + ps_proc->pu1_top_mb_intra_modes_base; + + ps_proc->ps_top_row_pu = ps_proc->ps_top_row_pu_base; + + /* initialize quant params */ + ps_proc->u4_frame_qp = ps_codec->u4_frame_qp; + ps_proc->u4_mb_qp = ps_codec->u4_frame_qp; + ih264e_init_quant_params(ps_proc, ps_proc->u4_frame_qp); + + /* previous mb qp*/ + ps_proc->u4_mb_qp_prev = ps_proc->u4_frame_qp; + + /* Reset frame info */ + memset(&ps_proc->s_frame_info, 0, sizeof(frame_info_t)); + + /* initialize proc, deblk and ME map */ + if (i == j) + { + /* row '-1' */ + memset(ps_proc->pu1_proc_map - ps_proc->i4_wd_mbs, 1, ps_proc->i4_wd_mbs); + /* row 0 to ht in mbs */ + memset(ps_proc->pu1_proc_map, 0, ps_proc->i4_wd_mbs * ps_proc->i4_ht_mbs); + + /* row '-1' */ + memset(ps_proc->pu1_deblk_map - ps_proc->i4_wd_mbs, 1, ps_proc->i4_wd_mbs); + /* row 0 to ht in mbs */ + memset(ps_proc->pu1_deblk_map, 0, ps_proc->i4_wd_mbs * ps_proc->i4_ht_mbs); + + /* row '-1' */ + memset(ps_proc->pu1_me_map - ps_proc->i4_wd_mbs, 1, ps_proc->i4_wd_mbs); + /* row 0 to ht in mbs */ + memset(ps_proc->pu1_me_map, 0, ps_proc->i4_wd_mbs * ps_proc->i4_ht_mbs); + + /* at the start of air refresh period, reset intra coded map */ + if (IVE_AIR_MODE_NONE != ps_codec->s_cfg.e_air_mode) + { + ps_codec->i4_air_pic_cnt = (ps_codec->i4_air_pic_cnt + 1) + % ps_codec->s_cfg.u4_air_refresh_period; + + if (!ps_codec->i4_air_pic_cnt) + { + memset(ps_proc->pu1_is_intra_coded, 0, ps_proc->i4_wd_mbs * ps_proc->i4_ht_mbs); + } + } + } + + /* deblock level */ + ps_proc->u4_disable_deblock_level = ps_codec->i4_disable_deblk_pic; + + /* slice index map */ + /* no slice */ + if (ps_codec->s_cfg.e_slice_mode == IVE_SLICE_MODE_NONE) + { + memset(ps_proc->pu1_slice_idx, 0, ps_proc->i4_wd_mbs * ps_proc->i4_ht_mbs); + } + /* generate slices for every 'n' rows, 'n' is given through slice param */ + else if (ps_codec->s_cfg.e_slice_mode == IVE_SLICE_MODE_BLOCKS) + { + /* slice idx map */ + UWORD8 *pu1_slice_idx = ps_proc->pu1_slice_idx; 
+ + /* temp var */ + WORD32 i4_mb_y = 0, slice_idx = 0, cnt; + + while (i4_mb_y < ps_proc->i4_ht_mbs) + { + if (i4_mb_y +(WORD32)ps_codec->s_cfg.u4_slice_param < ps_proc->i4_ht_mbs) + { + cnt = ps_codec->s_cfg.u4_slice_param * ps_proc->i4_wd_mbs; + i4_mb_y += ps_codec->s_cfg.u4_slice_param; + } + else + { + cnt = (ps_proc->i4_ht_mbs - i4_mb_y) * ps_proc->i4_wd_mbs; + i4_mb_y += (ps_proc->i4_ht_mbs - i4_mb_y); + } + memset(pu1_slice_idx, slice_idx, cnt); + slice_idx++; + pu1_slice_idx += cnt; + } + } + + /* Current MV Bank's buffer ID */ + ps_proc->i4_cur_mv_bank_buf_id = cur_mv_bank_buf_id; + + /* Pointer to current picture buffer structure */ + ps_proc->ps_cur_pic = ps_cur_pic; + + /* Pointer to current pictures mv buffers */ + ps_proc->ps_cur_mv_buf = ps_mv_buf; + + /* pointer to ref picture */ + ps_proc->ps_ref_pic = ps_ref_pic; + + if ((*pic_type != PIC_IDR) && (*pic_type != PIC_I)) + { + /* ref pointer luma */ + ps_proc->pu1_ref_buf_luma_base = ps_ref_pic->pu1_luma; + + /* ref pointer chroma */ + ps_proc->pu1_ref_buf_chroma_base = ps_ref_pic->pu1_chroma; + } + + /* Structure for current input buffer */ + ps_proc->s_inp_buf = *ps_inp_buf; + + /* Number of encode frame API calls made */ + ps_proc->i4_encode_api_call_cnt = ps_codec->i4_encode_api_call_cnt; + + /* Current Picture count */ + ps_proc->i4_pic_cnt = ps_codec->i4_pic_cnt; + + /* error status */ + ps_proc->i4_error_code = 0; + + /********************************************************************/ + /* INITIALIZE ENTROPY CONTEXT */ + /********************************************************************/ + { + entropy_ctxt_t *ps_entropy = &ps_proc->s_entropy; + + /* start of frame */ + ps_entropy->i4_sof = 0; + + /* end of frame */ + ps_entropy->i4_eof = 0; + + /* generate header */ + ps_entropy->i4_gen_header = ps_codec->i4_gen_header; + + /* sps ref_set_id */ + ps_entropy->u4_sps_id = ps_codec->i4_sps_id; + + /* sps base */ + ps_entropy->ps_sps_base = ps_codec->ps_sps_base; + + /* sps id */ + 
ps_entropy->u4_pps_id = ps_codec->i4_pps_id; + + /* sps base */ + ps_entropy->ps_pps_base = ps_codec->ps_pps_base; + + /* slice map */ + ps_entropy->pu1_slice_idx = ps_proc->pu1_slice_idx; + + /* slice hdr base */ + ps_entropy->ps_slice_hdr_base = ps_proc->ps_slice_hdr_base; + + /* initialize entropy map */ + if (i == j) + { + /* row '-1' */ + memset(ps_entropy->pu1_entropy_map - ps_proc->i4_wd_mbs, 1, ps_proc->i4_wd_mbs); + /* row 0 to ht in mbs */ + memset(ps_entropy->pu1_entropy_map, 0, ps_proc->i4_wd_mbs * ps_proc->i4_ht_mbs); + } + + /* wd in mbs */ + ps_entropy->i4_wd_mbs = ps_proc->i4_wd_mbs; + + /* ht in mbs */ + ps_entropy->i4_ht_mbs = ps_proc->i4_ht_mbs; + + /* transform_8x8_mode_flag */ + ps_entropy->i1_transform_8x8_mode_flag = 0; + + /* entropy_coding_mode_flag */ + ps_entropy->u1_entropy_coding_mode_flag = + ps_codec->s_cfg.u4_entropy_coding_mode; + + /* error code */ + ps_entropy->i4_error_code = IH264E_SUCCESS; + + /* mb skip run */ + *(ps_proc->s_entropy.pi4_mb_skip_run) = 0; + + /* last frame to encode */ + ps_proc->s_entropy.u4_is_last = ps_inp_buf->u4_is_last; + + /* Current Picture count */ + ps_proc->s_entropy.i4_pic_cnt = ps_codec->i4_pic_cnt; + + /* time stamps */ + ps_entropy->u4_timestamp_low = u4_timestamp_low; + ps_entropy->u4_timestamp_high = u4_timestamp_high; + + /* init frame statistics */ + ps_entropy->u4_header_bits[MB_TYPE_INTRA] = 0; + ps_entropy->u4_header_bits[MB_TYPE_INTER] = 0; + ps_entropy->u4_residue_bits[MB_TYPE_INTRA] = 0; + ps_entropy->u4_residue_bits[MB_TYPE_INTER] = 0; + } + + /********************************************************************/ + /* INITIALIZE DEBLOCK CONTEXT */ + /********************************************************************/ + { + /* deblk ctxt */ + deblk_ctxt_t *ps_deblk = &ps_proc->s_deblk_ctxt; + + /* slice idx map */ + ps_deblk->pu1_slice_idx = ps_proc->pu1_slice_idx; + } + + /********************************************************************/ + /* INITIALIZE ME CONTEXT */ + 
/********************************************************************/ + { + /* me ctxt */ + me_ctxt_t *ps_me_ctxt = &ps_proc->s_me_ctxt; + + /* srch range x */ + ps_me_ctxt->ai2_srch_boundaries[0] = + ps_codec->s_cfg.u4_srch_rng_x; + + /* srch range y */ + ps_me_ctxt->ai2_srch_boundaries[1] = + ps_codec->s_cfg.u4_srch_rng_y; + + /* src stride */ + ps_me_ctxt->i4_src_strd = ps_codec->i4_src_strd; + + /* rec stride */ + ps_me_ctxt->i4_rec_strd = ps_codec->i4_rec_strd; + + /* Half x plane offset from pic buf */ + ps_me_ctxt->u4_half_x_offset = ps_proc->u4_half_x_offset; + + /* Half y plane offset from half x plane */ + ps_me_ctxt->u4_half_y_offset = ps_proc->u4_half_y_offset; + + /* Half x plane offset from half y plane */ + ps_me_ctxt->u4_half_xy_offset = ps_proc->u4_half_xy_offset; + + /* enable fast sad */ + ps_me_ctxt->u4_enable_fast_sad = u4_enable_fast_sad; + + /* half pel */ + ps_me_ctxt->u4_enable_hpel = ps_codec->s_cfg.u4_enable_hpel; + + /* Diamond search Iteration Max Cnt */ + ps_me_ctxt->u4_num_layers = u4_num_layers; + + /* me speed preset */ + ps_me_ctxt->u4_me_speed_preset = + ps_codec->s_cfg.u4_me_speed_preset; + + /* qp */ + ps_me_ctxt->u1_mb_qp = ps_codec->u4_frame_qp; + + if ((i == 0) && (0 == ps_codec->i4_pic_cnt)) + { + /* init mv bits tables */ + ih264e_init_mv_bits(ps_me_ctxt); + } + } + + ps_proc->ps_ngbr_avbl = &(ps_proc->s_ngbr_avbl); + + } + + /* reset encoder header */ + ps_codec->i4_gen_header = 0; + } + + /********************************************************************/ + /* ADD JOBS TO THE QUEUE */ + /********************************************************************/ + { + /* job structures */ + job_t s_job; + + /* temp var */ + WORD32 i; + + /* job class */ + s_job.i4_cmd = CMD_PROCESS; + + /* number of mbs to be processed in the current job */ + s_job.i2_mb_cnt = ps_codec->s_cfg.i4_wd_mbs; + + /* job start index x */ + s_job.i2_mb_x = 0; + + /* proc base idx */ + s_job.i2_proc_base_idx = ctxt_sel ? 
(MAX_PROCESS_CTXT / 2) : 0; + + for (i = 0; i < (WORD32)ps_codec->s_cfg.i4_ht_mbs; i++) + { + /* job start index y */ + s_job.i2_mb_y = i; + + /* queue the job */ + ret = ih264_list_queue(ps_codec->pv_proc_jobq, &s_job, 1); + if (ret != IH264_SUCCESS) + { + ps_codec->i4_error_code = ret; + return IH264E_FAIL; + } + } + + /* Once all the jobs are queued, terminate the queue */ + /* Since the threads are created and deleted in each call, terminating + here is not an issue */ + ih264_list_terminate(ps_codec->pv_proc_jobq); + } + + return error_status; +} diff --git a/encoder/ih264e_utils.h b/encoder/ih264e_utils.h new file mode 100755 index 0000000..651dad9 --- /dev/null +++ b/encoder/ih264e_utils.h @@ -0,0 +1,327 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_utils.h +* +* @brief +* Contains declarations of miscellaneous utility functions used by the encoder +* +* @author +* Harish +* +* @par List of Functions: +* -ih264e_get_min_level() +* -ih264e_get_lvl_idx() +* -ih264e_get_dpb_size() +* -ih264e_get_total_pic_buf_size() +* -ih264e_get_pic_mv_bank_size() +* -ih264e_pic_buf_mgr_add_bufs() +* -ih264e_mv_buf_mgr_add_bufs() +* -ih264e_init_quant_params() +* -ih264e_init_air_map() +* -ih264e_codec_init() +* -ih264e_pic_init() +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_UTILS_H_ +#define IH264E_UTILS_H_ + +/** +******************************************************************************* +* +* @brief +* Used to get minimum level index for a given picture size +* +* @par Description: +* Gets the minimum level index and then gets corresponding level. +* Also used to ignore invalid levels like 2.3, 3.3 etc +* +* @param[in] pic_size +* Picture size for which the minimum level is required +* +* @returns Level index for a given level +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_get_min_level(WORD32 pic_size); + +/** +******************************************************************************* +* +* @brief +* Used to get level index for a given level +* +* @par Description: +* Converts from level_idc (which is multiplied by 30) to an index that can be +* used as a lookup. 
Also used to ignore invalid levels like 2.2 , 3.2 etc +* +* @param[in] level +* Level of the stream +* +* @returns Level index for a given level +* +* @remarks +* +******************************************************************************* +*/ +WORD32 ih264e_get_lvl_idx(WORD32 level); + +/** +******************************************************************************* +* +* @brief returns maximum number of pictures allowed in dpb for a given level +* +* @par Description: +* For given width, height and level, number of pictures allowed in decoder +* picture buffer is computed as per Annex A.3.1 +* +* @param[in] level +* level of the bit-stream +* +* @param[in] pic_size +* width * height +* +* @returns Number of buffers in DPB +* +* @remarks +* From annexure A.3.1 of H264 specification, +* max_dec_frame_buffering <= MaxDpbSize, where MaxDpbSize is equal to +* Min( 1024 * MaxDPB / ( PicWidthInMbs * FrameHeightInMbs * 384 ), 16 ) and +* MaxDPB is given in Table A-1 in units of 1024 bytes. However the MaxDPB size +* presented in the look up table gas_ih264_lvl_tbl is in units of 512 +* bytes. Hence the expression is modified accordingly. 
+* +******************************************************************************* +*/ +WORD32 ih264e_get_dpb_size(WORD32 level, WORD32 pic_size); + +/** +******************************************************************************* +* +* @brief +* Used to get reference picture buffer size for a given level and +* and padding used +* +* @par Description: +* Used to get reference picture buffer size for a given level and padding used +* Each picture is padded on all four sides +* +* @param[in] pic_size +* Number of luma samples (Width * Height) +* +* @param[in] level +* Level +* +* @param[in] horz_pad +* Total padding used in horizontal direction +* +* @param[in] vert_pad +* Total padding used in vertical direction +* +* @returns Total picture buffer size +* +* @remarks +* +* +******************************************************************************* +*/ +WORD32 ih264e_get_total_pic_buf_size(WORD32 pic_size, WORD32 level, + WORD32 horz_pad, WORD32 vert_pad, + WORD32 num_ref_frames, + WORD32 num_reorder_frames); + +/** +******************************************************************************* +* +* @brief Returns MV bank buffer size for a given number of luma samples +* +* @par Description: +* For given number of luma samples one MV bank size is computed. 
+* Each MV bank includes pu_map and enc_pu_t for all the min PUs(4x4) in a picture +* +* @param[in] num_luma_samples +* Max number of luma pixels in the frame +* +* @returns Total MV Bank size +* +* @remarks +* +* +******************************************************************************* +*/ +WORD32 ih264e_get_pic_mv_bank_size(WORD32 num_luma_samples); + +/** +******************************************************************************* +* +* @brief +* Function to initialize ps_pic_buf structs add pic buffers to +* buffer manager in case of non-shared mode +* +* @par Description: +* Function to initialize ps_pic_buf structs add pic buffers to +* buffer manager in case of non-shared mode +* To be called once per stream or for every reset +* +* @param[in] ps_codec +* Pointer to codec context +* +* @returns error status +* +* @remarks +* +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_pic_buf_mgr_add_bufs(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Function to add buffers to MV Bank buffer manager +* +* @par Description: +* Function to add buffers to MV Bank buffer manager. To be called once per +* stream or for every reset +* +* @param[in] ps_codec +* Pointer to codec context +* +* @returns error status +* +* @remarks +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_mv_buf_mgr_add_bufs(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Function to initialize quant params structure +* +* @par Description: +* The forward quantization modules depends on qp/6, qp mod 6, forward scale +* matrix, forward threshold matrix, weight list. The inverse quantization +* modules depends on qp/6, qp mod 6, inverse scale matrix, weight list. +* These params are initialized in this function. 
+* +* @param[in] ps_proc +* pointer to process context +* +* @param[in] qp +* quantization parameter +* +* @returns none +* +* @remarks +* +******************************************************************************* +*/ +void ih264e_init_quant_params(process_ctxt_t *ps_proc, int qp); + +/** +******************************************************************************* +* +* @brief +* Initialize AIR mb frame Map +* +* @par Description: +* Initialize AIR mb frame map +* MB frame map indicates the frame in which an MB should be coded as intra +* according to AIR +* +* @param[in] ps_codec +* Pointer to codec context +* +* @returns error_status +* +* @remarks +* +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_init_air_map(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief +* Codec level initializations +* +* @par Description: +* Initializes the codec with parameters that need to be set before encoding +* the first frame +* +* @param[in] ps_codec +* Pointer to codec context +* +* @returns error_status +* +* @remarks +* +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_codec_init(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief +* Picture level initializations +* +* @par Description: +* Before beginning to encode the frame, the current function initializes all +* the ctxts (proc, entropy, me, ...) based on the input configured params. +* It locates space for storing recon in the encoder picture buffer set, fetches +* reference frame from encoder picture buffer set. Calls RC pre-enc to get +* qp and pic type for the current frame. Queues proc jobs so that +* the other threads can begin encoding.
In brief, this function sets up the +* tone for the entire encoder. +* +* @param[in] ps_codec +* Pointer to codec context +* +* @param[in] ps_inp_buf +* Pointer to input buffer context +* +* @returns error_status +* +* @remarks +* +* +******************************************************************************* +*/ +IH264E_ERROR_T ih264e_pic_init(codec_t *ps_codec, inp_buf_t *ps_inp_buf); + +#endif /* IH264E_UTILS_H_ */ diff --git a/encoder/ih264e_version.c b/encoder/ih264e_version.c new file mode 100755 index 0000000..3dcba8d --- /dev/null +++ b/encoder/ih264e_version.c @@ -0,0 +1,143 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_version.c +* +* @brief +* Contains version info for H264 encoder +* +* @author +* ittiam +* +* @par List of Functions: +* - ih264e_get_version() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +/* system include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* user include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264e.h" +#include "ih264_defs.h" +#include "ih264_debug.h" +#include "ih264_structs.h" +#include "ih264e_version.h" + + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/** + * Name of the codec and target platform (All Cortex A processors in this case) + */ +#define CODEC_NAME "H264ENC" +/** + * Codec release type, production or evaluation + */ +#define CODEC_RELEASE_TYPE "production" +/** + * Version string. 
First two digits signify major version and last two minor + */ +#define CODEC_RELEASE_VER "01.00" +/** + * Vendor name + */ +#define CODEC_VENDOR "ITTIAM" + +#define MAX_STRLEN 511 +/** +******************************************************************************* +* Concatenates various strings to form a version string. +* version_string must be a CHAR array of at least MAX_STRLEN bytes. The count +* given to strncat() limits the characters appended (it is not the size of the +* destination), so each append is bounded by the space still free in the array +* and the result is always NUL terminated. +******************************************************************************* +*/ +#define VERSION(version_string, codec_name, codec_release_type, codec_release_ver, codec_vendor) \ + do \ + { \ + strncpy(version_string,"@(#)Id:", MAX_STRLEN - 1); \ + version_string[MAX_STRLEN - 1] = '\0'; \ + strncat(version_string,codec_name, MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string,"_", MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string,codec_release_type, MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string," Ver:", MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string,codec_release_ver, MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string," Released by ", MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string,codec_vendor, MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string," Build: ", MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string,__DATE__, MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string," @ ", MAX_STRLEN - strlen(version_string) - 1); \ + strncat(version_string,__TIME__, MAX_STRLEN - strlen(version_string) - 1); \ + } while (0) + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief +* Fills the version info in the given char pointer +* +* @par Description: +* Builds the version string in a local buffer and copies it, including the +* terminating NUL, to the caller's buffer when that buffer is large enough +* +* @param[out] pc_version +* Buffer that receives the version string +* +* @param[in] u4_version_bufsize +* Size of the buffer passed, in bytes +* +* @returns IV_SUCCESS when the string was copied, IV_FAIL when pc_version is +* NULL or the buffer is too small (nothing is written in that case) +* +* @remarks none +* +******************************************************************************* +*/ +IV_STATUS_T ih264e_get_version(CHAR *pc_version, UWORD32 u4_version_bufsize) +{ + CHAR ac_version_tmp[MAX_STRLEN]; + size_t version_len; + + VERSION(ac_version_tmp, CODEC_NAME, CODEC_RELEASE_TYPE,
CODEC_RELEASE_VER, + CODEC_VENDOR); + + /* length of the version string including the terminating NUL */ + version_len = strnlen(ac_version_tmp, MAX_STRLEN) + 1; + + /* reject a missing destination or one too small for the whole string */ + if ((NULL == pc_version) || (u4_version_bufsize < version_len)) + { + return IV_FAIL; + } + + memcpy(pc_version, ac_version_tmp, version_len); + return IV_SUCCESS; +} diff --git a/encoder/ih264e_version.h b/encoder/ih264e_version.h new file mode 100755 index 0000000..303a1e2 --- /dev/null +++ b/encoder/ih264e_version.h @@ -0,0 +1,64 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_version.h +* +* @brief +* Contains the declaration of the function that reports the version string of +* the H264 encoder +* +* @author +* ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IH264E_VERSION_H_ +#define IH264E_VERSION_H_ + +/** +******************************************************************************* +* +* @brief +* Fills the version info in the given char pointer +* +* @par Description: +* Fills the version info in the given char pointer +* +* @param[out] pc_version +* Buffer that receives the version string +* +* @param[in] u4_version_bufsize +* Size of the buffer passed +* +* @returns IV_SUCCESS when the version string fits in the buffer, IV_FAIL +* otherwise +* +* @remarks none +* +******************************************************************************* +*/ +IV_STATUS_T ih264e_get_version(CHAR *pc_version, UWORD32 u4_version_bufsize); + +#endif /* IH264E_VERSION_H_ */ diff --git a/encoder/ime.c b/encoder/ime.c new file mode 100755 index 0000000..c89aaab --- /dev/null +++ b/encoder/ime.c @@ -0,0 +1,836 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ime.c + * + * @brief + * Contains the 16x16 motion estimation routines: initial candidate evaluation, + * diamond search, half pel refinement and skip cost computation + * + * @author + * Ittiam + * + * @par List of Functions: + * - ime_diamond_search_16x16() + * - ime_evaluate_init_srchposn_16x16() + * - ime_full_pel_motion_estimation_16x16() + * - ime_sub_pel_motion_estimation_16x16() + * - ime_compute_skip_cost() + * + * @remarks + * None + * + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <assert.h> +#include <limits.h> +#include <string.h> + +/* User include files */ +#include "ime_typedefs.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ime_defs.h" +#include "ime_macros.h" +#include "ime.h" +#include "ime_statistics.h" + +/** +******************************************************************************* +* +* @brief Diamond Search +* +* @par Description: +* This function computes the sad at vertices of several layers of diamond grid +* at a time. The number of layers of diamond grid that would be evaluated is +* configurable. The function computes the sad at vertices of a diamond grid. If +* the sad at the center of the diamond grid is less than the sad at any other +* point of the diamond grid, the function marks the candidate Mb partition as +* mv.
+* +* @param[in] ps_mb_part +* pointer to current mb partition ctxt with respect to ME +* +* @param[in] ps_me_ctxt +* pointer to me context +* +* @param[in] u4_lambda_motion +* lambda motion +* +* @param[in] u4_enable_fast_sad +* enable/disable fast sad computation +* +* @returns mv pair & corresponding distortion and cost +* +* @remarks Diamond Srch, radius is 1 +* +******************************************************************************* +*/ +void ime_diamond_search_16x16(me_ctxt_t *ps_me_ctxt) +{ + /* MB partition info */ + mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part; + + /* lagrange parameter */ + UWORD32 u4_lambda_motion = ps_me_ctxt->u4_lambda_motion; + + /* srch range*/ + WORD32 i4_srch_range_n = ps_me_ctxt->i4_srch_range_n; + WORD32 i4_srch_range_s = ps_me_ctxt->i4_srch_range_s; + WORD32 i4_srch_range_e = ps_me_ctxt->i4_srch_range_e; + WORD32 i4_srch_range_w = ps_me_ctxt->i4_srch_range_w; + + /* enabled fast sad computation */ +// UWORD32 u4_enable_fast_sad = ps_me_ctxt->u4_enable_fast_sad; + + /* pointer to src macro block */ + UWORD8 *pu1_curr_mb = ps_me_ctxt->pu1_src_buf_luma; + UWORD8 *pu1_ref_mb = ps_me_ctxt->pu1_ref_buf_luma; + + /* strides */ + WORD32 i4_src_strd = ps_me_ctxt->i4_src_strd; + WORD32 i4_ref_strd = ps_me_ctxt->i4_rec_strd; + + /* least cost */ + WORD32 i4_cost_least = ps_mb_part->i4_mb_cost; + + /* least sad */ + WORD32 i4_distortion_least = ps_mb_part->i4_mb_distortion; + + /* mv pair */ + WORD16 i2_mvx, i2_mvy; + + /* mv bits */ + UWORD8 *pu1_mv_bits = ps_me_ctxt->pu1_mv_bits; + + /* temp var */ + WORD32 i4_cost[4]; + WORD32 i4_sad[4]; + UWORD8 *pu1_ref; + WORD16 i2_mv_u_x, i2_mv_u_y; + + /* Diamond search Iteration Max Cnt */ + UWORD32 u4_num_layers = ps_me_ctxt->u4_num_layers; + + /* temp var */ +// UWORD8 u1_prev_jump = NONE; +// UWORD8 u1_curr_jump = NONE; +// UWORD8 u1_next_jump; +// WORD32 mask_arr[5] = {15, 13, 14, 7, 11}; +// WORD32 mask; +// UWORD8 *apu1_ref[4]; +// WORD32 i, cnt; +// WORD32 dia[4][2] = {{-1, 0}, 
{1, 0}, {0, -1}, {0, 1}}; + + /* mv with best sad during initial evaluation */ + i2_mvx = ps_mb_part->s_mv_curr.i2_mvx; + i2_mvy = ps_mb_part->s_mv_curr.i2_mvy; + + i2_mv_u_x = i2_mvx; + i2_mv_u_y = i2_mvy; + + while (u4_num_layers--) + { + /* stop when any of the four neighbours would fall outside the search range */ + /* FIXME : is this the right way to check for out of bounds ? */ + if ( (i2_mvx - 1 < i4_srch_range_w) || + (i2_mvx + 1 > i4_srch_range_e) || + (i2_mvy - 1 < i4_srch_range_n) || + (i2_mvy + 1 > i4_srch_range_s) ) + { + break; + } + + pu1_ref = pu1_ref_mb + i2_mvx + (i2_mvy * i4_ref_strd); + + ps_me_ctxt->pf_ime_compute_sad4_diamond(pu1_ref, + pu1_curr_mb, + i4_ref_strd, + i4_src_strd, + i4_sad); + + DEBUG_SAD_HISTOGRAM_ADD(i4_sad[0], 2); + DEBUG_SAD_HISTOGRAM_ADD(i4_sad[1], 2); + DEBUG_SAD_HISTOGRAM_ADD(i4_sad[2], 2); + DEBUG_SAD_HISTOGRAM_ADD(i4_sad[3], 2); + + /* compute cost of the (x - 1), (x + 1), (y - 1) and (y + 1) neighbours, in that order */ + i4_cost[0] = i4_sad[0] + u4_lambda_motion * ( pu1_mv_bits[ ((i2_mvx - 1) << 2) - ps_mb_part->s_mv_pred.i2_mvx] + + pu1_mv_bits[(i2_mvy << 2) - ps_mb_part->s_mv_pred.i2_mvy] ); + i4_cost[1] = i4_sad[1] + u4_lambda_motion * ( pu1_mv_bits[ ((i2_mvx + 1) << 2) - ps_mb_part->s_mv_pred.i2_mvx] + + pu1_mv_bits[(i2_mvy << 2) - ps_mb_part->s_mv_pred.i2_mvy] ); + i4_cost[2] = i4_sad[2] + u4_lambda_motion * ( pu1_mv_bits[ (i2_mvx << 2) - ps_mb_part->s_mv_pred.i2_mvx] + + pu1_mv_bits[((i2_mvy - 1) << 2) - ps_mb_part->s_mv_pred.i2_mvy] ); + i4_cost[3] = i4_sad[3] + u4_lambda_motion * ( pu1_mv_bits[ (i2_mvx << 2) - ps_mb_part->s_mv_pred.i2_mvx] + + pu1_mv_bits[((i2_mvy + 1) << 2) - ps_mb_part->s_mv_pred.i2_mvy] ); + + + /* track the cheapest neighbour; i2_mv_u_x/y stay at the centre when none is cheaper */ + if (i4_cost_least > i4_cost[0]) + { + i4_cost_least = i4_cost[0]; + i4_distortion_least = i4_sad[0]; + + i2_mv_u_x = (i2_mvx - 1); + i2_mv_u_y = i2_mvy; + } + + if (i4_cost_least > i4_cost[1]) + { + i4_cost_least = i4_cost[1]; + i4_distortion_least = i4_sad[1]; + + i2_mv_u_x = (i2_mvx + 1); + i2_mv_u_y = i2_mvy; + } + + if (i4_cost_least > i4_cost[2]) + { + i4_cost_least = i4_cost[2]; + i4_distortion_least = i4_sad[2]; + + i2_mv_u_x = i2_mvx; +
i2_mv_u_y = i2_mvy - 1; + } + + if (i4_cost_least > i4_cost[3]) + { + i4_cost_least = i4_cost[3]; + i4_distortion_least = i4_sad[3]; + + i2_mv_u_x = i2_mvx; + i2_mv_u_y = i2_mvy + 1; + } + + if( (i2_mv_u_x == i2_mvx) && (i2_mv_u_y == i2_mvy)) + { + ps_mb_part->u4_exit = 1; + break; + } + else + { + i2_mvx = i2_mv_u_x; + i2_mvy = i2_mv_u_y; + } + + + } + + if (i4_cost_least < ps_mb_part->i4_mb_cost) + { + ps_mb_part->i4_mb_cost = i4_cost_least; + ps_mb_part->i4_mb_distortion = i4_distortion_least; + ps_mb_part->s_mv_curr.i2_mvx = i2_mvx; + ps_mb_part->s_mv_curr.i2_mvy = i2_mvy; + } + +} + + +/** +******************************************************************************* +* +* @brief This function computes the best motion vector among the tentative mv +* candidates chosen. +* +* @par Description: +* This function determines the position in the search window at which the motion +* estimation should begin in order to minimise the number of search iterations. +* +* @param[in] ps_mb_part +* pointer to current mb partition ctxt with respect to ME +* +* @param[in] u4_lambda_motion +* lambda motion +* +* @param[in] u4_fast_flag +* enable/disable fast sad computation +* +* @returns mv pair & corresponding distortion and cost +* +* @remarks none +* +******************************************************************************* +*/ +void ime_evaluate_init_srchposn_16x16 + ( + me_ctxt_t *ps_me_ctxt + ) +{ + UWORD32 u4_lambda_motion = ps_me_ctxt->u4_lambda_motion; + + /* candidate mv cnt */ + UWORD32 u4_num_candidates = ps_me_ctxt->u4_num_candidates; + + /* list of candidate mvs */ + ime_mv_t *ps_mv_list = ps_me_ctxt->as_mv_init_search; + + /* pointer to src macro block */ + UWORD8 *pu1_curr_mb = ps_me_ctxt->pu1_src_buf_luma; + UWORD8 *pu1_ref_mb = ps_me_ctxt->pu1_ref_buf_luma; + + /* strides */ + WORD32 i4_src_strd = ps_me_ctxt->i4_src_strd; + WORD32 i4_ref_strd = ps_me_ctxt->i4_rec_strd; + + /* enabled fast sad computation */ + UWORD32 u4_enable_fast_sad = 
ps_me_ctxt->u4_enable_fast_sad; + + /* SAD(distortion metric) of an 8x8 block */ + WORD32 i4_mb_distortion; + + /* cost = distortion + u4_lambda_motion * rate */ + WORD32 i4_mb_cost, i4_mb_cost_least = INT_MAX, i4_distortion_least = INT_MAX; + + /* mb partitions info */ + mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part; + + /* mv bits */ + UWORD8 *pu1_mv_bits = ps_me_ctxt->pu1_mv_bits; + + /* temp var */ + UWORD32 i, j, u4_srch_pos_idx = 0; + UWORD8 *pu1_ref = NULL; + WORD16 mv_x, mv_y; + + if (0) + { + /************************************************************/ + /* Compute SKIP Cost */ + /************************************************************/ + mv_x = ps_mv_list[SKIP_CAND].i2_mvx; + mv_y = ps_mv_list[SKIP_CAND].i2_mvy; + + /* adjust ref pointer */ + pu1_ref = pu1_ref_mb + mv_x + (mv_y * i4_ref_strd); + + /* compute distortion */ + ps_me_ctxt->pf_ime_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_ref, i4_src_strd, i4_ref_strd, i4_mb_cost_least, &i4_mb_distortion); + + /* for skip mode cost & distortion are identical + * But we shall add a bias to favor skip mode. + * Doc. JVT B118 Suggests SKIP_BIAS as 16. + * TODO : Empirical analysis of SKIP_BIAS is necessary */ + + i4_distortion_least = i4_mb_distortion; + + u4_srch_pos_idx = 0; + +#define SKIP_BIAS 8 + + i4_mb_cost_least = i4_mb_distortion - (u4_lambda_motion * SKIP_BIAS); + +#undef SKIP_BIAS + } + + + /* Carry out a search using each of the motion vector pairs identified above as predictors. 
*/ + /* TODO : Just like Skip, Do we need to add any bias to zero mv as well */ + for(i = 0; i < u4_num_candidates; i++) + { + /* compute sad */ + WORD32 c_sad = 1; + + for(j = 0; j < i; j++ ) + { + if ( (ps_mv_list[i].i2_mvx == ps_mv_list[j].i2_mvx) && + (ps_mv_list[i].i2_mvy == ps_mv_list[j].i2_mvy) ) + { + c_sad = 0; + break; + } + } + if(c_sad) + { + /* adjust ref pointer */ + pu1_ref = pu1_ref_mb + ps_mv_list[i].i2_mvx + (ps_mv_list[i].i2_mvy * i4_ref_strd); + + /* compute distortion */ + ps_me_ctxt->pf_ime_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_ref, i4_src_strd, i4_ref_strd, i4_mb_cost_least, &i4_mb_distortion); + DEBUG_SAD_HISTOGRAM_ADD(i4_mb_distortion, 3); + /* compute cost */ + i4_mb_cost = i4_mb_distortion + u4_lambda_motion * ( pu1_mv_bits[ (ps_mv_list[i].i2_mvx << 2) - ps_mb_part->s_mv_pred.i2_mvx] + + pu1_mv_bits[(ps_mv_list[i].i2_mvy << 2) - ps_mb_part->s_mv_pred.i2_mvy] ); + + if (i4_mb_cost < i4_mb_cost_least) + { + i4_mb_cost_least = i4_mb_cost; + + i4_distortion_least = i4_mb_distortion; + + u4_srch_pos_idx = i; + } + } + } + + if (i4_mb_cost_least < ps_mb_part->i4_mb_cost) + { + ps_mb_part->u4_srch_pos_idx = u4_srch_pos_idx; + ps_mb_part->i4_mb_cost = i4_mb_cost_least; + ps_mb_part->i4_mb_distortion = i4_distortion_least; + ps_mb_part->s_mv_curr.i2_mvx = ps_mv_list[u4_srch_pos_idx].i2_mvx; + ps_mb_part->s_mv_curr.i2_mvy = ps_mv_list[u4_srch_pos_idx].i2_mvy; + } +} + + +/** +******************************************************************************* +* +* @brief Searches for the best matching full pixel predictor within the search +* range +* +* @par Description: +* This function begins by computing the mv predict vector for the current mb. +* This is used for cost computations. Further basing on the algo. chosen, it +* looks through a set of candidate vectors that best represent the mb a least +* cost and returns this information. 
+* +* @param[in] ps_proc +* pointer to current proc ctxt +* +* @param[in] ps_me_ctxt +* pointer to me context +* +* @returns mv pair & corresponding distortion and cost +* +* @remarks none +* +******************************************************************************* +*/ +void ime_full_pel_motion_estimation_16x16 + ( + me_ctxt_t *ps_me_ctxt + ) +{ + /* mb part info */ + mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part; + + /******************************************************************/ + /* Modify Search range about initial candidate instead of zero mv */ + /******************************************************************/ + /* + * FIXME: The motion vectors in a way can become unbounded. It may so happen that + * MV might exceed the limit of the profile configured. + */ + ps_me_ctxt->i4_srch_range_w = MAX(ps_me_ctxt->i4_srch_range_w, + -ps_me_ctxt->ai2_srch_boundaries[0] + ps_mb_part->s_mv_curr.i2_mvx); + ps_me_ctxt->i4_srch_range_e = MIN(ps_me_ctxt->i4_srch_range_e, + ps_me_ctxt->ai2_srch_boundaries[0] + ps_mb_part->s_mv_curr.i2_mvx); + ps_me_ctxt->i4_srch_range_n = MAX(ps_me_ctxt->i4_srch_range_n, + -ps_me_ctxt->ai2_srch_boundaries[1] + ps_mb_part->s_mv_curr.i2_mvy); + ps_me_ctxt->i4_srch_range_s = MIN(ps_me_ctxt->i4_srch_range_s, + ps_me_ctxt->ai2_srch_boundaries[1] + ps_mb_part->s_mv_curr.i2_mvy); + + /************************************************************/ + /* Traverse about best initial candidate for mv */ + /************************************************************/ + + switch (ps_me_ctxt->u4_me_speed_preset) + { + case DMND_SRCH: + ime_diamond_search_16x16(ps_me_ctxt); + break; + default: + assert(0); + break; + } + + ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvx = ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvx << 2; + ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvy = ps_me_ctxt->s_mb_part.s_mv_curr.i2_mvy << 2; + +} + + +/** +******************************************************************************* +* +* @brief Searches for the best matching sub pixel 
predictor within the search +* range +* +* @par Description: +* This function begins by searching across all sub pixel sample points +* around the full pel motion vector. The vector with least cost is chosen as +* the mv for the current mb. If the skip mode is not evaluated while analysing +* the initial search candidates then analyse it here and update the mv. +* +* @param[in] ps_proc +* pointer to current proc ctxt +* +* @param[in] ps_me_ctxt +* pointer to me context +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ime_sub_pel_motion_estimation_16x16 + ( + me_ctxt_t *ps_me_ctxt + ) +{ + /* pointers to src & ref macro block */ + UWORD8 *pu1_curr_mb = ps_me_ctxt->pu1_src_buf_luma; + + + /* pointers to ref. half pel planes */ + UWORD8 *pu1_ref_mb_half_x; + UWORD8 *pu1_ref_mb_half_y; + UWORD8 *pu1_ref_mb_half_xy; + + /* pointers to ref. half pel planes */ + UWORD8 *pu1_ref_mb_half_x_temp; + UWORD8 *pu1_ref_mb_half_y_temp; + UWORD8 *pu1_ref_mb_half_xy_temp; + + /* strides */ + WORD32 i4_src_strd = ps_me_ctxt->i4_src_strd; + + WORD32 i4_ref_strd = ps_me_ctxt->u4_hp_buf_strd; + + /* mb partitions info */ + mb_part_ctxt *ps_mb_part = &ps_me_ctxt->s_mb_part; + + /* SAD(distortion metric) of an mb */ + WORD32 i4_mb_distortion; + WORD32 i4_distortion_least = ps_mb_part->i4_mb_distortion; + + /* cost = distortion + u4_lambda_motion * rate */ + WORD32 i4_mb_cost; + WORD32 i4_mb_cost_least = ps_mb_part->i4_mb_cost; + + /*Best half pel buffer*/ + UWORD8 *pu1_best_hpel_buf = NULL; + + + /* mv bits */ + UWORD8 *pu1_mv_bits = ps_me_ctxt->pu1_mv_bits; + + /* Motion vectors in full-pel units */ + WORD16 mv_x, mv_y; + + /* lambda - lagrange constant */ + UWORD32 u4_lambda_motion = ps_me_ctxt->u4_lambda_motion; + + /* Flags to check if half pel points needs to be evaluated */ + /**************************************/ + /* 1 bit for each half pel candidate */ + /* bit 0 - half x = 1, half y = 0 */ + 
/* bit 1 - half x = -1, half y = 0 */ + /* bit 2 - half x = 0, half y = 1 */ + /* bit 3 - half x = 0, half y = -1 */ + /* bit 4 - half x = 1, half y = 1 */ + /* bit 5 - half x = -1, half y = 1 */ + /* bit 6 - half x = 1, half y = -1 */ + /* bit 7 - half x = -1, half y = -1 */ + /**************************************/ + /* temp var */ + WORD16 i2_mv_u_x, i2_mv_u_y; + WORD32 i, j; + WORD32 ai4_sad[8]; + + i2_mv_u_x = ps_mb_part->s_mv_curr.i2_mvx; + i2_mv_u_y = ps_mb_part->s_mv_curr.i2_mvy; + + /************************************************************/ + /* Evaluate half pel */ + /************************************************************/ + mv_x = ps_mb_part->s_mv_curr.i2_mvx >> 2; + mv_y = ps_mb_part->s_mv_curr.i2_mvy >> 2; + + + /**************************************************************/ + /* ps_me_ctxt->pu1_half_x points to the half pel pixel on the */ + /* left side of full pel */ + /* ps_me_ctxt->pu1_half_y points to the half pel pixel on the */ + /* top side of full pel */ + /* ps_me_ctxt->pu1_half_xy points to the half pel pixel */ + /* on the top left side of full pel */ + /* for the function pf_ime_sub_pel_compute_sad_16x16 the */ + /* default positions are */ + /* ps_me_ctxt->pu1_half_x = right half_pel */ + /* ps_me_ctxt->pu1_half_y = bottom half_pel */ + /* ps_me_ctxt->pu1_half_xy = bottom right half_pel */ + /* Hence corresponding adjustments made here */ + /**************************************************************/ + /* NOTE(review): pu1_half_y is advanced by one pixel as well as one row, although the note above only calls for a row adjustment - confirm against the half pel plane layout */ + + pu1_ref_mb_half_x_temp = pu1_ref_mb_half_x = ps_me_ctxt->pu1_half_x + 1; + pu1_ref_mb_half_y_temp = pu1_ref_mb_half_y = ps_me_ctxt->pu1_half_y + 1 + i4_ref_strd; + pu1_ref_mb_half_xy_temp = pu1_ref_mb_half_xy = ps_me_ctxt->pu1_half_xy + 1 + i4_ref_strd; + + + ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16(pu1_curr_mb, pu1_ref_mb_half_x, + pu1_ref_mb_half_y, + pu1_ref_mb_half_xy, + i4_src_strd, i4_ref_strd, + ai4_sad); + + /* Half x plane */ + for(i = 0; i < 2; i++) + { + WORD32 mv_x_tmp = (mv_x << 2) + 2; + WORD32 mv_y_tmp =
(mv_y << 2); + + mv_x_tmp -= (i * 4); + + i4_mb_distortion = ai4_sad[i]; + + /* compute cost */ + i4_mb_cost = i4_mb_distortion + u4_lambda_motion * ( pu1_mv_bits[ mv_x_tmp - ps_mb_part->s_mv_pred.i2_mvx] + + pu1_mv_bits[mv_y_tmp - ps_mb_part->s_mv_pred.i2_mvy] ); + + if (i4_mb_cost < i4_mb_cost_least) + { + i4_mb_cost_least = i4_mb_cost; + + i4_distortion_least = i4_mb_distortion; + + i2_mv_u_x = mv_x_tmp; + + i2_mv_u_y = mv_y_tmp; + +#ifndef HP_PL /*choosing whether left or right half_x*/ + ps_me_ctxt->pu1_half_x = pu1_ref_mb_half_x_temp - i; + pu1_best_hpel_buf = pu1_ref_mb_half_x_temp - i; +#endif + } + + } + + /* Half y plane */ + for(i = 0; i < 2; i++) + { + WORD32 mv_x_tmp = (mv_x << 2); + WORD32 mv_y_tmp = (mv_y << 2) + 2; + + mv_y_tmp -= (i * 4); + + i4_mb_distortion = ai4_sad[2 + i]; + + /* compute cost */ + i4_mb_cost = i4_mb_distortion + u4_lambda_motion * ( pu1_mv_bits[ mv_x_tmp - ps_mb_part->s_mv_pred.i2_mvx] + + pu1_mv_bits[mv_y_tmp - ps_mb_part->s_mv_pred.i2_mvy] ); + + if (i4_mb_cost < i4_mb_cost_least) + { + i4_mb_cost_least = i4_mb_cost; + + i4_distortion_least = i4_mb_distortion; + + i2_mv_u_x = mv_x_tmp; + + i2_mv_u_y = mv_y_tmp; + +#ifndef HP_PL/*choosing whether top or bottom half_y*/ + ps_me_ctxt->pu1_half_y = pu1_ref_mb_half_y_temp - i*(i4_ref_strd); + pu1_best_hpel_buf = pu1_ref_mb_half_y_temp - i*(i4_ref_strd); +#endif + } + + } + + /* Half xy plane */ + for(j = 0; j < 2; j++) + { + for(i = 0; i < 2; i++) + { + WORD32 mv_x_tmp = (mv_x << 2) + 2; + WORD32 mv_y_tmp = (mv_y << 2) + 2; + + mv_x_tmp -= (i * 4); + mv_y_tmp -= (j * 4); + + i4_mb_distortion = ai4_sad[4 + i + 2 * j]; + + /* compute cost */ + i4_mb_cost = i4_mb_distortion + u4_lambda_motion * ( pu1_mv_bits[ mv_x_tmp - ps_mb_part->s_mv_pred.i2_mvx] + + pu1_mv_bits[mv_y_tmp - ps_mb_part->s_mv_pred.i2_mvy] ); + + if (i4_mb_cost < i4_mb_cost_least) + { + i4_mb_cost_least = i4_mb_cost; + + i4_distortion_least = i4_mb_distortion; + + i2_mv_u_x = mv_x_tmp; + + i2_mv_u_y = mv_y_tmp; + 
+#ifndef HP_PL /*choosing between four half_xy */ + ps_me_ctxt->pu1_half_xy = pu1_ref_mb_half_xy_temp - j*(i4_ref_strd) - i; + pu1_best_hpel_buf = pu1_ref_mb_half_xy_temp - j*(i4_ref_strd) - i; +#endif + } + + } + } + + ps_mb_part->i4_mb_cost = i4_mb_cost_least; + ps_mb_part->i4_mb_distortion = i4_distortion_least; + ps_mb_part->s_mv_curr.i2_mvx = i2_mv_u_x; + ps_mb_part->s_mv_curr.i2_mvy = i2_mv_u_y; + ps_mb_part->pu1_best_hpel_buf = pu1_best_hpel_buf; + +} + + +/** +******************************************************************************* +* +* @brief This function computes cost of skip macroblocks +* +* @par Description: +* +* @param[in] ps_me_ctxt +* pointer to me ctxt +* +* @param[in] ps_skip_mv +* pointer to skip mv +* +* @returns none +* +* @remarks +* NOTE: while computing the skip cost, do not enable early exit from compute +* sad function because, a negative bias gets added later +* +******************************************************************************* +*/ +void ime_compute_skip_cost + ( + me_ctxt_t *ps_me_ctxt, + void *pv_skip_mv, + mb_part_ctxt *ps_smb_part_info, + UWORD32 u4_use_stat_sad + ) +{ + + /* pointers to src & ref macro block */ + UWORD8 *pu1_curr_mb = ps_me_ctxt->pu1_src_buf_luma; + UWORD8 *pu1_ref_mb = ps_me_ctxt->pu1_ref_buf_luma; + + /* strides */ + WORD32 i4_src_strd = ps_me_ctxt->i4_src_strd; + WORD32 i4_ref_strd = ps_me_ctxt->i4_rec_strd; + + /* enabled fast sad computation */ + UWORD32 u4_enable_fast_sad = ps_me_ctxt->u4_enable_fast_sad; + + /* SAD(distortion metric) of an mb */ + WORD32 i4_mb_distortion; + + /* cost = distortion + u4_lambda_motion * rate */ + WORD32 i4_mb_cost; + + /* Motion vectors in full-pel units */ + WORD16 mv_x, mv_y; + + /* lambda - lagrange constant */ + UWORD32 u4_lambda_motion = ps_me_ctxt->u4_lambda_motion; + + /* skip mv */ + ime_mv_t *ps_skip_mv = pv_skip_mv, s_clip_skip_mv; + + /* temp var */ + UWORD8 *pu1_ref = NULL; + UWORD32 u4_is_nonzero; + + s_clip_skip_mv.i2_mvx = 
CLIP3(ps_me_ctxt->i4_srch_range_w, ps_me_ctxt->i4_srch_range_e, ps_skip_mv->i2_mvx); + s_clip_skip_mv.i2_mvy = CLIP3(ps_me_ctxt->i4_srch_range_n, ps_me_ctxt->i4_srch_range_s, ps_skip_mv->i2_mvy); + + /* NOTE(review): the skip mv is clipped against the search range as is, but is scaled by >> 2 below before it is used as a pixel offset - confirm that both are expressed in the same units */ + if ((s_clip_skip_mv.i2_mvx != ps_skip_mv->i2_mvx) || + (s_clip_skip_mv.i2_mvy != ps_skip_mv->i2_mvy)) + { + /* skip motion vector not within bounds */ + /* it is possible that mv is already evaluated */ + return ; + } + + mv_x = (ps_skip_mv->i2_mvx + 2) >> 2; + mv_y = (ps_skip_mv->i2_mvy + 2) >> 2; + + if ((mv_x << 2) != ps_skip_mv->i2_mvx || (mv_y << 2) != ps_skip_mv->i2_mvy) + { + /* skip mv is not a multiple of 4 in both components; it is not evaluated here */ + + + return ; + + + } + else + { + /* adjust ref pointer */ + pu1_ref = pu1_ref_mb + mv_x + (mv_y * i4_ref_strd); + } + + if(u4_use_stat_sad == 1) + { + ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16(pu1_curr_mb, pu1_ref, i4_src_strd, i4_ref_strd, + ps_me_ctxt->pu2_sad_thrsh, &i4_mb_distortion,&u4_is_nonzero); + + /* + *NOTE The check here is two fold + * One is checking if the sad has been reached, ie min sad, which is a configurable parameter + * If that is reached, we need not do any mode evaluation + * Similarly, if we find a distortion of zero there is no point in doing any further mode evaluation + * as sad is a non negative quantity + * hence in this case too, no further evaluation is necessary + */ + /* + *NOTE in case we need to disable the zero check using satdq, + * we need only to set u4_is_nonzero to a non zero value + */ + if(u4_is_nonzero==0 || i4_mb_distortion <= ps_me_ctxt->i4_min_sad) + { + ps_me_ctxt->u4_min_sad_reached = 1; /* found min sad*/ + ps_me_ctxt->i4_min_sad = (u4_is_nonzero == 0)?0:i4_mb_distortion; + } + } + else + { + ps_me_ctxt->pf_ime_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_ref, i4_src_strd, i4_ref_strd, INT_MAX, &i4_mb_distortion); + + if(i4_mb_distortion <= ps_me_ctxt->i4_min_sad) + { + ps_me_ctxt->i4_min_sad = i4_mb_distortion; + ps_me_ctxt->u4_min_sad_reached = 1; /* found min sad*/ + } + } + + /* for skip mode cost & distortion are identical
+ * But we shall add a bias to favor skip mode. + * Doc. JVT B118 Suggests SKIP_BIAS as 16. + * TODO : Empirical analysis of SKIP_BIAS is necessary */ +#define SKIP_BIAS 8 + i4_mb_cost = i4_mb_distortion - (u4_lambda_motion * SKIP_BIAS); +#undef SKIP_BIAS + + if (i4_mb_cost <= ps_smb_part_info->i4_mb_cost) + { + ps_smb_part_info->i4_mb_cost = i4_mb_cost; + ps_smb_part_info->i4_mb_distortion = i4_mb_distortion; + ps_smb_part_info->s_mv_curr.i2_mvx = ps_skip_mv->i2_mvx; + ps_smb_part_info->s_mv_curr.i2_mvy = ps_skip_mv->i2_mvy; + } +} + diff --git a/encoder/ime.h b/encoder/ime.h new file mode 100755 index 0000000..5c039e8 --- /dev/null +++ b/encoder/ime.h @@ -0,0 +1,209 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ime.h + * + * @brief + * Contains declarations of global variables for H264 encoder + * + * @author + * Ittiam + * + * @remarks + * + ******************************************************************************* + */ + +#ifndef IME_H_ +#define IME_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/** +****************************************************************************** + * @brief Number of iterations before exiting during diamond search +****************************************************************************** + */ +#define NUM_LAYERS 16 + + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + + +/** +******************************************************************************* +* +* @brief Diamond Search +* +* @par Description: +* This function computes the sad at vertices of several layers of diamond grid +* at a time. The number of layers of diamond grid that would be evaluated is +* configurable.The function computes the sad at vertices of a diamond grid. If +* the sad at the center of the diamond grid is lesser than the sad at any other +* point of the diamond grid, the function marks the candidate Mb partition as +* mv. +* +* @param[in] ps_mb_part +* pointer to current mb partition ctxt with respect to ME +* +* @param[in] ps_me_ctxt +* pointer to me context +* +* @param[in] u4_lambda +* lambda motion +* +* @param[in] u4_fast_flag +* enable/disable fast sad computation +* +* @returns mv pair & corresponding distortion and cost +* +* @remarks This module cannot be part of the final product due to its lack of +* computational feasibility. 
This is only for quality eval purposes. +* +******************************************************************************* +*/ +extern void ime_diamond_search_16x16(me_ctxt_t *ps_me_ctxt); + + +/** +******************************************************************************* +* +* @brief This function computes the best motion vector among the tentative mv +* candidates chosen. +* +* @par Description: +* This function determines the position in the search window at which the motion +* estimation should begin in order to minimise the number of search iterations. +* +* @param[in] ps_mb_part +* pointer to current mb partition ctxt with respect to ME +* +* @param[in] u4_lambda_motion +* lambda motion +* +* @param[in] u4_fast_flag +* enable/disable fast sad computation +* +* @returns mv pair & corresponding distortion and cost +* +* @remarks none +* +******************************************************************************* +*/ +extern void ime_evaluate_init_srchposn_16x16 + ( + me_ctxt_t *ps_me_ctxt + ); + +/** +******************************************************************************* +* +* @brief Searches for the best matching full pixel predictor within the search +* range +* +* @par Description: +* This function begins by computing the mv predict vector for the current mb. +* This is used for cost computations. Further basing on the algo. chosen, it +* looks through a set of candidate vectors that best represent the mb a least +* cost and returns this information. 
+* +* @param[in] ps_proc +* pointer to current proc ctxt +* +* @param[in] ps_me_ctxt +* pointer to me context +* +* @returns mv pair & corresponding distortion and cost +* +* @remarks none +* +******************************************************************************* +*/ +extern void ime_full_pel_motion_estimation_16x16 + ( + me_ctxt_t *ps_me_ctxt + ); + +/** +******************************************************************************* +* +* @brief Searches for the best matching sub pixel predictor within the search +* range +* +* @par Description: +* This function begins by searching across all sub pixel sample points +* around the full pel motion vector. The vector with least cost is chosen as +* the mv for the current mb. If the skip mode is not evaluated while analysing +* the initial search candidates then analyse it here and update the mv. +* +* @param[in] ps_proc +* pointer to current proc ctxt +* +* @param[in] ps_me_ctxt +* pointer to me context +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +extern void ime_sub_pel_motion_estimation_16x16 + ( + me_ctxt_t *ps_me_ctxt + ); + +/** +******************************************************************************* +* +* @brief This function computes cost of skip macroblocks +* +* @par Description: +* +* @param[in] ps_me_ctxt +* pointer to me ctxt +* +* @param[in] ps_skip_mv +* pointer to skip mv +* +* @returns none +* +* @remarks +* NOTE: while computing the skip cost, do not enable early exit from compute +* sad function because, a negative bias gets added later +* +******************************************************************************* +*/ +extern void ime_compute_skip_cost + ( + me_ctxt_t *ps_me_ctxt, + void *pv_skip_mv, + mb_part_ctxt *ps_smb_part_info, + UWORD32 u4_use_stat_sad + ); + + +#endif /* IME_H_ */ diff --git a/encoder/ime_defs.h b/encoder/ime_defs.h new file mode 100755 index 0000000..14d9c55 --- 
/dev/null +++ b/encoder/ime_defs.h @@ -0,0 +1,59 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ime_defs.h +* +* @brief +* Constant macros (candidate labels, directions, MB size, search types) used in the code +* +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IME_DEFS_H_ +#define _IME_DEFS_H_ + + +/* Macros to Label candidates */ +#define SKIP_CAND 0 +#define ZERO_CAND 1 +#define LEFT_CAND 2 +#define TOP_CAND 3 +#define TOPR_CAND 4 + +#define NONE 0 +#define LEFT 1 +#define RIGHT 2 +#define TOP 3 +#define BOTTOM 4 + +#define MB_SIZE 16 + +#define FULL_SRCH 0 +#define DMND_SRCH 100 +#define NSTEP_SRCH 50 +#define HEX_SRCH 75 + +#endif /*_IME_DEFS_H_*/ + diff --git a/encoder/ime_distortion_metrics.c b/encoder/ime_distortion_metrics.c new file mode 100755 index 0000000..23a1fbc --- /dev/null +++ b/encoder/ime_distortion_metrics.c @@ -0,0 +1,1262 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file ih264e_distortion_metrics.c +* +* @brief +* This file contains definitions of routines that compute distortion +* between two macro/sub blocks of identical dimensions +* +* @author +* Ittiam +* +* @par List of Functions: +* - ime_sub_pel_compute_sad_16x16() +* - ime_calculate_sad4_prog() +* - ime_calculate_sad3_prog() +* - ime_calculate_sad2_prog() +* - ime_compute_sad_16x16() +* - ime_compute_sad_16x16_fast() +* - ime_compute_sad_16x16_ea8() +* - ime_compute_sad_8x8() +* - ime_compute_sad_4x4() +* - ime_compute_sad_16x8() +* - ime_compute_satqd_16x16_lumainter() +* - ime_compute_satqd_8x16_chroma() +* - ime_compute_satqd_16x16_lumaintra() +* +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* User include files */ +#include "ime_typedefs.h" +#include "ime_defs.h" +#include "ime_macros.h" +#include "ime_statistics.h" +#include "ime_platform_macros.h" 
+#include "ime_distortion_metrics.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) at all subpel points about the src location +* +* @par Description +* This functions computes SAD at all points at a subpel distance from the +* current source location. +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_ref_half_x +* UWORD8 pointer to half pel buffer +* +* @param[out] pu1_ref_half_y +* UWORD8 pointer to half pel buffer +* +* @param[out] pu1_ref_half_xy +* UWORD8 pointer to half pel buffer +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ref_strd +* integer ref stride +* +* @param[out] pi4_sad +* integer evaluated sad +* pi4_sad[0] - half x +* pi4_sad[1] - half x - 1 +* pi4_sad[2] - half y +* pi4_sad[3] - half y - 1 +* pi4_sad[4] - half xy +* pi4_sad[5] - half xy - 1 +* pi4_sad[6] - half xy - strd +* pi4_sad[7] - half xy - 1 - strd +* +* @remarks +* +****************************************************************************** +*/ +void ime_sub_pel_compute_sad_16x16(UWORD8 *pu1_src, + UWORD8 *pu1_ref_half_x, + UWORD8 *pu1_ref_half_y, + UWORD8 *pu1_ref_half_xy, + WORD32 src_strd, + WORD32 ref_strd, + WORD32 *pi4_sad) +{ + UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1; + UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd; + UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1; + UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd; + UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1; + + WORD32 row, col; + + memset(pi4_sad, 0, 8 * sizeof(WORD32)); + + for(row = 0; row < MB_SIZE; row++) + { + for(col = 0; col < MB_SIZE; col++) + { + WORD32 src; + WORD32 diff; + + src = pu1_src[col]; + + diff = src - pu1_ref_half_x[col]; + 
pi4_sad[0] += ABS(diff); + + diff = src - pu1_ref_half_x_left[col]; + pi4_sad[1] += ABS(diff); + + diff = src - pu1_ref_half_y[col]; + pi4_sad[2] += ABS(diff); + + diff = src - pu1_ref_half_y_top[col]; + pi4_sad[3] += ABS(diff); + + diff = src - pu1_ref_half_xy[col]; + pi4_sad[4] += ABS(diff); + + diff = src - pu1_ref_half_xy_left[col]; + pi4_sad[5] += ABS(diff); + + diff = src - pu1_ref_half_xy_top[col]; + pi4_sad[6] += ABS(diff); + + diff = src - pu1_ref_half_xy_top_left[col]; + pi4_sad[7] += ABS(diff); + } + + pu1_src += src_strd; + + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + } +} + +/** +******************************************************************************* +* +* @brief compute sad +* +* @par Description: This function computes the sad at vertices of diamond grid +* centered at reference pointer and at unit distance from it. 
+* +* @param[in] pu1_ref +* UWORD8 pointer to the reference +* +* @param[out] pu1_src +* UWORD8 pointer to the source +* +* @param[in] ref_strd +* integer reference stride +* +* @param[in] src_strd +* integer source stride +* +* @param[out] pi4_sad +* pointer to integer array evaluated sad +* +* @returns sad at all evaluated vertexes +* +* @remarks none +* +******************************************************************************* +*/ +void ime_calculate_sad4_prog(UWORD8 *pu1_ref, + UWORD8 *pu1_src, + WORD32 ref_strd, + WORD32 src_strd, + WORD32 *pi4_sad) +{ + + /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */ + UWORD8 *left_ptr = pu1_ref - 1; + UWORD8 *right_ptr = pu1_ref + 1; + UWORD8 *top_ptr = pu1_ref - ref_strd; + UWORD8 *bot_ptr = pu1_ref + ref_strd; + + /* temp var */ + WORD32 count2, count3; + UWORD32 u4_ref_buf_offset = ref_strd - MB_SIZE; + UWORD32 u4_cur_buf_offset = src_strd - MB_SIZE; + + memset(pi4_sad, 0, 4 * sizeof(WORD32)); + + for(count2 = MB_SIZE; count2 > 0; count2--) + { + for(count3 = MB_SIZE; count3 > 0 ; count3--) + { + WORD32 src; + WORD32 diff; + + src = *pu1_src++; + + diff = src - *left_ptr++; + pi4_sad[0] += ABS(diff); + + diff = src - *right_ptr++; + pi4_sad[1] += ABS(diff); + + diff = src - *top_ptr++; + pi4_sad[2] += ABS(diff); + + diff = src - *bot_ptr++; + pi4_sad[3] += ABS(diff); + } + + bot_ptr += u4_ref_buf_offset; + left_ptr += u4_ref_buf_offset; + right_ptr += u4_ref_buf_offset; + top_ptr += u4_ref_buf_offset; + + pu1_src += u4_cur_buf_offset; + } + +} + +/** +******************************************************************************* +* +* @brief compute sad +* +* @par Description: This function computes the sad at vertices of diamond grid +* centered at reference pointer and at unit distance from it. 
+* +* @param[in] pu1_ref1, pu1_ref2, pu1_ref3 +* UWORD8 pointer to the reference +* +* @param[out] pu1_src +* UWORD8 pointer to the source +* +* @param[in] ref_strd +* integer reference stride +* +* @param[in] src_strd +* integer source stride +* +* @param[out] pi4_sad +* pointer to integer array evaluated sad +* +* @returns sad at all evaluated vertexes +* +* @remarks none +* +******************************************************************************* +*/ +void ime_calculate_sad3_prog(UWORD8 *pu1_ref1, + UWORD8 *pu1_ref2, + UWORD8 *pu1_ref3, + UWORD8 *pu1_src, + WORD32 ref_strd, + WORD32 src_strd, + WORD32 *pi4_sad) +{ + /* temp var */ + WORD32 i; + UWORD32 u4_ref_buf_offset = ref_strd - MB_SIZE; + UWORD32 u4_cur_buf_offset = src_strd - MB_SIZE; + + for(i = 16; i > 0; i--) + { + USADA8(pu1_src, pu1_ref1, pi4_sad[0]); + USADA8(pu1_src, pu1_ref2, pi4_sad[1]); + USADA8(pu1_src, pu1_ref3, pi4_sad[2]); + pu1_src += 4; + pu1_ref1 += 4; + pu1_ref2 += 4; + pu1_ref3 += 4; + + USADA8(pu1_src, pu1_ref1, pi4_sad[0]); + USADA8(pu1_src, pu1_ref2, pi4_sad[1]); + USADA8(pu1_src, pu1_ref3, pi4_sad[2]); + pu1_src += 4; + pu1_ref1 += 4; + pu1_ref2 += 4; + pu1_ref3 += 4; + + USADA8(pu1_src, pu1_ref1, pi4_sad[0]); + USADA8(pu1_src, pu1_ref2, pi4_sad[1]); + USADA8(pu1_src, pu1_ref3, pi4_sad[2]); + pu1_src += 4; + pu1_ref1 += 4; + pu1_ref2 += 4; + pu1_ref3 += 4; + + USADA8(pu1_src, pu1_ref1, pi4_sad[0]); + USADA8(pu1_src, pu1_ref2, pi4_sad[1]); + USADA8(pu1_src, pu1_ref3, pi4_sad[2]); + pu1_src += 4; + pu1_ref1 += 4; + pu1_ref2 += 4; + pu1_ref3 += 4; + + pu1_src += u4_cur_buf_offset; + pu1_ref1 += u4_ref_buf_offset; + pu1_ref2 += u4_ref_buf_offset; + pu1_ref3 += u4_ref_buf_offset; + } + +} + +/** +******************************************************************************* +* +* @brief compute sad +* +* @par Description: This function computes the sad at vertices of diamond grid +* centered at reference pointer and at unit distance from it. 
+* +* @param[in] pu1_ref1, pu1_ref2 +* UWORD8 pointer to the reference +* +* @param[out] pu1_src +* UWORD8 pointer to the source +* +* @param[in] ref_strd +* integer reference stride +* +* @param[in] src_strd +* integer source stride +* +* @param[out] pi4_sad +* pointer to integer array evaluated sad +* +* @returns sad at all evaluated vertexes +* +* @remarks none +* +******************************************************************************* +*/ +void ime_calculate_sad2_prog(UWORD8 *pu1_ref1, + UWORD8 *pu1_ref2, + UWORD8 *pu1_src, + WORD32 ref_strd, + WORD32 src_strd, + WORD32 *pi4_sad) +{ + /* temp var */ + WORD32 i; + UWORD32 u4_ref_buf_offset = ref_strd - MB_SIZE; + UWORD32 u4_cur_buf_offset = src_strd - MB_SIZE; + + for(i = 16; i > 0; i--) + { + USADA8(pu1_src, pu1_ref1, pi4_sad[0]); + USADA8(pu1_src, pu1_ref2, pi4_sad[1]); + pu1_src += 4; + pu1_ref1 += 4; + pu1_ref2 += 4; + + USADA8(pu1_src, pu1_ref1, pi4_sad[0]); + USADA8(pu1_src, pu1_ref2, pi4_sad[1]); + pu1_src += 4; + pu1_ref1 += 4; + pu1_ref2 += 4; + + USADA8(pu1_src, pu1_ref1, pi4_sad[0]); + USADA8(pu1_src, pu1_ref2, pi4_sad[1]); + pu1_src += 4; + pu1_ref1 += 4; + pu1_ref2 += 4; + + USADA8(pu1_src, pu1_ref1, pi4_sad[0]); + USADA8(pu1_src, pu1_ref2, pi4_sad[1]); + pu1_src += 4; + pu1_ref1 += 4; + pu1_ref2 += 4; + + pu1_src += u4_cur_buf_offset; + pu1_ref1 += u4_ref_buf_offset; + pu1_ref2 += u4_ref_buf_offset; + } + +} + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 16x16 blocks +* +* @par Description +* This functions computes SAD between 2 16x16 blocks. There is a provision +* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] i4_max_sad +* integer maximum allowed distortion +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_sad_16x16(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion) +{ + WORD32 i4_sad = 0; + UWORD32 u4_src_offset = src_strd - 16; + UWORD32 u4_est_offset = est_strd - 16; + UWORD32 i; + +GATHER_16x16_SAD_EE_STATS(gu4_16x16_sad_ee_stats, 16); + + for(i = 16; i > 0; i--) + { + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += 4; + pu1_est += 4; + + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += 4; + pu1_est += 4; + + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += 4; + pu1_est += 4; + + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += 4; + pu1_est += 4; + + /* early exit */ + if(i4_max_sad < i4_sad) + { + +GATHER_16x16_SAD_EE_STATS(gu4_16x16_sad_ee_stats, 16-i); + + *pi4_mb_distortion = i4_sad; + return ; + } + pu1_src += u4_src_offset; + pu1_est += u4_est_offset; + } + + *pi4_mb_distortion = i4_sad; + return ; +} + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode) +* +* @par Description +* This functions computes SAD between 2 16x16 blocks. There is a provision +* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] i4_max_sad +* integer maximum allowed distortion +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_sad_16x16_fast(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion) +{ + + WORD32 i4_sad = 0; + UWORD32 u4_src_offset = 2 * src_strd - 16; + UWORD32 u4_est_offset = 2 * est_strd - 16; + UWORD32 i; + + UNUSED(i4_max_sad); + + for(i = 16; i > 0; i-= 2) + { + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += 4; + pu1_est += 4; + + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += 4; + pu1_est += 4; + + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += 4; + pu1_est += 4; + + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += 4; + pu1_est += 4; + + pu1_src += u4_src_offset; + pu1_est += u4_est_offset; + } + + *pi4_mb_distortion = (i4_sad << 1); + return ; +} + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 8x8 blocks +* +* @par Description +* This functions computes SAD between 2 8x8 blocks. There is a provision +* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_max_sad +* integer maximum allowed distortion +* +* @param[out] i4_sad +* integer evaluated sad +* +* @remarks +* +****************************************************************************** + */ + +void ime_compute_sad_8x8(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion) +{ + WORD32 i4_sad = 0; + UWORD32 u4_src_offset = src_strd - 8; + UWORD32 u4_est_offset = est_strd - 8; + UWORD32 i, j; + WORD16 temp; + + for(i = 8; i > 0; i--) + { + for(j = 8; j > 0; j--) + { + /* SAD */ + temp = *pu1_src++ - *pu1_est++; + i4_sad += ABS(temp); + } + /* early exit */ + if(i4_max_sad < i4_sad) + { + *pi4_mb_distortion = i4_sad; + return; + } + pu1_src += u4_src_offset; + pu1_est += u4_est_offset; + } + *pi4_mb_distortion = i4_sad; +} + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 4x4 blocks +* +* @par Description +* This functions computes SAD between 2 4x4 blocks. There is a provision +* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_max_sad +* integer maximum allowed distortion +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_sad_4x4 + ( + UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion + ) +{ + WORD32 i4_sad = 0; + + UNUSED(i4_max_sad); + + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += src_strd; + pu1_est += est_strd; + + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += src_strd; + pu1_est += est_strd; + + USADA8(pu1_src, pu1_est, i4_sad); + pu1_src += src_strd; + pu1_est += est_strd; + + USADA8(pu1_src, pu1_est, i4_sad); + *pi4_mb_distortion = i4_sad; +} + + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 16x8 blocks +* +* +* @par Description +* This functions computes SAD between 2 16x8 blocks. There is a provision +* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_max_sad +* integer maximum allowed distortion +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_sad_16x8 + ( + UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion + ) +{ + WORD32 i4_sad = 0; + UWORD32 u4_src_offset = src_strd - 16; + UWORD32 u4_est_offset = est_strd - 16; + UWORD32 i, j; + WORD16 temp; + +GATHER_16x8_SAD_EE_STATS(gu4_16x8_sad_ee_stats, 8); + + for(i = 8; i > 0; i--) + { + for(j = 16; j > 0; j--) + { + /* SAD */ + temp = *pu1_src++ - *pu1_est++; + i4_sad += ABS(temp); + } + /* early exit */ + if(i4_max_sad < i4_sad) + { + +GATHER_16x8_SAD_EE_STATS(gu4_16x8_sad_ee_stats, 8-i); + + *pi4_mb_distortion = i4_sad; + + return; + } + pu1_src += u4_src_offset; + pu1_est += u4_est_offset; + } + + *pi4_mb_distortion = i4_sad; + return; + +} + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 16x16 blocks +* +* @par Description +* This functions computes SAD between 2 16x16 blocks. There is a provision +* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] i4_max_sad +* integer maximum allowed distortion +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_sad_16x16_ea8(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion) +{ + WORD32 i4_sad = 0; + UWORD32 u4_src_offset = src_strd - 16; + UWORD32 u4_est_offset = est_strd - 16; + UWORD32 i, j; + WORD16 temp; + UWORD8 *pu1_src_temp = pu1_src + src_strd; + UWORD8 *pu1_est_temp = pu1_est + est_strd; + + for(i = 16; i > 0; i -= 2) + { + for(j = 16; j > 0; j--) + { + /* SAD */ + temp = *pu1_src++ - *pu1_est++; + i4_sad += ABS(temp); + } + + pu1_src += (u4_src_offset + src_strd); + pu1_est += (u4_est_offset + est_strd); + + } + + /* early exit */ + if(i4_max_sad < i4_sad) + { + *pi4_mb_distortion = i4_sad; + return; + } + + pu1_src = pu1_src_temp; + pu1_est = pu1_est_temp; + + for(i = 16; i > 0; i -= 2) + { + for(j = 16; j > 0; j--) + { + /* SAD */ + temp = *pu1_src++ - *pu1_est++; + i4_sad += ABS(temp); + } + + pu1_src += u4_src_offset + src_strd; + pu1_est += u4_est_offset + est_strd; + } + + *pi4_mb_distortion = i4_sad; + return; +} + + +/** +******************************************************************************* +* +* @brief This function computes SAD between two 16x16 blocks +* It also computes if the block will be zero after H264 transform and quant for +* Intra 16x16 blocks +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] pu2_thrsh +* 
Threshold for each element of transofrmed quantized block +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @param[out] pu4_is_zero +* Poitner to store if the block is zero after transform and quantization +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_satqd_16x16_lumainter(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + UWORD16 *pu2_thrsh, + WORD32 *pi4_mb_distortion, + UWORD32 *pu4_is_non_zero) +{ + UWORD32 i,j; + WORD16 s1,s2,s3,s4,sad_1,sad_2,ls1,ls2,ls3,ls4,ls5,ls6,ls7,ls8; + UWORD8 *pu1_src_lp,*pu1_est_lp; + UWORD32 sad = 0; + + (*pi4_mb_distortion) = 0; + for(i=0;i<4;i++) + { + for(j=0;j<4;j++) + { + pu1_src_lp = pu1_src + 4*j; + pu1_est_lp = pu1_est + 4*j; + + s1 = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]); + s4 = ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]); + + pu1_src_lp += src_strd; + pu1_est_lp += est_strd; + + s2 = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]); + s3 = ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]); + + pu1_src_lp += src_strd; + pu1_est_lp += est_strd; + + s2 += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]); + s3 += ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]); + + pu1_src_lp += src_strd; + pu1_est_lp += est_strd; + + s1 += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]); + s4 += ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]); + + sad_1 = s1+s2+s3+s4; + + if(sad == 0) + { + sad_2 = sad_1<<1; + + ls1 = sad_2 -(s2 + s3); + ls2 = sad_2 -(s1 + s4); + ls3 = sad_2 -(s3 + s4); 
+ ls4 = sad_2 -(s3 - (s1<<1)); + ls5 = sad_2 -(s4 - (s2<<1)); + ls6 = sad_2 -(s1 + s2); + ls7 = sad_2 -(s2 - (s4<<1)); + ls8 = sad_2 -(s1 - (s3<<1)); + + if( + pu2_thrsh[8] <= sad_1 || + pu2_thrsh[0] <= ls2 || + pu2_thrsh[1] <= ls1 || + pu2_thrsh[2] <= ls8 || + pu2_thrsh[3] <= ls5 || + + pu2_thrsh[4] <= ls6 || + pu2_thrsh[5] <= ls3 || + pu2_thrsh[6] <= ls7 || + pu2_thrsh[7] <= ls4 + + )sad = 1; + } + (*pi4_mb_distortion) += sad_1; + } + pu1_src += (src_strd *4); + pu1_est += (est_strd *4); + } + *pu4_is_non_zero = sad; +} + + +/** +****************************************************************************** +* +* @brief computes distortion (SAD and SAQTD) between 2 16x8 (interleaved) chroma blocks +* +* +* @par Description +* This functions computes SAD between2 16x8 chroma blocks(interleaved) +* It also checks if the SATDD(Sum of absolute transformed wuqntized differnce beteern the blocks +* If SAQTD is zero, it gives back zero +* Other wise sad is retrned +* There is no provison for early exit +* +* The transform done here is the transform for chroma blocks in H264 +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] pu2_thrsh +* Threshold for each element of transofrmed quantized block +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* Fucntion code is nit updated. 
+* Will require debugging and minor modifications +* +****************************************************************************** +*/ +void ime_compute_satqd_8x16_chroma(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 max_sad, + UWORD16 *thrsh) +{ + WORD32 i,j,plane; + WORD16 s1,s2,s3,s4,sad_1,sad_2,ls1,ls2,ls3,ls4,ls5,ls6,ls7,ls8; + UWORD8 *pu1_src_lp,*pu1_est_lp,*pu1_src_plane,*pu1_est_plane; + WORD32 sad =0; + UNUSED(max_sad); + + pu1_src_plane = pu1_src; + pu1_est_plane = pu1_est; + + for(plane =0;plane<2;plane++) + { + for(i=0;i<4;i++) + { + for(j=0;j<4;j++) + { + pu1_src_lp = pu1_src + 8*j; + pu1_est_lp = pu1_est + 8*j; + + s1 = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[6] - (WORD16)pu1_est_lp[6]); + s4 = ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2])+ ABS((WORD16)pu1_src_lp[4] - (WORD16)pu1_est_lp[4]); + + pu1_src_lp += src_strd; + pu1_est_lp += est_strd; + + s2 = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[6] - (WORD16)pu1_est_lp[6]); + s3 = ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2])+ ABS((WORD16)pu1_src_lp[4] - (WORD16)pu1_est_lp[4]); + + pu1_src_lp += src_strd; + pu1_est_lp += est_strd; + + s2 += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[6] - (WORD16)pu1_est_lp[6]); + s3 += ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2])+ ABS((WORD16)pu1_src_lp[4] - (WORD16)pu1_est_lp[4]); + + pu1_src_lp += src_strd; + pu1_est_lp += est_strd; + + s1 += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[6] - (WORD16)pu1_est_lp[6]); + s4 += ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2])+ ABS((WORD16)pu1_src_lp[4] - (WORD16)pu1_est_lp[4]); + + sad_1 = s1+s2+s3+s4; + sad_2 = sad_1<<1; + + ls1 = sad_2 -(s2 + s3); + ls2 = sad_2 -(s1 + s4); + ls3 = sad_2 -(s3 + s4); + ls4 = sad_2 -(s3 - (s1<<1)); + ls5 = sad_2 -(s4 - (s2<<1)); + ls6 = sad_2 -(s1 + s2); + ls7 = sad_2 -(s2 - (s4<<1)); + ls8 = sad_2 -(s1 - 
(s3<<1)); + + if( + //thrsh[0] > sad_1 && Chroma Dc is checked later + thrsh[1] > ls1 && + thrsh[2] > sad_1 && + thrsh[3] > ls2 && + + thrsh[4] > ls3 && + thrsh[5] > ls4 && + thrsh[6] > ls3 && + thrsh[7] > ls5 && + + thrsh[8] > sad_1 && + thrsh[9] > ls1 && + thrsh[10]> sad_1 && + thrsh[11]> ls2 && + + thrsh[12]> ls6 && + thrsh[13]> ls7 && + thrsh[14]> ls6 && + thrsh[15]> ls8 + ) + { + /*set current sad to be zero*/ + } + else + return ; + + sad += sad_1; + } + pu1_src += (src_strd *4); + pu1_est += (est_strd *4); + } + if(sad < (thrsh[0]<<1))sad = 0; + else return ; + + pu1_src = pu1_src_plane+1; + pu1_est = pu1_est_plane+1; + } + return ; +} + + +/** +****************************************************************************** +* +* @brief computes distortion (SAD and SAQTD) between 2 16x16 blocks +* +* @par Description +* This functions computes SAD between 2 16x16 blocks. +* It also checks if the SATDD(Sum of absolute transformed wuqntized differnce beteern the blocks +* If SAQTD is zero, it gives back zero +* Other wise sad is retrned +* There is no provison for early exit +* +* The transform done here is the transform for inter 16x16 blocks in H264 +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] pu2_thrsh +* Threshold for each element of transofrmed quantized block +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_satqd_16x16_lumaintra(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 max_sad, + UWORD16 *thrsh, + WORD32 *pi4_mb_distortion, + UWORD8 *sig_nz_sad) +{ + UWORD32 i,j; + WORD16 s1[4],s2[4],s3[4],s4[4],sad[4]; + UWORD8 *pu1_src_lp,*pu1_est_lp; + UWORD8 *sig_sad_dc; + UWORD32 nz_sad_sig = 0; + 
UNUSED(max_sad); + *pi4_mb_distortion =0; + + sig_sad_dc = sig_nz_sad; + sig_nz_sad++; + + for(i=0;i<4;i++) + { + for(j=0;j<4;j++) + { + pu1_src_lp = pu1_src + 4*j; + pu1_est_lp = pu1_est + 4*j; + + s1[j] = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]); + s4[j] = ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]); + + pu1_src_lp += src_strd; + pu1_est_lp += est_strd; + + s2[j] = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]); + s3[j] = ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]); + + pu1_src_lp += src_strd; + pu1_est_lp += est_strd; + + s2[j] += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]); + s3[j] += ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]); + + pu1_src_lp += src_strd; + pu1_est_lp += est_strd; + + s1[j] += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]); + s4[j] += ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]); + + sad[j] = ((s1[j]+s2[j]+s3[j]+s4[j])<<1); + } + + for(j=0;j<4;j++) + { + + if( + //thrsh[0] > (sad[j] >> 1) &&Dc goes in the other part + thrsh[1] > (sad[j] -(s2[j] + s3[j])) && + thrsh[2] > (sad[j]>>1) && + thrsh[3] > (sad[j] -(s1[j] + s4[j])) && + + thrsh[4] > (sad[j] -(s3[j] + s4[j])) && + thrsh[5] > (sad[j] -(s3[j] - (s1[j]<<1))) && + thrsh[6] > (sad[j] -(s3[j] + s4[j])) && + thrsh[7] > (sad[j] -(s4[j] - (s2[j]<<1))) && + + thrsh[8] > (sad[j]>>1) && + thrsh[9] > (sad[j] -(s2[j] + s3[j])) && + thrsh[10]> (sad[j]>>1) && + thrsh[11]> (sad[j] -(s1[j] + s4[j])) && + + thrsh[12]> (sad[j] -(s1[j] + s2[j])) && + thrsh[13]> (sad[j] -(s2[j] - (s4[j]<<1))) && + thrsh[14]> (sad[j] -(s1[j] + s2[j])) && + thrsh[15]> 
(sad[j] -(s1[j] - (s3[j]<<1))) + ) + { + //sad[j] = 0; /*set current sad to be zero*/ + sig_nz_sad[j] = 0;/*Signal that the sad is zero*/ + } + else + { + sig_nz_sad[j] = 1;/*signal that sad is non zero*/ + nz_sad_sig = 1; + } + + (*pi4_mb_distortion) += (sad[j]>>1); + //if((*pi4_mb_distortion) >= max_sad)return; /*return or some thing*/ + } + + sig_nz_sad += 4; + pu1_src += (src_strd *4); + pu1_est += (est_strd *4); + } + + if((*pi4_mb_distortion) < thrsh[0]<<2) + { + *sig_sad_dc = 0; + if(nz_sad_sig == 0)(*pi4_mb_distortion) = 0; + } + else *sig_sad_dc = 1; +} + diff --git a/encoder/ime_distortion_metrics.h b/encoder/ime_distortion_metrics.h new file mode 100755 index 0000000..a30e1fc --- /dev/null +++ b/encoder/ime_distortion_metrics.h @@ -0,0 +1,170 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file ih264e_distortion_metrics.h +* +* @brief +* This file contains declarations of routines that compute distortion +* between two macro/sub blocks of identical dimensions +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef IME_DISTORTION_METRICS_H_ +#define IME_DISTORTION_METRICS_H_ + + +/*****************************************************************************/ +/* Type definitions for function prototypes */ +/*****************************************************************************/ + +typedef void ime_compute_sad_ft(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion); + +typedef void ime_compute_sad4_diamond(UWORD8 *pu1_ref, + UWORD8 *pu1_src, + WORD32 ref_strd, + WORD32 src_strd, + WORD32 *pi4_sad); + +typedef void ime_compute_sad3_diamond(UWORD8 *pu1_ref1, + UWORD8 *pu1_ref2, + UWORD8 *pu1_ref3, + UWORD8 *pu1_src, + WORD32 ref_strd, + WORD32 src_strd, + WORD32 *pi4_sad); + +typedef void ime_compute_sad2_diamond(UWORD8 *pu1_ref1, + UWORD8 *pu1_ref2, + UWORD8 *pu1_src, + WORD32 ref_strd, + WORD32 src_strd, + WORD32 *pi4_sad); + +typedef void ime_sub_pel_compute_sad_16x16_ft(UWORD8 *pu1_src, + UWORD8 *pu1_ref_half_x, + UWORD8 *pu1_ref_half_y, + UWORD8 *pu1_ref_half_xy, + WORD32 src_strd, + WORD32 ref_strd, + WORD32 *pi4_sad); + +typedef void ime_compute_sad_stat(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + UWORD16 *pu2_thrsh, + WORD32 *pi4_mb_distortion, + UWORD32 *pu4_is_zero); + +typedef void ime_compute_satqd_16x16_lumainter_ft(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + UWORD16 *pu2_thrsh, + WORD32 *pi4_mb_distortion, + UWORD32 *pu4_is_zero); + +typedef void ime_compute_satqd_8x16_chroma_ft(UWORD8 *pu1_src, + UWORD8 
*pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + UWORD16 *thrsh); + +typedef void ime_compute_satqd_16x16_lumaintra_ft(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + UWORD16 *thrsh, + WORD32 *pi4_mb_distortion, + UWORD8 *sig_nz_sad); + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +ime_compute_sad_ft ime_compute_sad_16x16; +ime_compute_sad_ft ime_compute_sad_16x16_fast; +ime_compute_sad_ft ime_compute_sad_16x8; +ime_compute_sad_ft ime_compute_sad_16x16_ea8; +ime_compute_sad_ft ime_compute_sad_8x8; +ime_compute_sad_ft ime_compute_sad_4x4; +ime_compute_sad4_diamond ime_calculate_sad4_prog; +ime_compute_sad3_diamond ime_calculate_sad3_prog; +ime_compute_sad2_diamond ime_calculate_sad2_prog; +ime_sub_pel_compute_sad_16x16_ft ime_sub_pel_compute_sad_16x16; +ime_compute_sad_stat ime_compute_16x16_sad_stat; +ime_compute_satqd_16x16_lumainter_ft ime_compute_satqd_16x16_lumainter; +ime_compute_satqd_8x16_chroma_ft ime_compute_satqd_8x16_chroma; +ime_compute_satqd_16x16_lumaintra_ft ime_compute_satqd_16x16_lumaintra; + +/*SSE4.2 Declarations*/ +ime_compute_sad_ft ime_compute_sad_16x16_sse42; +ime_compute_sad_ft ime_compute_sad_16x16_fast_sse42; +ime_compute_sad_ft ime_compute_sad_16x8_sse42; +ime_compute_sad_ft ime_compute_sad_16x16_ea8_sse42; +ime_sub_pel_compute_sad_16x16_ft ime_sub_pel_compute_sad_16x16_sse42; +ime_compute_sad4_diamond ime_calculate_sad4_prog_sse42; +ime_compute_satqd_16x16_lumainter_ft ime_compute_satqd_16x16_lumainter_sse42; + +/* assembly */ +ime_compute_sad_ft ime_compute_sad_16x16_a9q; +ime_compute_sad_ft ime_compute_sad_16x16_fast_a9q; +ime_compute_sad_ft ime_compute_sad_16x8_a9q; +ime_compute_sad_ft ime_compute_sad_16x16_ea8_a9q; +ime_compute_sad4_diamond ime_calculate_sad4_prog_a9q; +ime_compute_sad3_diamond 
ime_calculate_sad3_prog_a9q; +ime_compute_sad2_diamond ime_calculate_sad2_prog_a9q; +ime_sub_pel_compute_sad_16x16_ft ime_sub_pel_compute_sad_16x16_a9q; +ime_compute_sad_stat ime_compute_16x16_sad_stat_a9; +ime_compute_satqd_16x16_lumainter_ft ime_compute_satqd_16x16_lumainter_a9q; + + +/* assembly - AV8 declarations */ +ime_compute_sad_ft ime_compute_sad_16x16_av8; +ime_compute_sad_ft ime_compute_sad_16x16_fast_av8; +ime_compute_sad_ft ime_compute_sad_16x8_av8; +ime_compute_sad_ft ime_compute_sad_16x16_ea8_av8; +ime_compute_sad4_diamond ime_calculate_sad4_prog_av8; +ime_compute_sad3_diamond ime_calculate_sad3_prog_av8; +ime_compute_sad2_diamond ime_calculate_sad2_prog_av8; +ime_sub_pel_compute_sad_16x16_ft ime_sub_pel_compute_sad_16x16_av8; +ime_compute_sad_stat ime_compute_16x16_sad_stat_av8; +ime_compute_satqd_16x16_lumainter_ft ime_compute_satqd_16x16_lumainter_av8; + + +#endif /* IME_DISTORTION_METRICS_H_ */ + + diff --git a/encoder/ime_macros.h b/encoder/ime_macros.h new file mode 100755 index 0000000..a7b8c65 --- /dev/null +++ b/encoder/ime_macros.h @@ -0,0 +1,44 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ime_macros.h +* +* @brief +* Generic helper macros (ABS, MAX, MIN, CLIP3, UNUSED) used in the code +* +* +* @remarks +* Every macro argument is parenthesised; arguments may still be evaluated more than once, so avoid side effects +* +******************************************************************************* +*/ +#ifndef _IME_MACROS_H_ +#define _IME_MACROS_H_ + +#define ABS(x) ((x) < 0 ? (-(x)) : (x)) +#define MAX(a,b) (((a) > (b))?(a):(b)) +#define MIN(a,b) (((a) < (b))?(a):(b)) + +#define CLIP3(miny, maxy, y) (((y) < (miny))?(miny):(((y) > (maxy))?(maxy):(y))) +#define UNUSED(x) ((void)(x)) + +#endif /*_IME_MACROS_H_*/ diff --git a/encoder/ime_statistics.h b/encoder/ime_statistics.h new file mode 100755 index 0000000..eeacaf2 --- /dev/null +++ b/encoder/ime_statistics.h @@ -0,0 +1,86 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ime_statistics.h +* +* @brief +* Optional statistics hooks: SAD early exit counters and debug histogram macros, compiled out when their enable flags are 0 +* +* +* @remarks +* DEBUG_HISTOGRAM_DUMP expands to an unbraced if statement; a following else at the call site would bind to it +* +******************************************************************************* +*/ +#ifndef _IME_STATISTICS_H_ +#define _IME_STATISTICS_H_ +#define DEBUG_HISTOGRAM_ENABLE 0 +#define SAD_EXIT_STATS 0 + + +#if SAD_EXIT_STATS + +/** +****************************************************************************** +* @brief While computing sad, if we want to do an early exit, how often we +* should check if the sad computed till now has exceeded min sad param is +* chosen statistically. +* ****************************************************************************** +*/ +extern UWORD32 gu4_16x16_sad_ee_stats[16+1]; +extern UWORD32 gu4_16x8_sad_ee_stats[8+1]; + +/** +****************************************************************************** +* @brief print sad early exit stats +****************************************************************************** +*/ +extern void print_sad_ee_stats(void); + +#define GATHER_16x16_SAD_EE_STATS(gu4_16x16_sad_ee_stats, i) \ + gu4_16x16_sad_ee_stats[i]++; +#define GATHER_16x8_SAD_EE_STATS(gu4_16x8_sad_ee_stats, i) \ + gu4_16x8_sad_ee_stats[i]++; + +#else + +#define GATHER_16x16_SAD_EE_STATS(gu4_16x16_sad_ee_stats, i) +#define GATHER_16x8_SAD_EE_STATS(gu4_16x8_sad_ee_stats, i) + +#endif + + +#if DEBUG_HISTOGRAM_ENABLE +#define DEBUG_HISTOGRAM_INIT() debug_histogram_init() +#define DEBUG_HISTOGRAM_DUMP(condition) if(condition) debug_histogram_dump() +#define DEBUG_MV_HISTOGRAM_ADD(mv_x, mv_y) debug_mv_histogram_add(mv_x, mv_y) +#define DEBUG_SAD_HISTOGRAM_ADD(sad, level) debug_sad_histogram_add(sad, level) +#else +#define DEBUG_HISTOGRAM_INIT() +#define DEBUG_HISTOGRAM_DUMP(condition) +#define DEBUG_MV_HISTOGRAM_ADD(mv_x, mv_y) +#define DEBUG_SAD_HISTOGRAM_ADD(sad, level) +#endif + + + +#endif /*_IME_STATISTICS_H_*/ diff 
--git a/encoder/ime_structs.h b/encoder/ime_structs.h new file mode 100755 index 0000000..7819b91 --- /dev/null +++ b/encoder/ime_structs.h @@ -0,0 +1,305 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264e_me.h + * + * @brief + * + * + * @author + * Ittiam + * + * @par List of Functions: + * - + * + * @remarks + * None + * + ******************************************************************************* + */ + +#ifndef _IME_STRUCTS_H_ +#define _IME_STRUCTS_H_ + +/** + * Motion vector + */ +typedef struct +{ + /** + * Horizontal Motion Vector + */ + WORD16 i2_mvx; + + /** + * Vertical Motion Vector + */ + WORD16 i2_mvy; +} ime_mv_t; + + +/** +************************************************************************** +* @brief mb_part_ctxt +* +* Structure that would hold the information for individual MB partitions +* gathered during the full pel ME stage +************************************************************************** +*/ +typedef struct +{ + /** + * best mvs + */ + ime_mv_t s_mv_curr; + + /** + * mv predictor + */ + ime_mv_t s_mv_pred; + + /** + * SAD associated with the MB partition + */ + WORD32 i4_mb_distortion; + + /** + * cost for the MB partition + */ + WORD32 i4_mb_cost; + + /** + * Search position for least cost among the list of candidates + */ + UWORD32 u4_srch_pos_idx; + + /** + * Search position for least cost among the list of candidates + */ + UWORD32 u4_exit; + + /* + * Buffer corresponding to best half pel cost + */ + UWORD8 *pu1_best_hpel_buf; + +} mb_part_ctxt; + + +/** +************************************************************************** +* @brief me_ctxt_t +* +* Structure encapsulating the parameters used in the motion estimation +* context +************************************************************************** +*/ +typedef struct +{ + /** + * Ref pointer to current MB luma + */ + UWORD8 *pu1_ref_buf_luma; + + /** + * Src pointer to current MB luma + */ + UWORD8 *pu1_src_buf_luma; + + /** + * source stride + * (strides for luma and chroma are the same) + */ + WORD32 i4_src_strd; + + /** + * recon 
stride + * (strides for luma and chroma are the same) + */ + WORD32 i4_rec_strd; + + /** + * Offset for half pel x plane from the pic buf + */ + UWORD32 u4_half_x_offset; + + /** + * Offset for half pel y plane from half x plane + */ + UWORD32 u4_half_y_offset; + + /** + * Offset for half pel xy plane from half y plane + */ + UWORD32 u4_half_xy_offset; + + /** + * Search range in the X, Y axis in terms of pixels + */ + WORD32 ai2_srch_boundaries[2]; + + /** + * Search range in the north direction in terms of pixels + */ + WORD32 i4_srch_range_n; + + /** + * Search range in the south direction in terms of pixels + */ + WORD32 i4_srch_range_s; + + /** + * Search range in the east direction in terms of pixels + */ + WORD32 i4_srch_range_e; + + /** + * Search range in the west direction in terms of pixels + */ + WORD32 i4_srch_range_w; + + /** + * left mb motion vector + */ + ime_mv_t s_left_mv; + + /** + * top left mb motion vector + */ + ime_mv_t s_top_left_mv; + + /** + * Number of valid candidates for the Initial search position + */ + UWORD32 u4_num_candidates; + + /** + * Motion vector predictors derived from neighbouring + * blocks for each of the six block partitions + */ + ime_mv_t as_mv_init_search[5]; + + /** + * mv bits + */ + UWORD8 *pu1_mv_bits; + + /** + * lambda (lagrange multiplier for cost computation) + */ + UWORD32 u4_lambda_motion; + + /** + * enabled fast sad computation + */ + UWORD32 u4_enable_fast_sad; + + /* + * Enable SKIP block prediction based on SATQD + */ + UWORD32 u4_enable_stat_sad; + + /* + * Minimum distortion to search for + * */ + WORD32 i4_min_sad; + + /* + * Signal that minimum sad has been reached in ME + * */ + UWORD32 u4_min_sad_reached; + + /** + * Flag to enable/disbale half pel motion estimation + */ + UWORD32 u4_enable_hpel; + + /** + * Diamond search Iteration Max Cnt + */ + UWORD32 u4_num_layers; + + /** + * encoder me speed + */ + UWORD32 u4_me_speed_preset; + + UWORD32 u4_left_is_intra; + + UWORD32 u4_left_is_skip; + + 
/** + * Structure to store the MB partition info + */ + mb_part_ctxt s_mb_part; + /* + * Threshold to compare the sad with + */ + UWORD16 *pu2_sad_thrsh; + + /** + * fn ptrs for compute sad routines + */ + ime_compute_sad_ft *pf_ime_compute_sad_16x16[2]; + ime_compute_sad_ft *pf_ime_compute_sad_16x8; + ime_compute_sad4_diamond *pf_ime_compute_sad4_diamond; + ime_compute_sad3_diamond *pf_ime_compute_sad3_diamond; + ime_compute_sad2_diamond *pf_ime_compute_sad2_diamond; + ime_sub_pel_compute_sad_16x16_ft *pf_ime_sub_pel_compute_sad_16x16; + + /* + * Function poitners for SATQD + */ + ime_compute_sad_stat *pf_ime_compute_sad_stat_luma_16x16; + + /** + * Qp + */ + UWORD8 u1_mb_qp; + + /* + * Buffers for holding half_x , half_y and half_xy + * values when halfpel generation + * for the entire plane is not enabled + */ + UWORD8 *pu1_half_x; + UWORD8 *pu1_half_y; + UWORD8 *pu1_half_xy; + + + /* + * Buffers to store the best halfpel plane* + */ + UWORD8 *pu1_hpel_buf; + + /* + * Stride for hpel buffer + */ + UWORD32 u4_hpel_buf_strd; + + WORD32 u4_hp_buf_strd; + +} me_ctxt_t; + + +#endif // _IME_STRUCTS_H_ + diff --git a/encoder/ime_typedefs.h b/encoder/ime_typedefs.h new file mode 100755 index 0000000..d36632d --- /dev/null +++ b/encoder/ime_typedefs.h @@ -0,0 +1,50 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ime_typedefs.h +* +* @brief +* Fixed width integer type definitions used in the code +* +* +* @remarks +* The 64 bit types use long long: plain long is only 32 bits wide on ILP32 and LLP64 targets +* +******************************************************************************* +*/ +#ifndef _IME_TYPEDEFS_H_ +#define _IME_TYPEDEFS_H_ + + +typedef unsigned char UWORD8; +typedef unsigned short UWORD16; +typedef unsigned int UWORD32; +typedef unsigned long long UWORD64; + +typedef signed char WORD8; +typedef short WORD16; +typedef int WORD32; +typedef long long WORD64; + +typedef char CHAR; + +#endif /*_IME_TYPEDEFS_H_*/ diff --git a/encoder/irc_bit_allocation.c b/encoder/irc_bit_allocation.c new file mode 100755 index 0000000..1dfd9de --- /dev/null +++ b/encoder/irc_bit_allocation.c @@ -0,0 +1,859 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** Includes */ +#include <stdio.h> +#include <string.h> +#include "irc_datatypes.h" +#include "irc_mem_req_and_acq.h" +#include "irc_common.h" +#include "irc_cntrl_param.h" +#include "irc_fixed_point_error_bits.h" +#include "irc_rd_model.h" +#include "irc_est_sad.h" +#include "irc_picture_type.h" +#include "irc_bit_allocation.h" +#include "irc_trace_support.h" + +/** Macros **/ +#define MIN(x,y) ((x) < (y))? (x) : (y) + +/* State structure for bit allocation */ +typedef struct +{ + /* using var_q number as it can cross 31 bits for large intra frameinterval */ + number_t vq_rem_bits_in_period; + + /* Storing inputs */ + WORD32 i4_tot_frms_in_gop; + + WORD32 i4_num_intra_frm_interval; + + WORD32 i4_bits_per_frm; + +} rem_bit_in_prd_t; + +typedef struct bit_allocation_t +{ + rem_bit_in_prd_t s_rbip; + + /* A universal constant giving the relative complexity between pictures */ + WORD32 i2_K[MAX_PIC_TYPE]; + + /* To get a estimate of the header bits consumed */ + WORD32 i4_prev_frm_header_bits[MAX_PIC_TYPE]; + + WORD32 i4_bits_per_frm; + + WORD32 i4_num_gops_in_period; + + /* Num gops as set by rate control module */ + WORD32 i4_actual_num_gops_in_period; + + number_t vq_saved_bits; + + WORD32 i4_max_bits_per_frm[MAX_NUM_DRAIN_RATES]; + + WORD32 i4_min_bits_per_frm; + + /* Error bits module */ + error_bits_handle ps_error_bits; + + /* Storing frame rate */ + WORD32 i4_frame_rate; + + WORD32 i4_bit_rate; + + WORD32 ai4_peak_bit_rate[MAX_NUM_DRAIN_RATES]; + +} bit_allocation_t; + +static WORD32 get_number_of_frms_in_a_gop(pic_handling_handle ps_pic_handling) +{ + WORD32 i4_tot_frms_in_gop = 0, i; + WORD32 ai4_frms_in_gop[MAX_PIC_TYPE]; + + /* Query the pic_handling struct for the rem frames in the period */ + irc_pic_type_get_frms_in_gop(ps_pic_handling, ai4_frms_in_gop); + + /* Get the total frms in the gop */ + i4_tot_frms_in_gop = 0; + for(i = 0; i < MAX_PIC_TYPE; i++) + { + i4_tot_frms_in_gop += ai4_frms_in_gop[i]; + } + return 
(i4_tot_frms_in_gop); +} + +static void init_rbip(rem_bit_in_prd_t *ps_rbip, + pic_handling_handle ps_pic_handling, + WORD32 i4_bits_per_frm, + WORD32 i4_num_intra_frm_interval) +{ + WORD32 i4_tot_frms_in_gop = get_number_of_frms_in_a_gop(ps_pic_handling); + + /* rem_bits_in_period = bits_per_frm * tot_frms_in_gop * num_intra_frm_interval */ + { + number_t vq_bits_per_frm, vq_tot_frms_in_gop, vq_num_intra_frm_interval; + number_t *pvq_rem_bits_in_period = &ps_rbip->vq_rem_bits_in_period; + + SET_VAR_Q(vq_bits_per_frm, i4_bits_per_frm, 0); + SET_VAR_Q(vq_tot_frms_in_gop, i4_tot_frms_in_gop, 0); + SET_VAR_Q(vq_num_intra_frm_interval, i4_num_intra_frm_interval, 0); + + /* rem_bits_in_period = bits_per_frm * tot_frms_in_gop */ + mult32_var_q(vq_bits_per_frm, vq_tot_frms_in_gop, + pvq_rem_bits_in_period); + + /* rem_bits_in_period *= num_intra_frm_interval */ + mult32_var_q(vq_num_intra_frm_interval, pvq_rem_bits_in_period[0], + pvq_rem_bits_in_period); + } + + /* + * Store the total number of frames in GOP value which is + * used from module A + */ + ps_rbip->i4_tot_frms_in_gop = i4_tot_frms_in_gop; + ps_rbip->i4_num_intra_frm_interval = i4_num_intra_frm_interval; + ps_rbip->i4_bits_per_frm = i4_bits_per_frm; +} + +static void check_update_rbip(rem_bit_in_prd_t *ps_rbip, + pic_handling_handle ps_pic_handling) +{ + /* + * NOTE: Intra frame interval changes after the first I frame that is + * encoded in a GOP + */ + WORD32 i4_new_tot_frms_in_gop = get_number_of_frms_in_a_gop( + ps_pic_handling); + + if(i4_new_tot_frms_in_gop != ps_rbip->i4_tot_frms_in_gop) + { + WORD32 i4_rem_frames_in_period = + ps_rbip->i4_num_intra_frm_interval + * (i4_new_tot_frms_in_gop + - ps_rbip->i4_tot_frms_in_gop); + + number_t vq_rem_frms_in_period, s_bits_per_frm, vq_delta_bits_in_period; + + SET_VAR_Q(vq_rem_frms_in_period, i4_rem_frames_in_period, 0); + SET_VAR_Q(s_bits_per_frm, ps_rbip->i4_bits_per_frm, 0); + + /* delta_bits_in_period = bits_per_frm * rem_frms_in_period */ + 
mult32_var_q(s_bits_per_frm, vq_rem_frms_in_period, + &vq_delta_bits_in_period); + + /* rem_bits_in_period += delta_bits_in_period */ + add32_var_q(vq_delta_bits_in_period, ps_rbip->vq_rem_bits_in_period, + &ps_rbip->vq_rem_bits_in_period); + } + /* Updated the new values */ + ps_rbip->i4_tot_frms_in_gop = i4_new_tot_frms_in_gop; +} + +static void irc_ba_update_rbip(rem_bit_in_prd_t *ps_rbip, + pic_handling_handle ps_pic_handling, + WORD32 i4_num_of_bits) +{ + number_t vq_num_bits; + + check_update_rbip(ps_rbip, ps_pic_handling); + + /* rem_bits_in_period += num_of_bits */ + SET_VAR_Q(vq_num_bits, i4_num_of_bits, 0); + add32_var_q(vq_num_bits, ps_rbip->vq_rem_bits_in_period, + &ps_rbip->vq_rem_bits_in_period); +} + +static void irc_ba_change_rbip(rem_bit_in_prd_t *ps_rbip, + pic_handling_handle ps_pic_handling, + WORD32 i4_new_bits_per_frm, + WORD32 i4_new_num_intra_frm_interval) +{ + WORD32 ai4_rem_frms_in_period[MAX_PIC_TYPE], i4_rem_frms_in_gop, i; + irc_pic_type_get_rem_frms_in_gop(ps_pic_handling, ai4_rem_frms_in_period); + + i4_rem_frms_in_gop = 0; + for(i = 0; i < MAX_PIC_TYPE; i++) + i4_rem_frms_in_gop += ai4_rem_frms_in_period[i]; + + if(i4_new_bits_per_frm != ps_rbip->i4_bits_per_frm) + { + WORD32 i4_rem_frms_in_period = (ps_rbip->i4_num_intra_frm_interval - 1) + * ps_rbip->i4_tot_frms_in_gop + i4_rem_frms_in_gop; + + number_t vq_rem_frms_in_period, vq_delta_bits_per_frm, + vq_delta_bits_in_period; + + /* delta_bits_per_frm = new_bits_per_frm - old_bits_per_frm */ + SET_VAR_Q(vq_delta_bits_per_frm, + (i4_new_bits_per_frm - ps_rbip->i4_bits_per_frm), 0); + + SET_VAR_Q(vq_rem_frms_in_period, i4_rem_frms_in_period, 0); + + /* delta_bits_in_period = delta_bits_per_frm * rem_frms_in_period */ + mult32_var_q(vq_delta_bits_per_frm, vq_rem_frms_in_period, + &vq_delta_bits_in_period); + + /* ps_rbip->rem_bits_in_period += delta_bits_in_period */ + add32_var_q(vq_delta_bits_in_period, ps_rbip->vq_rem_bits_in_period, + &ps_rbip->vq_rem_bits_in_period); + } + + 
if(i4_new_num_intra_frm_interval != ps_rbip->i4_num_intra_frm_interval) + { + WORD32 i4_rem_frms_in_period = ps_rbip->i4_tot_frms_in_gop + * (i4_new_num_intra_frm_interval + - ps_rbip->i4_num_intra_frm_interval); + + number_t vq_rem_frms_in_period, vq_new_bits_per_frm, + vq_delta_bits_in_period; + + /* new_bits_per_frm = new_new_bits_per_frm - old_new_bits_per_frm */ + SET_VAR_Q(vq_new_bits_per_frm, i4_new_bits_per_frm, 0); + + SET_VAR_Q(vq_rem_frms_in_period, i4_rem_frms_in_period, 0); + + /* delta_bits_in_period = new_bits_per_frm * rem_frms_in_period */ + mult32_var_q(vq_new_bits_per_frm, vq_rem_frms_in_period, + &vq_delta_bits_in_period); + + /* ps_rbip->rem_bits_in_period += delta_bits_in_period */ + add32_var_q(vq_delta_bits_in_period, ps_rbip->vq_rem_bits_in_period, + &ps_rbip->vq_rem_bits_in_period); + } + /* Update the new value */ + ps_rbip->i4_num_intra_frm_interval = i4_new_num_intra_frm_interval; + ps_rbip->i4_bits_per_frm = i4_new_bits_per_frm; +} + +WORD32 irc_ba_num_fill_use_free_memtab(bit_allocation_t **pps_bit_allocation, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type) +{ + WORD32 i4_mem_tab_idx = 0; + static bit_allocation_t s_bit_allocation_temp; + + /* + * Hack for all alloc, during which we don't have any state memory. 
+ * Dereferencing can cause issues + */ + if(e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB) + (*pps_bit_allocation) = &s_bit_allocation_temp; + + /*for src rate control state structure*/ + if(e_func_type != GET_NUM_MEMTAB) + { + fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(bit_allocation_t), + ALIGN_128_BYTE, PERSISTENT, DDR); + use_or_fill_base(&ps_memtab[0], (void**)pps_bit_allocation, + e_func_type); + } + i4_mem_tab_idx++; + + i4_mem_tab_idx += irc_error_bits_num_fill_use_free_memtab( + &pps_bit_allocation[0]->ps_error_bits, + &ps_memtab[i4_mem_tab_idx], e_func_type); + + return (i4_mem_tab_idx); +} + +/******************************************************************************* + Function Name : irc_ba_init_bit_allocation + Description : Initialize the bit_allocation structure. + ******************************************************************************/ +void irc_ba_init_bit_allocation(bit_allocation_t *ps_bit_allocation, + pic_handling_handle ps_pic_handling, + WORD32 i4_num_intra_frm_interval, + WORD32 i4_bit_rate, + WORD32 i4_frm_rate, + WORD32 *i4_peak_bit_rate, + WORD32 i4_min_bitrate) +{ + WORD32 i; + WORD32 i4_bits_per_frm, i4_max_bits_per_frm[MAX_NUM_DRAIN_RATES]; + + /* Calculate the bits per frame */ + X_PROD_Y_DIV_Z(i4_bit_rate, 1000, i4_frm_rate, i4_bits_per_frm); + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + X_PROD_Y_DIV_Z(i4_peak_bit_rate[i], 1000, i4_frm_rate, + i4_max_bits_per_frm[i]); + } + /* Initialize the bits_per_frame */ + ps_bit_allocation->i4_bits_per_frm = i4_bits_per_frm; + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + ps_bit_allocation->i4_max_bits_per_frm[i] = i4_max_bits_per_frm[i]; + } + X_PROD_Y_DIV_Z(i4_min_bitrate, 1000, i4_frm_rate, + ps_bit_allocation->i4_min_bits_per_frm); + + /* + * Initialize the rem_bits in period + * The first gop in case of an OPEN GOP may have fewer B_PICs, + * That condition is not taken care of + */ + init_rbip(&ps_bit_allocation->s_rbip, ps_pic_handling, i4_bits_per_frm, 
+ i4_num_intra_frm_interval); + + /* Initialize the num_gops_in_period */ + ps_bit_allocation->i4_num_gops_in_period = i4_num_intra_frm_interval; + ps_bit_allocation->i4_actual_num_gops_in_period = i4_num_intra_frm_interval; + + /* Relative complexity between I and P frames */ + ps_bit_allocation->i2_K[I_PIC] = (1 << K_Q); + ps_bit_allocation->i2_K[P_PIC] = I_TO_P_RATIO; + ps_bit_allocation->i2_K[B_PIC] = (P_TO_B_RATIO * I_TO_P_RATIO) >> K_Q; + + /* Initialize the saved bits to 0*/ + SET_VAR_Q(ps_bit_allocation->vq_saved_bits, 0, 0); + + /* Update the error bits module with average bits */ + irc_init_error_bits(ps_bit_allocation->ps_error_bits, i4_frm_rate, + i4_bit_rate); + /* Store the input for implementing change in values */ + ps_bit_allocation->i4_frame_rate = i4_frm_rate; + ps_bit_allocation->i4_bit_rate = i4_bit_rate; + + memset(ps_bit_allocation->i4_prev_frm_header_bits, 0, sizeof(ps_bit_allocation->i4_prev_frm_header_bits)); + for(i=0;i<MAX_NUM_DRAIN_RATES;i++) + ps_bit_allocation->ai4_peak_bit_rate[i] = i4_peak_bit_rate[i]; +} + +/******************************************************************************* + Function Name : get_cur_frm_est_bits + Description : Based on remaining bits in period and rd_model + the number of bits required for the current frame is estimated. 
+ ******************************************************************************/ +WORD32 irc_ba_get_cur_frm_est_texture_bits(bit_allocation_t *ps_bit_allocation, + rc_rd_model_handle *pps_rd_model, + est_sad_handle ps_est_sad, + pic_handling_handle ps_pic_handling, + picture_type_e e_pic_type) +{ + WORD32 i, j; + WORD32 i4_est_texture_bits_for_frm; + number_t vq_rem_texture_bits; + number_t vq_complexity_estimate[MAX_PIC_TYPE]; + WORD32 i4_rem_frms_in_period[MAX_PIC_TYPE], i4_frms_in_period[MAX_PIC_TYPE]; + number_t vq_max_consumable_bits; + number_t vq_rem_frms_in_period[MAX_PIC_TYPE], vq_est_texture_bits_for_frm; + number_t vq_prev_hdr_bits[MAX_PIC_TYPE]; + + WORD32 complexity_est = 0; + + /* Get the rem_frms_in_gop & the frms_in_gop from the pic_type state struct */ + irc_pic_type_get_rem_frms_in_gop(ps_pic_handling, i4_rem_frms_in_period); + irc_pic_type_get_frms_in_gop(ps_pic_handling, i4_frms_in_period); + + /* Depending on the number of gops in a period, find the num_frms_in_prd */ + for(j = 0; j < MAX_PIC_TYPE; j++) + { + i4_rem_frms_in_period[j] += (i4_frms_in_period[j] + * (ps_bit_allocation->i4_num_gops_in_period - 1)); + i4_frms_in_period[j] *= ps_bit_allocation->i4_num_gops_in_period; + } + + /* Remove the header bits from the remaining bits to find how many bits you + can transfer.*/ + irc_ba_update_rbip(&ps_bit_allocation->s_rbip, ps_pic_handling, 0); + for(i = 0; i < MAX_PIC_TYPE; i++) + { + SET_VAR_Q(vq_rem_frms_in_period[i], i4_rem_frms_in_period[i], 0); + SET_VAR_Q(vq_prev_hdr_bits[i], + ps_bit_allocation->i4_prev_frm_header_bits[i], 0); + } + { + /* + *rem_texture_bits = rem_bits_in_period - + *(rem_frms_in_period[I_PIC] * prev_frm_header_bits[I_PIC]) - + *(rem_frms_in_period[P_PIC] * prev_frm_header_bits[P_PIC]) - + *(rem_frms_in_period[B_PIC] * prev_frm_header_bits[B_PIC]); + */ + number_t vq_rem_hdr_bits; + vq_rem_texture_bits = ps_bit_allocation->s_rbip.vq_rem_bits_in_period; + + mult32_var_q(vq_prev_hdr_bits[I_PIC], 
vq_rem_frms_in_period[I_PIC], + &vq_rem_hdr_bits); + sub32_var_q(vq_rem_texture_bits, vq_rem_hdr_bits, &vq_rem_texture_bits); + + mult32_var_q(vq_prev_hdr_bits[P_PIC], vq_rem_frms_in_period[P_PIC], + &vq_rem_hdr_bits); + sub32_var_q(vq_rem_texture_bits, vq_rem_hdr_bits, &vq_rem_texture_bits); + + mult32_var_q(vq_prev_hdr_bits[B_PIC], vq_rem_frms_in_period[B_PIC], + &vq_rem_hdr_bits); + sub32_var_q(vq_rem_texture_bits, vq_rem_hdr_bits, &vq_rem_texture_bits); + } + { + /* max_consumable_bits = + *(frms_in_period[I_PIC] * max_bits_per_frm[0] ) + + *(frms_in_period[P_PIC] + frms_in_period[B_PIC] ) * max_bits_per_frm[1]; + */ + number_t vq_max_bits, vq_max_bits_per_frm[2]; + + SET_VAR_Q(vq_max_bits_per_frm[0], + ps_bit_allocation->i4_max_bits_per_frm[0], 0); + SET_VAR_Q(vq_max_bits_per_frm[1], + ps_bit_allocation->i4_max_bits_per_frm[1], 0); + + mult32_var_q(vq_rem_frms_in_period[I_PIC], vq_max_bits_per_frm[0], + &vq_max_bits); + vq_max_consumable_bits = vq_max_bits; + + mult32_var_q(vq_rem_frms_in_period[P_PIC], vq_max_bits_per_frm[1], + &vq_max_bits); + add32_var_q(vq_max_bits, vq_max_consumable_bits, + &vq_max_consumable_bits); + + mult32_var_q(vq_rem_frms_in_period[B_PIC], vq_max_bits_per_frm[1], + &vq_max_bits); + add32_var_q(vq_max_bits, vq_max_consumable_bits, + &vq_max_consumable_bits); + } + + /* rem_texture_bits = MIN(rem_texture_bits, max_consumable_bits) */ + MIN_VARQ(vq_max_consumable_bits, vq_rem_texture_bits, vq_rem_texture_bits); + + /* The bits are then allocated based on the relative complexity of the + current frame with respect to that of the rest of the frames in period */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + number_t vq_lin_mod_coeff, vq_est_sad, vq_K; + + /* Getting the linear model coefficient */ + vq_lin_mod_coeff = irc_get_linear_coefficient(pps_rd_model[i]); + + /* Getting the estimated SAD */ + SET_VAR_Q(vq_est_sad, irc_get_est_sad(ps_est_sad,i), 0); + + /* Making K factor a var Q format */ + SET_VAR_Q(vq_K, ps_bit_allocation->i2_K[i], 
K_Q); + + /* Complexity_estimate = [ (lin_mod_coeff * estimated_sad) / K factor ] */ + mult32_var_q(vq_lin_mod_coeff, vq_est_sad, &vq_lin_mod_coeff); + div32_var_q(vq_lin_mod_coeff, vq_K, &vq_complexity_estimate[i]); + } + + /* + * For simple cases, one of the complexities go to zero and in those cases + * distribute the bits evenly among frames based on I_TO_P_RATIO + */ + + /* Also check the B-pictures complexity only in case they are present*/ + if(i4_frms_in_period[B_PIC] == 0) + { + complexity_est = (vq_complexity_estimate[I_PIC] + && vq_complexity_estimate[P_PIC]); + } + else + { + complexity_est = (vq_complexity_estimate[I_PIC] + && vq_complexity_estimate[P_PIC] + && vq_complexity_estimate[B_PIC]); + } + + if(complexity_est) + { + /* + * Estimated texture bits = + * (remaining bits) * (cur frm complexity) + * --------------------------------------- + * (num_i_frm*i_frm_complexity) + (num_p_frm*pfrm_complexity) + * + (b_frm * b_frm_cm) + */ + mult32_var_q(vq_rem_texture_bits, vq_complexity_estimate[e_pic_type], + &vq_rem_texture_bits); + + for(i = 0; i < MAX_PIC_TYPE; i++) + { + mult32_var_q(vq_rem_frms_in_period[i], vq_complexity_estimate[i], + &vq_rem_frms_in_period[i]); + } + + add32_var_q(vq_rem_frms_in_period[I_PIC], vq_rem_frms_in_period[P_PIC], + &vq_rem_frms_in_period[I_PIC]); + + add32_var_q(vq_rem_frms_in_period[I_PIC], vq_rem_frms_in_period[B_PIC], + &vq_rem_frms_in_period[I_PIC]); + + div32_var_q(vq_rem_texture_bits, vq_rem_frms_in_period[I_PIC], + &vq_est_texture_bits_for_frm); + + number_t_to_word32(vq_est_texture_bits_for_frm, + &i4_est_texture_bits_for_frm); + } + else + { + number_t vq_i_to_p_bit_ratio, vq_rem_frms; + + SET_VAR_Q(vq_i_to_p_bit_ratio, I_TO_P_BIT_RATIO, 0); + + /* rem_frms = ((I_TO_P_BIT_RATIO * rem_frms_in_period[I_PIC]) + + * rem_frms_in_period[P_PIC] + rem_frms_in_period[B_PIC]); + */ + mult32_var_q(vq_rem_frms_in_period[I_PIC], vq_i_to_p_bit_ratio, + &vq_rem_frms); + add32_var_q(vq_rem_frms_in_period[P_PIC], vq_rem_frms, 
&vq_rem_frms); + add32_var_q(vq_rem_frms_in_period[B_PIC], vq_rem_frms, &vq_rem_frms); + + /* est_texture_bits_for_frm = rem_texture_bits / rem_frms */ + div32_var_q(vq_rem_texture_bits, vq_rem_frms, + &vq_est_texture_bits_for_frm); + number_t_to_word32(vq_est_texture_bits_for_frm, + &i4_est_texture_bits_for_frm); + + i4_est_texture_bits_for_frm = + (I_PIC == e_pic_type) ? + (i4_est_texture_bits_for_frm + * I_TO_P_BIT_RATIO) : + i4_est_texture_bits_for_frm; + } + + /* + * If the remaining bits in the period becomes negative then the estimated + * texture bits would also become negative. This would send a feedback to + * the model which may go for a toss. Thus sending the minimum possible + * value = 0 + */ + if(i4_est_texture_bits_for_frm < 0) + { + i4_est_texture_bits_for_frm = 0; + } + + return (i4_est_texture_bits_for_frm); +} + +/****************************************************************************** + Function Name : irc_ba_get_cur_frm_est_header_bits + Description : Based on remaining bits in period and rd_model + the number of bits required for the current frame is estimated. + ******************************************************************************/ +WORD32 irc_ba_get_cur_frm_est_header_bits(bit_allocation_t *ps_bit_allocation, + picture_type_e e_pic_type) +{ + return (ps_bit_allocation->i4_prev_frm_header_bits[e_pic_type]); +} + +WORD32 irc_ba_get_rem_bits_in_period(bit_allocation_t *ps_bit_allocation, + pic_handling_handle ps_pic_handling) +{ + WORD32 i4_rem_bits_in_gop = 0; + irc_ba_update_rbip(&ps_bit_allocation->s_rbip, ps_pic_handling, 0); + number_t_to_word32(ps_bit_allocation->s_rbip.vq_rem_bits_in_period, + &i4_rem_bits_in_gop); + return (i4_rem_bits_in_gop); +} + +/******************************************************************************* + Function Name : irc_ba_update_cur_frm_consumed_bits + Description : Based on remaining bits in period and rd_model + the number of bits required for the current frame is estimated. 
+ ******************************************************************************/ +void irc_ba_update_cur_frm_consumed_bits(bit_allocation_t *ps_bit_allocation, + pic_handling_handle ps_pic_handling, + WORD32 i4_total_frame_bits, + WORD32 i4_model_updation_hdr_bits, + picture_type_e e_pic_type, + UWORD8 u1_is_scd, + WORD32 i4_last_frm_in_gop) +{ + WORD32 i4_error_bits = irc_get_error_bits(ps_bit_allocation->ps_error_bits); + + /* Update the remaining bits in period */ + irc_ba_update_rbip(&ps_bit_allocation->s_rbip, ps_pic_handling, + (-i4_total_frame_bits + i4_error_bits)); + + /* + * Update the header bits so that it can be used as an estimate to the next + * frame + */ + if(u1_is_scd) + { + /* + * In case of SCD, even though the frame type is P, it is equivalent to + * a I frame and so the corresponding header bits is updated + */ + ps_bit_allocation->i4_prev_frm_header_bits[I_PIC] = + i4_model_updation_hdr_bits; + +#define MAX_NUM_GOPS_IN_PERIOD (3) + if(ps_bit_allocation->i4_num_gops_in_period < MAX_NUM_GOPS_IN_PERIOD) + { + /* + * Whenever there is a scene change increase the number of gops by + * 2 so that the number of bits allocated is not very constrained + */ + ps_bit_allocation->i4_num_gops_in_period += 2; + /* Add the extra bits in GOP to remaining bits in period */ + irc_ba_change_rbip(&ps_bit_allocation->s_rbip, ps_pic_handling, + ps_bit_allocation->i4_bits_per_frm, + ps_bit_allocation->i4_num_gops_in_period); + } + } + else + { + ps_bit_allocation->i4_prev_frm_header_bits[e_pic_type] = + i4_model_updation_hdr_bits; + } + + if(i4_last_frm_in_gop) + { + WORD32 i4_num_bits_in_a_gop = get_number_of_frms_in_a_gop( + ps_pic_handling) * ps_bit_allocation->i4_bits_per_frm; + /* + * If the number of gops in period has been increased due to scene + * change, slowly bring in down across the gops + */ + if(ps_bit_allocation->i4_num_gops_in_period + > ps_bit_allocation->i4_actual_num_gops_in_period) + { + ps_bit_allocation->i4_num_gops_in_period--; + 
irc_ba_change_rbip(&ps_bit_allocation->s_rbip, ps_pic_handling, + ps_bit_allocation->i4_bits_per_frm, + ps_bit_allocation->i4_num_gops_in_period); + } + /* + * If rem_bits_in_period < 0 decrease the number of bits allocated for + * the next period else increase it + */ + irc_ba_update_rbip(&ps_bit_allocation->s_rbip, ps_pic_handling, + i4_num_bits_in_a_gop); + } + /* Update the lower modules */ + irc_update_error_bits(ps_bit_allocation->ps_error_bits); +} + +void irc_ba_change_remaining_bits_in_period(bit_allocation_t *ps_bit_allocation, + pic_handling_handle ps_pic_handling, + WORD32 i4_bit_rate, + WORD32 i4_frame_rate, + WORD32 *i4_peak_bit_rate) +{ + WORD32 i4_new_avg_bits_per_frm; + WORD32 i4_new_peak_bits_per_frm[MAX_NUM_DRAIN_RATES]; + WORD32 i4_rem_frms_in_period[MAX_PIC_TYPE]; + int i; + + /* Calculate the new per frame bits */ + X_PROD_Y_DIV_Z(i4_bit_rate, 1000, i4_frame_rate, i4_new_avg_bits_per_frm); + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + X_PROD_Y_DIV_Z(i4_peak_bit_rate[i], 1000, i4_frame_rate, + i4_new_peak_bits_per_frm[i]); + } + + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + ps_bit_allocation->i4_max_bits_per_frm[i] = i4_new_peak_bits_per_frm[i]; + } + + /* + * Get the rem_frms_in_prd & the frms_in_prd from the pic_type state + * struct + */ + irc_pic_type_get_rem_frms_in_gop(ps_pic_handling, i4_rem_frms_in_period); + + /* + * If the difference > 0(/ <0), the remaining bits in period needs to be + * increased(/decreased) based on the remaining number of frames + */ + irc_ba_change_rbip(&ps_bit_allocation->s_rbip, ps_pic_handling, + i4_new_avg_bits_per_frm, + ps_bit_allocation->i4_num_gops_in_period); + + /* Update the new average bits per frame */ + ps_bit_allocation->i4_bits_per_frm = i4_new_avg_bits_per_frm; + /* change the lower modules state */ + irc_change_bitrate_in_error_bits(ps_bit_allocation->ps_error_bits, + i4_bit_rate); + irc_change_frm_rate_in_error_bits(ps_bit_allocation->ps_error_bits, + i4_frame_rate); + + /* Store the 
modified frame_rate */ + ps_bit_allocation->i4_frame_rate = i4_frame_rate; + ps_bit_allocation->i4_bit_rate = i4_bit_rate; + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + ps_bit_allocation->ai4_peak_bit_rate[i] = i4_peak_bit_rate[i]; +} + +void irc_ba_change_ba_peak_bit_rate(bit_allocation_t *ps_bit_allocation, + WORD32 *ai4_peak_bit_rate) +{ + WORD32 i; + + /* Calculate the bits per frame */ + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + X_PROD_Y_DIV_Z(ai4_peak_bit_rate[i], 1000, + ps_bit_allocation->i4_frame_rate, + ps_bit_allocation->i4_max_bits_per_frm[i]); + ps_bit_allocation->ai4_peak_bit_rate[i] = ai4_peak_bit_rate[i]; + } +} + +/****************************************************************************** + * @brief Modifies the remaining bit in period for the gop which has fif. + * since fif would cause a new gop to be created, we need to add the number + * of encoded frames in the fif GOP worth of bits to remaining bits in + * period + ******************************************************************************/ +void irc_ba_change_rem_bits_in_prd_at_force_I_frame(bit_allocation_t *ps_bit_allocation, + pic_handling_handle ps_pic_handling) +{ + WORD32 i4_frms_in_period; + i4_frms_in_period = irc_pic_type_get_frms_in_gop_force_I_frm( + ps_pic_handling); + irc_ba_update_rbip(&ps_bit_allocation->s_rbip, ps_pic_handling, + ps_bit_allocation->i4_bits_per_frm * i4_frms_in_period); +} + +void irc_ba_check_and_update_bit_allocation(bit_allocation_t *ps_bit_allocation, + pic_handling_handle ps_pic_handling, + WORD32 i4_cur_buf_size, + WORD32 i4_max_buf_size, + WORD32 i4_max_bits_inflow_per_frm, + WORD32 i4_tot_frame_bits) +{ + + number_t vq_max_drain_bits, vq_extra_bits, vq_less_bits, + vq_allocated_saved_bits, vq_min_bits_for_period; + WORD32 i4_num_frms_in_period = get_number_of_frms_in_a_gop(ps_pic_handling); + number_t vq_rem_bits_in_period, vq_num_frms_in_period, vq_zero; + WORD32 b_rem_bits_gt_max_drain, b_rem_bits_lt_min_bits, + b_saved_bits_gt_zero; + 
rem_bit_in_prd_t *ps_rbip = &ps_bit_allocation->s_rbip; + + UNUSED(i4_cur_buf_size); + UNUSED(i4_max_buf_size); + UNUSED(i4_tot_frame_bits); + + /* + * If the remaining bits is greater than what can be drained in that period + * Clip the remaining bits in period to the maximum it can drain in that + * period with the error of current buffer size.Accumulate the saved bits + * if any. else if the remaining bits is lesser than the minimum bit rate + * promised in that period Add the excess bits to remaining bits in period + * and reduce it from the saved bits Else Provide the extra bits from the + * "saved bits pool". + */ + /* + * max_drain_bits = num_gops_in_period * num_frms_in_period * + * * max_bits_inflow_per_frm + */ + SET_VAR_Q(vq_num_frms_in_period, + (ps_bit_allocation->i4_num_gops_in_period * i4_num_frms_in_period), + 0); + SET_VAR_Q(vq_max_drain_bits, i4_max_bits_inflow_per_frm, 0); + SET_VAR_Q(vq_zero, 0, 0); + mult32_var_q(vq_max_drain_bits, vq_num_frms_in_period, &vq_max_drain_bits); + + /* + * min_bits_for_period = num_gops_in_period * num_frms_in_period * + * min_bits_per_frm + */ + SET_VAR_Q(vq_min_bits_for_period, ps_bit_allocation->i4_min_bits_per_frm, + 0); + mult32_var_q(vq_min_bits_for_period, vq_num_frms_in_period, + &vq_min_bits_for_period); + + vq_rem_bits_in_period = ps_rbip->vq_rem_bits_in_period; + + /* Evaluate rem_bits_in_period > max_drain_bits */ + VQ_A_GT_VQ_B(ps_rbip->vq_rem_bits_in_period, vq_max_drain_bits, + b_rem_bits_gt_max_drain); + + /* Evaluate rem_bits_in_period < min_bits_for_period */ + VQ_A_LT_VQ_B(ps_rbip->vq_rem_bits_in_period, vq_min_bits_for_period, + b_rem_bits_lt_min_bits); + + /* Evaluate saved_bits > 0 */ + VQ_A_LT_VQ_B(ps_bit_allocation->vq_saved_bits, vq_zero, + b_saved_bits_gt_zero); + + /* (i4_rem_bits_in_period > i4_max_drain_bits) */ + if(b_rem_bits_gt_max_drain) + { + /* extra_bits = rem_bits_in_period - max_drain_bits */ + sub32_var_q(ps_rbip->vq_rem_bits_in_period, vq_max_drain_bits, + &vq_extra_bits); + 
+ /* saved_bits += extra_bits */ + add32_var_q(ps_bit_allocation->vq_saved_bits, vq_extra_bits, + &ps_bit_allocation->vq_saved_bits); + + /* rem_bits_in_period = vq_max_drain_bits */ + ps_rbip->vq_rem_bits_in_period = vq_max_drain_bits; + } + else if(b_rem_bits_lt_min_bits) + { + /* extra_bits(-ve) = rem_bits_in_period - i4_min_bits_for_period */ + sub32_var_q(ps_rbip->vq_rem_bits_in_period, vq_min_bits_for_period, + &vq_extra_bits); + + /* saved_bits += extra_bits(-ve) */ + add32_var_q(ps_bit_allocation->vq_saved_bits, vq_extra_bits, + &ps_bit_allocation->vq_saved_bits); + + /* rem_bits_in_period = min_bits_for_period */ + ps_rbip->vq_rem_bits_in_period = vq_min_bits_for_period; + } + else if(b_saved_bits_gt_zero) + { + /* less_bits = max_drain_bits - _rem_bits_in_period */ + sub32_var_q(vq_max_drain_bits, vq_rem_bits_in_period, &vq_less_bits); + + /* allocated_saved_bits = MIN (less_bits, saved_bits) */ + MIN_VARQ(ps_bit_allocation->vq_saved_bits, vq_less_bits, + vq_allocated_saved_bits); + + /* rem_bits_in_period += allocted_save_bits */ + add32_var_q(ps_rbip->vq_rem_bits_in_period, vq_allocated_saved_bits, + &ps_rbip->vq_rem_bits_in_period); + + /* saved_bits -= allocted_save_bits */ + sub32_var_q(ps_bit_allocation->vq_saved_bits, vq_allocated_saved_bits, + &ps_bit_allocation->vq_saved_bits); + } + return; +} + +WORD32 irc_ba_get_frame_rate(bit_allocation_t *ps_bit_allocation) +{ + return (ps_bit_allocation->i4_frame_rate); +} + +WORD32 irc_ba_get_bit_rate(bit_allocation_t *ps_bit_allocation) +{ + return (ps_bit_allocation->i4_bit_rate); +} + +void irc_ba_get_peak_bit_rate(bit_allocation_t *ps_bit_allocation, + WORD32 *pi4_peak_bit_rate) +{ + WORD32 i; + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + pi4_peak_bit_rate[i] = ps_bit_allocation->ai4_peak_bit_rate[i]; + } +} diff --git a/encoder/irc_bit_allocation.h b/encoder/irc_bit_allocation.h new file mode 100755 index 0000000..19ba0df --- /dev/null +++ b/encoder/irc_bit_allocation.h @@ -0,0 +1,99 @@ 
+/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +#ifndef _BIT_ALLOCATION_H_ +#define _BIT_ALLOCATION_H_ + +typedef struct bit_allocation_t *bit_allocation_handle; + +WORD32 irc_ba_num_fill_use_free_memtab(bit_allocation_handle *pps_bit_allocation, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type); + +void irc_ba_init_bit_allocation(bit_allocation_handle ps_bit_allocation, + pic_handling_handle ps_pic_handling, + WORD32 i4_num_intra_frm_interval, + WORD32 i4_bit_rate, + WORD32 i4_frm_rate, + WORD32 *u4_peak_bit_rate, + WORD32 i4_min_bitrate); + +/* Estimates the number of texture bits required by the current frame */ +WORD32 irc_ba_get_cur_frm_est_texture_bits(bit_allocation_handle ps_bit_allocation, + rc_rd_model_handle *pps_rd_model, + est_sad_handle ps_est_sad, + pic_handling_handle ps_pic_handling, + picture_type_e e_pic_type); + +/* Estimate the number of header bits required by the current frame */ +WORD32 irc_ba_get_cur_frm_est_header_bits(bit_allocation_handle ps_bit_allocation, + picture_type_e e_pic_type); + +/* Get the remaining bits allocated in the period */ +WORD32 irc_ba_get_rem_bits_in_period(bit_allocation_handle 
ps_bit_allocation, + pic_handling_handle ps_pic_handling); + +WORD32 irc_ba_get_frame_rate(bit_allocation_handle ps_bit_allocation); + +WORD32 irc_ba_get_bit_rate(bit_allocation_handle ps_bit_allocation); +void irc_ba_get_peak_bit_rate(bit_allocation_handle ps_bit_allocation, + WORD32 *pi4_peak_bit_rate); + +/* Updates the bit allocation module with the actual encoded values */ +void irc_ba_update_cur_frm_consumed_bits(bit_allocation_handle ps_bit_allocation, + pic_handling_handle ps_pic_handling, + WORD32 i4_total_frame_bits, + WORD32 i4_model_updation_hdr_bits, + picture_type_e e_pic_type, + UWORD8 u1_is_scd, + WORD32 i4_last_frm_in_gop); + +void irc_ba_check_and_update_bit_allocation(bit_allocation_handle ps_bit_allocation, + pic_handling_handle ps_pic_handling, + WORD32 i4_cur_buf_size, + WORD32 i4_max_buf_size, + WORD32 i4_max_bits_inflow_per_frm, + WORD32 i4_tot_frame_bits); + +/* Based on the change in frame/bit rate update the remaining bits in period */ +void irc_ba_change_remaining_bits_in_period(bit_allocation_handle ps_bit_allocation, + pic_handling_handle ps_pic_handling, + WORD32 i4_bit_rate, + WORD32 i4_frame_rate, + WORD32 *i4_peak_bit_rate); + +/* Change the gop size in the middle of a current gop */ +void change_gop_size(bit_allocation_handle ps_bit_allocation, + WORD32 i4_intra_frm_interval, + WORD32 i4_inter_frm_interval, + WORD32 i4_num_intra_frm_interval); + +void update_rem_frms_in_period(bit_allocation_handle ps_bit_allocation, + picture_type_e e_pic_type, + UWORD8 u1_is_first_frm, + WORD32 i4_intra_frm_interval, + WORD32 i4_num_intra_frm_interval); + +void irc_ba_change_rem_bits_in_prd_at_force_I_frame(bit_allocation_handle ps_bit_allocation, + pic_handling_handle ps_pic_handling); + +void irc_ba_change_ba_peak_bit_rate(bit_allocation_handle ps_bit_allocation, + WORD32 *ai4_peak_bit_rate); +#endif diff --git a/encoder/irc_cbr_buffer_control.c b/encoder/irc_cbr_buffer_control.c new file mode 100755 index 0000000..c179a28 --- /dev/null +++ 
b/encoder/irc_cbr_buffer_control.c @@ -0,0 +1,653 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> + +/* User include files */ +#include "irc_datatypes.h" +#include "irc_cntrl_param.h" +#include "irc_common.h" +#include "irc_mem_req_and_acq.h" +#include "irc_fixed_point_error_bits.h" +#include "irc_cbr_buffer_control.h" +#include "irc_trace_support.h" + +typedef struct cbr_buffer_t +{ + /* Buffer size = Delay * Bitrate*/ + WORD32 i4_buffer_size; + + /* Constant drain rate */ + WORD32 i4_drain_bits_per_frame[MAX_NUM_DRAIN_RATES]; + + /* Encoder Buffer Fullness */ + WORD32 i4_ebf; + + /* Upper threshold of the Buffer */ + WORD32 i4_upr_thr[MAX_PIC_TYPE]; + + /* Lower threshold of the Buffer */ + WORD32 i4_low_thr[MAX_PIC_TYPE]; + + /* Stuffing threshold equal to error bits per second in the drain bits + * fixed point computation */ + WORD32 i4_stuffing_threshold; + + /* For error due to bits per frame calculation */ + 
error_bits_handle aps_bpf_error_bits[MAX_NUM_DRAIN_RATES]; + + /* Whether the buffer model is used for CBR or VBR streaming */ + WORD32 i4_is_cbr_mode; + + /* Input parameters stored for initialization */ + WORD32 ai4_bit_rate[MAX_NUM_DRAIN_RATES]; + + WORD32 i4_max_delay; + + WORD32 ai4_num_pics_in_delay_period[MAX_PIC_TYPE]; + + WORD32 i4_tgt_frm_rate; + + UWORD32 u4_max_vbv_buf_size; + +} cbr_buffer_t; + +WORD32 irc_cbr_buffer_num_fill_use_free_memtab(cbr_buffer_t **pps_cbr_buffer, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type) +{ + WORD32 i4_mem_tab_idx = 0, i; + static cbr_buffer_t s_cbr_buffer_temp; + + /* + * Hack for all alloc, during which we don't have any state memory. + * Dereferencing can cause issues + */ + if(e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB) + (*pps_cbr_buffer) = &s_cbr_buffer_temp; + + if(e_func_type != GET_NUM_MEMTAB) + { + fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(cbr_buffer_t), + ALIGN_128_BYTE, PERSISTENT, DDR); + use_or_fill_base(&ps_memtab[0], (void**)pps_cbr_buffer, e_func_type); + } + i4_mem_tab_idx++; + + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + i4_mem_tab_idx += irc_error_bits_num_fill_use_free_memtab( + &pps_cbr_buffer[0]->aps_bpf_error_bits[i], + &ps_memtab[i4_mem_tab_idx], e_func_type); + } + return (i4_mem_tab_idx); +} + +/****************************************************************************** + * @brief Initialize the CBR VBV buffer state. 
+ * This could however be used for VBR streaming VBV also + * + ******************************************************************************/ +void irc_init_cbr_buffer(cbr_buffer_t *ps_cbr_buffer, + WORD32 i4_buffer_delay, + WORD32 i4_tgt_frm_rate, + WORD32 *i4_bit_rate, + UWORD32 *u4_num_pics_in_delay_prd, + UWORD32 u4_vbv_buf_size) +{ + WORD32 i4_i, i4_bits_per_frm[MAX_NUM_DRAIN_RATES]; + int i; + + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + X_PROD_Y_DIV_Z(i4_bit_rate[i], 1000, i4_tgt_frm_rate, + i4_bits_per_frm[i]); + /* Drain rate = bitrate/(framerate/1000) */ + ps_cbr_buffer->i4_drain_bits_per_frame[i] = i4_bits_per_frm[i]; + /* Initialize the bits per frame error bits calculation */ + irc_init_error_bits(ps_cbr_buffer->aps_bpf_error_bits[i], + i4_tgt_frm_rate, i4_bit_rate[i]); + } + + /* Bitrate * delay = buffer size, divide by 1000 as delay is in ms*/ + /* This would mean CBR mode */ + if(i4_bit_rate[0] == i4_bit_rate[1]) + { + X_PROD_Y_DIV_Z(i4_bit_rate[0], i4_buffer_delay, 1000, + ps_cbr_buffer->i4_buffer_size); + ps_cbr_buffer->i4_is_cbr_mode = 1; + } + else + { + /* VBR streaming case which has different drain rates for I and P */ + ps_cbr_buffer->i4_buffer_size = u4_num_pics_in_delay_prd[0] + * ps_cbr_buffer->i4_drain_bits_per_frame[0] + + u4_num_pics_in_delay_prd[1] + * ps_cbr_buffer->i4_drain_bits_per_frame[1]; + + ps_cbr_buffer->i4_is_cbr_mode = 0; + } + + if(ps_cbr_buffer->i4_buffer_size > (WORD32)u4_vbv_buf_size) + { + ps_cbr_buffer->i4_buffer_size = u4_vbv_buf_size; + } + + /* Initially Encoder buffer fullness is zero */ + ps_cbr_buffer->i4_ebf = 0; + + /* tgt_frame_rate is divided by 1000 because, an approximate value is fine + * as this is just a threshold below which stuffing is done to avoid buffer + * underflow due to fixed point error in drain rate + */ + ps_cbr_buffer->i4_stuffing_threshold = (i4_bit_rate[0] + - (i4_bits_per_frm[0] * (i4_tgt_frm_rate / 1000))); + + for(i4_i = 0; i4_i < MAX_PIC_TYPE; i4_i++) + { + /* + * Upper 
threshold for + * I frame = 1 * bits per frame + * P Frame = 4 * bits per frame. + * The threshold for I frame is only 1 * bits per frame as the threshold + * should only account for error in estimated bits. + * In P frame it should account for difference bets bits consumed by + * I(Scene change) and P frame I to P complexity is assumed to be 5. + */ + WORD32 i4_index; + i4_index = i4_i > 0 ? 1 : 0; + ps_cbr_buffer->i4_upr_thr[i4_i] = ps_cbr_buffer->i4_buffer_size + - (ps_cbr_buffer->i4_buffer_size >> 3); + + /* + * For both I and P frame Lower threshold is equal to drain rate.Even if + * the encoder consumes zero bits it should have enough bits to drain + */ + ps_cbr_buffer->i4_low_thr[i4_i] = i4_bits_per_frm[i4_index]; + } + + /* Storing the input parameters for using it for change functions */ + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + ps_cbr_buffer->ai4_bit_rate[i] = i4_bit_rate[i]; + } + + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_cbr_buffer->ai4_num_pics_in_delay_period[i] = + u4_num_pics_in_delay_prd[i]; + } + ps_cbr_buffer->i4_tgt_frm_rate = i4_tgt_frm_rate; + ps_cbr_buffer->i4_max_delay = i4_buffer_delay; + ps_cbr_buffer->u4_max_vbv_buf_size = u4_vbv_buf_size; +} + +/****************************************************************************** + * @brief Condition check for constraining the number of bits allocated based on + * bufer size + ******************************************************************************/ +WORD32 irc_cbr_buffer_constraint_check(cbr_buffer_t *ps_cbr_buffer, + WORD32 i4_tgt_bits, + picture_type_e e_pic_type) +{ + WORD32 i4_max_tgt_bits, i4_min_tgt_bits; + WORD32 i4_drain_bits_per_frame = (e_pic_type == I_PIC) ? 
+ ps_cbr_buffer->i4_drain_bits_per_frame[0] : + ps_cbr_buffer->i4_drain_bits_per_frame[1]; + + /* Max tgt bits = Upper threshold - current encoder buffer fullness */ + i4_max_tgt_bits = ps_cbr_buffer->i4_upr_thr[e_pic_type] + - ps_cbr_buffer->i4_ebf; + /* Max tgt bits cannot be negative */ + if(i4_max_tgt_bits < 0) + i4_max_tgt_bits = 0; + + /* + * Min tgt bits , least number of bits in the Encoder after + * draining such that it is greater than lower threshold + */ + i4_min_tgt_bits = ps_cbr_buffer->i4_low_thr[e_pic_type] + - (ps_cbr_buffer->i4_ebf - i4_drain_bits_per_frame); + /* Min tgt bits cannot be negative */ + if(i4_min_tgt_bits < 0) + i4_min_tgt_bits = 0; + + /* Current tgt bits should be between max and min tgt bits */ + CLIP(i4_tgt_bits, i4_max_tgt_bits, i4_min_tgt_bits); + return i4_tgt_bits; +} + +/* ***************************************************************************** + * @brief constaints the bit allocation based on buffer size + * + ******************************************************************************/ +WORD32 irc_vbr_stream_buffer_constraint_check(cbr_buffer_t *ps_cbr_buffer, + WORD32 i4_tgt_bits, + picture_type_e e_pic_type) +{ + WORD32 i4_max_tgt_bits; + + /* Max tgt bits = Upper threshold - current encoder buffer fullness */ + i4_max_tgt_bits = ps_cbr_buffer->i4_upr_thr[e_pic_type] + - ps_cbr_buffer->i4_ebf; + + /* Max tgt bits cannot be negative */ + if(i4_max_tgt_bits < 0) + i4_max_tgt_bits = 0; + + if(i4_tgt_bits > i4_max_tgt_bits) + i4_tgt_bits = i4_max_tgt_bits; + + return i4_tgt_bits; +} + +/* ***************************************************************************** + * @brief Verifies the buffer state and returns whether it is overflowing, + * underflowing or normal + * + ******************************************************************************/ +vbv_buf_status_e irc_get_cbr_buffer_status(cbr_buffer_t *ps_cbr_buffer, + WORD32 i4_tot_consumed_bits, + WORD32 *pi4_num_bits_to_prevent_overflow, + picture_type_e 
e_pic_type) +{ + vbv_buf_status_e e_buf_status; + WORD32 i4_cur_enc_buf; + WORD32 i4_error_bits = (e_pic_type == I_PIC) ? + irc_get_error_bits(ps_cbr_buffer + ->aps_bpf_error_bits[0]) : + irc_get_error_bits(ps_cbr_buffer + ->aps_bpf_error_bits[1]); + + WORD32 i4_drain_bits_per_frame = (e_pic_type == I_PIC) ? + ps_cbr_buffer->i4_drain_bits_per_frame[0] : + ps_cbr_buffer->i4_drain_bits_per_frame[1]; + + /* Add the tot consumed bits to the Encoder Buffer*/ + i4_cur_enc_buf = ps_cbr_buffer->i4_ebf + i4_tot_consumed_bits; + + /* If the Encoder exceeds the Buffer Size signal an Overflow*/ + if(i4_cur_enc_buf > ps_cbr_buffer->i4_buffer_size) + { + e_buf_status = VBV_OVERFLOW; + i4_cur_enc_buf = ps_cbr_buffer->i4_buffer_size; + } + else + { + /* + * Subtract the constant drain bits and error bits due to fixed point + * implementation + */ + i4_cur_enc_buf -= (i4_drain_bits_per_frame + i4_error_bits); + + /* + * If the buffer is less than stuffing threshold an Underflow is + * signaled else its NORMAL + */ + if(i4_cur_enc_buf < ps_cbr_buffer->i4_stuffing_threshold) + { + e_buf_status = VBV_UNDERFLOW; + } + else + { + e_buf_status = VBV_NORMAL; + } + + if(i4_cur_enc_buf < 0) + i4_cur_enc_buf = 0; + } + + /* + * The RC lib models the encoder buffer, but the VBV buffer characterizes + * the decoder buffer + */ + if(e_buf_status == VBV_OVERFLOW) + { + e_buf_status = VBV_UNDERFLOW; + } + else if(e_buf_status == VBV_UNDERFLOW) + { + e_buf_status = VBV_OVERFLOW; + } + + pi4_num_bits_to_prevent_overflow[0] = (ps_cbr_buffer->i4_buffer_size + - i4_cur_enc_buf); + + return e_buf_status; +} + +/******************************************************************************* + * @brief Based on the bits consumed the buffer model is updated + ******************************************************************************/ +void irc_update_cbr_buffer(cbr_buffer_t *ps_cbr_buffer, + WORD32 i4_tot_consumed_bits, + picture_type_e e_pic_type) +{ + WORD32 i4_error_bits = (e_pic_type == I_PIC) ? 
+ irc_get_error_bits(ps_cbr_buffer-> + aps_bpf_error_bits[0]) : + irc_get_error_bits( ps_cbr_buffer-> + aps_bpf_error_bits[1]); + + WORD32 i4_drain_bits_per_frame = (e_pic_type == I_PIC) ? + ps_cbr_buffer->i4_drain_bits_per_frame[0] : + ps_cbr_buffer->i4_drain_bits_per_frame[1]; + + /* Update the Encoder buffer with the total consumed bits*/ + ps_cbr_buffer->i4_ebf += i4_tot_consumed_bits; + + /* + * Subtract the drain bits and error bits due to fixed point + * implementation + */ + ps_cbr_buffer->i4_ebf -= (i4_drain_bits_per_frame + i4_error_bits); + + if(ps_cbr_buffer->i4_ebf < 0) + ps_cbr_buffer->i4_ebf = 0; + + /*SS - Fix for lack of stuffing*/ + if(ps_cbr_buffer->i4_ebf > ps_cbr_buffer->i4_buffer_size) + { + trace_printf( + (const WORD8*)"Error: Should not be coming here with stuffing\n"); + ps_cbr_buffer->i4_ebf = ps_cbr_buffer->i4_buffer_size; + } +} + +/******************************************************************************* + * @brief If the buffer underflows then return the number of bits to prevent + * underflow + * + ******************************************************************************/ +WORD32 irc_get_cbr_bits_to_stuff(cbr_buffer_t *ps_cbr_buffer, + WORD32 i4_tot_consumed_bits, + picture_type_e e_pic_type) +{ + WORD32 i4_bits_to_stuff; + WORD32 i4_error_bits = (e_pic_type == I_PIC) ? + irc_get_error_bits(ps_cbr_buffer + ->aps_bpf_error_bits[0]) : + irc_get_error_bits(ps_cbr_buffer + ->aps_bpf_error_bits[1]); + + WORD32 i4_drain_bits_per_frame = (e_pic_type == I_PIC) ? 
+ ps_cbr_buffer->i4_drain_bits_per_frame[0] : + ps_cbr_buffer->i4_drain_bits_per_frame[1]; + + /* + * Stuffing bits got from the following equation + * Stuffing_threshold = ebf + tcb - drain bits - error bits + stuff_bits + */ + i4_bits_to_stuff = i4_drain_bits_per_frame + i4_error_bits + + ps_cbr_buffer->i4_stuffing_threshold + - (ps_cbr_buffer->i4_ebf + i4_tot_consumed_bits); + + return i4_bits_to_stuff; +} + +/******************************************************************************* + * @brief Update the state for change in number of pics in the delay period + * + ******************************************************************************/ +void irc_change_cbr_vbv_num_pics_in_delay_period(cbr_buffer_t *ps_cbr_buffer, + UWORD32 *u4_num_pics_in_delay_prd) +{ + WORD32 i; + + if(!ps_cbr_buffer->i4_is_cbr_mode) + { + ps_cbr_buffer->i4_buffer_size = + u4_num_pics_in_delay_prd[0] + * ps_cbr_buffer->i4_drain_bits_per_frame[0] + + u4_num_pics_in_delay_prd[1] + * ps_cbr_buffer->i4_drain_bits_per_frame[1]; + + if(ps_cbr_buffer->i4_buffer_size + > (WORD32)ps_cbr_buffer->u4_max_vbv_buf_size) + { + ps_cbr_buffer->i4_buffer_size = ps_cbr_buffer->u4_max_vbv_buf_size; + } + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_cbr_buffer->i4_upr_thr[i] = ps_cbr_buffer->i4_buffer_size + - (ps_cbr_buffer->i4_buffer_size >> 3); + } + + /* Re-initialize the number of pics in delay period */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_cbr_buffer->ai4_num_pics_in_delay_period[i] = + u4_num_pics_in_delay_prd[i]; + } + } +} + +/****************************************************************************** + * @brief update the state for change in target frame rate + * + ******************************************************************************/ +void irc_change_cbr_vbv_tgt_frame_rate(cbr_buffer_t *ps_cbr_buffer, + WORD32 i4_tgt_frm_rate) +{ + WORD32 i4_i, i4_bits_per_frm[MAX_NUM_DRAIN_RATES]; + int i; + + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + 
X_PROD_Y_DIV_Z(ps_cbr_buffer->ai4_bit_rate[i], 1000, i4_tgt_frm_rate, + i4_bits_per_frm[i]); + /* Drain rate = bitrate/(framerate/1000) */ + ps_cbr_buffer->i4_drain_bits_per_frame[i] = i4_bits_per_frm[i]; + /* Initialize the bits per frame error bits calculation */ + irc_change_frm_rate_in_error_bits(ps_cbr_buffer->aps_bpf_error_bits[i], + i4_tgt_frm_rate); + } + + /* Bitrate * delay = buffer size, divide by 1000 as delay is in ms*/ + if(!ps_cbr_buffer->i4_is_cbr_mode) + { + /* VBR streaming case which has different drain rates for I and P */ + ps_cbr_buffer->i4_buffer_size = + ps_cbr_buffer->ai4_num_pics_in_delay_period[0] + * ps_cbr_buffer->i4_drain_bits_per_frame[0] + + ps_cbr_buffer->ai4_num_pics_in_delay_period[1] + * ps_cbr_buffer->i4_drain_bits_per_frame[1]; + } + + if(ps_cbr_buffer->i4_buffer_size + > (WORD32)ps_cbr_buffer->u4_max_vbv_buf_size) + { + ps_cbr_buffer->i4_buffer_size = ps_cbr_buffer->u4_max_vbv_buf_size; + } + + /* + * Tgt_frame_rate is divided by 1000 because an approximate value is fine as + * this is just a threshold below which stuffing is done to avoid buffer + * underflow due to fixed point error in drain rate + */ + ps_cbr_buffer->i4_stuffing_threshold = (ps_cbr_buffer->ai4_bit_rate[0] + - (i4_bits_per_frm[0] * (i4_tgt_frm_rate / 1000))); + + for(i4_i = 0; i4_i < MAX_PIC_TYPE; i4_i++) + { + /* + * Upper threshold for + * I frame = 1 * bits per frame + * P Frame = 4 * bits per frame. + * The threshold for I frame is only 1 * bits per frame as the threshold should + * only account for error in estimated bits. + * In P frame it should account for difference bets bits consumed by I(Scene change) + * and P frame I to P complexity is assumed to be 5. + */ + WORD32 i4_index; + i4_index = i4_i > 0 ? 1 : 0; + ps_cbr_buffer->i4_upr_thr[i4_i] = ps_cbr_buffer->i4_buffer_size + - (ps_cbr_buffer->i4_buffer_size >> 3); + + /* + * For both I and P frame Lower threshold is equal to drain rate. 
+ * Even if the encoder consumes zero bits it should have enough bits to + * drain + */ + ps_cbr_buffer->i4_low_thr[i4_i] = i4_bits_per_frm[i4_index]; + } + + /* Storing the input parameters for using it for change functions */ + ps_cbr_buffer->i4_tgt_frm_rate = i4_tgt_frm_rate; +} + +/******************************************************************************* + * @brief Change the state for change in bit rate + * + ******************************************************************************/ +void irc_change_cbr_vbv_bit_rate(cbr_buffer_t *ps_cbr_buffer, + WORD32 *i4_bit_rate) +{ + WORD32 i4_i, i4_bits_per_frm[MAX_NUM_DRAIN_RATES]; + int i; + + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + X_PROD_Y_DIV_Z(i4_bit_rate[i], 1000, ps_cbr_buffer->i4_tgt_frm_rate, + i4_bits_per_frm[i]); + /* Drain rate = bitrate/(framerate/1000) */ + ps_cbr_buffer->i4_drain_bits_per_frame[i] = i4_bits_per_frm[i]; + /* Initialize the bits per frame error bits calculation */ + irc_change_bitrate_in_error_bits(ps_cbr_buffer->aps_bpf_error_bits[i], + i4_bit_rate[i]); + } + + /* Bitrate * delay = buffer size, divide by 1000 as delay is in ms*/ + if(i4_bit_rate[0] == i4_bit_rate[1]) /* This would mean CBR mode */ + { + X_PROD_Y_DIV_Z(i4_bit_rate[0], ps_cbr_buffer->i4_max_delay, 1000, + ps_cbr_buffer->i4_buffer_size); + ps_cbr_buffer->i4_is_cbr_mode = 1; + } + else + { + /* VBR streaming case which has different drain rates for I and P */ + ps_cbr_buffer->i4_buffer_size = + ps_cbr_buffer->ai4_num_pics_in_delay_period[0] + * ps_cbr_buffer->i4_drain_bits_per_frame[0] + + ps_cbr_buffer->ai4_num_pics_in_delay_period[1] + * ps_cbr_buffer->i4_drain_bits_per_frame[1]; + + ps_cbr_buffer->i4_is_cbr_mode = 0; + } + + if(ps_cbr_buffer->i4_buffer_size + > (WORD32)ps_cbr_buffer->u4_max_vbv_buf_size) + { + ps_cbr_buffer->i4_buffer_size = ps_cbr_buffer->u4_max_vbv_buf_size; + } + + /* + * tgt_frame_rate is divided by 1000 because + * an approximate value is fine as this is just a threshold below which + 
* stuffing is done to avoid buffer underflow due to fixed point + * error in drain rate + */ + ps_cbr_buffer->i4_stuffing_threshold = (i4_bit_rate[0] + - (i4_bits_per_frm[0] + * (ps_cbr_buffer->i4_tgt_frm_rate / 1000))); + + for(i4_i = 0; i4_i < MAX_PIC_TYPE; i4_i++) + { + /* + * Upper threshold for + * I frame = 1 * bits per frame + * P Frame = 4 * bits per frame. + * The threshold for I frame is only 1 * bits per frame as the threshold + * should only account for error in estimated bits. + * In P frame it should account for difference bets bits consumed by + * I(Scene change) and P frame I to P complexity is assumed to be 5. + */ + + WORD32 i4_index; + i4_index = i4_i > 0 ? 1 : 0; + ps_cbr_buffer->i4_upr_thr[i4_i] = ps_cbr_buffer->i4_buffer_size + - (ps_cbr_buffer->i4_buffer_size >> 3); + + /* For both I and P frame Lower threshold is equal to drain rate. + * Even if the encoder consumes zero bits it should have enough bits to + * drain + */ + ps_cbr_buffer->i4_low_thr[i4_i] = i4_bits_per_frm[i4_index]; + } + + /* Storing the input parameters for using it for change functions */ + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + ps_cbr_buffer->ai4_bit_rate[i] = i4_bit_rate[i]; + } +} + +void irc_change_cbr_buffer_delay(cbr_buffer_t *ps_cbr_buffer, + WORD32 i4_buffer_delay) +{ + WORD32 i4_i; + + /* Bitrate * delay = buffer size, divide by 1000 as delay is in ms*/ + if(ps_cbr_buffer->i4_is_cbr_mode) + { + X_PROD_Y_DIV_Z(ps_cbr_buffer->ai4_bit_rate[0], i4_buffer_delay, 1000, + ps_cbr_buffer->i4_buffer_size); + } + + if(ps_cbr_buffer->i4_buffer_size + > (WORD32)ps_cbr_buffer->u4_max_vbv_buf_size) + { + ps_cbr_buffer->i4_buffer_size = ps_cbr_buffer->u4_max_vbv_buf_size; + } + + for(i4_i = 0; i4_i < MAX_PIC_TYPE; i4_i++) + { + /* + * Upper threshold for + * I frame = 1 * bits per frame + * P Frame = 4 * bits per frame. + * The threshold for I frame is only 1 * bits per frame as the threshold + * should only account for error in estimated bits. 
+ * In P frame it should account for difference bets bits consumed by I + * (Scene change) and P frame I to P complexity is assumed to be 5. + */ + ps_cbr_buffer->i4_upr_thr[i4_i] = ps_cbr_buffer->i4_buffer_size + - (ps_cbr_buffer->i4_buffer_size >> 3); + } + + /* Storing the input parameters for using it for change functions */ + ps_cbr_buffer->i4_max_delay = i4_buffer_delay; +} + +WORD32 irc_get_cbr_buffer_delay(cbr_buffer_t *ps_cbr_buffer) +{ + return (ps_cbr_buffer->i4_max_delay); +} + +WORD32 irc_get_cbr_buffer_size(cbr_buffer_t *ps_cbr_buffer) +{ + return (ps_cbr_buffer->i4_buffer_size); +} diff --git a/encoder/irc_cbr_buffer_control.h b/encoder/irc_cbr_buffer_control.h new file mode 100755 index 0000000..2534961 --- /dev/null +++ b/encoder/irc_cbr_buffer_control.h @@ -0,0 +1,104 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/*****************************************************************************/ +/* */ +/* File Name : irc_cbr_buffer_control.h */ +/* */ +/* Description : This file contains all the necessary declarations */ +/* for cbr_buffer_control functions */ +/* */ +/* */ +/* List of Functions : <List the functions defined in this file> */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 06 05 2008 Ittiam Draft */ +/* */ +/*****************************************************************************/ + +#ifndef CBR_BUFFER_CONTROL_H +#define CBR_BUFFER_CONTROL_H + +/* Macro for clipping a number between to extremes */ +#define CLIP(Number,Max,Min) if((Number) > (Max)) (Number) = (Max); \ + else if((Number) < (Min)) (Number) = (Min); +/*****************************************************************************/ +/* Structure */ +/*****************************************************************************/ +typedef struct cbr_buffer_t *cbr_buffer_handle; + +WORD32 irc_cbr_buffer_num_fill_use_free_memtab(cbr_buffer_handle *pps_cbr_buffer, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type); + +/* Initialize the cbr Buffer*/ +void irc_init_cbr_buffer(cbr_buffer_handle ps_cbr_buffer, + WORD32 i4_buffer_delay, + WORD32 i4_tgt_frm_rate, + WORD32 *i4_bit_rate, + UWORD32 *u4_num_pics_in_delay_prd, + UWORD32 u4_vbv_buf_size); + +/* Check for tgt bits with in CBR buffer*/ +WORD32 irc_cbr_buffer_constraint_check(cbr_buffer_handle ps_cbr_buffer, + WORD32 i4_tgt_bits, + picture_type_e e_pic_type); + +/* Get the buffer status with the current consumed bits*/ +vbv_buf_status_e irc_get_cbr_buffer_status(cbr_buffer_handle ps_cbr_buffer, + WORD32 i4_tot_consumed_bits, + WORD32 *pi4_num_bits_to_prevent_overflow, + picture_type_e e_pic_type); + +/* Update the CBR buffer at the end of the VOP*/ +void irc_update_cbr_buffer(cbr_buffer_handle ps_cbr_buffer, + WORD32 
i4_tot_consumed_bits, + picture_type_e e_pic_type); + +/*Get the bits needed to stuff in case of Underflow*/ +WORD32 irc_get_cbr_bits_to_stuff(cbr_buffer_handle ps_cbr_buffer, + WORD32 i4_tot_consumed_bits, + picture_type_e e_pic_type); + +WORD32 irc_get_cbr_buffer_delay(cbr_buffer_handle ps_cbr_buffer); + +WORD32 irc_get_cbr_buffer_size(cbr_buffer_handle ps_cbr_buffer); + +WORD32 irc_vbr_stream_buffer_constraint_check(cbr_buffer_handle ps_cbr_buffer, + WORD32 i4_tgt_bits, + picture_type_e e_pic_type); + +void irc_change_cbr_vbv_bit_rate(cbr_buffer_handle ps_cbr_buffer, + WORD32 *i4_bit_rate); + +void irc_change_cbr_vbv_tgt_frame_rate(cbr_buffer_handle ps_cbr_buffer, + WORD32 i4_tgt_frm_rate); + +void irc_change_cbr_vbv_num_pics_in_delay_period(cbr_buffer_handle ps_cbr_buffer, + UWORD32 *u4_num_pics_in_delay_prd); + +void irc_change_cbr_buffer_delay(cbr_buffer_handle ps_cbr_buffer, + WORD32 i4_buffer_delay); +#endif /* CBR_BUFFER_CONTROL_H */ + diff --git a/encoder/irc_cntrl_param.h b/encoder/irc_cntrl_param.h new file mode 100755 index 0000000..82235f7 --- /dev/null +++ b/encoder/irc_cntrl_param.h @@ -0,0 +1,59 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +#ifndef _RC_CNTRL_PARAM_H_ +#define _RC_CNTRL_PARAM_H_ + +/* This file should contain only enumerations exported to codec by RC */ + +/* RC algo type */ +typedef enum +{ + VBR_STORAGE = 0, + VBR_STORAGE_DVD_COMP = 1, + VBR_STREAMING = 2, + CONST_QP = 3, + CBR_LDRC = 4, + CBR_NLDRC = 5 + +} rc_type_e; + +/* Picture type structure*/ +typedef enum +{ + BUF_PIC = -1, I_PIC = 0, P_PIC, B_PIC, MAX_PIC_TYPE + +} picture_type_e; + +/* MB Type structure*/ +typedef enum +{ + /* Based on MB TYPES added the array size increases */ + MB_TYPE_INTRA, MB_TYPE_INTER, MAX_MB_TYPE +} mb_type_e; + +typedef enum +{ + VBV_NORMAL, VBV_UNDERFLOW, VBV_OVERFLOW, VBR_CAUTION + +} vbv_buf_status_e; + +#endif + diff --git a/encoder/irc_common.h b/encoder/irc_common.h new file mode 100755 index 0000000..c341de4 --- /dev/null +++ b/encoder/irc_common.h @@ -0,0 +1,104 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +#ifndef _RC_COMMON_H_ +#define _RC_COMMON_H_ + +/**************************************************************************** + NOTE : Put only those things into this file which are common across many + files, say I_TO_P_BIT_RATIO macro is used across irc_bit_allocation.c + and irc_rate_control_api.c.If anything is exclusive only to one file, + define it in the same file + + This file is an RC private file. It should not be exported to Codec + ****************************************************************************/ + +#define UNUSED(x) ((void)(x)) + +typedef float number_t; + +#define mult32_var_q(a,b,c) *c = a * b + +#define div32_var_q(a,b,c) (*c = ((b == 0)? a : (a / b))) + +#define add32_var_q(a,b,c) *c = a + b + +#define sub32_var_q(a,b,c) *c = a - b + +#define sqrt32_var_q(a, c) *c = sqrt(a) + +#define number_t_to_word32(num_a, a) *a = (WORD32)num_a + +#define convert_float_to_fix(a_f, a) *a = (WORD32)a_f + +#define convert_fix_to_float(a, a_f) *a_f = (float) a + +#define SET_VAR_Q(a,b,c) {a = (float) b;} + + +/* Defines the maximum and the minimum quantizer allowed in the stream.*/ +#define MAX_MPEG2_QP 255 /* 127*/ + +/* Bits ratio between I and P frame */ +#define I_TO_P_BIT_RATIO 5 + +/* Calculates P = (X*Y/Z) (Assuming all the four are in integers)*/ +#define X_PROD_Y_DIV_Z(X1,Y1,Z1,P1)\ +{\ + number_t vq_a,vq_b,vq_c;\ + SET_VAR_Q(vq_a,(X1),0);\ + SET_VAR_Q(vq_b,(Y1),0);\ + SET_VAR_Q(vq_c,(Z1),0);\ + mult32_var_q(vq_a,vq_b,&vq_a);\ + div32_var_q(vq_a,vq_c,&vq_a);\ + number_t_to_word32(vq_a,&(P1));\ +} +#define VQ_A_LT_VQ_B(A,B, Z) Z = A < B; +#define VQ_A_GT_VQ_B(A,B, Z) Z = A > B; + +/* Z=MAX(A,B) where A, B and Z are var_q variables */ +#define MAX_VARQ(A,B, Z)\ +{\ + WORD32 a_gt_b;\ + VQ_A_GT_VQ_B((A), (B), a_gt_b);\ + (Z) = (a_gt_b) ? (A) : (B);\ +} + +/* Z=MIN(A,B) where A, B and Z are var_q variables */ +#define MIN_VARQ(A,B, Z)\ +{\ + WORD32 a_lt_b;\ + VQ_A_LT_VQ_B((A), (B), a_lt_b);\ + (Z) = (a_lt_b) ? 
(A) : (B);\ +} + +/* Maximum number of drain-rates supported. Currently a maximum of only 2 + drain-rates supported. One for + I pictures and the other for P & B pictures */ +#define MAX_NUM_DRAIN_RATES 2 + +/* The ratios between I to P and P to B Qp is specified here */ +#define K_Q 4 +#define I_TO_P_RATIO (19) /* In K_Q Q factor */ +#define P_TO_B_RATIO (21) /* In K_Q Q factor */ +#define P_TO_I_RATIO (13) /* In K_Q Q factor */ + +#endif /* _RC_COMMON_H_ */ + diff --git a/encoder/irc_datatypes.h b/encoder/irc_datatypes.h new file mode 100755 index 0000000..8e4685a --- /dev/null +++ b/encoder/irc_datatypes.h @@ -0,0 +1,64 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264_typedefs.h +* +* @brief +* Type definitions used in the code +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef _IH264_TYPEDEFS_H_ +#define _IH264_TYPEDEFS_H_ + + +/*****************************************************************************/ +/* Unsigned data types */ +/*****************************************************************************/ +typedef unsigned char UWORD8; +typedef unsigned short UWORD16; +typedef unsigned int UWORD32; +typedef unsigned long long UWORD64; + + +/*****************************************************************************/ +/* Signed data types */ +/*****************************************************************************/ +typedef signed char WORD8; +typedef short WORD16; +typedef int WORD32; + + +/*****************************************************************************/ +/* Miscellaneous data types */ +/*****************************************************************************/ +typedef char CHAR; +typedef double DOUBLE; + +#endif /* _IH264_TYPEDEFS_H_ */ diff --git a/encoder/irc_est_sad.c b/encoder/irc_est_sad.c new file mode 100755 index 0000000..0d8abc2 --- /dev/null +++ b/encoder/irc_est_sad.c @@ -0,0 +1,260 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/*****************************************************************************/ +/* Includes */ +/*****************************************************************************/ + +/* User include files */ +#include "irc_datatypes.h" +#include "irc_cntrl_param.h" +#include "irc_mem_req_and_acq.h" +#include "irc_est_sad.h" +#include "irc_common.h" + +typedef struct est_sad_t +{ + WORD32 i4_use_est_intra_sad; + + /* Previous frame SAD */ + UWORD32 au4_prev_frm_sad[MAX_PIC_TYPE]; + + /* Current (nth) ifi average P frame SAD */ + UWORD32 u4_n_p_frm_ifi_avg_sad; + + /* (n-1)th ifi average P frame SAD */ + UWORD32 u4_n_1_p_frm_ifi_avg_sad; + + /* (n-2)th ifi average P frame SAD */ + UWORD32 u4_n_2_p_frm_ifi_avg_sad; + + /* number of ifi encoded till now */ + WORD32 i4_num_ifi_encoded; + + /* number of P frames in the current IFI */ + WORD32 i4_num_p_frm_in_cur_ifi; + +} est_sad_t; + +WORD32 irc_est_sad_num_fill_use_free_memtab(est_sad_t **pps_est_sad, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type) +{ + WORD32 i4_mem_tab_idx = 0; + static est_sad_t s_est_sad; + + /* Hack for al alloc, during which we don't have any state memory. 
+ * Dereferencing can cause issues + */ + if(e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB) + (*pps_est_sad) = &s_est_sad; + + /* For src rate control state structure */ + if(e_func_type != GET_NUM_MEMTAB) + { + fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(est_sad_t), + ALIGN_128_BYTE, PERSISTENT, DDR); + use_or_fill_base(&ps_memtab[0], (void**)pps_est_sad, e_func_type); + } + i4_mem_tab_idx++; + + return (i4_mem_tab_idx); +} + +void irc_init_est_sad(est_sad_t *ps_est_sad, WORD32 i4_use_est_intra_sad) +{ + WORD32 i; + ps_est_sad->i4_use_est_intra_sad = i4_use_est_intra_sad; + + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_est_sad->au4_prev_frm_sad[i] = 0; + } + + ps_est_sad->u4_n_p_frm_ifi_avg_sad = 0; + ps_est_sad->u4_n_1_p_frm_ifi_avg_sad = 0; + ps_est_sad->u4_n_2_p_frm_ifi_avg_sad = 0; + ps_est_sad->i4_num_ifi_encoded = 0; + ps_est_sad->i4_num_p_frm_in_cur_ifi = 0; +} + +void irc_reset_est_sad(est_sad_t *ps_est_sad) +{ + irc_init_est_sad(ps_est_sad, ps_est_sad->i4_use_est_intra_sad); +} + +/* + * Get estimated SAD can be called at any point. The various use cases are: + * 1) When a I frame is getting encoded, + * - get the estimated of P => No issues since we use the last coded P frame + * value + * - get estimated of I => This call for two cases: + * => a) if num_ifi_encoded is less than 2 + * then return the previous encoded I frame sad + * => b) if num_ifi_encoded is more than 2, then we scale + * the prev I sad by the ratio of (n-1) ifi P to n-2 ifi P + * 2) When P frame is getting encoded, + * - get the estimated of P => No issues since we use the last coded P frame value + * - get the estimated of I => Simillar to I we have two cases. 
+ * To handle the b) case extra logic had to introduced using + * u1_is_n_1_p_frm_ifi_avg_sad_usable flag + */ +UWORD32 irc_get_est_sad(est_sad_t *ps_est_sad, picture_type_e e_pic_type) +{ + if(ps_est_sad->i4_use_est_intra_sad) + { + UWORD32 u4_estimated_sad; + if(e_pic_type == P_PIC) + { + u4_estimated_sad = ps_est_sad->au4_prev_frm_sad[P_PIC]; + } + else if(e_pic_type == B_PIC) + { + u4_estimated_sad = ps_est_sad->au4_prev_frm_sad[B_PIC]; + } + else + { + if(ps_est_sad->i4_num_ifi_encoded < 2) + { + /* + * Only one IFI has been encoded and so use the previous I + * frames SAD + */ + u4_estimated_sad = ps_est_sad->au4_prev_frm_sad[I_PIC]; + } + else + { + /* + * Since the n-1 'P' frame IFI would have just accumulated the + * frame sads we average it out here + */ + UWORD32 u4_n_1_p_frm_ifi_avg_sad, u4_n_2_p_frm_ifi_avg_sad; + number_t vq_n_1_p_frm_ifi_avg_sad, vq_n_2_p_frm_ifi_avg_sad; + number_t vq_prev_frm_sad_i; + + /* + * If there are frames in the current IFI start using it to + * estimate the I frame SAD + */ + if(ps_est_sad->i4_num_p_frm_in_cur_ifi) + { + u4_n_1_p_frm_ifi_avg_sad = + (ps_est_sad->u4_n_p_frm_ifi_avg_sad + / ps_est_sad->i4_num_p_frm_in_cur_ifi); + u4_n_2_p_frm_ifi_avg_sad = + ps_est_sad->u4_n_1_p_frm_ifi_avg_sad; + } + else + { + u4_n_1_p_frm_ifi_avg_sad = + ps_est_sad->u4_n_1_p_frm_ifi_avg_sad; + u4_n_2_p_frm_ifi_avg_sad = + ps_est_sad->u4_n_2_p_frm_ifi_avg_sad; + } + + /* + * If any of the previous p frame SADs are zeros we just return + * the previous I frame SAD + */ + if(u4_n_1_p_frm_ifi_avg_sad && u4_n_2_p_frm_ifi_avg_sad) + { + SET_VAR_Q(vq_prev_frm_sad_i, + ps_est_sad->au4_prev_frm_sad[I_PIC], 0); + SET_VAR_Q(vq_n_1_p_frm_ifi_avg_sad, + u4_n_1_p_frm_ifi_avg_sad, 0); + SET_VAR_Q(vq_n_2_p_frm_ifi_avg_sad, + u4_n_2_p_frm_ifi_avg_sad, 0); + /* + * Estimated SAD = + *(n-1)th intra frame interval(ifi) P frame Avg SAD * + *(prev I frame SAD / + *(prev (n-2)nd intra frame interval(ifi) P frame Avg SAD) + */ + mult32_var_q(vq_prev_frm_sad_i, 
vq_n_1_p_frm_ifi_avg_sad, + &vq_prev_frm_sad_i); + div32_var_q(vq_prev_frm_sad_i, vq_n_2_p_frm_ifi_avg_sad, + &vq_prev_frm_sad_i); + number_t_to_word32(vq_prev_frm_sad_i, + (WORD32*)&u4_estimated_sad); + } + else + { + u4_estimated_sad = ps_est_sad->au4_prev_frm_sad[I_PIC]; + } + } + } + return u4_estimated_sad; + } + else + { + return ps_est_sad->au4_prev_frm_sad[e_pic_type]; + } +} + +void irc_update_actual_sad(est_sad_t *ps_est_sad, + UWORD32 u4_actual_sad, + picture_type_e e_pic_type) +{ + ps_est_sad->au4_prev_frm_sad[e_pic_type] = u4_actual_sad; + + if(ps_est_sad->i4_use_est_intra_sad) + { + if(e_pic_type == I_PIC) + { + /* The requirement is to have two IFI before estimating I frame SAD */ + if(ps_est_sad->i4_num_ifi_encoded < 2) + ps_est_sad->i4_num_ifi_encoded++; + + /* Calculate the average SAD */ + if(ps_est_sad->i4_num_p_frm_in_cur_ifi) + { + ps_est_sad->u4_n_p_frm_ifi_avg_sad /= + ps_est_sad->i4_num_p_frm_in_cur_ifi; + } + else + { + ps_est_sad->u4_n_p_frm_ifi_avg_sad = 0; + } + /* Push the (n-1)th average SAD to the (n-2)th average SAD */ + ps_est_sad->u4_n_2_p_frm_ifi_avg_sad = + ps_est_sad->u4_n_1_p_frm_ifi_avg_sad; + /* Push the nth average SAD to the (n-1)th average SAD */ + ps_est_sad->u4_n_1_p_frm_ifi_avg_sad = + ps_est_sad->u4_n_p_frm_ifi_avg_sad; + /* Reset SAD and number of P frames */ + ps_est_sad->u4_n_p_frm_ifi_avg_sad = 0; + ps_est_sad->i4_num_p_frm_in_cur_ifi = 0; + } + else + { + ps_est_sad->u4_n_p_frm_ifi_avg_sad += u4_actual_sad; + ps_est_sad->i4_num_p_frm_in_cur_ifi++; + } + } +} + +void irc_update_actual_sad_for_intra(est_sad_t *ps_est_sad, + WORD32 i4_intra_frm_cost) +{ + if(!(ps_est_sad->i4_use_est_intra_sad)) + { + irc_update_actual_sad(ps_est_sad, i4_intra_frm_cost, I_PIC); + } +} diff --git a/encoder/irc_est_sad.h b/encoder/irc_est_sad.h new file mode 100755 index 0000000..c8238c9 --- /dev/null +++ b/encoder/irc_est_sad.h @@ -0,0 +1,64 @@ +/****************************************************************************** + * + * 
Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+#ifndef _EST_SAD_H_
+#define _EST_SAD_H_
+
+/*
+ * "est_sad_t->i4_use_est_intra_sad" Flag to control how the I frame SAD is estimated.
+ * If set to zero
+ * - it uses the Intra sad calculated by the previous P frame as
+ *   the estimated sad for the current I frame
+ * else
+ * - it uses the ratio of P frame sads of the previous two GOPS and
+ *   scales the I Frame sad with this ratio to estimate the current
+ *   I frame SAD
+ */
+
+/* Estimating the Average SAD for the current picture type is done by:
+ * 1) if picture_type is I
+ *    - Estimated SAD = (n-1)th intra frame interval(ifi) P frame Avg SAD *
+ *      ( prev I frame SAD / (n-2)th intra frame interval(ifi) P frame Avg SAD)
+ *    - if only one IFI is encoded use the previous I frame SAD
+ * 2) if picture type is P
+ *    - Estimated SAD is the previous P frame SAD
+ * 3) The first P frame in an IFI could use a little better logic to decide the
+ *    estimated SAD but currently we assume the last coded P frame's SAD
+ */
+
+typedef struct est_sad_t *est_sad_handle;
+
+WORD32 irc_est_sad_num_fill_use_free_memtab(est_sad_handle *est_sad,
+                                            itt_memtab_t *ps_memtab,
+                                            ITT_FUNC_TYPE_E e_func_type);
+
+void irc_init_est_sad(est_sad_handle est_sad, WORD32 i4_use_est_frame_sad);
+
+/* Returns the estimated SAD for a picture of the given type */
+UWORD32 irc_get_est_sad(est_sad_handle est_sad, picture_type_e e_pic_type);
+
+/* Records the actual SAD of the picture of the given type just encoded */
+void irc_update_actual_sad(est_sad_handle est_sad,
+                           UWORD32 u4_actual_sad,
+                           picture_type_e e_pic_type);
+
+/* Updates the I picture SAD only when I frame SAD estimation is disabled */
+void irc_update_actual_sad_for_intra(est_sad_handle est_sad,
+                                     WORD32 i4_intra_frm_cost);
+
+void irc_reset_est_sad(est_sad_handle ps_est_sad);
+#endif
diff --git a/encoder/irc_fixed_point_error_bits.c b/encoder/irc_fixed_point_error_bits.c
new file mode 100755
index 0000000..42dcfc5
--- /dev/null
+++ b/encoder/irc_fixed_point_error_bits.c
@@ -0,0 +1,185 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+
+/* User include files */
+#include "irc_datatypes.h"
+#include "irc_common.h"
+#include "irc_cntrl_param.h"
+#include "irc_mem_req_and_acq.h"
+#include "irc_fixed_point_error_bits.h"
+
+typedef struct error_bits_t
+{
+    /* Max tgt frm rate so that dynamic change in frm rate can be handled */
+    WORD32 i4_max_tgt_frm_rate;
+
+    /* Cur frm rate (running counter, wrapped by i4_max_tgt_frm_rate) */
+    WORD32 i4_cur_tgt_frm_rate;
+
+    /* tgt frame rate */
+    WORD32 i4_tgt_frm_rate;
+
+    /* tgt frm rate increment */
+    WORD32 i4_tgt_frm_rate_incr;
+
+    /* flag to indicate 1 second is up */
+    UWORD8 u1_compute_error_bits;
+
+    /* Bitrate/frame rate value added over a period */
+    WORD32 i4_accum_bitrate;
+
+    /* bitrate */
+    WORD32 i4_bitrate;
+
+} error_bits_t;
+
+/*
+ * Memtab helper for the error bits state. For every e_func_type other than
+ * GET_NUM_MEMTAB it fills the memtab entry (sizeof(error_bits_t), 128 byte
+ * aligned, persistent, DDR) and calls use_or_fill_base() on it. Always
+ * returns 1, the number of memtabs this module uses.
+ */
+WORD32 irc_error_bits_num_fill_use_free_memtab(error_bits_t **pps_error_bits,
+                                               itt_memtab_t *ps_memtab,
+                                               ITT_FUNC_TYPE_E e_func_type)
+{
+    WORD32 i4_mem_tab_idx = 0;
+    static error_bits_t s_error_bits_temp;
+
+    /*
+     * Hack for all alloc, during which we don't have any state memory.
+     * Dereferencing can cause issues
+     */
+    if(e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB)
+        (*pps_error_bits) = &s_error_bits_temp;
+
+    /* For src rate control state structure */
+    if(e_func_type != GET_NUM_MEMTAB)
+    {
+        fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(error_bits_t),
+                    ALIGN_128_BYTE, PERSISTENT, DDR);
+        /* Index 0 is the same entry as i4_mem_tab_idx, which is still 0 here */
+        use_or_fill_base(&ps_memtab[0], (void**)pps_error_bits, e_func_type);
+    }
+    i4_mem_tab_idx++;
+
+    return (i4_mem_tab_idx);
+}
+
+/*******************************************************************************
+ * @brief Initializes the state used to track the error bits that arise from
+ *        fixed point divisions
+ ******************************************************************************/
+void irc_init_error_bits(error_bits_t *ps_error_bits,
+                         WORD32 i4_max_tgt_frm_rate,
+                         WORD32 i4_bitrate)
+{
+    /* Initializing the parameters*/
+    ps_error_bits->i4_cur_tgt_frm_rate = 0;
+    ps_error_bits->i4_max_tgt_frm_rate = i4_max_tgt_frm_rate;
+
+    /* Value by which i4_cur_tgt_frm_rate is incremented every VOP*/
+    ps_error_bits->i4_tgt_frm_rate_incr = 1000;
+
+    /*Compute error bits is set to 1 at the end of 1 second*/
+    ps_error_bits->u1_compute_error_bits = 0;
+    ps_error_bits->i4_tgt_frm_rate = i4_max_tgt_frm_rate;
+    ps_error_bits->i4_accum_bitrate = 0;
+    ps_error_bits->i4_bitrate = i4_bitrate;
+}
+
+/*******************************************************************************
+ * @brief Updates the error state
+ ******************************************************************************/
+void irc_update_error_bits(error_bits_t *ps_error_bits)
+{
+    WORD32 i4_bits_per_frame;
+
+    /* presumably i4_bits_per_frame = bitrate * 1000 / tgt_frm_rate - confirm
+     * against the X_PROD_Y_DIV_Z definition */
+    X_PROD_Y_DIV_Z(ps_error_bits->i4_bitrate, 1000,
+                   ps_error_bits->i4_tgt_frm_rate, i4_bits_per_frame);
+
+    /*
+     * This value is incremented at the end of every VOP by
+     * i4_tgt_frm_rate_incr
+     */
+    ps_error_bits->i4_cur_tgt_frm_rate += ps_error_bits->i4_tgt_frm_rate_incr;
+    /* The previous call flagged the end of a second: start a new accumulation */
+    if(ps_error_bits->u1_compute_error_bits == 1)
+    {
+        ps_error_bits->i4_accum_bitrate = 0;
+    }
+    ps_error_bits->i4_accum_bitrate += i4_bits_per_frame;
+
+    /*
+     * When current tgt frm rate is equal or greater than max tgt frame rate
+     * 1 second is up , compute the error bits
+     */
+    if(ps_error_bits->i4_cur_tgt_frm_rate >= ps_error_bits->i4_max_tgt_frm_rate)
+    {
+        ps_error_bits->i4_cur_tgt_frm_rate -=
+                        ps_error_bits->i4_max_tgt_frm_rate;
+        ps_error_bits->u1_compute_error_bits = 1;
+    }
+    else
+    {
+        ps_error_bits->u1_compute_error_bits = 0;
+    }
+}
+
+/*******************************************************************************
+ * @brief Returns the error bits for the current frame if there are any
+ *        (zero unless the last update flagged that one second is up)
+ ******************************************************************************/
+WORD32 irc_get_error_bits(error_bits_t *ps_error_bits)
+{
+    WORD32 i4_error_bits = 0;
+
+    /*If 1s is up calculate error for the last 1s worth of frames*/
+    if(ps_error_bits->u1_compute_error_bits == 1)
+    {
+        /*Error = Actual bitrate - bits_per_frame * num of frames*/
+        i4_error_bits = ps_error_bits->i4_bitrate
+                        - ps_error_bits->i4_accum_bitrate;
+    }
+
+    return (i4_error_bits);
+}
+
+/* *****************************************************************************
+ *
+ * @brief Change the frame rate parameter for the error bits state
+ *
+ * NOTE(review): i4_tgt_frm_rate is used as a divisor without a zero check, and
+ * i4_max_tgt_frm_rate * 1000 is computed in WORD32 - confirm callers keep both
+ * in a safe range.
+ *
+ ******************************************************************************/
+void irc_change_frm_rate_in_error_bits(error_bits_t *ps_error_bits,
+                                       WORD32 i4_tgt_frm_rate)
+{
+    /* Value by which i4_cur_tgt_frm_rate is incremented every VOP*/
+    ps_error_bits->i4_tgt_frm_rate_incr = (ps_error_bits->i4_max_tgt_frm_rate
+                    * 1000) / i4_tgt_frm_rate;
+    ps_error_bits->i4_tgt_frm_rate = i4_tgt_frm_rate;
+}
+
+/*******************************************************************************
+ * @brief Change the bitrate value for error bits module
+ ******************************************************************************/
+void irc_change_bitrate_in_error_bits(error_bits_t *ps_error_bits,
+                                      WORD32 i4_bitrate)
+{
+    ps_error_bits->i4_bitrate = i4_bitrate;
+}
+
diff --git a/encoder/irc_fixed_point_error_bits.h b/encoder/irc_fixed_point_error_bits.h
new file mode 100755
index 0000000..4ddf1eb
--- /dev/null
+++ b/encoder/irc_fixed_point_error_bits.h
@@ -0,0 +1,64 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
+*/
+
+/*****************************************************************************/
+/*                                                                           */
+/*  File Name         : irc_fixed_point_error_bits.h                         */
+/*                                                                           */
+/*  Description       : This file contains all the necessary declarations    */
+/*                      for the fixed point error bits functions             */
+/*                                                                           */
+/*                                                                           */
+/*  List of Functions : <List the functions defined in this file>            */
+/*                                                                           */
+/*  Issues / Problems : None                                                 */
+/*                                                                           */
+/*  Revision History  :                                                      */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
+/*         06 05 2008   Ittiam          Draft                                */
+/*                                                                           */
+/*****************************************************************************/
+
+#ifndef FIXED_POINT_ERROR_BITS_H
+#define FIXED_POINT_ERROR_BITS_H
+
+typedef struct error_bits_t *error_bits_handle;
+
+WORD32 irc_error_bits_num_fill_use_free_memtab(error_bits_handle *pps_error_bits,
+                                               itt_memtab_t *ps_memtab,
+                                               ITT_FUNC_TYPE_E e_func_type);
+
+void irc_init_error_bits(error_bits_handle ps_error_bits,
+                         WORD32 i4_max_tgt_frm_rate,
+                         WORD32 i4_bitrate);
+
+void irc_update_error_bits(error_bits_handle ps_error_bits);
+
+WORD32 irc_get_error_bits(error_bits_handle ps_error_bits);
+
+void irc_change_frm_rate_in_error_bits(error_bits_handle ps_error_bits,
+                                       WORD32 i4_tgt_frm_rate);
+
+void irc_change_bitrate_in_error_bits(error_bits_handle ps_error_bits,
+                                      WORD32 i4_bitrate);
+
+#endif
+
diff --git a/encoder/irc_frame_info_collector.c b/encoder/irc_frame_info_collector.c
new file mode 100755
index 0000000..65f24c4
--- /dev/null
+++ b/encoder/irc_frame_info_collector.c
@@ -0,0 +1,177 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/******************************************************************************/
+/* File Includes                                                              */
+/******************************************************************************/
+
+/* User include files */
+#include "irc_datatypes.h"
+#include "irc_cntrl_param.h"
+#include "irc_frame_info_collector.h"
+
+/* Clears every per-MB-type counter and every frame level sum of frame_info */
+void irc_init_frame_info(frame_info_t *frame_info)
+{
+    WORD32 i;
+
+    for(i = 0; i < MAX_MB_TYPE; i++)
+    {
+        frame_info->mb_header_bits[i] = 0;
+        frame_info->tot_mb_sad[i] = 0;
+        frame_info->num_mbs[i] = 0;
+        frame_info->qp_sum[i] = 0;
+        frame_info->mb_texture_bits[i] = 0;
+    }
+
+    frame_info->other_header_bits = 0;
+    frame_info->activity_sum = 0;
+    frame_info->intra_mb_cost_sum = 0;
+}
+
+/******************************************************************************
+ * GET Functions: Sending back collected information to the rate control module
+ ******************************************************************************/
+/* MB header bits of all MB types plus the other (non MB) header bits */
+WORD32 irc_fi_get_total_header_bits(frame_info_t *frame_info)
+{
+    WORD32 total_header_bits = 0, i;
+
+    for(i = 0; i < MAX_MB_TYPE; i++)
+    {
+        total_header_bits += frame_info->mb_header_bits[i];
+    }
+    total_header_bits += frame_info->other_header_bits;
+
+    return (total_header_bits);
+}
+
+/* Texture bits summed over all MB types */
+WORD32 irc_fi_get_total_texture_bits(frame_info_t *frame_info)
+{
+    WORD32 total_texture_bits = 0, i;
+
+    for(i = 0; i < MAX_MB_TYPE; i++)
+    {
+        total_texture_bits += frame_info->mb_texture_bits[i];
+    }
+
+    return (total_texture_bits);
+}
+
+/* SAD summed over all MB types */
+WORD32 irc_fi_get_total_frame_sad(frame_info_t *frame_info)
+{
+    WORD32 total_sad = 0, i;
+
+    for(i = 0; i < MAX_MB_TYPE; i++)
+    {
+        total_sad += frame_info->tot_mb_sad[i];
+    }
+
+    return (total_sad);
+}
+
+/* Integer average QP over all MBs of the frame; 0 when no MB was counted */
+WORD32 irc_fi_get_average_qp(frame_info_t *frame_info)
+{
+    WORD32 i, total_qp = 0, total_mbs = 0;
+
+    for(i = 0; i < MAX_MB_TYPE; i++)
+    {
+        total_qp += frame_info->qp_sum[i];
+        total_mbs += frame_info->num_mbs[i];
+    }
+
+    if(total_mbs)
+    {
+        return (total_qp / total_mbs);
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+/*
+ * Integer average of the header bits per MB of type mb_type; 0 when no MB of
+ * that type was counted. mb_type is used as an array index without a range
+ * check here and in the getters below.
+ */
+WORD32 irc_fi_get_avg_mb_header(frame_info_t *frame_info, UWORD8 mb_type)
+{
+    if(frame_info->num_mbs[mb_type])
+    {
+        return (frame_info->mb_header_bits[mb_type]
+                        / frame_info->num_mbs[mb_type]);
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+WORD32 irc_fi_get_total_mb_texture_bits(frame_info_t *frame_info,
+                                        UWORD8 mb_type)
+{
+    return (frame_info->mb_texture_bits[mb_type]);
+}
+
+WORD32 irc_fi_get_total_mb_sad(frame_info_t *frame_info, UWORD8 mb_type)
+{
+    return (frame_info->tot_mb_sad[mb_type]);
+}
+
+/* Sum (not average) of the QPs of MB type mb_type; 0 when none was counted */
+WORD32 irc_fi_get_total_mb_qp(frame_info_t *frame_info, UWORD8 mb_type)
+{
+    if(frame_info->num_mbs[mb_type])
+    {
+        return (frame_info->qp_sum[mb_type]);
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+WORD32 irc_fi_get_total_mb(frame_info_t *frame_info, UWORD8 mb_type)
+{
+    return (frame_info->num_mbs[mb_type]);
+}
+
+WORD32 irc_fi_get_num_intra_mb(frame_info_t *frame_info)
+{
+    return (frame_info->num_mbs[MB_TYPE_INTRA]);
+}
+
+/* Integer average activity per MB of the frame; 0 when no MB was counted */
+WORD32 irc_fi_get_avg_activity(frame_info_t *frame_info)
+{
+    WORD32 i;
+    WORD32 i4_tot_mbs = 0;
+
+    for(i = 0; i < MAX_MB_TYPE; i++)
+    {
+        i4_tot_mbs += frame_info->num_mbs[i];
+    }
+
+    if(i4_tot_mbs)
+    {
+        return (frame_info->activity_sum / i4_tot_mbs);
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+WORD32 irc_fi_get_total_intra_mb_cost(frame_info_t *frame_info)
+{
+    return (frame_info->intra_mb_cost_sum);
+}
diff --git a/encoder/irc_frame_info_collector.h 
b/encoder/irc_frame_info_collector.h new file mode 100755 index 0000000..58dc467 --- /dev/null +++ b/encoder/irc_frame_info_collector.h @@ -0,0 +1,109 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
+*/
+
+#ifndef _FRAME_INFO_COLLECTOR_H_
+#define _FRAME_INFO_COLLECTOR_H_
+
+/* Per frame statistics collected from the encoder for the rate control module */
+typedef struct
+{
+    /* Number of MBs in each type */
+    WORD32 num_mbs[MAX_MB_TYPE];
+
+    /* Sum of all MB SADs of each MB type */
+    WORD32 tot_mb_sad[MAX_MB_TYPE];
+
+    /* Sum of QPs for each mb type */
+    WORD32 qp_sum[MAX_MB_TYPE];
+
+    /* Header bits consumed other than MB headers */
+    WORD32 other_header_bits;
+
+    /* Header bits consumed for each type of MBs */
+    WORD32 mb_header_bits[MAX_MB_TYPE];
+
+    /* Texture bits consumed for each type of MBs */
+    WORD32 mb_texture_bits[MAX_MB_TYPE];
+
+    /* Sum of all MB activity */
+    WORD32 activity_sum;
+
+    /* Sum of all the Intra MB cost values for the entire frame */
+    WORD32 intra_mb_cost_sum;
+
+} frame_info_t;
+
+void irc_init_frame_info(frame_info_t *frame_info);
+
+/*
+ * Update functions: Collecting information from encoder
+ *
+ * Each macro expands to a braced block, so a trailing ';' after the call
+ * leaves an extra empty statement; keep the calls inside braces when they are
+ * the body of an if / else. FI_UPDATE_MB_QP also increments the MB count of
+ * the given type, so it is the macro that counts MBs.
+ */
+#define FI_UPDATE_OTHER_HEADER_BITS(frame_info,header_bits)\
+    {(frame_info)->other_header_bits += (header_bits);}
+
+#define FI_UPDATE_MB_HEADER(frame_info,header_bits,mb_type)\
+    {(frame_info)->mb_header_bits[(mb_type)] += (header_bits);}
+
+#define FI_UPDATE_MB_TEXTURE(frame_info,texture_bits,mb_type)\
+    {(frame_info)->mb_texture_bits[(mb_type)] += (texture_bits);}
+
+#define FI_UPDATE_MB_SAD(frame_info,mb_sad,mb_type)\
+    {(frame_info)->tot_mb_sad[(mb_type)] += (mb_sad);}
+
+#define FI_UPDATE_MB_QP(frame_info,qp,mb_type)\
+    {(frame_info)->qp_sum[(mb_type)] += (qp);(frame_info)->num_mbs[(mb_type)]++;}
+
+#define FI_UPDATE_ACTIVITY(frame_info,mb_activity)\
+    {(frame_info)->activity_sum += (mb_activity);}
+
+#define FI_UPDATE_INTRA_MB_COST(frame_info,intra_mb_cost)\
+    {(frame_info)->intra_mb_cost_sum += (intra_mb_cost);}
+
+/*
+ * GET Functions: Sending back collected information to the rate control module
+ */
+
+/* Frame Level Model Information */
+WORD32 irc_fi_get_total_header_bits(frame_info_t *frame_info);
+
+WORD32 irc_fi_get_total_texture_bits(frame_info_t *frame_info);
+
+WORD32 irc_fi_get_average_qp(frame_info_t *frame_info);
+
+WORD32 irc_fi_get_total_frame_sad(frame_info_t *frame_info);
+
+WORD32 irc_fi_get_avg_activity(frame_info_t *frame_info);
+
+/* Number of Intra MBs for Scene Change Detection */
+WORD32 irc_fi_get_num_intra_mb(frame_info_t *frame_info);
+
+/* MB Level Model Information */
+WORD32 irc_fi_get_avg_mb_header(frame_info_t *frame_info, UWORD8 mb_type);
+
+WORD32 irc_fi_get_total_mb_texture_bits(frame_info_t *frame_info,
+                                        UWORD8 mb_type);
+
+WORD32 irc_fi_get_total_mb_sad(frame_info_t *frame_info, UWORD8 mb_type);
+
+WORD32 irc_fi_get_total_mb_qp(frame_info_t *frame_info, UWORD8 mb_type);
+
+WORD32 irc_fi_get_total_mb(frame_info_t *frame_info, UWORD8 mb_type);
+
+WORD32 irc_fi_get_total_intra_mb_cost(frame_info_t *frame_info);
+#endif
diff --git a/encoder/irc_mb_model_based.c b/encoder/irc_mb_model_based.c
new file mode 100755
index 0000000..880ee19
--- /dev/null
+++ b/encoder/irc_mb_model_based.c
@@ -0,0 +1,157 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
+*/
+
+/*****************************************************************************/
+/* File Includes                                                             */
+/*****************************************************************************/
+
+/* User include files */
+#include "irc_datatypes.h"
+#include "irc_cntrl_param.h"
+#include "irc_mem_req_and_acq.h"
+#include "irc_mb_model_based.h"
+
+typedef struct mb_rate_control_t
+{
+    /* Frame Qp */
+    UWORD8 u1_frm_qp;
+
+    /*
+     * Estimated average activity for the current frame (updated with the
+     * previous frame activity since it is independent of picture type whether
+     * it is I or P)
+     */
+    WORD32 i4_avg_activity;
+
+} mb_rate_control_t;
+
+/*
+ * Memtab helper for the MB rate control state. For every e_func_type other
+ * than GET_NUM_MEMTAB it fills the memtab entry (sizeof(mb_rate_control_t),
+ * 128 byte aligned, persistent, DDR) and calls use_or_fill_base() on it.
+ * Always returns 1, the number of memtabs this module uses.
+ */
+WORD32 irc_mbrc_num_fill_use_free_memtab(mb_rate_control_t **pps_mb_rate_control,
+                                         itt_memtab_t *ps_memtab,
+                                         ITT_FUNC_TYPE_E e_func_type)
+{
+    WORD32 i4_mem_tab_idx = 0;
+    static mb_rate_control_t s_mb_rate_control_temp;
+
+    /*
+     * Hack for all alloc, during which we don't have any state memory.
+     * Dereferencing can cause issues
+     */
+    if(e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB)
+    {
+        (*pps_mb_rate_control) = &s_mb_rate_control_temp;
+    }
+
+    /*For src rate control state structure*/
+    if(e_func_type != GET_NUM_MEMTAB)
+    {
+        fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(mb_rate_control_t),
+                    ALIGN_128_BYTE, PERSISTENT, DDR);
+        /* Index 0 is the same entry as i4_mem_tab_idx, which is still 0 here */
+        use_or_fill_base(&ps_memtab[0], (void**)pps_mb_rate_control,
+                         e_func_type);
+    }
+    i4_mem_tab_idx++;
+
+    return (i4_mem_tab_idx);
+}
+
+/*******************************************************************************
+ MB LEVEL API FUNCTIONS
+ ******************************************************************************/
+
+/******************************************************************************
+ Description : Initialize the average activity to its default value (0).
+               The frame QP is not touched here; it is set by
+               irc_mb_init_frame_level
+ ******************************************************************************/
+void irc_init_mb_level_rc(mb_rate_control_t *ps_mb_rate_control)
+{
+    /* Set values to default */
+    ps_mb_rate_control->i4_avg_activity = 0;
+}
+
+/******************************************************************************
+ Description : Initialize the mb state with frame level decisions
+ *********************************************************************************/
+void irc_mb_init_frame_level(mb_rate_control_t *ps_mb_rate_control,
+                             UWORD8 u1_frame_qp)
+{
+    /* Update frame level QP */
+    ps_mb_rate_control->u1_frm_qp = u1_frame_qp;
+}
+
+/******************************************************************************
+ Description : Reset the mb activity - Whenever there is SCD
+               the mb activity is reset. With a zero average activity
+               irc_get_mb_qp skips the activity based modulation
+ *********************************************************************************/
+void irc_reset_mb_activity(mb_rate_control_t *ps_mb_rate_control)
+{
+    ps_mb_rate_control->i4_avg_activity = 0;
+}
+
+/******************************************************************************
+ Description : Calculates the mb level qp
+               pi4_mb_qp[0] receives the frame level (model) QP and
+               pi4_mb_qp[1] the QP after activity modulation, so pi4_mb_qp
+               must point to at least two WORD32 entries
+ *********************************************************************************/
+void irc_get_mb_qp(mb_rate_control_t *ps_mb_rate_control,
+                   WORD32 i4_cur_mb_activity,
+                   WORD32 *pi4_mb_qp)
+{
+    WORD32 i4_qp;
+    /* Initialize the mb level qp with the frame level qp */
+    i4_qp = ps_mb_rate_control->u1_frm_qp;
+
+    /*
+     * Store the model based QP - This is used for updating the rate control model
+     */
+    pi4_mb_qp[0] = i4_qp;
+
+    /*
+     * Modulate the Qp based on the activity:
+     *   qp = ((2 * cur + avg) * qp + round) / (cur + 2 * avg)
+     * where round is half of the divisor. Skipped when the average activity
+     * is zero or the frame qp is 100 or more.
+     */
+    if((ps_mb_rate_control->i4_avg_activity) && (i4_qp < 100))
+    {
+        i4_qp =((((2 * i4_cur_mb_activity))
+                        + ps_mb_rate_control->i4_avg_activity)* i4_qp
+                        + ((i4_cur_mb_activity + 2 * ps_mb_rate_control->i4_avg_activity)
+                        >> 1))/ (i4_cur_mb_activity + 2 * ps_mb_rate_control->i4_avg_activity);
+
+        /* Clamp the modulated qp to 1.5 times the frame qp */
+        if(i4_qp > ((3 * ps_mb_rate_control->u1_frm_qp) >> 1))
+        {
+            i4_qp = ((3 * ps_mb_rate_control->u1_frm_qp) >> 1);
+        }
+    }
+
+    /* Store the qp modulated by mb activity - This is used for encoding the MB */
+    pi4_mb_qp[1] = i4_qp;
+}
+
+/*******************************************************************************
+ Description : Returns the stored frame level QP
+               (the value last set through irc_mb_init_frame_level)
+ ******************************************************************************/
+UWORD8 irc_get_frm_level_qp(mb_rate_control_t *ps_mb_rate_control)
+{
+    return (ps_mb_rate_control->u1_frm_qp);
+}
+
+/*******************************************************************************
+ Description : Update the frame level info collected
+               (stores the average activity used by irc_get_mb_qp)
+ ******************************************************************************/
+void irc_mb_update_frame_level(mb_rate_control_t *ps_mb_rate_control,
+                               WORD32 i4_avg_activity)
+{
+    /* Update the Average Activity */
+    ps_mb_rate_control->i4_avg_activity = i4_avg_activity;
+}
diff --git a/encoder/irc_mb_model_based.h b/encoder/irc_mb_model_based.h
new file mode 100755
index 0000000..aad520a
--- /dev/null
+++ b/encoder/irc_mb_model_based.h
@@ -0,0 +1,57 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
+*/
+
+#ifndef _MB_MODEL_BASED_H_
+#define _MB_MODEL_BASED_H_
+
+typedef struct mb_rate_control_t *mb_rate_control_handle;
+
+WORD32 irc_mbrc_num_fill_use_free_memtab(mb_rate_control_handle *pps_mb_rate_control,
+                                         itt_memtab_t *ps_memtab,
+                                         ITT_FUNC_TYPE_E e_func_type);
+
+/* Initializing the state structure */
+void irc_init_mb_level_rc(mb_rate_control_handle ps_mb_rate_control);
+
+/* MB parameters that are to be initialized at a frame level */
+void irc_mb_init_frame_level(mb_rate_control_handle ps_mb_rate_control,
+                             UWORD8 u1_frame_qp);
+
+/* MB Level call to get the mb_level QP */
+void irc_get_mb_qp(mb_rate_control_handle ps_mb_rate_control,
+                   WORD32 i4_cur_mb_activity,
+                   WORD32 *pi4_mb_qp);
+
+/* MB Parameters that are to be updated at a frame level */
+void irc_mb_update_frame_level(mb_rate_control_handle ps_mb_rate_control,
+                               WORD32 i4_avg_activity);
+
+/****************************************************************************
+ CONTROL FUNCTIONS FROM FRAME LEVEL
+ ****************************************************************************/
+
+/* Returns the stored frame level QP */
+UWORD8 irc_get_frm_level_qp(mb_rate_control_handle ps_mb_rate_control);
+
+/* Disables activity based qp modulation */
+void irc_reset_mb_activity(mb_rate_control_handle ps_mb_rate_control);
+
+#endif
+
diff --git a/encoder/irc_mem_req_and_acq.h b/encoder/irc_mem_req_and_acq.h
new file mode 100755
index 0000000..a2946a7
--- /dev/null
+++ b/encoder/irc_mem_req_and_acq.h
@@ -0,0 +1,179 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+******************************************************************************
+* @file
+*  irc_mem_req_and_acq.h
+*
+* @brief
+*  This file contains function declaration and structures for rate control
+*  memtabs
+*
+* @author
+*  ittiam
+*
+* @remarks
+*  The rate control library is a global library across various codecs. It
+*  anticipates certain structures definitions. Those definitions are to be
+*  imported from global workspace. Instead of that, the structures needed for
+*  rc library are copied in to this file and exported to rc library. If the
+*  structures / enums / ... in the global workspace change, this file also needs
+*  to be modified accordingly.
+*
+******************************************************************************
+*/
+#ifndef IH264E_RC_MEM_INTERFACE_H_
+#define IH264E_RC_MEM_INTERFACE_H_
+
+
+/*****************************************************************************/
+/* Function Macros                                                           */
+/*****************************************************************************/
+
+/*
+ * NOTE(review): this function-like macro shares its name with the enumerator
+ * FILL_MEMTAB of ITT_FUNC_TYPE_E below (the macro only expands when the name
+ * is followed by '('), and the fields it writes (u4_mem_size,
+ * u4_mem_alignment, e_mem_type) are not members of itt_memtab_t - presumably
+ * it targets iv_mem_rec_t records; confirm before using it.
+ */
+#define FILL_MEMTAB(m_pv_mem_rec, m_j, m_mem_size, m_align, m_type)         \
+{                                                                           \
+    m_pv_mem_rec[m_j].u4_size = sizeof(iv_mem_rec_t);                       \
+    m_pv_mem_rec[m_j].u4_mem_size = m_mem_size;                             \
+    m_pv_mem_rec[m_j].u4_mem_alignment = m_align;                           \
+    m_pv_mem_rec[m_j].e_mem_type = m_type;                                  \
+}
+
+/*****************************************************************************/
+/* Enums                                                                     */
+/*****************************************************************************/
+typedef enum
+{
+    ALIGN_BYTE = 1,
+    ALIGN_WORD16 = 2,
+    ALIGN_WORD32 = 4,
+    ALIGN_WORD64 = 8,
+    ALIGN_128_BYTE = 128
+}ITT_MEM_ALIGNMENT_TYPE_E;
+
+typedef enum
+{
+    SCRATCH = 0,
+    PERSISTENT = 1,
+    WRITEONCE = 2
+}ITT_MEM_USAGE_TYPE_E;
+
+typedef enum
+{
+    L1D = 0,
+    SL2 = 1,
+    DDR = 3
+}ITT_MEM_REGION_E;
+
+typedef enum
+{
+    GET_NUM_MEMTAB = 0,
+    FILL_MEMTAB = 1,
+    USE_BASE = 2,
+    FILL_BASE =3
+}ITT_FUNC_TYPE_E;
+
+
+/*****************************************************************************/
+/* Structures                                                                */
+/*****************************************************************************/
+
+/*NOTE : This should be an exact replica of IALG_MemRec, any change in IALG_MemRec
+ must be replicated here*/
+typedef struct
+{
+    /* Size in bytes */
+    UWORD32 u4_size;
+
+    /* Alignment in bytes */
+    WORD32 i4_alignment;
+
+    /* decides which memory region to be placed */
+    ITT_MEM_REGION_E e_mem_region;
+
+    /* memory is scratch or persistent */
+    ITT_MEM_USAGE_TYPE_E e_usage;
+
+    /* Base pointer for allocated memory */
+    void *pv_base;
+} itt_memtab_t;
+
+
+/*****************************************************************************/
+/* Extern Function Declarations                                              */
+/*****************************************************************************/
+
+/**
+******************************************************************************
+*
+*  @brief This function fills memory record attributes
+*
+*  @par   Description
+*  This function fills memory record attributes
+*
+*  @param[in]   ps_mem_tab
+*  pointer to mem records
+*
+*  @param[in]   u4_size
+*  size of the record (declared as WORD32 despite the u4_ prefix)
+*
+*  @param[in]   i4_alignment
+*  memory alignment size
+*
+*  @param[in]   e_usage
+*  usage
+*
+*  @param[in]   e_mem_region
+*  mem region
+*
+*  @return      void
+*
+******************************************************************************
+*/
+void fill_memtab(itt_memtab_t *ps_mem_tab, WORD32 u4_size, WORD32 i4_alignment,
+                 ITT_MEM_USAGE_TYPE_E e_usage, ITT_MEM_REGION_E e_mem_region);
+
+/**
+******************************************************************************
+*
+*  @brief This function uses or fills the base pointer of a memory record
+*
+*  @par   Description
+*  Depending on e_func_type the memory record is either filled or used.
+*  NOTE(review): the definition is not part of this header - confirm the exact
+*  behaviour and the meaning of the return value there.
+*
+*  @param[in]   ps_mem_tab
+*  pointer to mem records
+*
+*  @param[in]   ptr_to_be_filled
+*  handle to the memory record storage space
+*
+*  @param[in]   e_func_type
+*  enum that dictates fill memory records or use memory records
+*
+*  @return      WORD32
+*
+******************************************************************************
+*/
+WORD32 use_or_fill_base(itt_memtab_t *ps_mem_tab, void **ptr_to_be_filled,
+                        ITT_FUNC_TYPE_E e_func_type);
+
+
+#endif // IH264E_RC_MEM_INTERFACE_H_
+
diff --git a/encoder/irc_picture_type.c b/encoder/irc_picture_type.c
new file mode 100755
index 0000000..186188c
--- /dev/null
+++ b/encoder/irc_picture_type.c
@@ -0,0 +1,1585 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include "stdio.h" +#include "string.h" + +/* User include files */ +#include "irc_datatypes.h" +#include "irc_cntrl_param.h" +#include "irc_mem_req_and_acq.h" +#include "irc_picture_type.h" +#include "irc_trace_support.h" + +#define MAX_INTER_FRM_INT 10 + +/******************************Pic_details ************************************/ +typedef struct +{ + /* The id sent by the codec */ + WORD32 i4_pic_id; + + /* The pics come in, in this order */ + WORD32 i4_pic_disp_order_no; + + /* I,P,B */ + picture_type_e e_pic_type; + +} pic_details_t; + +/**************************Pic_handling structure *****************************/ +typedef struct pic_handling_t +{ + /*************************************************************************** + * Inputs from the codec + **************************************************************************/ + + /* Number of frames after which an I frame will repeat in display order */ + WORD32 i4_intra_frm_int; + + /* (num_b_pics_in_subgop + 1) */ + WORD32 i4_inter_frm_int; + + /* After these many buffered frames, the pics are encoded */ + WORD32 i4_max_inter_frm_int; + + /* OPEN or CLOSED */ + WORD32 i4_is_gop_closed; + + /* The pic stack 
*/ + /* Stack used to store the input pics in encode order */ + pic_details_t as_pic_stack[MAX_INTER_FRM_INT + 2]; + + /*************************************************************************** + * Counters + **************************************************************************/ + + /* Decides whether a B or ref pic */ + WORD32 i4_buf_pic_no; + + /* Current pic's number in displayed, and gets reset after an I-frm */ + WORD32 i4_pic_disp_order_no; + + /* Number of P frms that have come, in the current gop, so far */ + WORD32 i4_p_count_in_gop; + + /* Number of B frms that have come, in the current gop, so far */ + WORD32 i4_b_count_in_gop; + + /* Number of B frms that have come, in the current subgop, so far */ + WORD32 i4_b_count_in_subgop; + + /*************************************************************************** + * Indices to the pic stack (Since we store the pics in the encode order, + * these vars are modified to meet that) + **************************************************************************/ + + /* B_PIC index */ + WORD32 i4_b_pic_idx; + + /* I,P PIC index */ + WORD32 i4_ref_pic_idx; + + /*************************************************************************** + * Variables operating on the input pics + **************************************************************************/ + + /* Flag denoting whether it's the first gop or not */ + WORD32 i4_is_first_gop; + + /* Number of B_PICs in an incomplete subgop */ + WORD32 i4_b_in_incomp_subgop; + + /* In CLOSED_GOPs, even if inter_frm_int > 1, there can be 2 continous + * P_PICs at the GOP end. 
This takes values of 0 or 1 */ + WORD32 i4_extra_p; + + /*************************************************************************** + * Arrays storing the number of frms in the gop + **************************************************************************/ + + /* In the steady state, what's the pic distribution in display order */ + WORD32 i4_frms_in_gop[MAX_PIC_TYPE]; + + /* + * In case of a change in inter frm int call, the pic distribution in + * that gop in display order + */ + WORD32 i4_frms_in_cur_gop[MAX_PIC_TYPE]; + + /* + * This is used to denote the number of frms remaining to be encoded in the + * current gop + */ + WORD32 i4_rem_frms_in_gop[MAX_PIC_TYPE]; + + /*************************************************************************** + * Variables operating on the output pics + **************************************************************************/ + + /* Counts the frms encoded in a gop */ + WORD32 i4_coded_pic_no; + + /* Counts from the start of stack to the end repeatedly */ + WORD32 i4_stack_count; + + /*************************************************************************** + * Tracking a change in the inputs from the codec + **************************************************************************/ + + /* A flag that is set when the codec calls for a change in inter_frm_int */ + WORD32 i4_change_in_inter_frm_int; + + /* + * When a change_in_inter_frm_int is called, this stores the new + * inter_frm_int + */ + WORD32 i4_new_inter_frm_int; + + /* + * When a change_in_inter_frm_int is called in the middle of a gop,this + * stores the B_PICs in the incomplete subgop of the mixed gop + */ + WORD32 i4_b_in_incomp_subgop_mix_gop; + + /* + * For a CLOSED GOP, when a change_in_inter_frm_int is called in the middle + * of a gop,this is a flag denoting if there is an extra P_PIC in the mixed + * gop + */ + WORD32 i4_extra_p_mix_gop; + + /* A flag that is set when the codec calls for a change in intra_frm_int */ + WORD32 
i4_change_in_intra_frm_int; + + /* + * When a change_in_intra_frm_int is called, this stores the new + * intra_frm_int + */ + WORD32 i4_new_intra_frm_int; + + /*************************************************************************** + * Previous pic_stack_indices & details + **************************************************************************/ + pic_details_t s_prev_pic_details; + + WORD32 i4_prev_b_pic_idx; + + WORD32 i4_last_frm_in_gop; + + WORD32 i4_first_gop_encoded; + + /* NITT TBR */ + picture_type_e e_previous_pic_type; + + WORD32 i4_force_I_frame; + + WORD32 i4_forced_I_frame_cur_frame; + + WORD32 i4_sum_remaining_frm_in_gop; + + WORD32 i4_mod_temp_ref_cnt; + + WORD32 i4_frames_in_fif_gop; + + WORD32 i4_prev_intra_frame_interval; + +} pic_handling_t; + +static void irc_update_pic_distbn(pic_handling_t *ps_pic_handling, + WORD32 i4_intra_frm_int, + WORD32 i4_inter_frm_int, + WORD32 i4_gop_boundary); + +static void find_pic_distbn_in_gop(WORD32 i4_frms_in_gop[MAX_PIC_TYPE], + WORD32 i4_intra_frm_int, + WORD32 i4_inter_frm_int, + WORD32 i4_is_gop_closed, + WORD32 *pi4_b_in_incomp_subgop, + WORD32 *pi4_extra_p); + +WORD32 irc_pic_handling_num_fill_use_free_memtab(pic_handling_t **pps_pic_handling, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type) +{ + WORD32 i4_mem_tab_idx = 0; + static pic_handling_t s_pic_handling_temp; + + /* + * Hack for al alloc, during which we dont have any state memory. 
+ * Dereferencing can cause issues + */ + if(e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB) + { + (*pps_pic_handling) = &s_pic_handling_temp; + } + + /*for src rate control state structure*/ + if(e_func_type != GET_NUM_MEMTAB) + { + fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(pic_handling_t), + ALIGN_128_BYTE, PERSISTENT, DDR); + use_or_fill_base(&ps_memtab[0], (void**)pps_pic_handling, e_func_type); + } + i4_mem_tab_idx++; + + return (i4_mem_tab_idx); +} + +/****************************************************************************** + Description : initializes the pic handling state struct + *****************************************************************************/ +void irc_init_pic_handling(pic_handling_t *ps_pic_handling, + WORD32 i4_intra_frm_int, + WORD32 i4_max_inter_frm_int, + WORD32 i4_is_gop_closed) +{ + /* Declarations */ + WORD32 i; + + /* Checks */ + /* Codec Parameters */ + ps_pic_handling->i4_intra_frm_int = i4_intra_frm_int; + ps_pic_handling->i4_inter_frm_int = i4_max_inter_frm_int; + ps_pic_handling->i4_max_inter_frm_int = i4_max_inter_frm_int; + ps_pic_handling->i4_is_gop_closed = i4_is_gop_closed; + + /* Pic_stack */ + memset(ps_pic_handling->as_pic_stack, 0, + sizeof(ps_pic_handling->as_pic_stack)); + memset(&ps_pic_handling->s_prev_pic_details, 0, + sizeof(ps_pic_handling->s_prev_pic_details)); + + /* Counters */ + ps_pic_handling->i4_buf_pic_no = 0; + ps_pic_handling->i4_pic_disp_order_no = 0; + + /* Indices to the pic_stack */ + ps_pic_handling->i4_ref_pic_idx = 0; + ps_pic_handling->i4_b_pic_idx = 2; + ps_pic_handling->i4_prev_b_pic_idx = 2; + + /* Variables working on the input frames */ + ps_pic_handling->i4_is_first_gop = 1; + ps_pic_handling->i4_p_count_in_gop = 0; + ps_pic_handling->i4_b_count_in_gop = 0; + ps_pic_handling->i4_b_count_in_subgop = 0; + + /* Variables working on the output frames */ + ps_pic_handling->i4_coded_pic_no = -1; + ps_pic_handling->i4_stack_count = -1; + + /* Tracks the changes in the 
Codec Parameters */ + ps_pic_handling->i4_change_in_inter_frm_int = 0; + ps_pic_handling->i4_new_inter_frm_int = i4_max_inter_frm_int; + + /* Tracks the changes in the Codec Parameters */ + ps_pic_handling->i4_change_in_intra_frm_int = 0; + ps_pic_handling->i4_new_intra_frm_int = i4_intra_frm_int; + + /* Variables on which the bit allocation is dependent */ + /* Get the pic distribution in the gop */ + find_pic_distbn_in_gop(ps_pic_handling->i4_frms_in_gop, i4_intra_frm_int, + i4_max_inter_frm_int, i4_is_gop_closed, + &ps_pic_handling->i4_b_in_incomp_subgop, + &ps_pic_handling->i4_extra_p); + + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_pic_handling->i4_frms_in_cur_gop[i] = + ps_pic_handling->i4_frms_in_gop[i]; + ps_pic_handling->i4_rem_frms_in_gop[i] = + ps_pic_handling->i4_frms_in_gop[i]; + } + + ps_pic_handling->e_previous_pic_type = I_PIC; + ps_pic_handling->i4_prev_intra_frame_interval = i4_intra_frm_int; + ps_pic_handling->i4_force_I_frame = 0; + ps_pic_handling->i4_forced_I_frame_cur_frame = 0; + ps_pic_handling->i4_sum_remaining_frm_in_gop = 0; + ps_pic_handling->i4_mod_temp_ref_cnt = 0; + + ps_pic_handling->i4_b_in_incomp_subgop_mix_gop = + ps_pic_handling->i4_b_in_incomp_subgop; + ps_pic_handling->i4_extra_p_mix_gop = ps_pic_handling->i4_extra_p; + + ps_pic_handling->i4_last_frm_in_gop = 0; + ps_pic_handling->i4_first_gop_encoded = 0; + ps_pic_handling->i4_frames_in_fif_gop = 0; + +} + +/******************************************************************************* + * @brief registers the new intra frame interval value + ******************************************************************************/ +void irc_pic_handling_register_new_int_frm_interval(pic_handling_t *ps_pic_handling, + WORD32 i4_intra_frm_int) +{ + ps_pic_handling->i4_change_in_intra_frm_int = 1; + ps_pic_handling->i4_new_intra_frm_int = i4_intra_frm_int; +} + +void irc_pic_handling_register_new_inter_frm_interval(pic_handling_t *ps_pic_handling, + WORD32 i4_inter_frm_int) +{ + /* 
Update the state structure with the latest values */ + ps_pic_handling->i4_change_in_inter_frm_int = 1; + ps_pic_handling->i4_new_inter_frm_int = i4_inter_frm_int; +} + +static void start_new_gop(pic_handling_t *ps_pic_handling) +{ + WORD32 i; + WORD32 i4_sum_remaining_frm_in_gop = 0; + + /* Now, the end of gop updates */ + ps_pic_handling->i4_pic_disp_order_no = 0; + ps_pic_handling->i4_buf_pic_no = 0; + ps_pic_handling->i4_is_first_gop = 0; + ps_pic_handling->i4_extra_p_mix_gop = ps_pic_handling->i4_extra_p; + + if(ps_pic_handling->i4_is_gop_closed) + { + ps_pic_handling->i4_b_in_incomp_subgop_mix_gop = + ps_pic_handling->i4_b_in_incomp_subgop; + } + /* + * Store the number of frames in the gop that is encoded till now + * just before Force I frame call is made + */ + ps_pic_handling->i4_frames_in_fif_gop = ps_pic_handling->i4_b_count_in_gop + + ps_pic_handling->i4_p_count_in_gop + 1; + for(i = 0; i < MAX_PIC_TYPE; i++) + { + i4_sum_remaining_frm_in_gop += ps_pic_handling->i4_rem_frms_in_gop[i]; + } + ps_pic_handling->i4_sum_remaining_frm_in_gop = i4_sum_remaining_frm_in_gop; + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_pic_handling->i4_frms_in_cur_gop[i] = + ps_pic_handling->i4_frms_in_gop[i]; + ps_pic_handling->i4_rem_frms_in_gop[i] = + ps_pic_handling->i4_frms_in_cur_gop[i]; + } +} + +/******************************************************************************* + * @brief Fills the pic_stack with the incoming pics in encode order + ******************************************************************************/ +void irc_add_pic_to_stack(pic_handling_t *ps_pic_handling, WORD32 i4_enc_pic_id) +{ + /* Declarations */ + WORD32 i4_inter_frm_int, i4_max_inter_frm_int, + i4_intra_frm_int, i4_new_inter_frm_int; + WORD32 i4_is_gop_closed; + WORD32 i4_buf_pic_no, i4_pic_disp_order_no; + WORD32 i4_b_pic_idx, i4_ref_pic_idx; + WORD32 i4_is_first_gop, i4_b_in_incomp_subgop, i4_p_count_in_gop, + i4_b_count_in_gop, i4_b_count_in_subgop; + WORD32 i, i4_p_frms_in_prd, 
i4_b_frms_in_prd, + i4_num_b_in_subgop, i4_extra_p; + WORD32 i4_condn_for_change_in_inter_frm_int; + picture_type_e e_previous_pic_type, e_cur_pic_type; + WORD32 i4_force_I_frame; + + /* + * Initialize the local vars with the state struct values needed by the + * change calls + */ + i4_intra_frm_int = ps_pic_handling->i4_intra_frm_int; + i4_inter_frm_int = ps_pic_handling->i4_inter_frm_int; + i4_max_inter_frm_int = ps_pic_handling->i4_max_inter_frm_int; + i4_is_gop_closed = ps_pic_handling->i4_is_gop_closed; + + i4_buf_pic_no = ps_pic_handling->i4_buf_pic_no; + i4_pic_disp_order_no = ps_pic_handling->i4_pic_disp_order_no; + i4_b_count_in_gop = ps_pic_handling->i4_b_count_in_gop; + i4_b_frms_in_prd = ps_pic_handling->i4_frms_in_cur_gop[B_PIC]; + i4_is_first_gop = ps_pic_handling->i4_is_first_gop; + i4_new_inter_frm_int = ps_pic_handling->i4_new_inter_frm_int; + e_previous_pic_type = ps_pic_handling->e_previous_pic_type; + i4_force_I_frame = ps_pic_handling->i4_force_I_frame; + + /* Force I frame : + * Two different cases + * 1)OPEN_GOP: New GOP is started after number of B pictures in the last + * sub gop of a gop to mimic the GOP structure. + * 2)Closed GOP:Wait till P frame at input and The frame after a P frame + * a new GOP is started to mimic the GOP structure. 
+ */ + if(i4_force_I_frame) + { + WORD32 i4_temp_is_gop_closed; + WORD32 i4_codn = 0; + /* A special case of Open GOP where the it behaves like Closed GOP*/ + if((i4_intra_frm_int % i4_inter_frm_int) == 1) + { + i4_temp_is_gop_closed = 1; + } + else + { + i4_temp_is_gop_closed = i4_is_gop_closed; + } + /* Get the current picture type to aid decision to force an I frame*/ + if((i4_buf_pic_no % i4_inter_frm_int) + && !(i4_is_gop_closed&& (i4_b_count_in_gop == i4_b_frms_in_prd))) + { + e_cur_pic_type = B_PIC; + } + else + { + if(i4_pic_disp_order_no == 0) + { + e_cur_pic_type = I_PIC; + } + else + { + e_cur_pic_type = P_PIC; + } + } + if((i4_intra_frm_int % i4_inter_frm_int) == 0) + { + i4_codn = (e_cur_pic_type == P_PIC); + } + else + { + i4_codn = (ps_pic_handling->i4_b_count_in_subgop + == ps_pic_handling->i4_b_in_incomp_subgop); + } + if(e_cur_pic_type == I_PIC) + { + /* + * Don't do anything. Resetting the force I frame flag + * since the current picture type is already a I frame + */ + i4_force_I_frame = 0; + } + else if(i4_inter_frm_int == 1) + { + /*IPP case , Force I frame immediately*/ + start_new_gop(ps_pic_handling); + } + else if((!i4_temp_is_gop_closed) && i4_codn) + { + start_new_gop(ps_pic_handling); + if(ps_pic_handling->i4_b_count_in_subgop) + { + ps_pic_handling->i4_b_pic_idx += 1; + ps_pic_handling->i4_b_pic_idx %= (i4_max_inter_frm_int + 1); + } + } + else if(i4_temp_is_gop_closed && (e_previous_pic_type == P_PIC) + && (e_cur_pic_type != P_PIC)) + { + start_new_gop(ps_pic_handling); + ps_pic_handling->i4_b_pic_idx++; + ps_pic_handling->i4_b_pic_idx %= (i4_max_inter_frm_int + 1); + } + i4_is_first_gop = ps_pic_handling->i4_is_first_gop; + } + + + /***********************CHANGE_INTRA_FRM_INTERVAL************************** + * + * Call the irc_update_pic_distbn if + * 1)Change in intra frm interval flag is set + * 2)It's the first B_PIC of a gop + */ + if((ps_pic_handling->i4_change_in_intra_frm_int == 1) + && ((i4_pic_disp_order_no == 1))) + { + 
irc_update_pic_distbn(ps_pic_handling, + ps_pic_handling->i4_new_intra_frm_int, + ps_pic_handling->i4_inter_frm_int, 1); + + ps_pic_handling->i4_change_in_intra_frm_int = 0; + + if(ps_pic_handling->i4_new_intra_frm_int == 1) + { + ps_pic_handling->i4_pic_disp_order_no = 0; + } + } + /*********************CHANGE_INTER_FRM_INTERVAL****************************/ + /* Call irc_update_pic_distbn if + * 1)Change in inter frm interval flag is set + * 2)It's the first B_PIC after gop/subgop start, and + * 3)The new inter-frm-interval won't cross the intra_frm_interval + */ + if((ps_pic_handling->i4_change_in_inter_frm_int == 1) + && ((i4_buf_pic_no % i4_inter_frm_int == 1) + || (i4_pic_disp_order_no == 1) || (i4_inter_frm_int == 1))) + { + /* + * Condition which checks if the new inter_frm_int will cross the + * intra_frm_int + */ + i4_condn_for_change_in_inter_frm_int = ((i4_pic_disp_order_no + + i4_new_inter_frm_int - 1) < i4_intra_frm_int); + + if(i4_condn_for_change_in_inter_frm_int) + { + /*If the inter_frm_int = 1, then the b_pic_idx needs to be modified */ + if(i4_inter_frm_int == 1) + { + ps_pic_handling->i4_b_pic_idx = (1 + + ps_pic_handling->i4_ref_pic_idx) + % (i4_max_inter_frm_int + 1); + } + + /* + * Depending on the gop/subgop boundary, call the change_inter_frm_int + * + * TO DO: make a single call, change the name of the fxn to + * update_state, + * where state = frms_in_gop + b_incomp_subgop + extra_p + */ + + /* GOP boundary */ + if(i4_pic_disp_order_no == 1) + { + irc_update_pic_distbn(ps_pic_handling, + ps_pic_handling->i4_intra_frm_int, + ps_pic_handling->i4_new_inter_frm_int, 1); + } + /* Subgop boundary */ + else + { + irc_update_pic_distbn(ps_pic_handling, + ps_pic_handling->i4_intra_frm_int, + ps_pic_handling->i4_new_inter_frm_int, 0); + } + + ps_pic_handling->i4_change_in_inter_frm_int = 0; + ps_pic_handling->i4_new_inter_frm_int = + ps_pic_handling->i4_inter_frm_int; + } + + } + + /* Initialize the local vars with the state struct values */ + 
i4_buf_pic_no = ps_pic_handling->i4_buf_pic_no; + i4_pic_disp_order_no = ps_pic_handling->i4_pic_disp_order_no; + i4_b_pic_idx = ps_pic_handling->i4_b_pic_idx; + i4_ref_pic_idx = ps_pic_handling->i4_ref_pic_idx; + i4_b_in_incomp_subgop = ps_pic_handling->i4_b_in_incomp_subgop_mix_gop; + i4_p_count_in_gop = ps_pic_handling->i4_p_count_in_gop; + i4_b_count_in_gop = ps_pic_handling->i4_b_count_in_gop; + i4_b_count_in_subgop = ps_pic_handling->i4_b_count_in_subgop; + i4_p_frms_in_prd = ps_pic_handling->i4_frms_in_cur_gop[P_PIC]; + i4_b_frms_in_prd = ps_pic_handling->i4_frms_in_cur_gop[B_PIC]; + i4_extra_p = ps_pic_handling->i4_extra_p_mix_gop; + i4_inter_frm_int = ps_pic_handling->i4_inter_frm_int; + i4_intra_frm_int = ps_pic_handling->i4_intra_frm_int; + + /* Initializing the prev_state vars */ + ps_pic_handling->i4_prev_b_pic_idx = ps_pic_handling->i4_b_pic_idx; + + i4_num_b_in_subgop = (i4_inter_frm_int - 1); + + /*********************** Fill the stack ***********************************/ + /* The next part of the code is organized as + * + * if(B_PIC conditions satisfied) + * { + * Fill the pic_stack using the b_pic_index + * Update the b_pic_index and the other b_pic related vars for the + * next B_PIC + * } + * else + * { + * if(I_PIC conditions are satisfied) + * { + * Fill the pic_stack using the ref_pic_index + * Update the ref_pic_index and the other ref_pic related vars for the next + * I_PIC/P_PIC + * } + * else + * { + * Fill the pic_stack using the ref_pic_index + * Update the ref_pic_index and the other ref_pic related vars for the next + * I_PIC/P_PIC + * } + * } + */ + /* + * Condition for a B_PIC - + * 1) Other than the first I_PIC and the periodically appearing P_PICs, after + * every inter_frm_int, rest all pics are B_PICs + * 2) In case of CLOSED_GOP, the last frame of the gop has to be a P_PIC + */ + + if((i4_buf_pic_no % i4_inter_frm_int)&& !(i4_is_gop_closed + && (i4_b_count_in_gop == i4_b_frms_in_prd))) /**** B_PIC ****/ + { + /* Fill the 
pic_stack */ + ps_pic_handling->as_pic_stack[i4_b_pic_idx].i4_pic_id = i4_enc_pic_id; + ps_pic_handling->as_pic_stack[i4_b_pic_idx].e_pic_type = B_PIC; + ps_pic_handling->as_pic_stack[i4_b_pic_idx].i4_pic_disp_order_no = + i4_pic_disp_order_no; + + /* Store Pic type*/ + e_previous_pic_type = B_PIC; + + /* Update the prev_pic_details */ + memcpy(&ps_pic_handling->s_prev_pic_details, + &ps_pic_handling->as_pic_stack[i4_b_pic_idx], + sizeof(pic_details_t)); + + i4_b_count_in_gop++; + i4_b_count_in_subgop++; + + /* Update the i4_b_pic_idx */ + if(!i4_is_gop_closed) + { + /* If this B_PIC features in one of the complete subgops */ + if((i4_b_count_in_subgop < i4_num_b_in_subgop) + && !(i4_b_count_in_gop == i4_b_frms_in_prd)) + { + i4_b_pic_idx++; + } + else /* Else if this B_PIC is the last one in a subgop or gop */ + { + /* + * If this is the last B_PIC of a GOP, depending on the number + * of incomp B_pics in the subgop, there can be either only I + * or I,P pics between this and the next B_PIC + */ + if(i4_b_count_in_gop == i4_b_frms_in_prd) + { + i4_b_pic_idx += (2 + (!i4_b_in_incomp_subgop)); /*Prev*/ + i4_b_count_in_gop = 0; + } + /* + * For the last B_PIC of a subgop, there's always a P b/w + * this & the next B_PIC + */ + else + { + i4_b_pic_idx += 2; + } + i4_b_count_in_subgop = 0; + } + } + else + { + /* For the last B_PIC of a gop + * Normally,there will be 3 pics (P,I,P) between this and the next + * B_PIC for a CLOSED gop, except when + * 1)Number of P_pics in the gop = 1 + * 2)There is an extra P at the end of the gop + */ + if(i4_b_count_in_gop == i4_b_frms_in_prd) + { + i4_b_pic_idx += (3 + ((i4_b_in_incomp_subgop == 0) + && (i4_p_frms_in_prd> 1) + && (i4_pic_disp_order_no + != (i4_p_frms_in_prd+ i4_b_frms_in_prd- 1)))); + + i4_b_count_in_subgop = 0; + } + /* For a B_PIC which is not the last one in a subgop */ + else if(i4_b_count_in_subgop < i4_num_b_in_subgop) + { + i4_b_pic_idx++; + } + else /* For the last B_PIC of a subgop */ + { + i4_b_pic_idx += 
2; + i4_b_count_in_subgop = 0; + } + } + i4_b_pic_idx %= (i4_max_inter_frm_int + 1); + } + /*********** I or P pic *********/ + else + { + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].i4_pic_id = i4_enc_pic_id; + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].i4_pic_disp_order_no = + i4_pic_disp_order_no; + /* Store Pic type*/ + e_previous_pic_type = I_PIC; + + /**** I_PIC ****/ + if(i4_pic_disp_order_no == 0) + { + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].e_pic_type = I_PIC; + + /* Update the prev_pic_details */ + memcpy(&ps_pic_handling->s_prev_pic_details, + &ps_pic_handling->as_pic_stack[i4_ref_pic_idx], + sizeof(pic_details_t)); + /* + * In case of an I-frame depending on OPEN or CLOSED gop, + * the ref_pic_idx changes + */ + if((!i4_is_gop_closed) && (i4_is_first_gop == 0)) + { + if((i4_p_frms_in_prd <= 1) && (i4_b_in_incomp_subgop == 0)) + { + i4_ref_pic_idx++; + } + /* + * From the 2nd gop onwards, the I and first P frame are + * separated by the num_b_in_incomp_subgop + */ + else + { + i4_ref_pic_idx += (i4_b_in_incomp_subgop + 1); + } + + ps_pic_handling->i4_b_in_incomp_subgop_mix_gop = + ps_pic_handling->i4_b_in_incomp_subgop; + } + else + { + i4_ref_pic_idx++; + } + + i4_b_count_in_gop = 0; + i4_p_count_in_gop = 0; + i4_b_count_in_subgop = 0; + + } + /**** P_PIC ****/ + else + { + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].e_pic_type = P_PIC; + /* Store Pic type*/ + e_previous_pic_type = P_PIC; + + /* Update the prev_pic_details */ + memcpy(&ps_pic_handling->s_prev_pic_details, + &ps_pic_handling->as_pic_stack[i4_ref_pic_idx], + sizeof(pic_details_t)); + + i4_p_count_in_gop++; + ps_pic_handling->i4_prev_intra_frame_interval = i4_intra_frm_int; + + /* + * In case of an P-frame depending on OPEN or CLOSED gop, the + * ref_pic_idx changes + */ + if(i4_is_gop_closed && (i4_p_count_in_gop == i4_p_frms_in_prd)) + { + /* + * For the last P_PIC in a gop, if extra_p or incomp_b are + * present, the number of such pics between this and the next + * 
ref_pic is (i4_b_in_incomp_subgop + 1) + */ + if((i4_p_count_in_gop > 1) + && (i4_b_in_incomp_subgop || i4_extra_p)) + { + i4_ref_pic_idx += (i4_b_in_incomp_subgop + 1); + } + else + { + i4_ref_pic_idx += i4_inter_frm_int; + } + } + else + { + i4_ref_pic_idx += i4_inter_frm_int; + } + } + + i4_ref_pic_idx %= (i4_max_inter_frm_int + 1); + } + + /* Update those variables working on the input frames */ + i4_pic_disp_order_no++; + i4_buf_pic_no++; + + /* For any gop */ + if(ps_pic_handling->i4_pic_disp_order_no + == (i4_max_inter_frm_int - 1- ((!i4_is_gop_closed) + * ps_pic_handling->i4_b_in_incomp_subgop_mix_gop))) + { + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_pic_handling->i4_rem_frms_in_gop[i] = + ps_pic_handling->i4_frms_in_cur_gop[i]; + } + + if((!i4_is_gop_closed) && (i4_is_first_gop) + && (ps_pic_handling->i4_rem_frms_in_gop[B_PIC] + > ps_pic_handling->i4_b_in_incomp_subgop_mix_gop)) + { + ps_pic_handling->i4_rem_frms_in_gop[B_PIC] = + ps_pic_handling->i4_frms_in_cur_gop[B_PIC] + - ps_pic_handling->i4_b_in_incomp_subgop_mix_gop; + } + } + + /* End of GOP updates */ + if(i4_pic_disp_order_no == (i4_p_frms_in_prd + i4_b_frms_in_prd + 1)) + { + /* Now, the end of gop updates */ + i4_pic_disp_order_no = 0; + i4_buf_pic_no = 0; + i4_is_first_gop = 0; + ps_pic_handling->i4_extra_p_mix_gop = ps_pic_handling->i4_extra_p; + + if(i4_is_gop_closed) + { + ps_pic_handling->i4_b_in_incomp_subgop_mix_gop = + ps_pic_handling->i4_b_in_incomp_subgop; + } + + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_pic_handling->i4_frms_in_cur_gop[i] = + ps_pic_handling->i4_frms_in_gop[i]; + } + } + + /* Updating the vars which work on the encoded pics */ + /* For the first gop */ + if(((ps_pic_handling->i4_is_first_gop) + && (ps_pic_handling->i4_pic_disp_order_no + == (i4_max_inter_frm_int - 1))) + || (i4_intra_frm_int == 1)) + { + ps_pic_handling->i4_coded_pic_no = 0; + ps_pic_handling->i4_stack_count = 0; + } + + /* Update the state struct with the modifiable local vars */ + 
ps_pic_handling->i4_buf_pic_no = i4_buf_pic_no; + ps_pic_handling->i4_pic_disp_order_no = i4_pic_disp_order_no; + ps_pic_handling->i4_b_pic_idx = i4_b_pic_idx; + ps_pic_handling->i4_ref_pic_idx = i4_ref_pic_idx; + ps_pic_handling->i4_is_first_gop = i4_is_first_gop; + ps_pic_handling->i4_p_count_in_gop = i4_p_count_in_gop; + ps_pic_handling->i4_b_count_in_gop = i4_b_count_in_gop; + ps_pic_handling->i4_b_count_in_subgop = i4_b_count_in_subgop; + ps_pic_handling->e_previous_pic_type = e_previous_pic_type; + ps_pic_handling->i4_force_I_frame = i4_force_I_frame; +} + +/******************************************************************************* + * @brief Returns the picture type, ip and display order number for the frame to + * be encoded + ******************************************************************************/ +void irc_get_pic_from_stack(pic_handling_t *ps_pic_handling, + WORD32 *pi4_pic_id, + WORD32 *pi4_pic_disp_order_no, + picture_type_e *pe_pic_type) +{ + pic_details_t s_pic_details; + pic_details_t *ps_pic_details = &s_pic_details; + + if(ps_pic_handling->i4_stack_count < 0) + { + ps_pic_details->e_pic_type = BUF_PIC; + ps_pic_details->i4_pic_disp_order_no = -1; + ps_pic_details->i4_pic_id = -1; + } + else + { + memcpy(ps_pic_details, + &ps_pic_handling->as_pic_stack[ps_pic_handling->i4_stack_count], + sizeof(pic_details_t)); + + /* Force I frame updations */ + if((ps_pic_handling->i4_force_I_frame == 1) + && (ps_pic_details->e_pic_type == I_PIC)) + { + /* Flag to signal change in remaining bits*/ + ps_pic_handling->i4_forced_I_frame_cur_frame = 1; + ps_pic_handling->i4_force_I_frame = 0; + /* + * Indicates count for no. 
of Pictures whose temporal reference + * has to be modified + * in the new GOP + */ + ps_pic_handling->i4_mod_temp_ref_cnt = + ps_pic_handling->i4_b_in_incomp_subgop + 1; + ps_pic_handling->i4_first_gop_encoded = 1; + } + + /* + * In MPEG2, the temporal reference of the first displayed frame in a + * gop is 0.In case of an OPEN_GOP, the B_PICs of the last subgop in a + * gop, maybe coded as a part of the next gop. Hence, in such conditions + * the pic_disp_order needs to be modified so that it gives an + * indication of the temporal reference + */ + if((!ps_pic_handling->i4_is_gop_closed) + && (ps_pic_handling->i4_first_gop_encoded)) + { + if(!ps_pic_handling->i4_mod_temp_ref_cnt) + { + ps_pic_details->i4_pic_disp_order_no = + (ps_pic_handling->as_pic_stack[ps_pic_handling->i4_stack_count].i4_pic_disp_order_no + + ps_pic_handling->i4_b_in_incomp_subgop) + % (ps_pic_handling->i4_prev_intra_frame_interval); + + } + else + { + /* + * due to force I frame First frame will have only + * ps_pic_handling->i4_frames_in_fif_gop number of frames + */ + ps_pic_details->i4_pic_disp_order_no = + (ps_pic_handling->as_pic_stack[ps_pic_handling->i4_stack_count].i4_pic_disp_order_no + + ps_pic_handling->i4_b_in_incomp_subgop) + % (ps_pic_handling->i4_frames_in_fif_gop); + ps_pic_handling->i4_mod_temp_ref_cnt--; + } + } + } + + /* Giving this to the Codec */ + *pi4_pic_id = s_pic_details.i4_pic_id; + *pi4_pic_disp_order_no = s_pic_details.i4_pic_disp_order_no; + *pe_pic_type = s_pic_details.e_pic_type; +} + +/******************************************************************************* + * @brief Updates the picture handling state whenever there is changes in input + * parameter + * + ******************************************************************************/ +static void irc_update_pic_distbn(pic_handling_t *ps_pic_handling, + WORD32 i4_intra_frm_int, + WORD32 i4_inter_frm_int, + WORD32 i4_gop_boundary) +{ + /* Declarations */ + WORD32 i4_is_gop_closed; + WORD32 i, 
i4_prev_inter_frm_int, i4_max_inter_frm_int, i4_pic_disp_order_no; + WORD32 i4_b_in_incomp_subgop, i4_extra_p, + i4_b_in_incomp_subgop_mix_gop,i4_extra_p_mix_gop; + WORD32 i4_pb_frms_till_prev_p; + WORD32 ai4_diff_in_frms[MAX_PIC_TYPE]; + + /* Initialize the local vars from the state struct */ + i4_is_gop_closed = ps_pic_handling->i4_is_gop_closed; + i4_prev_inter_frm_int = ps_pic_handling->i4_inter_frm_int; + i4_max_inter_frm_int = ps_pic_handling->i4_max_inter_frm_int; + i4_b_in_incomp_subgop = ps_pic_handling->i4_b_in_incomp_subgop; + i4_extra_p = ps_pic_handling->i4_extra_p; + i4_b_in_incomp_subgop_mix_gop = + ps_pic_handling->i4_b_in_incomp_subgop_mix_gop; + i4_extra_p_mix_gop = ps_pic_handling->i4_extra_p_mix_gop; + i4_pic_disp_order_no = ps_pic_handling->i4_pic_disp_order_no; + + i4_pb_frms_till_prev_p = (ps_pic_handling->i4_p_count_in_gop + * i4_prev_inter_frm_int); + + /* Check for the validity of the intra_frm_int */ + if(i4_intra_frm_int <= 0) + { + i4_intra_frm_int = ps_pic_handling->i4_intra_frm_int; + } + /* Check for the validity of the inter_frm_int */ + if((i4_inter_frm_int > i4_max_inter_frm_int) || (i4_inter_frm_int < 0)) + { + i4_inter_frm_int = ps_pic_handling->i4_inter_frm_int; + } + + /* Keep a copy of the older frms_in_gop */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ai4_diff_in_frms[i] = ps_pic_handling->i4_frms_in_cur_gop[i]; + } + + /* Update all the variables which are calculated from the inter_frm_int */ + + /* Get the new pic distribution in the gop */ + find_pic_distbn_in_gop(ps_pic_handling->i4_frms_in_gop, i4_intra_frm_int, + i4_inter_frm_int, i4_is_gop_closed, + &i4_b_in_incomp_subgop, &i4_extra_p); + + /* Find the other related variables */ + if(i4_gop_boundary == 0) + { + /* + * Since, the inter frame interval has changed between a gop the + * current gop will be a mixed gop. 
So, we need to find the values of + * the related variables + */ + find_pic_distbn_in_gop(ps_pic_handling->i4_frms_in_cur_gop, + (i4_intra_frm_int - i4_pb_frms_till_prev_p), + i4_inter_frm_int, i4_is_gop_closed, + &i4_b_in_incomp_subgop_mix_gop, + &i4_extra_p_mix_gop); + + ps_pic_handling->i4_frms_in_cur_gop[P_PIC] += + ps_pic_handling->i4_p_count_in_gop; + ps_pic_handling->i4_frms_in_cur_gop[B_PIC] += + ps_pic_handling->i4_b_count_in_gop; + } + else + { + /* + * Since, the inter_frm_interval has changed at a gop boundary, the + * new gop will have all the subgops with the new inter_frm_interval + */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_pic_handling->i4_frms_in_cur_gop[i] = + ps_pic_handling->i4_frms_in_gop[i]; + } + + i4_b_in_incomp_subgop_mix_gop = i4_b_in_incomp_subgop; + i4_extra_p_mix_gop = i4_extra_p; + } + + /* For bit-allocation the rem_frms_in_gop need to be updated */ + /* Checks needed: + 1) If the encoding is happening on the same gop as that of the buffering */ + if(ps_pic_handling->i4_pic_disp_order_no + >= (i4_max_inter_frm_int - 1- ((!i4_is_gop_closed) + * ps_pic_handling->i4_b_in_incomp_subgop_mix_gop))) + { + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_pic_handling->i4_rem_frms_in_gop[i] += + (ps_pic_handling->i4_frms_in_cur_gop[i] + - ai4_diff_in_frms[i]); + } + } + + /* Update the vars which will affect the proper filling of the pic_stack */ + if(i4_pic_disp_order_no == 0) /*Check if redundant*/ + { + ps_pic_handling->i4_buf_pic_no = 0; + } + else + { + ps_pic_handling->i4_buf_pic_no = 1; + } + + ps_pic_handling->i4_b_count_in_subgop = 0; + + /* Update the state struct with the new inter_frm_int */ + ps_pic_handling->i4_inter_frm_int = i4_inter_frm_int; + ps_pic_handling->i4_intra_frm_int = i4_intra_frm_int; + ps_pic_handling->i4_b_in_incomp_subgop = i4_b_in_incomp_subgop; + ps_pic_handling->i4_extra_p = i4_extra_p; + ps_pic_handling->i4_b_in_incomp_subgop_mix_gop = + i4_b_in_incomp_subgop_mix_gop; + ps_pic_handling->i4_extra_p_mix_gop = 
i4_extra_p_mix_gop; + +} + +/* ***************************************************************************** + * @brief Distributes the frames as I, P and B based on intra/inter frame interval. + * Along with it it fills the number of frames in sub-gop and extra p frame + * + ******************************************************************************/ +static void find_pic_distbn_in_gop(WORD32 i4_frms_in_gop[MAX_PIC_TYPE], + WORD32 i4_intra_frm_int, + WORD32 i4_inter_frm_int, + WORD32 i4_is_gop_closed, + WORD32 *pi4_b_in_incomp_subgop, + WORD32 *pi4_extra_p) +{ + /* + * Find the pic distribution in the gop depending on the inter and intra + * frm intervals + */ + i4_frms_in_gop[I_PIC] = 1; + + /* All I frames */ + if(i4_intra_frm_int == 1) + { + i4_frms_in_gop[P_PIC] = 0; + i4_frms_in_gop[B_PIC] = 0; + *pi4_b_in_incomp_subgop = 0; + *pi4_extra_p = 0; + } + else + { + if(i4_is_gop_closed) + { + i4_frms_in_gop[P_PIC] = ((i4_intra_frm_int - 2) / i4_inter_frm_int) + + 1; + + if((((i4_intra_frm_int - 2) / i4_inter_frm_int) * i4_inter_frm_int) + == (i4_intra_frm_int - 2)) + { + *pi4_extra_p = 1; + } + else + { + *pi4_extra_p = 0; + } + } + else + { + i4_frms_in_gop[P_PIC] = ((i4_intra_frm_int - 1) / i4_inter_frm_int); + + *pi4_extra_p = 0; + } + + i4_frms_in_gop[B_PIC] = (i4_intra_frm_int - 1 - i4_frms_in_gop[P_PIC]); + + *pi4_b_in_incomp_subgop = (i4_frms_in_gop[B_PIC] - (i4_inter_frm_int - 1) + * ((i4_intra_frm_int - 1)/ i4_inter_frm_int)); + } +} + +WORD32 irc_pic_type_get_intra_frame_interval(pic_handling_t *ps_pic_handling) +{ + + return (ps_pic_handling->i4_intra_frm_int); +} + +WORD32 irc_pic_type_get_inter_frame_interval(pic_handling_t *ps_pic_handling) +{ + return (ps_pic_handling->i4_inter_frm_int); +} + +void irc_pic_type_get_rem_frms_in_gop(pic_handling_t *ps_pic_handling, + WORD32 ai4_rem_frms_in_gop[MAX_PIC_TYPE]) +{ + memcpy(ai4_rem_frms_in_gop, ps_pic_handling->i4_rem_frms_in_gop, + sizeof(ps_pic_handling->i4_rem_frms_in_gop)); +} + +WORD32 
irc_pic_type_get_frms_in_gop_force_I_frm(pic_handling_t *ps_pic_handling) +{ + return (ps_pic_handling->i4_frames_in_fif_gop); +} + +void irc_pic_type_get_frms_in_gop(pic_handling_t *ps_pic_handling, + WORD32 ai4_frms_in_gop[MAX_PIC_TYPE]) +{ + memcpy(ai4_frms_in_gop, ps_pic_handling->i4_frms_in_cur_gop, + sizeof(ps_pic_handling->i4_frms_in_cur_gop)); +} + +WORD32 irc_pic_type_get_disp_order_no(pic_handling_t *ps_pic_handling) +{ + return (ps_pic_handling->i4_pic_disp_order_no); +} + +void irc_set_force_I_frame_flag(pic_handling_t *ps_pic_handling) +{ + ps_pic_handling->i4_force_I_frame = 1; +} +WORD32 irc_get_forced_I_frame_cur_frm_flag(pic_handling_t *ps_pic_handling) +{ + return (ps_pic_handling->i4_forced_I_frame_cur_frame); +} +void irc_reset_forced_I_frame_cur_frm_flag(pic_handling_t *ps_pic_handling) +{ + ps_pic_handling->i4_forced_I_frame_cur_frame = 0; +} + +/******************************************************************************/ +/* Functions that work on the encoded frames */ +/******************************************************************************/ + +/****************************************************************************** + Function Name : irc_update_pic_handling + Description : Will be called only for the frames to be encoded + *****************************************************************************/ +void irc_update_pic_handling(pic_handling_t *ps_pic_handling, + picture_type_e e_pic_type) +{ + + WORD32 i4_max_inter_frm_int; + WORD32 i; + + /* Initializing the local vars with that of the state struct */ + i4_max_inter_frm_int = ps_pic_handling->i4_max_inter_frm_int; + + /* Update the variables working on the output frames */ + /* Update the stack count */ + ps_pic_handling->i4_stack_count++; + + if(ps_pic_handling->i4_stack_count == (i4_max_inter_frm_int + 1)) + { + ps_pic_handling->i4_stack_count = 0; + } + + /* Update the rem_frms_in_gop */ + ps_pic_handling->i4_rem_frms_in_gop[e_pic_type]--; + + /* Assumption : 
Rem_frms_in_gop needs to be taken care of, for every change in frms */ + ps_pic_handling->i4_last_frm_in_gop = 0; + if((ps_pic_handling->i4_rem_frms_in_gop[I_PIC] <= 0) + && (ps_pic_handling->i4_rem_frms_in_gop[P_PIC] <= 0) + && (ps_pic_handling->i4_rem_frms_in_gop[B_PIC] <= 0)) + { + /* Copy the cur_frms_in_gop to the rem_frm_in_gop */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_pic_handling->i4_rem_frms_in_gop[i] = + ps_pic_handling->i4_frms_in_cur_gop[i]; + } + + ps_pic_handling->i4_last_frm_in_gop = 1; + ps_pic_handling->i4_first_gop_encoded = 1; + } +} + +WORD32 irc_is_last_frame_in_gop(pic_handling_handle ps_pic_handling) +{ + return (ps_pic_handling->i4_last_frm_in_gop); +} + +/****************************************************************************** + Function Name : irc_skip_encoded_frame + Description : Needs to go to the current pic in the pic_stack. + If it's B_PIC don't do anything + If it's a reference picture, push all but the last B_PICs + in the current subgop one place down (i.e. 
just copy their + pic_details) and move the last B_PIC in that subgop to the + next slot of the skipped picture and convert it's pic_type + to that of the reference picture + *****************************************************************************/ +void irc_skip_encoded_frame(pic_handling_t *ps_pic_handling, + picture_type_e e_pic_type) +{ + pic_details_t s_pic_details; + WORD32 i4_stack_count, i4_next_ref_pic_idx, i4_pic_idx; + WORD32 i4_max_inter_frm_int, i4_last_b_pic_idx, i4_first_b_pic_idx; + WORD32 i4_next_pic_idx; + + /* State variables used to initialize the local vars (Not to be changed) */ + i4_stack_count = ps_pic_handling->i4_stack_count; + i4_next_ref_pic_idx = ps_pic_handling->i4_ref_pic_idx; + i4_max_inter_frm_int = ps_pic_handling->i4_max_inter_frm_int; + + i4_next_pic_idx = ((i4_stack_count + 1) % (i4_max_inter_frm_int + 1)); + + /* + * Check what is the encoded frm_type + * Changing a B_PIC to a ref_pic is not reqd if + * there are no B_PICs referring from the skipped ref_pic + */ + if(((e_pic_type == P_PIC) || (e_pic_type == I_PIC)) + && (i4_next_pic_idx != i4_next_ref_pic_idx)) + { + /* Go to the last B_PIC before the next_ref_pic */ + if(i4_next_ref_pic_idx == 0) + { + i4_last_b_pic_idx = i4_max_inter_frm_int; + } + else + { + i4_last_b_pic_idx = (i4_next_ref_pic_idx - 1); + } + + /* Keep a copy of the last B_PIC pic_details */ + memcpy(&s_pic_details, + &ps_pic_handling->as_pic_stack[i4_last_b_pic_idx], + sizeof(pic_details_t)); + + i4_pic_idx = i4_last_b_pic_idx; + i4_first_b_pic_idx = (i4_stack_count + 1) % (i4_max_inter_frm_int + 1); + + /* + * All the B_PICs other than the last one, need to be shifted one place + * in the stack + */ + while((i4_pic_idx != i4_stack_count) + && (i4_first_b_pic_idx != i4_last_b_pic_idx)) + { + if(i4_pic_idx == 0) + { + i4_pic_idx = i4_max_inter_frm_int; + } + else + { + i4_pic_idx--; + } + + memcpy(&ps_pic_handling->as_pic_stack[(i4_pic_idx + 1) + % (i4_max_inter_frm_int + 1)], + 
&ps_pic_handling->as_pic_stack[i4_pic_idx], + sizeof(pic_details_t)); + + } + + /* + * Copy the last B_PIC pic_details to the first B_PIC place and change + * it's pic type to the ref_PIC + */ + /*e_ref_pic_type*/ + ps_pic_handling->as_pic_stack[i4_first_b_pic_idx].e_pic_type = P_PIC; + + ps_pic_handling->as_pic_stack[i4_first_b_pic_idx].i4_pic_disp_order_no = + s_pic_details.i4_pic_disp_order_no; + ps_pic_handling->as_pic_stack[i4_first_b_pic_idx].i4_pic_id = + s_pic_details.i4_pic_id; + + /* Change the rem_frms_in_prd so that the update works properly */ + if(ps_pic_handling->i4_rem_frms_in_gop[B_PIC] > 0) + { + ps_pic_handling->i4_rem_frms_in_gop[B_PIC]--; + ps_pic_handling->i4_rem_frms_in_gop[P_PIC]++; + } + } + +} + +/****************************************************************************** + Function Name : flush_frame + Description : Since when a flush frame is called, there will be no valid + frames after it, the last frame cannot be a B_PIC, as there + will be no reference frame for it (Input in display order) + + So,this fxn needs to go to the last added pic in the pic_stack. 
+ If it's reference pic don't do anything + If it's a B_PIC, copy it's pic_details and put it in the + place of the next reference pic, changing the pic_type to + P_PIC + *****************************************************************************/ +void irc_flush_frame_from_pic_stack(pic_handling_t *ps_pic_handling) +{ + + pic_details_t s_prev_pic_details; + + /* Get the last entered pic_details (not to be modified here) */ + WORD32 i4_prev_b_pic_idx = ps_pic_handling->i4_prev_b_pic_idx; + WORD32 i4_ref_pic_idx = ps_pic_handling->i4_ref_pic_idx; + WORD32 i4_b_pic_idx = ps_pic_handling->i4_b_pic_idx; + + memcpy(&s_prev_pic_details, &ps_pic_handling->s_prev_pic_details, + sizeof(pic_details_t)); + + if(s_prev_pic_details.e_pic_type == B_PIC) + { + /* Copy the last B_PIC details to the next reference pic in display order */ + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].i4_pic_disp_order_no = + s_prev_pic_details.i4_pic_disp_order_no; + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].i4_pic_id = + s_prev_pic_details.i4_pic_id; + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].e_pic_type = P_PIC; + + /* + * Modify the last B_PIC pic_type, so that codec gets to know when + * all the buffered frames + * are flushed + */ + ps_pic_handling->as_pic_stack[i4_prev_b_pic_idx].e_pic_type = + MAX_PIC_TYPE; + ps_pic_handling->as_pic_stack[i4_prev_b_pic_idx].i4_pic_id = -1; + ps_pic_handling->as_pic_stack[i4_prev_b_pic_idx].i4_pic_disp_order_no = + -1; + } + else + { + /* + * Modify the next pic_type details in the stack, so that codec gets to + * know when all the + * buffered frames are flushed + */ + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].e_pic_type = MAX_PIC_TYPE; + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].i4_pic_id = -1; + ps_pic_handling->as_pic_stack[i4_ref_pic_idx].i4_pic_disp_order_no = -1; + + if(ps_pic_handling->i4_inter_frm_int != 1) + { + ps_pic_handling->as_pic_stack[i4_b_pic_idx].e_pic_type = + MAX_PIC_TYPE; + 
ps_pic_handling->as_pic_stack[i4_b_pic_idx].i4_pic_id = -1; + ps_pic_handling->as_pic_stack[i4_b_pic_idx].i4_pic_disp_order_no = + -1; + } + } +} + +/****************************************************************************** + Function Name : irc_add_pic_to_stack_re_enc + Description : In case of a re-enc, we can assume the pictures to be coming + in the encode order. + In case of re-encoder basically, there are 2 problematic cases. + 1)Inter_frm_int is not known to start with + 2)Inter_frm_int can keep changing + 3)Intra_frm_int set by the application and that actually in the + decoded bitstream may be different + *****************************************************************************/ +WORD32 irc_add_pic_to_stack_re_enc(pic_handling_t *ps_pic_handling, + WORD32 i4_enc_pic_id, + picture_type_e e_pic_type) +{ + WORD32 i4_b_count_in_subgop; + WORD32 i4_max_inter_frm_int, i4_inter_frm_int, i4_intra_frm_int; + WORD32 i4_pic_disp_order_no; + WORD32 i4_is_gop_closed; + picture_type_e e_out_pic_type; + WORD32 i4_b_in_incomp_subgop; + + /* Check if a change in intra_frm_int call has been made */ + if(ps_pic_handling->i4_change_in_intra_frm_int == 1) + { + irc_update_pic_distbn(ps_pic_handling, + ps_pic_handling->i4_new_intra_frm_int, + ps_pic_handling->i4_inter_frm_int, 1); + ps_pic_handling->i4_change_in_intra_frm_int = 0; + } + + /* Check if a change in inter_frm_int call has been made */ + if(ps_pic_handling->i4_change_in_inter_frm_int == 1) + { + irc_update_pic_distbn(ps_pic_handling, + ps_pic_handling->i4_intra_frm_int, + ps_pic_handling->i4_new_inter_frm_int, 1); + + ps_pic_handling->i4_change_in_inter_frm_int = 0; + } + + /* Initialize the local vars with the state vars */ + i4_b_count_in_subgop = ps_pic_handling->i4_b_count_in_subgop; + i4_max_inter_frm_int = ps_pic_handling->i4_max_inter_frm_int; + i4_inter_frm_int = ps_pic_handling->i4_inter_frm_int; + i4_intra_frm_int = ps_pic_handling->i4_intra_frm_int; + i4_pic_disp_order_no = 
ps_pic_handling->i4_pic_disp_order_no; + i4_is_gop_closed = ps_pic_handling->i4_is_gop_closed; + i4_b_in_incomp_subgop = ps_pic_handling->i4_b_in_incomp_subgop; + + e_out_pic_type = e_pic_type; + + /* Initially the rate_control assumes an IPP sequence */ + if(e_pic_type == B_PIC) + { + /* Update the number of B_PICs in a subgop */ + i4_b_count_in_subgop++; + + if(i4_b_count_in_subgop > i4_max_inter_frm_int) + { + return (-1); + } + + /* If the number of B_PICs exceed the set inter_frm_int then + change the inter_frm_int */ + if(i4_b_count_in_subgop > (i4_inter_frm_int - 1)) + { + i4_inter_frm_int = (i4_b_count_in_subgop + 1); + + irc_update_pic_distbn(ps_pic_handling, i4_intra_frm_int, + i4_inter_frm_int, 0); + } + } + else if((e_pic_type == I_PIC) || (e_pic_type == P_PIC)) + { + /* If the B_PICs in the prev subgop were fewer than the current + * (inter_frm_int-1) and none of these conditions occur, it'll mean the + * decrease in the inter_frm_int + * 1)End of a GOP + * 2)Beginning of an OPEN_GOP + */ + if((i4_b_count_in_subgop < (i4_inter_frm_int - 1)) + && !((!i4_is_gop_closed) + && (i4_b_count_in_subgop + >= i4_b_in_incomp_subgop)) + && !((i4_pic_disp_order_no + + (i4_inter_frm_int - 1 + - i4_b_count_in_subgop)) + > i4_intra_frm_int)) + { + i4_inter_frm_int = (i4_b_count_in_subgop + 1); + + irc_update_pic_distbn(ps_pic_handling, i4_intra_frm_int, + i4_inter_frm_int, 0); + } + + /* Reset the number of B_PICs in a subgop */ + i4_b_count_in_subgop = 0; + } + + /* Updation of the frame level vars */ + i4_pic_disp_order_no++; + + /* End of gop condition + *Two cases can arise : + *1) The intra_frm_int set by the application is greater than the actual + * bitstream intra_frm_int (i.e. we will get an I frame before + * pic_disp_order_no goes to intra_frm_int) + *2) The intra_frm_int set by the application is smaller than the actual bitstream intra_frm_int + * (i.e. 
we won't get an I_PIC even if pic_disp_order_no goes to + * intra_frm_int) Constraints : + * 1) I_PIC cannot be changed to B_PIC + * 2) B_PIC cannot be changed to I_PIC + */ + if(i4_pic_disp_order_no >= i4_intra_frm_int) + { + if(e_pic_type != B_PIC) + { + e_out_pic_type = I_PIC; + } + else + { + e_out_pic_type = B_PIC; + ps_pic_handling->i4_rem_frms_in_gop[B_PIC]++; + ps_pic_handling->i4_frms_in_cur_gop[B_PIC]++; + ps_pic_handling->i4_frms_in_gop[B_PIC]++; + } + } + else + { + if((e_pic_type == I_PIC) && (!ps_pic_handling->i4_is_first_gop)) + { + e_out_pic_type = P_PIC; + ps_pic_handling->i4_rem_frms_in_gop[P_PIC]++; + ps_pic_handling->i4_frms_in_cur_gop[P_PIC]++; + ps_pic_handling->i4_frms_in_gop[P_PIC]++; + } + else + { + e_out_pic_type = e_pic_type; + } + } + + /* Update the frm_vars at the end of the gop */ + if(i4_pic_disp_order_no + == (ps_pic_handling->i4_frms_in_cur_gop[P_PIC] + + ps_pic_handling->i4_frms_in_cur_gop[B_PIC] + + 1)) + { + i4_pic_disp_order_no = 0; + ps_pic_handling->i4_is_first_gop = 0; + } + + /* Update the vars working on the encoded pics */ + if((ps_pic_handling->i4_is_first_gop) + && (ps_pic_handling->i4_stack_count == -1)) + { + ps_pic_handling->i4_coded_pic_no = 0; + ps_pic_handling->i4_stack_count = 0; + } + + /* Add the pic_details to the pic_stack */ + ps_pic_handling->as_pic_stack[ps_pic_handling->i4_stack_count].e_pic_type = + e_out_pic_type; + ps_pic_handling->as_pic_stack[ps_pic_handling->i4_stack_count].i4_pic_disp_order_no = + ps_pic_handling->i4_pic_disp_order_no; + ps_pic_handling->as_pic_stack[ps_pic_handling->i4_stack_count].i4_pic_id = + i4_enc_pic_id; + + /* Writing back those values which need to be updated */ + ps_pic_handling->i4_inter_frm_int = i4_inter_frm_int; + ps_pic_handling->i4_pic_disp_order_no = i4_pic_disp_order_no; + ps_pic_handling->i4_b_count_in_subgop = i4_b_count_in_subgop; + + return (0); +} diff --git a/encoder/irc_picture_type.h b/encoder/irc_picture_type.h new file mode 100755 index 0000000..1af5424 
--- /dev/null +++ b/encoder/irc_picture_type.h @@ -0,0 +1,95 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +#ifndef _PIC_HANDLING_H_ +#define _PIC_HANDLING_H_ + +/* + * Basic Understanding: + * irc_add_pic_to_stack(_re_enc): + * This functions converts the input (or display) order to encoding order + * */ +typedef struct pic_handling_t *pic_handling_handle; + +WORD32 irc_pic_handling_num_fill_use_free_memtab(pic_handling_handle *pps_pic_handling, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type); + +void irc_init_pic_handling(pic_handling_handle ps_pic_handling, + WORD32 i4_intra_frm_int, + WORD32 i4_max_inter_frm_int, + WORD32 i4_is_gop_closed); + +void irc_add_pic_to_stack(pic_handling_handle ps_pic_handling, + WORD32 i4_enc_pic_id); + +WORD32 irc_add_pic_to_stack_re_enc(pic_handling_handle ps_pic_handling, + WORD32 i4_enc_pic_id, + picture_type_e e_pic_type); + +void irc_get_pic_from_stack(pic_handling_handle ps_pic_handling, + WORD32 *pi4_pic_id, + WORD32 *pi4_pic_disp_order_no, + picture_type_e *pe_pic_type); + +WORD32 irc_is_last_frame_in_gop(pic_handling_handle ps_pic_handling); + +void 
irc_flush_frame_from_pic_stack(pic_handling_handle ps_pic_handling); + +/* NITT TBR The below two functions should be made a single function */ +void irc_skip_encoded_frame(pic_handling_handle ps_pic_handling, + picture_type_e e_pic_type); + +void irc_update_pic_handling(pic_handling_handle ps_pic_handling, + picture_type_e e_pic_type); + +/* + * Function returns the number of frames that have been encoded in the GOP in + * which the force I frame takes impact + */ +WORD32 irc_pic_type_get_frms_in_gop_force_I_frm(pic_handling_handle ps_pic_handling); + +void irc_set_force_I_frame_flag(pic_handling_handle ps_pic_handling); + +WORD32 irc_get_forced_I_frame_cur_frm_flag(pic_handling_handle ps_pic_handling); + +void irc_reset_forced_I_frame_cur_frm_flag(pic_handling_handle ps_pic_handling); + +/* Normal get functions */ +WORD32 irc_pic_type_get_inter_frame_interval(pic_handling_handle ps_pic_handling); + +WORD32 irc_pic_type_get_intra_frame_interval(pic_handling_handle ps_pic_handling); + +WORD32 irc_pic_type_get_disp_order_no(pic_handling_handle ps_pic_handling); + +void irc_pic_handling_register_new_int_frm_interval(pic_handling_handle ps_pic_handling, + WORD32 i4_intra_frm_int); + +void irc_pic_handling_register_new_inter_frm_interval(pic_handling_handle ps_pic_handling, + WORD32 i4_inter_frm_int); + +void irc_pic_type_get_rem_frms_in_gop(pic_handling_handle ps_pic_handling, + WORD32 ai4_rem_frms_in_gop[MAX_PIC_TYPE]); + +void irc_pic_type_get_frms_in_gop(pic_handling_handle ps_pic_handling, + WORD32 ai4_frms_in_gop[MAX_PIC_TYPE]); + +#endif /* _PIC_HANDLING_H_ */ + diff --git a/encoder/irc_rate_control_api.c b/encoder/irc_rate_control_api.c new file mode 100755 index 0000000..6c6586e --- /dev/null +++ b/encoder/irc_rate_control_api.c @@ -0,0 +1,1600 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * 
you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/*****************************************************************************/ +/* Includes */ +/*****************************************************************************/ + +/* System include files */ +#include "stdio.h" + +/* User include files */ +#include "irc_datatypes.h" +#include "irc_common.h" +#include "irc_cntrl_param.h" +#include "irc_mem_req_and_acq.h" +#include "irc_rd_model.h" +#include "irc_est_sad.h" +#include "irc_fixed_point_error_bits.h" +#include "irc_vbr_storage_vbv.h" +#include "irc_picture_type.h" +#include "irc_bit_allocation.h" +#include "irc_mb_model_based.h" +#include "irc_cbr_buffer_control.h" +#include "irc_vbr_str_prms.h" +#include "irc_rate_control_api.h" +#include "irc_rate_control_api_structs.h" +#include "irc_trace_support.h" + +#define DEV_Q 4 /*Q format(Shift) for Deviation range factor */ +#define HI_DEV_FCTR 22 /* 1.4*16 */ +#define LO_DEV_FCTR 12 /* 0.75*16 */ +#define GET_HI_DEV_QP(Qprev) (( ((WORD32) Qprev)*HI_DEV_FCTR + (1<<(DEV_Q-1)))>>DEV_Q) +#define GET_LO_DEV_QP(Qprev) (( ((WORD32) Qprev)*LO_DEV_FCTR + (1<<(DEV_Q-1)))>>DEV_Q) +#define CLIP_QP(Qc, hi_d, lo_d) (((Qc) < (lo_d))?((lo_d)):(((Qc) > (hi_d))?(hi_d):(Qc))) + +/*****************************************************************************/ +/* Restricts the quantization parameter variation within delta */ 
+/*****************************************************************************/ +/* static WORD32 restrict_swing(WORD32 cur_qp, WORD32 prev_qp, WORD32 delta_qp) + { + if((cur_qp) - (prev_qp) > (delta_qp)) (cur_qp) = (prev_qp) + (delta_qp) ; + if((prev_qp) - (cur_qp) > (delta_qp)) (cur_qp) = (prev_qp) - (delta_qp) ; + return cur_qp; + }*/ + +/***************************************************************************** + Function Name : rate_control_get_init_free_memtab + Description : Takes or gives memtab + Inputs : pps_rate_control_api - pointer to RC api pointer + ps_memtab - Memtab pointer + i4_use_base - Set during init, else 0 + i4_fill_base - Set during free, else 0 + *****************************************************************************/ +WORD32 irc_rate_control_num_fill_use_free_memtab(rate_control_handle *pps_rate_control_api, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type) +{ + WORD32 i4_mem_tab_idx = 0, i; + static rate_control_api_t s_temp_rc_api; + + /* + * Hack for al alloc, during which we dont have any state memory. 
+ * Dereferencing can cause issues + */ + if(e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB) + (*pps_rate_control_api) = &s_temp_rc_api; + + /*for src rate control state structure*/ + if(e_func_type != GET_NUM_MEMTAB) + { + fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(rate_control_api_t), + ALIGN_128_BYTE, PERSISTENT, DDR); + use_or_fill_base(&ps_memtab[0], (void**)pps_rate_control_api, + e_func_type); + } + i4_mem_tab_idx++; + + /* Get the memory requirement of lower modules */ + i4_mem_tab_idx += irc_ba_num_fill_use_free_memtab( + &pps_rate_control_api[0]->ps_bit_allocation, + &ps_memtab[i4_mem_tab_idx], e_func_type); + + i4_mem_tab_idx += irc_cbr_buffer_num_fill_use_free_memtab( + &pps_rate_control_api[0]->ps_cbr_buffer, + &ps_memtab[i4_mem_tab_idx], e_func_type); + + i4_mem_tab_idx += irc_est_sad_num_fill_use_free_memtab( + &pps_rate_control_api[0]->ps_est_sad, + &ps_memtab[i4_mem_tab_idx], e_func_type); + + i4_mem_tab_idx += irc_mbrc_num_fill_use_free_memtab( + &pps_rate_control_api[0]->ps_mb_rate_control, + &ps_memtab[i4_mem_tab_idx], e_func_type); + + i4_mem_tab_idx += irc_vbr_vbv_num_fill_use_free_memtab( + &pps_rate_control_api[0]->ps_vbr_storage_vbv, + &ps_memtab[i4_mem_tab_idx], e_func_type); + + for(i = 0; i < MAX_PIC_TYPE; i++) + { + i4_mem_tab_idx += irc_rd_model_num_fill_use_free_memtab( + &pps_rate_control_api[0]->aps_rd_model[i], + &ps_memtab[i4_mem_tab_idx], e_func_type); + } + i4_mem_tab_idx += irc_pic_handling_num_fill_use_free_memtab( + &pps_rate_control_api[0]->ps_pic_handling, + &ps_memtab[i4_mem_tab_idx], e_func_type); + + return (i4_mem_tab_idx); +} + +/***************************************************************************** + Function Name : irc_initialise_rate_control + Description : Initialise the rate control structure + Inputs : ps_rate_control_api - api struct + e_rate_control_type - VBR, CBR (NLDRC/LDRC), VBR_STREAMING + u1_is_mb_level_rc_on - enabling mb level RC + u4_avg_bit_rate - bit rate to achieved across 
the entire + file size + u4_peak_bit_rate - max possible drain rate + u4_frame_rate - number of frames in 1000 seconds + u4_intra_frame_interval - num frames between two I frames + *au1_init_qp - init_qp for I,P,B + *****************************************************************************/ +void irc_initialise_rate_control(rate_control_api_t *ps_rate_control_api, + rc_type_e e_rate_control_type, + UWORD8 u1_is_mb_level_rc_on, + UWORD32 u4_avg_bit_rate, + UWORD32 *pu4_peak_bit_rate, + UWORD32 u4_min_bit_rate, + UWORD32 u4_frame_rate, + UWORD32 u4_max_delay, + UWORD32 u4_intra_frame_interval, + UWORD8 *pu1_init_qp, + UWORD32 u4_max_vbv_buff_size, + WORD32 i4_max_inter_frm_int, + WORD32 i4_is_gop_closed, + UWORD8 *pu1_min_max_qp, + WORD32 i4_use_est_intra_sad, + UWORD32 u4_src_ticks, + UWORD32 u4_tgt_ticks) +{ + WORD32 i; + UWORD32 u4_frms_in_delay_prd = (u4_frame_rate * u4_max_delay) / 1000000; + ps_rate_control_api->e_rc_type = e_rate_control_type; + ps_rate_control_api->u1_is_mb_level_rc_on = u1_is_mb_level_rc_on; + + trace_printf((const WORD8*)"RC type = %d\n", e_rate_control_type); + + /* Set the avg_bitrate_changed flag for each pic_type to 0 */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_rate_control_api->au1_avg_bitrate_changed[i] = 0; + } + + /* Initialize the pic_handling module */ + irc_init_pic_handling(ps_rate_control_api->ps_pic_handling, + (WORD32)u4_intra_frame_interval, i4_max_inter_frm_int, + i4_is_gop_closed); + + /*** Initialize the rate control modules ***/ + if(ps_rate_control_api->e_rc_type != CONST_QP) + { + UWORD32 au4_num_pics_in_delay_prd[MAX_PIC_TYPE]; + + /* Initialize the model parameter structures */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + irc_init_frm_rc_rd_model(ps_rate_control_api->aps_rd_model[i], + MAX_FRAMES_MODELLED); + } + + /* Initialize the buffer mechanism */ + if((ps_rate_control_api->e_rc_type == VBR_STORAGE) + || (ps_rate_control_api->e_rc_type + == VBR_STORAGE_DVD_COMP)) + { + /* Assuming both the peak bit rates are 
same for a VBR_STORAGE and + VBR_STORAGE_DVD_COMP */ + if(pu4_peak_bit_rate[0] != pu4_peak_bit_rate[1]) + { + trace_printf((const WORD8*)"For VBR_STORAGE and VBR_STORAGE_DVD_COMP the peak bit rates should be same\n"); + } + irc_init_vbr_vbv(ps_rate_control_api->ps_vbr_storage_vbv, + (WORD32)pu4_peak_bit_rate[0], + (WORD32)u4_frame_rate, + (WORD32)u4_max_vbv_buff_size); + } + else if(ps_rate_control_api->e_rc_type == CBR_NLDRC) + { + UWORD32 u4_avg_bit_rate_copy[MAX_NUM_DRAIN_RATES]; + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + u4_avg_bit_rate_copy[i] = u4_avg_bit_rate; + } + /* In case of CBR the num pics in delay is ignored */ + for(i = 0; i < MAX_PIC_TYPE; i++) + au4_num_pics_in_delay_prd[i] = 0; + + irc_init_cbr_buffer(ps_rate_control_api->ps_cbr_buffer, + u4_max_delay, u4_frame_rate, + (WORD32 *)u4_avg_bit_rate_copy, + au4_num_pics_in_delay_prd, + u4_max_vbv_buff_size); + } + else if(ps_rate_control_api->e_rc_type == VBR_STREAMING) + { + irc_init_vbv_str_prms(&ps_rate_control_api->s_vbr_str_prms, + u4_intra_frame_interval, u4_src_ticks, + u4_tgt_ticks, u4_frms_in_delay_prd); + + /* Get the number of pics of each type in delay period */ + irc_get_vsp_num_pics_in_dly_prd( + &ps_rate_control_api->s_vbr_str_prms, + au4_num_pics_in_delay_prd); + + irc_init_cbr_buffer(ps_rate_control_api->ps_cbr_buffer, + u4_max_delay, u4_frame_rate, + (WORD32 *)pu4_peak_bit_rate, + au4_num_pics_in_delay_prd, + u4_max_vbv_buff_size); + } + + /* Initialize the SAD estimation module */ + irc_init_est_sad(ps_rate_control_api->ps_est_sad, i4_use_est_intra_sad); + + /* Initialize the bit allocation module according to VBR or CBR */ + if((ps_rate_control_api->e_rc_type == VBR_STORAGE) + || (ps_rate_control_api->e_rc_type == VBR_STREAMING) + || (ps_rate_control_api->e_rc_type + == VBR_STORAGE_DVD_COMP)) + { + irc_ba_init_bit_allocation(ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->ps_pic_handling, + VBR_BIT_ALLOC_PERIOD, u4_avg_bit_rate, + u4_frame_rate, + (WORD32 
*)pu4_peak_bit_rate, + u4_min_bit_rate); + } + else if(ps_rate_control_api->e_rc_type == CBR_NLDRC) + { + irc_ba_init_bit_allocation(ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->ps_pic_handling, + CBR_BIT_ALLOC_PERIOD, u4_avg_bit_rate, + u4_frame_rate, + (WORD32 *)pu4_peak_bit_rate, + u4_min_bit_rate); + } + + /* + * u1_scd_detected will be initialized to 1 when a Scene change is + * detected + */ + ps_rate_control_api->u1_scd_detected = 0; + } + + /* Initialize the init_qp */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_rate_control_api->au1_init_qp[i] = pu1_init_qp[i]; + ps_rate_control_api->au1_prev_frm_qp[i] = pu1_init_qp[i]; + ps_rate_control_api->au1_min_max_qp[(i << 1)] = + pu1_min_max_qp[(i << 1)]; + ps_rate_control_api->au1_min_max_qp[(i << 1) + 1] = pu1_min_max_qp[(i + << 1) + 1]; + } + + /* Initialize the is_first_frm_encoded */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_rate_control_api->au1_is_first_frm_coded[i] = 0; + } + ps_rate_control_api->u1_is_first_frm = 1; + + /* + * Control flag for delayed impact after a change in peak bitrate has been + * made + */ + ps_rate_control_api->u4_frms_in_delay_prd_for_peak_bit_rate_change = 0; + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + ps_rate_control_api->au4_new_peak_bit_rate[i] = pu4_peak_bit_rate[i]; + } + + /* Initialize the mb level rate control module */ + irc_init_mb_level_rc(ps_rate_control_api->ps_mb_rate_control); + ps_rate_control_api->i4_prev_frm_est_bits = u4_avg_bit_rate * 1000 + / u4_frame_rate; + + ps_rate_control_api->prev_ref_pic_type = I_PIC; +} + +/****************************************************************************** + *Description : calls irc_add_pic_to_stack + ******************************************************************************/ +void irc_add_picture_to_stack(rate_control_api_t *rate_control_api, + WORD32 i4_enc_pic_id) +{ + /* Call the routine to add the pic to stack in encode order */ + irc_add_pic_to_stack(rate_control_api->ps_pic_handling, 
i4_enc_pic_id); +} + +void irc_add_picture_to_stack_re_enc(rate_control_api_t *rate_control_api, + WORD32 i4_enc_pic_id, + picture_type_e e_pic_type) +{ + /* + * In case of a re-encoder, the pics will come in the encode order itself. + * So, there is no need to buffer the pics up + */ + irc_add_pic_to_stack_re_enc(rate_control_api->ps_pic_handling, + i4_enc_pic_id, e_pic_type); +} + +/******************************************************************************* + Description : Decides the picture type based on the state + ******************************************************************************/ +void irc_get_picture_details(rate_control_handle rate_control_api, + WORD32 *pi4_pic_id, + WORD32 *pi4_pic_disp_order_no, + picture_type_e *pe_pic_type) +{ + /* Call to get the pic_details */ + irc_get_pic_from_stack(rate_control_api->ps_pic_handling, pi4_pic_id, + pi4_pic_disp_order_no, pe_pic_type); +} + +/******************************************************************************* + * Description : Gets the frame level qp for the given picture type + ******************************************************************************/ +UWORD8 irc_get_frame_level_qp(rate_control_api_t *ps_rate_control_api, + picture_type_e e_pic_type, + WORD32 i4_ud_max_bits) +{ + UWORD8 u1_frame_qp, i; + + if((ps_rate_control_api->e_rc_type != VBR_STORAGE) + && (ps_rate_control_api->e_rc_type != VBR_STORAGE_DVD_COMP) + && (ps_rate_control_api->e_rc_type != CBR_NLDRC) + && (ps_rate_control_api->e_rc_type != CONST_QP) + && (ps_rate_control_api->e_rc_type != VBR_STREAMING)) + { + trace_printf((const WORD8*)(const WORD8*)" Only VBR,NLDRC and CONST QP supported for now \n"); + return (0); + } + + if(ps_rate_control_api->e_rc_type != CONST_QP) + { + UWORD8 u1_is_first_frm_coded = 1; + + /* Check whether at least one frame of a each picture type gets encoded*/ + /* Check whether it is an IPP or IPB kind of encoding */ + if((ps_rate_control_api->au1_is_first_frm_coded[I_PIC] + && 
ps_rate_control_api->au1_is_first_frm_coded[P_PIC]) + || ((irc_pic_type_get_intra_frame_interval( + ps_rate_control_api->ps_pic_handling) + == 1) + && (ps_rate_control_api->au1_is_first_frm_coded[I_PIC]))) + { + if(e_pic_type != B_PIC) + u1_is_first_frm_coded = 1; + else + { + for(i = 0; i < MAX_PIC_TYPE; i++) + { + u1_is_first_frm_coded &= + ps_rate_control_api->au1_is_first_frm_coded[i]; + } + } + } + else + { + u1_is_first_frm_coded = 0; + } + + if(u1_is_first_frm_coded) + { + WORD32 i4_cur_est_texture_bits, i4_cur_est_header_bits; + WORD32 i4_cur_est_bits; + UWORD32 u4_estimated_sad; + + /* Force I frame updation of rem_bits_in_frame*/ + if(irc_get_forced_I_frame_cur_frm_flag( + ps_rate_control_api->ps_pic_handling) == 1) + { + irc_ba_change_rem_bits_in_prd_at_force_I_frame( + ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->ps_pic_handling); + irc_reset_forced_I_frame_cur_frm_flag( + ps_rate_control_api->ps_pic_handling); + } + + /* Get the estimated texture bits allocated for the current frame*/ + i4_cur_est_texture_bits = irc_ba_get_cur_frm_est_texture_bits( + ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->aps_rd_model, + ps_rate_control_api->ps_est_sad, + ps_rate_control_api->ps_pic_handling, e_pic_type); + + /* Get the estimated header bits*/ + i4_cur_est_header_bits = irc_ba_get_cur_frm_est_header_bits( + ps_rate_control_api->ps_bit_allocation, e_pic_type); + + /* Total estimated bits */ + i4_cur_est_bits = i4_cur_est_header_bits + i4_cur_est_texture_bits; + + trace_printf((const WORD8*)"ft %d, etb = %d, eb %d, ", e_pic_type, + i4_cur_est_texture_bits, i4_cur_est_bits); + + /* Threshold the estimated bits based on the buffer fullness*/ + if(ps_rate_control_api->e_rc_type == VBR_STORAGE) + { + WORD32 i4_cur_frm_max_bit_possible; + i4_cur_frm_max_bit_possible = irc_get_max_target_bits( + ps_rate_control_api->ps_vbr_storage_vbv); + + if(i4_cur_est_bits > i4_cur_frm_max_bit_possible) + { + /* Assuming header would consume the 
same amount of bits */ + i4_cur_est_texture_bits = i4_cur_frm_max_bit_possible + - i4_cur_est_header_bits; + } + } + else if(ps_rate_control_api->e_rc_type == VBR_STORAGE_DVD_COMP) + { + WORD32 i4_rem_bits_in_gop, i4_rem_frms_in_gop, i; + WORD32 i4_cur_frm_max_bit_possible, + ai4_rem_frms_in_gop[MAX_PIC_TYPE]; + irc_pic_type_get_rem_frms_in_gop( + ps_rate_control_api->ps_pic_handling, + ai4_rem_frms_in_gop); + i4_rem_bits_in_gop = irc_get_rem_bits_in_period( + ps_rate_control_api); + i4_rem_frms_in_gop = 0; + for(i = 0; i < MAX_PIC_TYPE; i++) + i4_rem_frms_in_gop += ai4_rem_frms_in_gop[i]; + + /* Threshold the bits based on estimated buffer fullness */ + i4_cur_frm_max_bit_possible = irc_get_max_tgt_bits_dvd_comp( + ps_rate_control_api->ps_vbr_storage_vbv, + i4_rem_bits_in_gop, i4_rem_frms_in_gop, + e_pic_type); + + if(i4_cur_est_bits > i4_cur_frm_max_bit_possible) + { + /* Assuming header would consume the same amount of bits */ + i4_cur_est_texture_bits = i4_cur_frm_max_bit_possible + - i4_cur_est_header_bits; + + } + } + else if(ps_rate_control_api->e_rc_type == CBR_NLDRC) + { + WORD32 i4_cur_frm_bits_acc_buffer = + irc_cbr_buffer_constraint_check( + ps_rate_control_api->ps_cbr_buffer, + i4_cur_est_bits, e_pic_type); + + /* Assuming the header would consume the same amount of bits */ + i4_cur_est_texture_bits = i4_cur_frm_bits_acc_buffer + - i4_cur_est_header_bits; + + } + else if(ps_rate_control_api->e_rc_type == VBR_STREAMING) + { + WORD32 i4_cur_frm_bits_acc_buffer = + irc_vbr_stream_buffer_constraint_check( + ps_rate_control_api->ps_cbr_buffer, + i4_cur_est_bits, e_pic_type); + + /* Assuming the header would consume the same amount of bits */ + i4_cur_est_texture_bits = i4_cur_frm_bits_acc_buffer + - i4_cur_est_header_bits; + } + + trace_printf((const WORD8*)"emtb = %d, ", i4_cur_est_texture_bits); + + /* + * If the estimated texture bits go to values less than zero + * due to buffer underflow, make the estimated target bits to go + * to zero + */ + 
if(i4_cur_est_texture_bits < 0) + i4_cur_est_texture_bits = 0; + + ps_rate_control_api->i4_prev_frm_est_bits = (i4_cur_est_texture_bits + + i4_cur_est_header_bits); + + /* Clip est_texture_bits according to the user-defined max value */ + if((i4_cur_est_texture_bits + > (i4_ud_max_bits - i4_cur_est_header_bits)) + && (e_pic_type != I_PIC)) + { + i4_cur_est_texture_bits = (i4_ud_max_bits + - i4_cur_est_header_bits); + trace_printf((const WORD8*)"udcb = %d, ", + i4_ud_max_bits - i4_cur_est_header_bits); + } + + /* Calculate the estimated SAD for corresponding frame*/ + u4_estimated_sad = irc_get_est_sad(ps_rate_control_api->ps_est_sad, + e_pic_type); + + /* Query the model for the Qp for the corresponding frame*/ + + /* + * The check is because the model gives a negative QP when the + * i4_cur_est_texture_bits is less than or equal to 0 + * [This is a bug in the model]. As a temporary fix, the frame QP + * is being set to the max QP allowed + */ + if(i4_cur_est_texture_bits > 0) + { + u1_frame_qp = irc_find_qp_for_target_bits( + ps_rate_control_api->aps_rd_model[e_pic_type], + i4_cur_est_texture_bits, + u4_estimated_sad, + ps_rate_control_api->au1_min_max_qp[(e_pic_type + << 1)], + ps_rate_control_api->au1_min_max_qp[(e_pic_type + << 1) + 1]); + } + else + { + u1_frame_qp = ps_rate_control_api->au1_min_max_qp[(e_pic_type + << 1) + 1]; + } + + trace_printf((const WORD8*)"ehb %d, etb %d, fqp %d, es %d, eb %d, ", + i4_cur_est_header_bits, i4_cur_est_texture_bits, + u1_frame_qp, u4_estimated_sad, i4_cur_est_bits); + + /* Restricting the QP swing if the average bit rate has changed */ + if(ps_rate_control_api->au1_avg_bitrate_changed[e_pic_type] == 0) + { + WORD32 prev_qp; + WORD32 hi_dev_qp, lo_dev_qp; + /* Restricting the qp swing */ + prev_qp = ps_rate_control_api->au1_prev_frm_qp[ps_rate_control_api->prev_ref_pic_type]; + + if(ps_rate_control_api->prev_ref_pic_type != e_pic_type) + { + if(e_pic_type == I_PIC) + { + /* + * Constrain I-frame QP to be within specified 
limit of + * prev_ref_qp/Kp + */ + prev_qp = (P_TO_I_RATIO * prev_qp + (1 << (K_Q - 1))) + >> (K_Q); + } + else if(e_pic_type == P_PIC) + { + /* + * Constrain P-frame QP to be within specified limit of + * Kp*prev_ref_qp + */ + prev_qp = (I_TO_P_RATIO * prev_qp + (1 << (K_Q - 1))) + >> (K_Q); + } + else if(ps_rate_control_api->prev_ref_pic_type == P_PIC) + { + /* current frame is B-pic */ + /* Constrain B-frame QP to be within specified limit of + * prev_ref_qp/Kb + */ + prev_qp = (P_TO_B_RATIO * prev_qp + (1 << (K_Q - 1))) + >> (K_Q); + } + else /* if(ps_rate_control_api->prev_ref_pic_type == I_PIC*/ + { + /* current frame is B-pic */ + /* + * Constrain B-frame QP to be within specified limit of + * prev_ref_qp/Kb + */ + prev_qp = (P_TO_B_RATIO * I_TO_P_RATIO * prev_qp + + (1 << (K_Q + K_Q - 1))) + >> (K_Q + K_Q); + } + } + + hi_dev_qp = GET_HI_DEV_QP(prev_qp); + /* + * For lower QPs due to scale factor and fixed point arithmetic, + * the hi_dev_qp can be same as that of the prev qp and in which + * case it gets stuck in the lower most qp and thus not allowing + * QPs not to change. To avoid this,for lower qps the hi_dev_qp + * should be made slightly more than prev_qp + */ + if(prev_qp == hi_dev_qp) + { + hi_dev_qp += 1; + } + lo_dev_qp = GET_LO_DEV_QP(prev_qp); + u1_frame_qp = (UWORD8)CLIP_QP((WORD32)u1_frame_qp, hi_dev_qp, lo_dev_qp); + } + else + { + ps_rate_control_api->au1_avg_bitrate_changed[e_pic_type] = 0; + } + } + else + { + /* + * The u1_is_first_frm_coded gets reset + * a) at start of sequence + * b) whenever there is a scene change. + * In both cases since we do not have any estimate about the + * current frame, we just send in the previous frame qp value.IN + * Scene change case the previous QP is incremented by 4 , This is + * done because the Scene changed VOP will have over consumed and + * chances of future frames skipping is very high. 
For the init + * case, the previous frame QP is initialized with the init qp + */ + if((ps_rate_control_api->u1_scd_detected) + && (ps_rate_control_api->e_rc_type != CONST_QP)) + { + /* + * If scene change is detected, I frame Qp would have been + * updated + */ + /* Use a QP calculated in the prev update fxn */ + u1_frame_qp = ps_rate_control_api->u1_frm_qp_after_scd; + } + else + { + u1_frame_qp = ps_rate_control_api->au1_prev_frm_qp[e_pic_type]; + } + } + } + else + { + u1_frame_qp = ps_rate_control_api->au1_init_qp[e_pic_type]; + } + + trace_printf((const WORD8*)"fqp %d\n", u1_frame_qp); + + return (u1_frame_qp); +} + +/******************************************************************************* + *Function Name : irc_get_buffer_status + *Description : Gets the state of VBV buffer + *Outputs : 0 = normal, 1 = underflow, 2= overflow + *Returns : vbv_buf_status_e + ******************************************************************************/ +vbv_buf_status_e irc_get_buffer_status(rate_control_api_t *ps_rate_control_api, + WORD32 i4_total_frame_bits, + picture_type_e e_pic_type, + WORD32 *pi4_num_bits_to_prevent_vbv_underflow) +{ + vbv_buf_status_e e_buf_status = VBV_NORMAL; + + /* Get the buffer status for the current total consumed bits and error bits*/ + if(ps_rate_control_api->e_rc_type == VBR_STORAGE_DVD_COMP) + { + e_buf_status = irc_get_vbv_buffer_status( + ps_rate_control_api->ps_vbr_storage_vbv, + i4_total_frame_bits, + pi4_num_bits_to_prevent_vbv_underflow); + + trace_printf((const WORD8*)"e_buf_status = %d\n", e_buf_status); + } + else if(ps_rate_control_api->e_rc_type == VBR_STORAGE) + { + /* For VBR case since there is not underflow returning the max value */ + pi4_num_bits_to_prevent_vbv_underflow[0] = irc_get_max_vbv_buf_size( + ps_rate_control_api->ps_vbr_storage_vbv); + e_buf_status = VBV_NORMAL; + } + else if(ps_rate_control_api->e_rc_type == CBR_NLDRC) + { + e_buf_status = irc_get_cbr_buffer_status( + ps_rate_control_api->ps_cbr_buffer, 
i4_total_frame_bits, + pi4_num_bits_to_prevent_vbv_underflow, e_pic_type); + + } + else if(ps_rate_control_api->e_rc_type == VBR_STREAMING) + { + /* For VBR_streaming, error bits are computed according to peak bitrate*/ + e_buf_status = irc_get_cbr_buffer_status( + ps_rate_control_api->ps_cbr_buffer, i4_total_frame_bits, + pi4_num_bits_to_prevent_vbv_underflow, e_pic_type); + } + return e_buf_status; +} + +/******************************************************************************* + Function Name : irc_update_pic_handling_state + Description : If the forward path and the backward path of rate control + ******************************************************************************/ +void irc_update_pic_handling_state(rate_control_api_t *ps_rate_control_api, + picture_type_e e_pic_type) +{ + irc_update_pic_handling(ps_rate_control_api->ps_pic_handling, e_pic_type); +} + +/****************************************************************************** + Function Name : irc_update_frame_level_info + Description : Updates the frame level information into the rate control + structure + ******************************************************************************/ +void irc_update_frame_level_info(rate_control_api_t *ps_rate_control_api, + picture_type_e e_pic_type, + WORD32 *pi4_mb_type_sad, + WORD32 i4_total_frame_bits, + WORD32 i4_model_updation_hdr_bits, + WORD32 *pi4_mb_type_tex_bits, + WORD32 *pi4_tot_mb_type_qp, + WORD32 *pi4_tot_mb_in_type, + WORD32 i4_avg_activity, + UWORD8 u1_is_scd, + WORD32 i4_is_it_a_skip, + WORD32 i4_intra_frm_cost, + WORD32 i4_is_pic_handling_done) +{ + UWORD8 u1_num_skips = 0; + WORD32 i; + UWORD32 u4_frame_sad = 0; + WORD32 i4_tot_texture_bits = 0; + WORD32 i4_tot_mbs = 0; + WORD32 i4_avg_qp = 0; + + /* SCD not supported in case of IPB encoder */ + if(u1_is_scd && (irc_pic_type_get_inter_frame_interval( + ps_rate_control_api->ps_pic_handling) > 1)) + { + u1_is_scd = 0; + } + trace_printf((const WORD8*)"i4_total_frame_bits %d\n", 
i4_total_frame_bits); + + if(!i4_is_it_a_skip && !i4_is_pic_handling_done) + { + /* Update the pic_handling struct */ + irc_update_pic_handling(ps_rate_control_api->ps_pic_handling, + e_pic_type); + } + + if(ps_rate_control_api->e_rc_type != CONST_QP) + { + if(!i4_is_it_a_skip) + { + WORD32 i4_new_period_flag; + /****************************************************************** + Calculate the total values from the individual values + ******************************************************************/ + for(i = 0; i < MAX_MB_TYPE; i++) + u4_frame_sad += pi4_mb_type_sad[i]; + for(i = 0; i < MAX_MB_TYPE; i++) + i4_tot_texture_bits += pi4_mb_type_tex_bits[i]; + for(i = 0; i < MAX_MB_TYPE; i++) + i4_avg_qp += pi4_tot_mb_type_qp[i]; + for(i = 0; i < MAX_MB_TYPE; i++) + i4_tot_mbs += pi4_tot_mb_in_type[i]; + i4_avg_qp /= i4_tot_mbs; /* Calculate the average QP */ + + if(ps_rate_control_api->u1_is_mb_level_rc_on) + { + /* + * The model needs to take into consideration the average + * activity of the entire frame while estimating the QP. Thus + * the frame sad values are scaled by the average activity + * before updating it into the model. + */ + if(!i4_avg_activity) + i4_avg_activity = 1; + i4_intra_frm_cost *= i4_avg_activity; + u4_frame_sad *= i4_avg_activity; + } + + /****************************************************************** + Update the bit allocation module + NOTE: For bit allocation module, the pic_type should not be + modified to that of 'I', in case of a SCD. 
+ ******************************************************************/ + i4_new_period_flag = irc_is_last_frame_in_gop( + ps_rate_control_api->ps_pic_handling); + irc_ba_update_cur_frm_consumed_bits( + ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->ps_pic_handling, + i4_total_frame_bits, i4_model_updation_hdr_bits, + e_pic_type, u1_is_scd, i4_new_period_flag); + + if(1 == i4_new_period_flag + && ((ps_rate_control_api->e_rc_type == VBR_STORAGE) + || (ps_rate_control_api->e_rc_type + == VBR_STORAGE_DVD_COMP))) + { + irc_ba_check_and_update_bit_allocation( + ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->ps_pic_handling, + irc_get_cur_vbv_buf_size( + ps_rate_control_api->ps_vbr_storage_vbv), + irc_get_max_vbv_buf_size( + ps_rate_control_api->ps_vbr_storage_vbv), + irc_get_max_bits_per_tgt_frm( + ps_rate_control_api->ps_vbr_storage_vbv), + i4_total_frame_bits); + } + } + + /********************************************************************** + Update the buffer status + *********************************************************************/ + /* + * This update is done after overflow and underflow handling to + * account for the actual bits dumped + */ + if((ps_rate_control_api->e_rc_type == VBR_STORAGE) + || (ps_rate_control_api->e_rc_type + == VBR_STORAGE_DVD_COMP)) + { + irc_update_vbr_vbv(ps_rate_control_api->ps_vbr_storage_vbv, + i4_total_frame_bits); + } + else if(ps_rate_control_api->e_rc_type == CBR_NLDRC) + { + irc_update_cbr_buffer(ps_rate_control_api->ps_cbr_buffer, + i4_total_frame_bits, e_pic_type); + } + else if(ps_rate_control_api->e_rc_type == VBR_STREAMING) + { + UWORD32 au4_num_pics_in_delay_prd[MAX_PIC_TYPE]; + + irc_get_vsp_num_pics_in_dly_prd( + &ps_rate_control_api->s_vbr_str_prms, + au4_num_pics_in_delay_prd); + + irc_update_cbr_buffer(ps_rate_control_api->ps_cbr_buffer, + i4_total_frame_bits, e_pic_type); + + irc_update_vbr_str_prms(&ps_rate_control_api->s_vbr_str_prms, + e_pic_type); + + 
irc_change_cbr_vbv_num_pics_in_delay_period( + ps_rate_control_api->ps_cbr_buffer, + au4_num_pics_in_delay_prd); + + /* + * If the change_in_peak_bitrate flag is set, after the delay period + * update the peak_bitrate and the buffer parameters + */ + if(!ps_rate_control_api->u4_frms_in_delay_prd_for_peak_bit_rate_change) + { + irc_ba_change_ba_peak_bit_rate( + ps_rate_control_api->ps_bit_allocation, + (WORD32 *)&ps_rate_control_api->au4_new_peak_bit_rate[0]); + irc_change_cbr_vbv_bit_rate( + ps_rate_control_api->ps_cbr_buffer, + (WORD32 *)&ps_rate_control_api->au4_new_peak_bit_rate[0]); + } + if(ps_rate_control_api->u4_frms_in_delay_prd_for_peak_bit_rate_change) + ps_rate_control_api->u4_frms_in_delay_prd_for_peak_bit_rate_change--; + } + + if(!i4_is_it_a_skip) + { + /******************************************************************* + Handle the SCENE CHANGE DETECTED + 1) Make the picture type as I, so that updation happens as if it is + an I frame + 2) Reset model, SAD and flag to restart the estimation process + ******************************************************************/ + if(u1_is_scd) + { + WORD32 i4_frm_qp_after_scd; + UWORD32 u4_prev_I_frm_sad; + + e_pic_type = I_PIC; + + /* Scale scd qp based on SCD Frm sad and previous I Frm sad */ + /* frm_qp_after_scd = (avg_qp * cur_frm_sad)/prev_I_frm_sad */ + + /* + * QP for the next frame should take care of + * 1) due to scene change, the current picture has consumed more + * bits + * 2) relative complexity of the previous scene and the current + * scene + */ + + /* Get the intra SAD for the previous scene */ + u4_prev_I_frm_sad = irc_get_est_sad( + ps_rate_control_api->ps_est_sad, I_PIC); + + /* + * Scale the QP based on the SAD ratio of the current pic and + * previous scene intra SAD + */ + X_PROD_Y_DIV_Z(i4_avg_qp, u4_frame_sad, u4_prev_I_frm_sad, + i4_frm_qp_after_scd); + + /* Limit the next frame qp by 50% across both the sides */ + if(i4_frm_qp_after_scd > ((i4_avg_qp * 3) >> 1)) + { + 
i4_frm_qp_after_scd = (i4_avg_qp * 3) >> 1; + } + else if(i4_frm_qp_after_scd < (i4_avg_qp >> 1)) + { + i4_frm_qp_after_scd = (i4_avg_qp >> 1); + } + + /* + * Ensure that the next frame QP is within the min_max limit of + * QP allowed + */ + if(i4_frm_qp_after_scd + > ps_rate_control_api->au1_min_max_qp[(e_pic_type + << 1) + 1]) + { + i4_frm_qp_after_scd = + ps_rate_control_api->au1_min_max_qp[(e_pic_type + << 1) + 1]; + } + else if(i4_frm_qp_after_scd + < ps_rate_control_api->au1_min_max_qp[(e_pic_type + << 1)]) + { + i4_frm_qp_after_scd = + ps_rate_control_api->au1_min_max_qp[(e_pic_type + << 1)]; + } + + /* Update the state var */ + ps_rate_control_api->u1_frm_qp_after_scd = + (UWORD8)i4_frm_qp_after_scd; + + /* re-set model */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + irc_reset_frm_rc_rd_model( + ps_rate_control_api->aps_rd_model[i]); + } + + /* Reset the SAD estimation module */ + irc_reset_est_sad(ps_rate_control_api->ps_est_sad); + + /* Reset flag */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_rate_control_api->au1_is_first_frm_coded[i] = 0; + } + + /* Reset the MB Rate control */ + irc_init_mb_level_rc(ps_rate_control_api->ps_mb_rate_control); + + /*Set u1_scd_detected flag*/ + ps_rate_control_api->u1_scd_detected = 1; + + /* + * Adjust the average QP for the frame based on bits + * consumption + */ + /* + * Initialize the QP for each picture type according to the + * average QP of the SCD pic + */ + ps_rate_control_api->au1_prev_frm_qp[I_PIC] = (UWORD8)i4_avg_qp; + + trace_printf((const WORD8*)"SCD DETECTED\n"); + } + else + { + ps_rate_control_api->u1_scd_detected = 0; + /************************************************************** + Update the Qp used by the current frame + **************************************************************/ + ps_rate_control_api->au1_prev_frm_qp[e_pic_type] = + (UWORD8)i4_avg_qp; + } + + /******************************************************************** + Update the model of the correponding picture type + NOTE: For 
SCD, we force the frame type from 'P' to that of a 'I' + ******************************************************************/ + /* + * For very simple sequences no bits are consumed by texture. These + * frames do not add any information to the model and so not added + */ + if(i4_tot_texture_bits && u4_frame_sad) + { + irc_add_frame_to_rd_model( + ps_rate_control_api->aps_rd_model[e_pic_type], + i4_tot_texture_bits, (UWORD8)i4_avg_qp, + u4_frame_sad, u1_num_skips); + + /* + * At least one proper frame in added into the model. Until that + * keep using the initial QP + */ + ps_rate_control_api->au1_is_first_frm_coded[e_pic_type] = 1; + } + + if(i4_avg_activity) + { + /* Update the mb_level model */ + irc_mb_update_frame_level( + ps_rate_control_api->ps_mb_rate_control, + i4_avg_activity); + } + + /****************************************************************** + Update the sad estimation module + NOTE: For SCD, we force the frame type from 'P' to that of a 'I' + ******************************************************************/ + if(u4_frame_sad) + { + irc_update_actual_sad(ps_rate_control_api->ps_est_sad, + u4_frame_sad, e_pic_type); + + irc_update_actual_sad_for_intra(ps_rate_control_api->ps_est_sad, + i4_intra_frm_cost); + } + + /* + * Update the variable which denotes that a frame has been + * encountered + */ + ps_rate_control_api->u1_is_first_frm = 0; + + } + } + + /* Store the prev encoded picture type for restricting Qp swing */ + if((e_pic_type == I_PIC) || (e_pic_type == P_PIC)) + { + ps_rate_control_api->prev_ref_pic_type = e_pic_type; + } + + trace_printf((const WORD8*)"ft %d,hb %d,tb %d,qp %d,fs %d\n", e_pic_type, + i4_model_updation_hdr_bits, i4_tot_texture_bits, i4_avg_qp, + u4_frame_sad); + + return; +} + +/******************************************************************************* + MB Level API functions + ******************************************************************************/ + 
+/****************************************************************************** + Function Name : irc_init_mb_rc_frame_level + Description : Initialise the frame level details required for a mb level + ******************************************************************************/ + +void irc_init_mb_rc_frame_level(rate_control_api_t *ps_rate_control_api, + UWORD8 u1_frame_qp) +{ + irc_mb_init_frame_level(ps_rate_control_api->ps_mb_rate_control, + u1_frame_qp); +} + +/****************************************************************************** + Function Name : irc_get_mb_level_qp + Description : Get the mb level qp + *****************************************************************************/ +void irc_get_mb_level_qp(rate_control_api_t *ps_rate_control_api, + WORD32 i4_cur_mb_activity, + WORD32 *pi4_mb_qp, + picture_type_e e_pic_type) +{ + if(ps_rate_control_api->u1_is_mb_level_rc_on) + { + irc_get_mb_qp(ps_rate_control_api->ps_mb_rate_control, + i4_cur_mb_activity, pi4_mb_qp); + + /* Truncating the QP to the Max and Min Qp values possible */ + if(pi4_mb_qp[1] < ps_rate_control_api->au1_min_max_qp[e_pic_type << 1]) + { + pi4_mb_qp[1] = ps_rate_control_api->au1_min_max_qp[e_pic_type << 1]; + } + if(pi4_mb_qp[1] + > ps_rate_control_api->au1_min_max_qp[(e_pic_type << 1) + + 1]) + { + pi4_mb_qp[1] = ps_rate_control_api->au1_min_max_qp[(e_pic_type << 1) + + 1]; + } + } + else + { + WORD32 i4_qp; + i4_qp = irc_get_frm_level_qp(ps_rate_control_api->ps_mb_rate_control); + /* Both the qp are used for */ + pi4_mb_qp[0] = i4_qp; /* Used as feedback for the rate control */ + pi4_mb_qp[1] = i4_qp; /* Used for quantising the MB*/ + } +} + +/**************************************************************************** + Function Name : irc_get_bits_to_stuff + Description : Gets the bits to stuff to prevent Underflow of Encoder Buffer + *****************************************************************************/ +WORD32 irc_get_bits_to_stuff(rate_control_api_t 
*ps_rate_control_api, + WORD32 i4_tot_consumed_bits, + picture_type_e e_pic_type) +{ + WORD32 i4_bits_to_stuff; + /* Get the CBR bits to stuff*/ + i4_bits_to_stuff = irc_get_cbr_bits_to_stuff( + ps_rate_control_api->ps_cbr_buffer, i4_tot_consumed_bits, + e_pic_type); + return i4_bits_to_stuff; +} + +/**************************************************************************** + Function Name : irc_get_prev_frm_est_bits + Description : Returns previous frame estimated bits + *****************************************************************************/ +WORD32 irc_get_prev_frm_est_bits(rate_control_api_t *ps_rate_control_api) +{ + return (ps_rate_control_api->i4_prev_frm_est_bits); +} + +/****************************************************************************** + Control Level API functions + Logic: The control call sets the state structure of the rate control api + accordingly such that the next process call would implement the same. + ******************************************************************************/ + +void irc_change_inter_frm_int_call(rate_control_api_t *ps_rate_control_api, + WORD32 i4_inter_frm_int) +{ + irc_pic_handling_register_new_inter_frm_interval( + ps_rate_control_api->ps_pic_handling, i4_inter_frm_int); +} + +void irc_change_intra_frm_int_call(rate_control_api_t *ps_rate_control_api, + WORD32 i4_intra_frm_int) +{ + irc_pic_handling_register_new_int_frm_interval( + ps_rate_control_api->ps_pic_handling, i4_intra_frm_int); + + if(ps_rate_control_api->e_rc_type == VBR_STREAMING) + { + irc_change_vsp_ifi(&ps_rate_control_api->s_vbr_str_prms, + i4_intra_frm_int); + } +} + +/**************************************************************************** + Function Name : irc_change_avg_bit_rate + Description : Whenever the average bit rate changes, the excess bits is + between the changed bit rate and the old one is re-distributed + in the bit allocation module + *****************************************************************************/ 
+void irc_change_avg_bit_rate(rate_control_api_t *ps_rate_control_api, + UWORD32 u4_average_bit_rate) +{ + int i; + if(ps_rate_control_api->e_rc_type != CONST_QP) + { + /* + * Bit Allocation Module: distribute the excess/deficit bits between the + * old and the new frame rate to all the remaining frames + */ + irc_ba_change_remaining_bits_in_period( + ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->ps_pic_handling, + u4_average_bit_rate, + irc_ba_get_frame_rate( + ps_rate_control_api->ps_bit_allocation), + (WORD32 *)(ps_rate_control_api->au4_new_peak_bit_rate)); + } + if(ps_rate_control_api->e_rc_type == CBR_NLDRC) + { + UWORD32 u4_average_bit_rate_copy[MAX_NUM_DRAIN_RATES]; + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + u4_average_bit_rate_copy[i] = u4_average_bit_rate; + } + irc_change_cbr_vbv_bit_rate(ps_rate_control_api->ps_cbr_buffer, + (WORD32 *)(u4_average_bit_rate_copy)); + } + + /* + * This is done only for average bitrate changing somewhere after the model + * stabilizes.Here it is assumed that user will not do this call after + * first few frames. If we dont have this check, what would happen is since + * the model has not stabilized, also bitrate has changed before the first + * frame, we dont restrict the qp. Qp can go to very bad values after init + * qp since if swing is disabled. 
+ * This check will become buggy if change bitrate is called say somewhere + * after first two frames.Bottom line - RC init is done during create and + * this call is done just before first process.And we want to differentiate + * between this call done before first process and the call which is done + * during run time + */ + if(ps_rate_control_api->u1_is_first_frm == 0) + { + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_rate_control_api->au1_avg_bitrate_changed[i] = 1; + } + } +} + +/**************************************************************************** + Function Name : irc_change_frame_rate + Description : Does the necessary changes whenever there is a change in + frame rate + *****************************************************************************/ +void irc_change_frame_rate(rate_control_api_t *ps_rate_control_api, + UWORD32 u4_frame_rate, + UWORD32 u4_src_ticks, + UWORD32 u4_tgt_ticks) +{ + + if(ps_rate_control_api->e_rc_type != CONST_QP) + { + UWORD32 u4_frms_in_delay_prd = ((u4_frame_rate + * irc_get_cbr_buffer_delay( + ps_rate_control_api->ps_cbr_buffer)) + / 1000000); + if((ps_rate_control_api->e_rc_type == VBR_STORAGE) + || (ps_rate_control_api->e_rc_type + == VBR_STORAGE_DVD_COMP)) + { + irc_change_vbr_vbv_frame_rate( + ps_rate_control_api->ps_vbr_storage_vbv, + u4_frame_rate); + } + else if(ps_rate_control_api->e_rc_type == CBR_NLDRC) + { + irc_change_cbr_vbv_tgt_frame_rate( + ps_rate_control_api->ps_cbr_buffer, u4_frame_rate); + } + else if(ps_rate_control_api->e_rc_type == VBR_STREAMING) + { + UWORD32 au4_num_pics_in_delay_prd[MAX_PIC_TYPE]; + irc_change_vsp_tgt_ticks(&ps_rate_control_api->s_vbr_str_prms, + u4_tgt_ticks); + irc_change_vsp_src_ticks(&ps_rate_control_api->s_vbr_str_prms, + u4_src_ticks); + irc_change_vsp_fidp(&ps_rate_control_api->s_vbr_str_prms, + u4_frms_in_delay_prd); + + irc_get_vsp_num_pics_in_dly_prd( + &ps_rate_control_api->s_vbr_str_prms, + au4_num_pics_in_delay_prd); + irc_change_cbr_vbv_tgt_frame_rate( + 
ps_rate_control_api->ps_cbr_buffer, u4_frame_rate); + irc_change_cbr_vbv_num_pics_in_delay_period( + ps_rate_control_api->ps_cbr_buffer, + au4_num_pics_in_delay_prd); + } + + /* + * Bit Allocation Module: distribute the excess/deficit bits between the + * old and the new frame rate to all the remaining frames + */ + irc_ba_change_remaining_bits_in_period( + ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->ps_pic_handling, + irc_ba_get_bit_rate( + ps_rate_control_api->ps_bit_allocation), + u4_frame_rate, + (WORD32 *)(ps_rate_control_api->au4_new_peak_bit_rate)); + } +} + +/**************************************************************************** + Function Name : irc_change_frm_rate_for_bit_alloc + Description : Does the necessary changes only in the bit_allocation module + there is a change in frame rate + *****************************************************************************/ +void irc_change_frm_rate_for_bit_alloc(rate_control_api_t *ps_rate_control_api, + UWORD32 u4_frame_rate) +{ + + if(ps_rate_control_api->e_rc_type != CONST_QP) + { + /* + * Bit Allocation Module: distribute the excess/deficit bits between the + * old and the new frame rate to all the remaining frames + */ + irc_ba_change_remaining_bits_in_period( + ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->ps_pic_handling, + irc_ba_get_bit_rate( + ps_rate_control_api->ps_bit_allocation), + u4_frame_rate, + (WORD32 *)(ps_rate_control_api->au4_new_peak_bit_rate)); + + if(ps_rate_control_api->e_rc_type == VBR_STORAGE + || ps_rate_control_api->e_rc_type + == VBR_STORAGE_DVD_COMP) + { + irc_change_vbr_max_bits_per_tgt_frm( + ps_rate_control_api->ps_vbr_storage_vbv, + u4_frame_rate); + } + } +} + +void irc_change_init_qp(rate_control_api_t *ps_rate_control_api, + UWORD8 *pu1_init_qp) +{ + WORD32 i; + /* Initialize the init_qp */ + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_rate_control_api->au1_init_qp[i] = pu1_init_qp[i]; + ps_rate_control_api->au1_prev_frm_qp[i] = 
pu1_init_qp[i]; + } +} + +void irc_change_min_max_qp(rate_control_api_t *ps_rate_control_api, + UWORD8 *pu1_min_max_qp) +{ + WORD32 i; + for(i = 0; i < MAX_PIC_TYPE; i++) + { + ps_rate_control_api->au1_min_max_qp[(i << 1)] = + pu1_min_max_qp[(i << 1)]; + ps_rate_control_api->au1_min_max_qp[(i << 1) + 1] = pu1_min_max_qp[(i + << 1) + 1]; + } +} + +/**************************************************************************** + Function Name : irc_change_peak_bit_rate + Description : Does the necessary changes whenever there is a change in + peak bit rate + *****************************************************************************/ +WORD32 irc_change_peak_bit_rate(rate_control_api_t *ps_rate_control_api, + UWORD32 *pu4_peak_bit_rate) +{ + WORD32 i4_ret_val = RC_OK; + int i; + + /* + * Buffer Mechanism Module: Re-initialize the number of bits consumed per + * frame + */ + if(ps_rate_control_api->e_rc_type == VBR_STORAGE + || ps_rate_control_api->e_rc_type == VBR_STORAGE_DVD_COMP) + { + /* Send the new peak bit rate and the old frame rate */ + irc_change_vbr_vbv_bit_rate(ps_rate_control_api->ps_vbr_storage_vbv, + pu4_peak_bit_rate[0]); + irc_ba_change_ba_peak_bit_rate(ps_rate_control_api->ps_bit_allocation, + (WORD32 *)pu4_peak_bit_rate); + + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + ps_rate_control_api->au4_new_peak_bit_rate[i] = + pu4_peak_bit_rate[i]; + } + } + else if(ps_rate_control_api->e_rc_type == VBR_STREAMING) + { + if(ps_rate_control_api->u4_frms_in_delay_prd_for_peak_bit_rate_change) + { + /* + * Means that change in peak bit rate has been made twice before the + * previous change could take effect + */ + i4_ret_val = RC_BENIGN_ERR; + } + /* + * If the change happens before encoding the first frame make the + * effect immediately else delay the effect + */ + if(ps_rate_control_api->u1_is_first_frm) + { + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + ps_rate_control_api->au4_new_peak_bit_rate[i] = + pu4_peak_bit_rate[i]; + } + 
irc_ba_change_ba_peak_bit_rate( + ps_rate_control_api->ps_bit_allocation, + (WORD32 *)pu4_peak_bit_rate); + irc_change_cbr_vbv_bit_rate(ps_rate_control_api->ps_cbr_buffer, + (WORD32 *)pu4_peak_bit_rate); + } + else + { + UWORD32 au4_num_pics_in_delay_prd[MAX_NUM_DRAIN_RATES]; + /* + * Else store the number of frames after which the effect should + * happen and then update the peak bitrate + */ + ps_rate_control_api->u4_frms_in_delay_prd_for_peak_bit_rate_change = + irc_get_vsp_num_pics_in_dly_prd( + &ps_rate_control_api->s_vbr_str_prms, + au4_num_pics_in_delay_prd); + for(i = 0; i < MAX_NUM_DRAIN_RATES; i++) + { + ps_rate_control_api->au4_new_peak_bit_rate[i] = + pu4_peak_bit_rate[i]; + } + } + } + + return (i4_ret_val); +} + +void irc_change_buffer_delay(rate_control_api_t *ps_rate_control_api, + UWORD32 u4_buffer_delay) +{ + UWORD32 u4_frms_in_delay_prd = ((irc_ba_get_frame_rate( + ps_rate_control_api->ps_bit_allocation) * u4_buffer_delay) + / 1000000); + + /* Initialize the rate control modules */ + if(ps_rate_control_api->e_rc_type == CBR_NLDRC) + { + irc_change_cbr_buffer_delay(ps_rate_control_api->ps_cbr_buffer, + u4_buffer_delay); + } + else if(ps_rate_control_api->e_rc_type == VBR_STORAGE + || ps_rate_control_api->e_rc_type == VBR_STORAGE_DVD_COMP) + { + UWORD32 au4_num_pics_in_delay_prd[MAX_PIC_TYPE]; + + irc_change_vsp_fidp(&ps_rate_control_api->s_vbr_str_prms, + u4_frms_in_delay_prd); + + /* Get the number of pics of each type in delay period */ + irc_get_vsp_num_pics_in_dly_prd(&ps_rate_control_api->s_vbr_str_prms, + au4_num_pics_in_delay_prd); + + irc_change_cbr_vbv_num_pics_in_delay_period( + ps_rate_control_api->ps_cbr_buffer, + au4_num_pics_in_delay_prd); + } +} + +/* Getter functions to get the current rate control parameters */ +UWORD32 irc_get_frame_rate(rate_control_api_t *ps_rate_control_api) +{ + return (irc_ba_get_frame_rate(ps_rate_control_api->ps_bit_allocation)); +} + +UWORD32 irc_get_bit_rate(rate_control_api_t *ps_rate_control_api) +{ + 
return (irc_ba_get_bit_rate(ps_rate_control_api->ps_bit_allocation)); +} + +UWORD32 irc_get_peak_bit_rate(rate_control_api_t *ps_rate_control_api, + WORD32 i4_index) +{ + return (ps_rate_control_api->au4_new_peak_bit_rate[i4_index]); +} + +UWORD32 irc_get_intra_frame_interval(rate_control_api_t *ps_rate_control_api) +{ + return (irc_pic_type_get_intra_frame_interval( + ps_rate_control_api->ps_pic_handling)); +} + +UWORD32 irc_get_inter_frame_interval(rate_control_api_t *ps_rate_control_api) +{ + return (irc_pic_type_get_inter_frame_interval( + ps_rate_control_api->ps_pic_handling)); +} + +rc_type_e irc_get_rc_type(rate_control_api_t *ps_rate_control_api) +{ + return (ps_rate_control_api->e_rc_type); +} + +WORD32 irc_get_bits_per_frame(rate_control_api_t *ps_rate_control_api) +{ + WORD32 i4_bits_per_frm; + + X_PROD_Y_DIV_Z(irc_ba_get_bit_rate(ps_rate_control_api->ps_bit_allocation), + (UWORD32)1000, + irc_ba_get_frame_rate(ps_rate_control_api->ps_bit_allocation), + i4_bits_per_frm); + + return (i4_bits_per_frm); +} + +UWORD32 irc_get_max_delay(rate_control_api_t *ps_rate_control_api) +{ + return (irc_get_cbr_buffer_delay(ps_rate_control_api->ps_cbr_buffer)); +} + +UWORD32 irc_get_seq_no(rate_control_api_t *ps_rate_control_api) +{ + return (irc_pic_type_get_disp_order_no(ps_rate_control_api->ps_pic_handling)); +} + +UWORD32 irc_get_rem_frames_in_gop(rate_control_api_t *ps_rate_control_api) +{ + WORD32 ai4_rem_frms_in_period[MAX_PIC_TYPE]; + WORD32 j; + UWORD32 u4_rem_frms_in_period = 0; + + /* Get the rem_frms_in_gop & the frms_in_gop from the pic_type state struct */ + irc_pic_type_get_rem_frms_in_gop(ps_rate_control_api->ps_pic_handling, + ai4_rem_frms_in_period); + + /* Depending on the number of gops in a period, find the num_frms_in_prd */ + for(j = 0; j < MAX_PIC_TYPE; j++) + { + u4_rem_frms_in_period += ai4_rem_frms_in_period[j]; + } + + return (u4_rem_frms_in_period); +} + +/**************************************************************************** + 
Function Name : irc_flush_buf_frames + Description : API call to flush the buffered up frames + *****************************************************************************/ +void irc_flush_buf_frames(rate_control_api_t *ps_rate_control_api) +{ + irc_flush_frame_from_pic_stack(ps_rate_control_api->ps_pic_handling); +} + +/**************************************************************************** + Function Name : irc_post_encode_frame_skip + Description : API call to skip the encoded frame of the given picture type + *****************************************************************************/ + +void irc_post_encode_frame_skip(rate_control_api_t *ps_rate_control_api, + picture_type_e e_pic_type) +{ + irc_skip_encoded_frame(ps_rate_control_api->ps_pic_handling, e_pic_type); +} + +/**************************************************************************** + Function Name : irc_force_I_frame + Description : API call to force an I frame + *****************************************************************************/ +void irc_force_I_frame(rate_control_api_t *ps_rate_control_api) +{ + irc_set_force_I_frame_flag(ps_rate_control_api->ps_pic_handling); +} + +/**************************************************************************** + * Function Name : irc_get_rem_bits_in_period + * Description : API call to get remaining bits in the period + * *****************************************************************************/ +WORD32 irc_get_rem_bits_in_period(rate_control_api_t *ps_rate_control_api) +{ + return (irc_ba_get_rem_bits_in_period( + ps_rate_control_api->ps_bit_allocation, + ps_rate_control_api->ps_pic_handling)); +} + +/**************************************************************************** + * Function Name : irc_get_vbv_buf_fullness + * Description : API call to get VBV buffer fullness + ******************************************************************************/ +WORD32 irc_get_vbv_buf_fullness(rate_control_api_t *ps_rate_control_api) +{ + return
(irc_get_cur_vbv_buf_size(ps_rate_control_api->ps_vbr_storage_vbv)); +} + +WORD32 irc_get_vbv_buf_size(rate_control_api_t *ps_rate_control_api) +{ + if(ps_rate_control_api->e_rc_type == CBR_NLDRC + || ps_rate_control_api->e_rc_type == VBR_STREAMING) + { + return (irc_get_cbr_buffer_size(ps_rate_control_api->ps_cbr_buffer)); + } + else + { + return (irc_get_max_vbv_buf_size( + ps_rate_control_api->ps_vbr_storage_vbv)); + } +} + +WORD32 irc_get_vbv_fulness_with_cur_bits(rate_control_api_t *ps_rate_control_api, + UWORD32 u4_bits) +{ + return (irc_vbv_get_vbv_buf_fullness( + ps_rate_control_api->ps_vbr_storage_vbv, u4_bits)); +} + +void irc_set_avg_mb_act(rate_control_api_t *ps_rate_control_api, + WORD32 i4_avg_activity) +{ + irc_mb_update_frame_level(ps_rate_control_api->ps_mb_rate_control, + i4_avg_activity); + return; +} diff --git a/encoder/irc_rate_control_api.h b/encoder/irc_rate_control_api.h new file mode 100755 index 0000000..0173037 --- /dev/null +++ b/encoder/irc_rate_control_api.h @@ -0,0 +1,188 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +#ifndef _RATE_CONTROL_API_H_ +#define _RATE_CONTROL_API_H_ + +#define RC_OK 0 +#define RC_FAIL -1 +#define RC_BENIGN_ERR -2 + +/* This file should only contain RC API function declarations */ + +typedef struct rate_control_api_t *rate_control_handle; + +WORD32 irc_rate_control_num_fill_use_free_memtab(rate_control_handle *pps_rate_control_api, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type); + +void irc_initialise_rate_control(rate_control_handle ps_rate_control_api, + rc_type_e e_rate_control_type, + UWORD8 u1_is_mb_level_rc_on, + UWORD32 u4_avg_bit_rate, + UWORD32 *pu4_peak_bit_rate, + UWORD32 u4_min_bit_rate, + UWORD32 u4_frame_rate, + UWORD32 u4_max_delay, + UWORD32 u4_intra_frame_interval, + UWORD8 *pu1_init_qp, + UWORD32 u4_max_vbv_buff_size, + WORD32 i4_max_inter_frm_int, + WORD32 i4_is_gop_closed, + UWORD8 *pu1_min_max_qp, + WORD32 i4_use_est_intra_sad, + UWORD32 u4_src_ticks, + UWORD32 u4_tgt_ticks); + +/***************************************************************************** + Process level API fuctions (FRAME LEVEL) + *****************************************************************************/ +void irc_flush_buf_frames(rate_control_handle ps_rate_control_api); + +void irc_post_encode_frame_skip(rate_control_handle ps_rate_control_api, + picture_type_e e_pic_type); + +void irc_add_picture_to_stack(rate_control_handle rate_control_api, + WORD32 i4_enc_pic_id); + +void irc_add_picture_to_stack_re_enc(rate_control_handle rate_control_api, + WORD32 i4_enc_pic_id, + picture_type_e e_pic_type); + +void irc_get_picture_details(rate_control_handle rate_control_api, + WORD32 *pi4_pic_id, + WORD32 *pi4_pic_disp_order_no, + picture_type_e *pe_pic_type); + +/* Gets the frame level Qp */ +UWORD8 irc_get_frame_level_qp(rate_control_handle rate_control_api, + picture_type_e pic_type, + WORD32 i4_max_frm_bits); + +vbv_buf_status_e irc_get_buffer_status(rate_control_handle rate_control_api, + WORD32 i4_total_frame_bits, + 
picture_type_e e_pic_type, + WORD32 *pi4_num_bits_to_prevent_vbv_underflow); + +WORD32 irc_get_prev_frm_est_bits(rate_control_handle ps_rate_control_api); + +void irc_update_pic_handling_state(rate_control_handle ps_rate_control_api, + picture_type_e e_pic_type); + +void irc_update_frame_level_info(rate_control_handle ps_rate_control_api, + picture_type_e e_pic_type, + WORD32 *pi4_mb_type_sad, + WORD32 i4_total_frame_bits, + WORD32 i4_model_updation_hdr_bits, + WORD32 *pi4_mb_type_tex_bits, + WORD32 *pi4_tot_mb_type_qp, + WORD32 *pi4_tot_mb_in_type, + WORD32 i4_avg_activity, + UWORD8 u1_is_scd, + WORD32 i4_is_it_a_skip, + WORD32 i4_intra_frm_cost, + WORD32 i4_is_pic_handling_done); + +/***************************************************************************** + MB LEVEL API (just wrapper fucntions) + *****************************************************************************/ + +void irc_init_mb_rc_frame_level(rate_control_handle ps_rate_control_api, + UWORD8 u1_frame_qp);/* Current frame qp*/ + +void irc_get_mb_level_qp(rate_control_handle ps_rate_control_api, + WORD32 i4_cur_mb_activity, + WORD32 *pi4_mb_qp, + picture_type_e e_pic_type); + +WORD32 irc_get_bits_to_stuff(rate_control_handle ps_rate_control_api, + WORD32 i4_tot_consumed_bits, + picture_type_e e_pic_type); + +/****************************************************************************** + Control Level API functions + Logic: The control call sets the state structure of the rate control api + accordingly such that the next process call would implement the same. 
+ ******************************************************************************/ + +void irc_change_inter_frm_int_call(rate_control_handle ps_rate_control_api, + WORD32 i4_inter_frm_int); + +void irc_change_intra_frm_int_call(rate_control_handle ps_rate_control_api, + WORD32 i4_intra_frm_int); + +void irc_change_avg_bit_rate(rate_control_handle ps_rate_control_api, + UWORD32 u4_average_bit_rate); + +void irc_change_frame_rate(rate_control_handle ps_rate_control_api, + UWORD32 u4_frame_rate, + UWORD32 u4_src_ticks, + UWORD32 u4_target_ticks); + +void irc_change_frm_rate_for_bit_alloc(rate_control_handle ps_rate_control_api, + UWORD32 u4_frame_rate); + +void irc_change_init_qp(rate_control_handle ps_rate_control_api, + UWORD8 *init_qp); + +WORD32 irc_change_peak_bit_rate(rate_control_handle ps_rate_control_api, + UWORD32 *u4_peak_bit_rate); + +void irc_change_buffer_delay(rate_control_handle ps_rate_control_api, + UWORD32 u4_buffer_delay); + +void irc_force_I_frame(rate_control_handle ps_rate_control_api); + +void irc_change_min_max_qp(rate_control_handle ps_rate_control_api, + UWORD8 *u1_min_max_qp); + +/******************************************************************************** + Getter functions + For getting the current state of the rate control structures + ********************************************************************************/ + +UWORD32 irc_get_frame_rate(rate_control_handle ps_rate_control_api); + +UWORD32 irc_get_bit_rate(rate_control_handle ps_rate_control_api); + +UWORD32 irc_get_intra_frame_interval(rate_control_handle ps_rate_control_api); + +UWORD32 irc_get_inter_frame_interval(rate_control_handle ps_rate_control_api); + +rc_type_e irc_get_rc_type(rate_control_handle ps_rate_control_api); + +WORD32 irc_get_bits_per_frame(rate_control_handle ps_rate_control_api); + +UWORD32 irc_get_peak_bit_rate(rate_control_handle ps_rate_control_api, + WORD32 i4_index); + +UWORD32 irc_get_max_delay(rate_control_handle ps_rate_control_api); + +UWORD32 
irc_get_seq_no(rate_control_handle ps_rate_control_api); + +WORD32 irc_get_rem_bits_in_period(rate_control_handle ps_rate_control_api); + +WORD32 irc_get_vbv_buf_fullness(rate_control_handle ps_rate_control_api); + +WORD32 irc_get_vbv_buf_size(rate_control_handle ps_rate_control_api); + +WORD32 irc_get_vbv_fulness_with_cur_bits(rate_control_handle ps_rate_control_api, + UWORD32 u4_bits); +#endif diff --git a/encoder/irc_rate_control_api_structs.h b/encoder/irc_rate_control_api_structs.h new file mode 100755 index 0000000..ba39e7f --- /dev/null +++ b/encoder/irc_rate_control_api_structs.h @@ -0,0 +1,93 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +#ifndef _RATE_CONTROL_API_STRUCTS_H_ +#define _RATE_CONTROL_API_STRUCTS_H_ + +/* + * The following definitions were present in irc_cntrl_param.h, moved to this + * file as it is used by irc_rate_control_api.c + */ + +/* num_frm_in_period = BIT_ALLOC_PERIOD*intra_frame_interval */ +#define VBR_BIT_ALLOC_PERIOD 3 +#define CBR_BIT_ALLOC_PERIOD 1 + +/* Rate control state structure */ +typedef struct rate_control_api_t +{ + /* RC Algorithm */ + rc_type_e e_rc_type; + + /* Whether MB level rc is enabled or not */ + UWORD8 u1_is_mb_level_rc_on; + + /* Picture handling struct */ + pic_handling_handle ps_pic_handling; + + /* Model struct for I and P frms */ + rc_rd_model_handle aps_rd_model[MAX_PIC_TYPE]; + + /* VBR storage VBV structure */ + vbr_storage_vbv_handle ps_vbr_storage_vbv; + + /* Calculate the estimated SAD */ + est_sad_handle ps_est_sad; + + /* Allocation of bits for each frame */ + bit_allocation_handle ps_bit_allocation; + + /* Init Qp(also used for Const Qp scenarios) */ + UWORD8 au1_init_qp[MAX_PIC_TYPE]; + + /* MB Level rate control state structure */ + mb_rate_control_handle ps_mb_rate_control; + + UWORD8 au1_is_first_frm_coded[MAX_PIC_TYPE]; + + UWORD8 au1_prev_frm_qp[MAX_PIC_TYPE]; + + cbr_buffer_handle ps_cbr_buffer; + + UWORD8 u1_scd_detected; + + UWORD8 u1_frm_qp_after_scd; + + UWORD8 au1_avg_bitrate_changed[MAX_PIC_TYPE]; + + UWORD8 u1_is_first_frm; + + UWORD8 au1_min_max_qp[(MAX_PIC_TYPE << 1)]; + + WORD32 i4_prev_frm_est_bits; + + vbr_str_prms_t s_vbr_str_prms; + + /* Store the values which are to be impacted after a delay */ + UWORD32 u4_frms_in_delay_prd_for_peak_bit_rate_change; + + UWORD32 au4_new_peak_bit_rate[MAX_NUM_DRAIN_RATES]; + + picture_type_e prev_ref_pic_type; + +} rate_control_api_t; + +#endif/*_RATE_CONTROL_API_STRUCTS_H_*/ + diff --git a/encoder/irc_rd_model.c b/encoder/irc_rd_model.c new file mode 100755 index 0000000..f5c0737 --- /dev/null +++ b/encoder/irc_rd_model.c @@ -0,0 +1,565 @@ 
+/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/****************************************************************************/ +/* File Name : irc_rd_model.c */ +/* */ +/* Description : Implall the Functions to Model the */ +/* Rate Distortion Behaviour of the Codec over the Last */ +/* Few Frames. 
*/ +/* */ +/* List of Functions : irc_update_frame_rd_model */ +/* estimate_mpeg2_qp_for_resbits */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 21 06 2006 Sarat Initial Version */ +/****************************************************************************/ + +/* System include files */ +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include "math.h" + +/* User include files */ +#include "irc_datatypes.h" +#include "irc_common.h" +#include "irc_mem_req_and_acq.h" +#include "irc_rd_model.h" +#include "irc_rd_model_struct.h" + + +WORD32 irc_rd_model_num_fill_use_free_memtab(rc_rd_model_t **pps_rc_rd_model, + itt_memtab_t *ps_memtab, + ITT_FUNC_TYPE_E e_func_type) +{ + WORD32 i4_mem_tab_idx = 0; + static rc_rd_model_t s_rc_rd_model_temp; + + /* + * Hack for al alloc, during which we don't have any state memory. + * Dereferencing can cause issues + */ + if(e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB) + (*pps_rc_rd_model) = &s_rc_rd_model_temp; + + /*for src rate control state structure*/ + if(e_func_type != GET_NUM_MEMTAB) + { + fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(rc_rd_model_t), + ALIGN_128_BYTE, PERSISTENT, DDR); + use_or_fill_base(&ps_memtab[0], (void**)pps_rc_rd_model, e_func_type); + } + i4_mem_tab_idx++; + + return (i4_mem_tab_idx); +} + +void irc_init_frm_rc_rd_model(rc_rd_model_t *ps_rd_model, + UWORD8 u1_max_frames_modelled) +{ + + ps_rd_model->u1_num_frms_in_model = 0; + ps_rd_model->u1_curr_frm_counter = 0; + ps_rd_model->u1_max_frms_to_model = u1_max_frames_modelled; + + ps_rd_model->model_coeff_a_lin_wo_int = 0; + ps_rd_model->model_coeff_b_lin_wo_int = 0; + ps_rd_model->model_coeff_c_lin_wo_int = 0; +} + +void irc_reset_frm_rc_rd_model(rc_rd_model_t *ps_rd_model) +{ + ps_rd_model->u1_num_frms_in_model = 0; + ps_rd_model->u1_curr_frm_counter = 0; + + ps_rd_model->model_coeff_a_lin_wo_int = 0; + 
ps_rd_model->model_coeff_b_lin_wo_int = 0; + ps_rd_model->model_coeff_c_lin_wo_int = 0; +} + +static UWORD8 find_model_coeffs(UWORD32 *pi4_res_bits, + UWORD32 *pi4_sad_h264, + UWORD8 *pu1_num_skips, + UWORD8 *pui_avg_mpeg2_qp, + UWORD8 u1_num_frms, + UWORD8 u1_model_used, + WORD8 *pi1_frame_index, + model_coeff *pmc_model_coeff, + model_coeff *pmc_model_coeff_lin, + model_coeff *pmc_model_coeff_lin_wo_int, + rc_rd_model_t *ps_rd_model) +{ + UWORD32 i; + UWORD8 u1_num_frms_used = 0; + UWORD8 u1_frm_indx; + +#if !(ENABLE_QUAD_RC_MODEL||ENABLE_LIN_MODEL_WITH_INTERCEPT) + UNUSED(pu1_num_skips); + UNUSED(pmc_model_coeff); + UNUSED(pmc_model_coeff_lin); +#endif + float sum_y = 0; + float sum_x_y = 0; + float sum_x2_y = 0; + float sum_x = 0; + float sum_x2 = 0; + float sum_x3 = 0; + float sum_x4 = 0; + + float x0, y0; + float model_coeff_a = 0.0, model_coeff_b = 0.0, model_coeff_c = 0.0; + + for(i = 0; i < u1_num_frms; i++) + { + if(-1 == pi1_frame_index[i]) + continue; + + u1_frm_indx = (UWORD8)pi1_frame_index[i]; + + y0 = (float)(pi4_res_bits[u1_frm_indx]); + x0 = (float)(pi4_sad_h264[u1_frm_indx] + / (float)pui_avg_mpeg2_qp[u1_frm_indx]); + + sum_y += y0; + sum_x_y += x0 * y0; + sum_x2_y += x0 * x0 * y0; + sum_x += x0; + sum_x2 += x0 * x0; + sum_x3 += x0 * x0 * x0; + sum_x4 += x0 * x0 * x0 * x0; + u1_num_frms_used++; + } + + sum_y /= u1_num_frms_used; + sum_x_y /= u1_num_frms_used; + sum_x2_y /= u1_num_frms_used; + sum_x /= u1_num_frms_used; + sum_x2 /= u1_num_frms_used; + sum_x3 /= u1_num_frms_used; + sum_x4 /= u1_num_frms_used; + + { + UWORD8 u1_curr_frame_index; + UWORD8 u1_avgqp_prvfrm; + UWORD32 u4_prevfrm_bits, u4_prevfrm_sad; + + u1_curr_frame_index = ps_rd_model->u1_curr_frm_counter; + if(0 == u1_curr_frame_index) + u1_curr_frame_index = (MAX_FRAMES_MODELLED - 1); + else + u1_curr_frame_index--; + + u1_avgqp_prvfrm = ps_rd_model->pu1_avg_qp[u1_curr_frame_index]; + u4_prevfrm_bits = ps_rd_model->pi4_res_bits[u1_curr_frame_index]; + u4_prevfrm_sad = 
ps_rd_model->pi4_sad[u1_curr_frame_index]; + + if(0 != u4_prevfrm_sad) + model_coeff_a = (float)(u4_prevfrm_bits * u1_avgqp_prvfrm) + / u4_prevfrm_sad; + else + model_coeff_a = 0; + + model_coeff_b = 0; + model_coeff_c = 0; + + pmc_model_coeff_lin_wo_int[0] = model_coeff_b; + pmc_model_coeff_lin_wo_int[1] = model_coeff_a; + pmc_model_coeff_lin_wo_int[2] = model_coeff_c; + } + + return u1_model_used; +} + +static void irc_update_frame_rd_model(rc_rd_model_t *ps_rd_model) +{ + WORD8 pi1_frame_index[MAX_FRAMES_MODELLED], + pi1_frame_index_initial[MAX_FRAMES_MODELLED]; + + UWORD8 u1_num_skips_temp; + UWORD8 u1_avg_mpeg2_qp_temp, u1_min_mpeg2_qp, u1_max_mpeg2_qp; + UWORD8 u1_num_frms_input, u1_num_active_frames, u1_reject_frame; + UWORD32 u4_num_skips; + + UWORD8 u1_min2_mpeg2_qp, u1_max2_mpeg2_qp; + UWORD8 u1_min_qp_frame_indx, u1_max_qp_frame_indx; + UWORD8 pu1_num_frames[MPEG2_QP_ELEM]; + model_coeff model_coeff_array[3], model_coeff_array_lin[3], + model_coeff_array_lin_wo_int[3]; + UWORD32 i; + UWORD8 u1_curr_frame_index; + + u1_curr_frame_index = ps_rd_model->u1_curr_frm_counter; + + ps_rd_model->u1_model_used = PREV_FRAME_MODEL; + + if(0 == u1_curr_frame_index) + u1_curr_frame_index = (MAX_FRAMES_MODELLED - 1); + else + u1_curr_frame_index--; + + /************************************************************************/ + /* Rearrange data to be fed into a Linear Regression Module */ + /* Module finds a,b,c such that */ + /* y = ax + bx^2 + c */ + /************************************************************************/ + u4_num_skips = 0; + u1_num_frms_input = 0; + memset(pu1_num_frames, 0, MPEG2_QP_ELEM); + memset(pi1_frame_index, -1, MAX_FRAMES_MODELLED); + u1_min_mpeg2_qp = MAX_MPEG2_QP; + u1_max_mpeg2_qp = 0; + + u1_num_active_frames = ps_rd_model->u1_num_frms_in_model; + if(u1_num_active_frames > MAX_ACTIVE_FRAMES) + { + u1_num_active_frames = MAX_ACTIVE_FRAMES; + } + + /************************************************************************/ + /* Choose 
the set of Points to be used for MSE fit of Quadratic model */ + /* Points chosen are spread across the Qp range. Max of 2 points are */ + /* chosen for a Qp. */ + /************************************************************************/ + for(i = 0; i < u1_num_active_frames; i++) + { + u1_reject_frame = 0; + u1_num_skips_temp = ps_rd_model->pu1_num_skips[u1_curr_frame_index]; + u1_avg_mpeg2_qp_temp = ps_rd_model->pu1_avg_qp[u1_curr_frame_index]; + + if((0 == u4_num_skips) && (0 != u1_num_skips_temp)) + u1_reject_frame = 1; + if((1 == u4_num_skips) && (u1_num_skips_temp > 1)) + u1_reject_frame = 1; + if(pu1_num_frames[u1_avg_mpeg2_qp_temp] >= 2) + u1_reject_frame = 1; + + if(0 == i) + u1_reject_frame = 0; + + if(0 == u1_reject_frame) + { + pi1_frame_index[u1_num_frms_input] = (WORD8)u1_curr_frame_index; + pu1_num_frames[u1_avg_mpeg2_qp_temp] += 1; + + if(u1_min_mpeg2_qp > u1_avg_mpeg2_qp_temp) + u1_min_mpeg2_qp = u1_avg_mpeg2_qp_temp; + if(u1_max_mpeg2_qp < u1_avg_mpeg2_qp_temp) + u1_max_mpeg2_qp = u1_avg_mpeg2_qp_temp; + + u1_num_frms_input++; + } + + if(0 == u1_curr_frame_index) + u1_curr_frame_index = (MAX_FRAMES_MODELLED - 1); + else + u1_curr_frame_index--; + } + + /************************************************************************/ + /* Add Pivot Points to the Data set to be used for finding Quadratic */ + /* Model Coeffs. These will help in constraining the shape of Quadratic*/ + /* to adapt too much to the Local deviations. 
*/ + /************************************************************************/ + u1_min2_mpeg2_qp = u1_min_mpeg2_qp; + u1_max2_mpeg2_qp = u1_max_mpeg2_qp; + u1_min_qp_frame_indx = INVALID_FRAME_INDEX; + u1_max_qp_frame_indx = INVALID_FRAME_INDEX; + + /* Loop runnning over the Stored Frame Level Data + to find frames of MinQp and MaxQp */ + for(; i < ps_rd_model->u1_num_frms_in_model; i++) + { + u1_num_skips_temp = ps_rd_model->pu1_num_skips[u1_curr_frame_index]; + u1_avg_mpeg2_qp_temp = ps_rd_model->pu1_avg_qp[u1_curr_frame_index]; + + if(((0 == u4_num_skips) && (0 != u1_num_skips_temp)) + || ((1 == u4_num_skips) && (u1_num_skips_temp > 1))) + continue; + + if(u1_min2_mpeg2_qp > u1_avg_mpeg2_qp_temp) + { + u1_min2_mpeg2_qp = u1_avg_mpeg2_qp_temp; + u1_min_qp_frame_indx = u1_curr_frame_index; + } + if(u1_max2_mpeg2_qp < u1_avg_mpeg2_qp_temp) + { + u1_max2_mpeg2_qp = u1_avg_mpeg2_qp_temp; + u1_max_qp_frame_indx = u1_curr_frame_index; + } + if(0 == u1_curr_frame_index) + u1_curr_frame_index = (MAX_FRAMES_MODELLED - 1); + else + u1_curr_frame_index--; + } + + /* Add the Chosen Points to the regression data set */ + if(INVALID_FRAME_INDEX != u1_min_qp_frame_indx) + { + pi1_frame_index[u1_num_frms_input] = (WORD8)u1_min_qp_frame_indx; + u1_num_frms_input++; + } + if(INVALID_FRAME_INDEX != u1_max_qp_frame_indx) + { + pi1_frame_index[u1_num_frms_input] = (WORD8)u1_max_qp_frame_indx; + u1_num_frms_input++; + } + memcpy(pi1_frame_index_initial, pi1_frame_index, MAX_FRAMES_MODELLED); + + /***** Call the Module to Return the Coeffs for the Fed Data *****/ + ps_rd_model->u1_model_used = find_model_coeffs(ps_rd_model->pi4_res_bits, + ps_rd_model->pi4_sad, + ps_rd_model->pu1_num_skips, + ps_rd_model->pu1_avg_qp, + u1_num_frms_input, + ps_rd_model->u1_model_used, + pi1_frame_index, + model_coeff_array, + model_coeff_array_lin, + model_coeff_array_lin_wo_int, + ps_rd_model); + + ps_rd_model->model_coeff_b_lin_wo_int = model_coeff_array_lin_wo_int[0]; + 
ps_rd_model->model_coeff_a_lin_wo_int = model_coeff_array_lin_wo_int[1]; + ps_rd_model->model_coeff_c_lin_wo_int = model_coeff_array_lin_wo_int[2]; +} + +UWORD32 irc_estimate_bits_for_qp(rc_rd_model_t *ps_rd_model, + UWORD32 u4_estimated_sad, + UWORD8 u1_avg_qp) +{ + float fl_num_bits = 0; + + fl_num_bits = ps_rd_model->model_coeff_a_lin_wo_int + * ((float)(u4_estimated_sad / u1_avg_qp)); + + return ((UWORD32)fl_num_bits); +} + +UWORD8 irc_find_qp_for_target_bits(rc_rd_model_t *ps_rd_model, + UWORD32 u4_target_res_bits, + UWORD32 u4_estimated_sad, + UWORD8 u1_min_qp, + UWORD8 u1_max_qp) +{ + UWORD8 u1_qp; + float x_value = 1.0, f_qp; + + ps_rd_model->u1_model_used = PREV_FRAME_MODEL; + + { + x_value = (float)u4_target_res_bits + / ps_rd_model->model_coeff_a_lin_wo_int; + } + + if(0 != x_value) + f_qp = u4_estimated_sad / x_value; + else + f_qp = 255; + + if(f_qp > 255) + f_qp = 255; + + /* Truncating the QP to the Max and Min Qp values possible */ + if(f_qp < u1_min_qp) + f_qp = u1_min_qp; + if(f_qp > u1_max_qp) + f_qp = u1_max_qp; + + u1_qp = (UWORD8)(f_qp + 0.5); + + return u1_qp; +} + +void irc_add_frame_to_rd_model(rc_rd_model_t *ps_rd_model, + UWORD32 i4_res_bits, + UWORD8 u1_avg_mp2qp, + UWORD32 i4_sad_h264, + UWORD8 u1_num_skips) +{ + UWORD8 u1_curr_frame_index; + u1_curr_frame_index = ps_rd_model->u1_curr_frm_counter; + + /*Insert the Present Frame Data into the RD Model State Memory*/ + ps_rd_model->pi4_res_bits[u1_curr_frame_index] = i4_res_bits; + ps_rd_model->pi4_sad[u1_curr_frame_index] = i4_sad_h264; + ps_rd_model->pu1_num_skips[u1_curr_frame_index] = u1_num_skips; + ps_rd_model->pu1_avg_qp[u1_curr_frame_index] = u1_avg_mp2qp; + + ps_rd_model->u1_curr_frm_counter++; + if(MAX_FRAMES_MODELLED == ps_rd_model->u1_curr_frm_counter) + ps_rd_model->u1_curr_frm_counter = 0; + + if(ps_rd_model->u1_num_frms_in_model < ps_rd_model->u1_max_frms_to_model) + { + ps_rd_model->u1_num_frms_in_model++; + } + irc_update_frame_rd_model(ps_rd_model); +} + 
+/*****************************************************************************
+ *Function Name : irc_calc_per_frm_bits
+ *Description   : Refreshes the R-D model of every pic-type flagged for
+ *                update, then solves the combined linear (no intercept)
+ *                model for the bits and the QP of the current picture.
+ *Inputs        : ps_rd_model
+ *                  -  array of R-D models, indexed by pic-type
+ *                pu2_num_pics_of_a_pic_type
+ *                  -  N1, N2,...Nk
+ *                pu1_update_pic_type_model
+ *                  -  flag which tells whether or not to update model
+ *                     coefficients of a particular pic-type
+ *                u1_num_pic_types
+ *                  - value of k
+ *                pu4_num_skip_of_a_pic_type
+ *                  - the number of skips of that pic-type. It "may" be used to
+ *                    update the model coefficients at a later point. Right now
+ *                    it is not being used at all.
+ *                u1_base_pic_type
+ *                  - base pic type index wrt which alpha & beta are calculated
+ *                    (currently unused)
+ *                pfl_gamma
+ *                  - gamma_i = beta_i / alpha_i
+ *                pfl_eta
+ *                  - per pic-type weight; enters the sub-gop sum as
+ *                    (eta_i + N_i - 1) and scales the SAD/QP of the current
+ *                    picture
+ *                u1_curr_pic_type
+ *                  - the current pic-type for which the targetted bits need to
+ *                    be computed
+ *                u4_bits_for_sub_gop
+ *                  - the number of bits to be consumed for the remaining part of
+ *                    sub-gop
+ *                u4_curr_estimated_sad
+ *                  - estimated SAD of the current picture; divided by the
+ *                    solved SAD/QP to obtain the QP
+ *                pu1_curr_pic_type_qp
+ *                  -  output of this function (QP, saturated at 255)
+ *Returns       : bits allotted to the current picture, rounded to nearest.
+ *                When the combined coefficient is exactly zero the model
+ *                cannot be solved: 0 bits are returned and the QP is 255.
+ *****************************************************************************/
+
+WORD32 irc_calc_per_frm_bits(rc_rd_model_t *ps_rd_model,
+                             UWORD16 *pu2_num_pics_of_a_pic_type,
+                             UWORD8 *pu1_update_pic_type_model,
+                             UWORD8 u1_num_pic_types,
+                             UWORD32 *pu4_num_skip_of_a_pic_type,
+                             UWORD8 u1_base_pic_type,
+                             float *pfl_gamma,
+                             float *pfl_eta,
+                             UWORD8 u1_curr_pic_type,
+                             UWORD32 u4_bits_for_sub_gop,
+                             UWORD32 u4_curr_estimated_sad,
+                             UWORD8 *pu1_curr_pic_type_qp)
+{
+    WORD32 i4_per_frm_bits_Ti;
+    UWORD8 u1_i;
+    rc_rd_model_t *ps_rd_model_of_pic_type;
+
+    UNUSED(pu4_num_skip_of_a_pic_type);
+    UNUSED(u1_base_pic_type);
+
+    /* First part of this function updates all the model coefficients */
+    /* for all the pic-types */
+    {
+        for(u1_i = 0; u1_i < u1_num_pic_types; u1_i++)
+        {
+            if((0 != pu2_num_pics_of_a_pic_type[u1_i])
+                            && (1 == pu1_update_pic_type_model[u1_i]))
+            {
+                irc_update_frame_rd_model(&ps_rd_model[u1_i]);
+            }
+        }
+    }
+
+    /*
+     * The second part of this function deals with solving the
+     * equation using all the pic-types models
+     */
+    {
+        UWORD8 u1_combined_model_used;
+
+        /* solve the equation */
+        {
+            model_coeff eff_A;
+            float fl_sad_by_qp_base;
+            float fl_sad_by_qp_curr_frm = 1.0;
+            float fl_qp_curr_frm;
+            float fl_bits_for_curr_frm = 0;
+
+            /* The combined chosen model is linear model without an intercept */
+            u1_combined_model_used = PREV_FRAME_MODEL;
+            {
+                eff_A = 0.0;
+
+                for(u1_i = 0; u1_i < u1_num_pic_types; u1_i++)
+                {
+                    ps_rd_model_of_pic_type = ps_rd_model + u1_i;
+
+                    eff_A += ((pfl_eta[u1_i]
+                                    + pu2_num_pics_of_a_pic_type[u1_i]- 1)
+                                    * ps_rd_model_of_pic_type->model_coeff_a_lin_wo_int
+                                    * pfl_gamma[u1_i]);
+                }
+
+                ps_rd_model_of_pic_type = ps_rd_model + u1_curr_pic_type;
+
+                if(eff_A != 0)
+                {
+                    fl_sad_by_qp_base = u4_bits_for_sub_gop / eff_A;
+
+                    fl_sad_by_qp_curr_frm = fl_sad_by_qp_base
+                                    * pfl_gamma[u1_curr_pic_type]
+                                    * pfl_eta[u1_curr_pic_type];
+
+                    fl_bits_for_curr_frm =
+                                    ps_rd_model_of_pic_type->model_coeff_a_lin_wo_int
+                                                    * fl_sad_by_qp_curr_frm;
+                }
+                else
+                {
+                    /*
+                     * Degenerate model: dividing by zero would produce
+                     * inf/NaN, and converting that to WORD32 below is
+                     * undefined. Report no bits and let the existing
+                     * "SAD/QP not positive" path pick the maximum QP.
+                     */
+                    fl_sad_by_qp_curr_frm = 0;
+                    fl_bits_for_curr_frm = 0;
+                }
+            }
+
+            /*
+             * Store the model that was finally used to calculate Qp.
+             * This is so that the same model is used in further calculations
+             * for this picture.
+             */
+            ps_rd_model_of_pic_type = ps_rd_model + u1_curr_pic_type;
+            ps_rd_model_of_pic_type->u1_model_used = u1_combined_model_used;
+
+            i4_per_frm_bits_Ti = (WORD32)(fl_bits_for_curr_frm + 0.5);
+
+            if(fl_sad_by_qp_curr_frm > 0)
+                fl_qp_curr_frm = (float)u4_curr_estimated_sad
+                                / fl_sad_by_qp_curr_frm;
+            else
+                fl_qp_curr_frm = 255;
+
+            if(fl_qp_curr_frm > 255)
+                fl_qp_curr_frm = 255;
+
+            /* Round to nearest; the value is already limited to 255 */
+            *pu1_curr_pic_type_qp = (UWORD8)(fl_qp_curr_frm + 0.5);
+
+        }
+    }
+    return (i4_per_frm_bits_Ti);
+}
+
+/* Returns the coefficient 'a' of the linear (no intercept) model */
+model_coeff irc_get_linear_coefficient(rc_rd_model_t *ps_rd_model)
+{
+    return (ps_rd_model->model_coeff_a_lin_wo_int);
+}
+
+
diff --git a/encoder/irc_rd_model.h b/encoder/irc_rd_model.h
new file mode 100755
index 0000000..8be31c1
--- /dev/null
+++ b/encoder/irc_rd_model.h
@@ -0,0 +1,98 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/*****************************************************************************/
+/* File Name         : irc_rd_model.h                                        */
+/*                                                                           */
+/* Description       : Implements all the Functions to Model the             */
+/*                     Rate Distortion Behaviour of the Codec over the Last  */
+/*                     Few Frames.
*/
+/*                                                                           */
+/* List of Functions : irc_update_frame_rd_model                             */
+/*                     estimate_mpeg2_qp_for_resbits                         */
+/*                     update_mb_rd_model                                    */
+/*                     find_model_coeffs                                     */
+/*                     refine_set_of_points                                  */
+/*                     init_mb_rd_model                                      */
+/*                     irc_add_frame_to_rd_model                             */
+/*                     irc_find_qp_for_target_bits                           */
+/*                                                                           */
+/* NOTE(review)      : several names listed above are not declared in this   */
+/*                     header - the list looks stale; confirm against        */
+/*                     irc_rd_model.c                                        */
+/*                                                                           */
+/* Issues / Problems : None                                                  */
+/*                                                                           */
+/* Revision History  :                                                       */
+/*        DD MM YYYY   Author(s)   Changes (Describe the changes made)       */
+/*        21 06 2006   Sarat       Initial Version                           */
+/*****************************************************************************/
+
+#ifndef RC_RD_MODEL
+#define RC_RD_MODEL
+
+/* Size of the per-frame history arrays held in rc_rd_model_t */
+#define MAX_FRAMES_MODELLED 16
+
+/* Type used for every R-D model coefficient */
+typedef float model_coeff;
+/* Opaque handle; the structure itself lives in irc_rd_model_struct.h */
+typedef struct rc_rd_model_t *rc_rd_model_handle;
+
+/* Memtab query/fill/use/free entry point for the R-D model state */
+WORD32 irc_rd_model_num_fill_use_free_memtab(rc_rd_model_handle *pps_rc_rd_model,
+                                             itt_memtab_t *ps_memtab,
+                                             ITT_FUNC_TYPE_E e_func_type);
+/* Interface Functions */
+/* Initialise the rate distortion model */
+void irc_init_frm_rc_rd_model(rc_rd_model_handle ps_rd_model,
+                              UWORD8 u1_max_frames_modelled);
+
+/* Reset the rate distortion model */
+void irc_reset_frm_rc_rd_model(rc_rd_model_handle ps_rd_model);
+
+/* Returns the Qp to be used for the given bits and SAD */
+UWORD8 irc_find_qp_for_target_bits(rc_rd_model_handle ps_rd_model,
+                                   UWORD32 u4_target_res_bits,
+                                   UWORD32 u4_estimated_sad,
+                                   UWORD8 u1_max_qp,
+                                   UWORD8 u1_min_qp);
+
+/* Updates the frame level statistics after encoding a frame */
+void irc_add_frame_to_rd_model(rc_rd_model_handle ps_rd_model,
+                               UWORD32 i4_res_bits,
+                               UWORD8 u1_avg_mp2qp,
+                               UWORD32 i4_sad_h264,
+                               UWORD8 u1_num_skips);
+
+/* Presumably the inverse query: bits expected for a given SAD and QP (TODO confirm) */
+UWORD32 irc_estimate_bits_for_qp(rc_rd_model_handle ps_rd_model,
+                                 UWORD32 u4_estimated_sad,
+                                 UWORD8 u1_avg_qp);
+
+/* Get the Linear model coefficient */
+model_coeff irc_get_linear_coefficient(rc_rd_model_handle ps_rd_model);
+
+/*
+ * Updates the flagged per-pic-type models, then returns the bits for the
+ * current picture and writes its QP (at most 255) to pu1_curr_pic_type_qp.
+ * ps_rd_model is indexed by pic-type, so it must point to an array of
+ * u1_num_pic_types models.
+ */
+WORD32 irc_calc_per_frm_bits(rc_rd_model_handle ps_rd_model,
+                             UWORD16 *pu2_num_pics_of_a_pic_type,
+                             UWORD8 *pu1_update_pic_type_model,
+                             UWORD8 u1_num_pic_types,
+                             UWORD32 *pu4_num_skip_of_a_pic_type,
+                             UWORD8 u1_base_pic_type,
+                             float *pfl_gamma,
+                             float *pfl_eta,
+                             UWORD8 u1_curr_pic_type,
+                             UWORD32 u4_bits_for_sub_gop,
+                             UWORD32 u4_curr_estimated_sad,
+                             UWORD8 *pu1_curr_pic_type_qp);
+#endif
+
diff --git a/encoder/irc_rd_model_struct.h b/encoder/irc_rd_model_struct.h
new file mode 100755
index 0000000..dc4c0ea
--- /dev/null
+++ b/encoder/irc_rd_model_struct.h
@@ -0,0 +1,75 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore
+*/
+
+#ifndef RC_RD_MODEL_STRUCT
+#define RC_RD_MODEL_STRUCT
+
+/* Enable or disable the QUAD model */
+#define ENABLE_QUAD_RC_MODEL 0
+#define ENABLE_LIN_MODEL_WITH_INTERCEPT 0
+
+/* Number of elements for QP */
+/* NOTE(review): MAX_MPEG2_QP is not defined in this header - it must come */
+/* from a header included earlier; confirm which one */
+#define MPEG2_QP_ELEM (MAX_MPEG2_QP + 1)
+
+
+#if ENABLE_QUAD_RC_MODEL
+#define QUAD 1
+#define MIN_FRAMES_FOR_QUAD_MODEL 5
+#endif
+
+#define MAX_ACTIVE_FRAMES 16
+#define MIN_FRAMES_FOR_LIN_MODEL 3
+#define INVALID_FRAME_INDEX 255
+
+/* Thresholds are stored as mantissa (_SM) and power-of-two exponent (_E) */
+#define UP_THR_SM 1 /* 1 / pow(2,4) = 0.0625 */
+#define UP_THR_E 4
+
+#define LO_THR_SM 368 /* (368.64 / pow(2,14)) = 0.0225 */
+#define LO_THR_E 14
+
+#define LIN_DEV_THR_SM 1 /* 1 / pow(2,2) = 0.25 */
+#define LIN_DEV_THR_E 2
+
+/* Value stored in u1_model_used by irc_calc_per_frm_bits */
+#define PREV_FRAME_MODEL 2
+
+/* Q Factors used for fixed point calculation */
+#define Q_FORMAT_GAMMA 8
+#define Q_FORMAT_ETA 8
+
+/*
+ * Rate-distortion model state of one pic-type.
+ * The arrays are sized with MAX_FRAMES_MODELLED, which irc_rd_model.h
+ * defines, so that header has to be included before this one.
+ */
+typedef struct rc_rd_model_t
+{
+    UWORD8 u1_curr_frm_counter;
+    UWORD8 u1_num_frms_in_model;
+    UWORD8 u1_max_frms_to_model;
+    /* Model that was last used to derive a QP for this pic-type */
+    UWORD8 u1_model_used;
+
+    /* Per-frame history; presumably one entry per modelled frame (TODO confirm) */
+    UWORD32 pi4_res_bits[MAX_FRAMES_MODELLED];
+    UWORD32 pi4_sad[MAX_FRAMES_MODELLED];
+
+    UWORD8 pu1_num_skips[MAX_FRAMES_MODELLED];
+    UWORD8 pu1_avg_qp[MAX_FRAMES_MODELLED];
+    UWORD8 au1_num_frames[MPEG2_QP_ELEM];
+
+    /* Linear model without intercept: bits = a * (SAD / QP) */
+    model_coeff model_coeff_a_lin_wo_int;
+    model_coeff model_coeff_b_lin_wo_int;
+    model_coeff model_coeff_c_lin_wo_int;
+} rc_rd_model_t;
+
+#endif /* RC_RD_MODEL_STRUCT */
diff --git a/encoder/irc_trace_support.h b/encoder/irc_trace_support.h
new file mode 100755
index 0000000..c35bd4f
--- /dev/null
+++ b/encoder/irc_trace_support.h
@@ -0,0 +1,61 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+*  irc_trace_support.h
+*
+* @brief
+*  This file contains extern declarations of routines that could be helpful
+*  for debugging purposes.
+*
+* @author
+*  Harish
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+
+#ifndef TRACE_SUPPORT_H_
+#define TRACE_SUPPORT_H_
+
+/*****************************************************************************/
+/* Structures                                                                */
+/*****************************************************************************/
+
+/* Trace buffer descriptor */
+/* NOTE(review): field meanings below are read off the names only; confirm */
+/* against the implementation of init_trace_support / trace_printf */
+typedef struct
+{
+    WORD8 * pu1_buf;        /* presumably the start of the trace buffer */
+    WORD32 i4_offset;       /* presumably the current write offset */
+    WORD32 i4_max_size;     /* presumably the capacity of pu1_buf */
+}trace_support_t;
+
+/*****************************************************************************/
+/* Extern function declarations                                              */
+/*****************************************************************************/
+
+void init_trace_support(WORD8 *pu1_buf, WORD32 i4_size);
+
+int trace_printf(const WORD8 *format, ...);
+
+#endif // TRACE_SUPPORT_H_
diff --git a/encoder/irc_vbr_storage_vbv.c b/encoder/irc_vbr_storage_vbv.c
new file mode 100755
index 0000000..23e9959
--- /dev/null
+++ b/encoder/irc_vbr_storage_vbv.c
@@ -0,0 +1,368 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the
Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/*****************************************************************************/
+/* Includes                                                                  */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+
+/* User include files */
+#include "irc_datatypes.h"
+#include "irc_common.h"
+#include "irc_cntrl_param.h"
+#include "irc_mem_req_and_acq.h"
+#include "irc_fixed_point_error_bits.h"
+#include "irc_vbr_storage_vbv.h"
+#include "irc_trace_support.h"
+
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+
+/* Limits of WORD32, spelt without converting an out-of-range constant */
+#define VBV_WORD32_MAX ((WORD32)0x7fffffff)
+#define VBV_WORD32_MIN ((WORD32)(-0x7fffffff - 1))
+
+typedef struct vbr_storage_vbv_t
+{
+    WORD32 i4_max_buf_size;
+    WORD32 i4_cur_buf_size;
+    WORD32 i4_max_bits_inflow_per_frm_period;
+    WORD32 i4_max_bits_per_tgt_frm;
+    /* Storing input variables */
+    WORD32 i4_max_bit_rate;
+    WORD32 i4_max_frame_rate;
+    /* Error bits calculation module */
+    error_bits_handle ps_error_bits;
+
+} vbr_storage_vbv_t;
+
+/* Adds i4_input to pi4_accumulator[0], saturating at the WORD32 limits */
+static void overflow_avoided_summation(WORD32 *pi4_accumulator, WORD32 i4_input)
+{
+    if((pi4_accumulator[0] > 0)
+                    && ((VBV_WORD32_MAX - pi4_accumulator[0]) < i4_input))
+    {
+        pi4_accumulator[0] = VBV_WORD32_MAX;
+    }
+    else if((pi4_accumulator[0] < 0)
+                    && ((VBV_WORD32_MIN - pi4_accumulator[0]) > i4_input))
+    {
+        pi4_accumulator[0] = VBV_WORD32_MIN;
+    }
+    else
+    {
+        pi4_accumulator[0] += i4_input;
+    }
+}
+
+/*
+ * Returns i4_value * i4_percent / 100 (truncated towards zero) without
+ * forming the full product, so that it cannot overflow WORD32 for
+ * 0 <= i4_percent <= 100.
+ */
+static WORD32 percent_of(WORD32 i4_value, WORD32 i4_percent)
+{
+    return ((i4_value / 100) * i4_percent)
+                    + (((i4_value % 100) * i4_percent) / 100);
+}
+
+/*
+ * Buffer level just before the next frame is decoded: the current level plus
+ * one frame period of inflow (including the fixed point error bits), limited
+ * to the maximum buffer size. The state is not modified.
+ */
+static WORD32 get_buf_size_after_inflow(vbr_storage_vbv_t *ps_vbr_storage_vbv)
+{
+    WORD32 i4_error_bits = irc_get_error_bits(
+                    ps_vbr_storage_vbv->ps_error_bits);
+    WORD32 i4_buf_size = ps_vbr_storage_vbv->i4_cur_buf_size;
+
+    overflow_avoided_summation(
+                    &i4_buf_size,
+                    (ps_vbr_storage_vbv->i4_max_bits_inflow_per_frm_period
+                                    + i4_error_bits));
+
+    if(i4_buf_size > ps_vbr_storage_vbv->i4_max_buf_size)
+    {
+        i4_buf_size = ps_vbr_storage_vbv->i4_max_buf_size;
+    }
+
+    return i4_buf_size;
+}
+
+/* Memtab query/fill/use/free entry point; returns the number of memtabs */
+WORD32 irc_vbr_vbv_num_fill_use_free_memtab(vbr_storage_vbv_t **pps_vbr_storage_vbv,
+                                            itt_memtab_t *ps_memtab,
+                                            ITT_FUNC_TYPE_E e_func_type)
+{
+    WORD32 i4_mem_tab_idx = 0;
+    static vbr_storage_vbv_t s_vbr_storage_vbv_temp;
+
+    /*
+     * Hack for all alloc, during which we don't have any state memory.
+     * Dereferencing can cause issues
+     */
+    if(e_func_type == GET_NUM_MEMTAB || e_func_type == FILL_MEMTAB)
+        (*pps_vbr_storage_vbv) = &s_vbr_storage_vbv_temp;
+
+    /*for src rate control state structure*/
+    if(e_func_type != GET_NUM_MEMTAB)
+    {
+        fill_memtab(&ps_memtab[i4_mem_tab_idx], sizeof(vbr_storage_vbv_t),
+                    ALIGN_128_BYTE, PERSISTENT, DDR);
+        use_or_fill_base(&ps_memtab[i4_mem_tab_idx],
+                         (void**)pps_vbr_storage_vbv, e_func_type);
+    }
+    i4_mem_tab_idx++;
+
+    i4_mem_tab_idx += irc_error_bits_num_fill_use_free_memtab(
+                    &pps_vbr_storage_vbv[0]->ps_error_bits,
+                    &ps_memtab[i4_mem_tab_idx], e_func_type);
+    return (i4_mem_tab_idx);
+}
+
+/* Initialises the VBV state; the buffer starts completely full */
+void irc_init_vbr_vbv(vbr_storage_vbv_t *ps_vbr_storage_vbv,
+                      WORD32 i4_max_bit_rate,
+                      WORD32 i4_frm_rate,
+                      WORD32 i4_max_vbv_buff_size)
+{
+    ps_vbr_storage_vbv->i4_max_buf_size = i4_max_vbv_buff_size;
+    ps_vbr_storage_vbv->i4_cur_buf_size = i4_max_vbv_buff_size;
+
+    /*
+     * Calculate the max number of bits that flow into the decoder
+     * in the interval of two frames
+     */
+    X_PROD_Y_DIV_Z(i4_max_bit_rate, 1000, i4_frm_rate,
+                   ps_vbr_storage_vbv->i4_max_bits_inflow_per_frm_period);
+
+    /* init error bits */
+    irc_init_error_bits(ps_vbr_storage_vbv->ps_error_bits, i4_frm_rate,
+                        i4_max_bit_rate);
+
+    /* Storing the input values */
+    ps_vbr_storage_vbv->i4_max_bits_per_tgt_frm =
+                    ps_vbr_storage_vbv->i4_max_bits_inflow_per_frm_period;
+    ps_vbr_storage_vbv->i4_max_bit_rate = i4_max_bit_rate;
+    ps_vbr_storage_vbv->i4_max_frame_rate = i4_frm_rate;
+}
+
+/* Advances the buffer by one frame period and removes the decoded bits */
+void irc_update_vbr_vbv(vbr_storage_vbv_t *ps_vbr_storage_vbv,
+                        WORD32 i4_total_bits_decoded)
+{
+    /*
+     * In the time interval between two decoded frames the buffer would have been
+     * filled up by the max_bits_inflow_per_frm_period.
+     */
+    ps_vbr_storage_vbv->i4_cur_buf_size = get_buf_size_after_inflow(
+                    ps_vbr_storage_vbv);
+
+    ps_vbr_storage_vbv->i4_cur_buf_size -= i4_total_bits_decoded;
+
+    /* Update the error bits state */
+    irc_update_error_bits(ps_vbr_storage_vbv->ps_error_bits);
+
+}
+
+WORD32 irc_get_max_target_bits(vbr_storage_vbv_t *ps_vbr_storage_vbv)
+{
+    /*
+     * The buffer size when the next frame is decoded. Thus for the next
+     * frame the maximum number of bits the decoder can consume without
+     * underflow is this value
+     */
+    return get_buf_size_after_inflow(ps_vbr_storage_vbv);
+}
+
+/****************************************************************************
+ Function Name : irc_get_vbv_buffer_status
+ Description   : Gets the state of VBV buffer
+ Inputs        : Rate control API , header and texture bits
+ Outputs       : pi4_num_bits_to_prevent_vbv_underflow[0] receives the buffer
+                 level before the frame is removed
+ Returns       : vbv_buf_status_e
+ *****************************************************************************/
+vbv_buf_status_e irc_get_vbv_buffer_status(vbr_storage_vbv_t *ps_vbr_storage_vbv,
+                                           WORD32 i4_total_frame_bits,
+                                           WORD32 *pi4_num_bits_to_prevent_vbv_underflow)
+{
+    vbv_buf_status_e e_buf_status;
+    WORD32 i4_cur_buf;
+
+    /* includes the error bits due to fixed point computation of drain rate */
+    i4_cur_buf = get_buf_size_after_inflow(ps_vbr_storage_vbv);
+
+    pi4_num_bits_to_prevent_vbv_underflow[0] = i4_cur_buf;
+
+    i4_cur_buf -= i4_total_frame_bits;
+    if(i4_cur_buf < 0)
+    {
+        e_buf_status = VBV_UNDERFLOW;
+    }
+    else if(i4_cur_buf > ps_vbr_storage_vbv->i4_max_buf_size)
+    {
+        e_buf_status = VBV_OVERFLOW;
+    }
+    else if(i4_cur_buf < (ps_vbr_storage_vbv->i4_max_buf_size >> 2))
+    {
+        /* less than a quarter of the buffer would remain */
+        e_buf_status = VBR_CAUTION;
+    }
+    else
+    {
+        e_buf_status = VBV_NORMAL;
+    }
+
+    return e_buf_status;
+}
+
+/* Returns 0 while the buffer is below half of its maximum size, else 1 */
+UWORD8 irc_restrict_swing_dvd_comp(vbr_storage_vbv_t *ps_vbr_storage_vbv)
+{
+    UWORD8 u1_restrict_swing = 1;
+
+    if(ps_vbr_storage_vbv->i4_cur_buf_size
+                    < (ps_vbr_storage_vbv->i4_max_buf_size >> 1))
+    {
+        u1_restrict_swing = 0;
+    }
+
+    return (u1_restrict_swing);
+}
+
+WORD32 irc_get_max_vbv_buf_size(vbr_storage_vbv_t *ps_vbr_storage_vbv)
+{
+    return (ps_vbr_storage_vbv->i4_max_buf_size);
+}
+
+WORD32 irc_get_cur_vbv_buf_size(vbr_storage_vbv_t *ps_vbr_storage_vbv)
+{
+    return (ps_vbr_storage_vbv->i4_cur_buf_size);
+}
+
+WORD32 irc_get_max_bits_inflow_per_frm_periode(vbr_storage_vbv_t *ps_vbr_storage_vbv)
+{
+    return (ps_vbr_storage_vbv->i4_max_bits_inflow_per_frm_period);
+}
+
+WORD32 irc_get_max_bits_per_tgt_frm(vbr_storage_vbv_t *ps_vbr_storage_vbv)
+{
+    return (ps_vbr_storage_vbv->i4_max_bits_per_tgt_frm);
+}
+
+/* Buffer level that would remain after removing u4_bits from the next frame */
+WORD32 irc_vbv_get_vbv_buf_fullness(vbr_storage_vbv_t *ps_vbr_storage_vbv,
+                                    UWORD32 u4_bits)
+{
+    WORD32 i4_cur_buf_size = get_buf_size_after_inflow(ps_vbr_storage_vbv);
+
+    i4_cur_buf_size -= u4_bits;
+
+    return (i4_cur_buf_size);
+}
+
+/*
+ * Maximum target bits for the next picture such that the buffer ends at the
+ * desired fullness: 10% of the VBV size for an I picture, otherwise a level
+ * that moves towards 95% of the VBV size over the remaining frames of the
+ * gop. The result is never negative.
+ */
+WORD32 irc_get_max_tgt_bits_dvd_comp(vbr_storage_vbv_t *ps_vbr_storage_vbv,
+                                     WORD32 i4_rem_bits_in_gop,
+                                     WORD32 i4_rem_frms_in_gop,
+                                     picture_type_e e_pic_type)
+{
+    WORD32 i4_dbf_max, i4_dbf_min, i4_dbf_prev, i4_vbv_size, i4_dbf_desired;
+    WORD32 i4_max_tgt_bits;
+
+    i4_vbv_size = ps_vbr_storage_vbv->i4_max_buf_size;
+    /* percent_of() avoids the WORD32 overflow of (95 * size) on large buffers */
+    i4_dbf_max = percent_of(i4_vbv_size, 95);
+    i4_dbf_min = percent_of(i4_vbv_size, 10);
+    i4_dbf_prev = ps_vbr_storage_vbv->i4_cur_buf_size;
+
+    if(i4_rem_bits_in_gop < 0)
+        i4_rem_bits_in_gop = 0;
+    if(i4_rem_frms_in_gop <= 0)
+        i4_rem_frms_in_gop = 1;
+
+    if(e_pic_type == I_PIC)
+    {
+        i4_dbf_desired = i4_dbf_min;
+    }
+    else
+    {
+        i4_dbf_desired = (i4_dbf_max - i4_rem_bits_in_gop / i4_rem_frms_in_gop
+                        - i4_dbf_prev) / i4_rem_frms_in_gop;
+        i4_dbf_desired += i4_dbf_prev;
+    }
+
+    /* Saturating add, as everywhere else in this file */
+    overflow_avoided_summation(
+                    &i4_dbf_prev,
+                    ps_vbr_storage_vbv->i4_max_bits_inflow_per_frm_period);
+    if(i4_dbf_prev > ps_vbr_storage_vbv->i4_max_buf_size)
+    {
+        i4_dbf_prev = ps_vbr_storage_vbv->i4_max_buf_size;
+    }
+
+    i4_max_tgt_bits = MAX(0, (i4_dbf_prev - i4_dbf_desired));
+    return (i4_max_tgt_bits);
+}
+
+void irc_change_vbr_vbv_frame_rate(vbr_storage_vbv_t *ps_vbr_storage_vbv,
+                                   WORD32 i4_frm_rate)
+{
+    /*
+     * Calculate the max number of bits that flow into the decoder
+     * in the interval of two frames
+     */
+    X_PROD_Y_DIV_Z(ps_vbr_storage_vbv->i4_max_bit_rate, 1000, i4_frm_rate,
+                   ps_vbr_storage_vbv->i4_max_bits_inflow_per_frm_period);
+
+    /* Update the lower modules */
+    irc_change_frm_rate_in_error_bits(ps_vbr_storage_vbv->ps_error_bits,
+                                      i4_frm_rate);
+    /* Storing the input values */
+    ps_vbr_storage_vbv->i4_max_frame_rate = i4_frm_rate;
+}
+
+void irc_change_vbr_vbv_bit_rate(vbr_storage_vbv_t *ps_vbr_storage_vbv,
+                                 WORD32 i4_max_bit_rate)
+{
+    /*
+     * Calculate the max number of bits that flow into the decoder
+     * in the interval of two frames
+     */
+    X_PROD_Y_DIV_Z(i4_max_bit_rate, 1000,
+                   ps_vbr_storage_vbv->i4_max_frame_rate,
+                   ps_vbr_storage_vbv->i4_max_bits_inflow_per_frm_period);
+
+    /* update the lower modules */
+    irc_change_bitrate_in_error_bits(ps_vbr_storage_vbv->ps_error_bits,
+                                     i4_max_bit_rate);
+
+    /* Storing the input values */
+    ps_vbr_storage_vbv->i4_max_bit_rate = i4_max_bit_rate;
+}
+
+void irc_change_vbr_max_bits_per_tgt_frm(vbr_storage_vbv_t *ps_vbr_storage_vbv,
+                                         WORD32 i4_tgt_frm_rate)
+{
+    /*
+     * Calculate the max number of bits per frame at the target frame rate
+     */
+    X_PROD_Y_DIV_Z(ps_vbr_storage_vbv->i4_max_bit_rate, 1000, i4_tgt_frm_rate,
+                   ps_vbr_storage_vbv->i4_max_bits_per_tgt_frm);
+
+}
diff --git a/encoder/irc_vbr_storage_vbv.h b/encoder/irc_vbr_storage_vbv.h
new file mode 100755
index 0000000..c53c66d
--- /dev/null
+++ b/encoder/irc_vbr_storage_vbv.h
@@ -0,0 +1,119 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+#ifndef _VBR_STORAGE_VBV_H_
+#define _VBR_STORAGE_VBV_H_
+/******************************************************************************
+VBR STORAGE (VBV):
+Max. buffer filling rate: Rmax
+Max.
buffer size: Bmax (as specified by level and profile) +Current Buffer Level: Bcur +Frame Rate: F + +For a storage scenario, the initial buffer size is assumed to be max. For every +frame the Maximum bits filled in to the buffer is given by Rmaxfrm = Rmax/F. If +the buffer overflows then the buffer is thresholded to the max buffer size. + + (overflow) + B(0) /| +---|--------------/-|------------------------------ Bmax + | / | + | /|/ | + | /| / | + | / | /|/ | + |/ | / | /| + |/ |/ | + | + | +-----------------------|--------------------------- + |<->| | +(1/F)=>1/frame_rate (underflow) + + + B"(i) - Bits in buffer just before decoding a frame. + B'(i) - Bits in buffer just after decoding a frame. + + + B(0) (initBuffer size) = Bmax. + B'(i) = B"(i) - bits_decoded + B"(i) = Min( Bmax, B'(i-1) + Rmaxfrm) + +Overflow Scenario: In VBR case, since we have only a max filling rate (or input bit rate) +buffer overflow is not a issue (since the buffer filling rate can be reduced to any value +below this rate) + +Underflow Scenario: B'(i) should always be > 0. If not then, the buffer underflows. 
To
+prevent this condition the number of bits to be decoded must not exceed B"(i),
+which is equal to Min( Bmax, B'(i-1) + Rmaxfrm)
+****************************************************************************************/
+
+typedef struct vbr_storage_vbv_t* vbr_storage_vbv_handle;
+
+/* Memtab query/fill/use/free entry point; returns the number of memtabs */
+WORD32 irc_vbr_vbv_num_fill_use_free_memtab(vbr_storage_vbv_handle *pps_vbr_storage_vbv,
+                                            itt_memtab_t *ps_memtab,
+                                            ITT_FUNC_TYPE_E e_func_type);
+
+/* Initialises the vbv buffer status; the buffer starts at its maximum size */
+void irc_init_vbr_vbv(vbr_storage_vbv_handle ps_vbr_storage_vbv,
+                      WORD32 max_bit_rate, /* In bits/sec*/
+                      WORD32 max_frm_rate, /* In frames/1000 sec*/
+                      WORD32 i4_max_vbv_buff_size); /* in bits*/
+
+/* Updates the buffer after decoding a frame */
+void irc_update_vbr_vbv(vbr_storage_vbv_handle ps_vbr_storage_vbv,
+                        WORD32 i4_total_bits_decoded);
+
+/* gets the max_number of bits that can be decoded out of the VBV without underflow */
+WORD32 irc_get_max_target_bits(vbr_storage_vbv_handle ps_vbr_storage_vbv);
+
+/* Returns the max bits flowing into the buffer in one frame period */
+WORD32 irc_get_max_bits_inflow_per_frm_periode(vbr_storage_vbv_handle ps_vbr_storage_vbv);
+
+/* Returns the max bits per frame at the target frame rate */
+WORD32 irc_get_max_bits_per_tgt_frm(vbr_storage_vbv_handle ps_vbr_storage_vbv);
+
+/* Returns the current buffer level */
+WORD32 irc_get_cur_vbv_buf_size(vbr_storage_vbv_handle ps_vbr_storage_vbv);
+
+/* Queries the VBV buffer for the buffer status; the level available before */
+/* the frame is removed is written to pi4_num_bits_to_prevent_vbv_underflow[0] */
+vbv_buf_status_e irc_get_vbv_buffer_status(vbr_storage_vbv_handle ps_vbr_storage_vbv,
+                                           WORD32 i4_total_frame_bits,
+                                           WORD32 *pi4_num_bits_to_prevent_vbv_underflow);
+
+/* Returns 0 while the buffer is below half of its maximum size, else 1 */
+UWORD8 irc_restrict_swing_dvd_comp(vbr_storage_vbv_handle ps_vbr_storage_vbv);
+
+/* Returns the maximum buffer size */
+WORD32 irc_get_max_vbv_buf_size(vbr_storage_vbv_handle ps_vbr_storage_vbv);
+
+/* Returns the level left after one frame period of inflow minus u4_bits */
+WORD32 irc_vbv_get_vbv_buf_fullness(vbr_storage_vbv_handle ps_vbr_storage_vbv,
+                                    UWORD32 u4_bits);
+
+/* Returns the (non-negative) max target bits for the next picture */
+WORD32 irc_get_max_tgt_bits_dvd_comp(vbr_storage_vbv_handle ps_vbr_storage_vbv,
+                                     WORD32 i4_rem_bits_in_gop,
+                                     WORD32 i4_rem_frms_in_gop,
+                                     picture_type_e e_pic_type);
+
+/* Changing input values at run time */
+void irc_change_vbr_vbv_bit_rate(vbr_storage_vbv_handle ps_vbr_storage_vbv,
+                                 WORD32 i4_max_bit_rate);
+
+void irc_change_vbr_vbv_frame_rate(vbr_storage_vbv_handle ps_vbr_storage_vbv,
+                                   WORD32 i4_frm_rate);
+
+void irc_change_vbr_max_bits_per_tgt_frm(vbr_storage_vbv_handle ps_vbr_storage_vbv,
+                                         WORD32 i4_tgt_frm_rate);
+#endif
+
diff --git a/encoder/irc_vbr_str_prms.c b/encoder/irc_vbr_str_prms.c
new file mode 100755
index 0000000..29055c2
--- /dev/null
+++ b/encoder/irc_vbr_str_prms.c
@@ -0,0 +1,199 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* Includes                                                                  */
+/*****************************************************************************/
+
+/* System include files */
+#include <stdio.h>
+
+/* User include files */
+#include "irc_datatypes.h"
+#include "irc_cntrl_param.h"
+#include "irc_vbr_str_prms.h"
+
+/******************************************************************************
+ Function Name   : irc_init_vbv_str_prms
+ Description     : Stores the stream parameters and derives how many I and P
+                   frames fall inside the delay period
+ Return Values   : void
+ Note            : (u4_intra_frm_interval * u4_tgt_ticks) is used as a
+                   divisor and must therefore be non-zero
+ *****************************************************************************/
+void irc_init_vbv_str_prms(vbr_str_prms_t *p_vbr_str_prms,
+                           UWORD32 u4_intra_frm_interval,
+                           UWORD32 u4_src_ticks,
+                           UWORD32 u4_tgt_ticks,
+                           UWORD32 u4_frms_in_delay_period)
+{
+    UWORD32 u4_delay_span, u4_intra_span;
+    UWORD32 u4_num_i_frms, u4_num_p_frms;
+
+    p_vbr_str_prms->u4_frms_in_delay_prd = u4_frms_in_delay_period;
+    p_vbr_str_prms->u4_src_ticks = u4_src_ticks;
+    p_vbr_str_prms->u4_tgt_ticks = u4_tgt_ticks;
+    p_vbr_str_prms->u4_intra_frame_int = u4_intra_frm_interval;
+
+    /*
+     * The I / P frame counts of the delay period are used, together with the
+     * drain rates of the corresponding picture types, to calculate the
+     * buffer sizes. The I frame count is the delay span divided by the intra
+     * span, rounded up.
+     */
+    u4_delay_span = u4_frms_in_delay_period * u4_src_ticks;
+    u4_intra_span = u4_intra_frm_interval * u4_tgt_ticks;
+
+    u4_num_i_frms = u4_delay_span / u4_intra_span;
+    if((u4_delay_span % u4_intra_span) != 0)
+    {
+        u4_num_i_frms++;
+    }
+    u4_num_p_frms = u4_frms_in_delay_period - u4_num_i_frms;
+
+    p_vbr_str_prms->u4_num_pics_in_delay_prd[I_PIC] = u4_num_i_frms;
+    p_vbr_str_prms->u4_num_pics_in_delay_prd[P_PIC] = u4_num_p_frms;
+    p_vbr_str_prms->u4_intra_prd_pos_in_tgt_ticks =
+                    (u4_intra_frm_interval * u4_num_i_frms) * u4_tgt_ticks;
+    p_vbr_str_prms->u4_pic_num = 0;
+    p_vbr_str_prms->u4_cur_pos_in_src_ticks = 0;
+}
+
+/* Copies out the I / P counts and returns the frames in the delay period */
+WORD32 irc_get_vsp_num_pics_in_dly_prd(vbr_str_prms_t *p_vbr_str_prms,
+                                       UWORD32 *pu4_num_pics_in_delay_prd)
+{
+    const UWORD32 *pu4_counts = p_vbr_str_prms->u4_num_pics_in_delay_prd;
+
+    pu4_num_pics_in_delay_prd[I_PIC] = pu4_counts[I_PIC];
+    pu4_num_pics_in_delay_prd[P_PIC] = pu4_counts[P_PIC];
+    return (p_vbr_str_prms->u4_frms_in_delay_prd);
+}
+
+/******************************************************************************
+ Function Name   : irc_update_vbr_str_prms
+ Description     : update the number of I frames and P/B frames in the delay
+                   period for buffer size calculations
+ *****************************************************************************/
+void irc_update_vbr_str_prms(vbr_str_prms_t *p_vbr_str_prms,
+                             picture_type_e e_pic_type)
+{
+    UWORD32 u4_delay_end_pos;
+
+    /*
+     * The picture that was just encoded leaves the delay period. Any
+     * picture type other than I_PIC is accounted as a P picture.
+     */
+    p_vbr_str_prms->u4_num_pics_in_delay_prd[
+                    (e_pic_type == I_PIC) ? I_PIC : P_PIC]--;
+
+    /*
+     * The picture entering the delay period is an I frame when the next
+     * intra position falls inside the period, otherwise a P frame
+     */
+    u4_delay_end_pos = p_vbr_str_prms->u4_cur_pos_in_src_ticks
+                    + (p_vbr_str_prms->u4_frms_in_delay_prd
+                                    * p_vbr_str_prms->u4_src_ticks);
+
+    if(u4_delay_end_pos < p_vbr_str_prms->u4_intra_prd_pos_in_tgt_ticks)
+    {
+        p_vbr_str_prms->u4_num_pics_in_delay_prd[P_PIC]++;
+    }
+    else
+    {
+        /* Re-base the intra position and move it on by one intra period */
+        p_vbr_str_prms->u4_intra_prd_pos_in_tgt_ticks =
+                        p_vbr_str_prms->u4_intra_prd_pos_in_tgt_ticks
+                                        - p_vbr_str_prms->u4_cur_pos_in_src_ticks
+                                        + (p_vbr_str_prms->u4_intra_frame_int
+                                                        * p_vbr_str_prms->u4_tgt_ticks);
+        p_vbr_str_prms->u4_num_pics_in_delay_prd[I_PIC]++;
+        p_vbr_str_prms->u4_pic_num = 0;
+        p_vbr_str_prms->u4_cur_pos_in_src_ticks = 0;
+    }
+
+    p_vbr_str_prms->u4_pic_num++;
+    p_vbr_str_prms->u4_cur_pos_in_src_ticks += p_vbr_str_prms->u4_src_ticks;
+}
+
+/* Returns the stored source and target ticks through the two pointers */
+void irc_get_vsp_src_tgt_ticks(vbr_str_prms_t *p_vbr_str_prms,
+                               UWORD32 *pu4_src_ticks,
+                               UWORD32 *pu4_tgt_ticks)
+{
+    *pu4_src_ticks = p_vbr_str_prms->u4_src_ticks;
+    *pu4_tgt_ticks = p_vbr_str_prms->u4_tgt_ticks;
+}
+
+/*******************************************************************************
+ Function Name   : irc_change_vsp_ifi
+ Description     : Takes in a change of the intra frame interval and
+                   re-initialises the parameters with it
+ ******************************************************************************/
+void irc_change_vsp_ifi(vbr_str_prms_t *p_vbr_str_prms,
+                        UWORD32 u4_intra_frame_int)
+{
+    const UWORD32 u4_src_ticks = p_vbr_str_prms->u4_src_ticks;
+    const UWORD32 u4_tgt_ticks = p_vbr_str_prms->u4_tgt_ticks;
+    const UWORD32 u4_delay_frms = p_vbr_str_prms->u4_frms_in_delay_prd;
+
+    irc_init_vbv_str_prms(p_vbr_str_prms, u4_intra_frame_int, u4_src_ticks,
+                          u4_tgt_ticks, u4_delay_frms);
+}
+
+/*
+ * Rescales the distance to the next I frame when the target ticks change.
+ * LIMITATIONS : no support is available for a dynamic change in the source
+ * frame rate; the stored u4_tgt_ticks is used as a divisor and is not
+ * updated here.
+ */
+void irc_change_vsp_tgt_ticks(vbr_str_prms_t *p_vbr_str_prms,
+                              UWORD32 u4_tgt_ticks)
+{
+    const UWORD32 u4_old_tgt_ticks = p_vbr_str_prms->u4_tgt_ticks;
+    const UWORD32 u4_cur_pos = p_vbr_str_prms->u4_cur_pos_in_src_ticks;
+    UWORD32 u4_rem_intra_prd;
+
+    /* Remaining distance to the next I frame, in units of old target ticks */
+    u4_rem_intra_prd = (p_vbr_str_prms->u4_intra_prd_pos_in_tgt_ticks
+                    - u4_cur_pos) / u4_old_tgt_ticks;
+
+    p_vbr_str_prms->u4_intra_prd_pos_in_tgt_ticks =
+                    (u4_rem_intra_prd * u4_tgt_ticks) + u4_cur_pos;
+
+}
+
+/* Re-initialises the parameters with new source ticks */
+void irc_change_vsp_src_ticks(vbr_str_prms_t *p_vbr_str_prms,
+                              UWORD32 u4_src_ticks)
+{
+    const UWORD32 u4_intra_frame_int = p_vbr_str_prms->u4_intra_frame_int;
+    const UWORD32 u4_tgt_ticks = p_vbr_str_prms->u4_tgt_ticks;
+    const UWORD32 u4_delay_frms = p_vbr_str_prms->u4_frms_in_delay_prd;
+
+    irc_init_vbv_str_prms(p_vbr_str_prms, u4_intra_frame_int, u4_src_ticks,
+                          u4_tgt_ticks, u4_delay_frms);
+}
+
+/* Re-initialises the parameters with a new number of frames in the delay period */
+void irc_change_vsp_fidp(vbr_str_prms_t *p_vbr_str_prms,
+                         UWORD32 u4_frms_in_delay_period)
+{
+    const UWORD32 u4_intra_frame_int = p_vbr_str_prms->u4_intra_frame_int;
+    const UWORD32 u4_src_ticks = p_vbr_str_prms->u4_src_ticks;
+    const UWORD32 u4_tgt_ticks = p_vbr_str_prms->u4_tgt_ticks;
+
+    irc_init_vbv_str_prms(p_vbr_str_prms, u4_intra_frame_int, u4_src_ticks,
+                          u4_tgt_ticks, u4_frms_in_delay_period);
+}
diff --git a/encoder/irc_vbr_str_prms.h b/encoder/irc_vbr_str_prms.h
new file mode 100755
index 0000000..34301d8
--- /dev/null
+++ b/encoder/irc_vbr_str_prms.h
@@ -0,0 +1,65 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +#ifndef _VBR_STR_PRMS_H_ +#define _VBR_STR_PRMS_H_ + +typedef struct +{ + UWORD32 u4_num_pics_in_delay_prd[MAX_PIC_TYPE]; + UWORD32 u4_pic_num; + UWORD32 u4_intra_prd_pos_in_tgt_ticks; + UWORD32 u4_cur_pos_in_src_ticks; + UWORD32 u4_intra_frame_int; + UWORD32 u4_src_ticks; + UWORD32 u4_tgt_ticks; + UWORD32 u4_frms_in_delay_prd; +} vbr_str_prms_t; + +void irc_init_vbv_str_prms(vbr_str_prms_t *p_vbr_str_prms, + UWORD32 u4_intra_frm_interval, + UWORD32 u4_src_ticks, + UWORD32 u4_tgt_ticks, + UWORD32 u4_frms_in_delay_period); + +WORD32 irc_get_vsp_num_pics_in_dly_prd(vbr_str_prms_t *p_vbr_str_prms, + UWORD32 *pu4_num_pics_in_delay_prd); + +void irc_get_vsp_src_tgt_ticks(vbr_str_prms_t *p_vbr_str_prms, + UWORD32 *pu4_src_ticks, + UWORD32 *pu4_tgt_ticks); + +void irc_update_vbr_str_prms(vbr_str_prms_t *p_vbr_str_prms, + picture_type_e e_pic_type); + +void irc_change_vsp_ifi(vbr_str_prms_t *p_vbr_str_prms, + UWORD32 u4_intra_frame_int); + +void irc_change_vsp_tgt_ticks(vbr_str_prms_t *p_vbr_str_prms, + UWORD32 u4_tgt_ticks); + +void irc_change_vsp_src_ticks(vbr_str_prms_t *p_vbr_str_prms, + UWORD32 u4_src_ticks); + +void irc_change_vsp_fidp(vbr_str_prms_t *p_vbr_str_prms, + UWORD32 u4_frms_in_delay_period); + +#endif + diff --git a/encoder/ithread.h b/encoder/ithread.h new file mode 100755 index 0000000..82170a5 --- /dev/null +++ b/encoder/ithread.h @@ -0,0 +1,101 @@ 
+/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ithread.h */ +/* */ +/* Description : This file contains all the necessary structure and */ +/* enumeration definitions needed for the Application */ +/* Program Interface(API) of the */ +/* Thread Abstraction Layer */ +/* */ +/* List of Functions : ithread_get_handle_size() */ +/* ithread_get_mutex_lock_size() */ +/* ithread_create() */ +/* ithread_exit() */ +/* ithread_join() */ +/* ithread_get_mutex_struct_size() */ +/* ithread_mutex_init() */ +/* ithread_mutex_destroy() */ +/* ithread_mutex_lock() */ +/* ithread_mutex_unlock() */ +/* ithread_yield() */ +/* ithread_sleep() */ +/* ithread_msleep() */ +/* ithread_usleep() */ +/* ithread_get_sem_struct_size() */ +/* ithread_sem_init() */ +/* ithread_sem_post() */ +/* ithread_sem_wait() */ +/* ithread_sem_destroy() */ +/* ithread_set_affinity() */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 06 09 2012 Harish Initial Version */ +/* */ 
+/*****************************************************************************/ + +#ifndef _ITHREAD_H_ +#define _ITHREAD_H_ + +UWORD32 ithread_get_handle_size(void); + +UWORD32 ithread_get_mutex_lock_size(void); + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument); + +void ithread_exit(void *val_ptr); + +WORD32 ithread_join(void *thread_id, void ** val_ptr); + +WORD32 ithread_get_mutex_struct_size(void); + +WORD32 ithread_mutex_init(void *mutex); + +WORD32 ithread_mutex_destroy(void *mutex); + +WORD32 ithread_mutex_lock(void *mutex); + +WORD32 ithread_mutex_unlock(void *mutex); + +void ithread_yield(void); + +void ithread_sleep(UWORD32 u4_time); + +void ithread_msleep(UWORD32 u4_time_ms); + +void ithread_usleep(UWORD32 u4_time_us); + +UWORD32 ithread_get_sem_struct_size(void); + +WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value); + +WORD32 ithread_sem_post(void *sem); + +WORD32 ithread_sem_wait(void *sem); + +WORD32 ithread_sem_destroy(void *sem); + +WORD32 ithread_set_affinity(WORD32 core_id); +#endif /* _ITHREAD_H_ */ diff --git a/encoder/iv2.h b/encoder/iv2.h new file mode 100755 index 0000000..538bb1e --- /dev/null +++ b/encoder/iv2.h @@ -0,0 +1,386 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* iv2.h +* +* @brief +* This file contains all the necessary structure and enumeration +* definitions needed for the Application Program Interface(API) of the +* Ittiam Video codecs This is version 2 of Ittiam Video API +* +* @author +* Ittiam +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef _IV2_H_ +#define _IV2_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ +#define IV_MAX_RAW_COMPONENTS 4 + +/*****************************************************************************/ +/* Typedefs */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Enums */ +/*****************************************************************************/ + + +/** Function status */ +typedef enum{ + IV_STATUS_NA = 0x7FFFFFFF, + IV_SUCCESS = 0x0, + IV_FAIL = 0x1, +}IV_STATUS_T; + + +/** Defines the types of memory */ +typedef enum { + IV_NA_MEM_TYPE = 0x7FFFFFFF, + IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM = 0x0, + IV_EXTERNAL_CACHEABLE_SCRATCH_MEM = 0x1, + IV_EXTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x2, + IV_EXTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x3, + IV_INTERNAL_CACHEABLE_PERSISTENT_MEM = 0x10, + IV_INTERNAL_CACHEABLE_SCRATCH_MEM = 0x11, + IV_INTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x12, + IV_INTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x13, +}IV_MEM_TYPE_T; + +/* The color formats used in video/image codecs */ + +typedef enum { + IV_CHROMA_NA = 0x7FFFFFFF, + IV_YUV_420P = 0x0, + 
IV_YUV_420SP_UV = 0x1, + IV_YUV_420SP_VU = 0x2, + + IV_YUV_422P = 0x10, + IV_YUV_422IBE = 0x11, + IV_YUV_422ILE = 0x12, + + IV_YUV_444P = 0x20, + IV_YUV_411P = 0x21, + + IV_GRAY = 0x30, + + IV_RGB_565 = 0x31, + IV_RGB_24 = 0x32, + IV_RGBA_8888 = 0x33 +}IV_COLOR_FORMAT_T; + +/** Frame/Field coding types */ +typedef enum { + IV_NA_FRAME = 0x7FFFFFFF, + IV_I_FRAME = 0x0, + IV_P_FRAME = 0x1, + IV_B_FRAME = 0x2, + IV_IDR_FRAME = 0x3, + IV_II_FRAME = 0x4, + IV_IP_FRAME = 0x5, + IV_IB_FRAME = 0x6, + IV_PI_FRAME = 0x7, + IV_PP_FRAME = 0x8, + IV_PB_FRAME = 0x9, + IV_BI_FRAME = 0xa, + IV_BP_FRAME = 0xb, + IV_BB_FRAME = 0xc, + IV_MBAFF_I_FRAME = 0xd, + IV_MBAFF_P_FRAME = 0xe, + IV_MBAFF_B_FRAME = 0xf, + IV_MBAFF_IDR_FRAME = 0x10, + IV_NOT_CODED_FRAME = 0x11, + IV_FRAMETYPE_DEFAULT = IV_I_FRAME +}IV_PICTURE_CODING_TYPE_T; + +/** Field type */ +typedef enum { + IV_NA_FLD = 0x7FFFFFFF, + IV_TOP_FLD = 0x0, + IV_BOT_FLD = 0x1, + IV_FLD_TYPE_DEFAULT = IV_TOP_FLD +}IV_FLD_TYPE_T; + +/** Video content type progressive/interlaced etc */ +typedef enum { + IV_CONTENTTYPE_NA = 0x7FFFFFFF, + IV_PROGRESSIVE = 0x0, + IV_INTERLACED = 0x1, + IV_PROGRESSIVE_FRAME = 0x2, + IV_INTERLACED_FRAME = 0x3, + IV_INTERLACED_TOPFIELD = 0x4, + IV_INTERLACED_BOTTOMFIELD = 0x5, + IV_CONTENTTYPE_DEFAULT = IV_PROGRESSIVE, +}IV_CONTENT_TYPE_T; + +/** Profile */ +typedef enum +{ + IV_PROFILE_NA = 0x7FFFFFFF, + IV_PROFILE_BASE = 0x0, + IV_PROFILE_MAIN = 0x1, + IV_PROFILE_HIGH = 0x2, + + + IV_PROFILE_SIMPLE = 0x100, + IV_PROFILE_ADVSIMPLE = 0x101, + IV_PROFILE_DEFAULT = IV_PROFILE_BASE, +}IV_PROFILE_T; + + +/** Architecture Enumeration */ +typedef enum +{ + ARCH_NA = 0x7FFFFFFF, + ARCH_ARM_NONEON = 0x0, + ARCH_ARM_A9Q, + ARCH_ARM_A9A, + ARCH_ARM_A9, + ARCH_ARM_A7, + ARCH_ARM_A5, + ARCH_ARM_A15, + ARCH_ARM_NEONINTR, + ARCH_X86_GENERIC, + ARCH_X86_SSSE3, + ARCH_X86_SSE42, + ARCH_ARM_A53, + ARCH_ARM_A57, + ARCH_ARM_V8_NEON +}IV_ARCH_T; + +/** SOC Enumeration */ +typedef enum +{ + SOC_NA = 0x7FFFFFFF, + SOC_GENERIC = 
0x0, + SOC_HISI_37X +}IV_SOC_T; + + +/** API command type */ +typedef enum { + IV_CMD_NA = 0x7FFFFFFF, + IV_CMD_GET_NUM_MEM_REC = 0x0, + IV_CMD_FILL_NUM_MEM_REC = 0x1, + IV_CMD_RETRIEVE_MEMREC = 0x2, + IV_CMD_INIT = 0x3, + /* Do not add anything after the following entry */ + IV_CMD_EXTENSIONS = 0x100 +}IV_API_COMMAND_TYPE_T; + +/*****************************************************************************/ +/* Structure Definitions */ +/*****************************************************************************/ + +/** This structure defines the handle for the codec instance */ + +typedef struct{ + /** size of the structure */ + UWORD32 u4_size; + /** Pointer to the API function pointer table of the codec */ + void *pv_fxns; + /** Pointer to the handle of the codec */ + void *pv_codec_handle; +}iv_obj_t; + +/** This structure defines the memory record holder which will * + * be used by the codec to communicate its memory requirements to the * + * application through appropriate API functions */ + +typedef struct { + /** size of the structure */ + UWORD32 u4_size; + /** Pointer to the memory allocated by the application */ + void *pv_base; + /** u4_size of the memory to be allocated */ + UWORD32 u4_mem_size; + /** Alignment of the memory pointer */ + UWORD32 u4_mem_alignment; + /** Type of the memory to be allocated */ + IV_MEM_TYPE_T e_mem_type; +}iv_mem_rec_t; + +/** This structure defines attributes for the raw buffer */ +typedef struct { + /** size of the structure */ + UWORD32 u4_size; + + /** Color format */ + IV_COLOR_FORMAT_T e_color_fmt; + + /** Pointer to each component */ + void *apv_bufs[IV_MAX_RAW_COMPONENTS]; + + /** Width of each component */ + UWORD32 au4_wd[IV_MAX_RAW_COMPONENTS]; + + /** Height of each component */ + UWORD32 au4_ht[IV_MAX_RAW_COMPONENTS]; + + /** Stride of each component */ + UWORD32 au4_strd[IV_MAX_RAW_COMPONENTS]; + +}iv_raw_buf_t; + +/** This structure defines attributes for the bitstream buffer */ +typedef struct +{ + /** 
size of the structure */ + UWORD32 u4_size; + + /** Pointer to buffer */ + void *pv_buf; + + /** Number of valid bytes in the buffer */ + UWORD32 u4_bytes; + + /** Allocated size of the buffer */ + UWORD32 u4_bufsize; + +}iv_bits_buf_t; +/*****************************************************************************/ +/* Get Number of Memory Records */ +/*****************************************************************************/ + +/** Input structure : Get number of memory records */ +typedef struct { + /** size of the structure */ + UWORD32 u4_size; + + /** Command type */ + IV_API_COMMAND_TYPE_T e_cmd; +}iv_num_mem_rec_ip_t; + +/** Output structure : Get number of memory records */ +typedef struct{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; + + /** Number of memory records that will be used by the codec */ + UWORD32 u4_num_mem_rec; +}iv_num_mem_rec_op_t; + + +/*****************************************************************************/ +/* Fill Memory Records */ +/*****************************************************************************/ + +/** Input structure : Fill memory records */ + +typedef struct { + /** size of the structure */ + UWORD32 u4_size; + + /** Command type */ + IV_API_COMMAND_TYPE_T e_cmd; + + /** Number of memory records */ + UWORD32 u4_num_mem_rec; + + /** pointer to array of memrecords structures should be filled by codec + with details of memory resource requirements */ + iv_mem_rec_t *ps_mem_rec; + + /** maximum width for which codec should request memory requirements */ + UWORD32 u4_max_wd; + + /** maximum height for which codec should request memory requirements*/ + UWORD32 u4_max_ht; + + /** Maximum number of reference frames */ + UWORD32 u4_max_ref_cnt; + + /** Maximum number of reorder frames */ + UWORD32 u4_max_reorder_cnt; + + /** Maximum level supported */ + UWORD32 u4_max_level; + + /** Color format that codec supports for input/output */ + IV_COLOR_FORMAT_T 
e_color_format; + + /** Maximum search range to be used in X direction */ + UWORD32 u4_max_srch_rng_x; + + /** Maximum search range to be used in Y direction */ + UWORD32 u4_max_srch_rng_y; + +}iv_fill_mem_rec_ip_t; + + +/** Output structure : Fill memory records */ +typedef struct{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; + + /** no of memory record structures which are filled by codec */ + UWORD32 u4_num_mem_rec; +}iv_fill_mem_rec_op_t; + + +/*****************************************************************************/ +/* Retrieve Memory Records */ +/*****************************************************************************/ + +/** Input structure : Retrieve memory records */ + +typedef struct { + /** size of the structure */ + UWORD32 u4_size; + + /** Command type */ + IV_API_COMMAND_TYPE_T e_cmd; + + /** array of structures where codec should fill with all memory requested earlier */ + iv_mem_rec_t *ps_mem_rec; +}iv_retrieve_mem_rec_ip_t; + + +typedef struct{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; + + /** no of memory record structures which are filled by codec */ + UWORD32 u4_num_mem_rec_filled; +}iv_retrieve_mem_rec_op_t; + +#endif /* _IV2_H_ */ + diff --git a/encoder/ive2.h b/encoder/ive2.h new file mode 100755 index 0000000..8cb0fd1 --- /dev/null +++ b/encoder/ive2.h @@ -0,0 +1,1445 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ive2.h +* +* @brief +* This file contains all the necessary structure and enumeration +* definitions needed for the Application Program Interface(API) of the +* Ittiam Video Encoders This is version 2 +* +* @author +* Ittiam +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef _IVE2_H_ +#define _IVE2_H_ + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + +/** Maximum number of components in I/O Buffers */ +#define IVE_MAX_IO_BUFFER_COMPONENTS 4 + +/** Maximum number of reference pictures */ +#define IVE_MAX_REF 16 + +/*****************************************************************************/ +/* Enums */ +/*****************************************************************************/ + +/** Slice modes */ +typedef enum +{ + IVE_SLICE_MODE_NA = 0x7FFFFFFF, + IVE_SLICE_MODE_NONE = 0x0, + + IVE_SLICE_MODE_BYTES = 0x1, + IVE_SLICE_MODE_BLOCKS = 0x2, +}IVE_SLICE_MODE_T; + +/** Adaptive Intra refresh modes */ +typedef enum +{ + IVE_AIR_MODE_NA = 0x7FFFFFFF, + IVE_AIR_MODE_NONE = 0x0, + IVE_AIR_MODE_CYCLIC = 0x1, + IVE_AIR_MODE_RANDOM = 0x2, + 
IVE_AIR_MODE_DISTORTION = 0x3, +}IVE_AIR_MODE_T; + +/** Rate control modes */ +typedef enum +{ + IVE_RC_NA = 0x7FFFFFFF, + IVE_RC_NONE = 0x0, + IVE_RC_STORAGE = 0x1, + IVE_RC_CBR_NON_LOW_DELAY = 0x2, + IVE_RC_CBR_LOW_DELAY = 0x3, + IVE_RC_TWOPASS = 0x4, + IVE_RC_RATECONTROLPRESET_DEFAULT = IVE_RC_STORAGE +}IVE_RC_MODE_T; + +/** Encoder mode */ +typedef enum +{ + IVE_ENC_MODE_NA = 0x7FFFFFFF, + IVE_ENC_MODE_HEADER = 0x1, + IVE_ENC_MODE_PICTURE = 0x0, + IVE_ENC_MODE_DEFAULT = IVE_ENC_MODE_PICTURE, +}IVE_ENC_MODE_T; + +/** Speed Config */ +typedef enum IVE_SPEED_CONFIG +{ + IVE_QUALITY_DUMMY = 0x7FFFFFFF, + IVE_CONFIG = 0, + IVE_SLOWEST = 1, + IVE_NORMAL = 2, + IVE_FAST = 3, + IVE_HIGH_SPEED = 4, + IVE_FASTEST = 5, +}IVE_SPEED_CONFIG; + +/** API command type */ +typedef enum +{ + IVE_CMD_VIDEO_NA = 0x7FFFFFFF, + IVE_CMD_VIDEO_CTL = IV_CMD_EXTENSIONS + 1, + IVE_CMD_VIDEO_ENCODE, + IVE_CMD_QUEUE_INPUT, + IVE_CMD_DEQUEUE_INPUT, + IVE_CMD_QUEUE_OUTPUT, + IVE_CMD_DEQUEUE_OUTPUT, + IVE_CMD_GET_RECON, +}IVE_API_COMMAND_TYPE_T; + +/** Video Control API command type */ +typedef enum +{ + IVE_CMD_CT_NA = 0x7FFFFFFF, + IVE_CMD_CTL_SETDEFAULT = 0x0, + IVE_CMD_CTL_SET_DIMENSIONS = 0x1, + IVE_CMD_CTL_SET_FRAMERATE = 0x2, + IVE_CMD_CTL_SET_BITRATE = 0x3, + IVE_CMD_CTL_SET_FRAMETYPE = 0x4, + IVE_CMD_CTL_SET_QP = 0x5, + IVE_CMD_CTL_SET_ENC_MODE = 0x6, + IVE_CMD_CTL_SET_VBV_PARAMS = 0x7, + IVE_CMD_CTL_SET_AIR_PARAMS = 0x8, + IVE_CMD_CTL_SET_ME_PARAMS = 0X9, + IVE_CMD_CTL_SET_GOP_PARAMS = 0XA, + IVE_CMD_CTL_SET_PROFILE_PARAMS = 0XB, + IVE_CMD_CTL_SET_DEBLOCK_PARAMS = 0XC, + IVE_CMD_CTL_SET_IPE_PARAMS = 0XD, + IVE_CMD_CTL_SET_NUM_CORES = 0x30, + IVE_CMD_CTL_RESET = 0xA0, + IVE_CMD_CTL_FLUSH = 0xB0, + IVE_CMD_CTL_GETBUFINFO = 0xC0, + IVE_CMD_CTL_GETVERSION = 0xC1, + IVE_CMD_CTL_CODEC_SUBCMD_START = 0x100, +}IVE_CONTROL_API_COMMAND_TYPE_T; + +/* IVE_ERROR_BITS_T: A UWORD32 container will be used for reporting the error*/ +/* code to the application. 
The first 8 bits starting from LSB have been */ +/* reserved for the codec to report internal error details. The rest of the */ +/* bits will be generic for all video encoders and each bit has an associated*/ +/* meaning as mentioned below. The unused bit fields are reserved for future */ +/* extensions and will be zero in the current implementation */ +typedef enum { + + /* Bit 8 - Unsupported input parameter or configuration. */ + IVE_UNSUPPORTEDPARAM = 0x8, + + /* Bit 9 - Fatal error (stop the codec). If there is an */ + /* error and this bit is not set, the error is a recoverable one. */ + IVE_FATALERROR = 0x9, + + IVE_ERROR_BITS_T_DUMMY_ELEMENT = 0x7FFFFFFF +}IVE_ERROR_BITS_T; + +/* IVE_ERROR_CODES_T: The list of error codes depicting the possible error */ +/* scenarios that can be encountered while encoding */ +typedef enum +{ + + IVE_ERR_NA = 0x7FFFFFFF, + IVE_ERR_NONE = 0x00, + IVE_ERR_INVALID_API_CMD = 0x01, + IVE_ERR_INVALID_API_SUB_CMD = 0x02, + IVE_ERR_IP_GET_MEM_REC_API_STRUCT_SIZE_INCORRECT = 0x03, + IVE_ERR_OP_GET_MEM_REC_API_STRUCT_SIZE_INCORRECT = 0x04, + IVE_ERR_IP_FILL_MEM_REC_API_STRUCT_SIZE_INCORRECT = 0x05, + IVE_ERR_OP_FILL_MEM_REC_API_STRUCT_SIZE_INCORRECT = 0x06, + IVE_ERR_IP_INIT_API_STRUCT_SIZE_INCORRECT = 0x07, + IVE_ERR_OP_INIT_API_STRUCT_SIZE_INCORRECT = 0x08, + IVE_ERR_IP_RETRIEVE_MEM_REC_API_STRUCT_SIZE_INCORRECT = 0x09, + IVE_ERR_OP_RETRIEVE_MEM_REC_API_STRUCT_SIZE_INCORRECT = 0x0A, + IVE_ERR_IP_ENCODE_API_STRUCT_SIZE_INCORRECT = 0x0B, + IVE_ERR_OP_ENCODE_API_STRUCT_SIZE_INCORRECT = 0x0C, + IVE_ERR_IP_CTL_SETDEF_API_STRUCT_SIZE_INCORRECT = 0x0D, + IVE_ERR_OP_CTL_SETDEF_API_STRUCT_SIZE_INCORRECT = 0x0E, + IVE_ERR_IP_CTL_GETBUFINFO_API_STRUCT_SIZE_INCORRECT = 0x0F, + IVE_ERR_OP_CTL_GETBUFINFO_API_STRUCT_SIZE_INCORRECT = 0x10, + IVE_ERR_IP_CTL_GETVERSION_API_STRUCT_SIZE_INCORRECT = 0x11, + IVE_ERR_OP_CTL_GETVERSION_API_STRUCT_SIZE_INCORRECT = 0x12, + IVE_ERR_IP_CTL_FLUSH_API_STRUCT_SIZE_INCORRECT = 0x13, + 
IVE_ERR_OP_CTL_FLUSH_API_STRUCT_SIZE_INCORRECT = 0x14, + IVE_ERR_IP_CTL_RESET_API_STRUCT_SIZE_INCORRECT = 0x15, + IVE_ERR_OP_CTL_RESET_API_STRUCT_SIZE_INCORRECT = 0x16, + IVE_ERR_IP_CTL_SETCORES_API_STRUCT_SIZE_INCORRECT = 0x17, + IVE_ERR_OP_CTL_SETCORES_API_STRUCT_SIZE_INCORRECT = 0x18, + IVE_ERR_IP_CTL_SETDIM_API_STRUCT_SIZE_INCORRECT = 0x19, + IVE_ERR_OP_CTL_SETDIM_API_STRUCT_SIZE_INCORRECT = 0x1A, + IVE_ERR_IP_CTL_SETFRAMERATE_API_STRUCT_SIZE_INCORRECT = 0x1B, + IVE_ERR_OP_CTL_SETFRAMERATE_API_STRUCT_SIZE_INCORRECT = 0x1C, + IVE_ERR_IP_CTL_SETBITRATE_API_STRUCT_SIZE_INCORRECT = 0x1D, + IVE_ERR_OP_CTL_SETBITRATE_API_STRUCT_SIZE_INCORRECT = 0x1E, + IVE_ERR_IP_CTL_SETFRAMETYPE_API_STRUCT_SIZE_INCORRECT = 0x1F, + IVE_ERR_OP_CTL_SETFRAMETYPE_API_STRUCT_SIZE_INCORRECT = 0x20, + IVE_ERR_IP_CTL_SETMEPARAMS_API_STRUCT_SIZE_INCORRECT = 0x21, + IVE_ERR_OP_CTL_SETMEPARAMS_API_STRUCT_SIZE_INCORRECT = 0x22, + IVE_ERR_IP_CTL_SETIPEPARAMS_API_STRUCT_SIZE_INCORRECT = 0x23, + IVE_ERR_OP_CTL_SETIPEPARAMS_API_STRUCT_SIZE_INCORRECT = 0x24, + IVE_ERR_IP_CTL_SETGOPPARAMS_API_STRUCT_SIZE_INCORRECT = 0x25, + IVE_ERR_OP_CTL_SETGOPPARAMS_API_STRUCT_SIZE_INCORRECT = 0x26, + IVE_ERR_IP_CTL_SETDEBLKPARAMS_API_STRUCT_SIZE_INCORRECT = 0x27, + IVE_ERR_OP_CTL_SETDEBLKPARAMS_API_STRUCT_SIZE_INCORRECT = 0x28, + IVE_ERR_IP_CTL_SETQPPARAMS_API_STRUCT_SIZE_INCORRECT = 0x29, + IVE_ERR_OP_CTL_SETQPPARAMS_API_STRUCT_SIZE_INCORRECT = 0x2A, + IVE_ERR_FILL_NUM_MEM_RECS_POINTER_NULL = 0x2B, + IVE_ERR_NUM_MEM_REC_NOT_SUFFICIENT = 0x2C, + IVE_ERR_MEM_REC_STRUCT_SIZE_INCORRECT = 0x2D, + IVE_ERR_MEM_REC_BASE_POINTER_NULL = 0x2E, + IVE_ERR_MEM_REC_OVERLAP_ERR = 0x2F, + IVE_ERR_MEM_REC_INSUFFICIENT_SIZE = 0x30, + IVE_ERR_MEM_REC_ALIGNMENT_ERR = 0x31, + IVE_ERR_MEM_REC_INCORRECT_TYPE = 0x32, + IVE_ERR_HANDLE_NULL = 0x33, + IVE_ERR_HANDLE_STRUCT_SIZE_INCORRECT = 0x34, + IVE_ERR_API_FUNCTION_PTR_NULL = 0x35, + IVE_ERR_INVALID_CODEC_HANDLE = 0x36, + IVE_ERR_CTL_GET_VERSION_BUFFER_IS_NULL = 0x37, + 
IVE_ERR_IP_CTL_SETAIRPARAMS_API_STRUCT_SIZE_INCORRECT = 0x38, + IVE_ERR_OP_CTL_SETAIRPARAMS_API_STRUCT_SIZE_INCORRECT = 0x39, + IVE_ERR_IP_CTL_SETENCMODE_API_STRUCT_SIZE_INCORRECT = 0x3A, + IVE_ERR_OP_CTL_SETENCMODE_API_STRUCT_SIZE_INCORRECT = 0x3B, + IVE_ERR_IP_CTL_SETVBVPARAMS_API_STRUCT_SIZE_INCORRECT = 0x3C, + IVE_ERR_OP_CTL_SETVBVPARAMS_API_STRUCT_SIZE_INCORRECT = 0x3D, + IVE_ERR_IP_CTL_SETPROFILE_API_STRUCT_SIZE_INCORRECT = 0x3E, + IVE_ERR_OP_CTL_SETPROFILE_API_STRUCT_SIZE_INCORRECT = 0x3F, + +}IVE_ERROR_CODES_T; + + +/*****************************************************************************/ +/* Initialize encoder */ +/*****************************************************************************/ + +/** Input structure : Initialize the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type */ + IV_API_COMMAND_TYPE_T e_cmd; + + /** Number of memory records */ + UWORD32 u4_num_mem_rec; + + /** pointer to array of memrecords structures should be filled by codec + with details of memory resource requirements */ + iv_mem_rec_t *ps_mem_rec; + + /** maximum width for which codec should request memory requirements */ + UWORD32 u4_max_wd; + + /** maximum height for which codec should request memory requirements */ + UWORD32 u4_max_ht; + + /** Maximum number of reference frames */ + UWORD32 u4_max_ref_cnt; + + /** Maximum number of reorder frames */ + UWORD32 u4_max_reorder_cnt; + + /** Maximum level supported */ + UWORD32 u4_max_level; + + /** Input color format */ + IV_COLOR_FORMAT_T e_inp_color_fmt; + + /** Flag to enable/disable - To be used only for debugging/testing */ + UWORD32 u4_enable_recon; + + /** Recon color format */ + IV_COLOR_FORMAT_T e_recon_color_fmt; + + /** Rate control mode */ + IVE_RC_MODE_T e_rc_mode; + + /** Maximum frame rate to be supported */ + UWORD32 u4_max_framerate; + + /** Maximum bitrate to be supported */ + UWORD32 u4_max_bitrate; + + /** Maximum number of consecutive B frames */ + 
UWORD32 u4_max_num_bframes; + + /** Content type Interlaced/Progressive */ + IV_CONTENT_TYPE_T e_content_type; + + /** Maximum search range to be used in X direction */ + UWORD32 u4_max_srch_rng_x; + + /** Maximum search range to be used in Y direction */ + UWORD32 u4_max_srch_rng_y; + + /** Slice Mode */ + IVE_SLICE_MODE_T e_slice_mode; + + /** Slice parameter */ + UWORD32 u4_slice_param; + + /** Processor architecture */ + IV_ARCH_T e_arch; + + /** SOC details */ + IV_SOC_T e_soc; + + +}ive_init_ip_t; + +/** Output structure : Initialize the encoder */ +typedef struct +{ + /** Size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_init_op_t; + + +/*****************************************************************************/ +/* Video Encode - Deprecated */ +/*****************************************************************************/ + +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Descriptor for input raw buffer */ + iv_raw_buf_t s_inp_buf; + + /** Buffer containing pic info if mb_info_type is non-zero */ + void *pv_bufs; + + /** Flag to indicate if mb info is sent along with input buffer */ + UWORD32 u4_mb_info_type; + + /** Buffer containing mb info if mb_info_type is non-zero */ + void *pv_mb_info; + + /** Flag to indicate if pic info is sent along with input buffer */ + UWORD32 u4_pic_info_type; + + /** Buffer containing pic info if pic_info_type is non-zero */ + void *pv_pic_info; + + /** Lower 32bits of input time stamp */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of input time stamp */ + UWORD32 u4_timestamp_high; + + /** Flag to indicate if this is the last input in the stream */ + UWORD32 u4_is_last; + + /** Descriptor for output bit-stream buffer */ + iv_bits_buf_t s_out_buf; + + /** Descriptor for recon buffer */ + iv_raw_buf_t s_recon_buf; + +}ive_video_encode_ip_t; + + +typedef struct +{ + /** size of the structure */ + UWORD32 
u4_size; + + /** error code */ + UWORD32 u4_error_code; + + /* Output present */ + WORD32 output_present; + + /* dump recon */ + WORD32 dump_recon; + + /* encoded frame type */ + UWORD32 u4_encoded_frame_type; + + /** Descriptor for input raw buffer freed from codec */ + iv_raw_buf_t s_inp_buf; + + /** Descriptor for output bit-stream buffer */ + iv_bits_buf_t s_out_buf; + + /** Descriptor for recon buffer */ + iv_raw_buf_t s_recon_buf; + +}ive_video_encode_op_t; + +/*****************************************************************************/ +/* Queue Input raw buffer - Send the YUV buffer to be encoded */ +/*****************************************************************************/ +/** Input structure : Queue input buffer to the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command : IVE_CMD_QUEUE_INPUT */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Descriptor for input raw buffer */ + iv_raw_buf_t s_inp_buf; + + /** Flag to indicate if mb info is sent along with input buffer */ + UWORD32 u4_mb_info_type; + + /** Flag to indicate the size of mb info structure */ + UWORD32 u4_mb_info_size; + + /** Buffer containing mb info if mb_info_type is non-zero */ + void *pv_mb_info; + + /** Flag to indicate if pic info is sent along with input buffer */ + UWORD32 u4_pic_info_type; + + /** Buffer containing pic info if pic_info_type is non-zero */ + void *pv_pic_info; + + /** Lower 32bits of input time stamp */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of input time stamp */ + UWORD32 u4_timestamp_high; + + + /** Flag to enable/disable blocking the current API call */ + UWORD32 u4_is_blocking; + + /** Flag to indicate if this is the last input in the stream */ + UWORD32 u4_is_last; + +}ive_queue_inp_ip_t; + +/** Output structure : Queue input buffer to the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_queue_inp_op_t; + 
+/*****************************************************************************/ +/* Dequeue Input raw buffer - Get free YUV buffer from the encoder */ +/*****************************************************************************/ +/** Input structure : Dequeue input buffer from the encoder */ + +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command: IVE_CMD_DEQUEUE_INPUT */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Flag to enable/disable blocking the current API call */ + UWORD32 u4_is_blocking; + +}ive_dequeue_inp_ip_t; + +/** Output structure : Dequeue input buffer from the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; + + /** Buffer descriptor of the buffer returned from encoder */ + iv_raw_buf_t s_inp_buf; + + /** Flag to indicate if mb info is sent along with input buffer */ + UWORD32 u4_mb_info_type; + + /** Flag to indicate the size of mb info structure */ + UWORD32 u4_mb_info_size; + + /** Buffer containing mb info if mb_info_type is non-zero */ + void *pv_mb_info; + + /** Flag to indicate if pic info is sent along with input buffer */ + UWORD32 u4_pic_info_type; + + /** Buffer containing pic info if pic_info_type is non-zero */ + void *pv_pic_info; + + /** Lower 32bits of input time stamp */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of input time stamp */ + UWORD32 u4_timestamp_high; + + /** Flag to indicate if this is the last input in the stream */ + UWORD32 u4_is_last; + + +}ive_dequeue_inp_op_t; + +/*****************************************************************************/ +/* Queue Output bitstream buffer - Send the bitstream buffer to be filled */ +/*****************************************************************************/ +/** Input structure : Queue output buffer to the encoder */ + +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command : IVE_CMD_QUEUE_OUTPUT */ + IVE_API_COMMAND_TYPE_T 
e_cmd; + + /** Descriptor for output bit-stream buffer */ + iv_bits_buf_t s_out_buf; + + /** Flag to enable/disable blocking the current API call */ + UWORD32 u4_is_blocking; + + /** Flag to indicate if this is the last output in the stream */ + UWORD32 u4_is_last; + +}ive_queue_out_ip_t; + +/** Output structure : Queue output buffer to the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; + +}ive_queue_out_op_t; + + +/*****************************************************************************/ +/* Dequeue Output bitstream buffer - Get the bitstream buffer filled */ +/*****************************************************************************/ +/** Input structure : Dequeue output buffer from the encoder */ + +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command : IVE_CMD_DEQUEUE_OUTPUT */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Flag to enable/disable blocking the current API call */ + UWORD32 u4_is_blocking; +}ive_dequeue_out_ip_t; + +/** Output structure : Dequeue output buffer from the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; + + /** Descriptor for output bit-stream buffer */ + iv_bits_buf_t s_out_buf; + + /** Lower 32bits of timestamp corresponding to this buffer */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of timestamp corresponding to this buffer */ + UWORD32 u4_timestamp_high; + + /** Flag to indicate if this is the last output in the stream */ + UWORD32 u4_is_last; + +}ive_dequeue_out_op_t; + +/*****************************************************************************/ +/* Get Recon data - Get the reconstructed data from encoder */ +/*****************************************************************************/ +/** Input structure : Get recon data from the encoder */ + +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + 
+ /** Command : IVE_CMD_GET_RECON */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Flag to enable/disable blocking the current API call */ + UWORD32 u4_is_blocking; + + /** Descriptor for recon buffer */ + iv_raw_buf_t s_recon_buf; + + /** Flag to indicate if this is the last recon in the stream */ + UWORD32 u4_is_last; + +}ive_get_recon_ip_t; + +/** Output structure : Get recon data from the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; + + /** Lower 32bits of time stamp corresponding to this buffer */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to this buffer */ + UWORD32 u4_timestamp_high; + + /** Flag to indicate if this is the last recon in the stream */ + UWORD32 u4_is_last; + +}ive_get_recon_op_t; + +/*****************************************************************************/ +/* Video control Flush */ +/*****************************************************************************/ + +/** Input structure : Flush all the buffers from the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_FLUSH */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; +}ive_ctl_flush_ip_t; + +/** Output structure : Flush all the buffers from the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_flush_op_t; + +/*****************************************************************************/ +/* Video control reset */ +/*****************************************************************************/ +/** Input structure : Reset the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_RESET */ + 
IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; +}ive_ctl_reset_ip_t; + +/** Output structure : Reset the encoder */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_reset_op_t; + +/*****************************************************************************/ +/* Video control:Get Buf Info */ +/*****************************************************************************/ + +/** Input structure : Get encoder buffer requirements */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_GETBUFINFO */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** maximum width for which codec should request memory requirements */ + UWORD32 u4_max_wd; + + /** maximum height for which codec should request memory requirements */ + UWORD32 u4_max_ht; + + /** Input color format */ + IV_COLOR_FORMAT_T e_inp_color_fmt; + +}ive_ctl_getbufinfo_ip_t; + +/** Output structure : Get encoder buffer requirements */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; + + /** Minimum number of input buffers required for codec */ + UWORD32 u4_min_inp_bufs; + + /** Minimum number of output buffers required for codec */ + UWORD32 u4_min_out_bufs; + + /** Number of components in input buffers required for codec */ + UWORD32 u4_inp_comp_cnt; + + /** Number of components in output buffers required for codec */ + UWORD32 u4_out_comp_cnt; + + /** Minimum sizes of each component in input buffer required */ + UWORD32 au4_min_in_buf_size[IVE_MAX_IO_BUFFER_COMPONENTS]; + + /** Minimum sizes of each component in output buffer required */ + UWORD32 au4_min_out_buf_size[IVE_MAX_IO_BUFFER_COMPONENTS]; + +}ive_ctl_getbufinfo_op_t; + + + + +/*****************************************************************************/ +/* Video 
control:Get Version Info */ +/*****************************************************************************/ + +/** Input structure : Get encoder version information */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_GETVERSION */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Buffer where version info will be returned */ + UWORD8 *pu1_version; + + /** Size of the buffer allocated for version info */ + UWORD32 u4_version_bufsize; +}ive_ctl_getversioninfo_ip_t; + +/** Output structure : Get encoder version information */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_getversioninfo_op_t; + + +/*****************************************************************************/ +/* Video control:set default params */ +/*****************************************************************************/ +/** Input structure : Set default encoder parameters */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SETDEFAULT */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_setdefault_ip_t; + +/** Output structure : Set default encoder parameters */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_setdefault_op_t; + +/*****************************************************************************/ +/* Video control Set Frame dimensions */ 
+/*****************************************************************************/ + +/** Input structure : Set frame dimensions */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_DIMENSIONS */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Input width */ + UWORD32 u4_wd; + + /** Input height */ + UWORD32 u4_ht; + + /** Input stride */ + UWORD32 u4_strd; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_dimensions_ip_t; + +/** Output structure : Set frame dimensions */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_dimensions_op_t; + + +/*****************************************************************************/ +/* Video control Set Frame rates */ +/*****************************************************************************/ + +/** Input structure : Set frame rate */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_FRAMERATE */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Source frame rate */ + UWORD32 u4_src_frame_rate; + + /** Target frame rate */ + UWORD32 u4_tgt_frame_rate; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_frame_rate_ip_t; + +/** Output structure : Set frame rate */ +typedef struct +{ + /** 
size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_frame_rate_op_t; + +/*****************************************************************************/ +/* Video control Set Bitrate */ +/*****************************************************************************/ + +/** Input structure : Set bitrate */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_BITRATE */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Target bitrate in kilobits per second */ + UWORD32 u4_target_bitrate; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_bitrate_ip_t; + +/** Output structure : Set bitrate */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_bitrate_op_t; + +/*****************************************************************************/ +/* Video control Set Frame type */ +/*****************************************************************************/ + +/** Input structure : Set frametype */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_FRAMETYPE */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Force current frame type */ + IV_PICTURE_CODING_TYPE_T e_frame_type; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + 
UWORD32 u4_timestamp_high; + +}ive_ctl_set_frame_type_ip_t; + +/** Output structure : Set frametype */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_frame_type_op_t; + +/*****************************************************************************/ +/* Video control Set Encode mode */ +/*****************************************************************************/ + +/** Input structure : Set encode mode */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_ENC_MODE */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Encoder mode */ + IVE_ENC_MODE_T e_enc_mode; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_enc_mode_ip_t; + +/** Output structure : Set encode mode */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; + +}ive_ctl_set_enc_mode_op_t; + +/*****************************************************************************/ +/* Video control Set QP */ +/*****************************************************************************/ + +/** Input structure : Set QP */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_QP */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Set initial Qp for I pictures */ + UWORD32 u4_i_qp; + + /** Set initial Qp for P pictures */ + UWORD32 u4_p_qp; + + /** Set initial Qp for B pictures */ + UWORD32 u4_b_qp; + + /** Set minimum Qp for I pictures */ + 
UWORD32 u4_i_qp_min; + + /** Set maximum Qp for I pictures */ + UWORD32 u4_i_qp_max; + + /** Set minimum Qp for P pictures */ + UWORD32 u4_p_qp_min; + + /** Set maximum Qp for P pictures */ + UWORD32 u4_p_qp_max; + + /** Set minimum Qp for B pictures */ + UWORD32 u4_b_qp_min; + + /** Set maximum Qp for B pictures */ + UWORD32 u4_b_qp_max; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + + +}ive_ctl_set_qp_ip_t; + +/** Output structure : Set QP */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_qp_op_t; + +/*****************************************************************************/ +/* Video control Set AIR params */ +/*****************************************************************************/ + +/** Input structure : Set AIR params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_AIR_PARAMS */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Adaptive intra refresh mode */ + IVE_AIR_MODE_T e_air_mode; + + /** Adaptive intra refresh period in frames */ + UWORD32 u4_air_refresh_period; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + + +}ive_ctl_set_air_params_ip_t; + +/** Output structure : Set AIR params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_air_params_op_t; + 
+/*****************************************************************************/ +/* Video control Set VBV params */ +/*****************************************************************************/ + +/** Input structure : Set VBV params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_VBV_PARAMS */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** VBV buffer delay */ + UWORD32 u4_vbv_buffer_delay; + + /** VBV buffer size */ + UWORD32 u4_vbv_buf_size; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + + +}ive_ctl_set_vbv_params_ip_t; + +/** Output structure : Set VBV params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_vbv_params_op_t; + + +/*****************************************************************************/ +/* Video control Set Processor Details */ +/*****************************************************************************/ + +/** Input structure : Set processor details */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_NUM_CORES */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Total number of cores to be used */ + UWORD32 u4_num_cores; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_num_cores_ip_t; + +/** 
Output structure : Set processor details */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_num_cores_op_t; + +/*****************************************************************************/ +/* Video control Set Intra Prediction estimation params */ +/*****************************************************************************/ + +/** Input structure : Set IPE params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_IPE_PARAMS */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Flag to enable/disable intra 4x4 analysis */ + UWORD32 u4_enable_intra_4x4; + + /** Flag to enable/disable pre-enc stage of Intra Pred estimation */ + UWORD32 u4_pre_enc_ipe; + + /** Speed preset - Value between 0 (slowest) and 100 (fastest) */ + IVE_SPEED_CONFIG u4_enc_speed_preset; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_ipe_params_ip_t; + +/** Output structure : Set IPE Params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_ipe_params_op_t; + +/*****************************************************************************/ +/* Video control Set Motion estimation params */ +/*****************************************************************************/ + +/** Input structure : Set ME Params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_ME_PARAMS */ + 
IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Flag to enable/disable pre-enc stage of Motion estimation */ + UWORD32 u4_pre_enc_me; + + /** Speed preset - Value between 0 (slowest) and 100 (fastest) */ + UWORD32 u4_me_speed_preset; + + /** Flag to enable/disable half pel motion estimation */ + UWORD32 u4_enable_hpel; + + /** Flag to enable/disable quarter pel motion estimation */ + UWORD32 u4_enable_qpel; + + /** Flag to enable/disable fast SAD approximation */ + UWORD32 u4_enable_fast_sad; + + /** Flag to enable/disable alternate reference frames */ + UWORD32 u4_enable_alt_ref; + + /** Maximum search range in X direction for farthest reference */ + UWORD32 u4_srch_rng_x; + + /** Maximum search range in Y direction for farthest reference */ + UWORD32 u4_srch_rng_y; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_me_params_ip_t; + +/** Output structure : Set ME Params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_me_params_op_t; + +/*****************************************************************************/ +/* Video control Set GOP params */ +/*****************************************************************************/ + +/** Input structure : Set GOP Params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_GOP_PARAMS */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** I frame interval */ + UWORD32 u4_i_frm_interval; + + /** IDR frame interval */ + UWORD32 u4_idr_frm_interval; + + /** consecutive B frames */ + UWORD32 u4_num_b_frames; + + /** Lower 32bits of time stamp corresponding 
to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_gop_params_ip_t; + +/** Output structure : Set GOP params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_gop_params_op_t; + +/*****************************************************************************/ +/* Video control Set Deblock params */ +/*****************************************************************************/ + +/** Input structure : Set Deblock Params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub command type : IVE_CMD_CTL_SET_DEBLOCK_PARAMS */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Disable deblock level (0: Enable completely, 3: Disable completely) */ + UWORD32 u4_disable_deblock_level; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_deblock_params_ip_t; + +/** Output structure : Set Deblock Params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_deblock_params_op_t; + +/*****************************************************************************/ +/* Video control Set Profile params */ +/*****************************************************************************/ + +/** Input structure : Set Profile Params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Command type : IVE_CMD_VIDEO_CTL */ + IVE_API_COMMAND_TYPE_T e_cmd; + + /** Sub 
command type : IVE_CMD_CTL_SET_PROFILE_PARAMS */ + IVE_CONTROL_API_COMMAND_TYPE_T e_sub_cmd; + + /** Profile */ + IV_PROFILE_T e_profile; + + /** Lower 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_low; + + /** Upper 32bits of time stamp corresponding to input buffer, + * from which this command takes effect */ + UWORD32 u4_timestamp_high; + +}ive_ctl_set_profile_params_ip_t; + +/** Output structure : Set Profile Params */ +typedef struct +{ + /** size of the structure */ + UWORD32 u4_size; + + /** Return error code */ + UWORD32 u4_error_code; +}ive_ctl_set_profile_params_op_t; + + +#endif /* _IVE2_H_ */ + diff --git a/encoder/mips/ih264e_function_selector.c b/encoder/mips/ih264e_function_selector.c new file mode 100755 index 0000000..58ec4d0 --- /dev/null +++ b/encoder/mips/ih264e_function_selector.c @@ -0,0 +1,110 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_function_selector.c +* +* @brief +* Contains functions to initialize function pointers used in h264 +* +* @author +* Ittiam +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include Files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include Files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264e_defs.h" +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context basing on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* 
+******************************************************************************* +*/ + +void ih264e_init_function_ptr(void *pv_codec) +{ + codec_t *ps_codec = (codec_t *)pv_codec; + ih264e_init_function_ptr_generic(ps_codec); +} + +IV_ARCH_T ih264e_default_arch(void) +{ + return ARCH_NA; +} + diff --git a/encoder/mips/ih264e_platform_macros.h b/encoder/mips/ih264e_platform_macros.h new file mode 100755 index 0000000..ed1edd4 --- /dev/null +++ b/encoder/mips/ih264e_platform_macros.h @@ -0,0 +1,135 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264e_platform_macros.h + * + * @brief + * Contains platform specific routines used for codec context initialization + * + * @author + * ittiam + * + * @remarks + * none + * + ******************************************************************************* + */ + + +#ifndef IH264E_PLATFORM_MACROS_H_ +#define IH264E_PLATFORM_MACROS_H_ + +#define DATA_SYNC() +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context based on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_generic(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context based on the architecture in use +* +* @param[in] pv_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr(void *pv_codec); + +/** +******************************************************************************* +* +* @brief Determine the architecture of the encoder executing environment +* +* @par Description: This routine returns the architecture of 
the +* environment in which the current encoder is being tested +* +* @param[in] void +* +* @returns IV_ARCH_T +* architecture +* +* @remarks none +* +******************************************************************************* +*/ +IV_ARCH_T ih264e_default_arch(void); + +/** +******************************************************************************* +* +* @brief Data Memory Barrier, Data Synchronization Barrier +* +* +* @par Description: DATA_SYNC() is defined empty in this header, so no barrier is issued on this platform. On ARM platforms, +* +* Data Memory Barrier acts as a memory barrier. It ensures that all explicit +* memory accesses that appear in program order before the DMB instruction are +* observed before any explicit memory accesses that appear in program order +* after the DMB instruction. It does not affect the ordering of any other +* instructions executing on the processor +* +* Data Synchronization Barrier acts as a special kind of memory barrier. No +* instruction in program order after this instruction executes until this instruction +* completes. This instruction completes when: +* 1. All explicit memory accesses before this instruction complete. +* 2. All Cache, Branch predictor and TLB maintenance operations before +* this instruction complete. +* +* @param[in] void +* +* @returns void +* +* @remarks none +* +******************************************************************************* +*/ + +#endif /* IH264E_PLATFORM_MACROS_H_ */ diff --git a/encoder/mips/ime_platform_macros.h b/encoder/mips/ime_platform_macros.h new file mode 100755 index 0000000..18e2e8f --- /dev/null +++ b/encoder/mips/ime_platform_macros.h @@ -0,0 +1,52 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ime_platform_macros.h +* +* @brief +* Platform specific Macro definitions used in the codec +* +* @author +* Ittiam +* +* @remarks +* None +* +******************************************************************************* +*/ + + +#ifndef _IME_PLATFORM_MACROS_H_ +#define _IME_PLATFORM_MACROS_H_ + +/*****************************************************************************/ +/* Function macro definitions */ +/*****************************************************************************/ +/* USADA8: adds to sad the sum of ABS(src[i]-est[i]) for i = 0..3. Arguments are parenthesised and the expansion is one parenthesised expression; src and est are each expanded four times, so pass side-effect-free expressions. */ +#define USADA8(src,est,sad) \ + ((sad) += ABS((src)[0]-(est)[0]) + \ + ABS((src)[1]-(est)[1]) + \ + ABS((src)[2]-(est)[2]) + \ + ABS((src)[3]-(est)[3])) + + +#endif /* _IME_PLATFORM_MACROS_H_ */ diff --git a/encoder/x86/ih264e_function_selector.c b/encoder/x86/ih264e_function_selector.c new file mode 100755 index 0000000..429cdab --- /dev/null +++ b/encoder/x86/ih264e_function_selector.c @@ -0,0 +1,141 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* ih264e_function_selector.c +* +* @brief +* Contains functions to initialize function pointers used in h264 +* +* @author +* Ittiam +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System Include Files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include Files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "ih264_macros.h" +#include "ih264_platform_macros.h" +#include "ih264e_defs.h" +#include "irc_cntrl_param.h" +#include 
"irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context based on the architecture in use (s_cfg.e_arch). The generic +* initializer always runs first; the SSSE3 initializer, and for SSE4.2 also the +* SSE4.2 initializer, are then applied after it. Any arch value not listed in +* the switch takes the SSE4.2 path. +* +* @param[in] pv_codec +* Codec context pointer (cast to codec_t * inside the routine) +* +* @returns none +* +* @remarks the ARCH_X86_GENERIC case calls the generic initializer a second time +* +******************************************************************************* +*/ +void ih264e_init_function_ptr(void *pv_codec) +{ + codec_t *ps_codec = (codec_t *)pv_codec; + ih264e_init_function_ptr_generic(ps_codec); + switch(ps_codec->s_cfg.e_arch) + { + case ARCH_X86_GENERIC: + ih264e_init_function_ptr_generic(ps_codec); + break; + case ARCH_X86_SSSE3: + ih264e_init_function_ptr_ssse3(ps_codec); + break; + case ARCH_X86_SSE42: + default: + ih264e_init_function_ptr_ssse3(ps_codec); + ih264e_init_function_ptr_sse42(ps_codec); + break; + } +} + +/** +******************************************************************************* +* +* @brief Determine the architecture of the encoder executing environment +* +* @par Description: This routine returns the architecture of the environment +* in which the current encoder is being tested. No run time detection is done +* here; the value is a compile time constant. +* +* @param[in] void +* +* @returns IV_ARCH_T +* architecture, always ARCH_X86_SSE42 in this file +* +* @remarks none +* +******************************************************************************* +*/ +IV_ARCH_T ih264e_default_arch(void) +{ + return ARCH_X86_SSE42; +} + + diff --git a/encoder/x86/ih264e_function_selector_sse42.c b/encoder/x86/ih264e_function_selector_sse42.c new file mode 100755 index 0000000..6fa6308 --- /dev/null +++ b/encoder/x86/ih264e_function_selector_sse42.c @@ -0,0 +1,146 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source 
Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_function_selector_sse42.c +* +* @brief +* Contains functions to initialize function pointers of codec context +* +* @author +* Ittiam +* +* @par List of Functions: +* - ih264e_init_function_ptr_sse42 +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + + +/* System Include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include 
"ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264e_defs.h" +#include "ih264e_structs.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_core_coding.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264_padding.h" +#include "ih264e_intra_modes_eval.h" +#include "ih264_mem_fns.h" +#include "ih264e_fmt_conv.h" +#include "ih264e_half_pel.h" + +/** +******************************************************************************* +* +* @brief Point the transform and SAD function pointers of the codec context +* at their SSE4.2 implementations +* +* @par Description: the current routine sets the forward transform / hadamard +* quant, inverse transform and SAD function pointers of the codec context, and +* the SAD function pointers in the ME context of each of the MAX_PROCESS_CTXT +* process contexts, to the _sse42 functions. It also prints a message through +* printf. Function pointers not assigned here are left unchanged. +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_sse42(codec_t *ps_codec) +{ + WORD32 i; + process_ctxt_t *ps_proc = NULL; + me_ctxt_t *ps_me_ctxt = NULL; + printf("Enabling SSE42 functions\n"); + + /* Init forward transform and hadamard quant fn ptrs (luma and chroma) */ + ps_codec->pf_resi_trans_quant_4x4 = ih264_resi_trans_quant_4x4_sse42; + ps_codec->pf_resi_trans_quant_chroma_4x4 = ih264_resi_trans_quant_chroma_4x4_sse42; + ps_codec->pf_hadamard_quant_4x4 = ih264_hadamard_quant_4x4_sse42; + ps_codec->pf_hadamard_quant_2x2_uv = ih264_hadamard_quant_2x2_uv_sse42; + + /* Init inverse transform fn ptr */ + ps_codec->pf_iquant_itrans_recon_4x4 = ih264_iquant_itrans_recon_4x4_sse42; + ps_codec->pf_iquant_itrans_recon_chroma_4x4 = ih264_iquant_itrans_recon_chroma_4x4_sse42; + ps_codec->pf_ihadamard_scaling_4x4 = ih264_ihadamard_scaling_4x4_sse42; + + /* sad me level functions, codec context copies */ + 
ps_codec->apf_compute_sad_16x16[0] = ime_compute_sad_16x16_sse42; + ps_codec->apf_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_sse42; + ps_codec->pf_compute_sad_16x8 = ime_compute_sad_16x8_sse42; + + /* sad me level functions, set in the me context of every process context */ + for(i = 0; i < (MAX_PROCESS_CTXT); i++) + { + ps_proc = &ps_codec->as_process[i]; + + ps_me_ctxt = &ps_proc->s_me_ctxt; + ps_me_ctxt->pf_ime_compute_sad_16x16[0] = ime_compute_sad_16x16_sse42; + ps_me_ctxt->pf_ime_compute_sad_16x16[1] = ime_compute_sad_16x16_fast_sse42; + ps_me_ctxt->pf_ime_compute_sad_16x8 = ime_compute_sad_16x8_sse42; + ps_me_ctxt->pf_ime_compute_sad4_diamond = ime_calculate_sad4_prog_sse42; + ps_me_ctxt->pf_ime_sub_pel_compute_sad_16x16 = ime_sub_pel_compute_sad_16x16_sse42; + ps_me_ctxt->pf_ime_compute_sad_stat_luma_16x16 = ime_compute_satqd_16x16_lumainter_sse42; + } +} diff --git a/encoder/x86/ih264e_function_selector_ssse3.c b/encoder/x86/ih264e_function_selector_ssse3.c new file mode 100755 index 0000000..7401e53 --- /dev/null +++ b/encoder/x86/ih264e_function_selector_ssse3.c @@ -0,0 +1,190 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_function_selector_ssse3.c +* +* @brief +* Contains functions to initialize function pointers of codec context +* +* @author +* Ittiam +* +* @par List of Functions: +* - ih264e_init_function_ptr_ssse3 +* +* @remarks +* None +* +******************************************************************************* +*/ + + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + + +/* System Include files */ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/* User Include files */ +#include "ih264_typedefs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_defs.h" +#include "ih264_size_defs.h" +#include "ih264e_defs.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_distortion_metrics.h" +#include "ime_structs.h" +#include "ih264_defs.h" +#include "ih264_error.h" +#include "ih264_structs.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" +#include "ih264e_structs.h" +#include "ih264e_platform_macros.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264e_defs.h" +#include "ih264e_structs.h" +#include "ih264_deblk_edge_filters.h" +#include "ih264e_core_coding.h" +#include "ih264_cavlc_tables.h" +#include "ih264e_cavlc.h" +#include "ih264_padding.h" +#include "ih264e_intra_modes_eval.h" +#include "ih264_mem_fns.h" +#include "ih264e_fmt_conv.h" +#include "ih264e_half_pel.h" + +/** 
+******************************************************************************* +* +* @brief Point the intra/inter/transform/deblk function pointers of +* codec context at their SSSE3 implementations +* +* @par Description: the current routine sets the intra prediction, inverse +* transform, deblocking, padding, inter prediction, memory handling, intra mode +* evaluation and half pel function pointers of the codec context to the _ssse3 +* functions. It also prints a message through printf. Function pointers and +* table entries not assigned here are left unchanged. +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_ssse3(codec_t *ps_codec) +{ + printf("Enabling SSSE3 functions\n"); + + /* Init function pointers for intra pred leaf level functions luma + * Intra 16x16 */ + ps_codec->apf_intra_pred_16_l[0] = ih264_intra_pred_luma_16x16_mode_vert_ssse3; + ps_codec->apf_intra_pred_16_l[1] = ih264_intra_pred_luma_16x16_mode_horz_ssse3; + ps_codec->apf_intra_pred_16_l[2] = ih264_intra_pred_luma_16x16_mode_dc_ssse3; + ps_codec->apf_intra_pred_16_l[3] = ih264_intra_pred_luma_16x16_mode_plane_ssse3; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 4x4 */ + ps_codec->apf_intra_pred_4_l[0] = ih264_intra_pred_luma_4x4_mode_vert_ssse3; + ps_codec->apf_intra_pred_4_l[1] = ih264_intra_pred_luma_4x4_mode_horz_ssse3; + ps_codec->apf_intra_pred_4_l[2] = ih264_intra_pred_luma_4x4_mode_dc_ssse3; + ps_codec->apf_intra_pred_4_l[3] = ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3; + ps_codec->apf_intra_pred_4_l[4] = ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3; + ps_codec->apf_intra_pred_4_l[5] = ih264_intra_pred_luma_4x4_mode_vert_r_ssse3; + ps_codec->apf_intra_pred_4_l[6] = ih264_intra_pred_luma_4x4_mode_horz_d_ssse3; + ps_codec->apf_intra_pred_4_l[7] = ih264_intra_pred_luma_4x4_mode_vert_l_ssse3; + ps_codec->apf_intra_pred_4_l[8] = ih264_intra_pred_luma_4x4_mode_horz_u_ssse3; + + /* Init function pointers for intra pred leaf level functions luma + * Intra 8x8 (entry 1 is not assigned here) */ + ps_codec->apf_intra_pred_8_l[0] = ih264_intra_pred_luma_8x8_mode_vert_ssse3; + 
ps_codec->apf_intra_pred_8_l[2] = ih264_intra_pred_luma_8x8_mode_dc_ssse3; + ps_codec->apf_intra_pred_8_l[3] = ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3; + ps_codec->apf_intra_pred_8_l[4] = ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3; + ps_codec->apf_intra_pred_8_l[5] = ih264_intra_pred_luma_8x8_mode_vert_r_ssse3; + ps_codec->apf_intra_pred_8_l[6] = ih264_intra_pred_luma_8x8_mode_horz_d_ssse3; + ps_codec->apf_intra_pred_8_l[7] = ih264_intra_pred_luma_8x8_mode_vert_l_ssse3; + ps_codec->apf_intra_pred_8_l[8] = ih264_intra_pred_luma_8x8_mode_horz_u_ssse3; + + /* Init function pointers for intra pred leaf level functions chroma + * Intra 8x8 (entry 0 is not assigned here) */ + ps_codec->apf_intra_pred_c[1] = ih264_intra_pred_chroma_8x8_mode_horz_ssse3; + ps_codec->apf_intra_pred_c[2] = ih264_intra_pred_chroma_8x8_mode_vert_ssse3; + ps_codec->apf_intra_pred_c[3] = ih264_intra_pred_chroma_8x8_mode_plane_ssse3; + + /* Init inverse transform fn ptr */ + ps_codec->pf_iquant_itrans_recon_8x8 = ih264_iquant_itrans_recon_8x8_ssse3; + ps_codec->pf_iquant_itrans_recon_4x4_dc = ih264_iquant_itrans_recon_4x4_dc_ssse3; + ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc = ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3; + + /* Init fn ptr luma deblocking */ + ps_codec->pf_deblk_luma_vert_bs4 = ih264_deblk_luma_vert_bs4_ssse3; + ps_codec->pf_deblk_luma_vert_bslt4 = ih264_deblk_luma_vert_bslt4_ssse3; + ps_codec->pf_deblk_luma_horz_bs4 = ih264_deblk_luma_horz_bs4_ssse3; + ps_codec->pf_deblk_luma_horz_bslt4 = ih264_deblk_luma_horz_bslt4_ssse3; + /* Init fn ptr chroma deblocking */ + ps_codec->pf_deblk_chroma_vert_bs4 = ih264_deblk_chroma_vert_bs4_ssse3; + ps_codec->pf_deblk_chroma_vert_bslt4 = ih264_deblk_chroma_vert_bslt4_ssse3; + ps_codec->pf_deblk_chroma_horz_bs4 = ih264_deblk_chroma_horz_bs4_ssse3; + ps_codec->pf_deblk_chroma_horz_bslt4 = ih264_deblk_chroma_horz_bslt4_ssse3; + + /* Padding Functions */ + ps_codec->pf_pad_left_luma = ih264_pad_left_luma_ssse3; + ps_codec->pf_pad_left_chroma = 
ih264_pad_left_chroma_ssse3; + ps_codec->pf_pad_right_luma = ih264_pad_right_luma_ssse3; + ps_codec->pf_pad_right_chroma = ih264_pad_right_chroma_ssse3; + + /* Inter pred leaf level functions */ + ps_codec->pf_inter_pred_luma_copy = ih264_inter_pred_luma_copy_ssse3; + ps_codec->pf_inter_pred_luma_horz = ih264_inter_pred_luma_horz_ssse3; + ps_codec->pf_inter_pred_luma_vert = ih264_inter_pred_luma_vert_ssse3; + ps_codec->pf_inter_pred_chroma = ih264_inter_pred_chroma_ssse3; + + /* memory handling operations */ + ps_codec->pf_mem_cpy_mul8 = ih264_memcpy_mul_8_ssse3; + ps_codec->pf_mem_set_mul8 = ih264_memset_mul_8_ssse3; + + /* intra mode eval - encoder level functions */ + ps_codec->pf_ih264e_evaluate_intra16x16_modes = ih264e_evaluate_intra16x16_modes_ssse3; + ps_codec->pf_ih264e_evaluate_intra_4x4_modes = ih264e_evaluate_intra_4x4_modes_ssse3; + ps_codec->pf_ih264e_evaluate_intra_chroma_modes = ih264e_evaluate_intra_chroma_modes_ssse3; + + /* Half pel generation functions - encoder level */ + ps_codec->pf_ih264e_sixtapfilter_horz = ih264e_sixtapfilter_horz_ssse3; + ps_codec->pf_ih264e_sixtap_filter_2dvh_vert = ih264e_sixtap_filter_2dvh_vert_ssse3; +} diff --git a/encoder/x86/ih264e_half_pel_ssse3.c b/encoder/x86/ih264e_half_pel_ssse3.c new file mode 100755 index 0000000..42580fa --- /dev/null +++ b/encoder/x86/ih264e_half_pel_ssse3.c @@ -0,0 +1,487 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264e_half_pel_ssse3.c + * + * @brief + * Contains the x86 intrinsic function definitions for 6-tap vertical filter + * and cascaded 2D filter used in motion estimation in H264 encoder. + * + * @author + * Ittiam + * + * @par List of Functions: + * ih264e_sixtapfilter_horz_ssse3 + * ih264e_sixtap_filter_2dvh_vert_ssse3 + * + * @remarks + * None + * + ******************************************************************************* + */ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <assert.h> +#include <limits.h> + +/* User include files */ +#include "ih264_typedefs.h" +#include "ithread.h" +#include "ih264_platform_macros.h" +#include "ih264_defs.h" +#include "ih264e_half_pel.h" +#include "ih264_macros.h" +#include "ih264e_half_pel.h" +#include "ih264e_debug.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ +/* +******************************************************************************* +* +* @brief +* Interprediction luma filter for horizontal input(Filter run for width = 17 +* and height =16) +* +* @par Description: +* Applies a 6 tap horizontal filter .The output is clipped to 8 
bits sec. +* 8.4.2.2.1 titled "Luma sample interpolation process" +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @returns +* None +* +* @remarks +* 16 rows are processed. In each row the first 16 outputs are computed with +* SIMD and stored as one 16 byte vector; the 17th output (pu1_dst[16]) is +* computed with scalar code from pu1_src[16..21] after pu1_src has been moved +* back by 2. +* +******************************************************************************* +*/ +void ih264e_sixtapfilter_horz_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 dst_strd) +{ + WORD32 ht; + WORD32 tmp; + + __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b; + __m128i src_r0_t1_16x8b, src_r1_t1_16x8b; + + __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b; + __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b; + + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + __m128i const_val16_8x16b; + + ht = 16; + pu1_src -= 2; // the filter input starts from x[-2] (till x[3]) + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + const_val16_8x16b = _mm_set1_epi16(16); + + //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9..... + //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9..... (same row, loaded from pu1_src + 8) + //b0 is same as a8. Similarly other bn pixels are same as a(n+8) pixels. 
+ + do + { + src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 + src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 + + res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1 + //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1 + res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1 + //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 + + res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3 + //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3 + res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3 + //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3 + + src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); 
//a4 a5 a6 a7 a8 a9....a15 0 0 0 0 + src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0 + + src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0 + src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0 + + src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12 + src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12 + + res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5 + //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5 + res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5 + //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5 + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b); + res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b); + res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b); + res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b); + res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b); + + tmp = ((pu1_src[18] + pu1_src[19]) << 2) - pu1_src[17] - pu1_src[20]; + tmp = pu1_src[16] + pu1_src[21] + (tmp << 2) + tmp; + + res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits. 
+ res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5); + tmp = (tmp + 16) >> 5; + + src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b); + pu1_dst[16] = CLIP_U8(tmp); + + _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b); + + ht--; + pu1_src += src_strd; + pu1_dst += dst_strd; + } + while(ht > 0); +} + +/* +******************************************************************************* +* +* @brief +* This function implements a two stage cascaded six tap filter. It +* applies the six tap filter in the vertical direction on the +* predictor values, followed by applying the same filter in the +* horizontal direction on the output of the first stage. The six tap +* filtering operation is described in sec 8.4.2.2.1 titled "Luma sample +* interpolation process" (Filter run for width = 17 and height =17) +* +* @par Description: +* The function interpolates the predictors first in the vertical direction +* and then in the horizontal direction to output the (1/2,1/2). The output +* of the first stage of the filter is stored in the buffer pointed to by +* pi4_pred1 in 16 bit precision. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst1 +* UWORD8 pointer to the destination(Vertical filtered output) +* +* @param[out] pu1_dst2 +* UWORD8 pointer to the destination(output after applying horizontal filter +* to the intermediate vertical output) +* +* @param[in] src_strd +* integer source stride + +* @param[in] dst_strd +* integer destination stride, applied to both pu1_dst1 and pu1_dst2 +* +* @param[in] pi4_pred1 +* Pointer to the intermediate buffer; it is accessed here through a WORD16 +* pointer and holds the 16 bit output of the vertical filter +* +* @param[in] pred1_strd +* integer stride of pi4_pred1; it is doubled before being used to step the +* WORD16 view (presumably because it is given in WORD32 units - confirm with +* the caller) +* +* @returns +* None +* +* @remarks +* Both stages run for 17 rows. The vertical stage writes 22 intermediate +* values per row. In the second stage the first 16 outputs of each row are +* computed with SIMD and the 17th (index 16) with scalar code. +* +******************************************************************************* +*/ +void ih264e_sixtap_filter_2dvh_vert_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_dst1, + UWORD8 *pu1_dst2, + WORD32 src_strd, + WORD32 dst_strd, + WORD32 *pi4_pred1, + WORD32 pred1_strd) +{ + WORD32 ht; + WORD16 *pi2_pred1; + + ht = 17; + pi2_pred1 = (WORD16 *)pi4_pred1; + pred1_strd = pred1_strd << 1; + + // Vertical 6-tap filter + { + __m128i src1_r0_16x8b, src1_r1_16x8b, src1_r2_16x8b; + __m128i src1_r3_16x8b, src1_r4_16x8b, src1_r5_16x8b; + __m128i src2_r0_16x8b, src2_r1_16x8b, src2_r2_16x8b; + __m128i src2_r3_16x8b, src2_r4_16x8b, src2_r5_16x8b; + + __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b; + + __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b; + __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b; + + coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + + pu1_src -= 2; + pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3]) + + // Loading first five rows to start first row processing. + // 22 values loaded in each row. 
+ src1_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + src2_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); + pu1_src += src_strd; + + src1_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + src2_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); + pu1_src += src_strd; + + src1_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + src2_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); + pu1_src += src_strd; + + src1_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + src2_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); + pu1_src += src_strd; + + src1_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + src2_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); + pu1_src += src_strd; + + do + { + src1_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + src2_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14)); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src1_r0_16x8b, src1_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src1_r2_16x8b, src1_r3_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src1_r4_16x8b, src1_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); + + _mm_storeu_si128((__m128i *)pi2_pred1, res_t1_8x16b); + + src_r0r1_16x8b = _mm_unpackhi_epi8(src1_r0_16x8b, src1_r1_16x8b); + src_r2r3_16x8b = _mm_unpackhi_epi8(src1_r2_16x8b, src1_r3_16x8b); + src_r4r5_16x8b = _mm_unpackhi_epi8(src1_r4_16x8b, src1_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); + + _mm_storeu_si128((__m128i 
*)(pi2_pred1 + 8), res_t1_8x16b); + + src_r0r1_16x8b = _mm_unpacklo_epi8(src2_r0_16x8b, src2_r1_16x8b); + src_r2r3_16x8b = _mm_unpacklo_epi8(src2_r2_16x8b, src2_r3_16x8b); + src_r4r5_16x8b = _mm_unpacklo_epi8(src2_r4_16x8b, src2_r5_16x8b); + + res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b); + res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b); + res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b); + + res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b); + res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b); + + _mm_storeu_si128((__m128i *)(pi2_pred1 + 14), res_t1_8x16b); + + src1_r0_16x8b = src1_r1_16x8b; + src1_r1_16x8b = src1_r2_16x8b; + src1_r2_16x8b = src1_r3_16x8b; + src1_r3_16x8b = src1_r4_16x8b; + src1_r4_16x8b = src1_r5_16x8b; + + src2_r0_16x8b = src2_r1_16x8b; + src2_r1_16x8b = src2_r2_16x8b; + src2_r2_16x8b = src2_r3_16x8b; + src2_r3_16x8b = src2_r4_16x8b; + src2_r4_16x8b = src2_r5_16x8b; + + ht--; + pu1_src += src_strd; + pi2_pred1 += pred1_strd; + } + while(ht > 0); + } + + ht = 17; + pi2_pred1 = (WORD16 *)pi4_pred1; + + // Horizontal 6-tap filter + { + WORD32 temp; + + __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b; + __m128i src_r4_8x16b, src_r5_8x16b; + __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b; + __m128i res_vert1_8x16b, res_vert2_8x16b, res_16x8b; + + __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b; + __m128i res_c0_8x16b, res_c1_8x16b; + + __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b; + __m128i const_val512_4x32b, const_val16_8x16b; + + coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //c0 c1 c0 c1 c0 c1 c0 c1 + coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //c2 c3 c2 c3 c2 c3 c2 c3 + coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //c4 c5 c4 c5 c4 c5 c4 c5 + //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20 + const_val512_4x32b = _mm_set1_epi32(512); + const_val16_8x16b = _mm_set1_epi16(16); + + do + { + src_r0_8x16b = _mm_loadu_si128((__m128i 
*)(pi2_pred1)); + src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 1)); + src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 2)); + src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 3)); + src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 4)); + src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 5)); + + res_vert1_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b); + res_vert1_8x16b = _mm_srai_epi16(res_vert1_8x16b, 5); //shifting right by 5 bits. + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + + src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8)); + src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 1)); + src_r2_8x16b = _mm_loadu_si128((__m128i 
*)(pi2_pred1 + 8 + 2)); + src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 3)); + src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 4)); + src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 5)); + + res_vert2_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b); + res_vert2_8x16b = _mm_srai_epi16(res_vert2_8x16b, 5); //shifting right by 5 bits. + + src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10); + + src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b); + src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b); + src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b); + + res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b); + res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b); + res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b); + + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b); + res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b); + res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b); + res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10); + + res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b); + + res_16x8b = _mm_packus_epi16(res_vert1_8x16b, res_vert2_8x16b); + _mm_storeu_si128((__m128i *)pu1_dst1, res_16x8b); + pu1_dst1[16] = CLIP_U8((pi2_pred1[18] + 16) >> 5); + + res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b); + _mm_storeu_si128((__m128i *)pu1_dst2, 
res_16x8b); + temp = ((pi2_pred1[18] + pi2_pred1[19]) << 2) - pi2_pred1[17] - pi2_pred1[20]; + temp = pi2_pred1[16] + pi2_pred1[21] + (temp << 2) + temp; + pu1_dst2[16] = CLIP_U8((temp + 512) >> 10); + + ht--; + pi2_pred1 += pred1_strd; + pu1_dst1 += dst_strd; + pu1_dst2 += dst_strd; + } + while(ht > 0); + } +} diff --git a/encoder/x86/ih264e_intra_modes_eval_ssse3.c b/encoder/x86/ih264e_intra_modes_eval_ssse3.c new file mode 100755 index 0000000..657921f --- /dev/null +++ b/encoder/x86/ih264e_intra_modes_eval_ssse3.c @@ -0,0 +1,1259 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ih264e_intra_modes_eval_ssse3.c +* +* @brief +* This file contains definitions of routines that perform rate distortion +* analysis on a macroblock if they are to be coded as intra. 
+* +* @author +* Ittiam +* +* @par List of Functions: +* ih264e_evaluate_intra16x16_modes_ssse3 +* ih264e_evaluate_intra_4x4_modes_ssse3 +* ih264e_evaluate_intra_chroma_modes_ssse3 +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <string.h> +#include <limits.h> +#include <assert.h> +#include <immintrin.h> + +/* User include files */ +#include "ih264e_config.h" +#include "ih264_typedefs.h" +#include "ih264e_defs.h" +#include "iv2.h" +#include "ive2.h" +#include "ih264_debug.h" +#include "ih264_defs.h" +#include "ih264_macros.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_structs.h" +#include "ih264_common_tables.h" +#include "ih264_trans_quant_itrans_iquant.h" +#include "ih264_inter_pred_filters.h" +#include "ih264_mem_fns.h" +#include "ih264_padding.h" +#include "ih264_intra_pred_filters.h" +#include "ih264_deblk_edge_filters.h" +#include "ime_distortion_metrics.h" +#include "ih264e_error.h" +#include "ih264e_bitstream.h" +#include "ime_structs.h" + +#include "irc_cntrl_param.h" +#include "irc_frame_info_collector.h" +#include "ih264e_rate_control.h" + +#include "ih264e_structs.h" +#include "ih264e_intra_modes_eval.h" +#include "ih264e_globals.h" +#include "ime_platform_macros.h" + + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ +/** +****************************************************************************** +* +* @brief +* evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the +* prediction. 
+* +* @par Description +* This function evaluates first three 16x16 modes and compute corresponding +* SAD and returns the buffer predicted with best mode. +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[in] pu1_ngbr_pels_i16 +* UWORD8 pointer to neighbouring pels +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_n_avblty +* availability of neighbouring pixels +* +* @param[in] u4_intra_mode +* pointer to the variable in which best mode is returned +* +* @param[in] pu4_sadmin +* pointer to the variable in which minimum sad is returned +* +* @param[in] u4_valid_intra_modes +* says what all modes are valid +* +* @return +* None +* +****************************************************************************** +*/ +void ih264e_evaluate_intra16x16_modes_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_ngbr_pels_i16, + UWORD8 *pu1_dst, + UWORD32 src_strd, + UWORD32 dst_strd, + WORD32 n_avblty, + UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes) +{ + UWORD8 *pu1_src_temp; + + WORD32 left, top, horz_flag, vert_flag, dc_flag; + WORD32 sad_vert, sad_horz, sad_dc, min_sad; + + WORD32 cnt, dcval; + WORD32 src_strd2, src_strd3, src_strd4; + WORD32 dst_strd2, dst_strd3, dst_strd4; + + __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b; + __m128i val1_16x8b, val2_16x8b, val3_16x8b, val4_16x8b; + __m128i sad1_8x16b, sad2_8x16b, sad3_8x16b, sad4_8x16b; + + __m128i sad_8x16b, val_16x8b, zero_vector; + + sad_vert = INT_MAX; + sad_horz = INT_MAX; + sad_dc = INT_MAX; + + src_strd2 = src_strd << 1; + src_strd4 = src_strd << 2; + src_strd3 = src_strd + src_strd2; + + dst_strd2 = dst_strd << 1; + dst_strd4 = dst_strd << 2; + dst_strd3 = dst_strd + dst_strd2; + + left = (n_avblty & LEFT_MB_AVAILABLE_MASK); + top = (n_avblty & TOP_MB_AVAILABLE_MASK) >> 2; + + zero_vector = _mm_setzero_si128(); + + horz_flag = 
left && ((u4_valid_intra_modes & 02) != 0); + vert_flag = top && ((u4_valid_intra_modes & 01) != 0); + dc_flag = (u4_valid_intra_modes & 04) != 0; + + if(horz_flag) + { + pu1_src_temp = pu1_src; + + val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[15]); + val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[14]); + val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[13]); + val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[12]); + + src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp); + src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd)); + src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2)); + src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3)); + + sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b); + sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b); + sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b); + sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b); + + sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b); + sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b); + + cnt = 11; + sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b); + do + { + pu1_src_temp += src_strd4; + + val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]); + val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]); + val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]); + val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]); + + src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp); + src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd)); + src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2)); + src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3)); + + sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b); + sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b); + sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b); + sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b); + + sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b); + sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b); + sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b); 
+ + cnt -= 4; + sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b); + } + while(cnt >= 0); + + sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b); + sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b); + sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b); + + sad_horz = _mm_extract_epi16(sad_8x16b, 0); + } + + if(vert_flag) + { + pu1_src_temp = pu1_src; + + val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17)); + + src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp); + src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd)); + src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2)); + src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3)); + + sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b); + sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b); + sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b); + sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b); + + sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b); + sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b); + + cnt = 11; + sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b); + do + { + pu1_src_temp += src_strd4; + + src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp); + src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd)); + src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2)); + src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3)); + + sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b); + sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b); + sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b); + sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b); + + sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b); + sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b); + sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b); + + cnt -= 4; + sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b); + } + while(cnt >= 0); + + sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b); + sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b); + sad_8x16b = 
_mm_hadd_epi16(sad_8x16b, sad_8x16b); + + sad_vert = _mm_extract_epi16(sad_8x16b, 0); + } + + dcval = 0; + + if(left) + { + val_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels_i16); + dcval += 8; + + sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector); + dcval += _mm_extract_epi16(sad1_8x16b, 0); + dcval += _mm_extract_epi16(sad1_8x16b, 4); + } + if(top) + { + val_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17)); + dcval += 8; + + sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector); + dcval += _mm_extract_epi16(sad1_8x16b, 0); + dcval += _mm_extract_epi16(sad1_8x16b, 4); + } + dcval = dcval >> (3 + left + top); + dcval += ((left == 0) & (top == 0)) << 7; + + if(dc_flag) + { + pu1_src_temp = pu1_src; + val1_16x8b = _mm_set1_epi8(dcval); + + src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp); + src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd)); + src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2)); + src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3)); + + sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b); + sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b); + sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b); + sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b); + + sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b); + sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b); + + cnt = 12; + sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b); + do + { + pu1_src_temp += src_strd4; + + src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp); + src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd)); + src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2)); + src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3)); + + sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b); + sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b); + sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b); + sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b); + + sad1_8x16b = _mm_packs_epi32(sad1_8x16b, 
sad2_8x16b); + sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b); + sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b); + + cnt -= 4; + sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b); + } + while(cnt > 0); + + sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b); + sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b); + sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b); + + sad_dc = _mm_extract_epi16(sad_8x16b, 0); + } + + // Doing prediction for minimum SAD + min_sad = MIN3(sad_horz, sad_vert, sad_dc); + if(min_sad < *pu4_sadmin) + { + *pu4_sadmin = min_sad; + if(min_sad == sad_vert) + { + *u4_intra_mode = VERT_I16x16; + val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17)); + cnt = 15; + do + { + _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val1_16x8b); + + cnt -= 4; + pu1_dst += dst_strd4; + } + while(cnt > 0); + } + else if(min_sad == sad_horz) + { + *u4_intra_mode = HORZ_I16x16; + cnt = 15; + do + { + val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]); + val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]); + val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]); + val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]); + + _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val2_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val3_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val4_16x8b); + + cnt -= 4; + pu1_dst += dst_strd4; + } + while(cnt >= 0); + } + else + { + *u4_intra_mode = DC_I16x16; + val1_16x8b = _mm_set1_epi8(dcval); + cnt = 15; + do + { + _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b); + _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), 
val1_16x8b); + + cnt -= 4; + pu1_dst += dst_strd4; + } + while(cnt > 0); + } + } +} + +/** +****************************************************************************** +* +* @brief :Evaluate best intra 4x4 mode and do the prediction. +* +* @par Description +* This function evaluates intra 4x4 modes, computes corresponding sad +* and returns the buffer predicted with best mode. +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +** @param[in] pu1_ngbr_pels +* UWORD8 pointer to neighbouring pels +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_n_avblty +* availability of neighbouring pixels +* +* @param[in] u4_intra_mode +* Pointer to the variable in which best mode is returned +* +* @param[in] pu4_sadmin +* Pointer to the variable in which minimum cost is returned +* +* @param[in] u4_valid_intra_modes +* Says what all modes are valid +* +* * @param[in] u4_lambda +* Lamda value for computing cost from SAD +* +* @param[in] u4_predictd_mode +* Predicted mode for cost computation +* +* @return none +* +****************************************************************************** +*/ +void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_ngbr_pels, + UWORD8 *pu1_dst, + UWORD32 src_strd, + UWORD32 dst_strd, + WORD32 u4_n_avblty, + UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes, + UWORD32 u4_lambda, + UWORD32 u4_predictd_mode) +{ + WORD32 left, top; + WORD32 sad[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX, + INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + WORD32 cost[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX, + INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + + WORD32 min_cost; + WORD32 lambda4 = u4_lambda << 2; + WORD32 dst_strd2, dst_strd3; + + __m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b; + __m128i pred1_16x8b, pred2_16x8b, pred3_16x8b, 
pred4_16x8b; + __m128i pred5_16x8b, pred6_16x8b, pred7_16x8b, pred8_16x8b; + __m128i shuffle_16x8b, zero_vector, mask_low_32b; + + left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK); + top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2; + + dst_strd2 = dst_strd << 1; + dst_strd3 = dst_strd + dst_strd2; + + // loading the 4x4 source block and neighbouring pixels + { + __m128i row1_16x8b, row2_16x8b; + + row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + left_top_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels); + + pu1_src += src_strd << 1; + src_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b); + + row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src); + row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); + zero_vector = _mm_setzero_si128(); + + row1_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b); + src_16x8b = _mm_unpacklo_epi64(src_16x8b, row1_16x8b); + } + + /* Computing SADs*/ + if(u4_valid_intra_modes & 1)/* VERT mode valid ????*/ + { + pred0_16x8b = _mm_srli_si128(left_top_16x8b, 5); + pred0_16x8b = _mm_shuffle_epi32(pred0_16x8b, 0); + sad_8x16b = _mm_sad_epu8(src_16x8b, pred0_16x8b); + + sad[VERT_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + cost[VERT_I4x4] = sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ? u4_lambda: lambda4); + } + + if(u4_valid_intra_modes & 2)/* HORZ mode valid ????*/ + { + shuffle_16x8b = _mm_setr_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0); + pred1_16x8b = _mm_shuffle_epi8(left_top_16x8b, shuffle_16x8b); + + sad_8x16b = _mm_sad_epu8(src_16x8b, pred1_16x8b); + + sad[HORZ_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + cost[HORZ_I4x4] = sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ? 
u4_lambda: lambda4); + } + + if(u4_valid_intra_modes & 4)/* DC mode valid ????*/ + { + if(top + left) + { + WORD32 shft = 1, dcval = 0; + + __m128i val_16x8b, temp_16x8b, temp_8x16b; + + val_16x8b = _mm_setzero_si128(); + + if(top) + { + temp_16x8b = _mm_srli_si128(left_top_16x8b, 5); + val_16x8b = _mm_alignr_epi8(temp_16x8b, val_16x8b, 4); + shft ++; + dcval += 2; + } + if(left) + { + val_16x8b = _mm_alignr_epi8(left_top_16x8b, val_16x8b, 4); + shft++; + dcval += 2; + } + + temp_8x16b = _mm_sad_epu8(val_16x8b, zero_vector); + dcval += _mm_extract_epi16(temp_8x16b, 4); + dcval = dcval >> shft; + pred2_16x8b = _mm_set1_epi8(dcval); + } + else + pred2_16x8b = _mm_set1_epi8(128); + + sad_8x16b = _mm_sad_epu8(src_16x8b, pred2_16x8b); + + sad[DC_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + cost[DC_I4x4] = sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ? u4_lambda: lambda4); + } + + if(u4_valid_intra_modes > 7)/* if modes other than VERT, HORZ and DC are valid ????*/ + { + __m128i w11_16x8b, w121_16x8b; + __m128i temp1_16x8b, temp2_16x8b; + + /* Performing FILT121 and FILT11 operation for all neighbour values*/ + { + __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b; + __m128i const_2_8x16b; + + const_2_8x16b = _mm_set1_epi16(2); + + temp1_8x16b = _mm_unpacklo_epi8(left_top_16x8b, zero_vector); //l3 l2 l1 l0 tl t0 t1 t2 + temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2); // 0 l3 l2 l1 l0 tl t0 t1 + temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5); //l3 l3 l2 l1 l0 tl t0 t1 + + temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b); //l3+l3 l3+l2 l2+l1... t1+t2 + temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2); //l3+l3 l3+l3 l3+l2... t0+t1 + temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5); + temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b); //4*l3 l3+2*l3+l2 l3+2*l2+l1... t0+2*t1+t2 + + temp1_8x16b = _mm_add_epi16(const_2_8x16b, temp1_8x16b); //4*l3+2 3*l3+l2+2 l3+2*l2+l1+2.. 
t0+2*t1+t2+2 + temp1_8x16b = _mm_srli_epi16(temp1_8x16b, 2); + + temp1_16x8b = _mm_srli_si128(left_top_16x8b, 1); + w11_16x8b = _mm_avg_epu8(left_top_16x8b, temp1_16x8b); + + temp2_16x8b = _mm_srli_si128(left_top_16x8b, 6); + temp2_8x16b = _mm_unpacklo_epi8(temp2_16x8b, zero_vector); //t1 t2 t3 t4 t5 t6 t7 0 + temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2); //t2 t3 t4 t5 t6 t7 0 0 + temp3_8x16b = _mm_shufflehi_epi16(temp3_8x16b, 0xd4); //t2 t3 t4 t5 t6 t7 t7 0 + + temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b); //t1+t2 t2+t3... t6+t7 t7+t7 0 + temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2); //t2+t3 t3+t4... t7+t7 0 0 + temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b); //t1+2*t2+t3 t2+2*t3+t4.. t6+2*t7+t7 t7+t7 0 + + temp2_8x16b = _mm_add_epi16(const_2_8x16b, temp2_8x16b); //t1+2*t2+t3+2 t2+2*t3+t4+2 t3+2*t4+t5+2... t6+2*t7+t7+2 t7+t7+2 2 + temp2_8x16b = _mm_srli_epi16(temp2_8x16b, 2); + + w121_16x8b = _mm_packus_epi16(temp1_8x16b, temp2_8x16b); + } + + if(u4_valid_intra_modes & 8)/* DIAG_DL */ + { + shuffle_16x8b = _mm_setr_epi8( 7, 8, 9, 10, + 8, 9, 10, 11, + 9, 10, 11, 12, + 10, 11, 12, 13); + pred3_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b); + sad_8x16b = _mm_sad_epu8(src_16x8b, pred3_16x8b); + + sad[DIAG_DL_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + cost[DIAG_DL_I4x4] = sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ? u4_lambda: lambda4); + } + + if(u4_valid_intra_modes & 16)/* DIAG_DR */ + { + shuffle_16x8b = _mm_setr_epi8(5, 6, 7, 8, + 4, 5, 6, 7, + 3, 4, 5, 6, + 2, 3, 4, 5); + pred4_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b); + sad_8x16b = _mm_sad_epu8(src_16x8b, pred4_16x8b); + + sad[DIAG_DR_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + cost[DIAG_DR_I4x4] = sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ? 
u4_lambda: lambda4); + } + + if(u4_valid_intra_modes & 32)/* VERT_R mode valid ????*/ + { + temp1_16x8b = _mm_srli_si128(w121_16x8b, 1); + temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, w11_16x8b); + shuffle_16x8b = _mm_setr_epi8(12, 13, 14, 15, + 4, 5, 6, 7, + 3, 12, 13, 14, + 2, 4, 5, 6); + pred5_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b); + sad_8x16b = _mm_sad_epu8(src_16x8b, pred5_16x8b); + + sad[VERT_R_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + cost[VERT_R_I4x4] = sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ? u4_lambda: lambda4); + } + + if(u4_valid_intra_modes & 64)/* HORZ_D mode valid ????*/ + { + temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b); + shuffle_16x8b = _mm_setr_epi8(11, 5, 6, 7, + 10, 4, 11, 5, + 9, 3, 10, 4, + 8, 2, 9, 3); + pred6_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b); + sad_8x16b = _mm_sad_epu8(src_16x8b, pred6_16x8b); + + sad[HORZ_D_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + cost[HORZ_D_I4x4] = sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ? u4_lambda: lambda4); + } + + if(u4_valid_intra_modes & 128)/* VERT_L mode valid ????*/ + { + temp1_16x8b = _mm_srli_si128(w121_16x8b, 5); + temp2_16x8b = _mm_srli_si128(w11_16x8b, 5); + temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, temp2_16x8b); + shuffle_16x8b = _mm_setr_epi8(8, 9, 10, 11, + 2, 3, 4, 5, + 9, 10, 11, 12, + 3, 4, 5, 6); + pred7_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b); + sad_8x16b = _mm_sad_epu8(src_16x8b, pred7_16x8b); + + sad[VERT_L_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + cost[VERT_L_I4x4] = sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ? 
u4_lambda: lambda4); + } + + if(u4_valid_intra_modes & 256)/* HORZ_U mode valid ????*/ + { + temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b); + shuffle_16x8b = _mm_setr_epi8(10, 3, 9, 2, + 9, 2, 8, 1, + 8, 1, 0, 0, + 0, 0, 0, 0); + pred8_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b); + sad_8x16b = _mm_sad_epu8(src_16x8b, pred8_16x8b); + + sad[HORZ_U_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + cost[HORZ_U_I4x4] = sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ? u4_lambda: lambda4); + } + + min_cost = MIN3(MIN3(cost[0], cost[1], cost[2]), + MIN3(cost[3], cost[4], cost[5]), + MIN3(cost[6], cost[7], cost[8])); + } + else + { /*Only first three modes valid*/ + min_cost = MIN3(cost[0], cost[1], cost[2]); + } + + *pu4_sadmin = min_cost; + + if(min_cost == cost[0]) + { + *u4_intra_mode = VERT_I4x4; + } + else if(min_cost == cost[1]) + { + *u4_intra_mode = HORZ_I4x4; + pred0_16x8b = pred1_16x8b; + } + else if(min_cost == cost[2]) + { + *u4_intra_mode = DC_I4x4; + pred0_16x8b = pred2_16x8b; + } + else if(min_cost == cost[3]) + { + *u4_intra_mode = DIAG_DL_I4x4; + pred0_16x8b = pred3_16x8b; + } + else if(min_cost == cost[4]) + { + *u4_intra_mode = DIAG_DR_I4x4; + pred0_16x8b = pred4_16x8b; + } + else if(min_cost == cost[5]) + { + *u4_intra_mode = VERT_R_I4x4; + pred0_16x8b = pred5_16x8b; + } + else if(min_cost == cost[6]) + { + *u4_intra_mode = HORZ_D_I4x4; + pred0_16x8b = pred6_16x8b; + } + else if(min_cost == cost[7]) + { + *u4_intra_mode = VERT_L_I4x4; + pred0_16x8b = pred7_16x8b; + } + else if(min_cost == cost[8]) + { + *u4_intra_mode = HORZ_U_I4x4; + pred0_16x8b = pred8_16x8b; + } + + mask_low_32b = _mm_set1_epi8(0xff); + mask_low_32b = _mm_srli_si128(mask_low_32b, 12); + + _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)pu1_dst); + pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4); + _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd)); + pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4); + 
_mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2)); + pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4); + _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3)); + +} + +/** +****************************************************************************** +* +* @brief +* Evaluate best intra chroma mode (among VERT, HORZ and DC) and do the prediction. +* +* @par Description +* This function evaluates first three intra chroma modes and compute corresponding sad +* and return the buffer predicted with best mode. +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +** @param[in] pu1_ngbr_pels +* UWORD8 pointer to neighbouring pels +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_n_avblty +* availability of neighbouring pixels +* +* @param[in] u4_intra_mode +* pointer to the variable in which best mode is returned +* +* @param[in] pu4_sadmin +* pointer to the variable in which minimum sad is returned +* +* @param[in] u4_valid_intra_modes +* says what all modes are valid +* +* @return +* none +* +****************************************************************************** +*/ + +void ih264e_evaluate_intra_chroma_modes_ssse3(UWORD8 *pu1_src, + UWORD8 *pu1_ngbr_pels, + UWORD8 *pu1_dst, + UWORD32 src_strd, + UWORD32 dst_strd, + WORD32 u4_n_avblty, + UWORD32 *u4_intra_mode, + WORD32 *pu4_sadmin, + UWORD32 u4_valid_intra_modes) +{ + WORD32 left, top; + WORD32 sad_vert = INT_MAX, sad_horz = INT_MAX, sad_dc = INT_MAX, min_sad; + + __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b; + __m128i src5_16x8b, src6_16x8b, src7_16x8b, src8_16x8b; + + __m128i top_16x8b, left_16x8b; + __m128i pred1_16x8b, pred2_16x8b; + __m128i tmp1_8x16b, tmp2_8x16b, sad_8x16b; + + left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK); + top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2; + + //Loading source + { + 
src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src2_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src3_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src4_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src5_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src6_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src7_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + pu1_src += src_strd; + src8_16x8b = _mm_loadu_si128((__m128i *)pu1_src); + } + + if(left) + { + left_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels); + + if(u4_valid_intra_modes & 02) //If HORZ mode is valid + { + __m128i left_tmp_16x8b, left_sh_16x8b; + __m128i const_14_15_16x8b; + + const_14_15_16x8b = _mm_set1_epi16(0x0f0e); + left_sh_16x8b = _mm_slli_si128(left_16x8b, 2); + + pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 1 + pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2 + tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred2_16x8b); + + left_tmp_16x8b = _mm_slli_si128(left_16x8b, 4); + left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4); + sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b); + + pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 3 + pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 4 + tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred2_16x8b); + + left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 5 + pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 6 + tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b); + tmp2_8x16b = 
_mm_sad_epu8(src6_16x8b, pred2_16x8b); + + left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 7 + pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 8 + tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b); + + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + sad_horz = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + } + } + + if(top) + { + UWORD8 *pu1_top; + + pu1_top = pu1_ngbr_pels + 2 * BLK8x8SIZE + 2; + top_16x8b = _mm_loadu_si128((__m128i *)pu1_top); + + if(u4_valid_intra_modes & 04) //If VERT mode is valid + { + tmp1_8x16b = _mm_sad_epu8(src1_16x8b, top_16x8b); + tmp2_8x16b = _mm_sad_epu8(src2_16x8b, top_16x8b); + sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src3_16x8b, top_16x8b); + tmp2_8x16b = _mm_sad_epu8(src4_16x8b, top_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src5_16x8b, top_16x8b); + tmp2_8x16b = _mm_sad_epu8(src6_16x8b, top_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src7_16x8b, top_16x8b); + tmp2_8x16b = _mm_sad_epu8(src8_16x8b, top_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + sad_vert = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + } + } + + if(u4_valid_intra_modes & 01) //If DC mode is valid + { + if(left && top) + { + WORD32 left_up_u, left_down_u, left_up_v, left_down_v; + WORD32 top_left_u, top_right_u, top_left_v, top_right_v; + WORD32 dc_1u, 
dc_1v, dc_2u, dc_2v; + + __m128i val_sh_16x8b; + __m128i intrlv_mask_8x16b, zero_vector; + + intrlv_mask_8x16b = _mm_set1_epi16(0x00ff); + zero_vector = _mm_setzero_si128(); + + val_sh_16x8b = _mm_srli_si128(left_16x8b, 1); + + tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b); + tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b); + tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b); + tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b); + + left_up_u = _mm_extract_epi16(tmp1_8x16b, 4); + left_up_v = _mm_extract_epi16(tmp2_8x16b, 4); + left_down_u = _mm_extract_epi16(tmp1_8x16b, 0); + left_down_v = _mm_extract_epi16(tmp2_8x16b, 0); + + val_sh_16x8b = _mm_srli_si128(top_16x8b, 1); + + tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b); + tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b); + tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b); + tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b); + + top_left_u = _mm_extract_epi16(tmp1_8x16b, 0); + top_left_v = _mm_extract_epi16(tmp2_8x16b, 0); + top_right_u = _mm_extract_epi16(tmp1_8x16b, 4); + top_right_v = _mm_extract_epi16(tmp2_8x16b, 4); + + // First four rows + dc_1u = (left_up_u + top_left_u + 4) >> 3; + dc_1v = (left_up_v + top_left_v + 4) >> 3; + dc_2u = (top_right_u + 2) >> 2; + dc_2v = (top_right_v + 2) >> 2; + + pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, + dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v); + + tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + // Second four rows + dc_1u = (left_down_u + 2) >> 2; + dc_1v = (left_down_v + 2) >> 2; + dc_2u = (left_down_u + top_right_u + 4) >> 3; + dc_2v = 
(left_down_v + top_right_v + 4) >> 3; + + pred2_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, + dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v); + + tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b); + tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b); + tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + } + else if(left) + { + WORD32 left_up_u, left_down_u, left_up_v, left_down_v; + WORD32 dc_u, dc_v; + + __m128i left_sh_16x8b; + __m128i intrlv_mask_8x16b, zero_vector; + + intrlv_mask_8x16b = _mm_set1_epi16(0x00ff); + zero_vector = _mm_setzero_si128(); + + left_sh_16x8b = _mm_srli_si128(left_16x8b, 1); + + tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b); + tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_sh_16x8b); + tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b); + tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b); + + left_up_u = _mm_extract_epi16(tmp1_8x16b, 4); + left_up_v = _mm_extract_epi16(tmp2_8x16b, 4); + left_down_u = _mm_extract_epi16(tmp1_8x16b, 0); + left_down_v = _mm_extract_epi16(tmp2_8x16b, 0); + + // First four rows + dc_u = (left_up_u + 2) >> 2; + dc_v = (left_up_v + 2) >> 2; + + pred1_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8)); + + tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + // Second four rows + dc_u = (left_down_u + 2) 
>> 2; + dc_v = (left_down_v + 2) >> 2; + + pred2_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8)); + + tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b); + tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b); + tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + } + else if(top) + { + WORD32 top_left_u, top_right_u, top_left_v, top_right_v; + WORD32 dc_1u, dc_1v, dc_2u, dc_2v; + + __m128i top_sh_16x8b; + __m128i intrlv_mask_8x16b, zero_vector; + + intrlv_mask_8x16b = _mm_set1_epi16(0x00ff); + zero_vector = _mm_setzero_si128(); + + top_sh_16x8b = _mm_srli_si128(top_16x8b, 1); + + tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b); + tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_sh_16x8b); + tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b); + tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b); + + top_left_u = _mm_extract_epi16(tmp1_8x16b, 0); + top_left_v = _mm_extract_epi16(tmp2_8x16b, 0); + top_right_u = _mm_extract_epi16(tmp1_8x16b, 4); + top_right_v = _mm_extract_epi16(tmp2_8x16b, 4); + + dc_1u = (top_left_u + 2) >> 2; + dc_1v = (top_left_v + 2) >> 2; + dc_2u = (top_right_u + 2) >> 2; + dc_2v = (top_right_v + 2) >> 2; + + pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, + dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v); + + tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = 
_mm_add_epi16(sad_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + } + else + { + pred1_16x8b = _mm_set1_epi8(128); + + tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b); + tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b); + sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b); + + sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4); + } + } + + min_sad = MIN3(sad_horz, sad_vert, sad_dc); + + /* Finding minimum SAD and doing corresponding prediction*/ + if(min_sad < *pu4_sadmin) + { + *pu4_sadmin = min_sad; + + if(min_sad == sad_dc) + { + *u4_intra_mode = DC_CH_I8x8; + + if(!left) + pred2_16x8b = pred1_16x8b; + + _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b); + pu1_dst += dst_strd; + 
_mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b); + pu1_dst += dst_strd; + + _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b); + } + else if(min_sad == sad_horz) + { + __m128i left_sh_16x8b, const_14_15_16x8b; + + *u4_intra_mode = HORZ_CH_I8x8; + + const_14_15_16x8b = _mm_set1_epi16(0x0f0e); + + left_sh_16x8b = _mm_slli_si128(left_16x8b, 2); + pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 1 + pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2 + + _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b); + + left_16x8b = _mm_slli_si128(left_16x8b, 4); + left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4); + pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 3 + pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 4 + + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b); + + left_16x8b = _mm_slli_si128(left_16x8b, 4); + left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4); + pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 5 + pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 6 + + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b); + + left_16x8b = _mm_slli_si128(left_16x8b, 4); + left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4); + pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b); //row 7 + pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 8 + + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b); + pu1_dst += 
dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b); + } + else + { + *u4_intra_mode = VERT_CH_I8x8; + + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + pu1_dst += dst_strd; + _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); + } + } +} diff --git a/encoder/x86/ih264e_platform_macros.h b/encoder/x86/ih264e_platform_macros.h new file mode 100755 index 0000000..b4dfadd --- /dev/null +++ b/encoder/x86/ih264e_platform_macros.h @@ -0,0 +1,154 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** + ******************************************************************************* + * @file + * ih264e_platform_macros.h + * + * @brief + * Contains platform specific routines used for codec context initialization + * + * @author + * ittiam + * + * @remarks + * none + * + ******************************************************************************* + */ + + +#ifndef IH264E_PLATFORM_MACROS_H_ +#define IH264E_PLATFORM_MACROS_H_ + +/*****************************************************************************/ +/* Extern Function Declarations */ +/*****************************************************************************/ + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context based on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_generic(codec_t *ps_codec); +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* @par Description: the current routine initializes the function pointers of +* codec context based on the architecture in use +* +* @param[in] ps_codec +* Codec context pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr_ssse3(codec_t *ps_codec); +void ih264e_init_function_ptr_sse42(codec_t *ps_codec); + +/** +******************************************************************************* +* +* @brief Initialize the intra/inter/transform/deblk function pointers of +* codec context +* +* 
@par Description: the current routine initializes the function pointers of +* codec context based on the architecture in use +* +* @param[in] pv_codec +* Codec context pointer, passed as an untyped (void *) pointer +* +* @returns none +* +* @remarks none +* +******************************************************************************* +*/ +void ih264e_init_function_ptr(void *pv_codec); + +/** +******************************************************************************* +* +* @brief Determine the architecture of the encoder executing environment +* +* @par Description: This routine returns the architecture of the +* environment in which the current encoder is being tested +* +* @param[in] void +* +* @returns IV_ARCH_T +* architecture +* +* @remarks none +* +******************************************************************************* +*/ +IV_ARCH_T ih264e_default_arch(void); + +/** +******************************************************************************* +* +* @brief Data Memory Barrier, Data Synchronization Barrier +* +* +* @par Description: These functions do nothing on x86 side. But on arm platforms, +* +* Data Memory Barrier acts as a memory barrier. It ensures that all explicit +* memory accesses that appear in program order before the DMB instruction are +* observed before any explicit memory accesses that appear in program order +* after the DMB instruction. It does not affect the ordering of any other +* instructions executing on the processor +* +* Data Synchronization Barrier acts as a special kind of memory barrier. No +* instruction in program order after this instruction executes until this instruction +* completes. This instruction completes when: +* 1. All explicit memory accesses before this instruction complete. +* 2. All Cache, Branch predictor and TLB maintenance operations before +* this instruction complete. 
+* +* @param[in] void +* +* @returns void +* +* @remarks none +* +******************************************************************************* +*/ + +#endif /* IH264E_PLATFORM_MACROS_H_ */ diff --git a/encoder/x86/ime_distortion_metrics_sse42.c b/encoder/x86/ime_distortion_metrics_sse42.c new file mode 100755 index 0000000..0876788 --- /dev/null +++ b/encoder/x86/ime_distortion_metrics_sse42.c @@ -0,0 +1,1940 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** +****************************************************************************** +* @file ime_distortion_metrics_sse42.c +* +* @brief +* This file contains definitions of routines that compute distortion +* between two macro/sub blocks of identical dimensions +* +* @author +* Ittiam +* +* @par List of Functions: +* - ime_compute_sad_16x16_sse42() +* - ime_compute_sad_16x16_fast_sse42() +* - ime_compute_sad_16x16_ea8_sse42() +* - ime_compute_sad_16x8_sse42() +* - ime_calculate_sad4_prog_sse42() +* - ime_sub_pel_compute_sad_16x16_sse42() +* - ime_compute_satqd_16x16_lumainter_sse42() +* +* @remarks +* None +* +******************************************************************************* +*/ + +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ + +/* System include files */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* User include files */ +#include "ime_typedefs.h" +#include "ime_defs.h" +#include "ime_macros.h" +#include "ime_statistics.h" +#include "ime_platform_macros.h" +#include "ime_distortion_metrics.h" +#include <immintrin.h> + +/*****************************************************************************/ +/* Function Definitions */ +/*****************************************************************************/ + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 16x16 blocks +* +* @par Description +* This functions computes SAD between 2 16x16 blocks. There is a provision +* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] i4_max_sad +* integer maximum allowed distortion +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion) +{ + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i est_r0, est_r1, est_r2, est_r3; + __m128i res_r0, res_r1, res_r2, res_r3; + __m128i sad_val; + int val1, val2; + + // Row 0-3 sad calculation + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(res_r0, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + // Row 4-7 sad calculation + pu1_src += 4*src_strd; + pu1_est += 4*est_strd; + + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = 
_mm_loadu_si128((__m128i *) (pu1_est + est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(sad_val, res_r0); + sad_val = _mm_add_epi64(sad_val, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + // Row 8-11 sad calculation + pu1_src += 4*src_strd; + pu1_est += 4*est_strd; + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(sad_val, res_r0); + sad_val = _mm_add_epi64(sad_val, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + // Row 12-15 sad calculation + pu1_src += 4*src_strd; + pu1_est += 4*est_strd; + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 
+ + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(sad_val, res_r0); + sad_val = _mm_add_epi64(sad_val, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + val1 = _mm_extract_epi32(sad_val,0); + val2 = _mm_extract_epi32(sad_val, 2); + *pi4_mb_distortion = (val1+val2); + + return; +} + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 16x8 blocks +* +* +* @par Description +* This functions computes SAD between 2 16x8 blocks. There is a provision +* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] u4_max_sad +* integer maximum allowed distortion +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion) +{ + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i est_r0, est_r1, est_r2, est_r3; + __m128i res_r0, res_r1, res_r2, res_r3; + __m128i sad_val; + int val1, val2; + + // Row 0-3 sad calculation + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = 
_mm_loadu_si128((__m128i *) (pu1_est + est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(res_r0, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + // Row 4-7 sad calculation + pu1_src += 4*src_strd; + pu1_est += 4*est_strd; + + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(sad_val, res_r0); + sad_val = _mm_add_epi64(sad_val, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + val1 = _mm_extract_epi32(sad_val,0); + val2 = _mm_extract_epi32(sad_val, 2); + *pi4_mb_distortion = (val1+val2); + return; +} + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 16x16 blocks +* +* @par Description +* This functions computes SAD between 2 16x16 blocks. There is a provision +* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To +* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] i4_max_sad +* integer maximum allowed distortion +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_sad_16x16_ea8_sse42(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion) +{ + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i est_r0, est_r1, est_r2, est_r3; + __m128i res_r0, res_r1, res_r2, res_r3; + __m128i sad_val; + WORD32 val1, val2; + WORD32 i4_sad; + UWORD8 *pu1_src_temp = pu1_src + src_strd; + UWORD8 *pu1_est_temp = pu1_est + est_strd; + + // Row 0,2,4,6 sad calculation + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(res_r0, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + // Row 8,10,12,14 sad calculation + pu1_src += 8*src_strd; + pu1_est += 8*est_strd; + + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); + src_r3 = 
_mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(sad_val, res_r0); + sad_val = _mm_add_epi64(sad_val, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + pu1_src = pu1_src_temp; + pu1_est = pu1_est_temp; + + val1 = _mm_extract_epi32(sad_val, 0); + val2 = _mm_extract_epi32(sad_val, 2); + + i4_sad = val1 + val2; + if (i4_max_sad < i4_sad) + { + *pi4_mb_distortion = i4_sad; + return ; + } + // Row 1,3,5,7 sad calculation + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(sad_val, res_r0); + sad_val = _mm_add_epi64(sad_val, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + // Row 9,11,13,15 sad calculation + pu1_src += 8*src_strd; + pu1_est += 8*est_strd; + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 
4*src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(sad_val, res_r0); + sad_val = _mm_add_epi64(sad_val, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + val1 = _mm_extract_epi32(sad_val, 0); + val2 = _mm_extract_epi32(sad_val, 2); + *pi4_mb_distortion = (val1+val2); + + return; +} + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode) +* +* @par Description +* This functions computes SAD between 2 16x16 blocks by processing alternate +* rows (fast mode). For fast mode it is assumed sad obtained by processing +* alternate rows is approximately twice as that for the whole block. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] i4_max_sad +* integer maximum allowed distortion +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + WORD32 i4_max_sad, + WORD32 *pi4_mb_distortion) +{ + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i est_r0, est_r1, est_r2, est_r3; + __m128i res_r0, res_r1, res_r2, res_r3; + __m128i sad_val; + WORD32 val1, val2; + WORD32 i4_sad; + UWORD8 *pu1_src_temp = pu1_src + src_strd; + UWORD8 *pu1_est_temp = pu1_est + est_strd; + + // Row 0,2,4,6 sad calculation + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(res_r0, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + // Row 8,10,12,14 sad calculation + pu1_src += 8 * src_strd; + pu1_est += 8 * est_strd; + + src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); + src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd)); + src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * 
src_strd)); + src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd)); + + est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); + est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd)); + est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd)); + est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd)); + + res_r0 = _mm_sad_epu8(src_r0, est_r0); + res_r1 = _mm_sad_epu8(src_r1, est_r1); + res_r2 = _mm_sad_epu8(src_r2, est_r2); + res_r3 = _mm_sad_epu8(src_r3, est_r3); + + sad_val = _mm_add_epi64(sad_val, res_r0); + sad_val = _mm_add_epi64(sad_val, res_r1); + sad_val = _mm_add_epi64(sad_val, res_r2); + sad_val = _mm_add_epi64(sad_val, res_r3); + + pu1_src = pu1_src_temp; + pu1_est = pu1_est_temp; + + val1 = _mm_extract_epi32(sad_val, 0); + val2 = _mm_extract_epi32(sad_val, 2); + + i4_sad = val1 + val2; + *pi4_mb_distortion = (i4_sad<<1); + return; +} + +/** +******************************************************************************* +* +* @brief compute sad +* +* @par Description: This function computes the sad at vertices of diamond grid +* centered at reference pointer and at unit distance from it. 
+* +* @param[in] pu1_ref +* UWORD8 pointer to the reference +* +* @param[out] pu1_src +* UWORD8 pointer to the source +* +* @param[in] ref_strd +* integer reference stride +* +* @param[in] src_strd +* integer source stride +* +* @param[out] pi4_sad +* pointer to integer array evaluated sad +* +* @returns sad at all evaluated vertexes +* +* @remarks none +* +******************************************************************************* +*/ +void ime_calculate_sad4_prog_sse42(UWORD8 *pu1_ref, + UWORD8 *pu1_src, + WORD32 ref_strd, + WORD32 src_strd, + WORD32 *pi4_sad) +{ + /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */ + UWORD8 *left_ptr = pu1_ref - 1; + UWORD8 *right_ptr = pu1_ref + 1; + UWORD8 *top_ptr = pu1_ref - ref_strd; + UWORD8 *bot_ptr = pu1_ref + ref_strd; + + WORD32 val1, val2; + __m128i src, ref_left, ref_right, ref_top, ref_bot; + __m128i res_r0, res_r1, res_r2, res_r3; + __m128i sad_r0, sad_r1, sad_r2, sad_r3; + + // Row 0 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + sad_r0 = _mm_sad_epu8(src, ref_left); + sad_r1 = _mm_sad_epu8(src, ref_right); + sad_r2 = _mm_sad_epu8(src, ref_top); + sad_r3 = _mm_sad_epu8(src, ref_bot); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 1 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + 
sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 2 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 3 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 4 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) 
(top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 5 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 6 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; 
+ + // Row 7 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 8 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 9 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = 
_mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 10 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 11 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 12 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i 
*) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 13 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 14 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + pu1_src += src_strd; + left_ptr += ref_strd; + right_ptr += ref_strd; + top_ptr += ref_strd; + bot_ptr += ref_strd; + + // Row 15 sad calculation + src = 
_mm_loadu_si128((__m128i *) (pu1_src)); + ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); + ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); + ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); + ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); + + res_r0 = _mm_sad_epu8(src, ref_left); + res_r1 = _mm_sad_epu8(src, ref_right); + res_r2 = _mm_sad_epu8(src, ref_top); + res_r3 = _mm_sad_epu8(src, ref_bot); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + + val1 = _mm_extract_epi32(sad_r0, 0); + val2 = _mm_extract_epi32(sad_r0, 2); + pi4_sad[0] = (val1 + val2); + + val1 = _mm_extract_epi32(sad_r1, 0); + val2 = _mm_extract_epi32(sad_r1, 2); + pi4_sad[1] = (val1 + val2); + + val1 = _mm_extract_epi32(sad_r2, 0); + val2 = _mm_extract_epi32(sad_r2, 2); + pi4_sad[2] = (val1 + val2); + + val1 = _mm_extract_epi32(sad_r3, 0); + val2 = _mm_extract_epi32(sad_r3, 2); + pi4_sad[3] = (val1 + val2); +} + +/** +****************************************************************************** +* +* @brief computes distortion (SAD) at all subpel points about the src location +* +* @par Description +* This functions computes SAD at all points at a subpel distance from the +* current source location. 
+* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_ref_half_x +* UWORD8 pointer to half pel buffer +* +* @param[out] pu1_ref_half_y +* UWORD8 pointer to half pel buffer +* +* @param[out] pu1_ref_half_xy +* UWORD8 pointer to half pel buffer +* +* @param[in] src_strd +* integer source stride +* +* @param[in] ref_strd +* integer ref stride +* +* @param[out] pi4_sad +* integer evaluated sad +* pi4_sad[0] - half x +* pi4_sad[1] - half x - 1 +* pi4_sad[2] - half y +* pi4_sad[3] - half y - 1 +* pi4_sad[4] - half xy +* pi4_sad[5] - half xy - 1 +* pi4_sad[6] - half xy - strd +* pi4_sad[7] - half xy - 1 - strd +* +* @remarks +* +****************************************************************************** +*/ +void ime_sub_pel_compute_sad_16x16_sse42(UWORD8 *pu1_src, + UWORD8 *pu1_ref_half_x, + UWORD8 *pu1_ref_half_y, + UWORD8 *pu1_ref_half_xy, + WORD32 src_strd, + WORD32 ref_strd, + WORD32 *pi4_sad) +{ + UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1; + UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd; + UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1; + UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd; + UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1; + WORD32 val1, val2; + + __m128i src, ref_half_x, ref_half_y, ref_half_xy; + __m128i ref_half_x_left, ref_half_y_top, ref_half_xy_left, ref_half_xy_top, ref_half_xy_top_left; + __m128i res_r0, res_r1, res_r2, res_r3, res_r4, res_r5, res_r6, res_r7; + __m128i sad_r0, sad_r1, sad_r2, sad_r3, sad_r4, sad_r5, sad_r6, sad_r7; + // Row 0 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = 
_mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + sad_r0 = _mm_sad_epu8(src, ref_half_x); + sad_r1 = _mm_sad_epu8(src, ref_half_x_left); + sad_r2 = _mm_sad_epu8(src, ref_half_y); + sad_r3 = _mm_sad_epu8(src, ref_half_y_top); + sad_r4 = _mm_sad_epu8(src, ref_half_xy); + sad_r5 = _mm_sad_epu8(src, ref_half_xy_left); + sad_r6 = _mm_sad_epu8(src, ref_half_xy_top); + sad_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 1 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, 
res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 2 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top 
+= ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 3 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 4 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + 
ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + + // Row 5 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + 
ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 6 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = 
_mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 7 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = 
_mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 8 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += 
ref_strd; + + // Row 9 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 10 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + 
ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 11 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = 
_mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 12 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + 
sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 13 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + 
pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 14 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + pu1_src += src_strd; + pu1_ref_half_x += ref_strd; + pu1_ref_half_x_left += ref_strd; + pu1_ref_half_y += ref_strd; + pu1_ref_half_y_top += ref_strd; + pu1_ref_half_xy += ref_strd; + pu1_ref_half_xy_left += ref_strd; + pu1_ref_half_xy_top += ref_strd; + pu1_ref_half_xy_top_left += ref_strd; + + // Row 15 sad calculation + src = _mm_loadu_si128((__m128i *) (pu1_src)); + ref_half_x = _mm_loadu_si128((__m128i *) 
(pu1_ref_half_x)); + ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); + ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); + ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); + ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); + ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); + ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); + ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); + + res_r0 = _mm_sad_epu8(src, ref_half_x); + res_r1 = _mm_sad_epu8(src, ref_half_x_left); + res_r2 = _mm_sad_epu8(src, ref_half_y); + res_r3 = _mm_sad_epu8(src, ref_half_y_top); + res_r4 = _mm_sad_epu8(src, ref_half_xy); + res_r5 = _mm_sad_epu8(src, ref_half_xy_left); + res_r6 = _mm_sad_epu8(src, ref_half_xy_top); + res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); + + sad_r0 = _mm_add_epi64(sad_r0, res_r0); + sad_r1 = _mm_add_epi64(sad_r1, res_r1); + sad_r2 = _mm_add_epi64(sad_r2, res_r2); + sad_r3 = _mm_add_epi64(sad_r3, res_r3); + sad_r4 = _mm_add_epi64(sad_r4, res_r4); + sad_r5 = _mm_add_epi64(sad_r5, res_r5); + sad_r6 = _mm_add_epi64(sad_r6, res_r6); + sad_r7 = _mm_add_epi64(sad_r7, res_r7); + + val1 = _mm_extract_epi32(sad_r0, 0); + val2 = _mm_extract_epi32(sad_r0, 2); + pi4_sad[0] = (val1 + val2); + + val1 = _mm_extract_epi32(sad_r1, 0); + val2 = _mm_extract_epi32(sad_r1, 2); + pi4_sad[1] = (val1 + val2); + + val1 = _mm_extract_epi32(sad_r2, 0); + val2 = _mm_extract_epi32(sad_r2, 2); + pi4_sad[2] = (val1 + val2); + + val1 = _mm_extract_epi32(sad_r3, 0); + val2 = _mm_extract_epi32(sad_r3, 2); + pi4_sad[3] = (val1 + val2); + + val1 = _mm_extract_epi32(sad_r4, 0); + val2 = _mm_extract_epi32(sad_r4, 2); + pi4_sad[4] = (val1 + val2); + + val1 = _mm_extract_epi32(sad_r5, 0); + val2 = _mm_extract_epi32(sad_r5, 2); + pi4_sad[5] = (val1 + val2); + + val1 = _mm_extract_epi32(sad_r6, 0); + val2 = _mm_extract_epi32(sad_r6, 2); + pi4_sad[6] = (val1 + val2); + + 
val1 = _mm_extract_epi32(sad_r7, 0); + val2 = _mm_extract_epi32(sad_r7, 2); + pi4_sad[7] = (val1 + val2); + + return; +} +/* +* +* @brief This function computes SAD between two 16x16 blocks +* It also computes if the block will be zero after H264 transform and quant for +* Intra 16x16 blocks +* +* @param[in] pu1_src +* UWORD8 pointer to the source +* +* @param[out] pu1_dst +* UWORD8 pointer to the destination +* +* @param[in] src_strd +* integer source stride +* +* @param[in] dst_strd +* integer destination stride +* +* @param[in] pu2_thrsh +* Threshold for each element of transofrmed quantized block +* +* @param[out] pi4_mb_distortion +* integer evaluated sad +* +* @param[out] pu4_is_zero +* Poitner to store if the block is zero after transform and quantization +* +* @remarks +* +****************************************************************************** +*/ +void ime_compute_satqd_16x16_lumainter_sse42(UWORD8 *pu1_src, + UWORD8 *pu1_est, + WORD32 src_strd, + WORD32 est_strd, + UWORD16 *pu2_thrsh, + WORD32 *pi4_mb_distortion, + UWORD32 *pu4_is_zero) +{ + __m128i src_r0, src_r1, src_r2, src_r3; + __m128i est_r0, est_r1, est_r2, est_r3; + __m128i temp0, temp1, temp2, temp3, temp4; + __m128i zero = _mm_setzero_si128(); // all bits reset to zero + __m128i all_one = _mm_set1_epi8(0xFF); + __m128i sad_b1, sad_b2, threshold; + WORD16 sad_1, sad_2; + WORD32 i; + UWORD32 flag = 0; + WORD32 test1, test2; + threshold = _mm_loadu_si128((__m128i *) pu2_thrsh); + (*pi4_mb_distortion) = 0; + + for (i=0; i<4; i++) + { + src_r0 = _mm_loadl_epi64((__m128i *) pu1_src); //Row 0 - Block1 and 2 + src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2 + src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2 + src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2 + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r1 = _mm_cvtepu8_epi16(src_r1); + src_r2 = _mm_cvtepu8_epi16(src_r2); + src_r3 = 
_mm_cvtepu8_epi16(src_r3); + + est_r0 = _mm_loadl_epi64((__m128i *) pu1_est); + est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd)); + est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd)); + est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd)); + + est_r0 = _mm_cvtepu8_epi16(est_r0); + est_r1 = _mm_cvtepu8_epi16(est_r1); + est_r2 = _mm_cvtepu8_epi16(est_r2); + est_r3 = _mm_cvtepu8_epi16(est_r3); + + src_r0 = _mm_sub_epi16(src_r0, est_r0); + src_r1 = _mm_sub_epi16(src_r1, est_r1); + src_r2 = _mm_sub_epi16(src_r2, est_r2); + src_r3 = _mm_sub_epi16(src_r3, est_r3); + + src_r0 = _mm_abs_epi16(src_r0); + src_r1 = _mm_abs_epi16(src_r1); + src_r2 = _mm_abs_epi16(src_r2); + src_r3 = _mm_abs_epi16(src_r3); + + src_r0 = _mm_add_epi16(src_r0, src_r3); //s1 s4 s4 s1 a1 a4 a4 a1 + src_r1 = _mm_add_epi16(src_r1, src_r2); //s2 s3 s3 s2 a2 a3 a3 a2 + + //SAD calculation + temp0 = _mm_add_epi16(src_r0, src_r1); //s1+s2 s4+s3 s4+s3 s1+s2 a1+a2 a4+a3 a4+a3 a1+a2 + temp0 = _mm_hadd_epi16(temp0, zero); + temp0 = _mm_hadd_epi16(temp0, zero); //sad1, sad2 - 16bit values + + sad_1 = _mm_extract_epi16(temp0, 0); + sad_2 = _mm_extract_epi16(temp0, 1); + + (*pi4_mb_distortion) += sad_1 + sad_2; + + if (flag == 0) { + sad_b1 = _mm_set1_epi16((sad_1 << 1)); + sad_b2 = _mm_set1_epi16((sad_2 << 1)); + + src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1 + src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4 + + src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2 + src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3 + + src_r0 = _mm_hadd_epi16(src_r0, zero); //s1 s4 a1 a4 0 0 0 0 + src_r1 = _mm_hadd_epi16(src_r1, zero); //s2 s3 a2 a3 0 0 0 0 + + temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0 + temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0 + + temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0 + temp1 = 
_mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0 + + temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0 + temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0 + + temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0 + + temp0 = _mm_hadd_epi16(src_r0, zero); //s1+s4 a1+a4 0 0 0 0 0 0 + temp1 = _mm_hadd_epi16(src_r1, zero); //s2+s3 a2+a3 0 0 0 0 0 0 + + temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0 + + temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) + temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1) + + temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) + temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1) + + sad_b1 = _mm_sub_epi16(sad_b1, temp2); //lsi values Block0 + sad_b2 = _mm_sub_epi16(sad_b2, temp3); //lsi values Block1 + + temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff + + temp1 = _mm_cmpgt_epi16(threshold, sad_b2); + + temp0 = _mm_xor_si128(temp0, all_one); //Xor with 1 => NOT operation + temp1 = _mm_xor_si128(temp1, all_one); + + test1 = _mm_test_all_zeros(temp0, all_one); + test2 = _mm_test_all_zeros(temp1, all_one); + + if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1 + || pu2_thrsh[8] <= sad_2) + flag = 1; + } + + pu1_src += 8; + pu1_est += 8; + + src_r0 = _mm_loadl_epi64((__m128i *) pu1_src); //Row 0 - Block1 and 2 + src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2 + src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2 + src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2 + + src_r0 = 
_mm_cvtepu8_epi16(src_r0); + src_r1 = _mm_cvtepu8_epi16(src_r1); + src_r2 = _mm_cvtepu8_epi16(src_r2); + src_r3 = _mm_cvtepu8_epi16(src_r3); + + est_r0 = _mm_loadl_epi64((__m128i *) pu1_est); + est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd)); + est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd)); + est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd)); + + est_r0 = _mm_cvtepu8_epi16(est_r0); + est_r1 = _mm_cvtepu8_epi16(est_r1); + est_r2 = _mm_cvtepu8_epi16(est_r2); + est_r3 = _mm_cvtepu8_epi16(est_r3); + + src_r0 = _mm_sub_epi16(src_r0, est_r0); + src_r1 = _mm_sub_epi16(src_r1, est_r1); + src_r2 = _mm_sub_epi16(src_r2, est_r2); + src_r3 = _mm_sub_epi16(src_r3, est_r3); + + src_r0 = _mm_abs_epi16(src_r0); + src_r1 = _mm_abs_epi16(src_r1); + src_r2 = _mm_abs_epi16(src_r2); + src_r3 = _mm_abs_epi16(src_r3); + + src_r0 = _mm_add_epi16(src_r0, src_r3); //s1 s4 s4 s1 a1 a4 a4 a1 + src_r1 = _mm_add_epi16(src_r1, src_r2); //s2 s3 s3 s2 a2 a3 a3 a2 + + //SAD calculation + temp0 = _mm_add_epi16(src_r0, src_r1); + temp0 = _mm_hadd_epi16(temp0, zero); + temp0 = _mm_hadd_epi16(temp0, zero); //sad1, sad2 - 16bit values + + sad_1 = _mm_extract_epi16(temp0, 0); + sad_2 = _mm_extract_epi16(temp0, 1); + + (*pi4_mb_distortion) += sad_1 + sad_2; + + if (flag == 0) { + sad_b1 = _mm_set1_epi16((sad_1 << 1)); + sad_b2 = _mm_set1_epi16((sad_2 << 1)); + + src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1 + src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4 + + src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2 + src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3 + + src_r0 = _mm_hadd_epi16(src_r0, zero); //s1 s4 a1 a4 0 0 0 0 + src_r1 = _mm_hadd_epi16(src_r1, zero); //s2 s3 a2 a3 0 0 0 0 + + temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0 + temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0 + + temp0 = 
_mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0 + temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0 + + temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0 + temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0 + + temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0 + + temp0 = _mm_hadd_epi16(src_r0, zero); //s1+s4 a1+a4 0 0 0 0 0 0 + temp1 = _mm_hadd_epi16(src_r1, zero); //s2+s3 a2+a3 0 0 0 0 0 0 + + temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0 + + temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) + temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1) + + temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) + temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1) + + sad_b1 = _mm_sub_epi16(sad_b1, temp2); //lsi values Block0 + sad_b2 = _mm_sub_epi16(sad_b2, temp3); //lsi values Block1 + + temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff + + temp1 = _mm_cmpgt_epi16(threshold, sad_b2); + + temp0 = _mm_xor_si128(temp0, all_one); //Xor with 1 => NOT operation + temp1 = _mm_xor_si128(temp1, all_one); + + test1 = _mm_test_all_zeros(temp0, all_one); + test2 = _mm_test_all_zeros(temp1, all_one); + + if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1 + || pu2_thrsh[8] <= sad_2) + flag = 1; + } + + pu1_src += 4*src_strd - 8; + pu1_est += 4*est_strd - 8; + } + + *pu4_is_zero = flag; +} diff --git a/encoder/x86/ime_platform_macros.h b/encoder/x86/ime_platform_macros.h new file mode 100755 index 0000000..18e2e8f --- /dev/null +++ b/encoder/x86/ime_platform_macros.h @@ -0,0 +1,52 @@ 
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ime_platform_macros.h
*
* @brief
*  Platform specific Macro definitions used in the codec
*
* @author
*  Ittiam
*
* @remarks
*  None
*
*******************************************************************************
*/


#ifndef _IME_PLATFORM_MACROS_H_
#define _IME_PLATFORM_MACROS_H_

/*****************************************************************************/
/* Function macro definitions                                                */
/*****************************************************************************/

/**
 * Adds to 'sad' the sum of absolute differences of elements [0..3] of 'src'
 * and 'est'.
 *
 * src, est : expressions that can be subscripted (pointers or arrays); each
 *            is evaluated four times, so they must be free of side effects
 * sad      : modifiable lvalue that accumulates the result
 *
 * ABS is not defined in this header and must be provided by the including
 * translation unit.
 *
 * Every argument and the expansion as a whole are parenthesized so that
 * expression arguments such as 'p + 1' index the intended element and the
 * macro can be used safely inside a larger expression.
 */
#define USADA8(src,est,sad) \
    ((sad) += ABS((src)[0]-(est)[0]) + \
              ABS((src)[1]-(est)[1]) + \
              ABS((src)[2]-(est)[2]) + \
              ABS((src)[3]-(est)[3]))


#endif /* _IME_PLATFORM_MACROS_H_ */