diff options
Diffstat (limited to 'common')
38 files changed, 13706 insertions, 0 deletions
diff --git a/common/arm/impeg2_format_conv.s b/common/arm/impeg2_format_conv.s
new file mode 100644
index 0000000..c07edda
--- /dev/null
+++ b/common/arm/impeg2_format_conv.s
@@ -0,0 +1,391 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+
+@/*
+@//----------------------------------------------------------------------------
+@// File Name : impeg2_format_conv.s
+@//
+@// Description : This file has the YUV420P to YUV420SP format-conversion
+@// implementations for the MPEG2 SP decoder on neon platform.
+@// +@// Reference Document : +@// +@// Revision History : +@// Date Author Detail Description +@// ------------ ---------------- ---------------------------------- +@// Jul 07, 2008 Naveen Kumar T Created +@// +@//------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Include Files +@// ---------------------------------------------------------------------------- +@*/ +.text +.p2align 2 +.equ log2_16 , 4 +.equ log2_2 , 1 +@/* +@// ---------------------------------------------------------------------------- +@// Struct/Union Types and Define +@// ---------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Static Global Data section variables +@// ---------------------------------------------------------------------------- +@*/ +@//--------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Static Prototype Functions +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Exported functions +@// ---------------------------------------------------------------------------- +@*/ + +@/***************************************************************************** +@* * +@* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q() * +@* * +@* Description : This function conversts the image from YUV420P color * +@* space to 420SP color space(UV interleaved). 
* +@* * +@* Arguments : R0 pu1_y * +@* R1 pu1_u * +@* R2 pu1_v * +@* R3 pu1_dest_y * +@* [R13 #40] pu1_dest_uv * +@* [R13 #44] u2_height * +@* [R13 #48] u2_width * +@* [R13 #52] u2_stridey * +@* [R13 #56] u2_strideu * +@* [R13 #60] u2_stridev * +@* [R13 #64] u2_dest_stride_y * +@* [R13 #68] u2_dest_stride_uv * +@* [R13 #72] convert_uv_only * +@* * +@* Values Returned : None * +@* * +@* Register Usage : R0 - R8, Q0 * +@* * +@* Stack Usage : 24 Bytes * +@* * +@* Interruptibility : Interruptible * +@* * +@* Known Limitations * +@* Assumptions: Image Width: Assumed to be multiple of 16 and * +@* greater than or equal to 16 * +@* Image Height: Assumed to be even. * +@* * +@* Revision History : * +@* DD MM YYYY Author(s) Changes (Describe the changes made) * +@* 07 06 2010 Varshita Draft * +@* 07 06 2010 Naveen Kr T Completed * +@* * +@*****************************************************************************/ + .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q +impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q: + + @// push the registers on the stack + stmfd sp!, {r4-r8, lr} + + ldr r4, [sp, #56] @// Load convert_uv_only + + cmp r4, #1 + beq yuv420sp_uv_chroma + @/* Do the preprocessing before the main loops start */ + @// Load the parameters from stack + ldr r4, [sp, #28] @// Load u2_height from stack + + ldr r5, [sp, #32] @// Load u2_width from stack + + ldr r7, [sp, #36] @// Load u2_stridey from stack + + ldr r8, [sp, #48] @// Load u2_dest_stride_y from stack + + sub r7, r7, r5 @// Source increment + + sub r8, r8, r5 @// Destination increment + + +yuv420sp_uv_row_loop_y: + mov r6, r5 + +yuv420sp_uv_col_loop_y: + pld [r0, #128] + vld1.8 {q0}, [r0]! + vst1.8 {q0}, [r3]! + sub r6, r6, #16 + cmp r6, #15 + bgt yuv420sp_uv_col_loop_y + + cmp r6, #0 + beq yuv420sp_uv_row_loop_end_y + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. 
And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #16 + sub r0, r0, r6 + sub r3, r3, r6 + + vld1.8 {q0}, [r0]! + vst1.8 {q0}, [r3]! + +yuv420sp_uv_row_loop_end_y: + add r0, r0, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_uv_row_loop_y + +yuv420sp_uv_chroma: + + ldr r3, [sp, #24] @// Load pu1_dest_uv from stack + + ldr r4, [sp, #28] @// Load u2_height from stack + + ldr r5, [sp, #32] @// Load u2_width from stack + + + ldr r7, [sp, #40] @// Load u2_strideu from stack + + ldr r8, [sp, #52] @// Load u2_dest_stride_uv from stack + + sub r7, r7, r5, lsr #1 @// Source increment + + sub r8, r8, r5 @// Destination increment + + mov r5, r5, lsr #1 + mov r4, r4, lsr #1 + ldr r3, [sp, #24] @// Load pu1_dest_uv from stack +yuv420sp_uv_row_loop_uv: + mov r6, r5 + + +yuv420sp_uv_col_loop_uv: + pld [r1, #128] + pld [r2, #128] + vld1.8 d0, [r1]! + vld1.8 d1, [r2]! + vst2.8 {d0, d1}, [r3]! + sub r6, r6, #8 + cmp r6, #7 + bgt yuv420sp_uv_col_loop_uv + + cmp r6, #0 + beq yuv420sp_uv_row_loop_end_uv + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #8 + sub r1, r1, r6 + sub r2, r2, r6 + sub r3, r3, r6, lsl #1 + + vld1.8 d0, [r1]! + vld1.8 d1, [r2]! + vst2.8 {d0, d1}, [r3]! + +yuv420sp_uv_row_loop_end_uv: + add r1, r1, r7 + add r2, r2, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_uv_row_loop_uv + @//POP THE REGISTERS + ldmfd sp!, {r4-r8, pc} + + + + + +@/***************************************************************************** +@* * +@* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q() * +@* * +@* Description : This function conversts the image from YUV420P color * +@* space to 420SP color space(VU interleaved). 
* +@* This function is similar to above function * +@* IMP4D_CXA8_YUV420toYUV420SP_VU with a difference in * +@* VLD1.8 for chroma - order of registers is different * +@* * +@* Arguments : R0 pu1_y * +@* R1 pu1_u * +@* R2 pu1_v * +@* R3 pu1_dest_y * +@* [R13 #40] pu1_dest_uv * +@* [R13 #44] u2_height * +@* [R13 #48] u2_width * +@* [R13 #52] u2_stridey * +@* [R13 #56] u2_strideu * +@* [R13 #60] u2_stridev * +@* [R13 #64] u2_dest_stride_y * +@* [R13 #68] u2_dest_stride_uv * +@* [R13 #72] convert_uv_only * +@* * +@* Values Returned : None * +@* * +@* Register Usage : R0 - R8, Q0 * +@* * +@* Stack Usage : 24 Bytes * +@* * +@* Interruptibility : Interruptible * +@* * +@* Known Limitations * +@* Assumptions: Image Width: Assumed to be multiple of 16 and * +@* greater than or equal to 16 * +@* Image Height: Assumed to be even. * +@* * +@* Revision History : * +@* DD MM YYYY Author(s) Changes (Describe the changes made) * +@* 07 06 2010 Varshita Draft * +@* 07 06 2010 Naveen Kr T Completed * +@* * +@*****************************************************************************/ + + .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q +impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q: + + @// push the registers on the stack + stmfd sp!, {r4-r8, lr} + + ldr r4, [sp, #56] @// Load convert_uv_only + + cmp r4, #1 + beq yuv420sp_vu_chroma + + @/* Do the preprocessing before the main loops start */ + @// Load the parameters from stack + ldr r4, [sp, #28] @// Load u2_height from stack + + ldr r5, [sp, #32] @// Load u2_width from stack + + ldr r7, [sp, #36] @// Load u2_stridey from stack + + ldr r8, [sp, #48] @// Load u2_dest_stride_y from stack + + sub r7, r7, r5 @// Source increment + + sub r8, r8, r5 @// Destination increment + + +yuv420sp_vu_row_loop_y: + mov r6, r5 + +yuv420sp_vu_col_loop_y: + pld [r0, #128] + vld1.8 {q0}, [r0]! + vst1.8 {q0}, [r3]! 
+ sub r6, r6, #16 + cmp r6, #15 + bgt yuv420sp_vu_col_loop_y + + cmp r6, #0 + beq yuv420sp_vu_row_loop_end_y + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #16 + sub r0, r0, r6 + sub r3, r3, r6 + + vld1.8 {q0}, [r0]! + vst1.8 {q0}, [r3]! + +yuv420sp_vu_row_loop_end_y: + add r0, r0, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_vu_row_loop_y + +yuv420sp_vu_chroma: + + ldr r3, [sp, #24] @// Load pu1_dest_uv from stack + + ldr r4, [sp, #28] @// Load u2_height from stack + + ldr r5, [sp, #32] @// Load u2_width from stack + + + ldr r7, [sp, #40] @// Load u2_strideu from stack + + ldr r8, [sp, #52] @// Load u2_dest_stride_uv from stack + + sub r7, r7, r5, lsr #1 @// Source increment + + sub r8, r8, r5 @// Destination increment + + mov r5, r5, lsr #1 + mov r4, r4, lsr #1 + ldr r3, [sp, #24] @// Load pu1_dest_uv from stack +yuv420sp_vu_row_loop_uv: + mov r6, r5 + + +yuv420sp_vu_col_loop_uv: + pld [r1, #128] + pld [r2, #128] + vld1.8 d1, [r1]! + vld1.8 d0, [r2]! + vst2.8 {d0, d1}, [r3]! + sub r6, r6, #8 + cmp r6, #7 + bgt yuv420sp_vu_col_loop_uv + + cmp r6, #0 + beq yuv420sp_vu_row_loop_end_uv + @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + @//Ex if width is 162, above loop will process 160 pixels. And + @//Both source and destination will point to 146th pixel and then 16 bytes will be read + @// and written using VLD1 and VST1 + rsb r6, r6, #8 + sub r1, r1, r6 + sub r2, r2, r6 + sub r3, r3, r6, lsl #1 + + vld1.8 d1, [r1]! + vld1.8 d0, [r2]! + vst2.8 {d0, d1}, [r3]! 
+ +yuv420sp_vu_row_loop_end_uv: + add r1, r1, r7 + add r2, r2, r7 + add r3, r3, r8 + subs r4, r4, #1 + bgt yuv420sp_vu_row_loop_uv + @//POP THE REGISTERS + ldmfd sp!, {r4-r8, pc} + + + + + diff --git a/common/arm/impeg2_idct.s b/common/arm/impeg2_idct.s new file mode 100644 index 0000000..22225bf --- /dev/null +++ b/common/arm/impeg2_idct.s @@ -0,0 +1,1204 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ + +@/* +@//---------------------------------------------------------------------------- +@// File Name : impeg2_idct.s +@// +@// Description : This file has the Idct Implementations for the +@// MPEG2 SP decoder on neon platform. 
+@// +@// Reference Document : +@// +@// Revision History : +@// Date Author Detail Description +@// ------------ ---------------- ---------------------------------- +@// Feb 22, 2008 Naveen Kumar T Created +@// +@//------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Include Files +@// ---------------------------------------------------------------------------- +@*/ + +.text +.p2align 2 +.equ idct_stg1_shift , 12 +.equ idct_stg2_shift , 16 +.equ idct_stg1_round , (1 << (idct_stg1_shift - 1)) +.equ idct_stg2_round , (1 << (idct_stg2_shift - 1)) +@/* +@// ---------------------------------------------------------------------------- +@// Struct/Union Types and Define +@// ---------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Static Global Data section variables +@// ---------------------------------------------------------------------------- +@*/ +@//--------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Static Prototype Functions +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Exported functions +@// ---------------------------------------------------------------------------- +@*/ + + .extern gai2_impeg2_idct_q15 +.hidden gai2_impeg2_idct_q15 + .extern gai2_impeg2_idct_q11 +.hidden gai2_impeg2_idct_q11 + .extern gai2_impeg2_idct_first_col_q15 +.hidden gai2_impeg2_idct_first_col_q15 + .extern gai2_impeg2_idct_first_col_q11 +.hidden gai2_impeg2_idct_first_col_q11 + .extern 
gai2_impeg2_mismatch_stg2_additive +.hidden gai2_impeg2_mismatch_stg2_additive + +gai2_impeg2_idct_q15_addr1: + .long gai2_impeg2_idct_q15 - q15lbl1 - 8 +gai2_impeg2_idct_q15_addr2: + .long gai2_impeg2_idct_q15 - q15lbl2 - 8 +gai2_impeg2_idct_q11_addr1: + .long gai2_impeg2_idct_q11 - q11lbl1 - 8 +gai2_impeg2_idct_q11_addr2: + .long gai2_impeg2_idct_q11 - q11lbl2 - 8 +gai2_impeg2_idct_first_col_q15_addr1: + .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl1 - 8 +gai2_impeg2_idct_first_col_q15_addr2: + .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl2 - 8 +gai2_impeg2_idct_first_col_q15_addr3: + .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl3 - 8 +gai2_impeg2_mismatch_stg2_additive_addr: + .long gai2_impeg2_mismatch_stg2_additive - additive_lbl - 8 +gai2_impeg2_idct_first_col_q11_addr1: + .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl1 - 8 +gai2_impeg2_idct_first_col_q11_addr2: + .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl2 - 8 + + .global impeg2_idct_recon_dc_a9q +impeg2_idct_recon_dc_a9q: + stmfd sp!, {r4, r6, r12, lr} + @//r0: pi2_src + @//r1: pi2_tmp - not used, used as pred_strd + @//r2: pu1_pred + @//r3: pu1_dst + @//r4: used as scratch + @//r5: + + ldr r1, [sp, #20] @//pred_strd + ldr r6, [sp, #24] @//dst_strd + + ldr r14, gai2_impeg2_idct_q15_addr1 +q15lbl1: + add r14, r14, pc + ldrsh r12, [r14] + ldrsh r4, [r0] + + vld1.8 d0, [r2], r1 + mul r4, r4, r12 + + vld1.8 d1, [r2], r1 + add r4, #idct_stg1_round + + vld1.8 d2, [r2], r1 + asr r4, r4, #idct_stg1_shift + + ldr r14, gai2_impeg2_idct_q11_addr1 +q11lbl1: + add r14, r14, pc + ldrsh r12, [r14] + + vld1.8 d3, [r2], r1 + mul r4, r4, r12 + + vld1.8 d4, [r2], r1 + add r4, #idct_stg2_round + + vld1.8 d5, [r2], r1 + asr r4, r4, #idct_stg2_shift + + vld1.8 d6, [r2], r1 + vdup.s16 q15, r4 + + + vld1.8 d7, [r2], r1 + + vaddw.u8 q4, q15, d0 + + vaddw.u8 q5, q15, d1 + vqmovun.s16 d0, q4 + + vaddw.u8 q6, q15, d2 + vqmovun.s16 d1, q5 + vst1.8 d0, [r3], r6 + + vaddw.u8 q7, q15, d3 + vqmovun.s16 d2, q6 + vst1.8 d1, 
[r3], r6 + + vaddw.u8 q8, q15, d4 + vqmovun.s16 d3, q7 + vst1.8 d2, [r3], r6 + + vaddw.u8 q9, q15, d5 + vqmovun.s16 d4, q8 + vst1.8 d3, [r3], r6 + + vaddw.u8 q10, q15, d6 + vqmovun.s16 d5, q9 + vst1.8 d4, [r3], r6 + + vaddw.u8 q11, q15, d7 + vqmovun.s16 d6, q10 + vst1.8 d5, [r3], r6 + + vqmovun.s16 d7, q11 + vst1.8 d6, [r3], r6 + + + vst1.8 d7, [r3], r6 + + ldmfd sp!, {r4, r6, r12, pc} + + + + + .global impeg2_idct_recon_dc_mismatch_a9q +impeg2_idct_recon_dc_mismatch_a9q: + stmfd sp!, {r4-r12, lr} + + ldr r1, [sp, #44] @//pred_strd + ldr r6, [sp, #48] @//dst_strd + + ldr r14, gai2_impeg2_idct_q15_addr2 +q15lbl2: + add r14, r14, pc + ldrsh r12, [r14] + ldrsh r4, [r0] + + mul r4, r4, r12 + add r4, #idct_stg1_round + asr r4, r4, #idct_stg1_shift + + ldr r14, gai2_impeg2_idct_q11_addr2 +q11lbl2: + add r14, r14, pc + ldrsh r12, [r14] + mul r4, r4, r12 + vdup.s32 q0, r4 + + mov r14, #16 @//Increment for table read + ldr r4, gai2_impeg2_mismatch_stg2_additive_addr +additive_lbl: + add r4, r4, pc + + vld1.16 {q1}, [r4], r14 + + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, 
q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + vld1.16 {q1}, [r4], r14 + vld1.8 d30, [r2], r1 + vmovl.s16 q4, d2 + vmovl.s16 q5, d3 + vraddhn.s32 d12, q0, q4 + vraddhn.s32 d13, q0, q5 + vaddw.u8 q7, q6, d30 + vqmovun.s16 d30, q7 + vst1.8 d30, [r3], r6 + + + ldmfd sp!, {r4-r12, pc} + + + + +@/** +@ ******************************************************************************* +@ * +@ * ;brief +@ * This function performs Inverse transform and reconstruction for 8x8 +@ * input block +@ * +@ * ;par Description: +@ * Performs inverse transform and adds the prediction data and clips output +@ * to 8 bit +@ * +@ * ;param[in] pi2_src +@ * Input 8x8 coefficients +@ * +@ * ;param[in] pi2_tmp +@ * Temporary 8x8 buffer for storing inverse +@ * +@ * transform +@ * 1st stage output +@ * +@ * ;param[in] pu1_pred +@ * Prediction 8x8 block +@ * +@ * ;param[out] pu1_dst +@ * Output 8x8 block +@ * +@ * ;param[in] src_strd +@ * Input stride +@ * +@ * ;param[in] pred_strd +@ * Prediction stride +@ * +@ * ;param[in] dst_strd +@ * Output Stride +@ * +@ * ;param[in] shift +@ * Output shift +@ * +@ * ;param[in] zero_cols +@ * Zero columns in pi2_src +@ * +@ * ;returns Void +@ * +@ * ;remarks +@ * None +@ * +@ ******************************************************************************* +@ */ + +@void impeg2_itrans_recon_8x8(WORD16 *pi2_src, +@ WORD16 *pi2_tmp, +@ UWORD8 *pu1_pred, +@ UWORD8 *pu1_dst, +@ WORD32 src_strd, +@ WORD32 pred_strd, +@ WORD32 dst_strd, +@ WORD32 zero_cols +@ WORD32 zero_rows ) + +@**************Variables Vs Registers************************* 
+@ r0 => *pi2_src +@ r1 => *pi2_tmp +@ r2 => *pu1_pred +@ r3 => *pu1_dst +@ src_strd +@ pred_strd +@ dst_strd +@ zero_cols + + + + .global impeg2_idct_recon_a9q +impeg2_idct_recon_a9q: +@//Register Usage Reference - loading and Until IDCT of columns +@// Cosine Constants - D0 +@// Sine Constants - D1 +@// Row 0 First Half - D2 - y0 +@// Row 1 First Half - D6 - y1 +@// Row 2 First Half - D3 - y2 +@// Row 3 First Half - D7 - y3 +@// Row 4 First Half - D10 - y4 +@// Row 5 First Half - D14 - y5 +@// Row 6 First Half - D11 - y6 +@// Row 7 First Half - D15 - y7 + +@// Row 0 Second Half - D4 - y0 +@// Row 1 Second Half - D8 - y1 +@// Row 2 Second Half - D5 - y2 +@// Row 3 Second Half - D9 - y3 +@// Row 4 Second Half - D12 - y4 +@// Row 5 Second Half - D16 - y5 +@// Row 6 Second Half - D13 - y6 +@// Row 7 Second Half - D17 - y7 + + @// Copy the input pointer to another register + @// Step 1 : load all constants + stmfd sp!, {r4-r12, lr} + add sp, sp, #40 + ldr r8, [sp, #4] @ prediction stride + ldr r7, [sp, #8] @ destination stride + ldr r6, [sp] @ src stride + ldr r12, [sp, #12] + ldr r11, [sp, #16] + mov r6, r6, lsl #1 @ x sizeof(word16) + add r9, r0, r6, lsl #1 @ 2 rows + + add r10, r6, r6, lsl #1 @ 3 rows + + sub r10, r10, #8 @ - 4 cols * sizeof(WORD16) + sub r5, r6, #8 @ src_strd - 4 cols * sizeof(WORD16) + + + ldr r14, gai2_impeg2_idct_first_col_q15_addr1 +fcq15_lbl1: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + @//Step 2 Load all the input data + @//Step 3 Operate first 4 colums at a time + + and r11, r11, #0xff + and r12, r12, #0xff + + cmp r11, #0xf0 + bge skip_last4_rows + + + vld1.16 d2, [r0]! + vld1.16 d3, [r9]! + vld1.16 d4, [r0], r5 + vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) + vld1.16 d5, [r9], r5 + vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) + vld1.16 d6, [r0]! + vld1.16 d7, [r9]! 
+ vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) + vld1.16 d8, [r0], r10 + vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) + vld1.16 d9, [r9], r10 + vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) + vld1.16 d10, [r0]! + vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) + vld1.16 d11, [r9]! + vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vld1.16 d12, [r0], r5 + vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vld1.16 d13, [r9], r5 + vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vld1.16 d14, [r0]! + vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + vld1.16 d15, [r9]! + vmull.s16 q11, d10, d0[0] @// y4 * cos4(part of c0 and c1) + vld1.16 d16, [r0], r10 + vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) + vld1.16 d17, [r9], r10 + + @/* This following was activated when alignment is not there */ +@// VLD1.16 D2,[r0]! +@// VLD1.16 D3,[r2]! +@// VLD1.16 D4,[r0]! +@// VLD1.16 D5,[r2]! +@// VLD1.16 D6,[r0]! +@// VLD1.16 D7,[r2]! +@// VLD1.16 D8,[r0],r3 +@// VLD1.16 D9,[r2],r3 +@// VLD1.16 D10,[r0]! +@// VLD1.16 D11,[r2]! +@// VLD1.16 D12,[r0]! +@// VLD1.16 D13,[r2]! +@// VLD1.16 D14,[r0]! +@// VLD1.16 D15,[r2]! 
+@// VLD1.16 D16,[r0],r3 +@// VLD1.16 D17,[r2],r3 + + + + + vmlal.s16 q12, d14, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + vmlsl.s16 q13, d14, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + vmlal.s16 q14, d14, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + vmlal.s16 q15, d14, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + vmlsl.s16 q9, d11, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + vmlal.s16 q3, d11, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + vadd.s32 q5, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + vmlal.s16 q12, d15, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) + vmlsl.s16 q13, d15, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) + vmlal.s16 q14, d15, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) + vmlsl.s16 q15, d15, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) + + vadd.s32 q7, q5, q3 @// a0 = c0 + d0(part of r0,r7) + vsub.s32 q5, q5, q3 @// a3 = c0 - d0(part of r3,r4) + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + vadd.s32 q10, q7, q12 @// a0 + b0(part of r0) + vsub.s32 q3, q7, q12 @// a0 - b0(part of r7) + + vadd.s32 q12, q11, q14 @// a2 + b2(part of r2) + vsub.s32 q11, q11, q14 @// a2 - b2(part of r5) + + vadd.s32 q14, q9, q13 @// a1 + b1(part of r1) + vsub.s32 q9, q9, q13 @// a1 - b1(part of r6) + + vadd.s32 q13, q5, q15 @// a3 + b3(part of r3) + vsub.s32 q15, q5, q15 @// a3 - b3(part of r4) + + vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = 
(a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + + + b last4_cols + + + +skip_last4_rows: + + + ldr r14, gai2_impeg2_idct_first_col_q15_addr2 +fcq15_lbl2: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + vld1.16 d2, [r0]! + vld1.16 d3, [r9]! + vld1.16 d4, [r0], r5 + vld1.16 d5, [r9], r5 + vld1.16 d6, [r0]! + vld1.16 d7, [r9]! + vld1.16 d8, [r0], r10 + vld1.16 d9, [r9], r10 + + + + vmov.s16 q6, #0 + vmov.s16 q8, #0 + + + + + vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) + vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + + vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) + vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) + + vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) + + + vadd.s32 q7, q10, q3 @// a0 = c0 + d0(part of r0,r7) + vsub.s32 q5, q10, q3 @// a3 = c0 - d0(part of r3,r4) + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + vadd.s32 q10, q7, q12 @// a0 + b0(part of r0) + vsub.s32 q3, q7, q12 @// a0 - b0(part of r7) + + vadd.s32 q12, q11, q14 @// a2 + b2(part of r2) + vsub.s32 q11, q11, q14 @// a2 - b2(part of r5) + + vadd.s32 q14, q9, q13 @// a1 + b1(part of r1) + vsub.s32 q9, q9, q13 @// a1 - b1(part of 
r6) + + vadd.s32 q13, q5, q15 @// a3 + b3(part of r3) + vsub.s32 q15, q5, q15 @// a3 - b3(part of r4) + + vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + + +last4_cols: + + + cmp r12, #0xf0 + bge skip_last4cols + + ldr r14, gai2_impeg2_idct_first_col_q15_addr3 +fcq15_lbl3: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + vmull.s16 q12, d8, d0[1] @// y1 * cos1(part of b0) + vmull.s16 q13, d8, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d8, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d8, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d9, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vmlsl.s16 q13, d9, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vmlsl.s16 q14, d9, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vmlsl.s16 q15, d9, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + + vmull.s16 q9, d5, d1[2] @// y2 * sin2 (Q4 is freed by this time)(part of d1) + vmull.s16 q4, d5, d0[2] @// y2 * cos2(part of d0) + + vmull.s16 q10, d4, d0[0] @// y0 * cos4(part of c0 and c1) + vmull.s16 q11, d12, d0[0] @// y4 * cos4(part of c0 and c1) + + vmlal.s16 q12, d16, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + vmlsl.s16 q13, d16, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + vmlal.s16 q14, d16, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + vmlal.s16 
q15, d16, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + vmlsl.s16 q9, d13, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + vmlal.s16 q4, d13, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + vadd.s32 q6, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + vmlal.s16 q12, d17, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) + vmlsl.s16 q13, d17, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) + vmlal.s16 q14, d17, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) + vmlsl.s16 q15, d17, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4) + + vadd.s32 q8, q6, q4 @// a0 = c0 + d0(part of e0,e7) + vsub.s32 q6, q6, q4 @// a3 = c0 - d0(part of e3,e4) + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of e2,e5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of e1,e6) + + vadd.s32 q10, q8, q12 @// a0 + b0(part of e0) + vsub.s32 q4, q8, q12 @// a0 - b0(part of e7) + + vadd.s32 q12, q11, q14 @// a2 + b2(part of e2) + vsub.s32 q11, q11, q14 @// a2 - b2(part of e5) + + vadd.s32 q14, q9, q13 @// a1 + b1(part of e1) + vsub.s32 q9, q9, q13 @// a1 - b1(part of e6) + + vadd.s32 q13, q6, q15 @// a3 + b3(part of e3) + vsub.s32 q15, q6, q15 @// a3 - b3(part of r4) + + vqrshrn.s32 d4, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d17, q4, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d5, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d16, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d8, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d13, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + vqrshrn.s32 d9, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + 
vqrshrn.s32 d12, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + b end_skip_last4cols + + + +skip_last4cols: + + + + ldr r14, gai2_impeg2_idct_first_col_q11_addr1 +fcq11_lbl1: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + + + vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing + + vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing + + + vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued..... + vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued..... + + vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued..... + vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued..... + + + vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) + vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + + vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) +@ VMULL.S16 Q11,D4,D0[0] ;// y4 * cos4(part of c0 and c1) + + vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) + vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) + + + + + vsub.s32 q11, q10, q3 @// a3 = c0 - d0(part of r3,r4) + vadd.s32 q2, q10, q3 @// a0 = c0 + d0(part of r0,r7) + + + vadd.s32 q1, q2, q12 + + vsub.s32 q3, q2, q12 + + vadd.s32 q4, q11, q15 + + vsub.s32 q12, q11, q15 + + vqrshrn.s32 d5, q4, #idct_stg2_shift + vqrshrn.s32 d2, q1, #idct_stg2_shift + vqrshrn.s32 d9, q3, #idct_stg2_shift + vqrshrn.s32 d6, q12, #idct_stg2_shift + + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + + vadd.s32 q15, q11, q14 + + vsub.s32 
q12, q11, q14 + + vadd.s32 q14, q9, q13 + + vsub.s32 q11, q9, q13 + vqrshrn.s32 d4, q15, #idct_stg2_shift + vqrshrn.s32 d7, q12, #idct_stg2_shift + vqrshrn.s32 d3, q14, #idct_stg2_shift + vqrshrn.s32 d8, q11, #idct_stg2_shift + + + + + + + + + + + vmull.s16 q12, d14, d0[1] @// y1 * cos1(part of b0) + + vmull.s16 q13, d14, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d14, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d14, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d15, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vtrn.16 d2, d3 + vmlsl.s16 q13, d15, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vtrn.16 d4, d5 + vmlsl.s16 q14, d15, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vtrn.16 d6, d7 + vmlsl.s16 q15, d15, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + vtrn.16 d8, d9 + vmull.s16 q10, d10, d0[0] @// y0 * cos4(part of c0 and c1) + vtrn.32 d2, d4 + + vtrn.32 d3, d5 + vmull.s16 q9, d11, d1[2] @// y2 * sin2 (Q7 is freed by this time)(part of d1) + vtrn.32 d6, d8 + vmull.s16 q7, d11, d0[2] @// y2 * cos2(part of d0) + vtrn.32 d7, d9 + + + add r4, r2, r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data + + + add r5, r8, r8, lsl #1 @ + + + add r0, r3, r7, lsl #1 @ r0 points to 3rd row of dest data + + + add r10, r7, r7, lsl #1 @ + + + vswp d3, d6 + + + vswp d5, d8 + + + vsub.s32 q11, q10, q7 @// a3 = c0 - d0(part of r3,r4) + vadd.s32 q6, q10, q7 @// a0 = c0 + d0(part of r0,r7) + + + vadd.s32 q0, q6, q12 + + + vsub.s32 q12, q6, q12 + + + vadd.s32 q6, q11, q15 + + + vsub.s32 q7, q11, q15 + + vqrshrn.s32 d10, q0, #idct_stg2_shift + vqrshrn.s32 d17, q12, #idct_stg2_shift + vqrshrn.s32 d13, q6, #idct_stg2_shift + vqrshrn.s32 d14, q7, #idct_stg2_shift + + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + + vadd.s32 q0, q11, q14 + + + vsub.s32 q12, q11, q14 + + + vadd.s32 q14, q9, q13 + + + vsub.s32 q13, q9, q13 + vld1.8 d18, [r2], r8 + + vqrshrn.s32 d12, q0, #idct_stg2_shift + 
vld1.8 d20, [r2], r5 + + + vqrshrn.s32 d15, q12, #idct_stg2_shift + vld1.8 d19, [r2], r8 + + + + + vqrshrn.s32 d11, q14, #idct_stg2_shift + vld1.8 d22, [r4], r8 + + + + + vqrshrn.s32 d16, q13, #idct_stg2_shift + vld1.8 d21, [r2], r5 + + + b pred_buff_addition +end_skip_last4cols: + + ldr r14, gai2_impeg2_idct_first_col_q11_addr2 +fcq11_lbl2: + add r14, r14, pc + vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data + + +@/* Now the Idct of columns is done, transpose so that row idct done efficiently(step5) */ + vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing + vtrn.16 q2, q4 @//[r3,r1],[r2,r0] second qudrant transposing + vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing + vtrn.16 q6, q8 @//[r7,r5],[r6,r4] fourth qudrant transposing + + vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued..... + vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued..... + vtrn.32 d4, d5 @//r0,r1,r2,r3 second qudrant transposing continued..... + vtrn.32 d8, d9 @//r0,r1,r2,r3 second qudrant transposing continued..... + vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued..... + vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued..... + vtrn.32 d12, d13 @//r4,r5,r6,r7 fourth qudrant transposing continued..... + vtrn.32 d16, d17 @//r4,r5,r6,r7 fourth qudrant transposing continued..... 
+ + @//step6 Operate on first four rows and find their idct + @//Register Usage Reference - storing and IDCT of rows +@// Cosine Constants - D0 +@// Sine Constants - D1 +@// Element 0 First four - D2 - y0 +@// Element 1 First four - D6 - y1 +@// Element 2 First four - D3 - y2 +@// Element 3 First four - D7 - y3 +@// Element 4 First four - D4 - y4 +@// Element 5 First four - D8 - y5 +@// Element 6 First four - D5 - y6 +@// Element 7 First four - D9 - y7 +@// Element 0 Second four - D10 - y0 +@// Element 1 Second four - D14 - y1 +@// Element 2 Second four - D11 - y2 +@// Element 3 Second four - D15 - y3 +@// Element 4 Second four - D12 - y4 +@// Element 5 Second four - D16 - y5 +@// Element 6 Second four - D13 - y6 +@// Element 7 Second four - D17 - y7 + + @// Map between first kernel code seq and current +@// D2 -> D2 +@// D6 -> D6 +@// D3 -> D3 +@// D7 -> D7 +@// D10 -> D4 +@// D14 -> D8 +@// D11 -> D5 +@// D15 -> D9 +@// Q3 -> Q3 +@// Q5 -> Q2 +@// Q7 -> Q4 + + vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0) + vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3) + + vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1) + vmull.s16 q11, d4, d0[0] @// y4 * cos4(part of c0 and c1) + + vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1) + vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0) + + + vmlal.s16 q12, d8, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + vmlsl.s16 q13, d8, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + vmlal.s16 q14, d8, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + vmlal.s16 q15, d8, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * 
cos3(part of b3) + + vmlsl.s16 q9, d5, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + vmlal.s16 q3, d5, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + vadd.s32 q1, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + vmlal.s16 q12, d9, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) + vmlsl.s16 q13, d9, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) + vmlal.s16 q14, d9, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) + vmlsl.s16 q15, d9, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) + + vsub.s32 q11, q1, q3 @// a3 = c0 - d0(part of r3,r4) + vadd.s32 q2, q1, q3 @// a0 = c0 + d0(part of r0,r7) + + + vadd.s32 q1, q2, q12 + + vsub.s32 q3, q2, q12 + + vadd.s32 q4, q11, q15 + + vsub.s32 q12, q11, q15 + + vqrshrn.s32 d5, q4, #idct_stg2_shift + vqrshrn.s32 d2, q1, #idct_stg2_shift + vqrshrn.s32 d9, q3, #idct_stg2_shift + vqrshrn.s32 d6, q12, #idct_stg2_shift + + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + + vadd.s32 q15, q11, q14 + + vsub.s32 q12, q11, q14 + + vadd.s32 q14, q9, q13 + + vsub.s32 q11, q9, q13 + vqrshrn.s32 d4, q15, #idct_stg2_shift + vqrshrn.s32 d7, q12, #idct_stg2_shift + vqrshrn.s32 d3, q14, #idct_stg2_shift + vqrshrn.s32 d8, q11, #idct_stg2_shift + + + + + + + + + + + vmull.s16 q12, d14, d0[1] @// y1 * cos1(part of b0) + + vmull.s16 q13, d14, d0[3] @// y1 * cos3(part of b1) + vmull.s16 q14, d14, d1[1] @// y1 * sin3(part of b2) + vmull.s16 q15, d14, d1[3] @// y1 * sin1(part of b3) + + vmlal.s16 q12, d15, d0[3] @// y1 * cos1 + y3 * cos3(part of b0) + vtrn.16 d2, d3 + vmlsl.s16 q13, d15, d1[3] @// y1 * cos3 - y3 * sin1(part of b1) + vtrn.16 d4, d5 + vmlsl.s16 q14, d15, d0[1] @// y1 * sin3 - y3 * cos1(part of b2) + vtrn.16 d6, d7 + vmlsl.s16 q15, d15, d1[1] @// y1 * sin1 
- y3 * sin3(part of b3) + vtrn.16 d8, d9 + vmull.s16 q10, d10, d0[0] @// y0 * cos4(part of c0 and c1) + vtrn.32 d2, d4 + vmull.s16 q11, d12, d0[0] @// y4 * cos4(part of c0 and c1) + vtrn.32 d3, d5 + vmull.s16 q9, d11, d1[2] @// y2 * sin2 (Q7 is freed by this time)(part of d1) + vtrn.32 d6, d8 + vmull.s16 q7, d11, d0[2] @// y2 * cos2(part of d0) + vtrn.32 d7, d9 + vmlal.s16 q12, d16, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + + add r4, r2, r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data + vmlsl.s16 q13, d16, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + + add r5, r8, r8, lsl #1 @ + vmlal.s16 q14, d16, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + + add r0, r3, r7, lsl #1 @ r0 points to 3rd row of dest data + vmlal.s16 q15, d16, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + add r10, r7, r7, lsl #1 @ + vmlsl.s16 q9, d13, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + + + vmlal.s16 q7, d13, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + vadd.s32 q6, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + vmlal.s16 q12, d17, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7) + vswp d3, d6 + vmlsl.s16 q13, d17, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6) + + vswp d5, d8 + vmlal.s16 q14, d17, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5) + vmlsl.s16 q15, d17, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4) + + vsub.s32 q11, q6, q7 @// a3 = c0 - d0(part of r3,r4) + vadd.s32 q6, q6, q7 @// a0 = c0 + d0(part of r0,r7) + + + vadd.s32 q0, q6, q12 + + + vsub.s32 q12, q6, q12 + + + vadd.s32 q6, q11, q15 + + + vsub.s32 q7, q11, q15 + + vqrshrn.s32 d10, q0, #idct_stg2_shift + vqrshrn.s32 d17, q12, #idct_stg2_shift + vqrshrn.s32 d13, q6, #idct_stg2_shift + vqrshrn.s32 d14, q7, 
#idct_stg2_shift + + vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5) + vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6) + + + vadd.s32 q0, q11, q14 + + + vsub.s32 q12, q11, q14 + + + vadd.s32 q14, q9, q13 + + + vsub.s32 q13, q9, q13 + vld1.8 d18, [r2], r8 + + vqrshrn.s32 d12, q0, #idct_stg2_shift + vld1.8 d20, [r2], r5 + + + vqrshrn.s32 d15, q12, #idct_stg2_shift + vld1.8 d19, [r2], r8 + + + + + vqrshrn.s32 d11, q14, #idct_stg2_shift + vld1.8 d22, [r4], r8 + + + + + vqrshrn.s32 d16, q13, #idct_stg2_shift + vld1.8 d21, [r2], r5 + + + + +pred_buff_addition: + + + vtrn.16 d10, d11 + vld1.8 d24, [r4], r5 + + vtrn.16 d12, d13 + vld1.8 d23, [r4], r8 + + vaddw.u8 q1, q1, d18 + vld1.8 d25, [r4], r5 + + vtrn.16 d14, d15 + vaddw.u8 q2, q2, d22 + + vtrn.16 d16, d17 + vaddw.u8 q3, q3, d20 + + vtrn.32 d10, d12 + vaddw.u8 q4, q4, d24 + + vtrn.32 d11, d13 + vtrn.32 d14, d16 + vtrn.32 d15, d17 + + vswp d11, d14 + vswp d13, d16 + +@ Row values stored in the q register. + +@Q1 :r0 +@Q3: r1 +@Q2: r2 +@Q4: r3 +@Q5: r4 +@Q7: r5 +@Q6: r6 +@Q8: r7 + + + +@/// Adding the prediction buffer + + + + + + + + + + @ Load prediction data + + + + + + @Adding recon with prediction + + + + + + vaddw.u8 q5, q5, d19 + vqmovun.s16 d2, q1 + vaddw.u8 q7, q7, d21 + vqmovun.s16 d4, q2 + vaddw.u8 q6, q6, d23 + vqmovun.s16 d6, q3 + vaddw.u8 q8, q8, d25 + vqmovun.s16 d8, q4 + + + + + + + + vst1.8 {d2}, [r3], r7 + vqmovun.s16 d10, q5 + vst1.8 {d6}, [r3], r10 + vqmovun.s16 d14, q7 + vst1.8 {d4}, [r0], r7 + vqmovun.s16 d12, q6 + vst1.8 {d8}, [r0], r10 + vqmovun.s16 d16, q8 + + + + + + + + vst1.8 {d10}, [r3], r7 + vst1.8 {d14}, [r3], r10 + vst1.8 {d12}, [r0], r7 + vst1.8 {d16}, [r0], r10 + + + + + sub sp, sp, #40 + ldmfd sp!, {r4-r12, pc} + + + diff --git a/common/arm/impeg2_inter_pred.s b/common/arm/impeg2_inter_pred.s new file mode 100644 index 0000000..f1b3dde --- /dev/null +++ b/common/arm/impeg2_inter_pred.s @@ -0,0 +1,801 @@ 
+@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +@*/ + +@/* +@//---------------------------------------------------------------------------- +@// File Name : impeg2_inter_pred.s +@// +@// Description : This file has motion compensation related +@// interpolation functions on Neon + CortexA-8 platform +@// +@// Reference Document : +@// +@// Revision History : +@// Date Author Detail Description +@// ------------ ---------------- ---------------------------------- +@// 18 jun 2010 S Hamsalekha Created +@// +@//------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Include Files +@// ---------------------------------------------------------------------------- +@*/ +.text +.p2align 2 + + +@/* +@// ---------------------------------------------------------------------------- +@// Struct/Union Types and Define +@// ---------------------------------------------------------------------------- +@*/ + + +@/* +@// ---------------------------------------------------------------------------- +@// Static Global Data section variables +@// 
---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + + +@/* +@// ---------------------------------------------------------------------------- +@// Static Prototype Functions +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Exported functions +@// ---------------------------------------------------------------------------- +@*/ + +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_copy_mb_a9q() +@// +@// Detail Description : Copies one MB worth of data from src to the dst +@// +@// Inputs : r0 - pointer to src +@// r1 - pointer to dst +@// r2 - source width +@// r3 - destination width +@// Registers Used : r4, r5, d0, d1 +@// +@// Stack Usage : 12 bytes +@// +@// Outputs : +@// +@// Return Data : None +@// +@// Programming Note : <program limitation> +@//----------------------------------------------------------------------------- +@*/ + + + + .global impeg2_copy_mb_a9q + + +impeg2_copy_mb_a9q: + + stmfd r13!, {r4, r5, r14} + + + ldr r4, [r0] @src->y + ldr r5, [r1] @dst->y + @Read one row of data from the src + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + + @//Repeat 15 times for y + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and 
increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + vld1.8 {d0, d1}, [r4], r2 @Load and increment src + vst1.8 {d0, d1}, [r5], r3 @Store and increment dst + + mov r2, r2, lsr #1 @src_offset /= 2 + mov r3, r3, lsr #1 @dst_offset /= 2 + + ldr r4, [r0, #4] @src->u + ldr r5, [r1, #4] @dst->u + @Read one row of data from the src + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + + @//Repeat 7 times for u + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + 
vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + + ldr r4, [r0, #8] @src->v + ldr r5, [r1, #8] @dst->v + @Read one row of data from the src + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + + @//Repeat 7 times for v + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + vld1.8 {d0}, [r4], r2 @Load and increment src + vst1.8 {d0}, [r5], r3 @Store and increment dst + + ldmfd r13!, {r4, r5, pc} + + + + +@/* +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_mc_fullx_halfy_8x8_a9q() +@// +@// Detail Description : This function pastes the reference block in the +@// current frame buffer.This function is called for +@// blocks that are not coded and have motion vectors +@// with a half pel resolution. 
+@//
+@// Inputs : r0 - out : Current Block Pointer
+@// r1 - ref : Reference Block Pointer
+@// r2 - ref_wid : Reference Block Width
+@// r3 - out_wid : Current Block Width
+@//
+@// Registers Used : D0-D9
+@//
+@// Stack Usage : 4 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+ .global impeg2_mc_fullx_halfy_8x8_a9q
+
+impeg2_mc_fullx_halfy_8x8_a9q:
+
+ stmfd r13!, {r14} @// save return address (only callee-saved reg touched)
+ add r14, r1, r2 @// r14 = ref + ref_wid: r1/r14 walk even/odd source rows
+ mov r2, r2, lsl #1 @// both source pointers now step by 2 * ref_wid
+
+@/* Load 8 + 1 rows from reference block */
+@/* Do the addition without rounding off as rounding value is 1 */
+@/* Each output row i is vrhadd (rounding halving add) of input rows i and i+1 */
+ vld1.8 {d0}, [r1], r2 @// first row hence r1 = D0
+ vld1.8 {d2}, [r14], r2 @// second row hence r2 = D2
+ vld1.8 {d4}, [r1], r2 @// third row hence r3 = D4
+ vld1.8 {d6}, [r14], r2 @// fourth row hence r4 = D6
+ vld1.8 {d1}, [r1], r2 @// fifth row hence r5 = D1
+ vld1.8 {d3}, [r14], r2 @// sixth row hence r6 = D3
+ vrhadd.u8 d9, d1, d6 @// estimated row 4 = D9 (must precede the Q0 average below, which overwrites D1)
+ vld1.8 {d5}, [r1], r2 @// seventh row hence r7 = D5
+ vrhadd.u8 q0, q0, q1 @// estimated row 1 = D0, row 5 = D1
+ vld1.8 {d7}, [r14], r2 @// eighth row hence r8 = D7
+ vrhadd.u8 q1, q1, q2 @// estimated row 2 = D2, row 6 = D3
+ vld1.8 {d8}, [r1], r2 @// ninth row hence r9 = D8
+ vrhadd.u8 q2, q2, q3 @// estimated row 3 = D4, row 7 = D5
+
+ add r14, r0, r3 @// r14 = out + out_wid: r0/r14 walk even/odd destination rows
+ mov r3, r3, lsl #1 @// both destination pointers now step by 2 * out_wid
+
+@/* Store the eight rows calculated above */
+ vst1.8 {d2}, [r14], r3 @// second row hence D2
+ vrhadd.u8 d7, d7, d8 @// estimated row 8 = D7
+ vst1.8 {d0}, [r0], r3 @// first row hence D0
+ vst1.8 {d9}, [r14], r3 @// fourth row hence D9
+ vst1.8 {d4}, [r0], r3 @// third row hence D4
+ vst1.8 {d3}, [r14], r3 @// sixth row hence r6 = D3
+ vst1.8 {d1}, [r0], r3 @// fifth row hence r5 = D1
+ vst1.8 {d7}, [r14], r3 @// eighth row hence r8 = D7
+ vst1.8 {d5}, [r0], r3 @// seventh row hence r7 = D5
+
+ ldmfd sp!, {pc} @// pop saved lr straight into pc: return
+
+
+
+
+
+
+@/*
+@//--------------------------------------------------------------------------- +@// Function Name : impeg2_mc_halfx_fully_8x8_a9q() +@// +@// Detail Description : This function pastes the reference block in the +@// current frame buffer.This function is called for +@// blocks that are not coded and have motion vectors +@// with a half pel resolutionand VopRoundingType is 0 .. +@// +@// Inputs : r0 - out : Current Block Pointer +@// r1 - ref : Refernce Block Pointer +@// r2 - ref_wid : Refernce Block Width +@// r3 - out_wid ; Current Block Width +@// +@// Registers Used : r12, r14, d0-d10, d12-d14, d16-d18, d20-d22 + +@// +@// Stack Usage : 8 bytes +@// +@// Outputs : The Motion Compensated Block +@// +@// Return Data : None +@// +@// Programming Note : <program limitation> +@//----------------------------------------------------------------------------- +@*/ + + + + .global impeg2_mc_halfx_fully_8x8_a9q + + + +impeg2_mc_halfx_fully_8x8_a9q: + + stmfd sp!, {r12, lr} + + add r14, r1, r2, lsl #2 + + add r12, r0, r3, lsl#2 + + vld1.8 {d0, d1}, [r1], r2 @load 16 pixels of row1 + + vld1.8 {d2, d3}, [r14], r2 @ row5 + + + vld1.8 {d4, d5}, [r1], r2 @load 16 pixels row2 + + vld1.8 {d6, d7}, [r14], r2 @row6 + + + vext.8 d8, d0, d1, #1 @Extract pixels (1-8) of row1 + + vext.8 d12, d2, d3, #1 @Extract pixels (1-8) of row5 + + vext.8 d16, d4, d5, #1 @Extract pixels (1-8) of row2 + + vext.8 d20, d6, d7, #1 @Extract pixels (1-8) of row6 + + + vld1.8 {d9, d10}, [r1], r2 @load row3 + + vld1.8 {d13, d14}, [r14], r2 @load row7 + + vld1.8 {d17, d18}, [r1], r2 @load row4 + + vld1.8 {d21, d22}, [r14], r2 @load row8 + + + vext.8 d1, d9, d10, #1 @Extract pixels (1-8) of row3 + + vext.8 d3, d13, d14, #1 @Extract pixels (1-8) of row7 + + + + vext.8 d5, d17, d18, #1 @Extract pixels (1-8) of row4 + + vext.8 d7, d21, d22, #1 @Extract pixels (1-8) of row8 + + + vrhadd.u8 q0, q0, q4 @operate on row1 and row3 + + vrhadd.u8 q1, q1, q6 @operate on row5 and row7 + + + vrhadd.u8 q2, q2, q8 @operate 
on row2 and row4 + + + + vrhadd.u8 q3, q3, q10 @operate on row6 and row8 + + vst1.8 d0, [r0], r3 @store row1 + + vst1.8 d2, [r12], r3 @store row5 + + vst1.8 d4, [r0], r3 @store row2 + + vst1.8 d6, [r12], r3 @store row6 + + vst1.8 d1, [r0], r3 @store row3 + + vst1.8 d3, [r12], r3 @store row7 + + vst1.8 d5, [r0], r3 @store row4 + + vst1.8 d7, [r12], r3 @store row8 + + + + ldmfd sp!, {r12, pc} + + + + + + + + +@/* +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_mc_halfx_halfy_8x8_a9q() +@// +@// Detail Description : This function pastes the reference block in the +@// current frame buffer.This function is called for +@// blocks that are not coded and have motion vectors +@// with a half pel resolutionand VopRoundingType is 0 .. +@// +@// Inputs : r0 - out : Current Block Pointer +@// r1 - ref : Refernce Block Pointer +@// r2 - ref_wid : Refernce Block Width +@// r3 - out_wid ; Current Block Width +@// +@// Registers Used : r14, q0-q15 + +@// +@// Stack Usage : 4 bytes +@// +@// Outputs : The Motion Compensated Block +@// +@// Return Data : None +@// +@// Programming Note : <program limitation> +@//----------------------------------------------------------------------------- +@*/ + + + .global impeg2_mc_halfx_halfy_8x8_a9q + +impeg2_mc_halfx_halfy_8x8_a9q: + + stmfd sp!, {r14} + + add r14, r1, r2, lsl #2 + + vld1.8 {d0, d1}, [r1], r2 @load 16 pixels of row1 + + vld1.8 {d2, d3}, [r14], r2 @ row5 + + vld1.8 {d4, d5}, [r1], r2 @load 16 pixels row2 + + vld1.8 {d6, d7}, [r14], r2 @row6 + + vext.8 d1, d0, d1, #1 @Extract pixels (1-8) of row1 + + + + vext.8 d3, d2, d3, #1 @Extract pixels (1-8) of row5 + + + + vext.8 d5, d4, d5, #1 @Extract pixels (1-8) of row2 + + vext.8 d7, d6, d7, #1 @Extract pixels (1-8) of row6 + + + + + vld1.8 {d8, d9}, [r1], r2 @load row3 + + + + vld1.8 {d10, d11}, [r14], r2 @load row7 + + vld1.8 {d12, d13}, [r1], r2 @load row4 + + vld1.8 {d14, d15}, [r14], r2 @load row8 + + vext.8 d9, d8, d9, 
#1 @Extract pixels (1-8) of row3 + + vld1.8 {d16, d17}, [r14], r2 @load row9 + + + + + + vext.8 d11, d10, d11, #1 @Extract pixels (1-8) of row7 + + + + vext.8 d13, d12, d13, #1 @Extract pixels (1-8) of row4 + + + + vext.8 d15, d14, d15, #1 @Extract pixels (1-8) of row8 + + vext.8 d17, d16, d17, #1 @Extract pixels (1-8) of row9 + + + @interpolation in x direction + + vaddl.u8 q0, d0, d1 @operate row1 + + vaddl.u8 q1, d2, d3 @operate row5 + + vaddl.u8 q2, d4, d5 @operate row2 + + vaddl.u8 q3, d6, d7 @operate row6 + + vaddl.u8 q4, d8, d9 @operate row3 + + vaddl.u8 q5, d10, d11 @operate row7 + + vaddl.u8 q6, d12, d13 @operate row4 + + vaddl.u8 q7, d14, d15 @operate row8 + + vaddl.u8 q8, d16, d17 @operate row9 + + @interpolation in y direction + + add r14, r0, r3, lsl #2 + + + + vadd.u16 q9, q0, q2 @operate row1 and row2 + + vadd.u16 q13, q1, q3 @operate row5 and row6 + + vadd.u16 q10, q2, q4 @operate row2 and row3 + + vadd.u16 q14, q3, q5 @operate row6 and row7 + + vrshrn.u16 d18, q9, #2 @row1 + + vrshrn.u16 d26, q13, #2 @row5 + + vrshrn.u16 d20, q10, #2 @row2 + + vrshrn.u16 d28, q14, #2 @row6 + + vadd.u16 q11, q4, q6 @operate row3 and row4 + + vst1.8 d18, [r0], r3 @store row1 + + vadd.u16 q15, q5, q7 @operate row7 and row8 + + vst1.8 d26, [r14], r3 @store row5 + + vadd.u16 q12, q6, q1 @operate row4 and row5 + + vst1.8 d20, [r0], r3 @store row2 + + vadd.u16 q7, q7, q8 @operate row8 and row9 + + vst1.8 d28, [r14], r3 @store row6 + + + + vrshrn.u16 d22, q11, #2 @row3 + + vrshrn.u16 d30, q15, #2 @row7 + + vrshrn.u16 d24, q12, #2 @row4 + + vrshrn.u16 d14, q7, #2 @row8 + + + vst1.8 d22, [r0], r3 @store row3 + vst1.8 d30, [r14], r3 @store row7 + vst1.8 d24, [r0], r3 @store row4 + vst1.8 d14, [r14], r3 @store row8 + + + + ldmfd sp!, {pc} + + + + + +@/* +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_mc_fullx_fully_8x8_a9q() +@// +@// Detail Description : This function pastes the reference block in the +@// current 
frame buffer.This function is called for
+@// blocks that are not coded and have motion vectors
+@// with a full pel resolution (plain 8x8 copy, no interpolation).
+@//
+@// Inputs : r0 - out : Current Block Pointer
+@// r1 - ref : Reference Block Pointer
+@// r2 - ref_wid : Reference Block Width
+@// r3 - out_wid : Current Block Width
+@//
+@// Registers Used : r12, r14, d0-d3
+
+@//
+@// Stack Usage : 8 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+
+ .global impeg2_mc_fullx_fully_8x8_a9q
+impeg2_mc_fullx_fully_8x8_a9q:
+
+
+ stmfd sp!, {r12, lr}
+
+ add r14, r1, r2, lsl #2 @ r14 = ref + 4*ref_wid: second source stream (rows 5-8)
+
+ add r12, r0, r3, lsl #2 @ r12 = out + 4*out_wid: second destination stream (rows 5-8)
+
+
+ vld1.8 d0, [r1], r2 @load row1
+
+ vld1.8 d1, [r14], r2 @load row5
+
+ vld1.8 d2, [r1], r2 @load row2
+
+ vld1.8 d3, [r14], r2 @load row6
+
+
+ vst1.8 d0, [r0], r3 @store row1
+
+ vst1.8 d1, [r12], r3 @store row5
+
+ vst1.8 d2, [r0], r3 @store row2
+
+ vst1.8 d3, [r12], r3 @store row6
+
+
+ vld1.8 d0, [r1], r2 @load row3
+
+ vld1.8 d1, [r14], r2 @load row7
+
+ vld1.8 d2, [r1], r2 @load row4
+
+ vld1.8 d3, [r14], r2 @load row8
+
+
+ vst1.8 d0, [r0], r3 @store row3
+
+ vst1.8 d1, [r12], r3 @store row7
+
+ vst1.8 d2, [r0], r3 @store row4
+
+ vst1.8 d3, [r12], r3 @store row8
+
+
+ ldmfd sp!, {r12, pc} @ restore r12, pop saved lr into pc: return
+
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_interpolate_a9q()
+@//
+@// Detail Description : interpolates two buffers and adds pred
+@//
+@// Inputs : r0 - pointer to src1
+@// r1 - pointer to src2
+@// r2 - dest buf
+@// r3 - dst stride
+@// Registers Used : r4, r5, r7, r14, d0-d15
+@//
+@// Stack Usage : 20 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+
+ .global
impeg2_interpolate_a9q + + +impeg2_interpolate_a9q: + + stmfd r13!, {r4, r5, r7, r12, r14} + + ldr r4, [r0, #0] @ptr_y src1 + + ldr r5, [r1, #0] @ptr_y src2 + + ldr r7, [r2, #0] @ptr_y dst buf + + mov r12, #4 @counter for number of blocks + + +interp_lumablocks_stride: + + vld1.8 {d0, d1}, [r4]! @row1 src1 + + vld1.8 {d2, d3}, [r4]! @row2 src1 + + vld1.8 {d4, d5}, [r4]! @row3 src1 + + vld1.8 {d6, d7}, [r4]! @row4 src1 + + + vld1.8 {d8, d9}, [r5]! @row1 src2 + + vld1.8 {d10, d11}, [r5]! @row2 src2 + + vld1.8 {d12, d13}, [r5]! @row3 src2 + + vld1.8 {d14, d15}, [r5]! @row4 src2 + + + + + vrhadd.u8 q0, q0, q4 @operate on row1 + + vrhadd.u8 q1, q1, q5 @operate on row2 + + vrhadd.u8 q2, q2, q6 @operate on row3 + + vrhadd.u8 q3, q3, q7 @operate on row4 + + + + vst1.8 {d0, d1}, [r7], r3 @row1 + + vst1.8 {d2, d3}, [r7], r3 @row2 + + vst1.8 {d4, d5}, [r7], r3 @row3 + + vst1.8 {d6, d7}, [r7], r3 @row4 + + subs r12, r12, #1 + + bne interp_lumablocks_stride + + + mov r3, r3, lsr #1 @stride >> 1 + + ldr r4, [r0, #4] @ptr_u src1 + + ldr r5, [r1, #4] @ptr_u src2 + + ldr r7 , [r2, #4] @ptr_u dst buf + + mov r12, #2 @counter for number of blocks + + + +@chroma blocks + +interp_chromablocks_stride: + + vld1.8 {d0, d1}, [r4]! @row1 & 2 src1 + + vld1.8 {d2, d3}, [r4]! @row3 & 4 src1 + + vld1.8 {d4, d5}, [r4]! @row5 & 6 src1 + + vld1.8 {d6, d7}, [r4]! @row7 & 8 src1 + + + vld1.8 {d8, d9}, [r5]! @row1 & 2 src2 + + vld1.8 {d10, d11}, [r5]! @row3 & 4 src2 + + vld1.8 {d12, d13}, [r5]! @row5 & 6 src2 + + vld1.8 {d14, d15}, [r5]! 
@row7 & 8 src2 + + + + + vrhadd.u8 q0, q0, q4 @operate on row1 & 2 + + vrhadd.u8 q1, q1, q5 @operate on row3 & 4 + + vrhadd.u8 q2, q2, q6 @operate on row5 & 6 + + vrhadd.u8 q3, q3, q7 @operate on row7 & 8 + + + vst1.8 {d0}, [r7], r3 @row1 + + vst1.8 {d1}, [r7], r3 @row2 + + vst1.8 {d2}, [r7], r3 @row3 + + vst1.8 {d3}, [r7], r3 @row4 + + vst1.8 {d4}, [r7], r3 @row5 + + vst1.8 {d5}, [r7], r3 @row6 + + vst1.8 {d6}, [r7], r3 @row7 + + vst1.8 {d7}, [r7], r3 @row8 + + + + ldr r4, [r0, #8] @ptr_v src1 + + ldr r5, [r1, #8] @ptr_v src2 + + ldr r7, [r2, #8] @ptr_v dst buf + + subs r12, r12, #1 + + bne interp_chromablocks_stride + + + ldmfd r13!, {r4, r5, r7, r12, pc} + + + + + diff --git a/common/arm/impeg2_mem_func.s b/common/arm/impeg2_mem_func.s new file mode 100755 index 0000000..869b7d7 --- /dev/null +++ b/common/arm/impeg2_mem_func.s @@ -0,0 +1,177 @@ +@/****************************************************************************** +@ * +@ * Copyright (C) 2015 The Android Open Source Project +@ * +@ * Licensed under the Apache License, Version 2.0 (the "License"); +@ * you may not use this file except in compliance with the License. +@ * You may obtain a copy of the License at: +@ * +@ * http://www.apache.org/licenses/LICENSE-2.0 +@ * +@ * Unless required by applicable law or agreed to in writing, software +@ * distributed under the License is distributed on an "AS IS" BASIS, +@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ * See the License for the specific language governing permissions and +@ * limitations under the License. +@ * +@ ***************************************************************************** +@ * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +@*/ + +@/* +@//---------------------------------------------------------------------------- +@// File Name : impeg2_mem_func.s +@// +@// Description : This file has motion compensation related +@// interpolation functions on Neon + CortexA-8 platform +@// +@// Reference Document : +@// +@// Revision History : +@// Date Author Detail Description +@// ------------ ---------------- ---------------------------------- +@// 18 jun 2010 S Hamsalekha Created +@// +@//------------------------------------------------------------------------- +@*/ + +@/* +@// ---------------------------------------------------------------------------- +@// Include Files +@// ---------------------------------------------------------------------------- +@*/ +.text +.p2align 2 + + +@/* +@// ---------------------------------------------------------------------------- +@// Struct/Union Types and Define +@// ---------------------------------------------------------------------------- +@*/ + + +@/* +@// ---------------------------------------------------------------------------- +@// Static Global Data section variables +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + + +@/* +@// ---------------------------------------------------------------------------- +@// Static Prototype Functions +@// ---------------------------------------------------------------------------- +@*/ +@// -------------------------- NONE -------------------------------------------- + +@/* +@// ---------------------------------------------------------------------------- +@// Exported functions +@// ---------------------------------------------------------------------------- +@*/ + +@/* +@//--------------------------------------------------------------------------- +@// Function Name : impeg2_memset_8bit_8x8_block_a9q() +@// +@// Detail Description : This routine intialises the Block matrix 
buffer contents to a
+@// particular Value. This function also assumes the buffer size
+@// to be set is 64 Bytes fixed. It also assumes that blk matrix
+@// used is 64 bit aligned.
+@//
+@// Inputs : r0: pi2_blk_mat : Block Pointer
+@// r1: u2_val : Value with which the block is initialized
+@// r2: u4_dst_width: Destination Width
+@//
+@// Registers Used : q0
+@//
+@// Stack Usage : 4 bytes
+@//
+@// Outputs : Block Matrix Initialized to given value
+@//
+@// Return Data : None
+@//
+@// Programming Note : None
+@//-----------------------------------------------------------------------------
+@*/
+ .global impeg2_memset_8bit_8x8_block_a9q
+impeg2_memset_8bit_8x8_block_a9q:
+ str lr, [sp, #-4]! @//save return address
+
+ vdup.8 d0, r1 @//r1 is the 8-bit value to be set into (low byte of r1 replicated across all 8 lanes of d0)
+
+ @//write the 8-byte fill pattern to 8 rows, advancing by the destination width each time
+ vst1.8 {d0}, [r0], r2 @//Store the row 1
+ vst1.8 {d0}, [r0], r2 @//Store the row 2
+ vst1.8 {d0}, [r0], r2 @//Store the row 3
+ vst1.8 {d0}, [r0], r2 @//Store the row 4
+ vst1.8 {d0}, [r0], r2 @//Store the row 5
+ vst1.8 {d0}, [r0], r2 @//Store the row 6
+ vst1.8 {d0}, [r0], r2 @//Store the row 7
+ vst1.8 {d0}, [r0], r2 @//Store the row 8
+
+ ldr pc, [sp], #4 @//pop saved lr into pc: return
+
+
+
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_memset0_16bit_8x8_linear_block_a9q()
+@//
+@// Detail Description : memsets 128 byte long linear buf to 0
+@//
+@// Inputs : r0 - Buffer
+@// Registers Used : q0
+
+@//
+@// Stack Usage : 4 bytes
+@//
+@// Outputs : None
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+
+
+ .global impeg2_memset0_16bit_8x8_linear_block_a9q
+
+
+impeg2_memset0_16bit_8x8_linear_block_a9q:
+
+ stmfd r13!, {r14} @save return address
+
+ vmov.i16 q0, #0 @q0 = sixteen zero bytes (eight zero 16-bit lanes)
+
+@Y data
+
+ @zero 8 rows of 8 16-bit coefficients each (128 bytes, contiguous)
+ vst1.16 {d0, d1} , [r0]! @row1
+
+ vst1.16 {d0, d1} , [r0]! @row2
+
+ vst1.16 {d0, d1} , [r0]! @row3
+
+ vst1.16 {d0, d1} , [r0]! @row4
+
+ vst1.16 {d0, d1} , [r0]!
@row5 + + vst1.16 {d0, d1} , [r0]! @row6 + + vst1.16 {d0, d1} , [r0]! @row7 + + vst1.16 {d0, d1} , [r0]! @row8 + + + + ldmfd r13!, {pc} + + + + diff --git a/common/arm/impeg2_platform_macros.h b/common/arm/impeg2_platform_macros.h new file mode 100644 index 0000000..11db302 --- /dev/null +++ b/common/arm/impeg2_platform_macros.h @@ -0,0 +1,75 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +#ifndef __IMPEG2_PLATFORM_MACROS_H__ +#define __IMPEG2_PLATFORM_MACROS_H__ + + +#define CONV_LE_TO_BE(u4_temp2,u4_temp1) u4_temp2 = \ + (u4_temp1 << 24) | \ + ((u4_temp1 & 0xff00) << 8) | \ + ((u4_temp1 & 0xff0000) >> 8) | \ + (u4_temp1 >> 24); + +static __inline UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return (__builtin_clz(u4_word)); + else + return 32; +} +static __inline WORD32 CLIP_U8(WORD32 x) +{ + asm("usat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S8(WORD32 x) +{ + asm("ssat %0, #8, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U12(WORD32 x) +{ + asm("usat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_S12(WORD32 x) +{ + asm("ssat %0, #12, %1" : "=r"(x) : "r"(x)); + return x; +} + +static __inline WORD32 CLIP_U16(WORD32 x) +{ + asm("usat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} +static __inline WORD32 CLIP_S16(WORD32 x) +{ + asm("ssat %0, #16, %1" : "=r"(x) : "r"(x)); + return x; +} + +#define INLINE +#define PLD(x) __pld(x) + +#endif /* __IMPEG2_PLATFORM_MACROS_H__ */ diff --git a/common/armv8/impeg2_format_conv.s b/common/armv8/impeg2_format_conv.s new file mode 100644 index 0000000..48baf04 --- /dev/null +++ b/common/armv8/impeg2_format_conv.s @@ -0,0 +1,409 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ + +///* +////---------------------------------------------------------------------------- +//// File Name : impeg2_format_conv.s +//// +//// Description : This file has the Idct Implementations for the +//// MPEG4 SP decoder on neon platform. +//// +//// Reference Document : +//// +//// Revision History : +//// Date Author Detail Description +//// ------------ ---------------- ---------------------------------- +//// Jul 07, 2008 Naveen Kumar T Created +//// +////------------------------------------------------------------------------- +//*/ + +///* +//// ---------------------------------------------------------------------------- +//// Include Files +//// ---------------------------------------------------------------------------- +//*/ +.set log2_16 , 4 +.set log2_2 , 1 + +.text +.include "impeg2_neon_macros.s" +///* +//// ---------------------------------------------------------------------------- +//// Struct/Union Types and Define +//// ---------------------------------------------------------------------------- +//*/ + +///* +//// ---------------------------------------------------------------------------- +//// Static Global Data section variables +//// ---------------------------------------------------------------------------- +//*/ +////--------------------------- NONE -------------------------------------------- + +///* +//// ---------------------------------------------------------------------------- +//// Static Prototype Functions +//// ---------------------------------------------------------------------------- +//*/ +//// -------------------------- NONE -------------------------------------------- + +///* +//// 
---------------------------------------------------------------------------- +//// Exported functions +//// ---------------------------------------------------------------------------- +//*/ + + +///***************************************************************************** +//* * +//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8() * +//* * +//* Description : This function conversts the image from YUV420P color * +//* space to 420SP color space(UV interleaved). * +//* * +//* Arguments : x0 pu1_y * +//* x1 pu1_u * +//* x2 pu1_v * +//* x3 pu1_dest_y * +//* x4 pu1_dest_uv * +//* x5 u2_height * +//* x6 u2_width * +//* x7 u2_stridey * +//* sp, #80 u2_strideu * +//* sp, #88 u2_stridev * +//* sp, #96 u2_dest_stride_y * +//* sp, #104 u2_dest_stride_uv * +//* sp, #112 convert_uv_only * +//* * +//* Values Returned : None * +//* * +//* Register Usage : x8, x10, x16, x20, v0, v1 * +//* * +//* Stack Usage : 80 Bytes * +//* * +//* Interruptibility : Interruptible * +//* * +//* Known Limitations * +//* Assumptions: Image Width: Assumed to be multiple of 16 and * +//* greater than or equal to 16 * +//* Image Height: Assumed to be even. * +//* * +//* Revision History : * +//* DD MM YYYY Author(s) Changes (Describe the changes made) * +//* 07 06 2010 Varshita Draft * +//* 07 06 2010 Naveen Kr T Completed * +//* * +//*****************************************************************************/ +.global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8 +impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8: + + //// push the registers on the stack + // pu1_y, - x0 + // pu1_u, - x1 + // pu1_v, - x2 + // pu1_dest_y, - x3 + // pu1_dest_uv, - x4 + // u2_height, - x5 + // u2_width, - x6 + // u2_stridey, - x7 + // u2_strideu, - sp, #80 + // u2_stridev, - sp, #88 + // u2_dest_stride_y, - sp, #96 + // u2_dest_stride_uv, - sp, #104 + // convert_uv_only - sp, #112 + // STMFD sp!,{x4-x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + ldr w14, [sp, #112] //// Load convert_uv_only + + cmp w14, #1 + beq yuv420sp_uv_chroma + ///* Do the preprocessing before the main loops start */ + //// Load the parameters from stack + + ldr w8, [sp, #96] //// Load u2_dest_stride_y from stack + uxtw x8, w8 + + sub x7, x7, x6 //// Source increment + + sub x8, x8, x6 //// Destination increment + + +yuv420sp_uv_row_loop_y: + mov x16, x6 + +yuv420sp_uv_col_loop_y: + prfm pldl1keep, [x0, #128] + ld1 {v0.8b, v1.8b}, [x0], #16 + st1 {v0.8b, v1.8b}, [x3], #16 + sub x16, x16, #16 + cmp x16, #15 + bgt yuv420sp_uv_col_loop_y + + cmp x16, #0 + beq yuv420sp_uv_row_loop__y + ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + ////Ex if width is 162, above loop will process 160 pixels. And + ////Both source and destination will point to 146th pixel and then 16 bytes will be read + //// and written using VLD1 and VST1 + sub x20, x16, #16 + neg x16, x20 + sub x0, x0, x16 + sub x3, x3, x16 + + ld1 {v0.8b, v1.8b}, [x0], #16 + st1 {v0.8b, v1.8b}, [x3], #16 + +yuv420sp_uv_row_loop__y: + add x0, x0, x7 + add x3, x3, x8 + subs x5, x5, #1 + bgt yuv420sp_uv_row_loop_y + +yuv420sp_uv_chroma: + ldr w7, [sp, #88] //// Load u2_strideu from stack + sxtw x7, w7 + + ldr w8, [sp, #104] //// Load u2_dest_stride_uv from stack + sxtw x8, w8 + + sub x7, x7, x6, lsr #1 //// Source increment + + sub x8, x8, x6 //// Destination increment + + lsr x6, x6, #1 + lsr x5, x5, #1 +yuv420sp_uv_row_loop_uv: + mov x16, x6 + + +yuv420sp_uv_col_loop_uv: + prfm pldl1keep, [x1, #128] + prfm pldl1keep, [x2, #128] + + ld1 {v0.8b}, [x1], #8 + ld1 {v1.8b}, [x2], #8 + st2 {v0.8b, v1.8b}, [x4], #16 + + sub x16, x16, #8 + cmp x16, #7 + bgt yuv420sp_uv_col_loop_uv + + cmp x16, #0 + beq yuv420sp_uv_row_loop__uv + ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + ////Ex if width is 162, above loop will process 160 pixels. 
And + ////Both source and destination will point to 146th pixel and then 16 bytes will be read + //// and written using VLD1 and VST1 + sub x20, x16, #8 + neg x16, x20 + sub x1, x1, x16 + sub x2, x2, x16 + sub x4, x4, x16, lsl #1 + + ld1 {v0.8b}, [x1], #8 + ld1 {v1.8b}, [x2], #8 + st2 {v0.8b, v1.8b}, [x4], #16 + +yuv420sp_uv_row_loop__uv: + add x1, x1, x7 + add x2, x2, x7 + add x4, x4, x8 + subs x5, x5, #1 + bgt yuv420sp_uv_row_loop_uv + ////POP THE REGISTERS + // LDMFD sp!,{x4-x12,PC} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + + +///***************************************************************************** +//* * +//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8() * +//* * +//* Description : This function conversts the image from YUV420P color * +//* space to 420SP color space(VU interleaved). * +//* This function is similar to above function * +//* IMP4D_CXA8_YUV420toYUV420SP_VU with a difference in * +//* VLD1.8 for chroma - order of registers is different * +//* * +//* Arguments : x0 pu1_y * +//* x1 pu1_u * +//* x2 pu1_v * +//* x3 pu1_dest_y * +//* x4 pu1_dest_uv * +//* x5 u2_height * +//* x6 u2_width * +//* x7 u2_stridey * +//* sp, #80 u2_strideu * +//* sp, #88 u2_stridev * +//* sp, #96 u2_dest_stride_y * +//* sp, #104 u2_dest_stride_uv * +//* sp, #112 convert_uv_only * +//* * +//* Values Returned : None * +//* * +//* Register Usage : x8, x14, x16, x20, v0, v1 * +//* * +//* Stack Usage : 80 Bytes * +//* * +//* Interruptibility : Interruptible * +//* * +//* Known Limitations * +//* Assumptions: Image Width: Assumed to be multiple of 16 and * +//* greater than or equal to 16 * +//* Image Height: Assumed to be even. 
* +//* * +//* Revision History : * +//* DD MM YYYY Author(s) Changes (Describe the changes made) * +//* 07 06 2010 Varshita Draft * +//* 07 06 2010 Naveen Kr T Completed * +//* * +//*****************************************************************************/ + +.global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8 +impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8: + + //// push the registers on the stack + // pu1_y, - x0 + // pu1_u, - x1 + // pu1_v, - x2 + // pu1_dest_y, - x3 + // pu1_dest_uv, - x4 + // u2_height, - x5 + // u2_width, - x6 + // u2_stridey, - x7 + // u2_strideu, - sp, #80 + // u2_stridev, - sp, #88 + // u2_dest_stride_y, - sp, #96 + // u2_dest_stride_uv, - sp, #104 + // convert_uv_only - sp, #112 + // STMFD sp!,{x4-x12,x14} + push_v_regs + stp x19, x20, [sp, #-16]! + + ldr w14, [sp, #112] //// Load convert_uv_only + + cmp w14, #1 + beq yuv420sp_vu_chroma + + ///* Do the preprocessing before the main loops start */ + //// Load the parameters from stack + + ldr w8, [sp, #96] //// Load u2_dest_stride_y from stack + uxtw x8, w8 + + sub x7, x7, x6 //// Source increment + + sub x8, x8, x6 //// Destination increment + + +yuv420sp_vu_row_loop_y: + mov x16, x6 + +yuv420sp_vu_col_loop_y: + prfm pldl1keep, [x0, #128] + ld1 {v0.8b, v1.8b}, [x0], #16 + st1 {v0.8b, v1.8b}, [x3], #16 + sub x16, x16, #16 + cmp x16, #15 + bgt yuv420sp_vu_col_loop_y + + cmp x16, #0 + beq yuv420sp_vu_row_loop__y + ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + ////Ex if width is 162, above loop will process 160 pixels. 
And + ////Both source and destination will point to 146th pixel and then 16 bytes will be read + //// and written using VLD1 and VST1 + sub x20, x16, #16 + neg x16, x20 + sub x0, x0, x16 + sub x3, x3, x16 + + ld1 {v0.8b, v1.8b}, [x0], #16 + st1 {v0.8b, v1.8b}, [x3], #16 + +yuv420sp_vu_row_loop__y: + add x0, x0, x7 + add x3, x3, x8 + subs x5, x5, #1 + bgt yuv420sp_vu_row_loop_y + +yuv420sp_vu_chroma: + ldr w7, [sp, #80] //// Load u2_strideu from stack + sxtw x7, w7 + + ldr w8, [sp, #104] //// Load u2_dest_stride_uv from stack + sxtw x8, w8 + + sub x7, x7, x6, lsr #1 //// Source increment + + sub x8, x8, x6 //// Destination increment + + lsr x6, x6, #1 + lsr x5, x5, #1 +yuv420sp_vu_row_loop_uv: + mov x16, x6 + + +yuv420sp_vu_col_loop_uv: + prfm pldl1keep, [x1, #128] + prfm pldl1keep, [x2, #128] + ld1 {v1.8b}, [x1], #8 + ld1 {v0.8b}, [x2], #8 + st2 {v0.8b, v1.8b}, [x4], #16 + sub x16, x16, #8 + cmp x16, #7 + bgt yuv420sp_vu_col_loop_uv + + cmp x16, #0 + beq yuv420sp_vu_row_loop__uv + ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read + ////Ex if width is 162, above loop will process 160 pixels. 
And + ////Both source and destination will point to 146th pixel and then 16 bytes will be read + //// and written using VLD1 and VST1 + sub x20, x16, #8 + neg x16, x20 + sub x1, x1, x16 + sub x2, x2, x16 + sub x4, x4, x16, lsl #1 + + ld1 {v1.8b}, [x1], #8 + ld1 {v0.8b}, [x2], #8 + st2 {v0.8b, v1.8b}, [x4], #16 + +yuv420sp_vu_row_loop__uv: + add x1, x1, x7 + add x2, x2, x7 + add x4, x4, x8 + subs x5, x5, #1 + bgt yuv420sp_vu_row_loop_uv + ////POP THE REGISTERS + // LDMFD sp!,{x4-x12,PC} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + diff --git a/common/armv8/impeg2_idct.s b/common/armv8/impeg2_idct.s new file mode 100644 index 0000000..4956e54 --- /dev/null +++ b/common/armv8/impeg2_idct.s @@ -0,0 +1,1247 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +//*/ +///** +// ******************************************************************************* +// * @file +// * impeg2_idct.s +// * +// * @brief +// * contains function definitions for single stage inverse transform +// * +// * @author +// * anand s +// * +// * @par list of functions: +// * - impeg2_idct_recon_dc_av8() +// * +// * @remarks +// * none +// * +// ******************************************************************************* +//*/ + +///** +// ******************************************************************************* +// * +// * @brief +// * this function performs inverse transform and reconstruction for 8x8 +// * input block +// * +// * @par description: +// * performs inverse transform and adds the prediction data and clips output +// * to 8 bit +// * +// * @param[in] pi2_src +// * input 8x8 coefficients +// * +// * @param[in] pi2_tmp +// * temporary 8x8 buffer for storing inverse +// * +// * transform +// * 1st stage output +// * +// * @param[in] pu1_pred +// * prediction 8x8 block +// * +// * @param[out] pu1_dst +// * output 8x8 block +// * +// * @param[in] src_strd +// * input stride +// * +// * @param[in] pred_strd +// * prediction stride +// * +// * @param[in] dst_strd +// * output stride +// * +// * @param[in] shift +// * output shift +// * +// * @param[in] zero_cols +// * zero columns in pi2_src +// * +// * @returns void +// * +// * @remarks +// * none +// * +// ******************************************************************************* +// */ + +//void impeg2_itrans_recon_8x8(word16 *pi2_src, +// word16 *pi2_tmp, +// uword8 *pu1_pred, +// uword8 *pu1_dst, +// word32 src_strd, +// word32 pred_strd, +// word32 dst_strd, +// word32 zero_cols +// word32 zero_rows ) + +//**************variables vs registers************************* +// x0 => *pi2_src +// x1 => *pi2_tmp +// x2 => *pu1_pred +// x3 => *pu1_dst +// src_strd +// pred_strd +// dst_strd +// zero_cols + + + +.text +.align 4 +.include "impeg2_neon_macros.s" + 
+.set idct_stg1_shift , 12 +.set idct_stg2_shift , 16 +.set idct_stg1_round , (1 << (idct_stg1_shift - 1)) +.set idct_stg2_round , (1 << (idct_stg2_shift - 1)) + +.extern gai2_impeg2_idct_q15 +.extern gai2_impeg2_idct_q11 +.extern gai2_impeg2_idct_first_col_q15 +.extern gai2_impeg2_idct_first_col_q11 +.extern gai2_impeg2_mismatch_stg2_additive + +.global impeg2_idct_recon_dc_av8 +impeg2_idct_recon_dc_av8: + // STMFD sp!,{x4,x6,x12,x14} + push_v_regs + ////x0: pi2_src + ////x1: pi2_tmp - not used, used as pred_strd + ////x2: pu1_pred + ////x3: pu1_dst + ////x4: used as scratch + ////x5: pred_strd + ////x6: dst_strd + + ldrsh x4, [x0] + adrp x14, :got:gai2_impeg2_idct_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q15] + ldrsh x12, [x14] + + ld1 {v0.8b}, [x2], x5 + mul x4, x4, x12 + + ld1 {v1.8b}, [x2], x5 + add x4, x4, #idct_stg1_round + + ld1 {v2.8b}, [x2], x5 + asr x4, x4, #idct_stg1_shift + + adrp x14, :got:gai2_impeg2_idct_q11 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q11] + ldrsh x12, [x14] + + ld1 {v3.8b}, [x2], x5 + mul x4, x4, x12 + + ld1 {v4.8b}, [x2], x5 + add x4, x4, #idct_stg2_round + + ld1 {v5.8b}, [x2], x5 + asr x4, x4, #idct_stg2_shift + + ld1 {v6.8b}, [x2], x5 + dup v30.8h, w4 + + + ld1 {v7.8b}, [x2], x5 + + uaddw v8.8h, v30.8h , v0.8b + + uaddw v10.8h, v30.8h , v1.8b + sqxtun v0.8b, v8.8h + + uaddw v12.8h, v30.8h , v2.8b + sqxtun v1.8b, v10.8h + st1 {v0.8b}, [x3], x6 + + uaddw v14.8h, v30.8h , v3.8b + sqxtun v2.8b, v12.8h + st1 {v1.8b}, [x3], x6 + + uaddw v16.8h, v30.8h , v4.8b + sqxtun v3.8b, v14.8h + st1 {v2.8b}, [x3], x6 + + uaddw v18.8h, v30.8h , v5.8b + sqxtun v4.8b, v16.8h + st1 {v3.8b}, [x3], x6 + + uaddw v20.8h, v30.8h , v6.8b + sqxtun v5.8b, v18.8h + st1 {v4.8b}, [x3], x6 + + uaddw v22.8h, v30.8h , v7.8b + sqxtun v6.8b, v20.8h + st1 {v5.8b}, [x3], x6 + + sqxtun v7.8b, v22.8h + st1 {v6.8b}, [x3], x6 + + + st1 {v7.8b}, [x3], x6 + + // LDMFD sp!,{x4,x6,x12,pc} + pop_v_regs + ret + + + +.global impeg2_idct_recon_dc_mismatch_av8 +.extern 
gai2_impeg2_idct_last_row_q11 +.extern gai2_impeg2_mismatch_stg1_outp +impeg2_idct_recon_dc_mismatch_av8: + // STMFD sp!,{x4-x12,x14} + push_v_regs + + ldrsh x4, [x0] + adrp x14, :got:gai2_impeg2_idct_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q15] + ldrsh x12, [x14] + + mul x4, x4, x12 + add x4, x4, #idct_stg1_round + asr x4, x4, #idct_stg1_shift + + adrp x14, :got:gai2_impeg2_idct_q11 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q11] + ldrsh x12, [x14] + mul x4, x4, x12 + dup v0.4s, w4 + + mov x14, #16 ////Increment for table read + adrp x4, :got:gai2_impeg2_mismatch_stg2_additive + ldr x4, [x4, #:got_lo12:gai2_impeg2_mismatch_stg2_additive] + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, 
v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + ld1 {v2.4h, v3.4h}, [x4], x14 + ld1 {v30.8b}, [x2], x5 + sxtl v8.4s, v2.4h + sxtl v10.4s, v3.4h + raddhn v12.4h, v0.4s, v8.4s + raddhn2 v12.8h, v0.4s, v10.4s + uaddw v14.8h, v12.8h , v30.8b + sqxtun v30.8b, v14.8h + st1 {v30.8b}, [x3], x6 + + + // LDMFD sp!,{x4-x12,pc} + pop_v_regs + ret + +.globl impeg2_idct_recon_av8 + +.type impeg2_idct_recon_av8, %function + +impeg2_idct_recon_av8: +////register usage.extern - loading and until idct of columns +//// cosine constants - d0 +//// sine constants - d1 +//// row 0 first half - d2 - y0 +//// row 1 first half - d6 - y1 +//// row 2 first half - d3 - y2 +//// row 3 first half - d7 - y3 +//// row 4 first half - d10 - y4 +//// row 5 first half - d14 - y5 +//// row 6 first half - d11 - y6 +//// row 7 first half - d15 - y7 + +//// row 0 second half - d4 - y0 +//// row 1 second half - d8 - y1 +//// row 2 second half - d5 - y2 +//// row 3 second half - d9 - y3 +//// row 4 second half - d12 - y4 +//// row 5 second half - d16 - y5 +//// row 6 second half - d13 - y6 +//// row 7 second half - d17 - y7 + + //// copy the input pointer to another register + //// step 1 : load all constants + // stmfd sp!,{x4-x12,x14} + + ldr w11, [sp] // zero rows + + push_v_regs + stp x19, x20, [sp, #-16]! 
+ + mov x12, x7 // zero columns + mov x8, x5 // prediction stride + mov x7, x6 // destination stride + mov x6, x4 // src stride + lsl x6, x6, #1 // x sizeof(word16) + add x9, x0, x6, lsl #1 // 2 rows + + add x10, x6, x6, lsl #1 // 3 rows + + sub x10, x10, #8 // - 4 cols * sizeof(word16) + sub x5, x6, #8 // src_strd - 4 cols * sizeof(word16) + + adrp x14, :got:gai2_impeg2_idct_first_col_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15] + ld1 {v0.4h, v1.4h}, [x14] ////d0,d1 are used for storing the constant data + + ////step 2 load all the input data + ////step 3 operate first 4 colums at a time + + and x11, x11, #0xff + and x12, x12, #0xff + + cmp x11, #0xf0 + bge skip_last4_rows + + + ld1 {v2.4h}, [x0], #8 + ld1 {v3.4h}, [x9], #8 + ld1 {v4.4h}, [x0], x5 + smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + ld1 {v5.4h}, [x9], x5 + smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + ld1 {v6.4h}, [x0], #8 + ld1 {v7.4h}, [x9], #8 + smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) + ld1 {v8.4h}, [x0], x10 + smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) + ld1 {v9.4h}, [x9], x10 + smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) + ld1 {v10.4h}, [x0], #8 + smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + ld1 {v11.4h}, [x9], #8 + smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + ld1 {v12.4h}, [x0], x5 + smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + ld1 {v13.4h}, [x9], x5 + smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + ld1 {v14.4h}, [x0], #8 + smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + ld1 {v15.4h}, [x9], #8 + smull v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + ld1 {v16.4h}, [x0], x10 + smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + ld1 {v17.4h}, [x9], x10 + + ///* this following was activated when alignment is not there */ +//// vld1.16 d2,[x0]! 
+//// vld1.16 d3,[x2]! +//// vld1.16 d4,[x0]! +//// vld1.16 d5,[x2]! +//// vld1.16 d6,[x0]! +//// vld1.16 d7,[x2]! +//// vld1.16 d8,[x0],x3 +//// vld1.16 d9,[x2],x3 +//// vld1.16 d10,[x0]! +//// vld1.16 d11,[x2]! +//// vld1.16 d12,[x0]! +//// vld1.16 d13,[x2]! +//// vld1.16 d14,[x0]! +//// vld1.16 d15,[x2]! +//// vld1.16 d16,[x0],x3 +//// vld1.16 d17,[x2],x3 + + + + + smlal v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smlsl v26.4s, v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + smlal v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + smlsl v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlal v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + smlal v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) + smlsl v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) + smlal v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) + smlsl v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) + + add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) + sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0) + sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7) + + add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2) + sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5) + + add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1) + sub 
v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6) + + add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3) + sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4) + + sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + + + b last4_cols + + + +skip_last4_rows: + adrp x14, :got:gai2_impeg2_idct_first_col_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15] + ld1 {v0.4h, v1.4h}, [x14] + + ld1 {v2.4h}, [x0], #8 + ld1 {v3.4h}, [x9], #8 + ld1 {v4.4h}, [x0], x5 + ld1 {v5.4h}, [x9], x5 + ld1 {v6.4h}, [x0], #8 + ld1 {v7.4h}, [x9], #8 + ld1 {v8.4h}, [x0], x10 + ld1 {v9.4h}, [x9], x10 + + + + movi v12.4h, #0 + movi v13.4h, #0 + movi v16.4h, #0 + movi v17.4h, #0 + + + + + smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + + smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v6.4s, v3.4h, v0.4h[2] //// y2 * 
cos2(part of d0) + + smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + + + add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) + sub v10.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0) + sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7) + + add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2) + sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5) + + add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1) + sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6) + + add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3) + sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4) + + sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + + +last4_cols: + adrp x14, :got:gai2_impeg2_idct_first_col_q15 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15] + ld1 {v0.4h, v1.4h}, [x14] + + + cmp x12, #0xf0 + bge skip_last4cols + + smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * 
cos3(part of b0) + smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + + smull v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1) + smull v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0) + + smull v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + + smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlal v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7) + smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6) + smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5) + smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4) + + add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7) + sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4) + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of e2,e5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of e1,e6) + + add v20.4s, v16.4s , v24.4s //// a0 + b0(part of e0) + sub v8.4s, v16.4s , v24.4s //// a0 - b0(part of e7) + + add v24.4s, v22.4s , v28.4s //// 
a2 + b2(part of e2) + sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of e5) + + add v28.4s, v18.4s , v26.4s //// a1 + b1(part of e1) + sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of e6) + + add v26.4s, v12.4s , v30.4s //// a3 + b3(part of e3) + sub v30.4s, v12.4s , v30.4s //// a3 - b3(part of x4) + + sqrshrn v4.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v17.4h, v8.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v5.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v16.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v8.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v13.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v9.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT) + sqrshrn v12.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT) + b end_skip_last4cols + + + +skip_last4cols: + adrp x14, :got:gai2_impeg2_idct_first_col_q11 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11] + ld1 {v0.4h, v1.4h}, [x14] + + umov x15, v25.d[0] + + trn1 v25.4h, v2.4h, v6.4h + trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first qudrant transposing + + trn1 v27.4h, v3.4h, v7.4h + trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first qudrant transposing + + trn1 v6.2s, v29.2s, v31.2s + trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first qudrant transposing continued..... + trn1 v2.2s, v25.2s, v27.2s + trn2 v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first qudrant transposing continued..... 
+ + + trn1 v25.4h, v10.4h, v14.4h + trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third qudrant transposing + + trn1 v27.4h, v11.4h, v15.4h + trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third qudrant transposing + + trn1 v10.2s, v25.2s, v27.2s + trn2 v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third qudrant transposing continued..... + trn1 v14.2s, v29.2s, v31.2s + trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third qudrant transposing continued..... + + mov v25.d[0], x15 + + smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + + smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) +// vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1) + + smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + + + + + sub v22.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) + add v4.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) + + + add v2.4s, v4.4s , v24.4s + + sub v6.4s, v4.4s , v24.4s + + add v8.4s, v22.4s , v30.4s + + sub v24.4s, v22.4s , v30.4s + + sqrshrn v5.4h, v8.4s, #idct_stg2_shift + sqrshrn v2.4h, v2.4s, #idct_stg2_shift + sqrshrn v9.4h, v6.4s, #idct_stg2_shift + sqrshrn v6.4h, v24.4s, #idct_stg2_shift + + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + + add v30.4s, v22.4s , v28.4s + + sub v24.4s, v22.4s , v28.4s + + add v28.4s, v18.4s , v26.4s + + sub v22.4s, v18.4s , v26.4s + sqrshrn v4.4h, v30.4s, #idct_stg2_shift + 
sqrshrn v7.4h, v24.4s, #idct_stg2_shift + sqrshrn v3.4h, v28.4s, #idct_stg2_shift + sqrshrn v8.4h, v22.4s, #idct_stg2_shift + + + + umov x19, v25.d[0] + umov x20, v25.d[1] + + trn1 v27.4h, v2.4h, v3.4h + trn2 v29.4h, v2.4h, v3.4h + trn1 v25.4h, v4.4h, v5.4h + trn2 v31.4h, v4.4h, v5.4h + + trn1 v2.2s, v27.2s, v25.2s + trn2 v4.2s, v27.2s, v25.2s + trn1 v3.2s, v29.2s, v31.2s + trn2 v5.2s, v29.2s, v31.2s + + trn1 v27.4h, v6.4h, v7.4h + trn2 v29.4h, v6.4h, v7.4h + trn1 v25.4h, v8.4h, v9.4h + trn2 v31.4h, v8.4h, v9.4h + + trn1 v6.2s, v27.2s, v25.2s + trn2 v8.2s, v27.2s, v25.2s + trn1 v7.2s, v29.2s, v31.2s + trn2 v9.2s, v29.2s, v31.2s + + mov v25.d[0], x19 + mov v25.d[1], x20 + + smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0) + + smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) + smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0) + + + add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data + + + add x5, x8, x8, lsl #1 // + + + add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data + + + add x10, x7, x7, lsl #1 // + + // swapping v3 and v6 + mov v31.d[0], v3.d[0] + mov v3.d[0], v6.d[0] + mov v6.d[0], v31.d[0] + + // swapping v5 and v8 + mov v31.d[0], v5.d[0] + mov v5.d[0], v8.d[0] + mov v8.d[0], v31.d[0] + + + sub v22.4s, v20.4s , v14.4s //// a3 = c0 - d0(part of x3,x4) + add v12.4s, v20.4s , v14.4s //// a0 = c0 + d0(part of x0,x7) + + + add v0.4s, v12.4s , 
v24.4s + + + sub v24.4s, v12.4s , v24.4s + + + add v12.4s, v22.4s , v30.4s + + + sub v14.4s, v22.4s , v30.4s + + sqrshrn v10.4h, v0.4s, #idct_stg2_shift + sqrshrn v17.4h, v24.4s, #idct_stg2_shift + sqrshrn v13.4h, v12.4s, #idct_stg2_shift + sqrshrn v14.4h, v14.4s, #idct_stg2_shift + + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + + add v0.4s, v22.4s , v28.4s + + + sub v24.4s, v22.4s , v28.4s + + + add v28.4s, v18.4s , v26.4s + + + sub v26.4s, v18.4s , v26.4s + ld1 {v18.8b}, [x2], x8 + + sqrshrn v12.4h, v0.4s, #idct_stg2_shift + ld1 {v20.8b}, [x2], x5 + + + sqrshrn v15.4h, v24.4s, #idct_stg2_shift + ld1 {v19.8b}, [x2], x8 + + + + + sqrshrn v11.4h, v28.4s, #idct_stg2_shift + ld1 {v22.8b}, [x4], x8 + + + + + sqrshrn v16.4h, v26.4s, #idct_stg2_shift + ld1 {v21.8b}, [x2], x5 + + + b pred_buff_addition +end_skip_last4cols: + adrp x14, :got:gai2_impeg2_idct_first_col_q11 + ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11] + ld1 {v0.4h, v1.4h}, [x14] + + + umov x19, v25.d[0] + umov x20, v25.d[1] + +///* now the idct of columns is done, transpose so that row idct done efficiently(step5) */ + trn1 v27.4h, v2.4h, v6.4h + trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first qudrant transposing + trn1 v25.4h, v3.4h, v7.4h + trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first qudrant transposing + + trn1 v2.2s, v27.2s, v25.2s + trn2 v3.2s, v27.2s, v25.2s ////x0,x1,x2,x3 first qudrant transposing continued..... + trn1 v6.2s, v29.2s, v31.2s + trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first qudrant transposing continued..... + + trn1 v27.4h, v4.4h, v8.4h + trn2 v29.4h, v4.4h, v8.4h ////[x3,x1],[x2,x0] second qudrant transposing + trn1 v25.4h, v5.4h, v9.4h + trn2 v31.4h, v5.4h, v9.4h ////[x3,x1],[x2,x0] second qudrant transposing + + trn1 v4.2s, v27.2s, v25.2s + trn2 v5.2s, v27.2s, v25.2s ////x0,x1,x2,x3 second qudrant transposing continued..... 
+ trn1 v8.2s, v29.2s, v31.2s + trn2 v9.2s, v29.2s, v31.2s ////x0,x1,x2,x3 second qudrant transposing continued..... + + trn1 v27.4h, v10.4h, v14.4h + trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third qudrant transposing + trn1 v25.4h, v11.4h, v15.4h + trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third qudrant transposing + + trn1 v10.2s, v27.2s, v25.2s + trn2 v11.2s, v27.2s, v25.2s ////x4,x5,x6,x7 third qudrant transposing continued..... + trn1 v14.2s, v29.2s, v31.2s + trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third qudrant transposing continued..... + + trn1 v27.4h, v12.4h, v16.4h + trn2 v29.4h, v12.4h, v16.4h ////[x7,x5],[x6,x4] fourth qudrant transposing + trn1 v25.4h, v13.4h, v17.4h + trn2 v31.4h, v13.4h, v17.4h ////[x7,x5],[x6,x4] fourth qudrant transposing + + trn1 v12.2s, v27.2s, v25.2s + trn2 v13.2s, v27.2s, v25.2s ////x4,x5,x6,x7 fourth qudrant transposing continued..... + trn1 v16.2s, v29.2s, v31.2s + trn2 v17.2s, v29.2s, v31.2s ////x4,x5,x6,x7 fourth qudrant transposing continued..... 
+ + mov v25.d[0], x19 + mov v25.d[1], x20 + + ////step6 operate on first four rows and find their idct + ////register usage.extern - storing and idct of rows +//// cosine constants - d0 +//// sine constants - d1 +//// element 0 first four - d2 - y0 +//// element 1 first four - d6 - y1 +//// element 2 first four - d3 - y2 +//// element 3 first four - d7 - y3 +//// element 4 first four - d4 - y4 +//// element 5 first four - d8 - y5 +//// element 6 first four - d5 - y6 +//// element 7 first four - d9 - y7 +//// element 0 second four - d10 - y0 +//// element 1 second four - d14 - y1 +//// element 2 second four - d11 - y2 +//// element 3 second four - d15 - y3 +//// element 4 second four - d12 - y4 +//// element 5 second four - d16 - y5 +//// element 6 second four - d13 - y6 +//// element 7 second four - d17 - y7 + + //// map between first kernel code seq and current +//// d2 -> d2 +//// d6 -> d6 +//// d3 -> d3 +//// d7 -> d7 +//// d10 -> d4 +//// d14 -> d8 +//// d11 -> d5 +//// d15 -> d9 +//// q3 -> q3 +//// q5 -> q2 +//// q7 -> q4 + + smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3) + + smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + + smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + + smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1) + smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0) + + + smlal v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 
* cos1(part of b1) + smlal v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + smlsl v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + smlal v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1) + + smlal v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) + smlsl v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) + smlal v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) + smlsl v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) + + sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4) + add v4.4s, v2.4s , v6.4s //// a0 = c0 + d0(part of x0,x7) + + + add v2.4s, v4.4s , v24.4s + + sub v6.4s, v4.4s , v24.4s + + add v8.4s, v22.4s , v30.4s + + sub v24.4s, v22.4s , v30.4s + + sqrshrn v5.4h, v8.4s, #idct_stg2_shift + sqrshrn v2.4h, v2.4s, #idct_stg2_shift + sqrshrn v9.4h, v6.4s, #idct_stg2_shift + sqrshrn v6.4h, v24.4s, #idct_stg2_shift + + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + + add v30.4s, v22.4s , v28.4s + + sub v24.4s, v22.4s , v28.4s + + add v28.4s, v18.4s , v26.4s + + sub v22.4s, v18.4s , v26.4s + sqrshrn v4.4h, v30.4s, #idct_stg2_shift + sqrshrn v7.4h, v24.4s, #idct_stg2_shift + sqrshrn v3.4h, v28.4s, #idct_stg2_shift + sqrshrn v8.4h, v22.4s, #idct_stg2_shift + + + + umov x19, v25.d[0] + umov x20, v25.d[1] + + trn1 v27.4h, v2.4h, v3.4h + trn2 v29.4h, v2.4h, v3.4h + trn1 v25.4h, v4.4h, v5.4h + trn2 v31.4h, v4.4h, v5.4h + + trn1 v2.2s, v27.2s, v25.2s + trn2 v4.2s, v27.2s, v25.2s + trn1 
v3.2s, v29.2s, v31.2s + trn2 v5.2s, v29.2s, v31.2s + + trn1 v27.4h, v6.4h, v7.4h + trn2 v29.4h, v6.4h, v7.4h + trn1 v25.4h, v8.4h, v9.4h + trn2 v31.4h, v8.4h, v9.4h + + trn1 v6.2s, v27.2s, v25.2s + trn2 v8.2s, v27.2s, v25.2s + trn1 v7.2s, v29.2s, v31.2s + trn2 v9.2s, v29.2s, v31.2s + + mov v25.d[0], x19 + mov v25.d[1], x20 + + + + smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0) + smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1) + smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2) + smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3) + smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0) + smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1) + smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2) + smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3) + smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1) + smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1) + smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1) + smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0) + smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0) + + add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data + smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1) + + add x5, x8, x8, lsl #1 // + smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2) + + add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data + smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3) + + add x10, x7, x7, lsl #1 // + smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1) + + + smlal v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1) + + add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1) + sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - 
y4 * cos4(part of a0 and a1) + + smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7) + + // swapping v3 and v6 + mov v31.d[0], v3.d[0] + mov v3.d[0], v6.d[0] + mov v6.d[0], v31.d[0] + + smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6) + // swapping v5 and v8 + mov v31.d[0], v5.d[0] + mov v5.d[0], v8.d[0] + mov v8.d[0], v31.d[0] + + smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5) + smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4) + + sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4) + add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7) + + + add v0.4s, v12.4s , v24.4s + + + sub v24.4s, v12.4s , v24.4s + + + add v12.4s, v22.4s , v30.4s + + + sub v14.4s, v22.4s , v30.4s + + sqrshrn v10.4h, v0.4s, #idct_stg2_shift + sqrshrn v17.4h, v24.4s, #idct_stg2_shift + sqrshrn v13.4h, v12.4s, #idct_stg2_shift + sqrshrn v14.4h, v14.4s, #idct_stg2_shift + + sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5) + add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6) + + + add v0.4s, v22.4s , v28.4s + + + sub v24.4s, v22.4s , v28.4s + + + add v28.4s, v18.4s , v26.4s + + + sub v26.4s, v18.4s , v26.4s + ld1 {v18.8b}, [x2], x8 + + sqrshrn v12.4h, v0.4s, #idct_stg2_shift + ld1 {v20.8b}, [x2], x5 + + + sqrshrn v15.4h, v24.4s, #idct_stg2_shift + ld1 {v19.8b}, [x2], x8 + + + + + sqrshrn v11.4h, v28.4s, #idct_stg2_shift + ld1 {v22.8b}, [x4], x8 + + + + + sqrshrn v16.4h, v26.4s, #idct_stg2_shift + ld1 {v21.8b}, [x2], x5 + + + + +pred_buff_addition: + + umov x19, v25.d[0] + umov x20, v25.d[1] + + trn1 v27.4h, v10.4h, v11.4h + trn2 v29.4h, v10.4h, v11.4h + trn1 v25.4h, v12.4h, v13.4h + trn2 v31.4h, v12.4h, v13.4h + + trn1 v10.2s, v27.2s, v25.2s + trn2 v12.2s, v27.2s, v25.2s + trn1 v11.2s, v29.2s, v31.2s + trn2 v13.2s, v29.2s, v31.2s + + trn1 v27.4h, 
v14.4h, v15.4h + trn2 v29.4h, v14.4h, v15.4h + trn1 v25.4h, v16.4h, v17.4h + trn2 v31.4h, v16.4h, v17.4h + + trn1 v14.2s, v27.2s, v25.2s + trn2 v16.2s, v27.2s, v25.2s + trn1 v15.2s, v29.2s, v31.2s + trn2 v17.2s, v29.2s, v31.2s + + + mov v25.d[0], x19 + mov v25.d[1], x20 + + + ld1 {v24.8b}, [x4], x5 + ld1 {v23.8b}, [x4], x8 + ld1 {v25.8b}, [x4], x5 + mov v2.d[1], v3.d[0] + mov v4.d[1], v5.d[0] + mov v6.d[1], v7.d[0] + mov v8.d[1], v9.d[0] + uaddw v2.8h, v2.8h , v18.8b + uaddw v4.8h, v4.8h , v22.8b + uaddw v6.8h, v6.8h , v20.8b + uaddw v8.8h, v8.8h , v24.8b + + // swapping v11 and v14 + mov v31.d[0], v11.d[0] + mov v11.d[0], v14.d[0] + mov v14.d[0], v31.d[0] + + // swapping v13 and v16 + mov v31.d[0], v13.d[0] + mov v13.d[0], v16.d[0] + mov v16.d[0], v31.d[0] +// row values stored in the q register. + +//q1 :x0 +//q3: x1 +//q2: x2 +//q4: x3 +//q5: x4 +//q7: x5 +//q6: x6 +//q8: x7 + + + +///// adding the prediction buffer + + + + + + + + + + // load prediction data + + + + + + //adding recon with prediction + + + + + mov v10.d[1], v11.d[0] + mov v12.d[1], v13.d[0] + mov v14.d[1], v15.d[0] + mov v16.d[1], v17.d[0] + uaddw v10.8h, v10.8h , v19.8b + sqxtun v2.8b, v2.8h + uaddw v14.8h, v14.8h , v21.8b + sqxtun v4.8b, v4.8h + uaddw v12.8h, v12.8h , v23.8b + sqxtun v6.8b, v6.8h + uaddw v16.8h, v16.8h , v25.8b + sqxtun v8.8b, v8.8h + + + + + + + + st1 {v2.8b}, [x3], x7 + sqxtun v10.8b, v10.8h + st1 {v6.8b}, [x3], x10 + sqxtun v14.8b, v14.8h + st1 {v4.8b}, [x0], x7 + sqxtun v12.8b, v12.8h + st1 {v8.8b}, [x0], x10 + sqxtun v16.8b, v16.8h + + + + + + + + st1 {v10.8b}, [x3], x7 + st1 {v14.8b}, [x3], x10 + st1 {v12.8b}, [x0], x7 + st1 {v16.8b}, [x0], x10 + + + + + // ldmfd sp!,{x4-x12,pc} + ldp x19, x20, [sp], #16 + pop_v_regs + ret + + + + diff --git a/common/armv8/impeg2_inter_pred.s b/common/armv8/impeg2_inter_pred.s new file mode 100644 index 0000000..98ade45 --- /dev/null +++ b/common/armv8/impeg2_inter_pred.s @@ -0,0 +1,814 @@ 
+//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ + +///* +////---------------------------------------------------------------------------- +//// File Name : impeg2_inter_pred.s +//// +//// Description : This file has motion compensation related +//// interpolation functions on Neon + CortexA-8 platform +//// +//// Reference Document : +//// +//// Revision History : +//// Date Author Detail Description +//// ------------ ---------------- ---------------------------------- +//// 18 jun 2010 S Hamsalekha Created +//// +////------------------------------------------------------------------------- +//*/ + +///* +//// ---------------------------------------------------------------------------- +//// Include Files +//// ---------------------------------------------------------------------------- +//*/ +// PRESERVE8 +.text +.include "impeg2_neon_macros.s" + +///* +//// ---------------------------------------------------------------------------- +//// Struct/Union Types and Define +//// ---------------------------------------------------------------------------- +//*/ + + +///* +//// 
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//// -------------------------- NONE --------------------------------------------


//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//// -------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/


///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_copy_mb_av8()
////
//// Detail Description : Copies one macroblock (4:2:0) from src to dst:
////                      sixteen 16-byte luma rows, then eight 8-byte rows
////                      for each chroma plane at half the luma stride.
////
//// Inputs             : x0 - pointer to src buffer descriptor
////                           (y/u/v plane pointers at offsets 0/8/16)
////                      x1 - pointer to dst buffer descriptor (same layout)
////                      x2 - source luma stride (bytes)
////                      x3 - destination luma stride (bytes)
//// Registers Used     : x4, x5, x6, v0, v1
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : dst macroblock holds a copy of the src macroblock
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/



.global impeg2_copy_mb_av8


impeg2_copy_mb_av8:

    push_v_regs

    ldr     x4, [x0]                        //x4 = src->y
    ldr     x5, [x1]                        //x5 = dst->y

    //Copy the 16 luma rows, 16 bytes per row
    mov     x6, #16                         //x6 = remaining luma rows
copy_mb_y_loop:
    ld1     {v0.8b, v1.8b}, [x4], x2        //load one 16-byte row, advance src
    st1     {v0.8b, v1.8b}, [x5], x3        //store the row, advance dst
    subs    x6, x6, #1
    b.ne    copy_mb_y_loop

    lsr     x2, x2, #1                      //chroma stride = luma stride / 2
    lsr     x3, x3, #1

    ldr     x4, [x0, #8]                    //x4 = src->u
    ldr     x5, [x1, #8]                    //x5 = dst->u

    //Copy the 8 rows of the U plane, 8 bytes per row
    mov     x6, #8                          //x6 = remaining U rows
copy_mb_u_loop:
    ld1     {v0.8b}, [x4], x2               //load one 8-byte row, advance src
    st1     {v0.8b}, [x5], x3               //store the row, advance dst
    subs    x6, x6, #1
    b.ne    copy_mb_u_loop

    ldr     x4, [x0, #16]                   //x4 = src->v
    ldr     x5, [x1, #16]                   //x5 = dst->v

    //Copy the 8 rows of the V plane, 8 bytes per row
    mov     x6, #8                          //x6 = remaining V rows
copy_mb_v_loop:
    ld1     {v0.8b}, [x4], x2               //load one 8-byte row, advance src
    st1     {v0.8b}, [x5], x3               //store the row, advance dst
    subs    x6, x6, #1
    b.ne    copy_mb_v_loop

    pop_v_regs
    ret


///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_fullx_halfy_8x8_av8()
////
//// Detail Description : This function pastes the reference block in the
////                      current frame buffer. This function is called for
////                      blocks that are not coded and have motion vectors
////                      with a half pel resolution.
////
//// Inputs             : x0 - out     : current (destination) block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference block stride (bytes)
////                      x3 - out_wid : current block stride (bytes)
////
//// Registers Used     : x14, v0-v9
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated 8x8 block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/

.global impeg2_mc_fullx_halfy_8x8_av8

impeg2_mc_fullx_halfy_8x8_av8:

//STMFD x13!,{x12,x14}
    push_v_regs
    add x14, x1, x2                     //x14 = ref + stride: second load stream, interleaved with x1
    lsl x2, x2, #1                      //both streams now step two source rows at a time

///* Load 8 + 1 reference rows; out row n = rounding average of ref rows n and n+1 */
///* urhadd computes (a + b + 1) >> 1, i.e. the half-pel average with rounding   */
    ld1 {v0.8b}, [x1], x2               //// v0 = ref row 1
    ld1 {v2.8b}, [x14], x2              //// v2 = ref row 2
    ld1 {v4.8b}, [x1], x2               //// v4 = ref row 3
    ld1 {v6.8b}, [x14], x2              //// v6 = ref row 4
    ld1 {v1.8b}, [x1], x2               //// v1 = ref row 5
    ld1 {v3.8b}, [x14], x2              //// v3 = ref row 6
    urhadd v9.8b, v1.8b , v6.8b         //// v9 = out row 4 = avg(ref rows 4,5)
    ld1 {v5.8b}, [x1], x2               //// v5 = ref row 7
    //NOTE(review): .16b ops on regs loaded with .8b — only the low 64 bits are
    //meaningful; the upper halves hold garbage but are never stored (all stores
    //below are .8b), so the result is unaffected.
    urhadd v0.16b, v0.16b , v2.16b      //// v0 = out row 1 = avg(ref rows 1,2)
    urhadd v1.16b, v1.16b , v3.16b      //// v1 = out row 5 = avg(ref rows 5,6)
    ld1 {v7.8b}, [x14], x2              //// v7 = ref row 8
    urhadd v2.16b, v2.16b , v4.16b      //// v2 = out row 2 = avg(ref rows 2,3)
    urhadd v3.16b, v3.16b , v5.16b      //// v3 = out row 6 = avg(ref rows 6,7)
    ld1 {v8.8b}, [x1], x2               //// v8 = ref row 9
    urhadd v4.16b, v4.16b , v6.16b      //// v4 = out row 3 = avg(ref rows 3,4)
    urhadd v5.16b, v5.16b , v7.16b      //// v5 = out row 7 = avg(ref rows 7,8)

    add x14, x0, x3                     //x14 = out + stride: even-row store stream
    lsl x3, x3, #1                      //both store streams step two rows at a time

///* Store the eight interpolated rows */
    st1 {v2.8b}, [x14], x3              //// store out row 2
    urhadd v7.8b, v7.8b , v8.8b         //// v7 = out row 8 = avg(ref rows 8,9)
    st1 {v0.8b}, [x0], x3               //// store out row 1
    st1 {v9.8b}, [x14], x3              //// store out row 4
    st1 {v4.8b}, [x0], x3               //// store out row 3
    st1 {v3.8b}, [x14], x3              //// store out row 6
    st1 {v1.8b}, [x0], x3               //// store out row 5
    st1 {v7.8b}, [x14], x3              //// store out row 8
    st1 {v5.8b}, [x0], x3               //// store out row 7

// LDMFD sp!,{x12,pc}
    pop_v_regs
    ret





///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_halfx_fully_8x8_av8()
////
//// Detail Description : This function pastes the reference block in the
////                      current frame buffer. This function is called for
////                      blocks that are not coded and have motion vectors
////                      with a half pel resolution and VopRoundingType is 0.
////
//// Inputs             : x0 - out     : current (destination) block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference block stride (bytes)
////                      x3 - out_wid : current block stride (bytes)
////
//// Registers Used     : x12, x14, v0-v10, v12-v14, v16-v18, v20-v22
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated 8x8 block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/



.global impeg2_mc_halfx_fully_8x8_av8



impeg2_mc_halfx_fully_8x8_av8:

    // STMFD sp!,{x12,x14}
    push_v_regs

    add x14, x1, x2, lsl #2             //x14 = ref + 4*stride: loads rows 5..8

    add x12, x0, x3, lsl#2              //x12 = out + 4*stride: stores rows 5..8

///* Each output pixel is the rounding average of a reference pixel and its   */
///* right neighbour: 9 source bytes per row -> 16 loaded, ext shifts by one. */
    ld1 {v0.8b, v1.8b}, [x1], x2        //v0,v1 = ref row 1 (16 pixels)

    ld1 {v2.8b, v3.8b}, [x14], x2       //v2,v3 = ref row 5


    ld1 {v4.8b, v5.8b}, [x1], x2        //v4,v5 = ref row 2

    ld1 {v6.8b, v7.8b}, [x14], x2       //v6,v7 = ref row 6


    ext v8.8b, v0.8b , v1.8b , #1       //v8 = row 1 shifted right by one pixel

    ext v12.8b, v2.8b , v3.8b , #1      //v12 = row 5 shifted

    ext v16.8b, v4.8b , v5.8b , #1      //v16 = row 2 shifted

    ext v20.8b, v6.8b , v7.8b , #1      //v20 = row 6 shifted


    ld1 {v9.8b, v10.8b}, [x1], x2       //v9,v10 = ref row 3

    ld1 {v13.8b, v14.8b}, [x14], x2     //v13,v14 = ref row 7

    ld1 {v17.8b, v18.8b}, [x1], x2      //v17,v18 = ref row 4

    ld1 {v21.8b, v22.8b}, [x14], x2     //v21,v22 = ref row 8


    //NOTE(review): the shifted rows 3/7/4/8 overwrite v1/v3/v5/v7 (the high
    //halves of rows 1/5/2/6), which are no longer needed at this point.
    ext v1.8b, v9.8b , v10.8b , #1      //v1 = row 3 shifted

    ext v3.8b, v13.8b , v14.8b , #1     //v3 = row 7 shifted



    ext v5.8b, v17.8b , v18.8b , #1     //v5 = row 4 shifted

    ext v7.8b, v21.8b , v22.8b , #1     //v7 = row 8 shifted


    //.16b ops on .8b data: upper halves are garbage but only .8b is stored
    urhadd v0.16b, v0.16b , v8.16b      //v0 = out row 1 = avg(row 1, row 1 shifted)
    urhadd v1.16b, v1.16b , v9.16b      //v1 = out row 3 = avg(row 3 shifted, row 3)

    urhadd v2.16b, v2.16b , v12.16b     //v2 = out row 5 = avg(row 5, row 5 shifted)
    urhadd v3.16b, v3.16b , v13.16b     //v3 = out row 7 = avg(row 7 shifted, row 7)


    urhadd v4.16b, v4.16b , v16.16b     //v4 = out row 2 = avg(row 2, row 2 shifted)
    urhadd v5.16b, v5.16b , v17.16b     //v5 = out row 4 = avg(row 4 shifted, row 4)


    urhadd v6.16b, v6.16b , v20.16b     //v6 = out row 6 = avg(row 6, row 6 shifted)
    urhadd v7.16b, v7.16b , v21.16b     //v7 = out row 8 = avg(row 8 shifted, row 8)

    st1 {v0.8b}, [x0], x3               //store out row 1

    st1 {v2.8b}, [x12], x3              //store out row 5

    st1 {v4.8b}, [x0], x3               //store out row 2

    st1 {v6.8b}, [x12], x3              //store out row 6

    st1 {v1.8b}, [x0], x3               //store out row 3

    st1 {v3.8b}, [x12], x3              //store out row 7

    st1 {v5.8b}, [x0], x3               //store out row 4

    st1 {v7.8b}, [x12], x3              //store out row 8



    // LDMFD sp!,{x12,pc}
    pop_v_regs
    ret








///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_halfx_halfy_8x8_av8()
////
//// Detail Description : This function pastes the reference block in the
////                      current frame buffer. This function is called for
////                      blocks that are not coded and have motion vectors
////                      with a half pel resolution and VopRoundingType is 0.
////
//// Inputs             : x0 - out     : current (destination) block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference block stride (bytes)
////                      x3 - out_wid : current block stride (bytes)
////
//// Registers Used     : x14, v0-v18, v22, v24, v26, v28, v30
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated 8x8 block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/


.global impeg2_mc_halfx_halfy_8x8_av8

impeg2_mc_halfx_halfy_8x8_av8:

    // STMFD sp!,{x12,x14}
    push_v_regs

///* out(r,c) = (ref(r,c) + ref(r,c+1) + ref(r+1,c) + ref(r+1,c+1) + 2) >> 2 */
///* Needs 9 reference rows of 9 pixels each: x1 loads rows 1..4,            */
///* x14 (= ref + 4*stride) loads rows 5..9.                                 */
    add x14, x1, x2, lsl #2

    ld1 {v0.8b, v1.8b}, [x1], x2        //v0,v1 = ref row 1 (16 pixels)

    ld1 {v2.8b, v3.8b}, [x14], x2       //v2,v3 = ref row 5

    ld1 {v4.8b, v5.8b}, [x1], x2        //v4,v5 = ref row 2

    ld1 {v6.8b, v7.8b}, [x14], x2       //v6,v7 = ref row 6

    ext v1.8b, v0.8b , v1.8b , #1       //v1 = row 1 shifted right by one pixel



    ext v3.8b, v2.8b , v3.8b , #1       //v3 = row 5 shifted



    ext v5.8b, v4.8b , v5.8b , #1       //v5 = row 2 shifted

    ext v7.8b, v6.8b , v7.8b , #1       //v7 = row 6 shifted




    ld1 {v8.8b, v9.8b}, [x1], x2        //v8,v9 = ref row 3



    ld1 {v10.8b, v11.8b}, [x14], x2     //v10,v11 = ref row 7

    ld1 {v12.8b, v13.8b}, [x1], x2      //v12,v13 = ref row 4

    ld1 {v14.8b, v15.8b}, [x14], x2     //v14,v15 = ref row 8

    ext v9.8b, v8.8b , v9.8b , #1       //v9 = row 3 shifted

    ld1 {v16.8b, v17.8b}, [x14], x2     //v16,v17 = ref row 9





    ext v11.8b, v10.8b , v11.8b , #1    //v11 = row 7 shifted



    ext v13.8b, v12.8b , v13.8b , #1    //v13 = row 4 shifted



    ext v15.8b, v14.8b , v15.8b , #1    //v15 = row 8 shifted

    ext v17.8b, v16.8b , v17.8b , #1    //v17 = row 9 shifted


    //Horizontal interpolation: widen to 16 bits, sum pixel + right neighbour

    uaddl v0.8h, v0.8b, v1.8b           //row 1 horizontal sums

    uaddl v2.8h, v2.8b, v3.8b           //row 5 horizontal sums

    uaddl v4.8h, v4.8b, v5.8b           //row 2 horizontal sums

    uaddl v6.8h, v6.8b, v7.8b           //row 6 horizontal sums

    uaddl v8.8h, v8.8b, v9.8b           //row 3 horizontal sums

    uaddl v10.8h, v10.8b, v11.8b        //row 7 horizontal sums

    uaddl v12.8h, v12.8b, v13.8b        //row 4 horizontal sums

    uaddl v14.8h, v14.8b, v15.8b        //row 8 horizontal sums

    uaddl v16.8h, v16.8b, v17.8b        //row 9 horizontal sums

    //Vertical interpolation: sum adjacent rows, then (sum + 2) >> 2 via rshrn

    add x14, x0, x3, lsl #2             //x14 = out + 4*stride: stores rows 5..8



    add v18.8h, v0.8h , v4.8h           //out row 1 = rows 1+2

    add v26.8h, v2.8h , v6.8h           //out row 5 = rows 5+6

    add v20.8h, v4.8h , v8.8h           //out row 2 = rows 2+3

    add v28.8h, v6.8h , v10.8h          //out row 6 = rows 6+7

    rshrn v18.8b, v18.8h, #2            //narrow out row 1 with rounding

    rshrn v26.8b, v26.8h, #2            //narrow out row 5

    rshrn v20.8b, v20.8h, #2            //narrow out row 2

    rshrn v28.8b, v28.8h, #2            //narrow out row 6

    add v22.8h, v8.8h , v12.8h          //out row 3 = rows 3+4

    st1 {v18.8b}, [x0], x3              //store out row 1

    add v30.8h, v10.8h , v14.8h         //out row 7 = rows 7+8

    st1 {v26.8b}, [x14], x3             //store out row 5

    add v24.8h, v12.8h , v2.8h          //out row 4 = rows 4+5

    st1 {v20.8b}, [x0], x3              //store out row 2

    add v14.8h, v14.8h , v16.8h         //out row 8 = rows 8+9

    st1 {v28.8b}, [x14], x3             //store out row 6



    rshrn v22.8b, v22.8h, #2            //narrow out row 3

    rshrn v30.8b, v30.8h, #2            //narrow out row 7

    rshrn v24.8b, v24.8h, #2            //narrow out row 4

    rshrn v14.8b, v14.8h, #2            //narrow out row 8


    st1 {v22.8b}, [x0], x3              //store out row 3
    st1 {v30.8b}, [x14], x3             //store out row 7
    st1 {v24.8b}, [x0], x3              //store out row 4
    st1 {v14.8b}, [x14], x3             //store out row 8



    // LDMFD sp!,{x12,pc}
    pop_v_regs
    ret




///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_fullx_fully_8x8_av8()
////
//// Detail Description : This function pastes the reference block in the
////                      current frame buffer. This function is called for
////                      blocks that are not coded and have motion vectors
////                      with full pel resolution.
////
//// Inputs             : x0 - out     : current (destination) block pointer
////                      x1 - ref     : reference block pointer
////                      x2 - ref_wid : reference block stride (bytes)
////                      x3 - out_wid : current block stride (bytes)
////
//// Registers Used     : x12, x14, v0-v3
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated 8x8 block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/


.global impeg2_mc_fullx_fully_8x8_av8
impeg2_mc_fullx_fully_8x8_av8:


    // STMFD sp!,{x12,x14}
    push_v_regs

///* Plain 8x8 copy: x1/x0 handle rows 1..4, x14/x12 handle rows 5..8 */
    add x14, x1, x2, lsl #2             //x14 = ref + 4*stride

    add x12, x0, x3, lsl #2             //x12 = out + 4*stride


    ld1 {v0.8b}, [x1], x2               //load row 1

    ld1 {v1.8b}, [x14], x2              //load row 5

    ld1 {v2.8b}, [x1], x2               //load row 2

    ld1 {v3.8b}, [x14], x2              //load row 6


    st1 {v0.8b}, [x0], x3               //store row 1

    st1 {v1.8b}, [x12], x3              //store row 5

    st1 {v2.8b}, [x0], x3               //store row 2

    st1 {v3.8b}, [x12], x3              //store row 6


    ld1 {v0.8b}, [x1], x2               //load row 3

    ld1 {v1.8b}, [x14], x2              //load row 7

    ld1 {v2.8b}, [x1], x2               //load row 4

    ld1 {v3.8b}, [x14], x2              //load row 8


    st1 {v0.8b}, [x0], x3               //store row 3

    st1 {v1.8b}, [x12], x3              //store row 7

    st1 {v2.8b}, [x0], x3               //store row 4

    st1 {v3.8b}, [x12], x3              //store row 8


    // LDMFD sp!,{x12,pc}
    pop_v_regs
    ret




///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_interpolate_av8()
////
//// Detail Description : Interpolates two buffers and adds pred
////
//// Inputs             : x0 - pointer to src1
////                      x1 - pointer to src2
////                      x2 - dest buf
////                      x3 - dst stride
//// Registers Used     : x12, v0-v15
////
//// Stack Usage        : 64 bytes
////
//// Outputs            : The motion compensated block
////
//// Return Data        : None
////
//// Programming Note   : <program limitation>
////-----------------------------------------------------------------------------
//*/


.global impeg2_interpolate_av8
impeg2_interpolate_av8:

    push_v_regs

    ldr x4, [x0, #0]                        // x4 = src1 luma pointer
    ldr x5, [x1, #0]                        // x5 = src2 luma pointer
    ldr x7, [x2, #0]                        // x7 = dst luma pointer

    mov x12, #4                             // 4 iterations x 4 rows = 16 luma rows


interp_lumablocks_stride:
    ld1 {v0.16b}, [x4], #16                 // src1 row1 (16 contiguous pixels)
    ld1 {v2.16b}, [x4], #16                 // src1 row2
    ld1 {v4.16b}, [x4], #16                 // src1 row3
    ld1 {v6.16b}, [x4], #16                 // src1 row4

    ld1 {v8.16b}, [x5], #16                 // src2 row1
    ld1 {v10.16b}, [x5], #16                // src2 row2
    ld1 {v12.16b}, [x5], #16                // src2 row3
    ld1 {v14.16b}, [x5], #16                // src2 row4

    urhadd v0.16b, v0.16b , v8.16b          // row1: (src1 + src2 + 1) >> 1
    urhadd v2.16b, v2.16b , v10.16b         // row2
    urhadd v4.16b, v4.16b , v12.16b         // row3
    urhadd v6.16b, v6.16b , v14.16b         // row4

    st1 {v0.16b}, [x7], x3                  // store row1 (strided)
    st1 {v2.16b}, [x7], x3                  // store row2
    st1 {v4.16b}, [x7], x3                  // store row3
    st1 {v6.16b}, [x7], x3                  // store row4

    subs x12, x12, #1
    bne interp_lumablocks_stride


    lsr x3, x3, #1                          // chroma stride = luma stride >> 1

    ldr x4, [x0, #8]                        // x4 = src1 U-plane pointer
    ldr x5, [x1, #8]                        // x5 = src2 U-plane pointer
    ldr x7 , [x2, #8]                       // x7 = dst U-plane pointer

    mov x12, #2                             // first pass U, second pass V


// chroma blocks: 8x8 plane, two 8-byte rows per load

interp_chromablocks_stride:
    ld1 {v0.8b, v1.8b}, [x4], #16           // src1 rows 1 & 2
    ld1 {v2.8b, v3.8b}, [x4], #16           // src1 rows 3 & 4
    ld1 {v4.8b, v5.8b}, [x4], #16           // src1 rows 5 & 6
    ld1 {v6.8b, v7.8b}, [x4], #16           // src1 rows 7 & 8

    ld1 {v8.8b, v9.8b}, [x5], #16           // src2 rows 1 & 2
    ld1 {v10.8b, v11.8b}, [x5], #16         // src2 rows 3 & 4
    ld1 {v12.8b, v13.8b}, [x5], #16         // src2 rows 5 & 6
    ld1 {v14.8b, v15.8b}, [x5], #16         // src2 rows 7 & 8

    // the .16b form also averages the upper register halves, but only the
    // low 8 bytes of each register are stored below
    urhadd v0.16b, v0.16b , v8.16b          // rows 1 & 2
    urhadd v1.16b, v1.16b , v9.16b
    urhadd v2.16b, v2.16b , v10.16b         // rows 3 & 4
    urhadd v3.16b, v3.16b , v11.16b
    urhadd v4.16b, v4.16b , v12.16b         // rows 5 & 6
    urhadd v5.16b, v5.16b , v13.16b
    urhadd v6.16b, v6.16b , v14.16b         // rows 7 & 8
    urhadd v7.16b, v7.16b , v15.16b

    st1 {v0.8b}, [x7], x3                   // row1
    st1 {v1.8b}, [x7], x3                   // row2
    st1 {v2.8b}, [x7], x3                   // row3
    st1 {v3.8b}, [x7], x3                   // row4
    st1 {v4.8b}, [x7], x3                   // row5
    st1 {v5.8b}, [x7], x3                   // row6
    st1 {v6.8b}, [x7], x3                   // row7
    st1 {v7.8b}, [x7], x3                   // row8

    // advance the plane pointers to V for the second pass (the extra
    // reload after the V pass is harmless - the loop exits)
    ldr x4, [x0, #16]                       // src1 V-plane pointer
    ldr x5, [x1, #16]                       // src2 V-plane pointer
    ldr x7, [x2, #16]                       // dst V-plane pointer

    subs x12, x12, #1
    bne interp_chromablocks_stride

    pop_v_regs
    ret




diff --git a/common/armv8/impeg2_mem_func.s b/common/armv8/impeg2_mem_func.s
new file mode 100644
index 0000000..f0bb590
--- /dev/null
+++ b/common/armv8/impeg2_mem_func.s
@@ -0,0 +1,181 @@
//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore
//*/

///*
////----------------------------------------------------------------------------
//// File Name          : impeg2_mem_func.s
////
//// Description        : This file has memory initialisation (memset)
////                      routines for the MPEG2 decoder on the armv8
////                      (Neon) platform
////
//// Reference Document :
////
//// Revision History   :
////        Date            Author                 Detail Description
////   ------------    ----------------    ----------------------------------
////   18 jun 2010     S Hamsalekha        Created
////
////-------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Include Files
//// ----------------------------------------------------------------------------
//*/
// PRESERVE8
.text
.include "impeg2_neon_macros.s"
///*
//// ----------------------------------------------------------------------------
//// Struct/Union Types and Define
//// ----------------------------------------------------------------------------
//*/


///*
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------


///*
//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/

///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_memset_8bit_8x8_block_av8()
+//// +//// Detail Description : This routine intialises the Block matrix buffer contents to a +//// particular Value. This function also assumes the buffer size +//// to be set is 64 Bytes fixed. It also assumes that blk matrix +//// used is 64 bit aligned. +//// +//// Inputs : pi2_blk_mat : Block Pointer +//// u2_val : Value with which the block is initialized +//// +//// Registers Used : v0 +//// +//// Stack Usage : 64 bytes +//// +//// Outputs : Block Matrix Iniliazed to given value +//// +//// Return Data : None +//// +//// Programming Note : This implementation assumes that blk matrix buffer +//// is 128 bit aligned +////----------------------------------------------------------------------------- +//*/ +.global impeg2_memset_8bit_8x8_block_av8 +impeg2_memset_8bit_8x8_block_av8: + push_v_regs + +// ADD x3,x0,#WIDTH_X_SIZE @//x3 is another copy address offsetted + + dup v0.8b, w1 ////x1 is the 8-bit value to be set into + + st1 {v0.8b}, [x0], x2 ////Store the row 1 + st1 {v0.8b}, [x0], x2 ////Store the row 2 + st1 {v0.8b}, [x0], x2 ////Store the row 3 + st1 {v0.8b}, [x0], x2 ////Store the row 4 + st1 {v0.8b}, [x0], x2 ////Store the row 5 + st1 {v0.8b}, [x0], x2 ////Store the row 6 + st1 {v0.8b}, [x0], x2 ////Store the row 7 + st1 {v0.8b}, [x0], x2 ////Store the row 8 + + pop_v_regs + ret + + + + + + +///* +////--------------------------------------------------------------------------- +//// Function Name : impeg2_memset0_16bit_8x8_linear_block_av8() +//// +//// Detail Description : memsets resudual buf to 0 +//// +//// Inputs : x0 - pointer to y +//// x1 - pointer to u +//// x2 - pointer to v +//// Registers Used : v0 + +//// +//// Stack Usage : 64 bytes +//// +//// Outputs : The Motion Compensated Block +//// +//// Return Data : None +//// +//// Programming Note : <program limitation> +////----------------------------------------------------------------------------- +//*/ + + + +.global impeg2_memset0_16bit_8x8_linear_block_av8 + + 
+impeg2_memset0_16bit_8x8_linear_block_av8: + + push_v_regs + + movi v0.8h, #0 + + //Y data + + st1 {v0.8h} , [x0], #16 //row1 + + st1 {v0.8h} , [x0], #16 //row2 + + st1 {v0.8h} , [x0], #16 //row3 + + st1 {v0.8h} , [x0], #16 //row4 + + st1 {v0.8h} , [x0], #16 //row5 + + st1 {v0.8h} , [x0], #16 //row6 + + st1 {v0.8h} , [x0], #16 //row7 + + st1 {v0.8h} , [x0], #16 //row8 + + + + pop_v_regs + ret + + + + diff --git a/common/armv8/impeg2_neon_macros.s b/common/armv8/impeg2_neon_macros.s new file mode 100644 index 0000000..452ba45 --- /dev/null +++ b/common/armv8/impeg2_neon_macros.s @@ -0,0 +1,58 @@ +//****************************************************************************** +//* +//* Copyright (C) 2015 The Android Open Source Project +//* +//* Licensed under the Apache License, Version 2.0 (the "License"); +//* you may not use this file except in compliance with the License. +//* You may obtain a copy of the License at: +//* +//* http://www.apache.org/licenses/LICENSE-2.0 +//* +//* Unless required by applicable law or agreed to in writing, software +//* distributed under the License is distributed on an "AS IS" BASIS, +//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +//* See the License for the specific language governing permissions and +//* limitations under the License. +//* +//***************************************************************************** +//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +//*/ +///** +//******************************************************************************* +//* @file +//* impeg2_neon_macros.s +//* +//* @brief +//* Contains assembly macros +//* +//* @author +//* Naveen SR +//* +//* @par List of Functions: +//* +//* +//* @remarks +//* None +//* +//******************************************************************************* + + +.macro push_v_regs + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! 
.endm
// Restores d8-d15 in the reverse order of push_v_regs.
.macro pop_v_regs
    ldp d14, d15, [sp], #16
    ldp d12, d13, [sp], #16
    ldp d10, d11, [sp], #16
    ldp d8, d9, [sp], #16
.endm

// Swap two general-purpose registers in place using the XOR trick.
// NOTE(review): if \reg1 and \reg2 name the same register, both end up 0 —
// callers must pass distinct registers.
.macro swp reg1, reg2
    eor \reg1, \reg1, \reg2
    eor \reg2, \reg1, \reg2
    eor \reg1, \reg1, \reg2
.endm

diff --git a/common/armv8/impeg2_platform_macros.h b/common/armv8/impeg2_platform_macros.h
new file mode 100644
index 0000000..ff31034
--- /dev/null
+++ b/common/armv8/impeg2_platform_macros.h
@@ -0,0 +1,49 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
#ifndef __IMPEG2_PLATFORM_MACROS_H__
#define __IMPEG2_PLATFORM_MACROS_H__

/* Reverse the byte order of a 32-bit word (little <-> big endian). */
#define CONV_LE_TO_BE(u4_temp2,u4_temp1) u4_temp2 = \
                    (u4_temp1 << 24) |               \
                    ((u4_temp1 & 0xff00) << 8) |     \
                    ((u4_temp1 & 0xff0000) >> 8) |   \
                    (u4_temp1 >> 24);

/* Count leading zeros; defined as 32 for an input of 0
   (__builtin_clz(0) alone would be undefined). */
static __inline UWORD32 CLZ(UWORD32 u4_word)
{
    if(u4_word)
        return (__builtin_clz(u4_word));
    else
        return 32;
}

/* NOTE(review): these CLIP macros evaluate x multiple times and the
   ternary is not wrapped in outer parentheses — use them only in simple
   assignments (e.g. y = CLIP_U8(x);), never inside larger expressions. */
#define CLIP_U8(x) ((x) > 255) ? (255) : (((x) < 0) ? (0) : (x))
#define CLIP_S8(x) ((x) > 127) ? (127) : (((x) < -128) ? (-128) : (x))

#define CLIP_U12(x) ((x) > 4095) ? (4095) : (((x) < 0) ? (0) : (x))
#define CLIP_S12(x) ((x) > 2047) ? (2047) : (((x) < -2048) ? (-2048) : (x))

#define CLIP_U16(x) ((x) > 65535) ? (65535) : (((x) < 0) ? (0) : (x))
/* NOTE(review): despite the name, this clips to [-65536, 65535], not the
   signed 16-bit range [-32768, 32767] — confirm intended range before use. */
#define CLIP_S16(x) ((x) > 65535) ? (65535) : (((x) < -65536) ? (-65536) : (x))

#define INLINE
#define PLD(x) __pld(x)

#endif /* __IMPEG2_PLATFORM_MACROS_H__ */
diff --git a/common/impeg2_buf_mgr.c b/common/impeg2_buf_mgr.c
new file mode 100644
index 0000000..c4aca4a
--- /dev/null
+++ b/common/impeg2_buf_mgr.c
@@ -0,0 +1,411 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore +*/ + +/** +******************************************************************************* +* @file +* impeg2_buf_mgr.c +* +* @brief +* Contains function definitions for buffer management +* +* @author +* Srinivas T +* +* @par List of Functions: +* - impeg2_buf_mgr_init() +* - impeg2_buf_mgr_add() +* - impeg2_buf_mgr_get_next_free() +* - impeg2_buf_mgr_check_free() +* - impeg2_buf_mgr_release() +* - impeg2_buf_mgr_set_status() +* - impeg2_buf_mgr_get_status() +* - impeg2_buf_mgr_get_buf() +* - impeg2_buf_mgr_get_num_active_buf() +* +* @remarks +* None +* +******************************************************************************* +*/ +#include <stdio.h> +#include <stdlib.h> +#include "iv_datatypedef.h" +#include "impeg2_defs.h" +#include "impeg2_buf_mgr.h" + + + +/** +******************************************************************************* +* +* @brief +* Buffer manager initialization function. +* +* @par Description: +* Initializes the buffer manager structure +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @returns +* +* @remarks +* None +* +******************************************************************************* +*/ + +void impeg2_buf_mgr_init( + buf_mgr_t *ps_buf_mgr) +{ + WORD32 id; + + ps_buf_mgr->u4_max_buf_cnt = BUF_MGR_MAX_CNT; + ps_buf_mgr->u4_active_buf_cnt = 0; + + for(id = 0; id < BUF_MGR_MAX_CNT; id++) + { + ps_buf_mgr->au4_status[id] = 0; + ps_buf_mgr->apv_ptr[id] = NULL; + } +} + + +/** +******************************************************************************* +* +* @brief +* Adds and increments the buffer and buffer count. 
+* +* @par Description: +* Adds a buffer to the buffer manager if it is not already present and +* increments the active buffer count +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] pv_ptr +* Pointer to the buffer to be added +* +* @returns Returns 0 on success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_buf_mgr_add( + buf_mgr_t *ps_buf_mgr, + void *pv_ptr, + WORD32 i4_buf_id) +{ + + /* Check if buffer ID is within allowed range */ + if(i4_buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt) + { + return (-1); + } + + /* Check if the current ID is being used to hold some other buffer */ + if((ps_buf_mgr->apv_ptr[i4_buf_id] != NULL) && + (ps_buf_mgr->apv_ptr[i4_buf_id] != pv_ptr)) + { + return (-1); + } + ps_buf_mgr->apv_ptr[i4_buf_id] = pv_ptr; + + return 0; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the next free buffer. 
+* +* @par Description: +* Returns the next free buffer available and sets the corresponding status +* to DEC +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] pi4_buf_id +* Pointer to the id of the free buffer +* +* @returns Pointer to the free buffer +* +* @remarks +* None +* +******************************************************************************* +*/ +void* impeg2_buf_mgr_get_next_free( + buf_mgr_t *ps_buf_mgr, + WORD32 *pi4_buf_id) +{ + WORD32 id; + void *pv_ret_ptr; + + pv_ret_ptr = NULL; + for(id = 0; id < (WORD32)ps_buf_mgr->u4_max_buf_cnt; id++) + { + /* Check if the buffer is non-null and status is zero */ + if((ps_buf_mgr->au4_status[id] == 0) && (ps_buf_mgr->apv_ptr[id])) + { + *pi4_buf_id = id; + /* DEC is set to 1 */ + ps_buf_mgr->au4_status[id] = 1; + pv_ret_ptr = ps_buf_mgr->apv_ptr[id]; + break; + } + } + + return pv_ret_ptr; +} + + +/** +******************************************************************************* +* +* @brief +* Checks the buffer manager for free buffers available. +* +* @par Description: +* Checks if there are any free buffers available +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @returns Returns 0 if available, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_buf_mgr_check_free( + buf_mgr_t *ps_buf_mgr) +{ + UWORD32 id; + + for(id = 0; id < ps_buf_mgr->u4_max_buf_cnt; id++) + { + if((ps_buf_mgr->au4_status[id] == 0) && + (ps_buf_mgr->apv_ptr[id])) + { + return 1; + } + } + + return 0; + +} + + +/** +******************************************************************************* +* +* @brief +* Resets the status bits. 
+* +* @par Description: +* resets the status bits that the mask contains (status corresponding to +* the id) +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer status to be released +* +* @param[in] mask +* Contains the bits that are to be reset +* +* @returns 0 if success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_buf_mgr_release( + buf_mgr_t *ps_buf_mgr, + WORD32 i4_buf_id, + UWORD32 u4_mask) +{ + /* If the given id is pointing to an id which is not yet added */ + if(i4_buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt) + { + return (-1); + } + + if(0 == (ps_buf_mgr->au4_status[i4_buf_id] & u4_mask)) + { + return (-1); + } + + ps_buf_mgr->au4_status[i4_buf_id] &= ~u4_mask; + + /* If both the REF and DISP are zero, DEC is set to zero */ + if(ps_buf_mgr->au4_status[i4_buf_id] == 1) + { + ps_buf_mgr->au4_status[i4_buf_id] = 0; + } + + return 0; +} + + +/** +******************************************************************************* +* +* @brief +* Sets the status bit. 
+* +* @par Description: +* sets the status bits that the mask contains (status corresponding to the +* id) +* +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer whose status needs to be modified +* +* +* @param[in] mask +* Contains the bits that are to be set +* +* @returns 0 if success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_buf_mgr_set_status( + buf_mgr_t *ps_buf_mgr, + WORD32 i4_buf_id, + UWORD32 u4_mask) +{ + if(i4_buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt) + { + return (-1); + } + + + if((ps_buf_mgr->au4_status[i4_buf_id] & u4_mask) != 0) + { + return (-1); + } + + ps_buf_mgr->au4_status[i4_buf_id] |= u4_mask; + return 0; +} + + +/** +******************************************************************************* +* +* @brief +* Returns the status of the buffer. +* +* @par Description: +* Returns the status of the buffer corresponding to the id +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer status required +* +* @returns Status of the buffer corresponding to the id +* +* @remarks +* None +* +******************************************************************************* +*/ +UWORD32 impeg2_buf_mgr_get_status( + buf_mgr_t *ps_buf_mgr, + WORD32 i4_buf_id) +{ + return ps_buf_mgr->au4_status[i4_buf_id]; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the buffer from the buffer manager +* +* @par Description: +* Returns the pointer to the buffer corresponding to the id +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @param[in] buf_id +* ID of the buffer required +* +* @returns Pointer to the buffer required +* +* @remarks +* None +* +******************************************************************************* +*/ +void* impeg2_buf_mgr_get_buf( + buf_mgr_t *ps_buf_mgr, 
+ WORD32 i4_buf_id) +{ + return ps_buf_mgr->apv_ptr[i4_buf_id]; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the no.of active buffer +* +* @par Description: +* Return the number of active buffers in the buffer manager +* +* @param[in] ps_buf_mgr +* Pointer to the buffer manager +* +* @returns number of active buffers +* +* @remarks +* None +* +******************************************************************************* +*/ +UWORD32 impeg2_buf_mgr_get_num_active_buf( + buf_mgr_t *ps_buf_mgr) +{ + return ps_buf_mgr->u4_max_buf_cnt; +} diff --git a/common/impeg2_buf_mgr.h b/common/impeg2_buf_mgr.h new file mode 100644 index 0000000..6b1cbef --- /dev/null +++ b/common/impeg2_buf_mgr.h @@ -0,0 +1,115 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  impeg2_buf_mgr.h
*
* @brief
*  Function declarations used for buffer management
*
* @author
*  Srinivas T
*
* @remarks
*  None
*
*******************************************************************************
*/
#ifndef _IMPEG2_BUF_MGR_H_
#define _IMPEG2_BUF_MGR_H_

#define BUF_MGR_MAX_CNT 64

/* Status bits kept per buffer slot */
#define BUF_MGR_DEC 1
#define BUF_MGR_REF (1 << 1)
#define BUF_MGR_DISP (1 << 2)

typedef struct
{
    /**
     * max_buf_cnt - number of slots managed (set to BUF_MGR_MAX_CNT)
     */
    UWORD32 u4_max_buf_cnt;

    /**
     * active_buf_cnt
     * NOTE(review): initialised to 0 but not updated by the .c file's
     * add/release paths — do not rely on it.
     */
    UWORD32 u4_active_buf_cnt;
    /**
     * au4_status[BUF_MGR_MAX_CNT] - per-slot status word
     */
    UWORD32 au4_status[BUF_MGR_MAX_CNT];
    /* The last three bits of status are:  */
    /* Bit 0 - DEC  */
    /* Bit 1 - REF  */
    /* Bit 2 - DISP */

    /* Per-slot buffer pointers; NULL means the slot is unused */
    void *apv_ptr[BUF_MGR_MAX_CNT];
}buf_mgr_t;

// initializes the buffer API structure
void impeg2_buf_mgr_init(
                buf_mgr_t *ps_buf_mgr);

// Add buffer to buffer manager. 0: success, -1: fail (buf_id out of range,
// or the slot already holds a different buffer)
WORD32 impeg2_buf_mgr_add(
                buf_mgr_t *ps_buf_mgr,
                void *pv_ptr,
                WORD32 buf_id);

// returns the first registered free buffer and sets its status to DEC
void* impeg2_buf_mgr_get_next_free(
                buf_mgr_t *ps_buf_mgr,
                WORD32 *pi4_id);

// checks if there are any free buffers: 1 if available, 0 otherwise
WORD32 impeg2_buf_mgr_check_free(
                buf_mgr_t *ps_buf_mgr);

// mask holds who released it: DISP:REF:DEC
WORD32 impeg2_buf_mgr_release(
                buf_mgr_t *ps_buf_mgr,
                WORD32 id,
                UWORD32 mask);

// sets the status to one or all of DISP:REF:DEC
WORD32 impeg2_buf_mgr_set_status(
                buf_mgr_t *ps_buf_mgr,
                WORD32 id,
                UWORD32 mask);

// Gets status of the buffer
UWORD32 impeg2_buf_mgr_get_status(
                buf_mgr_t *ps_buf_mgr,
                WORD32 id);

// pass the ID - buffer will be returned
void* impeg2_buf_mgr_get_buf(
                buf_mgr_t *ps_buf_mgr,
                WORD32 id);

// returns u4_max_buf_cnt (see the .c implementation)
UWORD32 impeg2_buf_mgr_get_num_active_buf(
                buf_mgr_t *ps_buf_mgr);



#endif //_IMPEG2_BUF_MGR_H_
diff --git a/common/impeg2_defs.h b/common/impeg2_defs.h
new file mode 100644
index 0000000..f1523f2
--- /dev/null
+++ b/common/impeg2_defs.h
@@ -0,0 +1,331 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +#ifndef __IMPEG2_DEFS_H__ +#define __IMPEG2_DEFS_H__ + +#include <assert.h> + +/* Decoder needs at least 4 reference buffers in order to support format conversion in a thread and +to support B pictures. Because of format conversion in a thread, codec delay is now 2 frames instead of 1. +To reduce this delay, format conversion has to wait for MB status before converting for B pictures. +To avoid this check the delay is increased to 2 and hence number of reference frames minimum is 4 */ +#define NUM_INT_FRAME_BUFFERS 4 + + +#define MAX_WIDTH 4096 +#define MAX_HEIGHT 2160 + +#define MIN_WIDTH 16 +#define MIN_HEIGHT 16 + + +#define MAX_FRM_SIZE (MAX_WIDTH * MAX_HEIGHT * 2) /* Supports only 420P and 422ILE */ + +#define DEC_ORDER 0 + +#define MAX_BITSTREAM_BUFFER_SIZE 2000 * 1024 + + +/****************************************************************************** +* MPEG2 Start code and other code definitions +*******************************************************************************/ +#define START_CODE_PREFIX 0x000001 +#define SEQUENCE_HEADER_CODE 0x000001B3 +#define EXTENSION_START_CODE 0x000001B5 +#define USER_DATA_START_CODE 0x000001B2 +#define GOP_START_CODE 0x000001B8 +#define PICTURE_START_CODE 0x00000100 +#define SEQUENCE_END_CODE 0x000001B7 +#define RESERVED_START_CODE 0x000001B0 +#define MB_ESCAPE_CODE 0x008 + +/****************************************************************************** +* MPEG2 Length of various codes definitions +*******************************************************************************/ +#define START_CODE_LEN 32 +#define START_CODE_PREFIX_LEN 24 +#define MB_ESCAPE_CODE_LEN 11 +#define EXT_ID_LEN 4 +#define MB_QUANT_SCALE_CODE_LEN 5 +#define MB_DCT_TYPE_LEN 1 +#define MB_MOTION_TYPE_LEN 2 +#define BYTE_LEN 8 + 
/******************************************************************************
* MPEG1 code definitions
*******************************************************************************/
#define MB_STUFFING_CODE          0x00F

/******************************************************************************
* MPEG1 Length of various codes definitions (in bits)
*******************************************************************************/
#define MB_STUFFING_CODE_LEN      11

/******************************************************************************
* MPEG2 MB definitions
*******************************************************************************/
#define MPEG2_INTRA_MB            0x04
#define MPEG2_INTRAQ_MB           0x44
#define MPEG2_INTER_MB            0x28
#define MB_MOTION_BIDIRECT        0x30
#define MB_INTRA_OR_PATTERN       0x0C

/******************************************************************************
* Tools definitions
*******************************************************************************/
#define SPATIAL_SCALABILITY       0x01
#define TEMPORAL_SCALABILITY      0x03

/******************************************************************************
* Extension IDs definitions
*******************************************************************************/
#define SEQ_DISPLAY_EXT_ID             0x02
#define SEQ_SCALABLE_EXT_ID            0x05
#define QUANT_MATRIX_EXT_ID            0x03
#define COPYRIGHT_EXT_ID               0x04
#define PIC_DISPLAY_EXT_ID             0x07
#define PIC_SPATIAL_SCALABLE_EXT_ID    0x09
#define PIC_TEMPORAL_SCALABLE_EXT_ID   0x0A
#define CAMERA_PARAM_EXT_ID            0x0B
#define ITU_T_EXT_ID                   0x0C
/******************************************************************************
* Extension IDs Length definitions (in bits)
*******************************************************************************/
#define CAMERA_PARAMETER_EXTENSION_LEN    377
#define COPYRIGHT_EXTENSION_LEN           88
#define GROUP_OF_PICTURE_LEN              59


/******************************************************************************
* MPEG2 Picture structure definitions
*******************************************************************************/
#define TOP_FIELD                 1
#define BOTTOM_FIELD              2
#define FRAME_PICTURE             3

/******************************************************************************
* MPEG2 Profile definitions
*******************************************************************************/
#define MPEG2_SIMPLE_PROFILE      0x05
#define MPEG2_MAIN_PROFILE        0x04

/******************************************************************************
* MPEG2 Level definitions
*******************************************************************************/
#define MPEG2_LOW_LEVEL           0x0a
#define MPEG2_MAIN_LEVEL          0x08

/******************************************************************************
* MPEG2 Prediction types
*******************************************************************************/
#define FIELD_PRED                0
#define FRAME_PRED                1
#define DUAL_PRED                 2
#define RESERVED                  -1
#define MC_16X8_PRED              3

/*****************************************************************************
* MPEG2 Motion vector format
******************************************************************************/
#define FIELD_MV                  0
#define FRAME_MV                  1

/******************************************************************************/
/* General Video related definitions                                          */
/******************************************************************************/

#define BLK_SIZE            8
#define NUM_COEFFS          ((BLK_SIZE)*(BLK_SIZE))
#define LUMA_BLK_SIZE       (2 * (BLK_SIZE))
#define CHROMA_BLK_SIZE     (BLK_SIZE)
#define BLOCKS_IN_MB        6
#define MB_SIZE             16
#define MB_CHROMA_SIZE      8
#define NUM_PELS_IN_BLOCK   64
#define NUM_LUMA_BLKS       4
#define NUM_CHROMA_BLKS     2
#define MAX_COLR_COMPS      3
#define Y_LUMA              0
#define U_CHROMA            1
#define V_CHROMA            2
#define MB_LUMA_MEM_SIZE    ((MB_SIZE) * (MB_SIZE))
#define MB_CHROMA_MEM_SIZE  ((MB_SIZE/2) * (MB_SIZE/2))

#define BITS_IN_INT         32
/******************************************************************************/
/* MPEG2 Motion compensation related definitions (MB + guard, in pixels)      */
/******************************************************************************/
#define REF_FRM_MB_WIDTH     18
#define REF_FRM_MB_HEIGHT    18
#define REF_FLD_MB_HEIGHT    10
#define REF_FLD_MB_WIDTH     18

/******************************************************************************/
/* Maximum number of bits per MB                                              */
/******************************************************************************/
#define I_MB_BIT_SIZE  90
#define P_MB_BIT_SIZE  90
#define B_MB_BIT_SIZE  150

/******************************************************************************/
/* Aspect ratio related definitions                                           */
/******************************************************************************/
#define MPG1_NTSC_4_3    0x8
#define MPG1_PAL_4_3     0xc
#define MPG1_NTSC_16_9   0x6
#define MPG1_PAL_16_9    0x3
#define MPG1_1_1         0x1

#define MPG2_4_3         0x2
#define MPG2_16_9        0x3
#define MPG2_1_1         0x1

/******************************************************************************/
/* Inverse Quantizer Output range                                             */
/******************************************************************************/
#define IQ_OUTPUT_MAX    2047
#define IQ_OUTPUT_MIN    -2048

/******************************************************************************/
/* IDCT Output range                                                          */
/******************************************************************************/
#define IDCT_OUTPUT_MAX  255
#define IDCT_OUTPUT_MIN  -256

/******************************************************************************/
/* Output pixel range                                                         */
/******************************************************************************/
#define PEL_VALUE_MAX    255
#define PEL_VALUE_MIN    0

/******************************************************************************/
/* inv scan types                                                             */
/******************************************************************************/
#define ZIG_ZAG_SCAN     0
#define
VERTICAL_SCAN 1 + +/******************************************************************************/ +/* Related VLD codes */ +/******************************************************************************/ +#define ESC_CODE_VALUE 0x0058 +#define EOB_CODE_VALUE 0x07d0 + +#define END_OF_BLOCK 0x01 +#define ESCAPE_CODE 0x06 + +#define END_OF_BLOCK_ZERO 0x01ff +#define END_OF_BLOCK_ONE 0x01ff + +/******************** Idct Specific ***************/ +#define TRANS_SIZE_8 8 +#define IDCT_STG1_SHIFT 12 +#define IDCT_STG2_SHIFT 16 + +#define IDCT_STG1_ROUND ((1 << IDCT_STG1_SHIFT) >> 1) +#define IDCT_STG2_ROUND ((1 << IDCT_STG2_SHIFT) >> 1) + + +/****************************************************************************** +* Sample Version Definitions +*******************************************************************************/ +#define SAMPLE_VERS_MAX_FRAMES_DECODE 999 + +#define MAX_FRAME_BUFFER 7 + +/* vop coding type */ +typedef enum +{ + I_PIC = 1, + P_PIC, + B_PIC, + D_PIC +} e_pic_type_t; + +typedef enum +{ + MPEG_2_VIDEO, + MPEG_1_VIDEO +} e_video_type_t; + +typedef enum +{ + FORW, + BACK, + BIDIRECT +} e_pred_direction_t; + +typedef enum +{ + TOP, + BOTTOM +} e_field_t; + +/* Motion vectors (first/second) */ +enum +{ + FIRST, + SECOND, + THIRD, + FOURTH +}; + +enum +{ + MV_X, + MV_Y +}; + +/* Enumeration defining the various kinds of interpolation possible in +motion compensation */ +typedef enum +{ + FULL_XFULL_Y, + FULL_XHALF_Y, + HALF_XFULL_Y, + HALF_XHALF_Y +} e_sample_type_t; +typedef enum +{ + /* Params of the reference buffer used as input to MC */ + /* frame prediction in P frame picture */ + MC_FRM_FW_OR_BK_1MV, + /* field prediction in P frame picture */ + MC_FRM_FW_OR_BK_2MV, + /* frame prediction in B frame picture */ + MC_FRM_FW_AND_BK_2MV, + /* field prediction in B frame picture */ + MC_FRM_FW_AND_BK_4MV, + /* dual prime prediction in P frame picture */ + MC_FRM_FW_DUAL_PRIME_1MV, + /* frame prediction in P field picture */ + 
MC_FLD_FW_OR_BK_1MV, + /* 16x8 prediction in P field picture */ + MC_FLD_FW_OR_BK_2MV, + /* field prediction in B field picture */ + MC_FLD_FW_AND_BK_2MV, + /* 16x8 prediction in B field picture */ + MC_FLD_FW_AND_BK_4MV, + /* dual prime prediction in P field picture */ + MC_FLD_FW_DUAL_PRIME_1MV, +} e_mb_type_t; + +#endif /* __IMPEG2_DEFS_H__ */ + diff --git a/common/impeg2_disp_mgr.c b/common/impeg2_disp_mgr.c new file mode 100644 index 0000000..f5ede84 --- /dev/null +++ b/common/impeg2_disp_mgr.c @@ -0,0 +1,172 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2_disp_mgr.c +* +* @brief +* Contains function definitions for display management +* +* @author +* Srinivas T +* +* @par List of Functions: +* - impeg2_disp_mgr_init() +* - impeg2_disp_mgr_add() +* - impeg2_disp_mgr_get() +* +* @remarks +* None +* +******************************************************************************* +*/ +#include <stdio.h> +#include <stdlib.h> +#include "iv_datatypedef.h" +#include "impeg2_defs.h" +#include "impeg2_disp_mgr.h" + +/** +******************************************************************************* +* +* @brief +* Initialization function for display buffer manager +* +* @par Description: +* Initializes the display buffer management structure +* +* @param[in] ps_disp_mgr +* Pointer to the display buffer management structure +* +* @returns none +* +* @remarks +* None +* +******************************************************************************* +*/ +void impeg2_disp_mgr_init( + disp_mgr_t *ps_disp_mgr) +{ + WORD32 id; + + + for(id = 0; id < DISP_MGR_MAX_CNT; id++) + { + ps_disp_mgr->apv_ptr[id] = NULL; + } + + ps_disp_mgr->i4_wr_idx = 0; + ps_disp_mgr->i4_rd_idx = 0; +} + + +/** +******************************************************************************* +* +* @brief +* Adds a buffer to the display manager +* +* @par Description: +* Adds a buffer to the display buffer manager +* +* @param[in] ps_disp_mgr +* Pointer to the diaplay buffer management structure +* +* @param[in] buf_id +* ID of the display buffer +* +* @param[in] abs_poc +* Absolute POC of the display buffer +* +* @param[in] pv_ptr +* Pointer to the display buffer +* +* @returns 0 if success, -1 otherwise +* +* @remarks +* None +* +******************************************************************************* +*/ +WORD32 impeg2_disp_mgr_add(disp_mgr_t *ps_disp_mgr, + void *pv_ptr, + WORD32 i4_buf_id) +{ + + + WORD32 id; + id = 
ps_disp_mgr->i4_wr_idx % DISP_MGR_MAX_CNT; + + ps_disp_mgr->apv_ptr[id] = pv_ptr; + ps_disp_mgr->ai4_buf_id[id] = i4_buf_id; + ps_disp_mgr->i4_wr_idx++; + + return 0; +} + + +/** +******************************************************************************* +* +* @brief +* Gets the next buffer +* +* @par Description: +* Gets the next display buffer +* +* @param[in] ps_disp_mgr +* Pointer to the display buffer structure +* +* @param[out] pi4_buf_id +* Pointer to hold buffer id of the display buffer being returned +* +* @returns Pointer to the next display buffer +* +* @remarks +* None +* +******************************************************************************* +*/ +void* impeg2_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id) +{ + WORD32 id; + + *pi4_buf_id = -1; + + if(ps_disp_mgr->i4_rd_idx < ps_disp_mgr->i4_wr_idx) + { + id = ps_disp_mgr->i4_rd_idx % DISP_MGR_MAX_CNT; + if(NULL == ps_disp_mgr->apv_ptr[id]) + { + return NULL; + } + + *pi4_buf_id = ps_disp_mgr->ai4_buf_id[id]; + + ps_disp_mgr->i4_rd_idx++; + + return ps_disp_mgr->apv_ptr[id]; + } + else + return NULL; + +} diff --git a/common/impeg2_disp_mgr.h b/common/impeg2_disp_mgr.h new file mode 100644 index 0000000..96b01b0 --- /dev/null +++ b/common/impeg2_disp_mgr.h @@ -0,0 +1,67 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2_disp_mgr.h +* +* @brief +* Function declarations used for display management +* +* @author +* Srinivas T +* +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef _IMPEG2_DISP_MGR_H_ +#define _IMPEG2_DISP_MGR_H_ + +#define DISP_MGR_MAX_CNT 64 +#define DEFAULT_POC 0x7FFFFFFF + +typedef struct +{ + /** + * apv_ptr[DISP_MGR_MAX_CNT] + */ + void *apv_ptr[DISP_MGR_MAX_CNT]; + + WORD32 ai4_buf_id[DISP_MGR_MAX_CNT]; + + WORD32 i4_wr_idx; + + WORD32 i4_rd_idx; +}disp_mgr_t; + +void impeg2_disp_mgr_init( + disp_mgr_t *ps_disp_mgr); + +WORD32 impeg2_disp_mgr_add( + disp_mgr_t *ps_disp_mgr, + void *pv_ptr, + WORD32 i4_buf_id); + +void* impeg2_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id); + +#endif //_IMPEG2_DISP_MGR_H_ diff --git a/common/impeg2_format_conv.c b/common/impeg2_format_conv.c new file mode 100644 index 0000000..ec0bcfb --- /dev/null +++ b/common/impeg2_format_conv.c @@ -0,0 +1,401 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/* File Name    : impeg2_format_conv.c                                       */
/*                                                                           */
/* Description  : Output format conversion routines: plane-wise YUV420P      */
/*                frame copy, YUV420P -> YUV422 interleaved, and             */
/*                YUV420P -> YUV420SP with VU or UV chroma interleaving.     */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <string.h>

/* User include files */
#include "iv_datatypedef.h"
#include "iv.h"
#include "ithread.h"

#include "iv_datatypedef.h"   /* NOTE(review): duplicate include; harmless, guarded */
#include "impeg2_macros.h"
#include "impeg2_buf_mgr.h"
#include "impeg2_disp_mgr.h"
#include "impeg2_defs.h"
#include "impeg2_platform_macros.h"

#include "impeg2_job_queue.h"
#include "impeg2_format_conv.h"


/*****************************************************************************/
/* impeg2_copy_frm_yuv420p                                                   */
/*                                                                           */
/* Copies a YUV420P frame plane by plane: the Y plane at full width/height,  */
/* the U and V planes at half width and half height, one memcpy per row.     */
/* All strides are in bytes; source and destination may have different       */
/* strides.                                                                  */
/*****************************************************************************/
void impeg2_copy_frm_yuv420p(UWORD8 *pu1_src_y,
                             UWORD8 *pu1_src_u,
                             UWORD8 *pu1_src_v,
                             UWORD8 *pu1_dst_y,
                             UWORD8 *pu1_dst_u,
                             UWORD8 *pu1_dst_v,
                             UWORD32 u4_width,
                             UWORD32 u4_height,
                             UWORD32 u4_src_stride_y,
                             UWORD32 u4_src_stride_u,
                             UWORD32 u4_src_stride_v,
                             UWORD32 u4_dst_stride_y,
                             UWORD32 u4_dst_stride_u,
                             UWORD32 u4_dst_stride_v)
{
    WORD32 i4_cnt;
    WORD32 i4_y_height = (WORD32) u4_height;
    /* Chroma planes are subsampled 2x in both directions (420P) */
    WORD32 i4_uv_height = u4_height >> 1;
    WORD32 i4_uv_width = u4_width >> 1;

    /* Y plane: one full-width row per iteration */
    for(i4_cnt = 0; i4_cnt < i4_y_height; i4_cnt++)
    {
        memcpy(pu1_dst_y, pu1_src_y, u4_width);
        pu1_dst_y += (u4_dst_stride_y);
        pu1_src_y += (u4_src_stride_y);
    }

    /* U plane */
    for(i4_cnt = 0; i4_cnt < i4_uv_height; i4_cnt++)
    {
        memcpy(pu1_dst_u, pu1_src_u, i4_uv_width);
        pu1_dst_u += (u4_dst_stride_u);
        pu1_src_u += (u4_src_stride_u);

    }

    /* V plane */
    for(i4_cnt = 0; i4_cnt < i4_uv_height; i4_cnt++)
    {
        memcpy(pu1_dst_v, pu1_src_v, i4_uv_width);
        pu1_dst_v += (u4_dst_stride_v);
        pu1_src_v += (u4_src_stride_v);

    }

}

/*****************************************************************************/
/* impeg2_fmt_conv_yuv420p_to_yuv422ile                                      */
/*                                                                           */
/* Converts planar YUV420 to YUV422 interleaved. Each pair of luma samples   */
/* is packed with one U and one V sample into a 32-bit word built as         */
/* (((Y1 << 8 | V) << 8 | Y0) << 8) | U, i.e. bytes U0 Y0 V0 Y1 in memory    */
/* on a little-endian target (NOTE(review): byte order in memory assumes    */
/* little-endian — confirm for big-endian builds).                           */
/* Because the source is 420, each chroma row is reused for two consecutive  */
/* luma rows: u1_flag toggles per row, and on even rows the chroma pointers  */
/* are rewound to reuse the same row.                                        */
/*                                                                           */
/* NOTE(review): row offsets are held in UWORD16, which assumes strides and  */
/* width fit (and their differences fit) in 16 bits — confirm for large      */
/* strides.                                                                  */
/*****************************************************************************/

void impeg2_fmt_conv_yuv420p_to_yuv422ile(register UWORD8 *pu1_y,
                                          register UWORD8 *pu1_u,
                                          register UWORD8 *pu1_v,
                                          void *pv_yuv422i,
                                          UWORD32 u4_width,
                                          UWORD32 u4_height,
                                          UWORD32 u4_stride_y,
                                          UWORD32 u4_stride_u,
                                          UWORD32 u4_stride_v,
                                          UWORD32 u4_stride_yuv422i)
{
    /* Declare local variables */
    register WORD16 i,j;
    register UWORD16 u2_offset1,u2_offset2,u2_offset3,u2_offset_yuv422i;
    register UWORD8 u1_y1,u1_uv;
    register UWORD32 u4_pixel;
    register UWORD16 u2_width_cnt;
    register UWORD32 *pu4_yuv422i;

    UWORD8 u1_flag; /* This flag is used to indicate wether the row is even or odd */

    u1_flag=0x0; /* Intialize it with 0 indicating odd row */

    /* Calculate the offsets necessary to make input and output buffers to point next row */
    u2_offset1 = u4_stride_y - u4_width;
    u2_offset2 = u4_stride_u - ((u4_width + 1) >> 1);
    u2_offset3 = u4_stride_v - ((u4_width + 1) >> 1);
    u2_offset_yuv422i = (u4_stride_yuv422i >> 1) -((u4_width + 1) >> 1);

    /* Type cast the output pointer to UWORD32 */
    pu4_yuv422i = (UWORD32 *)pv_yuv422i;

    /* Calculate the loop counter for inner loop */
    u2_width_cnt = u4_width >> 1;

    /* Run the loop for height of input buffer */
    for(i = u4_height; i > 0; i--)
    {
        /* Run the loop for width/2 */
        for(j = u2_width_cnt; j > 0; j--)
        {
            /* Store the value in output buffer in the order U0Y0V0Y1U2Y2V2Y3.... */
            /* Load Y0 */
            u1_y1 = *pu1_y++;
            /* Load Y1 */
            u4_pixel = *pu1_y++;
            /* Load V0 */
            u1_uv = *pu1_v++;
            u4_pixel = (u4_pixel << 8) + u1_uv;
            /* Load U0 */
            u1_uv = *pu1_u++;
            u4_pixel = (u4_pixel << 8) + u1_y1;
            u4_pixel = (u4_pixel << 8) + u1_uv;
            *pu4_yuv422i++ = u4_pixel;
        }
        /* Incase of width is odd number take care of last pixel */
        if(u4_width & 0x1)
        {
            /* Load Y0 */
            u1_y1 = *pu1_y++;
            /* Load V0 */
            u1_uv = *pu1_v++;
            /* Take Y0 as Y1 (duplicate the last luma sample) */
            u4_pixel = u1_y1;
            u4_pixel = (u4_pixel << 8) + u1_uv;
            /* Load U0 */
            u1_uv = *pu1_u++;
            u4_pixel = (u4_pixel << 8) + u1_y1;
            u4_pixel = (u4_pixel << 8) + u1_uv;
            *pu4_yuv422i++ = u4_pixel;
        }
        /* Make the pointers to buffer to point to next row */
        pu1_y = pu1_y + u2_offset1;
        if(!u1_flag)
        {
            /* Even luma row: rewind chroma pointers so the same chroma row   */
            /* is reused for the next luma row (420 vertical subsampling)     */
            pu1_u = pu1_u - ((u4_width + 1) >> 1);
            pu1_v = pu1_v - ((u4_width + 1) >> 1);
        }
        else
        {
            /* Odd luma row: advance the u and v pointers to the next row */
            pu1_u = pu1_u + u2_offset2;
            pu1_v = pu1_v + u2_offset3;
        }

        /* Adjust the output buffer pointer for next row */
        pu4_yuv422i = pu4_yuv422i + u2_offset_yuv422i;
        /* Toggle the flag to convert between odd and even row */
        u1_flag= u1_flag ^ 0x1;
    }
}


/*****************************************************************************/
/* impeg2_fmt_conv_yuv420p_to_yuv420sp_vu                                    */
/*                                                                           */
/* Converts planar YUV420 to semi-planar NV21: copies the Y plane (skipped   */
/* when u4_convert_uv_only is nonzero) and interleaves chroma as V,U pairs   */
/* into pu1_dest_uv. Strides are in bytes.                                   */
/*                                                                           */
/* NOTE(review): when u4_width is odd the inner UV loop writes               */
/* 2*((u4_width+1)>>1) == u4_width+1 bytes per row, but the destination      */
/* pointer advances by u4_dest_stride_uv - u4_width — verify callers always  */
/* pass even widths.                                                         */
/*****************************************************************************/
void impeg2_fmt_conv_yuv420p_to_yuv420sp_vu(UWORD8 *pu1_y, UWORD8 *pu1_u, UWORD8 *pu1_v,
                                            UWORD8 *pu1_dest_y, UWORD8 *pu1_dest_uv,
                                            UWORD32 u4_height, UWORD32 u4_width,UWORD32 u4_stridey,
                                            UWORD32 u4_strideu, UWORD32 u4_stridev,
                                            UWORD32 u4_dest_stride_y, UWORD32 u4_dest_stride_uv,
                                            UWORD32 u4_convert_uv_only
                                            )

{


    UWORD8 *pu1_src,*pu1_dst;
    UWORD8 *pu1_src_u, *pu1_src_v;
    UWORD16 i;
    UWORD32 u2_width_uv;

    UWORD32 u4_dest_inc_y=0, u4_dest_inc_uv=0;


    /* Copy Y buffer row by row */
    pu1_dst = (UWORD8 *)pu1_dest_y;
    pu1_src = (UWORD8 *)pu1_y;

    u4_dest_inc_y =  u4_dest_stride_y;
    u4_dest_inc_uv = u4_dest_stride_uv;

    if(0 == u4_convert_uv_only)
    {
        for(i = 0; i < u4_height; i++)
        {
            memcpy((void *)pu1_dst,(void *)pu1_src, u4_width);
            pu1_dst += u4_dest_inc_y;
            pu1_src += u4_stridey;
        }
    }

    /* Interleave Cb and Cr buffers (V first: NV21 ordering) */
    pu1_src_u = pu1_u;
    pu1_src_v = pu1_v;
    pu1_dst = pu1_dest_uv ;

    /* Chroma plane is half height / half width, rounded up */
    u4_height = (u4_height + 1) >> 1;
    u2_width_uv = (u4_width + 1) >> 1;
    for(i = 0; i < u4_height ; i++)
    {
        UWORD32 j;
        for(j = 0; j < u2_width_uv; j++)
        {
            *pu1_dst++ = *pu1_src_v++;
            *pu1_dst++ = *pu1_src_u++;

        }

        pu1_dst += u4_dest_inc_uv - u4_width;
        pu1_src_u += u4_strideu - u2_width_uv;
        pu1_src_v += u4_stridev - u2_width_uv;
    }
}

/*****************************************************************************/
/* impeg2_fmt_conv_yuv420p_to_yuv420sp_uv                                    */
/*                                                                           */
/* Same as impeg2_fmt_conv_yuv420p_to_yuv420sp_vu, but interleaves chroma    */
/* as U,V pairs (NV12 ordering). See the NOTE(review) on the _vu variant     */
/* about odd widths; it applies here as well.                                */
/*****************************************************************************/
void impeg2_fmt_conv_yuv420p_to_yuv420sp_uv(UWORD8 *pu1_y, UWORD8 *pu1_u, UWORD8 *pu1_v,
                                            UWORD8 *pu1_dest_y, UWORD8 *pu1_dest_uv,
                                            UWORD32 u4_height, UWORD32 u4_width,UWORD32 u4_stridey,
                                            UWORD32 u4_strideu, UWORD32 u4_stridev,
                                            UWORD32 u4_dest_stride_y, UWORD32 u4_dest_stride_uv,
                                            UWORD32 u4_convert_uv_only)

{


    UWORD8 *pu1_src,*pu1_dst;
    UWORD8 *pu1_src_u, *pu1_src_v;
    UWORD16 i;
    UWORD32 u2_width_uv;

    UWORD32 u4_dest_inc_y=0, u4_dest_inc_uv=0;


    /* Copy Y buffer row by row */
    pu1_dst = (UWORD8 *)pu1_dest_y;
    pu1_src = (UWORD8 *)pu1_y;

    u4_dest_inc_y =  u4_dest_stride_y;
    u4_dest_inc_uv = u4_dest_stride_uv;

    if(0 == u4_convert_uv_only)
    {
        for(i = 0; i < u4_height; i++)
        {
            memcpy((void *)pu1_dst,(void *)pu1_src, u4_width);
            pu1_dst += u4_dest_inc_y;
            pu1_src += u4_stridey;
        }
    }

    /* Interleave Cb and Cr buffers (U first: NV12 ordering) */
    pu1_src_u = pu1_u;
    pu1_src_v = pu1_v;
    pu1_dst = pu1_dest_uv ;

    /* Chroma plane is half height / half width, rounded up */
    u4_height = (u4_height + 1) >> 1;
    u2_width_uv = (u4_width + 1) >> 1;
    for(i = 0; i < u4_height ; i++)
    {
        UWORD32 j;
        for(j = 0; j < u2_width_uv; j++)
        {
            *pu1_dst++ = *pu1_src_u++;
            *pu1_dst++ = *pu1_src_v++;
        }

        pu1_dst += u4_dest_inc_uv - u4_width;
        pu1_src_u += u4_strideu - u2_width_uv;
        pu1_src_v += u4_stridev - u2_width_uv;
    }

}

/* ==== common/impeg2_format_conv.h ====
 * Copyright (C) 2015 The Android Open Source Project (Apache-2.0)
 * Licensed under the Apache License, Version 2.0; you may not use
 * this file except in compliance with the License.
 * You may obtain a copy of the License at:
 * http://www.apache.org/licenses/LICENSE-2.0
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/* File Name    : impeg2_format_conv.h                                       */
/*                                                                           */
/* Description  : Constants and function-pointer typedefs for the output     */
/*                format conversion routines (C and ARM NEON variants).      */
/*****************************************************************************/

#ifndef __IMPEG2_FORMAT_CONV_H__
#define __IMPEG2_FORMAT_CONV_H__

/*****************************************************************************/
/* Typedefs                                                                  */
/*****************************************************************************/

/* RGB/gray -> YUV conversion coefficients and rounding constants.
 * NOTE(review): the conversion routines that consume these are not in this
 * translation unit — confirm fixed-point format against their users. */
#define COEFF_0_Y   66
#define COEFF_1_Y   129
#define COEFF_2_Y   25
#define COEFF_0_U   -38
#define COEFF_1_U   -75
#define COEFF_2_U   112
#define COEFF_0_V   112
#define COEFF_1_V   -94
#define COEFF_2_V   -18
#define CONST_RGB_YUV1  4096
#define CONST_RGB_YUV2  32768
#define CONST_GRAY_YUV  128
#define COEF_2_V2_U     0xFFEE0070

/* Packed coefficient pairs (two 16-bit values per 32-bit constant) */
#define COF_2Y_0Y       0X00190042
#define COF_1U_0U       0XFFB5FFDA
#define COF_1V_0V       0XFFA20070

/*****************************************************************************/
/* Enums                                                                     */
/*****************************************************************************/
/* Input picture formats */
typedef enum {
GRAY_SCALE = 0,
YUV444 = 1,
YUV420 = 2,
YUV422H = 3,
YUV422V = 4,
YUV411 = 5,
RGB24 = 6,
RGB24i = 7
}input_format_t;

/*****************************************************************************/
/* Function Declarations                                                     */
/*****************************************************************************/
/* Function types below let the decoder select a C or NEON implementation at
 * runtime through a single function pointer. */
typedef void pf_copy_yuv420p_buf_t(UWORD8 *pu1_src_y,
                                   UWORD8 *pu1_src_u,
                                   UWORD8 *pu1_src_v,
                                   UWORD8 *pu1_dst_y,
                                   UWORD8 *pu1_dst_u,
                                   UWORD8 *pu1_dst_v,
                                   UWORD32 u4_width,
                                   UWORD32 u4_height,
                                   UWORD32 u4_src_stride_y,
                                   UWORD32 u4_src_stride_u,
                                   UWORD32 u4_src_stride_v,
                                   UWORD32 u4_dst_stride_y,
                                   UWORD32 u4_dst_stride_u,
                                   UWORD32 u4_dst_stride_v);

typedef void pf_fmt_conv_yuv420p_to_yuv422ile_t(UWORD8 *pu1_y,
                                                UWORD8 *pu1_u,
                                                UWORD8 *pu1_v,
                                                void *pv_yuv422i,
                                                UWORD32 u4_width,
                                                UWORD32 u4_height,
                                                UWORD32 u4_stride_y,
                                                UWORD32 u4_stride_u,
                                                UWORD32 u4_stride_v,
                                                UWORD32 u4_stride_yuv422i);

typedef void pf_fmt_conv_yuv420p_to_yuv420sp_t(UWORD8 *pu1_y,
                                               UWORD8 *pu1_u,
                                               UWORD8 *pu1_v,
                                               UWORD8 *pu1_dest_y,
                                               UWORD8 *pu1_dest_uv,
                                               UWORD32 u2_height,
                                               UWORD32 u2_width,
                                               UWORD32 u2_stridey,
                                               UWORD32 u2_strideu,
                                               UWORD32 u2_stridev,
                                               UWORD32 u2_dest_stride_y,
                                               UWORD32 u2_dest_stride_uv,
                                               UWORD32 convert_uv_only);

/* C reference implementations (impeg2_format_conv.c) */
pf_copy_yuv420p_buf_t impeg2_copy_frm_yuv420p;
pf_fmt_conv_yuv420p_to_yuv422ile_t impeg2_fmt_conv_yuv420p_to_yuv422ile;
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_vu;
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_uv;

/* ARMv7 NEON (a9q) implementations */
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q;
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q;

/* ARMv8 (av8) implementations */
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8;
pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8;


#endif /* __IMPEG2_FORMAT_CONV_H__ */

/* ==== common/impeg2_globals.c ====
 * Copyright (C) 2015 The Android Open Source Project (Apache-2.0)
 * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore +*/ +#include <stdio.h> +#include "iv_datatypedef.h" +#include "iv.h" +#include "impeg2_buf_mgr.h" +#include "impeg2_disp_mgr.h" +#include "impeg2_defs.h" +#include "impeg2_platform_macros.h" +#include "impeg2_globals.h" + +/* Table for converting the quantizer_scale_code to quantizer_scale */ +const UWORD8 gau1_impeg2_non_linear_quant_scale[] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, + 8,10,12,14,16,18,20,22, + 24,28,32,36,40,44,48,52, + 56,64,72,80,88,96,104,112 +}; + + +/* Default quantizer matrix to be used for intra blocks */ +const UWORD8 gau1_impeg2_intra_quant_matrix_default[] = +{ + 8, 16, 19, 22, 26, 27, 29, 34, + 16, 16, 22, 24, 27, 29, 34, 37, + 19, 22, 26, 27, 29, 34, 34, 38, + 22, 22, 26, 27, 29, 34, 37, 40, + 22, 26, 27, 29, 32, 35, 40, 48, + 26, 27, 29, 32, 35, 40, 48, 58, + 26, 27, 29, 34, 38, 46, 56, 69, + 27, 29, 35, 38, 46, 56, 69, 83 +}; + +/* Default quantizer matrix to be used for inter blocks */ +const UWORD8 gau1_impeg2_inter_quant_matrix_default[] = +{ + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16, + 16,16,16,16,16,16,16,16 +}; + +/* Table to perform inverse scan when the scan direction is zigzag */ +const UWORD8 gau1_impeg2_inv_scan_zig_zag[] = +{ + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63 +}; + +/* Table to perform inverse scan when the direction of scanning is vertical */ +const UWORD8 gau1_impeg2_inv_scan_vertical[] = +{ + 0, 8, 16, 24, 1, 9, 2, 10, + 17, 25, 32, 40, 48, 56, 57, 49, + 41, 33, 26, 18, 3, 11, 4, 12, + 19, 27, 34, 42, 50, 58, 35, 43, + 51, 59, 20, 28, 5, 13, 6, 14, + 21, 29, 36, 44, 52, 60, 37, 45, + 53, 61, 22, 30, 7, 15, 23, 31, + 38, 46, 54, 62, 39, 47, 55, 63 
+}; + +/*****************************************************************************/ +/* Table that indicate which interpolation type is to used */ +/*****************************************************************************/ +/* Chroma when motion vector is positive */ +const UWORD16 gau2_impeg2_chroma_interp_mv[][16] = +{ + /* Pos X Pos Y */ + { + 0, 0, 1, 1, + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3 + }, + /* Neg X Pos Y */ + { + 0, 1, 1, 0, + 0, 1, 1, 0, + 2, 3, 3, 2, + 2, 3, 3, 2 + }, + /* Pos X Neg Y */ + { + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3, + 0, 0, 1, 1 + }, + /* Neg X Neg Y */ + { + 0, 1, 1, 0, + 2, 3, 3, 2, + 2, 3, 3, 2, + 0, 1, 1, 0 + } +}; +/*****************************************************************************/ +/* Input #1 Offset in bytes */ +/*****************************************************************************/ +/* Chroma */ +const UWORD16 gau2_impeg2_chroma_interp_inp1[][16] = +{ + /* Pos X Pos Y */ + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 + }, + /* Neg X Pos Y */ + { + 0, 0, 0, 4, + 0, 0, 0, 4, + 0, 0, 0, 4, + 0, 0, 0, 4 + }, + /* Pos X Neg Y */ + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0, + 72, 72, 72, 72 + }, + /* Neg X Neg Y */ + { + 0, 0, 0, 4, + 0, 0, 0, 4, + 0, 0, 0, 4, + 72, 72, 72, 76 + } +}; +/* Luma */ +const UWORD16 gau2_impeg2_luma_interp_inp1[] = +{ + 1, 1, 3, 3, + 1, 1, 3, 3, + 37, 37, 39, 39, + 37, 37, 39, 39 +}; +/*****************************************************************************/ +/* Input #2 Offset from Input #1 in bytes */ +/*****************************************************************************/ +/* + FXFY 0, + HXFY 2, + FXHY 36, + HXHY 36 +*/ +const UWORD16 gau2_impeg2_luma_interp_inp2[] = +{ + 0, 2, 0, 2, + 36, 36, 36, 36, + 0, 2, 0, 2, + 36, 36, 36, 36 +}; +const UWORD16 gau2_impeg2_chroma_interp_inp2[] = +{ + /* FXFY */ + 0, + /* HXFY */ + 4, + /* FXHY */ + 72, + /* HXHY */ + 72 +}; + +/*****************************************************************************/ 
/* Corresponds to Table 6-4 frame_rate_value of the standard                 */
/* Each entry is {numerator, denominator}: fps = entry[0] / entry[1].        */
/*****************************************************************************/
/*
    frame_rate_code     frame_rate_value

    0000                Forbidden
    0001                24 000 / 1001
    0010                24
    0011                25
    0100                30 000 / 1001
    0101                30
    0110                50
    0111                60 000 / 1001
    1000                60
    1001                Reserved
    ....
    1111                Reserved
*/
const UWORD16 gau2_impeg2_frm_rate_code[][2] =
{
    {1    , 1},     /* Forbidden */
    {24000, 1001},
    {24000, 1000},
    {25000, 1000},
    {30000, 1001},
    {30000, 1000},
    {50000, 1000},
    {60000, 1001},
    {60000, 1000}
    /* Rest reserved */
};

/* 8x8 IDCT basis matrix in Q15 (e.g. 23170 ~= 0.7071 * 2^15); row i holds   */
/* the i-th frequency's contribution to each of the 8 output samples.        */
const WORD16 gai2_impeg2_idct_q15[] =
{
    23170,  23170,  23170,  23170,  23170,  23170,  23170,  23170,
    32138,  27246,  18205,   6393,  -6393, -18205, -27246, -32138,
    30274,  12540, -12540, -30274, -30274, -12540,  12540,  30274,
    27246,  -6393, -32138, -18205,  18205,  32138,   6393, -27246,
    23170, -23170, -23170,  23170,  23170, -23170, -23170,  23170,
    18205, -32138,   6393,  27246, -27246,  -6393,  32138, -18205,
    12540, -30274,  30274, -12540, -12540,  30274, -30274,  12540,
     6393, -18205,  27246, -32138,  32138, -27246,  18205,  -6393,
};

/* Same basis matrix in Q11 (e.g. 1448 ~= 0.7071 * 2^11), used for stage 2.  */
const WORD16 gai2_impeg2_idct_q11[] =
{
    1448,  1448,  1448,  1448,  1448,  1448,  1448,  1448,
    2009,  1703,  1138,   400,  -400, -1138, -1703, -2009,
    1892,   784,  -784, -1892, -1892,  -784,   784,  1892,
    1703,  -400, -2009, -1138,  1138,  2009,   400, -1703,
    1448, -1448, -1448,  1448,  1448, -1448, -1448,  1448,
    1138, -2009,   400,  1703, -1703,  -400,  2009, -1138,
     784, -1892,  1892,  -784,  -784,  1892, -1892,   784,
     400, -1138,  1703, -2009,  2009, -1703,  1138,  -400,
};

/* Even/odd coefficient tables below replicate coefficient pairs across a    */
/* full 8-lane vector — a layout presumably consumed by the SIMD/assembly    */
/* IDCT variants (TODO confirm against the a9q/av8/sse42 modules).           */
const WORD16 gai2_impeg2_idct_even_8_q15[][8] =
{
    { 23170,  23170, 23170,  23170, 23170,  23170, 23170,  23170 },
    { 12540, -30274, 12540, -30274, 12540, -30274, 12540, -30274 },
    { 30274,  12540, 30274,  12540, 30274,  12540, 30274,  12540 },
    { 23170, -23170, 23170, -23170, 23170, -23170, 23170, -23170 }
};
const WORD16 gai2_impeg2_idct_odd_8_q15[][8] =
{
    { 32138,  27246, 32138,  27246, 32138,  27246, 32138,  27246 },
    { 18205,   6393, 18205,   6393, 18205,   6393, 18205,   6393 },
    { 27246,  -6393, 27246,  -6393, 27246,  -6393, 27246,  -6393 },
    { 32138,  18205, 32138,  18205, 32138,  18205, 32138,  18205 },
    { 18205, -32138, 18205, -32138, 18205, -32138, 18205, -32138 },
    {  6393,  27246,  6393,  27246,  6393,  27246,  6393,  27246 },
    {  6393, -18205,  6393, -18205,  6393, -18205,  6393, -18205 },
    { 27246, -32138, 27246, -32138, 27246, -32138, 27246, -32138 },
};

const WORD16 gai2_impeg2_idct_even_8_q11[][8] =
{
    { 1448,  1448, 1448,  1448, 1448,  1448, 1448,  1448 },
    {  784, -1892,  784, -1892,  784, -1892,  784, -1892 },
    { 1892,   784, 1892,   784, 1892,   784, 1892,   784 },
    { 1448, -1448, 1448, -1448, 1448, -1448, 1448, -1448 }
};
const WORD16 gai2_impeg2_idct_odd_8_q11[][8] =
{
    { 2009,  1703, 2009,  1703, 2009,  1703, 2009,  1703 },
    { 1138,   400, 1138,   400, 1138,   400, 1138,   400 },
    { 1703,  -400, 1703,  -400, 1703,  -400, 1703,  -400 },
    { 2009,  1138, 2009,  1138, 2009,  1138, 2009,  1138 },
    { 1138, -2009, 1138, -2009, 1138, -2009, 1138, -2009 },
    {  400,  1703,  400,  1703,  400,  1703,  400,  1703 },
    {  400, -1138,  400, -1138,  400, -1138,  400, -1138 },
    { 1703, -2009, 1703, -2009, 1703, -2009, 1703, -2009 },
};



/*****************************************************************************/
/* Last row IDCT Coefficients in Q11 format                                  */
/*****************************************************************************/
const WORD16 gai2_impeg2_idct_last_row_q11[] =
{
    400, -1138, 1703, -2009, 2009, -1703, 1138, -400,
};

/* First column of the Q15 basis (the DC-path coefficients of each row).     */
const WORD16 gai2_impeg2_idct_first_col_q15[] =
{
    23170, 32138, 30274, 27246, 23170, 18205, 12540, 6393,
};

const WORD16 gai2_impeg2_idct_first_col_q11[] =
{
    1448, 2009, 1892, 1703, 1448, 1138, 784, 400,
};

/*****************************************************************************/
/* Output of first stage dct (using gai2_impeg2_idct_q15 as coeffs)          */
/* for a 1D data (0, 0, 0, 0, 0, 0, 0, 1)                                    */
/*****************************************************************************/

/* Used for MPEG-2 IDCT mismatch control (oddification of coefficient 63).   */
const WORD16 gai2_impeg2_mismatch_stg1_outp[] =
{
    2, -4, 7, -8, 8, -7, 4, -2
};

/* Per-pixel additive applied before the stage-2 rounding shift in the       */
/* DC + mismatch reconstruction path (see impeg2_idct_recon_dc_mismatch).    */
const WORD16 gai2_impeg2_mismatch_stg2_additive[] =
{
      800, -2276,   3406,  -4018,   4018,  -3406,  2276,  -800,
    -1600,  4552,  -6812,   8036,  -8036,   6812, -4552,  1600,
     2800, -7966,  11921, -14063,  14063, -11921,  7966, -2800,
    -3200,  9104, -13624,  16072, -16072,  13624, -9104,  3200,
     3200, -9104,  13624, -16072,  16072, -13624,  9104, -3200,
    -2800,  7966, -11921,  14063, -14063,  11921, -7966,  2800,
     1600, -4552,   6812,  -8036,   8036,  -6812,  4552, -1600,
     -800,  2276,  -3406,   4018,  -4018,   3406, -2276,   800,
};


/* 8x8 block of zero samples, usable as an all-zero prediction source.       */
const UWORD8 gau1_impeg2_zerobuf[] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
};
/*****************************************************************************/
/* Tables of offset needed to address block in an MB                         */
/* Indexed by the 8x8 block number within a macroblock.                      */
/*****************************************************************************/
const WORD16 gai2_impeg2_blk_y_off_fld[] = {0,0,1,1};
const WORD16 gai2_impeg2_blk_y_off_frm[] = {0,0,8,8};
const WORD16 gai2_impeg2_blk_x_off[]     = {0,8,0,8};
diff --git a/common/impeg2_globals.h b/common/impeg2_globals.h
new file mode 100755
index 0000000..e8c6865
--- /dev/null
+++ b/common/impeg2_globals.h
@@ -0,0 +1,57 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/* Declarations of the shared constant tables defined in impeg2_globals.c.   */
/* NOTE(review): the __IMPEG2_GLOBALS_H__ guard uses a leading double        */
/* underscore, which is reserved to the implementation in C.                 */
#ifndef __IMPEG2_GLOBALS_H__
#define __IMPEG2_GLOBALS_H__

extern const UWORD8 gau1_impeg2_non_linear_quant_scale[];
extern const UWORD8 gau1_impeg2_intra_quant_matrix_default[];
extern const UWORD8 gau1_impeg2_inter_quant_matrix_default[];
extern const UWORD8 gau1_impeg2_inv_scan_vertical[];
extern const UWORD8 gau1_impeg2_inv_scan_zig_zag[];
extern const UWORD16 gau2_impeg2_frm_rate_code[][2];

extern const UWORD16 gau2_impeg2_chroma_interp_mv[][16];
extern const UWORD16 gau2_impeg2_chroma_interp_inp1[][16];
extern const UWORD16 gau2_impeg2_luma_interp_inp1[];
extern const UWORD16 gau2_impeg2_luma_interp_inp2[];
extern const UWORD16 gau2_impeg2_chroma_interp_inp2[];

extern const WORD16 gai2_impeg2_idct_q15[];
extern const WORD16 gai2_impeg2_idct_q11[];

extern const WORD16 gai2_impeg2_mismatch_stg1_outp[];
extern const WORD16 gai2_impeg2_idct_last_row_q11[];
extern const WORD16 gai2_impeg2_idct_first_col_q15[];
extern const WORD16 gai2_impeg2_idct_first_col_q11[];
extern const WORD16 gai2_impeg2_mismatch_stg2_additive[];

extern const WORD16 gai2_impeg2_blk_y_off_fld[];
extern const WORD16 gai2_impeg2_blk_y_off_frm[];
extern const WORD16 gai2_impeg2_blk_x_off[];

extern const UWORD8 gau1_impeg2_zerobuf[];

extern const WORD16 gai2_impeg2_idct_odd_8_q15[8][8];
extern const WORD16 gai2_impeg2_idct_odd_8_q11[8][8];

extern const WORD16 gai2_impeg2_idct_even_8_q11[4][8];
extern const WORD16 gai2_impeg2_idct_even_8_q15[4][8];

#endif /* __IMPEG2_GLOBALS_H__ */
diff --git a/common/impeg2_idct.c b/common/impeg2_idct.c
new file mode 100644
index 0000000..6834260
--- /dev/null
+++ b/common/impeg2_idct.c
@@ -0,0 +1,500 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt.
Ltd, Bangalore
*/
/*****************************************************************************/
/*                                                                           */
/*  File Name         : impeg2_idct.c                                        */
/*                                                                           */
/*  Description       : Contains 2d idct and inverse quantization functions  */
/*                                                                           */
/*  List of Functions : impeg2_idct_recon_dc()                               */
/*                      impeg2_idct_recon_dc_mismatch()                      */
/*                      impeg2_idct_recon()                                  */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         10 09 2005   Harish M        First Version                        */
/*                                                                           */
/*****************************************************************************/
/*
 IEEE - 1180 results for this IDCT
 L                            256        256       5     5    300    300   384   384    Thresholds
 H                            255        255       5     5    300    300   383   383
 sign                           1         -1       1    -1      1     -1     1    -1
 Peak Error                     1          1       1     1      1      1     1     1    1
 Peak Mean Square Error    0.0191     0.0188  0.0108 0.0111 0.0176 0.0188 0.0165 0.0177 0.06
 Overall Mean Square Error 0.01566406 0.01597656 0.0091875 0.00908906 0.01499063 0.01533281 0.01432344 0.01412344 0.02
 Peak Mean Error            0.0027     0.0026  0.0028  0.002 0.0017 0.0033 0.0031 0.0025 0.015
 Overall Mean Error      0.00002656 -0.00031406 0.00016875 0.00005469 -0.00003125 0.00011406 0.00009219 0.00004219 0.0015
 */
#include <stdio.h>
#include <string.h>

#include "iv_datatypedef.h"
#include "iv.h"
#include "impeg2_defs.h"
#include "impeg2_platform_macros.h"

#include "impeg2_macros.h"
#include "impeg2_globals.h"
#include "impeg2_idct.h"


/* Reconstructs an 8x8 block when only the DC coefficient is non-zero.       */
/* The two-stage scaled IDCT of a DC-only block is one constant value, so it */
/* is computed once and added to every prediction sample.  pi2_tmp,          */
/* i4_src_strd, i4_zero_cols and i4_zero_rows exist only to match the        */
/* pf_idct_recon_t signature and are unused here.                            */
void impeg2_idct_recon_dc(WORD16 *pi2_src,
                          WORD16 *pi2_tmp,
                          UWORD8 *pu1_pred,
                          UWORD8 *pu1_dst,
                          WORD32 i4_src_strd,
                          WORD32 i4_pred_strd,
                          WORD32 i4_dst_strd,
                          WORD32 i4_zero_cols,
                          WORD32 i4_zero_rows)
{
    WORD32 i4_val, i, j;

    UNUSED(pi2_tmp);
    UNUSED(i4_src_strd);
    UNUSED(i4_zero_cols);
    UNUSED(i4_zero_rows);

    /* Stage 1: scale the DC term by the Q15 basis coefficient and round.    */
    i4_val = pi2_src[0] * gai2_impeg2_idct_q15[0];
    i4_val = ((i4_val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
    /* Stage 2: same scaling in Q11, then round down to pixel precision.     */
    i4_val = i4_val * gai2_impeg2_idct_q11[0];
    i4_val = ((i4_val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT);

    /* Add the constant residual to the prediction and clip to [0, 255].     */
    for(i = 0; i < TRANS_SIZE_8; i++)
    {
        for(j = 0; j < TRANS_SIZE_8; j++)
        {
            pu1_dst[j] = CLIP_U8(i4_val + pu1_pred[j]);
        }
        pu1_dst += i4_dst_strd;
        pu1_pred += i4_pred_strd;
    }
}

/* DC-only reconstruction combined with MPEG-2 mismatch control: before the  */
/* stage-2 rounding shift, a per-pixel additive from                         */
/* gai2_impeg2_mismatch_stg2_additive is folded into the DC value.           */
/* pi2_tmp, i4_src_strd, i4_zero_cols and i4_zero_rows are unused; they only */
/* match the pf_idct_recon_t signature.                                      */
void impeg2_idct_recon_dc_mismatch(WORD16 *pi2_src,
                                   WORD16 *pi2_tmp,
                                   UWORD8 *pu1_pred,
                                   UWORD8 *pu1_dst,
                                   WORD32 i4_src_strd,
                                   WORD32 i4_pred_strd,
                                   WORD32 i4_dst_strd,
                                   WORD32 i4_zero_cols,
                                   WORD32 i4_zero_rows)

{
    WORD32 i4_val, i, j;
    WORD32 i4_count = 0;        /* linear index into the 8x8 additive table */
    WORD32 i4_sum;

    UNUSED(pi2_tmp);
    UNUSED(i4_src_strd);
    UNUSED(i4_zero_cols);
    UNUSED(i4_zero_rows);

    /* Stage 1 as in impeg2_idct_recon_dc; stage-2 rounding happens per      */
    /* pixel below so the mismatch additive can be included first.           */
    i4_val = pi2_src[0] * gai2_impeg2_idct_q15[0];
    i4_val = ((i4_val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);

    i4_val *= gai2_impeg2_idct_q11[0];
    for(i = 0; i < TRANS_SIZE_8; i++)
    {
        for (j = 0; j < TRANS_SIZE_8; j++)
        {
            i4_sum = i4_val;
            i4_sum += gai2_impeg2_mismatch_stg2_additive[i4_count];
            i4_sum = ((i4_sum + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT);
            i4_sum += pu1_pred[j];
            pu1_dst[j] = CLIP_U8(i4_sum);
            i4_count++;
        }

        pu1_dst += i4_dst_strd;
        pu1_pred += i4_pred_strd;
    }

}
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs Inverse transform and reconstruction for 8x8
 *  input block
 *
 * @par Description:
 *  Performs inverse transform and adds the prediction data and clips output
 *  to 8 bit
 *
 * @param[in] pi2_src
 *  Input 8x8 coefficients
 *
 * @param[in] pi2_tmp
 *  Temporary 8x8 buffer for storing inverse transform 1st stage output
 *
 * @param[in] pu1_pred
 *  Prediction 8x8 block
 *
 * @param[out] pu1_dst
 *  Output 8x8 block
 *
 * @param[in] src_strd
 *  Input stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Output Stride
 *
 * @param[in] zero_cols
 *  Zero columns in pi2_src (bit n set => column n is all zero)
 *
 * @param[in] zero_rows
 *  Zero rows in pi2_src (bit n set => row n is all zero)
 *
 * @returns  Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void impeg2_idct_recon(WORD16 *pi2_src,
                       WORD16 *pi2_tmp,
                       UWORD8 *pu1_pred,
                       UWORD8 *pu1_dst,
                       WORD32 i4_src_strd,
                       WORD32 i4_pred_strd,
                       WORD32 i4_dst_strd,
                       WORD32 i4_zero_cols,
                       WORD32 i4_zero_rows)
{
    WORD32 j, k;
    WORD32 ai4_e[4], ai4_o[4];      /* even/odd butterfly partial sums */
    WORD32 ai4_ee[2], ai4_eo[2];    /* even-even / even-odd partial sums */
    WORD32 i4_add;                  /* rounding constant for current stage */
    WORD32 i4_shift;                /* down-shift for current stage */
    WORD16 *pi2_tmp_orig;
    WORD32 i4_trans_size;
    /* Columns of pi2_src become rows of pi2_tmp after stage 1, so the       */
    /* zero-column mask doubles as the zero-row mask of the 2nd stage.       */
    WORD32 i4_zero_rows_2nd_stage = i4_zero_cols;
    /* NOTE(review): despite its name this limits the STAGE-1 column loop;   */
    /* columns known to be zero need no stage-1 transform.                   */
    WORD32 i4_row_limit_2nd_stage;

    i4_trans_size = TRANS_SIZE_8;

    pi2_tmp_orig = pi2_tmp;

    /* If columns 4..7 are all zero, only the first 4 columns need stage 1.  */
    if((i4_zero_cols & 0xF0) == 0xF0)
        i4_row_limit_2nd_stage = 4;
    else
        i4_row_limit_2nd_stage = TRANS_SIZE_8;


    if((i4_zero_rows & 0xF0) == 0xF0) /* First 4 rows of input are non-zero */
    {
        /************************************************************************************************/
        /**********************************START - IT_RECON_8x8******************************************/
        /* Rows 4..7 of the input are zero, so stage 1 only accumulates       */
        /* contributions from rows 0..3.                                      */
        /************************************************************************************************/

        /* Inverse Transform 1st stage */
        i4_shift = IDCT_STG1_SHIFT;
        i4_add = 1 << (i4_shift - 1);

        for(j = 0; j < i4_row_limit_2nd_stage; j++)
        {
            /* Checking for Zero Cols */
            if((i4_zero_cols & 1) == 1)
            {
                memset(pi2_tmp, 0, i4_trans_size * sizeof(WORD16));
            }
            else
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q15[1 * 8 + k] * pi2_src[i4_src_strd]
                                    + gai2_impeg2_idct_q15[3 * 8 + k]
                                                    * pi2_src[3 * i4_src_strd];
                }
                ai4_eo[0] = gai2_impeg2_idct_q15[2 * 8 + 0] * pi2_src[2 * i4_src_strd];
                ai4_eo[1] = gai2_impeg2_idct_q15[2 * 8 + 1] * pi2_src[2 * i4_src_strd];
                ai4_ee[0] = gai2_impeg2_idct_q15[0 * 8 + 0] * pi2_src[0];
                ai4_ee[1] = gai2_impeg2_idct_q15[0 * 8 + 1] * pi2_src[0];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    pi2_tmp[k] =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pi2_tmp[k + 4] =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                }
            }
            pi2_src++;
            pi2_tmp += i4_trans_size;
            i4_zero_cols = i4_zero_cols >> 1;   /* next bit = next column */
        }

        pi2_tmp = pi2_tmp_orig;

        /* Inverse Transform 2nd stage */
        i4_shift = IDCT_STG2_SHIFT;
        i4_add = 1 << (i4_shift - 1);
        if((i4_zero_rows_2nd_stage & 0xF0) == 0xF0) /* First 4 rows of output of 1st stage are non-zero */
        {
            for(j = 0; j < i4_trans_size; j++)
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q11[1 * 8 + k] * pi2_tmp[i4_trans_size]
                                    + gai2_impeg2_idct_q11[3 * 8 + k] * pi2_tmp[3 * i4_trans_size];
                }
                ai4_eo[0] = gai2_impeg2_idct_q11[2 * 8 + 0] * pi2_tmp[2 * i4_trans_size];
                ai4_eo[1] = gai2_impeg2_idct_q11[2 * 8 + 1] * pi2_tmp[2 * i4_trans_size];
                ai4_ee[0] = gai2_impeg2_idct_q11[0 * 8 + 0] * pi2_tmp[0];
                ai4_ee[1] = gai2_impeg2_idct_q11[0 * 8 + 1] * pi2_tmp[0];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    WORD32 itrans_out;
                    itrans_out =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
                    itrans_out =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
                }
                pi2_tmp++;
                pu1_pred += i4_pred_strd;
                pu1_dst += i4_dst_strd;
            }
        }
        else /* All rows of output of 1st stage are non-zero */
        {
            for(j = 0; j < i4_trans_size; j++)
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q11[1 * 8 + k] * pi2_tmp[i4_trans_size]
                                    + gai2_impeg2_idct_q11[3 * 8 + k]
                                                    * pi2_tmp[3 * i4_trans_size]
                                    + gai2_impeg2_idct_q11[5 * 8 + k]
                                                    * pi2_tmp[5 * i4_trans_size]
                                    + gai2_impeg2_idct_q11[7 * 8 + k]
                                                    * pi2_tmp[7 * i4_trans_size];
                }

                ai4_eo[0] = gai2_impeg2_idct_q11[2 * 8 + 0] * pi2_tmp[2 * i4_trans_size]
                                + gai2_impeg2_idct_q11[6 * 8 + 0] * pi2_tmp[6 * i4_trans_size];
                ai4_eo[1] = gai2_impeg2_idct_q11[2 * 8 + 1] * pi2_tmp[2 * i4_trans_size]
                                + gai2_impeg2_idct_q11[6 * 8 + 1] * pi2_tmp[6 * i4_trans_size];
                ai4_ee[0] = gai2_impeg2_idct_q11[0 * 8 + 0] * pi2_tmp[0]
                                + gai2_impeg2_idct_q11[4 * 8 + 0] * pi2_tmp[4 * i4_trans_size];
                ai4_ee[1] = gai2_impeg2_idct_q11[0 * 8 + 1] * pi2_tmp[0]
                                + gai2_impeg2_idct_q11[4 * 8 + 1] * pi2_tmp[4 * i4_trans_size];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    WORD32 itrans_out;
                    itrans_out =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
                    itrans_out =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
                }
                pi2_tmp++;
                pu1_pred += i4_pred_strd;
                pu1_dst += i4_dst_strd;
            }
        }
        /************************************************************************************************/
        /************************************END - IT_RECON_8x8******************************************/
        /************************************************************************************************/
    }
    else /* All rows of input are non-zero */
    {
        /************************************************************************************************/
        /**********************************START - IT_RECON_8x8******************************************/
        /* Full 8-row stage 1: odd terms from rows 1,3,5,7; even terms from   */
        /* rows 0,2,4,6.                                                      */
        /************************************************************************************************/

        /* Inverse Transform 1st stage */
        i4_shift = IDCT_STG1_SHIFT;
        i4_add = 1 << (i4_shift - 1);

        for(j = 0; j < i4_row_limit_2nd_stage; j++)
        {
            /* Checking for Zero Cols */
            if((i4_zero_cols & 1) == 1)
            {
                memset(pi2_tmp, 0, i4_trans_size * sizeof(WORD16));
            }
            else
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q15[1 * 8 + k] * pi2_src[i4_src_strd]
                                    + gai2_impeg2_idct_q15[3 * 8 + k]
                                                    * pi2_src[3 * i4_src_strd]
                                    + gai2_impeg2_idct_q15[5 * 8 + k]
                                                    * pi2_src[5 * i4_src_strd]
                                    + gai2_impeg2_idct_q15[7 * 8 + k]
                                                    * pi2_src[7 * i4_src_strd];
                }

                ai4_eo[0] = gai2_impeg2_idct_q15[2 * 8 + 0] * pi2_src[2 * i4_src_strd]
                                + gai2_impeg2_idct_q15[6 * 8 + 0] * pi2_src[6 * i4_src_strd];
                ai4_eo[1] = gai2_impeg2_idct_q15[2 * 8 + 1] * pi2_src[2 * i4_src_strd]
                                + gai2_impeg2_idct_q15[6 * 8 + 1] * pi2_src[6 * i4_src_strd];
                ai4_ee[0] = gai2_impeg2_idct_q15[0 * 8 + 0] * pi2_src[0]
                                + gai2_impeg2_idct_q15[4 * 8 + 0] * pi2_src[4 * i4_src_strd];
                ai4_ee[1] = gai2_impeg2_idct_q15[0 * 8 + 1] * pi2_src[0]
                                + gai2_impeg2_idct_q15[4 * 8 + 1] * pi2_src[4 * i4_src_strd];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    pi2_tmp[k] =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pi2_tmp[k + 4] =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                }
            }
            pi2_src++;
            pi2_tmp += i4_trans_size;
            i4_zero_cols = i4_zero_cols >> 1;   /* next bit = next column */
        }

        pi2_tmp = pi2_tmp_orig;

        /* Inverse Transform 2nd stage */
        i4_shift = IDCT_STG2_SHIFT;
        i4_add = 1 << (i4_shift - 1);
        if((i4_zero_rows_2nd_stage & 0xF0) == 0xF0) /* First 4 rows of output of 1st stage are non-zero */
        {
            for(j = 0; j < i4_trans_size; j++)
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q11[1 * 8 + k] * pi2_tmp[i4_trans_size]
                                    + gai2_impeg2_idct_q11[3 * 8 + k] * pi2_tmp[3 * i4_trans_size];
                }
                ai4_eo[0] = gai2_impeg2_idct_q11[2 * 8 + 0] * pi2_tmp[2 * i4_trans_size];
                ai4_eo[1] = gai2_impeg2_idct_q11[2 * 8 + 1] * pi2_tmp[2 * i4_trans_size];
                ai4_ee[0] = gai2_impeg2_idct_q11[0 * 8 + 0] * pi2_tmp[0];
                ai4_ee[1] = gai2_impeg2_idct_q11[0 * 8 + 1] * pi2_tmp[0];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    WORD32 itrans_out;
                    itrans_out =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
                    itrans_out =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
                }
                pi2_tmp++;
                pu1_pred += i4_pred_strd;
                pu1_dst += i4_dst_strd;
            }
        }
        else /* All rows of output of 1st stage are non-zero */
        {
            for(j = 0; j < i4_trans_size; j++)
            {
                /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
                for(k = 0; k < 4; k++)
                {
                    ai4_o[k] = gai2_impeg2_idct_q11[1 * 8 + k] * pi2_tmp[i4_trans_size]
                                    + gai2_impeg2_idct_q11[3 * 8 + k]
                                                    * pi2_tmp[3 * i4_trans_size]
                                    + gai2_impeg2_idct_q11[5 * 8 + k]
                                                    * pi2_tmp[5 * i4_trans_size]
                                    + gai2_impeg2_idct_q11[7 * 8 + k]
                                                    * pi2_tmp[7 * i4_trans_size];
                }

                ai4_eo[0] = gai2_impeg2_idct_q11[2 * 8 + 0] * pi2_tmp[2 * i4_trans_size]
                                + gai2_impeg2_idct_q11[6 * 8 + 0] * pi2_tmp[6 * i4_trans_size];
                ai4_eo[1] = gai2_impeg2_idct_q11[2 * 8 + 1] * pi2_tmp[2 * i4_trans_size]
                                + gai2_impeg2_idct_q11[6 * 8 + 1] * pi2_tmp[6 * i4_trans_size];
                ai4_ee[0] = gai2_impeg2_idct_q11[0 * 8 + 0] * pi2_tmp[0]
                                + gai2_impeg2_idct_q11[4 * 8 + 0] * pi2_tmp[4 * i4_trans_size];
                ai4_ee[1] = gai2_impeg2_idct_q11[0 * 8 + 1] * pi2_tmp[0]
                                + gai2_impeg2_idct_q11[4 * 8 + 1] * pi2_tmp[4 * i4_trans_size];

                /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
                ai4_e[0] = ai4_ee[0] + ai4_eo[0];
                ai4_e[3] = ai4_ee[0] - ai4_eo[0];
                ai4_e[1] = ai4_ee[1] + ai4_eo[1];
                ai4_e[2] = ai4_ee[1] - ai4_eo[1];
                for(k = 0; k < 4; k++)
                {
                    WORD32 itrans_out;
                    itrans_out =
                                    CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
                    pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
                    itrans_out =
                                    CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
                    pu1_dst[k + 4] = CLIP_U8((itrans_out + pu1_pred[k + 4]));
                }
                pi2_tmp++;
                pu1_pred += i4_pred_strd;
                pu1_dst += i4_dst_strd;
            }
        }
        /************************************************************************************************/
        /************************************END - IT_RECON_8x8******************************************/
        /************************************************************************************************/
    }
}

diff --git a/common/impeg2_idct.h b/common/impeg2_idct.h
new file mode 100644
index 0000000..80defde
--- /dev/null
+++ b/common/impeg2_idct.h
@@ -0,0 +1,66 @@
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
#ifndef __IMPEG2_IDCT_H__
#define __IMPEG2_IDCT_H__


/*****************************************************************************/
/* Function Declarations                                                     */
/*****************************************************************************/

/* Common signature shared by the C, ARM (a9q/av8) and x86 (sse42) IDCT +    */
/* reconstruction variants, so implementations can be selected at runtime    */
/* through a function pointer.                                               */
typedef void pf_idct_recon_t(WORD16 *pi2_src,
                             WORD16 *pi2_tmp,
                             UWORD8 *pu1_pred,
                             UWORD8 *pu1_dst,
                             WORD32 src_strd,
                             WORD32 pred_strd,
                             WORD32 dst_strd,
                             WORD32 zero_cols,
                             WORD32 zero_rows);

/* ARM assembly modules currently ignore non_zero_cols argument */
pf_idct_recon_t impeg2_idct_recon_dc;

pf_idct_recon_t impeg2_idct_recon_dc_mismatch;

pf_idct_recon_t impeg2_idct_recon;


pf_idct_recon_t impeg2_idct_recon_dc_a9q;

pf_idct_recon_t impeg2_idct_recon_dc_mismatch_a9q;

pf_idct_recon_t impeg2_idct_recon_a9q;


pf_idct_recon_t impeg2_idct_recon_dc_av8;

pf_idct_recon_t impeg2_idct_recon_dc_mismatch_av8;

pf_idct_recon_t impeg2_idct_recon_av8;

pf_idct_recon_t impeg2_idct_recon_sse42;

pf_idct_recon_t impeg2_idct_recon_dc_mismatch_sse42;

pf_idct_recon_t impeg2_idct_recon_dc_sse42;

#endif /* #ifndef __IMPEG2_IDCT_H__ */

diff --git a/common/impeg2_inter_pred.c b/common/impeg2_inter_pred.c
new file mode 100644
index 0000000..019fa5c
--- /dev/null
+++ b/common/impeg2_inter_pred.c
@@ -0,0 +1,467 @@
/******************************************************************************
 *
 * Copyright
(C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
* impeg2d_mcu.c
*
* NOTE(review): @file name does not match this file (impeg2_inter_pred.c).
*
* @brief
* Contains MC function definitions for MPEG2 decoder
*
* @author
* Harish
*
* @par List of Functions:
* - impeg2_copy_mb()
* - impeg2_interpolate()
* - impeg2_mc_halfx_halfy_8x8()
* - impeg2_mc_halfx_fully_8x8()
* - impeg2_mc_fullx_halfy_8x8()
* - impeg2_mc_fullx_fully_8x8()
*
* @remarks
* None
*
*******************************************************************************
*/

#include <stdio.h>
#include <string.h>
#include "iv_datatypedef.h"
#include "iv.h"
#include "impeg2_buf_mgr.h"
#include "impeg2_disp_mgr.h"
#include "impeg2_defs.h"
#include "impeg2_platform_macros.h"

#include "impeg2_inter_pred.h"
#include "impeg2_globals.h"
#include "impeg2_macros.h"
#include "impeg2_idct.h"

/*******************************************************************************
*  Function Name   : impeg2_copy_mb
*
*  Description     : copies 3 components (one MB_SIZE x MB_SIZE luma block and
*                    two half-sized chroma blocks) from mc_buf to the frame.
*
*  NOTE(review)    : the argument list below is stale — the function actually
*                    takes (ps_src_buf, ps_dst_buf, u4_src_wd, u4_dst_wd).
*
*  Arguments       :
*  src_buf         : Source Buffer
*  dst_buf         : Destination Buffer
*  src_offset_x    : X offset for source
*  src_offset_y    : Y offset for source
*  dst_offset_x    : X offset for destination
*  dst_offset_y    : Y offset for destination
*  src_wd          : Source Width
*  dst_wd          : destination Width
*  rows            : Number of rows
*  cols            : Number of columns
*
*  Values Returned : None
*******************************************************************************/
void impeg2_copy_mb(yuv_buf_t *ps_src_buf,
                    yuv_buf_t *ps_dst_buf,
                    UWORD32 u4_src_wd,
                    UWORD32 u4_dst_wd)
{
    UWORD8 *pu1_src;
    UWORD8 *pu1_dst;
    UWORD32 i;
    UWORD32 u4_rows = MB_SIZE;
    UWORD32 u4_cols = MB_SIZE;

    /*******************************************************/
    /* copy Y                                              */
    /*******************************************************/
    pu1_src = ps_src_buf->pu1_y;
    pu1_dst = ps_dst_buf->pu1_y;
    for(i = 0; i < u4_rows; i++)
    {
        memcpy(pu1_dst, pu1_src, u4_cols);
        pu1_src += u4_src_wd;
        pu1_dst += u4_dst_wd;
    }

    /* Chroma planes are half the luma dimensions (4:2:0). */
    u4_src_wd >>= 1;
    u4_dst_wd >>= 1;
    u4_rows >>= 1;
    u4_cols >>= 1;

    /*******************************************************/
    /* copy U                                              */
    /*******************************************************/
    pu1_src = ps_src_buf->pu1_u;
    pu1_dst = ps_dst_buf->pu1_u;
    for(i = 0; i < u4_rows; i++)
    {
        memcpy(pu1_dst, pu1_src, u4_cols);

        pu1_src += u4_src_wd;
        pu1_dst += u4_dst_wd;
    }
    /*******************************************************/
    /* copy V                                              */
    /*******************************************************/
    pu1_src = ps_src_buf->pu1_v;
    pu1_dst = ps_dst_buf->pu1_v;
    for(i = 0; i < u4_rows; i++)
    {
        memcpy(pu1_dst, pu1_src, u4_cols);

        pu1_src += u4_src_wd;
        pu1_dst += u4_dst_wd;
    }

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name     : impeg2_interpolate                                   */
/*                                                                           */
/*  Description       : averages the contents of buf_src1 and buf_src2 and   */
/*                      stores result in buf_dst                             */
/*                                                                           */
/*  Inputs            : buf_src1 - First Source                              */
/*                      buf_src2 - Second Source                             */
/*                                                                           */
/*  Globals           : None                                                 */
/*                                                                           */
/*  Processing        : Avg the values from two sources and store the result */
/*                      in destination
buffer */ +/* */ +/* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */ +/* */ +/* Returns : None */ +/* */ +/* Issues : Assumes that all 3 buffers are of same size */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 14 09 2005 Harish M First Version */ +/* 15 09 2010 Venkat Added stride */ +/* */ +/*****************************************************************************/ +void impeg2_interpolate(yuv_buf_t *ps_buf_src1, + yuv_buf_t *ps_buf_src2, + yuv_buf_t *ps_buf_dst, + UWORD32 u4_stride) +{ + + UWORD32 i,j; + UWORD8 *pu1_src1,*pu1_src2,*pu1_dst; + pu1_src1 = ps_buf_src1->pu1_y; + pu1_src2 = ps_buf_src2->pu1_y; + pu1_dst = ps_buf_dst->pu1_y; + for(i = MB_SIZE; i > 0; i--) + { + for(j = MB_SIZE; j > 0; j--) + { + *pu1_dst++ = ((*pu1_src1++) + (*pu1_src2++) + 1) >> 1; + } + + pu1_dst += u4_stride - MB_SIZE; + + } + + u4_stride >>= 1; + + pu1_src1 = ps_buf_src1->pu1_u; + pu1_src2 = ps_buf_src2->pu1_u; + pu1_dst = ps_buf_dst->pu1_u; + for(i = MB_CHROMA_SIZE; i > 0 ; i--) + { + for(j = MB_CHROMA_SIZE; j > 0; j--) + { + *pu1_dst++ = ((*pu1_src1++) + (*pu1_src2++) + 1) >> 1; + } + + pu1_dst += u4_stride - MB_CHROMA_SIZE; + } + + pu1_src1 = ps_buf_src1->pu1_v; + pu1_src2 = ps_buf_src2->pu1_v; + pu1_dst = ps_buf_dst->pu1_v; + for(i = MB_CHROMA_SIZE; i > 0 ; i--) + { + for(j = MB_CHROMA_SIZE; j > 0; j--) + { + *pu1_dst++ = ((*pu1_src1++) + (*pu1_src2++) + 1) >> 1; + } + + pu1_dst += u4_stride - MB_CHROMA_SIZE; + } + +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_halfx_halfy_8x8() */ +/* */ +/* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */ +/* the ref frame.Interpolate these four values to get the */ +/* value at(0.5,0.5).Repeat this to get an 8 x 8 block */ +/* using 9 x 9 block from reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 05 09 2005 Harish M First Version */ +/* */ +/*****************************************************************************/ +void impeg2_mc_halfx_halfy_8x8(UWORD8 *pu1_out, + UWORD8 *pu1_ref, + UWORD32 u4_ref_wid, + UWORD32 u4_out_wid) +{ + UWORD8 *pu1_ref_p0,*pu1_ref_p1,*pu1_ref_p2,*pu1_ref_p3; + UWORD32 i,j; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 P1 + Q + P2 P3 + */ + + pu1_ref_p0 = pu1_ref; + pu1_ref_p1 = pu1_ref + 1; + pu1_ref_p2 = pu1_ref + u4_ref_wid; + pu1_ref_p3 = pu1_ref + u4_ref_wid + 1; + + for(i = 0; i < BLK_SIZE; i++) + { + for(j = 0; j < BLK_SIZE; j++) + { + *pu1_out++ = (( (*pu1_ref_p0++ ) + + (*pu1_ref_p1++ ) + + (*pu1_ref_p2++ ) + + (*pu1_ref_p3++ ) + 2 ) >> 2); + } + pu1_ref_p0 += u4_ref_wid - BLK_SIZE; + pu1_ref_p1 += u4_ref_wid - BLK_SIZE; + pu1_ref_p2 += u4_ref_wid - BLK_SIZE; + pu1_ref_p3 += u4_ref_wid - BLK_SIZE; + + pu1_out += u4_out_wid - BLK_SIZE; + } + return; +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_halfx_fully_8x8() */ +/* */ +/* Description : Gets the buffer from (0.5,0) to (8.5,8) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will 
be */ +/* block will be extracted. */ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) and (1,0) position in the ref frame */ +/* Interpolate these two values to get the value at(0.5,0) */ +/* Repeat this to get an 8 x 8 block using 9 x 8 block from */ +/* reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 05 09 2005 Harish M First Version */ +/* */ +/*****************************************************************************/ +void impeg2_mc_halfx_fully_8x8(UWORD8 *pu1_out, + UWORD8 *pu1_ref, + UWORD32 u4_ref_wid, + UWORD32 u4_out_wid) +{ + UWORD8 *pu1_ref_p0, *pu1_ref_p1; + UWORD32 i,j; + + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 Q P1 + */ + + pu1_ref_p0 = pu1_ref; + pu1_ref_p1 = pu1_ref + 1; + + for(i = 0; i < BLK_SIZE; i++) + { + for(j = 0; j < BLK_SIZE; j++) + { + *pu1_out++ = ((( *pu1_ref_p0++ ) + + (*pu1_ref_p1++) + 1 ) >> 1); + } + pu1_ref_p0 += u4_ref_wid - BLK_SIZE; + pu1_ref_p1 += u4_ref_wid - BLK_SIZE; + + pu1_out += u4_out_wid - BLK_SIZE; + } + return; +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_fullx_halfy_8x8() */ +/* */ +/* Description : Gets the buffer from (0,0.5) to (8,8.5) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) and (0,1) position in the ref frame */ +/* Interpolate these two values to get the value at(0,0.5) */ +/* Repeat this to get an 8 x 8 block using 8 x 9 block from */ +/* reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 05 09 2005 Harish M First Version */ +/* */ +/*****************************************************************************/ +void impeg2_mc_fullx_halfy_8x8(UWORD8 *pu1_out, + UWORD8 *pu1_ref, + UWORD32 u4_ref_wid, + UWORD32 u4_out_wid) +{ + + UWORD8 *pu1_ref_p0, *pu1_ref_p1; + UWORD32 i,j; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 + x + P1 + */ + pu1_ref_p0 = pu1_ref; + pu1_ref_p1 = pu1_ref + u4_ref_wid; + + for(i = 0; i < BLK_SIZE; i++) + { + for(j = 0; j < BLK_SIZE; j++) + { + *pu1_out++ = ((( *pu1_ref_p0++) + + (*pu1_ref_p1++) + 1 ) >> 1); + } + pu1_ref_p0 += u4_ref_wid - BLK_SIZE; + pu1_ref_p1 += u4_ref_wid - BLK_SIZE; + + pu1_out += u4_out_wid - BLK_SIZE; + } + + return; +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_fullx_fully_8x8() */ +/* */ +/* Description : Gets the buffer from (x,y) to (x+8,y+8) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) position in the ref frame */ +/* Get an 8 x 8 block from reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/* Revision History: */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 05 09 2005 Harish M First Version */ +/* */ +/*****************************************************************************/ +void impeg2_mc_fullx_fully_8x8(UWORD8 *pu1_out, + UWORD8 *pu1_ref, + UWORD32 u4_ref_wid, + UWORD32 u4_out_wid) +{ + + UWORD32 i; + + for(i = 0; i < BLK_SIZE; i++) + { + memcpy(pu1_out, pu1_ref, BLK_SIZE); + pu1_ref += u4_ref_wid; + pu1_out += u4_out_wid; + } + return; +} diff --git a/common/impeg2_inter_pred.h b/common/impeg2_inter_pred.h new file mode 100644 index 0000000..be3b0e5 --- /dev/null +++ b/common/impeg2_inter_pred.h @@ -0,0 +1,103 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +#ifndef __IMPEG2_INTER_PRED_H__ +#define __IMPEG2_INTER_PRED_H__ + + +typedef struct +{ + UWORD8 *pu1_y; + UWORD8 *pu1_u; + UWORD8 *pu1_v; +}yuv_buf_t; + +typedef struct +{ + WORD16 *pi2_y; + WORD16 *pi2_u; + WORD16 *pi2_v; +}yuv_buf16_t; + +/** + * Picture buffer + */ +typedef struct +{ + UWORD8 *pu1_y; + UWORD8 *pu1_u; + UWORD8 *pu1_v; + + /** Used to store display Timestamp for current buffer */ + WORD32 u4_ts; + UWORD8 u1_used_as_ref; + + /** + * buffer ID from buffer manager + */ + WORD32 i4_buf_id; + +}pic_buf_t; + +typedef void pf_copy_mb_t (yuv_buf_t *src_buf, + yuv_buf_t *dst_buf, + UWORD32 src_wd, + UWORD32 dst_wd); + +typedef void pf_interpred_t(UWORD8 *out,UWORD8 *ref, UWORD32 ref_wid, UWORD32 out_wid); + +typedef void pf_interpolate_t(yuv_buf_t *buf_src1, + yuv_buf_t *buf_src2, + yuv_buf_t *buf_dst, + UWORD32 stride); + +pf_interpolate_t impeg2_interpolate; +pf_interpolate_t impeg2_interpolate_a9q; +pf_interpolate_t impeg2_interpolate_av8; + +pf_copy_mb_t impeg2_copy_mb; +pf_copy_mb_t impeg2_copy_mb_a9q; +pf_copy_mb_t impeg2_copy_mb_av8; + +pf_interpred_t impeg2_mc_halfx_halfy_8x8; +pf_interpred_t impeg2_mc_halfx_fully_8x8; +pf_interpred_t impeg2_mc_fullx_halfy_8x8; +pf_interpred_t impeg2_mc_fullx_fully_8x8; + +pf_interpred_t impeg2_mc_halfx_halfy_8x8_a9q; +pf_interpred_t impeg2_mc_halfx_fully_8x8_a9q; +pf_interpred_t impeg2_mc_fullx_halfy_8x8_a9q; +pf_interpred_t impeg2_mc_fullx_fully_8x8_a9q; + +/* AV8 Declarations */ +pf_interpred_t impeg2_mc_halfx_halfy_8x8_av8; +pf_interpred_t impeg2_mc_halfx_fully_8x8_av8; +pf_interpred_t impeg2_mc_fullx_halfy_8x8_av8; +pf_interpred_t impeg2_mc_fullx_fully_8x8_av8; + + +/* SSE4.2 Declarations*/ +pf_copy_mb_t impeg2_copy_mb_sse42; +pf_interpolate_t impeg2_interpolate_sse42; +pf_interpred_t impeg2_mc_halfx_halfy_8x8_sse42; +pf_interpred_t impeg2_mc_halfx_fully_8x8_sse42; +pf_interpred_t impeg2_mc_fullx_halfy_8x8_sse42; +pf_interpred_t impeg2_mc_fullx_fully_8x8_sse42; + +#endif /* #ifndef 
__IMPEG2_INTER_PRED_H__ */ diff --git a/common/impeg2_job_queue.c b/common/impeg2_job_queue.c new file mode 100644 index 0000000..d36ce7c --- /dev/null +++ b/common/impeg2_job_queue.c @@ -0,0 +1,530 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2d_job_queue.c +* +* @brief +* Contains functions for job queue +* +* @author +* Harish +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include "iv_datatypedef.h" +#include "iv.h" +#include "ithread.h" +#include "impeg2_macros.h" +#include "impeg2_job_queue.h" + +/** +******************************************************************************* +* +* @brief Returns size for job queue context. 
Does not include job queue buffer +* requirements +* +* @par Description +* Returns size for job queue context. Does not include job queue buffer +* requirements. Buffer size required to store the jobs should be allocated in +* addition to the value returned here. +* +* @returns Size of the job queue context +* +* @remarks +* +******************************************************************************* +*/ +WORD32 impeg2_jobq_ctxt_size() +{ + WORD32 i4_size; + i4_size = sizeof(jobq_t); + i4_size += ithread_get_mutex_lock_size(); + return i4_size; +} + +/** +******************************************************************************* +* +* @brief +* Locks the jobq conext +* +* @par Description +* Locks the jobq conext by calling ithread_mutex_lock() +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if mutex lock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_lock(jobq_t *ps_jobq) +{ + WORD32 i4_ret_val; + i4_ret_val = ithread_mutex_lock(ps_jobq->pv_mutex); + if(i4_ret_val) + { + return IV_FAIL; + } + return IV_SUCCESS; +} + +/** +******************************************************************************* +* +* @brief +* Unlocks the jobq conext +* +* @par Description +* Unlocks the jobq conext by calling ithread_mutex_unlock() +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if mutex unlock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ + +IV_API_CALL_STATUS_T impeg2_jobq_unlock(jobq_t *ps_jobq) +{ + WORD32 i4_ret_val; + i4_ret_val = ithread_mutex_unlock(ps_jobq->pv_mutex); + if(i4_ret_val) + { + return IV_FAIL; + } + return IV_SUCCESS; + +} +/** +******************************************************************************* +* +* @brief +* Yeilds the thread +* +* @par Description +* Unlocks the jobq conext by calling +* 
impeg2_jobq_unlock(), ithread_yield() and then impeg2_jobq_lock() +* jobq is unlocked before to ensure the jobq can be accessed by other threads +* If unlock is not done before calling yield then no other thread can access +* the jobq functions and update jobq. +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if mutex lock unlock or yield fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_yield(jobq_t *ps_jobq) +{ + + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + + IV_API_CALL_STATUS_T e_ret_tmp; + e_ret_tmp = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + + //NOP(1024 * 8); + ithread_yield(); + + e_ret_tmp = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + return e_ret; +} + + +/** +******************************************************************************* +* +* @brief free the job queue pointers +* +* @par Description +* Frees the jobq context +* +* @param[in] pv_buf +* Memoy for job queue buffer and job queue context +* +* @returns Pointer to job queue context +* +* @remarks +* Since it will be called only once by master thread this is not thread safe. 
+* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_free(jobq_t *ps_jobq) +{ + WORD32 i4_ret; + i4_ret = ithread_mutex_destroy(ps_jobq->pv_mutex); + + if(0 == i4_ret) + return IV_SUCCESS; + else + return IV_FAIL; +} + +/** +******************************************************************************* +* +* @brief Initialize the job queue +* +* @par Description +* Initializes the jobq context and sets write and read pointers to start of +* job queue buffer +* +* @param[in] pv_buf +* Memoy for job queue buffer and job queue context +* +* @param[in] buf_size +* Size of the total memory allocated +* +* @returns Pointer to job queue context +* +* @remarks +* Since it will be called only once by master thread this is not thread safe. +* +******************************************************************************* +*/ +void* impeg2_jobq_init(void *pv_buf, WORD32 i4_buf_size) +{ + jobq_t *ps_jobq; + UWORD8 *pu1_buf; + pu1_buf = (UWORD8 *)pv_buf; + + ps_jobq = (jobq_t *)pu1_buf; + pu1_buf += sizeof(jobq_t); + i4_buf_size -= sizeof(jobq_t); + + ps_jobq->pv_mutex = pu1_buf; + pu1_buf += ithread_get_mutex_lock_size(); + i4_buf_size -= ithread_get_mutex_lock_size(); + + if(i4_buf_size <= 0) + return NULL; + + ithread_mutex_init(ps_jobq->pv_mutex); + + ps_jobq->pv_buf_base = pu1_buf; + ps_jobq->pv_buf_wr = pu1_buf; + ps_jobq->pv_buf_rd = pu1_buf; + ps_jobq->pv_buf_end = pu1_buf + i4_buf_size; + ps_jobq->i4_terminate = 0; + + + return ps_jobq; +} +/** +******************************************************************************* +* +* @brief +* Resets the jobq conext +* +* @par Description +* Resets the jobq conext by initilizing job queue context elements +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if lock unlock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T 
impeg2_jobq_reset(jobq_t *ps_jobq) +{ + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + e_ret = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + + ps_jobq->pv_buf_wr = ps_jobq->pv_buf_base; + ps_jobq->pv_buf_rd = ps_jobq->pv_buf_base; + ps_jobq->i4_terminate = 0; + e_ret = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + + return e_ret; +} + +/** +******************************************************************************* +* +* @brief +* Deinitializes the jobq conext +* +* @par Description +* Deinitializes the jobq conext by calling impeg2_jobq_reset() +* and then destrying the mutex created +* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if lock unlock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_deinit(jobq_t *ps_jobq) +{ + WORD32 i4_ret_val; + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + + e_ret = impeg2_jobq_reset(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + + i4_ret_val = ithread_mutex_destroy(ps_jobq->pv_mutex); + if(i4_ret_val) + { + return IV_FAIL; + } + + return IV_SUCCESS; +} + + +/** +******************************************************************************* +* +* @brief +* Terminates the jobq +* +* @par Description +* Terminates the jobq by setting a flag in context. 
+* +* @param[in] ps_jobq +* Job Queue context +* +* @returns IMPEG2D_FAIL if lock unlock fails else IV_SUCCESS +* +* @remarks +* +******************************************************************************* +*/ + +IV_API_CALL_STATUS_T impeg2_jobq_terminate(jobq_t *ps_jobq) +{ + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + e_ret = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + + ps_jobq->i4_terminate = 1; + + e_ret = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret != IV_SUCCESS), e_ret); + return e_ret; +} + + +/** +******************************************************************************* +* +* @brief Adds a job to the queue +* +* @par Description +* Adds a job to the queue and updates wr address to next location. +* Format/content of the job structure is abstracted and hence size of the job +* buffer is being passed. +* +* @param[in] ps_jobq +* Job Queue context +* +* @param[in] pv_job +* Pointer to the location that contains details of the job to be added +* +* @param[in] job_size +* Size of the job buffer +* +* @param[in] blocking +* To signal if the write is blocking or non-blocking. 
+* +* @returns +* +* @remarks +* Job Queue buffer is assumed to be allocated to handle worst case number of jobs +* Wrap around is not supported +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_queue(jobq_t *ps_jobq, + void *pv_job, + WORD32 i4_job_size, + WORD32 i4_blocking, + WORD32 i4_lock) +{ + IV_API_CALL_STATUS_T e_ret = IV_SUCCESS; + IV_API_CALL_STATUS_T e_ret_tmp; + UWORD8 *pu1_buf; + UNUSED(i4_blocking); + + if(i4_lock) + { + e_ret_tmp = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + } + pu1_buf = (UWORD8 *)ps_jobq->pv_buf_wr; + if((UWORD8 *)ps_jobq->pv_buf_end >= (pu1_buf + i4_job_size)) + { + memcpy(ps_jobq->pv_buf_wr, pv_job, i4_job_size); + ps_jobq->pv_buf_wr = (UWORD8 *)ps_jobq->pv_buf_wr + i4_job_size; + e_ret = IV_SUCCESS; + } + else + { + /* Handle wrap around case */ + /* Wait for pv_buf_rd to consume first job_size number of bytes + * from the beginning of job queue + */ + e_ret = IV_FAIL; + } + + ps_jobq->i4_terminate = 0; + + if(i4_lock) + { + e_ret_tmp = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + } + + return e_ret; +} +/** +******************************************************************************* +* +* @brief Gets next from the Job queue +* +* @par Description +* Gets next job from the job queue and updates rd address to next location. +* Format/content of the job structure is abstracted and hence size of the job +* buffer is being passed. If it is a blocking call and if there is no new job +* then this functions unlocks the mutext and calls yield and then locks it back. 
+* and continues till a job is available or terminate is set +* +* @param[in] ps_jobq +* Job Queue context +* +* @param[out] pv_job +* Pointer to the location that contains details of the job to be written +* +* @param[in] job_size +* Size of the job buffer +* +* @param[in] blocking +* To signal if the read is blocking or non-blocking. +* +* @returns +* +* @remarks +* Job Queue buffer is assumed to be allocated to handle worst case number of jobs +* Wrap around is not supported +* +******************************************************************************* +*/ +IV_API_CALL_STATUS_T impeg2_jobq_dequeue(jobq_t *ps_jobq, + void *pv_job, + WORD32 i4_job_size, + WORD32 i4_blocking, + WORD32 i4_lock) +{ + IV_API_CALL_STATUS_T e_ret; + IV_API_CALL_STATUS_T e_ret_tmp; + volatile UWORD8 *pu1_buf; + if(i4_lock) + { + e_ret_tmp = impeg2_jobq_lock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + } + pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd; + + + if((UWORD8 *)ps_jobq->pv_buf_end >= (pu1_buf + i4_job_size)) + { + while(1) + { + pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd; + if((UWORD8 *)ps_jobq->pv_buf_wr >= (pu1_buf + i4_job_size)) + { + memcpy(pv_job, ps_jobq->pv_buf_rd, i4_job_size); + ps_jobq->pv_buf_rd = (UWORD8 *)ps_jobq->pv_buf_rd + i4_job_size; + e_ret = IV_SUCCESS; + break; + } + else + { + /* If all the entries have been dequeued, then break and return */ + if(1 == ps_jobq->i4_terminate) + { + e_ret = IV_FAIL; + break; + } + + if((1 == i4_blocking) && (1 == i4_lock)) + { + impeg2_jobq_yield(ps_jobq); + + } + else + { + /* If there is no job available, + * and this is non blocking call then return fail */ + e_ret = IV_FAIL; + } + } + } + } + else + { + /* Handle wrap around case */ + /* Wait for pv_buf_rd to consume first i4_job_size number of bytes + * from the beginning of job queue + */ + e_ret = IV_FAIL; + } + if(i4_lock) + { + e_ret_tmp = impeg2_jobq_unlock(ps_jobq); + RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp); + } + + return e_ret; +} diff --git 
a/common/impeg2_job_queue.h b/common/impeg2_job_queue.h new file mode 100644 index 0000000..46d8bb9 --- /dev/null +++ b/common/impeg2_job_queue.h @@ -0,0 +1,72 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2_job_queue.h +* +* @brief +* Contains functions for job queue +* +* @author +* Harish +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + +#ifndef _IMPEG2_JOB_QUEUE_H_ +#define _IMPEG2_JOB_QUEUE_H_ + +typedef struct +{ + /** Pointer to buffer base which contains the jobs */ + void *pv_buf_base; + + /** Pointer to current address where new job can be added */ + void *pv_buf_wr; + + /** Pointer to current address from where next job can be obtained */ + void *pv_buf_rd; + + /** Pointer to end of job buffer */ + void *pv_buf_end; + + /** Mutex used to keep the functions thread-safe */ + void *pv_mutex; + + /** Flag to indicate jobq has to be terminated */ + WORD32 i4_terminate; +}jobq_t; + +WORD32 impeg2_jobq_ctxt_size(void); +void* impeg2_jobq_init(void *pv_buf, WORD32 buf_size); +IV_API_CALL_STATUS_T impeg2_jobq_free(jobq_t *ps_jobq); +IV_API_CALL_STATUS_T impeg2_jobq_reset(jobq_t *ps_jobq); +IV_API_CALL_STATUS_T impeg2_jobq_deinit(jobq_t *ps_jobq); +IV_API_CALL_STATUS_T impeg2_jobq_terminate(jobq_t *ps_jobq); +IV_API_CALL_STATUS_T impeg2_jobq_queue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking, WORD32 lock); +IV_API_CALL_STATUS_T impeg2_jobq_dequeue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking, WORD32 lock); + +#endif /* _IMPEG2_JOB_QUEUE_H_ */ diff --git a/common/impeg2_macros.h b/common/impeg2_macros.h new file mode 100644 index 0000000..366510f --- /dev/null +++ b/common/impeg2_macros.h @@ -0,0 +1,60 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +#ifndef __IMPEG2_MACROS_H__ +#define __IMPEG2_MACROS_H__ + +#define ABS(x) ((x) < 0 ? (-1 * (x)) : (x)) + +#define MAX(x,y) ((x) > (y) ? (x) : (y)) + +#define MIN(x,y) ((x) < (y) ? (x) : (y)) + +#define CLIP(Number,Max,Min) if((Number) > (Max)) (Number) = (Max); \ +else if((Number) < (Min)) (Number) = (Min) + +#define SIGN(Number) (((Number) < 0) ? -1 : 1) + + +#define BITS(val,msb,lsb) (UWORD16)((((val) >> (lsb)) & ((1 << ((msb) - (lsb) + 1)) - 1))) + +#define BIT(val,bit) (UWORD16)(((val) >> (bit)) & 0x1) + +#define IS_VAL_IN_RANGE(val,upperLimit,lowerLimit) ((val) >= (lowerLimit) && (val) <= (upperLimit)) + +#define MSW(dword) (dword >> 16) +#define LSW(dword) (dword & 0xFFFF) +#define DIV_2_RND(mv) (((mv) + ((mv) > 0)) >> 1) +#define IS_NEG(Number) (((Number) < 0) ? 
1 : 0) + +#define ALIGN128(x) ((((x) + 127) >> 7) << 7) +#define ALIGN64(x) ((((x) + 63) >> 6) << 6) +#define ALIGN32(x) ((((x) + 31) >> 5) << 5) +#define ALIGN16(x) ((((x) + 15) >> 4) << 4) +#define ALIGN8(x) ((((x) + 7) >> 3) << 3) + + +#define RETURN_IF(cond, retval) if(cond) {return (retval);} +#define UNUSED(x) ((void)(x)) + + +#define ASSERT(x) assert(x) + + +#endif /* __IMPEG2_IT_MACROS_H__ */ diff --git a/common/impeg2_mem_func.c b/common/impeg2_mem_func.c new file mode 100644 index 0000000..9268c01 --- /dev/null +++ b/common/impeg2_mem_func.c @@ -0,0 +1,87 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* impeg2_utils.c +* +* @brief +* Contains utility function definitions for MPEG2 codec +* +* @author +* Harish +* +* @par List of Functions: +* - impeg2_memset0_16bit_8x8_linear_block() +* - impeg2_memset_8bit_8x8_block() +* +* @remarks +* None +* +******************************************************************************* +*/ + +#include <stdio.h> +#include <string.h> +#include "iv_datatypedef.h" +#include "impeg2_defs.h" + +/******************************************************************************* +* Function Name : impeg2_memset0_16bit_8x8_linear_block +* +* Description : memsets resudial buf to 0 +* +* Arguments : destination buffer +* +* Values Returned : None +*******************************************************************************/ + + +void impeg2_memset0_16bit_8x8_linear_block (WORD16 *pi2_buf) +{ + memset(pi2_buf,0,64 * sizeof(WORD16)); +} + + + +/******************************************************************************* +* Function Name : impeg2_memset_8bit_8x8_block +* +* Description : memsets residual buf to value +* +* Arguments : destination buffer, value and stride +* +* Values Returned : None +*******************************************************************************/ + + +void impeg2_memset_8bit_8x8_block(UWORD8 *pu1_dst, WORD32 u4_dc_val, WORD32 u4_dst_wd) +{ + WORD32 j; + + for(j = BLK_SIZE; j > 0; j--) + { + memset(pu1_dst, u4_dc_val, BLK_SIZE); + pu1_dst += u4_dst_wd; + } +} + + + diff --git a/common/impeg2_mem_func.h b/common/impeg2_mem_func.h new file mode 100644 index 0000000..f73702c --- /dev/null +++ b/common/impeg2_mem_func.h @@ -0,0 +1,41 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +#ifndef IMPEG2_MEM_FUNC_H_ +#define IMPEG2_MEM_FUNC_H_ + +typedef void pf_memset0_one_16bit_buf_t (WORD16 *buf); +typedef void pf_memset_8bit_t (UWORD8 *dst, WORD32 dc_val, WORD32 dst_wd); + +pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block; +pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block_a9q; + +pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block_sse42; + +pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block_av8; + +pf_memset_8bit_t impeg2_memset_8bit_8x8_block; +pf_memset_8bit_t impeg2_memset_8bit_8x8_block_a9q; + +pf_memset_8bit_t impeg2_memset_8bit_8x8_block_sse42; + +pf_memset_8bit_t impeg2_memset_8bit_8x8_block_av8; + +#endif /* IMPEG2_MEM_FUNC_H_ */ diff --git a/common/ithread.c b/common/ithread.c new file mode 100644 index 0000000..76fdad3 --- /dev/null +++ b/common/ithread.c @@ -0,0 +1,453 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : ithread.c */ +/* */ +/* Description : Contains abstraction for threads, mutex and semaphores*/ +/* */ +/* List of Functions : */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes */ +/* 07 09 2012 Harish Initial Version */ +/*****************************************************************************/ +/*****************************************************************************/ +/* File Includes */ +/*****************************************************************************/ +#include <string.h> +#include "iv_datatypedef.h" +#include "ithread.h" +#include <sys/types.h> + +#ifndef X86_MSVC +//#define PTHREAD_AFFINITY +//#define SYSCALL_AFFINITY + +#ifdef PTHREAD_AFFINITY +#define _GNU_SOURCE +#define __USE_GNU +#endif + +#include <pthread.h> +#include <sched.h> +#include <semaphore.h> +#include <unistd.h> + + +#endif +#if 0 +#include <sys/syscall.h> +#endif + +#ifdef X86_MSVC + +#include <windows.h> +#define SEM_MAX_COUNT 100 +#define SEM_INCREMENT_COUNT 1 + +UWORD32 ithread_get_handle_size(void) +{ + return (sizeof(HANDLE)); +} + +UWORD32 ithread_get_mutex_lock_size(void) +{ + return (sizeof(HANDLE)); +} + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument) +{ + HANDLE 
*ppv_thread_handle; + HANDLE thread_handle_value; + + if(0 == thread_handle) + return -1; + + ppv_thread_handle = (HANDLE *)thread_handle; + thread_handle_value = (void *)CreateThread + (NULL, /* Attributes */ + 1024*128, /* Stack size */ + (LPTHREAD_START_ROUTINE)strt, /* Thread function */ + argument, /* Parameters */ + 0, /* Creation flags */ + NULL); /* Thread ID */ + *ppv_thread_handle = (HANDLE)thread_handle_value; + + return 0; +} + +WORD32 ithread_join(void *thread_handle, void ** val_ptr) +{ + HANDLE *ppv_thread_handle; + HANDLE thread_handle_value; + + if(0 == thread_handle) + return -1; + + ppv_thread_handle = (HANDLE *)thread_handle; + thread_handle_value = *ppv_thread_handle; + + if(WAIT_OBJECT_0 == WaitForSingleObject(thread_handle_value, INFINITE)) + { + CloseHandle(thread_handle_value); + } + + return 0; +} + +void ithread_exit(void *thread_handle) +{ + HANDLE *ppv_thread_handle; + HANDLE thread_handle_value; + DWORD thread_exit_code; + + if(0 == thread_handle) + return; + + ppv_thread_handle = (HANDLE *)thread_handle; + thread_handle_value = *ppv_thread_handle; + /* Get exit code for thread. 
If the return value is 0, means thread is busy */ + if( 0 != GetExitCodeThread(thread_handle_value, &thread_exit_code)) + { + TerminateThread(thread_handle_value, thread_exit_code); + } + + return; +} + +WORD32 ithread_get_mutex_struct_size(void) +{ + return (sizeof(HANDLE)); +} + +WORD32 ithread_mutex_init(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = CreateSemaphore(NULL, 1, 1, NULL); + *ppv_mutex_handle = mutex_handle_value; + return 0; +} + +WORD32 ithread_mutex_destroy(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = *ppv_mutex_handle; + CloseHandle(mutex_handle_value); + return 0; +} + +WORD32 ithread_mutex_lock(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + DWORD result = 0; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = *ppv_mutex_handle; + result = WaitForSingleObject(mutex_handle_value, INFINITE); + + if(WAIT_OBJECT_0 == result) + return 0; + + return 1; + +} + +WORD32 ithread_mutex_unlock(void *mutex) +{ + HANDLE *ppv_mutex_handle; + HANDLE mutex_handle_value; + DWORD result = 0; + + if(0 == mutex) + return -1; + + ppv_mutex_handle = (HANDLE *)mutex; + mutex_handle_value = *ppv_mutex_handle; + result = ReleaseSemaphore(mutex_handle_value, 1, NULL); + + if(0 == result) + return -1; + + return 0; +} + +void ithread_yield(void) { } + +void ithread_usleep(UWORD32 u4_time_us) +{ + UWORD32 u4_time_ms = u4_time_us / 1000; + Sleep(u4_time_ms); +} + +void ithread_msleep(UWORD32 u4_time_ms) +{ + Sleep(u4_time_ms); +} + +void ithread_sleep(UWORD32 u4_time) +{ + UWORD32 u4_time_ms = u4_time * 1000; + Sleep(u4_time_ms); +} + +UWORD32 ithread_get_sem_struct_size(void) +{ + return (sizeof(HANDLE)); +} + +WORD32 ithread_sem_init(void *sem,WORD32 
pshared,UWORD32 value) +{ + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = CreateSemaphore(NULL, /* Security Attribute*/ + value, /* Initial count */ + SEM_MAX_COUNT,/* Max value */ + NULL); /* Name, not used */ + *sem_handle = sem_handle_value; + return 0; +} + +WORD32 ithread_sem_post(void *sem) +{ + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = *sem_handle; + + /* Post on Semaphore by releasing the lock on mutex */ + if(ReleaseSemaphore(sem_handle_value, SEM_INCREMENT_COUNT, NULL)) + return 0; + + return -1; +} + +WORD32 ithread_sem_wait(void *sem) +{ + DWORD result = 0; + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = *sem_handle; + + /* Wait on Semaphore object infinitly */ + result = WaitForSingleObject(sem_handle_value, INFINITE); + + /* If lock on semaphore is acquired, return SUCCESS */ + if(WAIT_OBJECT_0 == result) + return 0; + + /* If call timeouts, return FAILURE */ + if(WAIT_TIMEOUT == result) + return -1; + + return 0; +} + +WORD32 ithread_sem_destroy(void *sem) +{ + HANDLE *sem_handle = (HANDLE *)sem; + HANDLE sem_handle_value; + + if(0 == sem) + return -1; + + sem_handle_value = *sem_handle; + + if(FALSE == CloseHandle(sem_handle_value) ) + { + return -1; + } + return 0; +} + +WORD32 ithread_set_affinity(WORD32 core_id) +{ + return 1; +} + +#else +UWORD32 ithread_get_handle_size(void) +{ + return sizeof(pthread_t); +} + +UWORD32 ithread_get_mutex_lock_size(void) +{ + return sizeof(pthread_mutex_t); +} + + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument) +{ + ((void)(attribute)); + return pthread_create((pthread_t *)thread_handle, NULL,(void *(*)(void *)) strt, argument); +} + +WORD32 ithread_join(void *thread_handle, void ** val_ptr) +{ + pthread_t *pthread_handle = (pthread_t *)thread_handle; + 
((void)(val_ptr)); + return pthread_join(*pthread_handle, NULL); +} + +void ithread_exit(void *val_ptr) +{ +return pthread_exit(val_ptr); +} + +WORD32 ithread_get_mutex_struct_size(void) +{ + return(sizeof(pthread_mutex_t)); +} +WORD32 ithread_mutex_init(void *mutex) +{ + return pthread_mutex_init((pthread_mutex_t *) mutex, NULL); +} + +WORD32 ithread_mutex_destroy(void *mutex) +{ + return pthread_mutex_destroy((pthread_mutex_t *) mutex); +} + +WORD32 ithread_mutex_lock(void *mutex) +{ + return pthread_mutex_lock((pthread_mutex_t *)mutex); +} + +WORD32 ithread_mutex_unlock(void *mutex) +{ + return pthread_mutex_unlock((pthread_mutex_t *)mutex); +} + +void ithread_yield(void) +{ + sched_yield(); +} + +void ithread_sleep(UWORD32 u4_time) +{ + usleep(u4_time * 1000 * 1000); +} + +void ithread_msleep(UWORD32 u4_time_ms) +{ + usleep(u4_time_ms * 1000); +} + +void ithread_usleep(UWORD32 u4_time_us) +{ + usleep(u4_time_us); +} + +UWORD32 ithread_get_sem_struct_size(void) +{ + return(sizeof(sem_t)); +} + + +WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value) +{ + return sem_init((sem_t *)sem,pshared,value); +} + +WORD32 ithread_sem_post(void *sem) +{ + return sem_post((sem_t *)sem); +} + + +WORD32 ithread_sem_wait(void *sem) +{ + return sem_wait((sem_t *)sem); +} + + +WORD32 ithread_sem_destroy(void *sem) +{ +return sem_destroy((sem_t *)sem); +} + + +WORD32 ithread_set_affinity(WORD32 core_id) +{ +#ifdef PTHREAD_AFFINITY + cpu_set_t cpuset; + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + pthread_t cur_thread = pthread_self(); + + if (core_id >= num_cores) + return -1; + + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + + return pthread_setaffinity_np(cur_thread, sizeof(cpu_set_t), &cpuset); + +#elif SYSCALL_AFFINITY + WORD32 i4_sys_res; + + pid_t pid = gettid(); + + + i4_sys_res = syscall(__NR_sched_setaffinity, pid, sizeof(i4_mask), &i4_mask); + if (i4_sys_res) + { + //WORD32 err; + //err = errno; + //perror("Error in setaffinity syscall PERROR : "); + 
//LOG_ERROR("Error in the syscall setaffinity: mask=0x%x err=0x%x", i4_mask, i4_sys_res); + return -1; + } +#endif + ((void)(core_id)); + return 1; + +} +#endif diff --git a/common/ithread.h b/common/ithread.h new file mode 100644 index 0000000..eb75d20 --- /dev/null +++ b/common/ithread.h @@ -0,0 +1,80 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* ithread.h +* +* @brief +* This file contains all the necessary structure and enumeration +* definitions needed for the Application Program Interface(API) of the +* Thread Abstraction Layer +* +* @author +* Harish +* +* @remarks +* None +* +******************************************************************************* +*/ +#ifndef __ITHREAD_H__ +#define __ITHREAD_H__ + +UWORD32 ithread_get_handle_size(void); + +UWORD32 ithread_get_mutex_lock_size(void); + +WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument); + +void ithread_exit(void *val_ptr); + +WORD32 ithread_join(void *thread_id, void ** val_ptr); + +WORD32 ithread_get_mutex_struct_size(void); + +WORD32 ithread_mutex_init(void *mutex); + +WORD32 ithread_mutex_destroy(void *mutex); + +WORD32 ithread_mutex_lock(void *mutex); + +WORD32 ithread_mutex_unlock(void *mutex); + +void ithread_yield(void); + +void ithread_sleep(UWORD32 u4_time); + +void ithread_msleep(UWORD32 u4_time_ms); + +void ithread_usleep(UWORD32 u4_time_us); + +UWORD32 ithread_get_sem_struct_size(void); + +WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value); + +WORD32 ithread_sem_post(void *sem); + +WORD32 ithread_sem_wait(void *sem); + +WORD32 ithread_sem_destroy(void *sem); + +WORD32 ithread_set_affinity(WORD32 core_id); +#endif /* __ITHREAD_H__ */ diff --git a/common/iv.h b/common/iv.h new file mode 100644 index 0000000..3941497 --- /dev/null +++ b/common/iv.h @@ -0,0 +1,420 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +/** +******************************************************************************* +* @file +* iv.h +* +* @brief +* This file contains all the necessary structure and enumeration +* definitions needed for the Application Program Interface(API) of the +* Ittiam Video and Image codecs +* +* @author +* 100239(RCY) +* +* @par List of Functions: +* +* @remarks +* None +* +******************************************************************************* +*/ + + +#ifndef _IV_H +#define _IV_H + +/*****************************************************************************/ +/* Constant Macros */ +/*****************************************************************************/ + + +/*****************************************************************************/ +/* Typedefs */ +/*****************************************************************************/ + +/*****************************************************************************/ +/* Enums */ +/*****************************************************************************/ + + +/* IV_API_CALL_STATUS_T:This is only to return the FAIL/PASS status to the */ +/* application for the current API call */ + +typedef enum{ + IV_STATUS_NA = 0x7FFFFFFF, + IV_SUCCESS = 0x0, + IV_FAIL = 0x1, +}IV_API_CALL_STATUS_T; + +/* IV_MEM_TYPE_T: This Enumeration defines the type of memory (Internal/Ext */ +/* -ernal) along with the cacheable/non-cacheable 
attributes */ + +typedef enum { + IV_NA_MEM_TYPE = 0x7FFFFFFF, + IV_INTERNAL_CACHEABLE_PERSISTENT_MEM = 0x1, + IV_INTERNAL_CACHEABLE_SCRATCH_MEM = 0x2, + IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM = 0x3, + IV_EXTERNAL_CACHEABLE_SCRATCH_MEM = 0x4, + IV_INTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x5, + IV_INTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x6, + IV_EXTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x7, + IV_EXTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x8 +}IV_MEM_TYPE_T; + +/* IV_COLOR_FORMAT_T: This enumeration lists all the color formats which */ +/* finds usage in video/image codecs */ + +typedef enum { + IV_CHROMA_NA = 0x7FFFFFFF, + IV_YUV_420P = 0x1, + IV_YUV_422P = 0x2, + IV_420_UV_INTL = 0x3, + IV_YUV_422IBE = 0x4, + IV_YUV_422ILE = 0x5, + IV_YUV_444P = 0x6, + IV_YUV_411P = 0x7, + IV_GRAY = 0x8, + IV_RGB_565 = 0x9, + IV_RGB_24 = 0xa, + IV_YUV_420SP_UV = 0xb, + IV_YUV_420SP_VU = 0xc, + IV_RGBA_8888 = 0xd +}IV_COLOR_FORMAT_T; + +/* IV_PICTURE_CODING_TYPE_T: VOP/Frame coding type Enumeration */ + +typedef enum { + IV_NA_FRAME = 0x7FFFFFFF, + IV_I_FRAME = 0x0, + IV_P_FRAME = 0x1, + IV_B_FRAME = 0x2, + IV_IDR_FRAME = 0x3, + IV_II_FRAME = 0x4, + IV_IP_FRAME = 0x5, + IV_IB_FRAME = 0x6, + IV_PI_FRAME = 0x7, + IV_PP_FRAME = 0x8, + IV_PB_FRAME = 0x9, + IV_BI_FRAME = 0xa, + IV_BP_FRAME = 0xb, + IV_BB_FRAME = 0xc, + IV_MBAFF_I_FRAME = 0xd, + IV_MBAFF_P_FRAME = 0xe, + IV_MBAFF_B_FRAME = 0xf, + IV_MBAFF_IDR_FRAME = 0x10, + IV_NOT_CODED_FRAME = 0x11, + IV_FRAMETYPE_DEFAULT = IV_I_FRAME +}IV_PICTURE_CODING_TYPE_T; + +/* IV_FLD_TYPE_T: field type Enumeration */ + +typedef enum { + IV_NA_FLD = 0x7FFFFFFF, + IV_TOP_FLD = 0x0, + IV_BOT_FLD = 0x1, + IV_FLD_TYPE_DEFAULT = IV_TOP_FLD +}IV_FLD_TYPE_T; + +/* IV_CONTENT_TYPE_T: Video content type */ + +typedef enum { + IV_CONTENTTYPE_NA = 0x7FFFFFFF, + IV_PROGRESSIVE = 0x0, + IV_INTERLACED = 0x1, + IV_PROGRESSIVE_FRAME = 0x2, + IV_INTERLACED_FRAME = 0x3, + IV_INTERLACED_TOPFIELD = 0x4, + IV_INTERLACED_BOTTOMFIELD = 0x5, + IV_CONTENTTYPE_DEFAULT = IV_PROGRESSIVE, 
+}IV_CONTENT_TYPE_T; + +/* IV_API_COMMAND_TYPE_T:API command type */ +typedef enum { + IV_CMD_NA = 0x7FFFFFFF, + IV_CMD_GET_NUM_MEM_REC = 0x0, + IV_CMD_FILL_NUM_MEM_REC = 0x1, + IV_CMD_RETRIEVE_MEMREC = 0x2, + IV_CMD_INIT = 0x3, + IV_CMD_DUMMY_ELEMENT = 0x4, +}IV_API_COMMAND_TYPE_T; + +/*****************************************************************************/ +/* Structure */ +/*****************************************************************************/ + +/* IV_OBJ_T: This structure defines the handle for the codec instance */ + +typedef struct{ + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * Pointer to the API function pointer table of the codec + */ + void *pv_fxns; + + /** + * Pointer to the handle of the codec + */ + void *pv_codec_handle; +}iv_obj_t; + +/* iv_mem_rec_t: This structure defines the memory record holder which will */ +/* be used by the codec to communicate its memory requirements to the */ +/* application through appropriate API functions */ + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * Pointer to the memory allocated by the application + */ + void *pv_base; + + /** + * u4_size of the memory to be allocated + */ + UWORD32 u4_mem_size; + + /** + * Alignment of the memory pointer + */ + UWORD32 u4_mem_alignment; + /** + * Nature of the memory to be allocated + */ + IV_MEM_TYPE_T e_mem_type; +}iv_mem_rec_t; + +/* IV_YUV_BUF_T: This structure defines attributes for the yuv buffer */ + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * Pointer to Luma (Y) Buffer + */ + + void *pv_y_buf; + /** + * Pointer to Chroma (Cb) Buffer + */ + void *pv_u_buf; + + /** + * Pointer to Chroma (Cr) Buffer + */ + void *pv_v_buf; + + /** + * Width of the Luma (Y) Buffer + */ + UWORD32 u4_y_wd; + + /** + * Height of the Luma (Y) Buffer + */ + UWORD32 u4_y_ht; + + /** + * Stride/Pitch of the Luma (Y) Buffer + */ + UWORD32 u4_y_strd; + + /** + * Width of the 
Chroma (Cb) Buffer + */ + UWORD32 u4_u_wd; + + /** + * Height of the Chroma (Cb) Buffer + */ + UWORD32 u4_u_ht; + + /** + * Stride/Pitch of the Chroma (Cb) Buffer + */ + UWORD32 u4_u_strd; + + /** + * Width of the Chroma (Cr) Buffer + */ + UWORD32 u4_v_wd; + + /** + * Height of the Chroma (Cr) Buffer + */ + UWORD32 u4_v_ht; + + /** + * Stride/Pitch of the Chroma (Cr) Buffer + */ + UWORD32 u4_v_strd; +}iv_yuv_buf_t; + +/*****************************************************************************/ +/* Get Number of Memory Records */ +/*****************************************************************************/ + +/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_GET_NUM_MEM_REC */ + + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * cmd + */ + IV_API_COMMAND_TYPE_T e_cmd; +}iv_num_mem_rec_ip_t; + + +typedef struct{ + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * error code + */ + UWORD32 u4_error_code; + + /** + * num_mem_rec + */ + UWORD32 u4_num_mem_rec; +}iv_num_mem_rec_op_t; + + +/*****************************************************************************/ +/* Fill Memory Records */ +/*****************************************************************************/ + +/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_FILL_NUM_MEM_REC */ + + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * cmd + */ + IV_API_COMMAND_TYPE_T e_cmd; + + /** + * pointer to array of memrecords structures should be filled by codec + with details of memory resource requirements + */ + iv_mem_rec_t *pv_mem_rec_location; + + /** + * maximum width for which codec should request memory requirements + */ + UWORD32 u4_max_frm_wd; + + /** + * maximum height for which codec should request memory requirements + */ + UWORD32 u4_max_frm_ht; +}iv_fill_mem_rec_ip_t; + + +typedef struct{ + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * error_code + */ + UWORD32 u4_error_code; + + /** + * no of 
memory record structures which are filled by codec + */ + UWORD32 u4_num_mem_rec_filled; +}iv_fill_mem_rec_op_t; + + +/*****************************************************************************/ +/* Retrieve Memory Records */ +/*****************************************************************************/ + +/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_RETRIEVE_MEMREC */ + + + +typedef struct { + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * cmd + */ + IV_API_COMMAND_TYPE_T e_cmd; + + /** + * array of structures where codec should fill with all resources(memory) with it + */ + iv_mem_rec_t *pv_mem_rec_location; +}iv_retrieve_mem_rec_ip_t; + + +typedef struct{ + /** + * u4_size of the structure + */ + UWORD32 u4_size; + + /** + * error_code + */ + UWORD32 u4_error_code; + + /** + * no of memory records filled by codec + */ + UWORD32 u4_num_mem_rec_filled; +}iv_retrieve_mem_rec_op_t; + + + +#endif /* _IV_H */ + diff --git a/common/iv_datatypedef.h b/common/iv_datatypedef.h new file mode 100644 index 0000000..3c45942 --- /dev/null +++ b/common/iv_datatypedef.h @@ -0,0 +1,81 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +/*****************************************************************************/ +/* */ +/* File Name : datatypedef.h */ +/* */ +/* Description : This file contains all the necessary data type */ +/* definitions. */ +/* */ +/* List of Functions : None */ +/* */ +/* Issues / Problems : None */ +/* */ +/* Revision History : */ +/* */ +/* DD MM YYYY Author(s) Changes (Describe the changes made) */ +/* 29 12 2006 Rajendra C Y Draft */ +/* */ +/*****************************************************************************/ + +#ifndef __IV_DATATYPEDEF_H__ +#define __IV_DATATYPEDEF_H__ + +/*****************************************************************************/ +/* Typedefs */ +/*****************************************************************************/ + +typedef int WORD32; +typedef unsigned int UWORD32; + +typedef short WORD16; +typedef unsigned short UWORD16; + +typedef char WORD8; +typedef unsigned char UWORD8; + +typedef char CHAR; +#ifndef NULL +#define NULL ((void *)0) + +#endif + +typedef enum +{ + IT_FALSE, + IT_TRUE +} IT_BOOL; + + +typedef enum +{ + IT_OK, + IT_ERROR = -1 +} IT_STATUS; + +/*****************************************************************************/ +/* Input and Output Parameter identifiers */ +/*****************************************************************************/ +#define IT_IN +#define IT_OUT + + +#endif /* __IV_DATATYPEDEF_H__ */ + diff --git a/common/mips/impeg2_platform_macros.h b/common/mips/impeg2_platform_macros.h new file mode 100644 index 0000000..05ff6da --- /dev/null +++ b/common/mips/impeg2_platform_macros.h @@ -0,0 +1,49 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ +#ifndef __IMPEG2_PLATFORM_MACROS_H__ +#define __IMPEG2_PLATFORM_MACROS_H__ + + +#define CONV_LE_TO_BE(u4_temp2,u4_temp1) u4_temp2 = (u4_temp1 << 24) | \ + ((u4_temp1 & 0xff00) << 8) | \ + ((u4_temp1 & 0xff0000) >> 8) | \ + (u4_temp1 >> 24); +static __inline UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return (__builtin_clz(u4_word)); + else + return 32; +} + + +#define CLIP_U8(x) ((x) > 255) ? (255) : (((x) < 0) ? (0) : (x)) +#define CLIP_S8(x) ((x) > 127) ? (127) : (((x) < -128) ? (-128) : (x)) + +#define CLIP_U12(x) ((x) > 4095) ? (4095) : (((x) < 0) ? (0) : (x)) +#define CLIP_S12(x) ((x) > 2047) ? (2047) : (((x) < -2048) ? (-2048) : (x)) + +#define CLIP_U16(x) ((x) > 65535) ? (65535) : (((x) < 0) ? (0) : (x)) +#define CLIP_S16(x) ((x) > 65535) ? (65535) : (((x) < -65536) ? (-65536) : (x)) +#define PLD(x) + +#define INLINE + +#endif /* __IMPEG2_PLATFORM_MACROS_H__ */ diff --git a/common/x86/impeg2_idct_recon_sse42_intr.c b/common/x86/impeg2_idct_recon_sse42_intr.c new file mode 100755 index 0000000..4142032 --- /dev/null +++ b/common/x86/impeg2_idct_recon_sse42_intr.c @@ -0,0 +1,2205 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * impeg2_itrans_recon_x86_intr.c + * + * @brief + * Contains function definitions for inverse quantization, inverse + * transform and reconstruction + * + * @author + * 100470 + * 100592 (edited by) + * + * @par List of Functions: + * - impeg2_itrans_recon_8x8_sse42() + * + * @remarks + * None + * + ******************************************************************************* + */ +#include <stdio.h> +#include <string.h> +#include "iv_datatypedef.h" +#include "impeg2_macros.h" +#include "impeg2_defs.h" +#include "impeg2_globals.h" + +#include <immintrin.h> +#include <emmintrin.h> +#include <smmintrin.h> +#include <tmmintrin.h> + + +/** + ******************************************************************************* + * + * @brief + * This function performs inverse quantization, inverse transform and + * reconstruction for 8c8 input block + * + * @par Description: + * Performs inverse quantization , inverse transform and adds the + * prediction data and clips output to 8 bit + * + * @param[in] pi2_src + * Input 8x8 coefficients + * + * @param[in] pi2_tmp + * Temporary 8x8 buffer for storing inverse + * transform 1st stage output + * + * @param[in] pu1_pred + * Prediction 8x8 block + * + * @param[in] pi2_dequant_coeff + * Dequant Coeffs + * + * @param[out] pu1_dst + * Output 8x8 
block + * + * @param[in] src_strd + * Input stride + * + * @param[in] qp_div + * Quantization parameter / 6 + * + * @param[in] qp_rem + * Quantization parameter % 6 + * + * @param[in] pred_strd + * Prediction stride + * + * @param[in] dst_strd + * Output Stride + * + * @param[in] zero_cols + * Zero columns in pi2_src + * + * @returns Void + * + * @remarks + * None + * + ******************************************************************************* + */ + + +void impeg2_idct_recon_sse42(WORD16 *pi2_src, + WORD16 *pi2_tmp, + UWORD8 *pu1_pred, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + WORD32 zero_cols, + WORD32 zero_rows) +{ + __m128i m_temp_reg_0; + __m128i m_temp_reg_1; + __m128i m_temp_reg_2; + __m128i m_temp_reg_3; + __m128i m_temp_reg_5; + __m128i m_temp_reg_6; + __m128i m_temp_reg_7; + __m128i m_temp_reg_4; + __m128i m_temp_reg_10; + __m128i m_temp_reg_11; + __m128i m_temp_reg_12; + __m128i m_temp_reg_13; + __m128i m_temp_reg_14; + __m128i m_temp_reg_15; + __m128i m_temp_reg_16; + __m128i m_temp_reg_17; + __m128i m_temp_reg_20; + __m128i m_temp_reg_21; + __m128i m_temp_reg_22; + __m128i m_temp_reg_23; + __m128i m_temp_reg_24; + __m128i m_temp_reg_25; + __m128i m_temp_reg_26; + __m128i m_temp_reg_27; + __m128i m_temp_reg_30; + __m128i m_temp_reg_31; + __m128i m_temp_reg_32; + __m128i m_temp_reg_33; + __m128i m_temp_reg_34; + __m128i m_temp_reg_35; + __m128i m_temp_reg_36; + __m128i m_temp_reg_37; + __m128i m_temp_reg_40; + __m128i m_temp_reg_41; + __m128i m_temp_reg_42; + __m128i m_temp_reg_43; + __m128i m_temp_reg_44; + __m128i m_temp_reg_45; + __m128i m_temp_reg_46; + __m128i m_temp_reg_47; + __m128i m_temp_reg_50; + __m128i m_temp_reg_51; + __m128i m_temp_reg_52; + __m128i m_temp_reg_53; + __m128i m_temp_reg_54; + __m128i m_temp_reg_55; + __m128i m_temp_reg_56; + __m128i m_temp_reg_57; + __m128i m_temp_reg_60; + __m128i m_temp_reg_61; + __m128i m_temp_reg_62; + __m128i m_temp_reg_63; + __m128i m_temp_reg_64; + __m128i 
m_temp_reg_65; + __m128i m_temp_reg_66; + __m128i m_temp_reg_67; + __m128i m_temp_reg_70; + __m128i m_temp_reg_71; + __m128i m_temp_reg_72; + __m128i m_temp_reg_73; + __m128i m_temp_reg_74; + __m128i m_temp_reg_75; + __m128i m_temp_reg_76; + __m128i m_temp_reg_77; + __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; + + WORD32 check_row_stage_1; /* Lokesh */ + WORD32 check_row_stage_2; /* Lokesh */ + + __m128i m_rdng_factor; + WORD32 i4_shift = IDCT_STG1_SHIFT; + UNUSED(pi2_tmp); + check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0; + check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0; + + m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + + m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src); + pi2_src += src_strd; + m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src); + + if(!check_row_stage_2) + { + if(!check_row_stage_1) + { + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + //Interleaving 0,4 row in 0 , 1 Rishab + /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the 
registers m_temp_reg_16 and m_temp_reg_17 */ + /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ + { + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 + + /* Combining instructions to eliminate them based on zero_rows : Lokesh */ + //Interleaving 2,6 row in 4, 5 Rishab + m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + + + /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]); + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]); + + + + /* e */ + + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + + m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); + //o0:1B*89+3B*75,5B*50+7B*18 + m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + + + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of 
destination computed here */ + /* It is stored in m_temp_reg_57 */ + /* Upper 8 bytes of both registers are zero due to zero_cols*/ + + + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_setzero_si128(); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o1:1B*75-3B*18,5B*89+7B*50 + m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + /* Loading coeff for computing o2 in the next block */ + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]); + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + + + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o2:1B*50-3B*89,5B*18+7B*75 + m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i 
*)&gai2_impeg2_idct_odd_8_q15[6][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]); + + + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o3:1B*18-3B*50,5B*75-7B*89 + m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, 
m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + + m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_54 = _mm_setzero_si128(); + m_temp_reg_55 = _mm_setzero_si128(); + m_temp_reg_56 = _mm_setzero_si128(); + m_temp_reg_57 = _mm_setzero_si128(); + } + } + else + { + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + //Interleaving 0,4 row in 0 , 1 Rishab + /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ + { + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 + + /* Combining instructions to eliminate them based on zero_rows : Lokesh */ + //Interleaving 2,6 row in 4, 5 Rishab 
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + + + /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]); + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]); + + + + /* e */ + + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + + m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); + m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); + //o0:1B*89+3B*75,5B*50+7B*18 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + + m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + + + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in m_temp_reg_57 */ + /* Upper 8 bytes of both registers are zero due to zero_cols*/ + + + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_62 = 
_mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_setzero_si128(); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o1:1B*75-3B*18,5B*89+7B*50 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + /* Loading coeff for computing o2 in the next block */ + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]); + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); + + + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o2:1B*50-3B*89,5B*18+7B*75 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]); + + 
m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + //o3:1B*18-3B*50,5B*75-7B*89 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); + + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, 
m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + + m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_54 = _mm_setzero_si128(); + m_temp_reg_55 = _mm_setzero_si128(); + m_temp_reg_56 = _mm_setzero_si128(); + m_temp_reg_57 = _mm_setzero_si128(); + } + } + + /* Stage 2 */ + i4_shift = IDCT_STG2_SHIFT; + { + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]); + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + { + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); + m_temp_reg_1 = 
_mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); + + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + + /* Loading coeff for computing o0 in the next block */ + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]); + + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); + + + + /* e */ + + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); + m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); + + m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); + m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + //o0:1B*89+3B*75,1T*89+3T*75 + m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + /* Loading coeff for computing o1 in the next block */ + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]); + + + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in m_temp_reg_57 */ + + m_temp_reg_2 = 
_mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + //o1:1B*75-3B*18,1T*75-3T*18 + m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); + m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + + + /* Loading coeff for computing o2 in the next block */ + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]); + + + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + //o2:1B*50-3B*89,5T*18+7T*75. 
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]); + + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + //o3:1B*18-3B*50,1T*18-3T*50 + m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); + m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* 
Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + + m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); + m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); + + m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); + m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); + m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); + m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); + + m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); + m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); + m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); + m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); + m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, 
m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); + m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); + } + + /* Recon and store */ + { + m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); + + m_temp_reg_50 = _mm_setzero_si128(); + m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); + m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); + m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); + m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); + m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); + m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); + m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); + m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); + + m_temp_reg_50 = 
_mm_add_epi16(m_temp_reg_10, m_temp_reg_0); + m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); + m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); + m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); + m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); + m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); + m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); + m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); + + m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); + m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); + m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); + m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); + m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); + m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); + m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); + m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); + + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); + pu1_dst += dst_strd; + } + } + } + else + + { + + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + if(!check_row_stage_1) + { + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + //Interleaving 
0,4 row in 0 , 1 Rishab + /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + + + m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + { + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 + + /* Combining instructions to eliminate them based on zero_rows : Lokesh */ + //Interleaving 2,6 row in 4, 5 Rishab + m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); + m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + + m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); + m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); + + + + /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); + //m_coeff4 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[3][0]); + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); + //m_coeff2 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[1][0]); + + } + + /* e */ + { + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in 
m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); + m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); + + m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); + m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + + m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); + m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); + //o0:1B*89+3B*75,1T*89+3T*75 + m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + + } + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in m_temp_reg_57 */ + { + + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = 
_mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 + m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + /* Loading coeff for computing o2 in the next block */ + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); + + } + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o2:1B*50-3B*89,1T*50-3T*89 + m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); + + } + + /* Column 2 of destination computed here 
*/ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o3:1B*18-3B*50,1T*18-3T*50 + m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + } + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = 
_mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + + + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); + m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); + + m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, 
m_temp_reg_7); + m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); + } + } + else + { + + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + //Interleaving 0,4 row in 0 , 1 Rishab + /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + + + m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + { + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 + + /* Combining instructions to eliminate them based on zero_rows : Lokesh */ + //Interleaving 2,6 row in 4, 5 Rishab + m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); + m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + + m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); + m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); + + + + /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); + m_coeff4 = _mm_loadu_si128((__m128i 
*)&gai2_impeg2_idct_odd_8_q15[3][0]); + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]); + + } + + /* e */ + { + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); + m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); + + m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); + m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); + + } + + /* o */ + { + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + + m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); + m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); + m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); + m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); + //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); + m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); + + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + + m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); + } + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in 
m_temp_reg_57 */ + { + + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); + m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); + m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + /* Loading coeff for computing o2 in the next block */ + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]); + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); + m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); + } + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); + m_temp_reg_67 = 
_mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); + m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); + m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]); + + m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); + } + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + 
m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); + m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); + m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); + m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); + } + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + { + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); + m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); + + m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); + m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); + m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); + m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); + + m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); + m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); + m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); + m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); + } + } + + /* Transpose of 
the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + + + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); + m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); + + m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); + + m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); + m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); + } + } + /* Stage 2 */ + + i4_shift = IDCT_STG2_SHIFT; + + { + + /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ + /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ + { + m_coeff1 = 
_mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); + + m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + + + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]); + } + + + /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ + /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ + { + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); + + + m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); + m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); + + /* Loading coeff for computing o0 in the next block */ + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[1][0]); + + + m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); + m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); + } + + /* e */ + { + /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ + /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ + /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ + /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ + m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); + m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); + + m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); + m_temp_reg_46 = 
_mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); + + m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); + m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); + + m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); + m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); + + } + + /* o */ + { + m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57); + m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57); + + /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ + { + //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); + m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); + + m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); + m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); + /* Loading coeff for computing o1 in the next block */ + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[3][0]); + + m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); + } + + /* Column 0 of destination computed here */ + /* It is stored in m_temp_reg_50 */ + /* Column 7 of destination computed here */ + /* It is stored in m_temp_reg_57 */ + { + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + 
m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); + m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); + m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); + + m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ + + + /* Loading coeff for computing o2 in the next block */ + m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]); + m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[5][0]); + + m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); + m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); + } + + /* Column 1 of destination computed here */ + /* It is stored in m_temp_reg_51 */ + /* Column 6 of destination computed here */ + /* It is stored in m_temp_reg_56 */ + { + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 + m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, 
m_coeff1); + m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); + m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); + m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); + + m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ + + /* Loading coeff for computing o3 in the next block */ + + m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]); + m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[7][0]); + + m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); + m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); + } + + /* Column 2 of destination computed here */ + /* It is stored in m_temp_reg_52 */ + /* Column 5 of destination computed here */ + /* It is stored in m_temp_reg_55 */ + { + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); + m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); + + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); + m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); + + m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); + m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); + m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); + m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); + + m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); + m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); + m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); + m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); + + //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 + m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); + m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); + m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); + m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); + + m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); + m_temp_reg_55 = 
_mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); + + + + /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ + + + m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); + m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); + } + + /* Column 3 of destination computed here */ + /* It is stored in m_temp_reg_53 */ + /* Column 4 of destination computed here */ + /* It is stored in m_temp_reg_54 */ + { + m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); + m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); + + m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); + m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); + + m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); + m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); + m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); + m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); + + m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); + m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); + m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); + m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); + + m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); + m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); + } + } + + /* Transpose of the destination 8x8 matrix done here */ + /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ + /* respectively */ + { + m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); + m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); + m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); + m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); + m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); + 
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); + m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); + m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); + m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); + m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); + m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); + m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); + + m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); + m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); + m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); + } + + /* Recon and store */ + { + m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); + pu1_pred += pred_strd; + m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); + + + m_temp_reg_50 = _mm_setzero_si128(); + m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); + m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); + m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); + m_temp_reg_3 = 
_mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); + m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); + m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); + m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); + m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); + + m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); + m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); + m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); + m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); + m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); + m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); + m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); + m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); + + m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); + m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); + m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); + m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); + m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); + m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); + m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); + m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); + + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); + pu1_dst += dst_strd; + _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); + pu1_dst += dst_strd; + + } + + + } + + + } +} + +void 
impeg2_idct_recon_dc_mismatch_sse42(WORD16 *pi2_src, + WORD16 *pi2_tmp, + UWORD8 *pu1_pred, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + WORD32 zero_cols, + WORD32 zero_rows) +{ + WORD32 val; + __m128i value_4x32b, mismatch_stg2_additive; + __m128i pred_r, pred_half0, pred_half1; + __m128i temp0, temp1; + __m128i round_stg2 = _mm_set1_epi32(IDCT_STG2_ROUND); + + UNUSED(pi2_tmp); + UNUSED(src_strd); + UNUSED(zero_cols); + UNUSED(zero_rows); + + val = pi2_src[0] * gai2_impeg2_idct_q15[0]; + val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT); + val *= gai2_impeg2_idct_q11[0]; + value_4x32b = _mm_set1_epi32(val); + + // Row 0 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) gai2_impeg2_mismatch_stg2_additive); + pred_r = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)pu1_dst, temp0); + + // Row 1 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 8)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = 
_mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp0); + + // Row 2 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 16)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 2 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), temp0); + + // Row 3 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 24)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 3 * pred_strd)); + pred_r = 
_mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), temp0); + + // Row 4 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 32)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 4 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), temp0); + + // Row 5 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) 
(gai2_impeg2_mismatch_stg2_additive + 40)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 5 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), temp0); + + // Row 6 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 48)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 6 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i 
*)(pu1_dst + 6 * dst_strd), temp0); + + // Row 7 processing + mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 56)); + pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 7 * pred_strd)); + pred_r = _mm_cvtepu8_epi16(pred_r); + temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); + pred_half0 = _mm_cvtepu16_epi32(pred_r); + temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); + + pred_r = _mm_srli_si128(pred_r, 8); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp0 = _mm_add_epi32(temp0, round_stg2); + temp1 = _mm_add_epi32(temp1, round_stg2); + pred_half1 = _mm_cvtepu16_epi32(pred_r); + temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); + temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); + temp0 = _mm_add_epi32(temp0, pred_half0); + temp1 = _mm_add_epi32(temp1, pred_half1); + + temp0 = _mm_packus_epi32(temp0, temp1); + temp0 = _mm_packus_epi16(temp0, temp1); + + _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), temp0); +} + +void impeg2_idct_recon_dc_sse42(WORD16 *pi2_src, + WORD16 *pi2_tmp, + UWORD8 *pu1_pred, + UWORD8 *pu1_dst, + WORD32 src_strd, + WORD32 pred_strd, + WORD32 dst_strd, + WORD32 zero_cols, + WORD32 zero_rows) +{ + WORD32 val; + __m128i value_4x32b, pred_r0, pred_r1, temp0, temp1, temp2, temp3; + + UNUSED(pi2_tmp); + UNUSED(src_strd); + UNUSED(zero_cols); + UNUSED(zero_rows); + + val = pi2_src[0] * gai2_impeg2_idct_q15[0]; + val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT); + val = val * gai2_impeg2_idct_q11[0]; + val = ((val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT); + + value_4x32b = _mm_set1_epi32(val); + + //Row 0-1 processing + pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r0 = _mm_cvtepu8_epi16(pred_r0); + pred_r1 = _mm_cvtepu8_epi16(pred_r1); + + temp0 = _mm_cvtepu16_epi32(pred_r0); + pred_r0 = 
_mm_srli_si128(pred_r0, 8); + temp2 = _mm_cvtepu16_epi32(pred_r1); + pred_r1 = _mm_srli_si128(pred_r1, 8); + temp1 = _mm_cvtepu16_epi32(pred_r0); + temp3 = _mm_cvtepu16_epi32(pred_r1); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp2 = _mm_add_epi32(temp2, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp3 = _mm_add_epi32(temp3, value_4x32b); + temp0 = _mm_packus_epi32(temp0, temp1); + temp2 = _mm_packus_epi32(temp2, temp3); + temp0 = _mm_packus_epi16(temp0, temp1); + temp2 = _mm_packus_epi16(temp2, temp3); + _mm_storel_epi64((__m128i *)(pu1_dst), temp0); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); + + //Row 2-3 processing + pu1_pred += 2 * pred_strd; + pu1_dst += 2 * dst_strd; + + pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r0 = _mm_cvtepu8_epi16(pred_r0); + pred_r1 = _mm_cvtepu8_epi16(pred_r1); + + temp0 = _mm_cvtepu16_epi32(pred_r0); + pred_r0 = _mm_srli_si128(pred_r0, 8); + temp2 = _mm_cvtepu16_epi32(pred_r1); + pred_r1 = _mm_srli_si128(pred_r1, 8); + temp1 = _mm_cvtepu16_epi32(pred_r0); + temp3 = _mm_cvtepu16_epi32(pred_r1); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp2 = _mm_add_epi32(temp2, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp3 = _mm_add_epi32(temp3, value_4x32b); + temp0 = _mm_packus_epi32(temp0, temp1); + temp2 = _mm_packus_epi32(temp2, temp3); + temp0 = _mm_packus_epi16(temp0, temp1); + temp2 = _mm_packus_epi16(temp2, temp3); + _mm_storel_epi64((__m128i *)(pu1_dst), temp0); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); + + //Row 4-5 processing + pu1_pred += 2 * pred_strd; + pu1_dst += 2 * dst_strd; + + pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r0 = _mm_cvtepu8_epi16(pred_r0); + pred_r1 = _mm_cvtepu8_epi16(pred_r1); + + temp0 = _mm_cvtepu16_epi32(pred_r0); + pred_r0 = _mm_srli_si128(pred_r0, 8); + temp2 = 
_mm_cvtepu16_epi32(pred_r1); + pred_r1 = _mm_srli_si128(pred_r1, 8); + temp1 = _mm_cvtepu16_epi32(pred_r0); + temp3 = _mm_cvtepu16_epi32(pred_r1); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp2 = _mm_add_epi32(temp2, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp3 = _mm_add_epi32(temp3, value_4x32b); + temp0 = _mm_packus_epi32(temp0, temp1); + temp2 = _mm_packus_epi32(temp2, temp3); + temp0 = _mm_packus_epi16(temp0, temp1); + temp2 = _mm_packus_epi16(temp2, temp3); + _mm_storel_epi64((__m128i *)(pu1_dst), temp0); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); + + //Row 6-7 processing + pu1_pred += 2 * pred_strd; + pu1_dst += 2 * dst_strd; + + pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); + pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); + pred_r0 = _mm_cvtepu8_epi16(pred_r0); + pred_r1 = _mm_cvtepu8_epi16(pred_r1); + + temp0 = _mm_cvtepu16_epi32(pred_r0); + pred_r0 = _mm_srli_si128(pred_r0, 8); + temp2 = _mm_cvtepu16_epi32(pred_r1); + pred_r1 = _mm_srli_si128(pred_r1, 8); + temp1 = _mm_cvtepu16_epi32(pred_r0); + temp3 = _mm_cvtepu16_epi32(pred_r1); + + temp0 = _mm_add_epi32(temp0, value_4x32b); + temp2 = _mm_add_epi32(temp2, value_4x32b); + temp1 = _mm_add_epi32(temp1, value_4x32b); + temp3 = _mm_add_epi32(temp3, value_4x32b); + temp0 = _mm_packus_epi32(temp0, temp1); + temp2 = _mm_packus_epi32(temp2, temp3); + temp0 = _mm_packus_epi16(temp0, temp1); + temp2 = _mm_packus_epi16(temp2, temp3); + _mm_storel_epi64((__m128i *)(pu1_dst), temp0); + _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); +} diff --git a/common/x86/impeg2_inter_pred_sse42_intr.c b/common/x86/impeg2_inter_pred_sse42_intr.c new file mode 100644 index 0000000..4599afa --- /dev/null +++ b/common/x86/impeg2_inter_pred_sse42_intr.c @@ -0,0 +1,899 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 
2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * impeg2_inter_pred_sse42_intr.c + * + * @brief + * Contains Motion compensation function definitions for MPEG2 decoder + * + * @author + * Mohit [100664] + * + * - impeg2_copy_mb_sse42() + * - impeg2_interpolate_sse42() + * - impeg2_mc_halfx_halfy_8x8_sse42() + * - impeg2_mc_halfx_fully_8x8_sse42() + * - impeg2_mc_fullx_halfy_8x8_sse42() + * - impeg2_mc_fullx_fully_8x8_sse42() + * + * @remarks + * None + * + ******************************************************************************* + */ +#include <stdio.h> +#include <string.h> +#include "iv_datatypedef.h" +#include "impeg2_macros.h" +#include "impeg2_defs.h" +#include "impeg2_inter_pred.h" + +#include <immintrin.h> +#include <emmintrin.h> +#include <smmintrin.h> +#include <tmmintrin.h> + +/******************************************************************************* +* Function Name : impeg2_copy_mb +* +* Description : copies 3 components to the frame from mc_buf +* +* Arguments : +* src_buf : Source Buffer +* dst_buf : Destination Buffer +* src_wd : Source Width +* dst_wd : destination Width +* +* Values Returned : None +*******************************************************************************/ +void impeg2_copy_mb_sse42(yuv_buf_t 
*src_buf, + yuv_buf_t *dst_buf, + UWORD32 src_wd, + UWORD32 dst_wd) +{ + UWORD8 *src; + UWORD8 *dst; + __m128i src_r0, src_r1, src_r2, src_r3; + + /*******************************************************/ + /* copy Y */ + /*******************************************************/ + src = src_buf->pu1_y; + dst = dst_buf->pu1_y; + // Row 0-3 + src_r0 = _mm_loadu_si128((__m128i *) (src)); + src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); + src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); + src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); + + _mm_storeu_si128((__m128i *) dst, src_r0); + _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); + + // Row 4-7 + src += 4 * src_wd; + dst += 4 * dst_wd; + src_r0 = _mm_loadu_si128((__m128i *) (src)); + src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); + src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); + src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); + + _mm_storeu_si128((__m128i *) dst, src_r0); + _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); + + // Row 8-11 + src += 4 * src_wd; + dst += 4 * dst_wd; + src_r0 = _mm_loadu_si128((__m128i *) (src)); + src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); + src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); + src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); + + _mm_storeu_si128((__m128i *) dst, src_r0); + _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); + + // Row 12-15 + src += 4 * src_wd; + dst += 4 * dst_wd; + src_r0 = _mm_loadu_si128((__m128i *) (src)); + src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); + src_r2 = _mm_loadu_si128((__m128i *) (src 
+ 2 * src_wd)); + src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); + + _mm_storeu_si128((__m128i *) dst, src_r0); + _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); + + src_wd >>= 1; + dst_wd >>= 1; + + /*******************************************************/ + /* copy U */ + /*******************************************************/ + src = src_buf->pu1_u; + dst = dst_buf->pu1_u; + + // Row 0-3 + src_r0 = _mm_loadl_epi64((__m128i *)src); + src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); + src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); + src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); + + _mm_storel_epi64((__m128i *)dst, src_r0); + _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); + _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); + + // Row 4-7 + src += 4 * src_wd; + dst += 4 * dst_wd; + + src_r0 = _mm_loadl_epi64((__m128i *)src); + src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); + src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); + src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); + + _mm_storel_epi64((__m128i *)dst, src_r0); + _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); + _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); + + /*******************************************************/ + /* copy V */ + /*******************************************************/ + src = src_buf->pu1_v; + dst = dst_buf->pu1_v; + // Row 0-3 + src_r0 = _mm_loadl_epi64((__m128i *)src); + src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); + src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); + src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); + + _mm_storel_epi64((__m128i *)dst, src_r0); + _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); + 
_mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); + _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); + + // Row 4-7 + src += 4 * src_wd; + dst += 4 * dst_wd; + + src_r0 = _mm_loadl_epi64((__m128i *)src); + src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); + src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); + src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); + + _mm_storel_epi64((__m128i *)dst, src_r0); + _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); + _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_interpolate */ +/* */ +/* Description : averages the contents of buf_src1 and buf_src2 and stores*/ +/* result in buf_dst */ +/* */ +/* Inputs : buf_src1 - First Source */ +/* buf_src2 - Second Source */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Avg the values from two sources and store the result in */ +/* destination buffer */ +/* */ +/* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */ +/* */ +/* Returns : None */ +/* */ +/* Issues : Assumes that all 3 buffers are of same size */ +/* */ +/*****************************************************************************/ +void impeg2_interpolate_sse42(yuv_buf_t *buf_src1, + yuv_buf_t *buf_src2, + yuv_buf_t *buf_dst, + UWORD32 stride) +{ + UWORD8 *src1, *src2; + UWORD8 *dst; + __m128i src1_r0, src1_r1, src1_r2, src1_r3; + __m128i src2_r0, src2_r1, src2_r2, src2_r3; + + /*******************************************************/ + /* interpolate Y */ + /*******************************************************/ + src1 = buf_src1->pu1_y; + src2 = buf_src2->pu1_y; + dst = buf_dst->pu1_y; + // Row 0-3 + src1_r0 = _mm_loadu_si128((__m128i *) (src1)); + src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); + src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); + src1_r3 = 
_mm_loadu_si128((__m128i *) (src1 + 3 * 16)); + + src2_r0 = _mm_loadu_si128((__m128i *) (src2)); + src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); + src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); + src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storeu_si128((__m128i *) dst, src1_r0); + _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 4-7 + src1 += 4 * 16; + src2 += 4 * 16; + dst += 4 * stride; + src1_r0 = _mm_loadu_si128((__m128i *) (src1)); + src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); + src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); + src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); + + src2_r0 = _mm_loadu_si128((__m128i *) (src2)); + src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); + src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); + src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storeu_si128((__m128i *) dst, src1_r0); + _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 8-11 + src1 += 4 * 16; + src2 += 4 * 16; + dst += 4 * stride; + src1_r0 = _mm_loadu_si128((__m128i *) (src1)); + src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); + src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); + src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); + + src2_r0 = _mm_loadu_si128((__m128i *) (src2)); + src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); + 
src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); + src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storeu_si128((__m128i *) dst, src1_r0); + _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 12-15 + src1 += 4 * 16; + src2 += 4 * 16; + dst += 4 * stride; + src1_r0 = _mm_loadu_si128((__m128i *) (src1)); + src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); + src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); + src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); + + src2_r0 = _mm_loadu_si128((__m128i *) (src2)); + src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); + src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); + src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storeu_si128((__m128i *) dst, src1_r0); + _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); + _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); + + stride >>= 1; + + /*******************************************************/ + /* interpolate U */ + /*******************************************************/ + src1 = buf_src1->pu1_u; + src2 = buf_src2->pu1_u; + dst = buf_dst->pu1_u; + // Row 0-3 + src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); + src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); + src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); + src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); + + src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); + src2_r1 = 
_mm_loadl_epi64((__m128i *) (src2 + 8)); + src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); + src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storel_epi64((__m128i *) dst, src1_r0); + _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); + _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 4-7 + src1 += 4 * 8; + src2 += 4 * 8; + dst += 4 * stride; + + src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); + src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); + src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); + src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); + + src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); + src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); + src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); + src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storel_epi64((__m128i *) dst, src1_r0); + _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); + _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); + + /*******************************************************/ + /* interpolate V */ + /*******************************************************/ + src1 = buf_src1->pu1_v; + src2 = buf_src2->pu1_v; + dst = buf_dst->pu1_v; + + // Row 0-3 + src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); + src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); + src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); + src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); + + src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); + src2_r1 = 
_mm_loadl_epi64((__m128i *) (src2 + 8)); + src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); + src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storel_epi64((__m128i *) dst, src1_r0); + _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); + _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); + + // Row 4-7 + src1 += 4 * 8; + src2 += 4 * 8; + dst += 4 * stride; + + src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); + src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); + src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); + src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); + + src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); + src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); + src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); + src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); + + src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); + src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); + src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); + src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); + + _mm_storel_epi64((__m128i *) dst, src1_r0); + _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); + _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); + _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_halfx_halfy_8x8_sse42() */ +/* */ +/* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */ +/* the ref frame.Interpolate these four values to get the */ +/* value at(0.5,0.5).Repeat this to get an 8 x 8 block */ +/* using 9 x 9 block from reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/*****************************************************************************/ +void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out, + UWORD8 *ref, + UWORD32 ref_wid, + UWORD32 out_wid) +{ + UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 P1 + Q + P2 P3 + */ + __m128i src_r0, src_r0_1, src_r1, src_r1_1; + __m128i tmp0, tmp1; + __m128i value_2 = _mm_set1_epi16(2); + + ref_p0 = ref; + ref_p1 = ref + 1; + ref_p2 = ref + ref_wid; + ref_p3 = ref + ref_wid + 1; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 1 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + src_r1 = _mm_cvtepu8_epi16(src_r1); + src_r1_1 = _mm_cvtepu8_epi16(src_r1_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 0 horizontal interpolation + tmp1 = _mm_add_epi16(src_r1, src_r1_1); //Row 1 horizontal interpolation + tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 0 vertical interpolation + tmp0 = _mm_add_epi16(tmp0, value_2); + tmp0 = _mm_srli_epi16(tmp0, 2); + tmp0 = _mm_packus_epi16(tmp0, value_2); + + _mm_storel_epi64((__m128i *)out, tmp0); + + //Row 1 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) 
(ref_p2)); //Row 2 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 2 horizontal interpolation + tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 1 vertical interpolation + tmp1 = _mm_add_epi16(tmp1, value_2); + tmp1 = _mm_srli_epi16(tmp1, 2); + tmp1 = _mm_packus_epi16(tmp1, value_2); + + _mm_storel_epi64((__m128i *)out, tmp1); + + //Row 2 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 3 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 3 horizontal interpolation + + tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 2 vertical interpolation + tmp0 = _mm_add_epi16(tmp0, value_2); + tmp0 = _mm_srli_epi16(tmp0, 2); + tmp0 = _mm_packus_epi16(tmp0, value_2); + + _mm_storel_epi64((__m128i *)out, tmp0); + + //Row 3 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 4 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 4 horizontal interpolation + + tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 3 vertical interpolation + tmp1 = _mm_add_epi16(tmp1, value_2); + tmp1 = _mm_srli_epi16(tmp1, 2); + tmp1 = _mm_packus_epi16(tmp1, value_2); + + _mm_storel_epi64((__m128i *)out, tmp1); + + //Row 4 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 5 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 5 horizontal interpolation + + tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 4 vertical 
interpolation + tmp0 = _mm_add_epi16(tmp0, value_2); + tmp0 = _mm_srli_epi16(tmp0, 2); + tmp0 = _mm_packus_epi16(tmp0, value_2); + + _mm_storel_epi64((__m128i *)out, tmp0); + + //Row 5 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 6 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 6 horizontal interpolation + + tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 5 vertical interpolation + tmp1 = _mm_add_epi16(tmp1, value_2); + tmp1 = _mm_srli_epi16(tmp1, 2); + tmp1 = _mm_packus_epi16(tmp1, value_2); + + _mm_storel_epi64((__m128i *)out, tmp1); + + //Row 6 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 7 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 7 horizontal interpolation + + tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 6 vertical interpolation + tmp0 = _mm_add_epi16(tmp0, value_2); + tmp0 = _mm_srli_epi16(tmp0, 2); + tmp0 = _mm_packus_epi16(tmp0, value_2); + + _mm_storel_epi64((__m128i *)out, tmp0); + + //Row 7 + ref_p2 += ref_wid; + ref_p3 += ref_wid; + out += out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 8 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); + + src_r0 = _mm_cvtepu8_epi16(src_r0); + src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); + + tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 8 horizontal interpolation + + tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 7 vertical interpolation + tmp1 = _mm_add_epi16(tmp1, value_2); + tmp1 = _mm_srli_epi16(tmp1, 2); + tmp1 = _mm_packus_epi16(tmp1, value_2); + + _mm_storel_epi64((__m128i *)out, tmp1); + + return; +} + +/*****************************************************************************/ +/* */ +/* 
Function Name : impeg2_mc_halfx_fully_8x8_sse42() */ +/* */ +/* Description : Gets the buffer from (0.5,0) to (8.5,8) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. */ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) and (1,0) position in the ref frame */ +/* Interpolate these two values to get the value at(0.5,0) */ +/* Repeat this to get an 8 x 8 block using 9 x 8 block from */ +/* reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/*****************************************************************************/ +void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out, + UWORD8 *ref, + UWORD32 ref_wid, + UWORD32 out_wid) +{ + UWORD8 *ref_p0,*ref_p1; + __m128i src_r0, src_r0_1, src_r1, src_r1_1; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 Q P1 + */ + + ref_p0 = ref; + ref_p1 = ref + 1; + + // Row 0 and 1 + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 1 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); + + src_r0 = _mm_avg_epu8(src_r0, src_r0_1); + src_r1 = _mm_avg_epu8(src_r1, src_r1_1); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + + // Row 2 and 3 + ref_p0 += 2*ref_wid; + ref_p1 += 2*ref_wid; + out += 2*out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 2 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 
+ ref_wid)); //Row 3 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); + + src_r0 = _mm_avg_epu8(src_r0, src_r0_1); + src_r1 = _mm_avg_epu8(src_r1, src_r1_1); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + + // Row 4 and 5 + ref_p0 += 2*ref_wid; + ref_p1 += 2*ref_wid; + out += 2*out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 4 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 5 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); + + src_r0 = _mm_avg_epu8(src_r0, src_r0_1); + src_r1 = _mm_avg_epu8(src_r1, src_r1_1); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + + // Row 6 and 7 + ref_p0 += 2*ref_wid; + ref_p1 += 2*ref_wid; + out += 2*out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 6 + src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); + src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 7 + src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); + + src_r0 = _mm_avg_epu8(src_r0, src_r0_1); + src_r1 = _mm_avg_epu8(src_r1, src_r1_1); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + + return; +} + + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_fullx_halfy_8x8_sse42() */ +/* */ +/* Description : Gets the buffer from (0,0.5) to (8,8.5) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. 
*/ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) and (0,1) position in the ref frame */ +/* Interpolate these two values to get the value at(0,0.5) */ +/* Repeat this to get an 8 x 8 block using 8 x 9 block from */ +/* reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/*****************************************************************************/ +void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out, + UWORD8 *ref, + UWORD32 ref_wid, + UWORD32 out_wid) +{ + __m128i src_r0, src_r1, src_r2, temp0, temp1; + /* P0-P3 are the pixels in the reference frame and Q is the value being */ + /* estimated */ + /* + P0 + x + P1 + */ + src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 0 + src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 1 + src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); //Row 2 + temp0 = _mm_avg_epu8(src_r0, src_r1); + temp1 = _mm_avg_epu8(src_r1, src_r2); + _mm_storel_epi64((__m128i *)out, temp0); //Row 0 + _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 1 + + ref+= 3*ref_wid; + out+= 2*out_wid; + + src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 3 + src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 4 + temp0 = _mm_avg_epu8(src_r2, src_r0); + temp1 = _mm_avg_epu8(src_r0, src_r1); + _mm_storel_epi64((__m128i *)out, temp0); //Row 2 + _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 3 + + ref += 2*ref_wid; + out+= 2*out_wid; + + src_r2 = _mm_loadl_epi64((__m128i *)ref); //Row 5 + src_r0 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 6 + temp0 = _mm_avg_epu8(src_r1, src_r2); + temp1 = _mm_avg_epu8(src_r2, src_r0); + _mm_storel_epi64((__m128i *)out, temp0); //Row 4 + _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 5 + + 
ref += 2*ref_wid; + out+= 2*out_wid; + + src_r1 = _mm_loadl_epi64((__m128i *)ref); //Row 7 + src_r2 = _mm_loadl_epi64((__m128i *) (ref + ref_wid)); //Row 8 + temp0 = _mm_avg_epu8(src_r0, src_r1); + temp1 = _mm_avg_epu8(src_r1, src_r2); + _mm_storel_epi64((__m128i *)out, temp0); //Row 6 + _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 7 + + return; +} + +/*****************************************************************************/ +/* */ +/* Function Name : impeg2_mc_fullx_fully_8x8_sse42() */ +/* */ +/* Description : Gets the buffer from (x,y) to (x+8,y+8) */ +/* and the above block of size 8 x 8 will be placed as a */ +/* block from the current position of out_buf */ +/* */ +/* Inputs : ref - Reference frame from which the block will be */ +/* block will be extracted. */ +/* ref_wid - WIdth of reference frame */ +/* out_wid - WIdth of the output frame */ +/* blk_width - width of the block */ +/* blk_width - height of the block */ +/* */ +/* Globals : None */ +/* */ +/* Processing : Point to the (0,0) position in the ref frame */ +/* Get an 8 x 8 block from reference frame */ +/* */ +/* Outputs : out - Output containing the extracted block */ +/* */ +/* Returns : None */ +/* */ +/* Issues : None */ +/* */ +/*****************************************************************************/ +void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out, + UWORD8 *ref, + UWORD32 ref_wid, + UWORD32 out_wid) +{ + __m128i src_r0, src_r1, src_r2, src_r3; + // Row 0-3 + src_r0 = _mm_loadl_epi64((__m128i *)ref); + src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); + src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); + src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); + _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); + + // Row 4-7 + ref += 4 * ref_wid; + out += 4 * out_wid; + + src_r0 = 
_mm_loadl_epi64((__m128i *)ref); + src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); + src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); + src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); + + _mm_storel_epi64((__m128i *)out, src_r0); + _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); + _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); + _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); + return; +} diff --git a/common/x86/impeg2_mem_func_sse42_intr.c b/common/x86/impeg2_mem_func_sse42_intr.c new file mode 100644 index 0000000..de7de8f --- /dev/null +++ b/common/x86/impeg2_mem_func_sse42_intr.c @@ -0,0 +1,100 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ + +/** + ******************************************************************************* + * @file + * impeg2_mem_func_sse42_intr.c + * + * @brief + * Contains utility function definitions for MPEG2 codec + * + * @author + * Mohit [100664] + * +* @par List of Functions: +* - impeg2_memset0_16bit_8x8_linear_block_sse42() +* - impeg2_memset_8bit_8x8_block_sse42() + * + * @remarks + * None + * + ******************************************************************************* + */ +#include <stdio.h> +#include <string.h> +#include "iv_datatypedef.h" +#include "impeg2_defs.h" + +#include <immintrin.h> +#include <emmintrin.h> +#include <smmintrin.h> +#include <tmmintrin.h> + +/******************************************************************************* +* Function Name : impeg2_memset0_16bit_8x8_linear_block +* +* Description : memsets resudial buf to 0 +* +* Arguments : destination buffer +* +* Values Returned : None +*******************************************************************************/ + + +void impeg2_memset0_16bit_8x8_linear_block_sse42 (WORD16 *buf) + { + __m128i zero_8x8_16b = _mm_set1_epi16(0); + _mm_storeu_si128((__m128i *) buf, zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 8), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 16), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 24), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 32), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 40), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 48), zero_8x8_16b); + _mm_storeu_si128((__m128i *) (buf + 56), zero_8x8_16b); +} + + + +/******************************************************************************* +* Function Name : impeg2_memset_8bit_8x8_block +* +* Description : memsets residual buf to value +* +* Arguments : destination buffer, value and stride +* +* Values Returned : None +*******************************************************************************/ + + +void 
impeg2_memset_8bit_8x8_block_sse42(UWORD8 *dst, WORD32 dc_val, WORD32 dst_wd) +{ + __m128i value = _mm_set1_epi8((WORD8)dc_val); + + _mm_storel_epi64((__m128i *)dst, value); + _mm_storel_epi64((__m128i *) (dst + dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 2 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 3 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 4 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 5 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 6 * dst_wd), value); + _mm_storel_epi64((__m128i *) (dst + 7 * dst_wd), value); +} diff --git a/common/x86/impeg2_platform_macros.h b/common/x86/impeg2_platform_macros.h new file mode 100644 index 0000000..05ff6da --- /dev/null +++ b/common/x86/impeg2_platform_macros.h @@ -0,0 +1,49 @@ +/****************************************************************************** + * + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ***************************************************************************** + * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore +*/ +#ifndef __IMPEG2_PLATFORM_MACROS_H__ +#define __IMPEG2_PLATFORM_MACROS_H__ + + +#define CONV_LE_TO_BE(u4_temp2,u4_temp1) u4_temp2 = (u4_temp1 << 24) | \ + ((u4_temp1 & 0xff00) << 8) | \ + ((u4_temp1 & 0xff0000) >> 8) | \ + (u4_temp1 >> 24); +static __inline UWORD32 CLZ(UWORD32 u4_word) +{ + if(u4_word) + return (__builtin_clz(u4_word)); + else + return 32; +} + + +#define CLIP_U8(x) ((x) > 255) ? (255) : (((x) < 0) ? (0) : (x)) +#define CLIP_S8(x) ((x) > 127) ? (127) : (((x) < -128) ? (-128) : (x)) + +#define CLIP_U12(x) ((x) > 4095) ? (4095) : (((x) < 0) ? (0) : (x)) +#define CLIP_S12(x) ((x) > 2047) ? (2047) : (((x) < -2048) ? (-2048) : (x)) + +#define CLIP_U16(x) ((x) > 65535) ? (65535) : (((x) < 0) ? (0) : (x)) +#define CLIP_S16(x) ((x) > 65535) ? (65535) : (((x) < -65536) ? (-65536) : (x)) +#define PLD(x) + +#define INLINE + +#endif /* __IMPEG2_PLATFORM_MACROS_H__ */ |