summaryrefslogtreecommitdiffstats
path: root/common
diff options
context:
space:
mode:
Diffstat (limited to 'common')
-rw-r--r--common/arm/impeg2_format_conv.s391
-rw-r--r--common/arm/impeg2_idct.s1204
-rw-r--r--common/arm/impeg2_inter_pred.s801
-rwxr-xr-xcommon/arm/impeg2_mem_func.s177
-rw-r--r--common/arm/impeg2_platform_macros.h75
-rw-r--r--common/armv8/impeg2_format_conv.s409
-rw-r--r--common/armv8/impeg2_idct.s1247
-rw-r--r--common/armv8/impeg2_inter_pred.s814
-rw-r--r--common/armv8/impeg2_mem_func.s181
-rw-r--r--common/armv8/impeg2_neon_macros.s58
-rw-r--r--common/armv8/impeg2_platform_macros.h49
-rw-r--r--common/impeg2_buf_mgr.c411
-rw-r--r--common/impeg2_buf_mgr.h115
-rw-r--r--common/impeg2_defs.h331
-rw-r--r--common/impeg2_disp_mgr.c172
-rw-r--r--common/impeg2_disp_mgr.h67
-rw-r--r--common/impeg2_format_conv.c401
-rw-r--r--common/impeg2_format_conv.h133
-rw-r--r--common/impeg2_globals.c351
-rwxr-xr-xcommon/impeg2_globals.h57
-rw-r--r--common/impeg2_idct.c500
-rw-r--r--common/impeg2_idct.h66
-rw-r--r--common/impeg2_inter_pred.c467
-rw-r--r--common/impeg2_inter_pred.h103
-rw-r--r--common/impeg2_job_queue.c530
-rw-r--r--common/impeg2_job_queue.h72
-rw-r--r--common/impeg2_macros.h60
-rw-r--r--common/impeg2_mem_func.c87
-rw-r--r--common/impeg2_mem_func.h41
-rw-r--r--common/ithread.c453
-rw-r--r--common/ithread.h80
-rw-r--r--common/iv.h420
-rw-r--r--common/iv_datatypedef.h81
-rw-r--r--common/mips/impeg2_platform_macros.h49
-rwxr-xr-xcommon/x86/impeg2_idct_recon_sse42_intr.c2205
-rw-r--r--common/x86/impeg2_inter_pred_sse42_intr.c899
-rw-r--r--common/x86/impeg2_mem_func_sse42_intr.c100
-rw-r--r--common/x86/impeg2_platform_macros.h49
38 files changed, 13706 insertions, 0 deletions
diff --git a/common/arm/impeg2_format_conv.s b/common/arm/impeg2_format_conv.s
new file mode 100644
index 0000000..c07edda
--- /dev/null
+++ b/common/arm/impeg2_format_conv.s
@@ -0,0 +1,391 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+
+@/*
+@//----------------------------------------------------------------------------
+@// File Name : impeg2_format_conv.s
+@//
+@// Description : This file has the YUV420P to YUV420SP (semi-planar)
+@// format conversion implementations for the MPEG2 decoder on neon platform.
+@//
+@// Reference Document :
+@//
+@// Revision History :
+@// Date Author Detail Description
+@// ------------ ---------------- ----------------------------------
+@// Jul 07, 2008 Naveen Kumar T Created
+@//
+@//-------------------------------------------------------------------------
+@*/
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Include Files
+@// ----------------------------------------------------------------------------
+@*/
+@// Everything below is emitted into the text section, aligned to 2^2 bytes.
+.text
+.p2align 2
+@// log2 constants; neither symbol is referenced anywhere else in this file.
+.equ log2_16 , 4
+.equ log2_2 , 1
+@/*
+@// ----------------------------------------------------------------------------
+@// Struct/Union Types and Define
+@// ----------------------------------------------------------------------------
+@*/
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Static Global Data section variables
+@// ----------------------------------------------------------------------------
+@*/
+@//--------------------------- NONE --------------------------------------------
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Static Prototype Functions
+@// ----------------------------------------------------------------------------
+@*/
+@// -------------------------- NONE --------------------------------------------
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Exported functions
+@// ----------------------------------------------------------------------------
+@*/
+
+@/*****************************************************************************
+@*
+@* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q()
+@*
+@* Description : Converts an image from YUV420P (planar) to YUV420SP
+@* (semi-planar) with the chroma bytes interleaved as U,V.
+@* The luma plane is copied row by row unless
+@* convert_uv_only is 1, in which case only the chroma
+@* plane is produced.
+@*
+@* Arguments : R0 pu1_y
+@* R1 pu1_u
+@* R2 pu1_v
+@* R3 pu1_dest_y
+@* Stack arguments, offsets as seen after the 24-byte
+@* register push done on entry:
+@* [R13 #24] pu1_dest_uv
+@* [R13 #28] u2_height
+@* [R13 #32] u2_width
+@* [R13 #36] u2_stridey
+@* [R13 #40] u2_strideu
+@* [R13 #44] u2_stridev
+@* [R13 #48] u2_dest_stride_y
+@* [R13 #52] u2_dest_stride_uv
+@* [R13 #56] convert_uv_only
+@*
+@* Values Returned : None
+@*
+@* Register Usage : R0 - R8, R12, Q0
+@*
+@* Stack Usage : 24 Bytes
+@*
+@* Interruptibility : Interruptible
+@*
+@* Known Limitations
+@* Assumptions: Image Width: Assumed to be greater than or equal to 16.
+@* A width that is not a multiple of 16 is handled by
+@* re-copying one overlapping vector at the end of each
+@* row, so every row must hold at least one full vector.
+@* Image Height: Assumed to be even. Both row loops test
+@* their counter at the bottom, so they run at least once.
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes (Describe the changes made)
+@* 07 06 2010 Varshita Draft
+@* 07 06 2010 Naveen Kr T Completed
+@*
+@*****************************************************************************/
+ .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q
+impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q:
+
+ @// push the registers on the stack (6 registers = 24 bytes)
+ stmfd sp!, {r4-r8, lr}
+
+ ldr r4, [sp, #56] @// Load convert_uv_only
+
+ cmp r4, #1
+ beq yuv420sp_uv_chroma @// convert_uv_only == 1 : skip the luma copy
+ @/* Do the preprocessing before the main loops start */
+ @// Load the parameters from stack
+ ldr r4, [sp, #28] @// Load u2_height from stack
+
+ ldr r5, [sp, #32] @// Load u2_width from stack
+
+ ldr r7, [sp, #36] @// Load u2_stridey from stack
+
+ ldr r8, [sp, #48] @// Load u2_dest_stride_y from stack
+
+ sub r7, r7, r5 @// Source increment (stride - width) applied after each row
+
+ sub r8, r8, r5 @// Destination increment (stride - width) applied after each row
+
+
+yuv420sp_uv_row_loop_y:
+ mov r6, r5 @// r6 = luma bytes still to copy in this row
+
+yuv420sp_uv_col_loop_y:
+ pld [r0, #128]
+ vld1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r3]!
+ sub r6, r6, #16
+ cmp r6, #15
+ bgt yuv420sp_uv_col_loop_y
+
+ cmp r6, #0
+ beq yuv420sp_uv_row_loop_end_y
+ @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+ @//Ex if width is 162, above loop will process 160 pixels. And
+ @//Both source and destination will point to 146th pixel and then 16 bytes will be read
+ @// and written using VLD1 and VST1
+ rsb r6, r6, #16
+ sub r0, r0, r6
+ sub r3, r3, r6
+
+ vld1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r3]!
+
+yuv420sp_uv_row_loop_end_y:
+ add r0, r0, r7
+ add r3, r3, r8
+ subs r4, r4, #1
+ bgt yuv420sp_uv_row_loop_y
+
+yuv420sp_uv_chroma:
+
+ ldr r3, [sp, #24] @// Load pu1_dest_uv from stack
+
+ ldr r4, [sp, #28] @// Load u2_height from stack
+
+ ldr r5, [sp, #32] @// Load u2_width from stack
+
+ ldr r7, [sp, #40] @// Load u2_strideu from stack
+
+ ldr r12, [sp, #44] @// Load u2_stridev from stack (r12 is a caller-saved scratch register)
+
+ ldr r8, [sp, #52] @// Load u2_dest_stride_uv from stack
+
+ sub r7, r7, r5, lsr #1 @// U source increment (strideu - width / 2)
+
+ sub r12, r12, r5, lsr #1 @// V source increment (stridev - width / 2)
+
+ sub r8, r8, r5 @// Destination increment (each row writes width interleaved bytes)
+
+ mov r5, r5, lsr #1 @// Chroma samples per row = width / 2
+ mov r4, r4, lsr #1 @// Chroma rows = height / 2
+yuv420sp_uv_row_loop_uv:
+ mov r6, r5 @// r6 = chroma samples still to process in this row
+
+
+yuv420sp_uv_col_loop_uv:
+ pld [r1, #128]
+ pld [r2, #128]
+ vld1.8 d0, [r1]! @// 8 U samples
+ vld1.8 d1, [r2]! @// 8 V samples
+ vst2.8 {d0, d1}, [r3]! @// Interleaved store: U0 V0 U1 V1 ...
+ sub r6, r6, #8
+ cmp r6, #7
+ bgt yuv420sp_uv_col_loop_uv
+
+ cmp r6, #0
+ beq yuv420sp_uv_row_loop_end_uv
+ @//If the chroma width is a non-multiple of 8, go back by few samples so that
+ @//8 samples can be read from each plane. Ex if width is 162, chroma width is 81
+ @//and the above loop will process 80 samples. Both sources are then moved back
+ @//by 7 samples (destination by 14 bytes) and the last 8 U and 8 V samples are
+ @//read and written again using VLD1 and VST2
+ rsb r6, r6, #8
+ sub r1, r1, r6
+ sub r2, r2, r6
+ sub r3, r3, r6, lsl #1
+
+ vld1.8 d0, [r1]!
+ vld1.8 d1, [r2]!
+ vst2.8 {d0, d1}, [r3]!
+
+yuv420sp_uv_row_loop_end_uv:
+ add r1, r1, r7
+ add r2, r2, r12 @// V plane advances by its own stride, not the U stride
+ add r3, r3, r8
+ subs r4, r4, #1
+ bgt yuv420sp_uv_row_loop_uv
+ @//POP THE REGISTERS
+ ldmfd sp!, {r4-r8, pc}
+
+
+
+
+
+@/*****************************************************************************
+@*
+@* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q()
+@*
+@* Description : Converts an image from YUV420P (planar) to YUV420SP
+@* (semi-planar) with the chroma bytes interleaved as V,U.
+@* Same flow as impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q;
+@* the only difference is the order of the registers used
+@* by the chroma VLD1.8 loads (U goes to d1, V goes to d0).
+@* The luma plane is copied row by row unless
+@* convert_uv_only is 1, in which case only the chroma
+@* plane is produced.
+@*
+@* Arguments : R0 pu1_y
+@* R1 pu1_u
+@* R2 pu1_v
+@* R3 pu1_dest_y
+@* Stack arguments, offsets as seen after the 24-byte
+@* register push done on entry:
+@* [R13 #24] pu1_dest_uv
+@* [R13 #28] u2_height
+@* [R13 #32] u2_width
+@* [R13 #36] u2_stridey
+@* [R13 #40] u2_strideu
+@* [R13 #44] u2_stridev
+@* [R13 #48] u2_dest_stride_y
+@* [R13 #52] u2_dest_stride_uv
+@* [R13 #56] convert_uv_only
+@*
+@* Values Returned : None
+@*
+@* Register Usage : R0 - R8, R12, Q0
+@*
+@* Stack Usage : 24 Bytes
+@*
+@* Interruptibility : Interruptible
+@*
+@* Known Limitations
+@* Assumptions: Image Width: Assumed to be greater than or equal to 16.
+@* A width that is not a multiple of 16 is handled by
+@* re-copying one overlapping vector at the end of each
+@* row, so every row must hold at least one full vector.
+@* Image Height: Assumed to be even. Both row loops test
+@* their counter at the bottom, so they run at least once.
+@*
+@* Revision History :
+@* DD MM YYYY Author(s) Changes (Describe the changes made)
+@* 07 06 2010 Varshita Draft
+@* 07 06 2010 Naveen Kr T Completed
+@*
+@*****************************************************************************/
+
+ .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q
+impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q:
+
+ @// push the registers on the stack (6 registers = 24 bytes)
+ stmfd sp!, {r4-r8, lr}
+
+ ldr r4, [sp, #56] @// Load convert_uv_only
+
+ cmp r4, #1
+ beq yuv420sp_vu_chroma @// convert_uv_only == 1 : skip the luma copy
+
+ @/* Do the preprocessing before the main loops start */
+ @// Load the parameters from stack
+ ldr r4, [sp, #28] @// Load u2_height from stack
+
+ ldr r5, [sp, #32] @// Load u2_width from stack
+
+ ldr r7, [sp, #36] @// Load u2_stridey from stack
+
+ ldr r8, [sp, #48] @// Load u2_dest_stride_y from stack
+
+ sub r7, r7, r5 @// Source increment (stride - width) applied after each row
+
+ sub r8, r8, r5 @// Destination increment (stride - width) applied after each row
+
+
+yuv420sp_vu_row_loop_y:
+ mov r6, r5 @// r6 = luma bytes still to copy in this row
+
+yuv420sp_vu_col_loop_y:
+ pld [r0, #128]
+ vld1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r3]!
+ sub r6, r6, #16
+ cmp r6, #15
+ bgt yuv420sp_vu_col_loop_y
+
+ cmp r6, #0
+ beq yuv420sp_vu_row_loop_end_y
+ @//If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+ @//Ex if width is 162, above loop will process 160 pixels. And
+ @//Both source and destination will point to 146th pixel and then 16 bytes will be read
+ @// and written using VLD1 and VST1
+ rsb r6, r6, #16
+ sub r0, r0, r6
+ sub r3, r3, r6
+
+ vld1.8 {q0}, [r0]!
+ vst1.8 {q0}, [r3]!
+
+yuv420sp_vu_row_loop_end_y:
+ add r0, r0, r7
+ add r3, r3, r8
+ subs r4, r4, #1
+ bgt yuv420sp_vu_row_loop_y
+
+yuv420sp_vu_chroma:
+
+ ldr r3, [sp, #24] @// Load pu1_dest_uv from stack
+
+ ldr r4, [sp, #28] @// Load u2_height from stack
+
+ ldr r5, [sp, #32] @// Load u2_width from stack
+
+ ldr r7, [sp, #40] @// Load u2_strideu from stack
+
+ ldr r12, [sp, #44] @// Load u2_stridev from stack (r12 is a caller-saved scratch register)
+
+ ldr r8, [sp, #52] @// Load u2_dest_stride_uv from stack
+
+ sub r7, r7, r5, lsr #1 @// U source increment (strideu - width / 2)
+
+ sub r12, r12, r5, lsr #1 @// V source increment (stridev - width / 2)
+
+ sub r8, r8, r5 @// Destination increment (each row writes width interleaved bytes)
+
+ mov r5, r5, lsr #1 @// Chroma samples per row = width / 2
+ mov r4, r4, lsr #1 @// Chroma rows = height / 2
+yuv420sp_vu_row_loop_uv:
+ mov r6, r5 @// r6 = chroma samples still to process in this row
+
+
+yuv420sp_vu_col_loop_uv:
+ pld [r1, #128]
+ pld [r2, #128]
+ vld1.8 d1, [r1]! @// 8 U samples go to the second interleave lane
+ vld1.8 d0, [r2]! @// 8 V samples go to the first interleave lane
+ vst2.8 {d0, d1}, [r3]! @// Interleaved store: V0 U0 V1 U1 ...
+ sub r6, r6, #8
+ cmp r6, #7
+ bgt yuv420sp_vu_col_loop_uv
+
+ cmp r6, #0
+ beq yuv420sp_vu_row_loop_end_uv
+ @//If the chroma width is a non-multiple of 8, go back by few samples so that
+ @//8 samples can be read from each plane. Ex if width is 162, chroma width is 81
+ @//and the above loop will process 80 samples. Both sources are then moved back
+ @//by 7 samples (destination by 14 bytes) and the last 8 U and 8 V samples are
+ @//read and written again using VLD1 and VST2
+ rsb r6, r6, #8
+ sub r1, r1, r6
+ sub r2, r2, r6
+ sub r3, r3, r6, lsl #1
+
+ vld1.8 d1, [r1]!
+ vld1.8 d0, [r2]!
+ vst2.8 {d0, d1}, [r3]!
+
+yuv420sp_vu_row_loop_end_uv:
+ add r1, r1, r7
+ add r2, r2, r12 @// V plane advances by its own stride, not the U stride
+ add r3, r3, r8
+ subs r4, r4, #1
+ bgt yuv420sp_vu_row_loop_uv
+ @//POP THE REGISTERS
+ ldmfd sp!, {r4-r8, pc}
+
+
+
+
+
diff --git a/common/arm/impeg2_idct.s b/common/arm/impeg2_idct.s
new file mode 100644
index 0000000..22225bf
--- /dev/null
+++ b/common/arm/impeg2_idct.s
@@ -0,0 +1,1204 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+
+@/*
+@//----------------------------------------------------------------------------
+@// File Name : impeg2_idct.s
+@//
+@// Description : This file has the Idct Implementations for the
+@// MPEG2 SP decoder on neon platform.
+@//
+@// Reference Document :
+@//
+@// Revision History :
+@// Date Author Detail Description
+@// ------------ ---------------- ----------------------------------
+@// Feb 22, 2008 Naveen Kumar T Created
+@//
+@//-------------------------------------------------------------------------
+@*/
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Include Files
+@// ----------------------------------------------------------------------------
+@*/
+
+@// Everything below is emitted into the text section, aligned to 2^2 bytes.
+.text
+.p2align 2
+@// Right-shift amounts applied after IDCT stage 1 and stage 2, and the
+@// matching rounding offsets (half of 1 << shift) added before the scalar
+@// shifts in the DC-only routines below.
+.equ idct_stg1_shift , 12
+.equ idct_stg2_shift , 16
+.equ idct_stg1_round , (1 << (idct_stg1_shift - 1))
+.equ idct_stg2_round , (1 << (idct_stg2_shift - 1))
+@/*
+@// ----------------------------------------------------------------------------
+@// Struct/Union Types and Define
+@// ----------------------------------------------------------------------------
+@*/
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Static Global Data section variables
+@// ----------------------------------------------------------------------------
+@*/
+@//--------------------------- NONE --------------------------------------------
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Static Prototype Functions
+@// ----------------------------------------------------------------------------
+@*/
+@// -------------------------- NONE --------------------------------------------
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Exported functions
+@// ----------------------------------------------------------------------------
+@*/
+
+@// Coefficient tables defined outside this file; marked hidden so they are
+@// not exported from the final linked object.
+ .extern gai2_impeg2_idct_q15
+.hidden gai2_impeg2_idct_q15
+ .extern gai2_impeg2_idct_q11
+.hidden gai2_impeg2_idct_q11
+ .extern gai2_impeg2_idct_first_col_q15
+.hidden gai2_impeg2_idct_first_col_q15
+ .extern gai2_impeg2_idct_first_col_q11
+.hidden gai2_impeg2_idct_first_col_q11
+ .extern gai2_impeg2_mismatch_stg2_additive
+.hidden gai2_impeg2_mismatch_stg2_additive
+
+@// PC-relative address literals, one per use site. Each word holds
+@// (table - use_site_label - 8). The code loads the word with
+@// "ldr rX, <name>_addrN" and then executes "add rX, rX, pc" at the use-site
+@// label, which yields the run-time address of the table without an absolute
+@// relocation. The "- 8" compensates for the value pc reads as at that add
+@// instruction (instruction address + 8).
+@// NOTE(review): that offset is the ARM-state value; confirm this file is
+@// never assembled as Thumb.
+gai2_impeg2_idct_q15_addr1:
+ .long gai2_impeg2_idct_q15 - q15lbl1 - 8
+gai2_impeg2_idct_q15_addr2:
+ .long gai2_impeg2_idct_q15 - q15lbl2 - 8
+gai2_impeg2_idct_q11_addr1:
+ .long gai2_impeg2_idct_q11 - q11lbl1 - 8
+gai2_impeg2_idct_q11_addr2:
+ .long gai2_impeg2_idct_q11 - q11lbl2 - 8
+gai2_impeg2_idct_first_col_q15_addr1:
+ .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl1 - 8
+gai2_impeg2_idct_first_col_q15_addr2:
+ .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl2 - 8
+gai2_impeg2_idct_first_col_q15_addr3:
+ .long gai2_impeg2_idct_first_col_q15 - fcq15_lbl3 - 8
+gai2_impeg2_mismatch_stg2_additive_addr:
+ .long gai2_impeg2_mismatch_stg2_additive - additive_lbl - 8
+gai2_impeg2_idct_first_col_q11_addr1:
+ .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl1 - 8
+gai2_impeg2_idct_first_col_q11_addr2:
+ .long gai2_impeg2_idct_first_col_q11 - fcq11_lbl2 - 8
+
+@/*
+@// impeg2_idct_recon_dc_a9q
+@//
+@// DC-only reconstruction of an 8x8 block. The first coefficient of pi2_src
+@// is scaled by the first entry of gai2_impeg2_idct_q15 (rounded, >> 12) and
+@// then by the first entry of gai2_impeg2_idct_q11 (rounded, >> 16). The
+@// resulting value is added to each of the 8 bytes of 8 prediction rows and
+@// the sums are saturated to unsigned 8 bit and stored to the destination.
+@//
+@// Arguments : r0 pi2_src
+@// r1 pi2_tmp (not read; r1 is reused for pred_strd)
+@// r2 pu1_pred
+@// r3 pu1_dst
+@// [sp, #20] pred_strd (offset after the 16-byte push)
+@// [sp, #24] dst_strd (offset after the 16-byte push)
+@//
+@// Only d0-d7 and q8-q15 are used as NEON scratch, so the callee-saved
+@// registers d8-d15 are left untouched and need no save/restore.
+@*/
+ .global impeg2_idct_recon_dc_a9q
+impeg2_idct_recon_dc_a9q:
+ stmfd sp!, {r4, r6, r12, lr}
+ @//r0: pi2_src
+ @//r1: pi2_tmp - not used, used as pred_strd
+ @//r2: pu1_pred
+ @//r3: pu1_dst
+ @//r4: used as scratch
+ @//r6: dst_strd
+
+ ldr r1, [sp, #20] @//pred_strd
+ ldr r6, [sp, #24] @//dst_strd
+
+ ldr r14, gai2_impeg2_idct_q15_addr1
+q15lbl1:
+ add r14, r14, pc @// r14 = address of gai2_impeg2_idct_q15
+ ldrsh r12, [r14] @// first stage 1 coefficient
+ ldrsh r4, [r0] @// DC coefficient of the block
+
+ @// The prediction row loads are interleaved with the scalar DC computation
+ vld1.8 d0, [r2], r1
+ mul r4, r4, r12
+
+ vld1.8 d1, [r2], r1
+ add r4, #idct_stg1_round
+
+ vld1.8 d2, [r2], r1
+ asr r4, r4, #idct_stg1_shift
+
+ ldr r14, gai2_impeg2_idct_q11_addr1
+q11lbl1:
+ add r14, r14, pc @// r14 = address of gai2_impeg2_idct_q11
+ ldrsh r12, [r14] @// first stage 2 coefficient
+
+ vld1.8 d3, [r2], r1
+ mul r4, r4, r12
+
+ vld1.8 d4, [r2], r1
+ add r4, #idct_stg2_round
+
+ vld1.8 d5, [r2], r1
+ asr r4, r4, #idct_stg2_shift
+
+ vld1.8 d6, [r2], r1
+ vdup.s16 q15, r4 @// broadcast the DC value to 8 signed 16-bit lanes
+
+
+ vld1.8 d7, [r2], r1
+
+ @// For every row: widen pred to 16 bit and add DC, saturate back to u8, store
+ vaddw.u8 q8, q15, d0
+
+ vaddw.u8 q9, q15, d1
+ vqmovun.s16 d0, q8
+
+ vaddw.u8 q10, q15, d2
+ vqmovun.s16 d1, q9
+ vst1.8 d0, [r3], r6
+
+ vaddw.u8 q11, q15, d3
+ vqmovun.s16 d2, q10
+ vst1.8 d1, [r3], r6
+
+ vaddw.u8 q12, q15, d4
+ vqmovun.s16 d3, q11
+ vst1.8 d2, [r3], r6
+
+ vaddw.u8 q13, q15, d5
+ vqmovun.s16 d4, q12
+ vst1.8 d3, [r3], r6
+
+ vaddw.u8 q14, q15, d6
+ vqmovun.s16 d5, q13
+ vst1.8 d4, [r3], r6
+
+ vaddw.u8 q8, q15, d7 @// q8 is free again: row 0 was narrowed above
+ vqmovun.s16 d6, q14
+ vst1.8 d5, [r3], r6
+
+ vqmovun.s16 d7, q8
+ vst1.8 d6, [r3], r6
+
+
+ vst1.8 d7, [r3], r6
+
+ ldmfd sp!, {r4, r6, r12, pc}
+
+
+
+
+@/*
+@// impeg2_idct_recon_dc_mismatch_a9q
+@//
+@// DC-only reconstruction of an 8x8 block with a per-sample additive term.
+@// The first coefficient of pi2_src is scaled by the first entry of
+@// gai2_impeg2_idct_q15 (rounded, >> 12) and multiplied by the first entry of
+@// gai2_impeg2_idct_q11. For each of the 8 rows, 8 signed 16-bit values are
+@// read from gai2_impeg2_mismatch_stg2_additive, added to that product in
+@// 32 bit, and the rounded high halves (VRADDHN) are added to the prediction
+@// row; the sums are saturated to unsigned 8 bit and stored.
+@//
+@// Arguments : r0 pi2_src
+@// r1 pi2_tmp (not read; r1 is reused for pred_strd)
+@// r2 pu1_pred
+@// r3 pu1_dst
+@// [sp, #44] pred_strd (offset after the 40-byte push)
+@// [sp, #48] dst_strd (offset after the 40-byte push)
+@//
+@// Only q0, q1 and q8-q15 are used as NEON scratch, so the callee-saved
+@// registers d8-d15 are left untouched and need no save/restore.
+@*/
+ .global impeg2_idct_recon_dc_mismatch_a9q
+impeg2_idct_recon_dc_mismatch_a9q:
+ stmfd sp!, {r4-r12, lr}
+
+ ldr r1, [sp, #44] @//pred_strd
+ ldr r6, [sp, #48] @//dst_strd
+
+ ldr r14, gai2_impeg2_idct_q15_addr2
+q15lbl2:
+ add r14, r14, pc @// r14 = address of gai2_impeg2_idct_q15
+ ldrsh r12, [r14] @// first stage 1 coefficient
+ ldrsh r4, [r0] @// DC coefficient of the block
+
+ mul r4, r4, r12
+ add r4, #idct_stg1_round
+ asr r4, r4, #idct_stg1_shift
+
+ ldr r14, gai2_impeg2_idct_q11_addr2
+q11lbl2:
+ add r14, r14, pc @// r14 = address of gai2_impeg2_idct_q11
+ ldrsh r12, [r14] @// first stage 2 coefficient
+ mul r4, r4, r12
+ vdup.s32 q0, r4 @// unshifted stage 2 product in 4 signed 32-bit lanes
+
+ mov r14, #16 @//Increment for table read
+ ldr r4, gai2_impeg2_mismatch_stg2_additive_addr
+additive_lbl:
+ add r4, r4, pc @// r4 = address of gai2_impeg2_mismatch_stg2_additive
+
+ @// One iteration per output row; expands to the same straight-line
+ @// sequence repeated 8 times.
+ .rept 8
+ vld1.16 {q1}, [r4], r14 @// 8 additive terms for this row
+ vld1.8 d30, [r2], r1 @// 8 prediction bytes
+ vmovl.s16 q8, d2 @// widen the additive terms to 32 bit
+ vmovl.s16 q9, d3
+ vraddhn.s32 d20, q0, q8 @// (product + additive + rounding) >> 16
+ vraddhn.s32 d21, q0, q9
+ vaddw.u8 q11, q10, d30 @// add the prediction
+ vqmovun.s16 d30, q11 @// saturate to unsigned 8 bit
+ vst1.8 d30, [r3], r6
+ .endr
+
+
+ ldmfd sp!, {r4-r12, pc}
+
+
+
+
+@/**
+@ *******************************************************************************
+@ *
+@ * ;brief
+@ * This function performs Inverse transform and reconstruction for 8x8
+@ * input block
+@ *
+@ * ;par Description:
+@ * Performs inverse transform and adds the prediction data and clips output
+@ * to 8 bit
+@ *
+@ * ;param[in] pi2_src
+@ * Input 8x8 coefficients
+@ *
+@ * ;param[in] pi2_tmp
+@ * Temporary 8x8 buffer for storing inverse
+@ *
+@ * transform
+@ * 1st stage output
+@ *
+@ * ;param[in] pu1_pred
+@ * Prediction 8x8 block
+@ *
+@ * ;param[out] pu1_dst
+@ * Output 8x8 block
+@ *
+@ * ;param[in] src_strd
+@ * Input stride
+@ *
+@ * ;param[in] pred_strd
+@ * Prediction stride
+@ *
+@ * ;param[in] dst_strd
+@ * Output Stride
+@ *
+@ * ;param[in] shift
+@ * Output shift
+@ *
+@ * ;param[in] zero_cols
+@ * Zero columns in pi2_src
+@ *
+@ * ;returns Void
+@ *
+@ * ;remarks
+@ * None
+@ *
+@ *******************************************************************************
+@ */
+
+@void impeg2_itrans_recon_8x8(WORD16 *pi2_src,
+@ WORD16 *pi2_tmp,
+@ UWORD8 *pu1_pred,
+@ UWORD8 *pu1_dst,
+@ WORD32 src_strd,
+@ WORD32 pred_strd,
+@ WORD32 dst_strd,
+@ WORD32 zero_cols,
+@ WORD32 zero_rows )
+
+@**************Variables Vs Registers*************************
+@ r0 => *pi2_src
+@ r1 => *pi2_tmp
+@ r2 => *pu1_pred
+@ r3 => *pu1_dst
+@ src_strd
+@ pred_strd
+@ dst_strd
+@ zero_cols
+
+
+
+ .global impeg2_idct_recon_a9q
+impeg2_idct_recon_a9q:
+@//Register Usage Reference - loading and Until IDCT of columns
+@// Cosine Constants - D0
+@// Sine Constants - D1
+@// Row 0 First Half - D2 - y0
+@// Row 1 First Half - D6 - y1
+@// Row 2 First Half - D3 - y2
+@// Row 3 First Half - D7 - y3
+@// Row 4 First Half - D10 - y4
+@// Row 5 First Half - D14 - y5
+@// Row 6 First Half - D11 - y6
+@// Row 7 First Half - D15 - y7
+
+@// Row 0 Second Half - D4 - y0
+@// Row 1 Second Half - D8 - y1
+@// Row 2 Second Half - D5 - y2
+@// Row 3 Second Half - D9 - y3
+@// Row 4 Second Half - D12 - y4
+@// Row 5 Second Half - D16 - y5
+@// Row 6 Second Half - D13 - y6
+@// Row 7 Second Half - D17 - y7
+
+ @// Copy the input pointer to another register
+ @// Step 1 : load all constants
+ stmfd sp!, {r4-r12, lr}
+ add sp, sp, #40
+ ldr r8, [sp, #4] @ prediction stride
+ ldr r7, [sp, #8] @ destination stride
+ ldr r6, [sp] @ src stride
+ ldr r12, [sp, #12]
+ ldr r11, [sp, #16]
+ mov r6, r6, lsl #1 @ x sizeof(word16)
+ add r9, r0, r6, lsl #1 @ 2 rows
+
+ add r10, r6, r6, lsl #1 @ 3 rows
+
+ sub r10, r10, #8 @ - 4 cols * sizeof(WORD16)
+ sub r5, r6, #8 @ src_strd - 4 cols * sizeof(WORD16)
+
+
+ ldr r14, gai2_impeg2_idct_first_col_q15_addr1
+fcq15_lbl1:
+ add r14, r14, pc
+ vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data
+
+ @//Step 2 Load all the input data
+ @//Step 3 Operate first 4 colums at a time
+
+ and r11, r11, #0xff
+ and r12, r12, #0xff
+
+ cmp r11, #0xf0
+ bge skip_last4_rows
+
+
+ vld1.16 d2, [r0]!
+ vld1.16 d3, [r9]!
+ vld1.16 d4, [r0], r5
+ vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1)
+ vld1.16 d5, [r9], r5
+ vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1)
+ vld1.16 d6, [r0]!
+ vld1.16 d7, [r9]!
+ vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0)
+ vld1.16 d8, [r0], r10
+ vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1)
+ vld1.16 d9, [r9], r10
+ vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2)
+ vld1.16 d10, [r0]!
+ vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3)
+ vld1.16 d11, [r9]!
+ vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vld1.16 d12, [r0], r5
+ vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vld1.16 d13, [r9], r5
+ vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vld1.16 d14, [r0]!
+ vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+ vld1.16 d15, [r9]!
+ vmull.s16 q11, d10, d0[0] @// y4 * cos4(part of c0 and c1)
+ vld1.16 d16, [r0], r10
+ vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0)
+ vld1.16 d17, [r9], r10
+
+ @/* This following was activated when alignment is not there */
+@// VLD1.16 D2,[r0]!
+@// VLD1.16 D3,[r2]!
+@// VLD1.16 D4,[r0]!
+@// VLD1.16 D5,[r2]!
+@// VLD1.16 D6,[r0]!
+@// VLD1.16 D7,[r2]!
+@// VLD1.16 D8,[r0],r3
+@// VLD1.16 D9,[r2],r3
+@// VLD1.16 D10,[r0]!
+@// VLD1.16 D11,[r2]!
+@// VLD1.16 D12,[r0]!
+@// VLD1.16 D13,[r2]!
+@// VLD1.16 D14,[r0]!
+@// VLD1.16 D15,[r2]!
+@// VLD1.16 D16,[r0],r3
+@// VLD1.16 D17,[r2],r3
+
+
+
+
+ vmlal.s16 q12, d14, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ vmlsl.s16 q13, d14, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ vmlal.s16 q14, d14, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ vmlal.s16 q15, d14, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ vmlsl.s16 q9, d11, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ vmlal.s16 q3, d11, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ vadd.s32 q5, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ vmlal.s16 q12, d15, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
+ vmlsl.s16 q13, d15, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
+ vmlal.s16 q14, d15, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
+ vmlsl.s16 q15, d15, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
+
+ vadd.s32 q7, q5, q3 @// a0 = c0 + d0(part of r0,r7)
+ vsub.s32 q5, q5, q3 @// a3 = c0 - d0(part of r3,r4)
+ vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6)
+
+ vadd.s32 q10, q7, q12 @// a0 + b0(part of r0)
+ vsub.s32 q3, q7, q12 @// a0 - b0(part of r7)
+
+ vadd.s32 q12, q11, q14 @// a2 + b2(part of r2)
+ vsub.s32 q11, q11, q14 @// a2 - b2(part of r5)
+
+ vadd.s32 q14, q9, q13 @// a1 + b1(part of r1)
+ vsub.s32 q9, q9, q13 @// a1 - b1(part of r6)
+
+ vadd.s32 q13, q5, q15 @// a3 + b3(part of r3)
+ vsub.s32 q15, q5, q15 @// a3 - b3(part of r4)
+
+ vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+
+
+ b last4_cols
+
+
+
+skip_last4_rows:
+
+
+ ldr r14, gai2_impeg2_idct_first_col_q15_addr2
+fcq15_lbl2:
+ add r14, r14, pc
+ vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data
+
+ vld1.16 d2, [r0]!
+ vld1.16 d3, [r9]!
+ vld1.16 d4, [r0], r5
+ vld1.16 d5, [r9], r5
+ vld1.16 d6, [r0]!
+ vld1.16 d7, [r9]!
+ vld1.16 d8, [r0], r10
+ vld1.16 d9, [r9], r10
+
+
+
+ vmov.s16 q6, #0
+ vmov.s16 q8, #0
+
+
+
+
+ vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+ vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1)
+ vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0)
+
+ vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1)
+
+
+ vadd.s32 q7, q10, q3 @// a0 = c0 + d0(part of r0,r7)
+ vsub.s32 q5, q10, q3 @// a3 = c0 - d0(part of r3,r4)
+ vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6)
+
+ vadd.s32 q10, q7, q12 @// a0 + b0(part of r0)
+ vsub.s32 q3, q7, q12 @// a0 - b0(part of r7)
+
+ vadd.s32 q12, q11, q14 @// a2 + b2(part of r2)
+ vsub.s32 q11, q11, q14 @// a2 - b2(part of r5)
+
+ vadd.s32 q14, q9, q13 @// a1 + b1(part of r1)
+ vsub.s32 q9, q9, q13 @// a1 - b1(part of r6)
+
+ vadd.s32 q13, q5, q15 @// a3 + b3(part of r3)
+ vsub.s32 q15, q5, q15 @// a3 - b3(part of r4)
+
+ vqrshrn.s32 d2, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d15, q3, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d3, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d14, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d6, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d11, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d7, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d10, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+
+
+last4_cols:
+
+
+ cmp r12, #0xf0
+ bge skip_last4cols
+
+ ldr r14, gai2_impeg2_idct_first_col_q15_addr3
+fcq15_lbl3:
+ add r14, r14, pc
+ vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data
+
+ vmull.s16 q12, d8, d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13, d8, d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14, d8, d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15, d8, d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12, d9, d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13, d9, d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14, d9, d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15, d9, d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+ vmull.s16 q9, d5, d1[2] @// y2 * sin2 (Q4 is freed by this time)(part of d1)
+ vmull.s16 q4, d5, d0[2] @// y2 * cos2(part of d0)
+
+ vmull.s16 q10, d4, d0[0] @// y0 * cos4(part of c0 and c1)
+ vmull.s16 q11, d12, d0[0] @// y4 * cos4(part of c0 and c1)
+
+ vmlal.s16 q12, d16, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ vmlsl.s16 q13, d16, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ vmlal.s16 q14, d16, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ vmlal.s16 q15, d16, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ vmlsl.s16 q9, d13, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ vmlal.s16 q4, d13, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ vadd.s32 q6, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ vmlal.s16 q12, d17, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
+ vmlsl.s16 q13, d17, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
+ vmlal.s16 q14, d17, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
+ vmlsl.s16 q15, d17, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
+
+ vadd.s32 q8, q6, q4 @// a0 = c0 + d0(part of e0,e7)
+ vsub.s32 q6, q6, q4 @// a3 = c0 - d0(part of e3,e4)
+ vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of e2,e5)
+ vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of e1,e6)
+
+ vadd.s32 q10, q8, q12 @// a0 + b0(part of e0)
+ vsub.s32 q4, q8, q12 @// a0 - b0(part of e7)
+
+ vadd.s32 q12, q11, q14 @// a2 + b2(part of e2)
+ vsub.s32 q11, q11, q14 @// a2 - b2(part of e5)
+
+ vadd.s32 q14, q9, q13 @// a1 + b1(part of e1)
+ vsub.s32 q9, q9, q13 @// a1 - b1(part of e6)
+
+ vadd.s32 q13, q6, q15 @// a3 + b3(part of e3)
+ vsub.s32 q15, q6, q15 @// a3 - b3(part of r4)
+
+ vqrshrn.s32 d4, q10, #idct_stg1_shift @// r0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d17, q4, #idct_stg1_shift @// r7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d5, q12, #idct_stg1_shift @// r2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d16, q11, #idct_stg1_shift @// r5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d8, q14, #idct_stg1_shift @// r1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d13, q9, #idct_stg1_shift @// r6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d9, q13, #idct_stg1_shift @// r3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+ vqrshrn.s32 d12, q15, #idct_stg1_shift @// r4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+ b end_skip_last4cols
+
+
+
+skip_last4cols:
+
+
+
+ ldr r14, gai2_impeg2_idct_first_col_q11_addr1
+fcq11_lbl1:
+ add r14, r14, pc
+ vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data
+
+
+
+ vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing
+
+ vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing
+
+
+ vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued.....
+ vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued.....
+
+ vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued.....
+ vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued.....
+
+
+ vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+ vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1)
+@ VMULL.S16 Q11,D4,D0[0] ;// y4 * cos4(part of c0 and c1)
+
+ vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1)
+ vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0)
+
+
+
+
+ vsub.s32 q11, q10, q3 @// a3 = c0 - d0(part of r3,r4)
+ vadd.s32 q2, q10, q3 @// a0 = c0 + d0(part of r0,r7)
+
+
+ vadd.s32 q1, q2, q12
+
+ vsub.s32 q3, q2, q12
+
+ vadd.s32 q4, q11, q15
+
+ vsub.s32 q12, q11, q15
+
+ vqrshrn.s32 d5, q4, #idct_stg2_shift
+ vqrshrn.s32 d2, q1, #idct_stg2_shift
+ vqrshrn.s32 d9, q3, #idct_stg2_shift
+ vqrshrn.s32 d6, q12, #idct_stg2_shift
+
+ vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6)
+
+
+ vadd.s32 q15, q11, q14
+
+ vsub.s32 q12, q11, q14
+
+ vadd.s32 q14, q9, q13
+
+ vsub.s32 q11, q9, q13
+ vqrshrn.s32 d4, q15, #idct_stg2_shift
+ vqrshrn.s32 d7, q12, #idct_stg2_shift
+ vqrshrn.s32 d3, q14, #idct_stg2_shift
+ vqrshrn.s32 d8, q11, #idct_stg2_shift
+
+
+
+
+
+
+
+
+
+
+ vmull.s16 q12, d14, d0[1] @// y1 * cos1(part of b0)
+
+ vmull.s16 q13, d14, d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14, d14, d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15, d14, d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12, d15, d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vtrn.16 d2, d3
+ vmlsl.s16 q13, d15, d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vtrn.16 d4, d5
+ vmlsl.s16 q14, d15, d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vtrn.16 d6, d7
+ vmlsl.s16 q15, d15, d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+ vtrn.16 d8, d9
+ vmull.s16 q10, d10, d0[0] @// y0 * cos4(part of c0 and c1)
+ vtrn.32 d2, d4
+
+ vtrn.32 d3, d5
+ vmull.s16 q9, d11, d1[2] @// y2 * sin2 (Q7 is freed by this time)(part of d1)
+ vtrn.32 d6, d8
+ vmull.s16 q7, d11, d0[2] @// y2 * cos2(part of d0)
+ vtrn.32 d7, d9
+
+
+ add r4, r2, r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data
+
+
+ add r5, r8, r8, lsl #1 @
+
+
+ add r0, r3, r7, lsl #1 @ r0 points to 3rd row of dest data
+
+
+ add r10, r7, r7, lsl #1 @
+
+
+ vswp d3, d6
+
+
+ vswp d5, d8
+
+
+ vsub.s32 q11, q10, q7 @// a3 = c0 - d0(part of r3,r4)
+ vadd.s32 q6, q10, q7 @// a0 = c0 + d0(part of r0,r7)
+
+
+ vadd.s32 q0, q6, q12
+
+
+ vsub.s32 q12, q6, q12
+
+
+ vadd.s32 q6, q11, q15
+
+
+ vsub.s32 q7, q11, q15
+
+ vqrshrn.s32 d10, q0, #idct_stg2_shift
+ vqrshrn.s32 d17, q12, #idct_stg2_shift
+ vqrshrn.s32 d13, q6, #idct_stg2_shift
+ vqrshrn.s32 d14, q7, #idct_stg2_shift
+
+ vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6)
+
+
+ vadd.s32 q0, q11, q14
+
+
+ vsub.s32 q12, q11, q14
+
+
+ vadd.s32 q14, q9, q13
+
+
+ vsub.s32 q13, q9, q13
+ vld1.8 d18, [r2], r8
+
+ vqrshrn.s32 d12, q0, #idct_stg2_shift
+ vld1.8 d20, [r2], r5
+
+
+ vqrshrn.s32 d15, q12, #idct_stg2_shift
+ vld1.8 d19, [r2], r8
+
+
+
+
+ vqrshrn.s32 d11, q14, #idct_stg2_shift
+ vld1.8 d22, [r4], r8
+
+
+
+
+ vqrshrn.s32 d16, q13, #idct_stg2_shift
+ vld1.8 d21, [r2], r5
+
+
+ b pred_buff_addition
+end_skip_last4cols:
+
+ ldr r14, gai2_impeg2_idct_first_col_q11_addr2
+fcq11_lbl2:
+ add r14, r14, pc
+ vld1.16 {d0, d1}, [r14] @//D0,D1 are used for storing the constant data
+
+
+@/* Now the Idct of columns is done, transpose so that row idct done efficiently(step5) */
+ vtrn.16 q1, q3 @//[r3,r1],[r2,r0] first qudrant transposing
+ vtrn.16 q2, q4 @//[r3,r1],[r2,r0] second qudrant transposing
+ vtrn.16 q5, q7 @//[r7,r5],[r6,r4] third qudrant transposing
+ vtrn.16 q6, q8 @//[r7,r5],[r6,r4] fourth qudrant transposing
+
+ vtrn.32 d6, d7 @//r0,r1,r2,r3 first qudrant transposing continued.....
+ vtrn.32 d2, d3 @//r0,r1,r2,r3 first qudrant transposing continued.....
+ vtrn.32 d4, d5 @//r0,r1,r2,r3 second qudrant transposing continued.....
+ vtrn.32 d8, d9 @//r0,r1,r2,r3 second qudrant transposing continued.....
+ vtrn.32 d10, d11 @//r4,r5,r6,r7 third qudrant transposing continued.....
+ vtrn.32 d14, d15 @//r4,r5,r6,r7 third qudrant transposing continued.....
+ vtrn.32 d12, d13 @//r4,r5,r6,r7 fourth qudrant transposing continued.....
+ vtrn.32 d16, d17 @//r4,r5,r6,r7 fourth qudrant transposing continued.....
+
+ @//step6 Operate on first four rows and find their idct
+ @//Register Usage Reference - storing and IDCT of rows
+@// Cosine Constants - D0
+@// Sine Constants - D1
+@// Element 0 First four - D2 - y0
+@// Element 1 First four - D6 - y1
+@// Element 2 First four - D3 - y2
+@// Element 3 First four - D7 - y3
+@// Element 4 First four - D4 - y4
+@// Element 5 First four - D8 - y5
+@// Element 6 First four - D5 - y6
+@// Element 7 First four - D9 - y7
+@// Element 0 Second four - D10 - y0
+@// Element 1 Second four - D14 - y1
+@// Element 2 Second four - D11 - y2
+@// Element 3 Second four - D15 - y3
+@// Element 4 Second four - D12 - y4
+@// Element 5 Second four - D16 - y5
+@// Element 6 Second four - D13 - y6
+@// Element 7 Second four - D17 - y7
+
+ @// Map between first kernel code seq and current
+@// D2 -> D2
+@// D6 -> D6
+@// D3 -> D3
+@// D7 -> D7
+@// D10 -> D4
+@// D14 -> D8
+@// D11 -> D5
+@// D15 -> D9
+@// Q3 -> Q3
+@// Q5 -> Q2
+@// Q7 -> Q4
+
+ vmull.s16 q12, d6, d0[1] @// y1 * cos1(part of b0)
+ vmull.s16 q13, d6, d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14, d6, d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15, d6, d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12, d7, d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vmlsl.s16 q13, d7, d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vmlsl.s16 q14, d7, d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vmlsl.s16 q15, d7, d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+
+ vmull.s16 q10, d2, d0[0] @// y0 * cos4(part of c0 and c1)
+ vmull.s16 q11, d4, d0[0] @// y4 * cos4(part of c0 and c1)
+
+ vmull.s16 q9, d3, d1[2] @// y2 * sin2 (Q3 is freed by this time)(part of d1)
+ vmull.s16 q3, d3, d0[2] @// y2 * cos2(part of d0)
+
+
+ vmlal.s16 q12, d8, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ vmlsl.s16 q13, d8, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ vmlal.s16 q14, d8, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ vmlal.s16 q15, d8, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ vmlsl.s16 q9, d5, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ vmlal.s16 q3, d5, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ vadd.s32 q1, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ vmlal.s16 q12, d9, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
+ vmlsl.s16 q13, d9, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
+ vmlal.s16 q14, d9, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
+ vmlsl.s16 q15, d9, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
+
+ vsub.s32 q11, q1, q3 @// a3 = c0 - d0(part of r3,r4)
+ vadd.s32 q2, q1, q3 @// a0 = c0 + d0(part of r0,r7)
+
+
+ vadd.s32 q1, q2, q12
+
+ vsub.s32 q3, q2, q12
+
+ vadd.s32 q4, q11, q15
+
+ vsub.s32 q12, q11, q15
+
+ vqrshrn.s32 d5, q4, #idct_stg2_shift
+ vqrshrn.s32 d2, q1, #idct_stg2_shift
+ vqrshrn.s32 d9, q3, #idct_stg2_shift
+ vqrshrn.s32 d6, q12, #idct_stg2_shift
+
+ vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6)
+
+
+ vadd.s32 q15, q11, q14
+
+ vsub.s32 q12, q11, q14
+
+ vadd.s32 q14, q9, q13
+
+ vsub.s32 q11, q9, q13
+ vqrshrn.s32 d4, q15, #idct_stg2_shift
+ vqrshrn.s32 d7, q12, #idct_stg2_shift
+ vqrshrn.s32 d3, q14, #idct_stg2_shift
+ vqrshrn.s32 d8, q11, #idct_stg2_shift
+
+
+
+
+
+
+
+
+
+
+ vmull.s16 q12, d14, d0[1] @// y1 * cos1(part of b0)
+
+ vmull.s16 q13, d14, d0[3] @// y1 * cos3(part of b1)
+ vmull.s16 q14, d14, d1[1] @// y1 * sin3(part of b2)
+ vmull.s16 q15, d14, d1[3] @// y1 * sin1(part of b3)
+
+ vmlal.s16 q12, d15, d0[3] @// y1 * cos1 + y3 * cos3(part of b0)
+ vtrn.16 d2, d3
+ vmlsl.s16 q13, d15, d1[3] @// y1 * cos3 - y3 * sin1(part of b1)
+ vtrn.16 d4, d5
+ vmlsl.s16 q14, d15, d0[1] @// y1 * sin3 - y3 * cos1(part of b2)
+ vtrn.16 d6, d7
+ vmlsl.s16 q15, d15, d1[1] @// y1 * sin1 - y3 * sin3(part of b3)
+ vtrn.16 d8, d9
+ vmull.s16 q10, d10, d0[0] @// y0 * cos4(part of c0 and c1)
+ vtrn.32 d2, d4
+ vmull.s16 q11, d12, d0[0] @// y4 * cos4(part of c0 and c1)
+ vtrn.32 d3, d5
+ vmull.s16 q9, d11, d1[2] @// y2 * sin2 (Q7 is freed by this time)(part of d1)
+ vtrn.32 d6, d8
+ vmull.s16 q7, d11, d0[2] @// y2 * cos2(part of d0)
+ vtrn.32 d7, d9
+ vmlal.s16 q12, d16, d1[1] @// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+
+ add r4, r2, r8, lsl #1 @ r4 = r2 + pred_strd * 2 => r4 points to 3rd row of pred data
+ vmlsl.s16 q13, d16, d0[1] @// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+
+ add r5, r8, r8, lsl #1 @
+ vmlal.s16 q14, d16, d1[3] @// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+
+ add r0, r3, r7, lsl #1 @ r0 points to 3rd row of dest data
+ vmlal.s16 q15, d16, d0[3] @// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ add r10, r7, r7, lsl #1 @
+ vmlsl.s16 q9, d13, d0[2] @// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+
+
+ vmlal.s16 q7, d13, d1[2] @// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ vadd.s32 q6, q10, q11 @// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ vsub.s32 q10, q10, q11 @// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ vmlal.s16 q12, d17, d1[3] @// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of r0,r7)
+ vswp d3, d6
+ vmlsl.s16 q13, d17, d1[1] @// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of r1,r6)
+
+ vswp d5, d8
+ vmlal.s16 q14, d17, d0[3] @// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of r2,r5)
+ vmlsl.s16 q15, d17, d0[1] @// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of r3,r4)
+
+ vsub.s32 q11, q6, q7 @// a3 = c0 - d0(part of r3,r4)
+ vadd.s32 q6, q6, q7 @// a0 = c0 + d0(part of r0,r7)
+
+
+ vadd.s32 q0, q6, q12
+
+
+ vsub.s32 q12, q6, q12
+
+
+ vadd.s32 q6, q11, q15
+
+
+ vsub.s32 q7, q11, q15
+
+ vqrshrn.s32 d10, q0, #idct_stg2_shift
+ vqrshrn.s32 d17, q12, #idct_stg2_shift
+ vqrshrn.s32 d13, q6, #idct_stg2_shift
+ vqrshrn.s32 d14, q7, #idct_stg2_shift
+
+ vsub.s32 q11, q10, q9 @// a2 = c1 - d1(part of r2,r5)
+ vadd.s32 q9, q10, q9 @// a1 = c1 + d1(part of r1,r6)
+
+
+ vadd.s32 q0, q11, q14
+
+
+ vsub.s32 q12, q11, q14
+
+
+ vadd.s32 q14, q9, q13
+
+
+ vsub.s32 q13, q9, q13
+ vld1.8 d18, [r2], r8
+
+ vqrshrn.s32 d12, q0, #idct_stg2_shift
+ vld1.8 d20, [r2], r5
+
+
+ vqrshrn.s32 d15, q12, #idct_stg2_shift
+ vld1.8 d19, [r2], r8
+
+
+
+
+ vqrshrn.s32 d11, q14, #idct_stg2_shift
+ vld1.8 d22, [r4], r8
+
+
+
+
+ vqrshrn.s32 d16, q13, #idct_stg2_shift
+ vld1.8 d21, [r2], r5
+
+
+
+
+pred_buff_addition:
+
+
+ vtrn.16 d10, d11
+ vld1.8 d24, [r4], r5
+
+ vtrn.16 d12, d13
+ vld1.8 d23, [r4], r8
+
+ vaddw.u8 q1, q1, d18
+ vld1.8 d25, [r4], r5
+
+ vtrn.16 d14, d15
+ vaddw.u8 q2, q2, d22
+
+ vtrn.16 d16, d17
+ vaddw.u8 q3, q3, d20
+
+ vtrn.32 d10, d12
+ vaddw.u8 q4, q4, d24
+
+ vtrn.32 d11, d13
+ vtrn.32 d14, d16
+ vtrn.32 d15, d17
+
+ vswp d11, d14
+ vswp d13, d16
+
+@ Row values stored in the q register.
+
+@Q1 :r0
+@Q3: r1
+@Q2: r2
+@Q4: r3
+@Q5: r4
+@Q7: r5
+@Q6: r6
+@Q8: r7
+
+
+
+@/// Adding the prediction buffer
+
+
+
+
+
+
+
+
+
+ @ Load prediction data
+
+
+
+
+
+ @Adding recon with prediction
+
+
+
+
+
+ vaddw.u8 q5, q5, d19
+ vqmovun.s16 d2, q1
+ vaddw.u8 q7, q7, d21
+ vqmovun.s16 d4, q2
+ vaddw.u8 q6, q6, d23
+ vqmovun.s16 d6, q3
+ vaddw.u8 q8, q8, d25
+ vqmovun.s16 d8, q4
+
+
+
+
+
+
+
+ vst1.8 {d2}, [r3], r7
+ vqmovun.s16 d10, q5
+ vst1.8 {d6}, [r3], r10
+ vqmovun.s16 d14, q7
+ vst1.8 {d4}, [r0], r7
+ vqmovun.s16 d12, q6
+ vst1.8 {d8}, [r0], r10
+ vqmovun.s16 d16, q8
+
+
+
+
+
+
+
+ vst1.8 {d10}, [r3], r7
+ vst1.8 {d14}, [r3], r10
+ vst1.8 {d12}, [r0], r7
+ vst1.8 {d16}, [r0], r10
+
+
+
+
+ sub sp, sp, #40
+ ldmfd sp!, {r4-r12, pc}
+
+
+
diff --git a/common/arm/impeg2_inter_pred.s b/common/arm/impeg2_inter_pred.s
new file mode 100644
index 0000000..f1b3dde
--- /dev/null
+++ b/common/arm/impeg2_inter_pred.s
@@ -0,0 +1,801 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+
+@/*
+@//----------------------------------------------------------------------------
+@// File Name : impeg2_inter_pred.s
+@//
+@// Description : This file has motion compensation related
+@// interpolation functions on Neon + CortexA-8 platform
+@//
+@// Reference Document :
+@//
+@// Revision History :
+@// Date Author Detail Description
+@// ------------ ---------------- ----------------------------------
+@// 18 jun 2010 S Hamsalekha Created
+@//
+@//-------------------------------------------------------------------------
+@*/
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Include Files
+@// ----------------------------------------------------------------------------
+@*/
+.text
+.p2align 2
+
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Struct/Union Types and Define
+@// ----------------------------------------------------------------------------
+@*/
+
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Static Global Data section variables
+@// ----------------------------------------------------------------------------
+@*/
+@// -------------------------- NONE --------------------------------------------
+
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Static Prototype Functions
+@// ----------------------------------------------------------------------------
+@*/
+@// -------------------------- NONE --------------------------------------------
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Exported functions
+@// ----------------------------------------------------------------------------
+@*/
+
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_copy_mb_a9q()
+@//
+@// Detail Description : Copies one MB worth of data from src to the dst
+@//
+@// Inputs : r0 - pointer to src
+@// r1 - pointer to dst
+@// r2 - source width
+@// r3 - destination width
+@// Registers Used : r4, r5, d0, d1
+@//
+@// Stack Usage : 12 bytes
+@//
+@// Outputs :
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+
+
+ .global impeg2_copy_mb_a9q
+
+
+impeg2_copy_mb_a9q:
+ @ r4/r5 become the running src/dst row pointers; r14 is saved for the return
+ stmfd r13!, {r4, r5, r14}
+
+ @ Y plane: 16 rows of 16 bytes each
+ ldr r4, [r0] @src->y
+ ldr r5, [r1] @dst->y
+ @Read one row of data from the src
+ vld1.8 {d0, d1}, [r4], r2 @Load 16 bytes, then src += r2
+ vst1.8 {d0, d1}, [r5], r3 @Store 16 bytes, then dst += r3
+
+ @//Repeat 15 times for y
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+ vld1.8 {d0, d1}, [r4], r2 @Load and increment src
+ vst1.8 {d0, d1}, [r5], r3 @Store and increment dst
+
+ mov r2, r2, lsr #1 @src_offset /= 2
+ mov r3, r3, lsr #1 @dst_offset /= 2
+ @ U plane: 8 rows of 8 bytes each, stepped with the halved offsets
+ ldr r4, [r0, #4] @src->u
+ ldr r5, [r1, #4] @dst->u
+ @Read one row of data from the src
+ vld1.8 {d0}, [r4], r2 @Load 8 bytes, then src += r2
+ vst1.8 {d0}, [r5], r3 @Store 8 bytes, then dst += r3
+
+ @//Repeat 7 times for u
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ @ V plane: 8 rows of 8 bytes each, same halved offsets as U
+ ldr r4, [r0, #8] @src->v
+ ldr r5, [r1, #8] @dst->v
+ @Read one row of data from the src
+ vld1.8 {d0}, [r4], r2 @Load 8 bytes, then src += r2
+ vst1.8 {d0}, [r5], r3 @Store 8 bytes, then dst += r3
+
+ @//Repeat 7 times for v
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ vld1.8 {d0}, [r4], r2 @Load and increment src
+ vst1.8 {d0}, [r5], r3 @Store and increment dst
+ @ Restore r4/r5 and return by popping the saved r14 straight into pc
+ ldmfd r13!, {r4, r5, pc}
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_mc_fullx_halfy_8x8_a9q()
+@//
+@// Detail Description : Motion compensates one 8x8 block whose motion vector
+@// is full pel in x and half pel in y. Each output row
+@// is the rounded average of two vertically adjacent
+@// reference rows, so 8 + 1 reference rows are read.
+@//
+@// Inputs : r0 - out : Current Block Pointer
+@// r1 - ref : Reference Block Pointer
+@// r2 - ref_wid : Reference Block Width
+@// r3 - out_wid : Current Block Width
+@//
+@// Registers Used : r14, D0-D7, D16-D17
+@//
+@// Stack Usage : 4 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : D8-D15 are callee-saved (AAPCS) and are not touched
+@//-----------------------------------------------------------------------------
+@*/
+
+ .global impeg2_mc_fullx_halfy_8x8_a9q
+
+impeg2_mc_fullx_halfy_8x8_a9q:
+
+ stmfd sp!, {r14}
+ add r14, r1, r2 @// r14 walks the even reference rows (2, 4, 6, 8)
+ mov r2, r2, lsl #1 @// both pointers now step two rows per load
+
+@/* Load 8 + 1 rows from reference block */
+@/* vrhadd.u8 yields (a + b + 1) >> 1, so no separate rounding step is needed */
+ vld1.8 {d0}, [r1], r2 @// ref row 1 -> D0
+ vld1.8 {d2}, [r14], r2 @// ref row 2 -> D2
+ vld1.8 {d4}, [r1], r2 @// ref row 3 -> D4
+ vld1.8 {d6}, [r14], r2 @// ref row 4 -> D6
+ vld1.8 {d1}, [r1], r2 @// ref row 5 -> D1
+ vld1.8 {d3}, [r14], r2 @// ref row 6 -> D3
+ vrhadd.u8 d17, d1, d6 @// out row 4 = avg(ref 4, ref 5) -> D17 (must precede the q0 update)
+ vld1.8 {d5}, [r1], r2 @// ref row 7 -> D5
+ vrhadd.u8 q0, q0, q1 @// out row 1 -> D0, out row 5 -> D1
+ vld1.8 {d7}, [r14], r2 @// ref row 8 -> D7
+ vrhadd.u8 q1, q1, q2 @// out row 2 -> D2, out row 6 -> D3
+ vld1.8 {d16}, [r1], r2 @// ref row 9 -> D16
+ vrhadd.u8 q2, q2, q3 @// out row 3 -> D4, out row 7 -> D5
+
+ add r14, r0, r3 @// r14 walks the even output rows (2, 4, 6, 8)
+ mov r3, r3, lsl #1 @// both pointers now step two rows per store
+
+@/* Store the eight rows calculated above */
+ vst1.8 {d2}, [r14], r3 @// out row 2
+ vrhadd.u8 d7, d7, d16 @// out row 8 = avg(ref 8, ref 9) -> D7
+ vst1.8 {d0}, [r0], r3 @// out row 1
+ vst1.8 {d17}, [r14], r3 @// out row 4
+ vst1.8 {d4}, [r0], r3 @// out row 3
+ vst1.8 {d3}, [r14], r3 @// out row 6
+ vst1.8 {d1}, [r0], r3 @// out row 5
+ vst1.8 {d7}, [r14], r3 @// out row 8
+ vst1.8 {d5}, [r0], r3 @// out row 7
+
+ ldmfd sp!, {pc}
+
+
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_mc_halfx_fully_8x8_a9q()
+@//
+@// Detail Description : Motion compensates one 8x8 block whose motion vector
+@// is half pel in x and full pel in y. Each output
+@// pixel is the rounded average of a reference pixel
+@// and its right-hand neighbour in the same row.
+@//
+@// Inputs : r0 - out : Current Block Pointer
+@// r1 - ref : Reference Block Pointer
+@// r2 - ref_wid : Reference Block Width
+@// r3 - out_wid : Current Block Width
+@//
+@// Registers Used : r12, r14, d0-d7, d16-d18, d20-d22, d24-d26, d28-d30
+@// (d8-d15 are callee-saved under the AAPCS and are not touched)
+@//
+@// Stack Usage : 8 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : 16 bytes are loaded from every reference row
+@//-----------------------------------------------------------------------------
+@*/
+
+
+
+ .global impeg2_mc_halfx_fully_8x8_a9q
+
+
+
+impeg2_mc_halfx_fully_8x8_a9q:
+
+ stmfd sp!, {r12, lr}
+
+ add r14, r1, r2, lsl #2 @r14 -> reference row5
+
+ add r12, r0, r3, lsl#2 @r12 -> output row5
+
+ vld1.8 {d0, d1}, [r1], r2 @load 16 pixels of row1
+
+ vld1.8 {d2, d3}, [r14], r2 @load 16 pixels of row5
+
+
+ vld1.8 {d4, d5}, [r1], r2 @load 16 pixels of row2
+
+ vld1.8 {d6, d7}, [r14], r2 @load 16 pixels of row6
+
+ @Shifted copies live in d16-d31 only, so no NEON register has to be saved
+ vext.8 d24, d0, d1, #1 @Extract pixels (1-8) of row1
+
+ vext.8 d28, d2, d3, #1 @Extract pixels (1-8) of row5
+
+ vext.8 d16, d4, d5, #1 @Extract pixels (1-8) of row2
+
+ vext.8 d20, d6, d7, #1 @Extract pixels (1-8) of row6
+
+
+ vld1.8 {d25, d26}, [r1], r2 @load row3 (pixels 0-7 land in the high half of q12)
+
+ vld1.8 {d29, d30}, [r14], r2 @load row7 (pixels 0-7 land in the high half of q14)
+
+ vld1.8 {d17, d18}, [r1], r2 @load row4 (pixels 0-7 land in the high half of q8)
+
+ vld1.8 {d21, d22}, [r14], r2 @load row8 (pixels 0-7 land in the high half of q10)
+
+
+ vext.8 d1, d25, d26, #1 @Extract pixels (1-8) of row3
+
+ vext.8 d3, d29, d30, #1 @Extract pixels (1-8) of row7
+
+
+
+ vext.8 d5, d17, d18, #1 @Extract pixels (1-8) of row4
+
+ vext.8 d7, d21, d22, #1 @Extract pixels (1-8) of row8
+
+ @vrhadd.u8 yields (a + b + 1) >> 1; each q op averages two rows at once
+ vrhadd.u8 q0, q0, q12 @operate on row1 and row3
+
+ vrhadd.u8 q1, q1, q14 @operate on row5 and row7
+
+
+ vrhadd.u8 q2, q2, q8 @operate on row2 and row4
+
+
+
+ vrhadd.u8 q3, q3, q10 @operate on row6 and row8
+
+ vst1.8 d0, [r0], r3 @store row1
+
+ vst1.8 d2, [r12], r3 @store row5
+
+ vst1.8 d4, [r0], r3 @store row2
+
+ vst1.8 d6, [r12], r3 @store row6
+
+ vst1.8 d1, [r0], r3 @store row3
+
+ vst1.8 d3, [r12], r3 @store row7
+
+ vst1.8 d5, [r0], r3 @store row4
+
+ vst1.8 d7, [r12], r3 @store row8
+
+
+
+ ldmfd sp!, {r12, pc}
+
+
+
+
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_mc_halfx_halfy_8x8_a9q()
+@//
+@// Detail Description : Motion compensates one 8x8 block whose motion vector
+@// is half pel in both x and y. Each output pixel is
+@// (a + b + c + d + 2) >> 2 over the 2x2 reference
+@// neighbourhood, so 8 + 1 reference rows are read.
+@//
+@// Inputs : r0 - out : Current Block Pointer
+@// r1 - ref : Reference Block Pointer
+@// r2 - ref_wid : Reference Block Width
+@// r3 - out_wid : Current Block Width
+@//
+@// Registers Used : r14, q0-q15 (d8-d15 are saved on entry and restored on exit)
+
+@//
+@// Stack Usage : 68 bytes (r14 + d8-d15)
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : 16 bytes are loaded from every reference row
+@//-----------------------------------------------------------------------------
+@*/
+
+
+ .global impeg2_mc_halfx_halfy_8x8_a9q
+
+impeg2_mc_halfx_halfy_8x8_a9q:
+
+ stmfd sp!, {r14}
+ vpush {d8-d15} @d8-d15 are callee-saved (AAPCS) and are overwritten below
+ add r14, r1, r2, lsl #2 @r14 -> reference row5
+
+ vld1.8 {d0, d1}, [r1], r2 @load 16 pixels of row1
+
+ vld1.8 {d2, d3}, [r14], r2 @load 16 pixels of row5
+
+ vld1.8 {d4, d5}, [r1], r2 @load 16 pixels of row2
+
+ vld1.8 {d6, d7}, [r14], r2 @load 16 pixels of row6
+
+ vext.8 d1, d0, d1, #1 @Extract pixels (1-8) of row1
+
+
+
+ vext.8 d3, d2, d3, #1 @Extract pixels (1-8) of row5
+
+
+
+ vext.8 d5, d4, d5, #1 @Extract pixels (1-8) of row2
+
+ vext.8 d7, d6, d7, #1 @Extract pixels (1-8) of row6
+
+
+
+
+ vld1.8 {d8, d9}, [r1], r2 @load row3
+
+
+
+ vld1.8 {d10, d11}, [r14], r2 @load row7
+
+ vld1.8 {d12, d13}, [r1], r2 @load row4
+
+ vld1.8 {d14, d15}, [r14], r2 @load row8
+
+ vext.8 d9, d8, d9, #1 @Extract pixels (1-8) of row3
+
+ vld1.8 {d16, d17}, [r14], r2 @load row9
+
+
+
+
+
+ vext.8 d11, d10, d11, #1 @Extract pixels (1-8) of row7
+
+
+
+ vext.8 d13, d12, d13, #1 @Extract pixels (1-8) of row4
+
+
+
+ vext.8 d15, d14, d15, #1 @Extract pixels (1-8) of row8
+
+ vext.8 d17, d16, d17, #1 @Extract pixels (1-8) of row9
+
+
+ @interpolation in x direction: widen to 16 bit, p[x] + p[x+1] per row
+
+ vaddl.u8 q0, d0, d1 @operate row1
+
+ vaddl.u8 q1, d2, d3 @operate row5
+
+ vaddl.u8 q2, d4, d5 @operate row2
+
+ vaddl.u8 q3, d6, d7 @operate row6
+
+ vaddl.u8 q4, d8, d9 @operate row3
+
+ vaddl.u8 q5, d10, d11 @operate row7
+
+ vaddl.u8 q6, d12, d13 @operate row4
+
+ vaddl.u8 q7, d14, d15 @operate row8
+
+ vaddl.u8 q8, d16, d17 @operate row9
+
+ @interpolation in y direction: add adjacent row sums, then (sum + 2) >> 2
+
+ add r14, r0, r3, lsl #2 @r14 -> output row5
+
+
+
+ vadd.u16 q9, q0, q2 @operate row1 and row2
+
+ vadd.u16 q13, q1, q3 @operate row5 and row6
+
+ vadd.u16 q10, q2, q4 @operate row2 and row3
+
+ vadd.u16 q14, q3, q5 @operate row6 and row7
+
+ vrshrn.u16 d18, q9, #2 @row1
+
+ vrshrn.u16 d26, q13, #2 @row5
+
+ vrshrn.u16 d20, q10, #2 @row2
+
+ vrshrn.u16 d28, q14, #2 @row6
+
+ vadd.u16 q11, q4, q6 @operate row3 and row4
+
+ vst1.8 d18, [r0], r3 @store row1
+
+ vadd.u16 q15, q5, q7 @operate row7 and row8
+
+ vst1.8 d26, [r14], r3 @store row5
+
+ vadd.u16 q12, q6, q1 @operate row4 and row5
+
+ vst1.8 d20, [r0], r3 @store row2
+
+ vadd.u16 q7, q7, q8 @operate row8 and row9
+
+ vst1.8 d28, [r14], r3 @store row6
+
+
+
+ vrshrn.u16 d22, q11, #2 @row3
+
+ vrshrn.u16 d30, q15, #2 @row7
+
+ vrshrn.u16 d24, q12, #2 @row4
+
+ vrshrn.u16 d14, q7, #2 @row8
+
+
+ vst1.8 d22, [r0], r3 @store row3
+ vst1.8 d30, [r14], r3 @store row7
+ vst1.8 d24, [r0], r3 @store row4
+ vst1.8 d14, [r14], r3 @store row8
+
+ vpop {d8-d15} @restore the caller's d8-d15 before returning
+
+ ldmfd sp!, {pc}
+
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_mc_fullx_fully_8x8_a9q()
+@//
+@// Detail Description : This function pastes the reference block in the
+@// current frame buffer. This function is called for
+@// blocks that are not coded and have motion vectors
+@// with full pel resolution in both x and y.
+@//
+@// Inputs : r0 - out : Current Block Pointer
+@// r1 - ref : Refernce Block Pointer
+@// r2 - ref_wid : Refernce Block Width
+@// r3 - out_wid ; Current Block Width
+@//
+@// Registers Used : r12, r14, d0-d3
+
+@//
+@// Stack Usage : 8 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : <program limitation>
+@//-----------------------------------------------------------------------------
+@*/
+
+
+ .global impeg2_mc_fullx_fully_8x8_a9q
+impeg2_mc_fullx_fully_8x8_a9q:
+
+ @ Straight 8x8 copy: r1/r0 handle rows 1-4 while r14/r12 handle rows 5-8
+ stmfd sp!, {r12, lr}
+
+ add r14, r1, r2, lsl #2 @r14 -> reference row5
+
+ add r12, r0, r3, lsl #2 @r12 -> output row5
+
+
+ vld1.8 d0, [r1], r2 @load row1
+
+ vld1.8 d1, [r14], r2 @load row5
+
+ vld1.8 d2, [r1], r2 @load row2
+
+ vld1.8 d3, [r14], r2 @load row6
+
+
+ vst1.8 d0, [r0], r3 @store row1
+
+ vst1.8 d1, [r12], r3 @store row5
+
+ vst1.8 d2, [r0], r3 @store row2
+
+ vst1.8 d3, [r12], r3 @store row6
+
+
+ vld1.8 d0, [r1], r2 @load row3
+
+ vld1.8 d1, [r14], r2 @load row7
+
+ vld1.8 d2, [r1], r2 @load row4
+
+ vld1.8 d3, [r14], r2 @load row8
+
+
+ vst1.8 d0, [r0], r3 @store row3
+
+ vst1.8 d1, [r12], r3 @store row7
+
+ vst1.8 d2, [r0], r3 @store row4
+
+ vst1.8 d3, [r12], r3 @store row8
+
+
+ ldmfd sp!, {r12, pc}
+
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_interpolate_a9q()
+@//
+@// Detail Description : rounded average of src1 and src2 (y, u and v) into dst
+@//
+@// Inputs : r0 - pointer to src1 (y, u, v pointers at offsets 0, 4, 8)
+@// r1 - pointer to src2 (y, u, v pointers at offsets 0, 4, 8)
+@// r2 - dest buf (y, u, v pointers at offsets 0, 4, 8)
+@// r3 - dst stride (halved for u and v)
+@// Registers Used : r4, r5, r7, r12, r14, d0-d7, d16-d23
+@//
+@// Stack Usage : 20 bytes
+@//
+@// Outputs : The Motion Compensated Block
+@//
+@// Return Data : None
+@//
+@// Programming Note : d8-d15 are callee-saved (AAPCS) and are not touched
+@//-----------------------------------------------------------------------------
+@*/
+
+
+ .global impeg2_interpolate_a9q
+
+
+impeg2_interpolate_a9q:
+
+ stmfd r13!, {r4, r5, r7, r12, r14}
+
+ ldr r4, [r0, #0] @ptr_y src1
+
+ ldr r5, [r1, #0] @ptr_y src2
+
+ ldr r7, [r2, #0] @ptr_y dst buf
+
+ mov r12, #4 @counter for number of blocks
+
+ @luma: 4 passes of 4 rows; both sources are read as packed 16 byte rows
+interp_lumablocks_stride:
+
+ vld1.8 {d0, d1}, [r4]! @row1 src1
+
+ vld1.8 {d2, d3}, [r4]! @row2 src1
+
+ vld1.8 {d4, d5}, [r4]! @row3 src1
+
+ vld1.8 {d6, d7}, [r4]! @row4 src1
+
+ @src2 rows go to q8-q11 so that the callee-saved q4-q7 stay intact
+ vld1.8 {d16, d17}, [r5]! @row1 src2
+
+ vld1.8 {d18, d19}, [r5]! @row2 src2
+
+ vld1.8 {d20, d21}, [r5]! @row3 src2
+
+ vld1.8 {d22, d23}, [r5]! @row4 src2
+
+
+
+ @vrhadd.u8 yields (a + b + 1) >> 1
+ vrhadd.u8 q0, q0, q8 @operate on row1
+
+ vrhadd.u8 q1, q1, q9 @operate on row2
+
+ vrhadd.u8 q2, q2, q10 @operate on row3
+
+ vrhadd.u8 q3, q3, q11 @operate on row4
+
+
+
+ vst1.8 {d0, d1}, [r7], r3 @row1
+
+ vst1.8 {d2, d3}, [r7], r3 @row2
+
+ vst1.8 {d4, d5}, [r7], r3 @row3
+
+ vst1.8 {d6, d7}, [r7], r3 @row4
+
+ subs r12, r12, #1
+
+ bne interp_lumablocks_stride
+
+
+ mov r3, r3, lsr #1 @stride >> 1
+
+ ldr r4, [r0, #4] @ptr_u src1
+
+ ldr r5, [r1, #4] @ptr_u src2
+
+ ldr r7 , [r2, #4] @ptr_u dst buf
+
+ mov r12, #2 @counter for number of blocks
+
+
+
+@chroma blocks: the first pass handles u, the second pass handles v
+
+interp_chromablocks_stride:
+
+ vld1.8 {d0, d1}, [r4]! @row1 & 2 src1
+
+ vld1.8 {d2, d3}, [r4]! @row3 & 4 src1
+
+ vld1.8 {d4, d5}, [r4]! @row5 & 6 src1
+
+ vld1.8 {d6, d7}, [r4]! @row7 & 8 src1
+
+
+ vld1.8 {d16, d17}, [r5]! @row1 & 2 src2
+
+ vld1.8 {d18, d19}, [r5]! @row3 & 4 src2
+
+ vld1.8 {d20, d21}, [r5]! @row5 & 6 src2
+
+ vld1.8 {d22, d23}, [r5]! @row7 & 8 src2
+
+
+
+
+ vrhadd.u8 q0, q0, q8 @operate on row1 & 2
+
+ vrhadd.u8 q1, q1, q9 @operate on row3 & 4
+
+ vrhadd.u8 q2, q2, q10 @operate on row5 & 6
+
+ vrhadd.u8 q3, q3, q11 @operate on row7 & 8
+
+
+ vst1.8 {d0}, [r7], r3 @row1
+
+ vst1.8 {d1}, [r7], r3 @row2
+
+ vst1.8 {d2}, [r7], r3 @row3
+
+ vst1.8 {d3}, [r7], r3 @row4
+
+ vst1.8 {d4}, [r7], r3 @row5
+
+ vst1.8 {d5}, [r7], r3 @row6
+
+ vst1.8 {d6}, [r7], r3 @row7
+
+ vst1.8 {d7}, [r7], r3 @row8
+
+
+ @switch to the v pointers for the second pass (redundant after it)
+ ldr r4, [r0, #8] @ptr_v src1
+
+ ldr r5, [r1, #8] @ptr_v src2
+
+ ldr r7, [r2, #8] @ptr_v dst buf
+
+ subs r12, r12, #1
+
+ bne interp_chromablocks_stride
+
+
+ ldmfd r13!, {r4, r5, r7, r12, pc}
+
+
+
+
+
diff --git a/common/arm/impeg2_mem_func.s b/common/arm/impeg2_mem_func.s
new file mode 100755
index 0000000..869b7d7
--- /dev/null
+++ b/common/arm/impeg2_mem_func.s
@@ -0,0 +1,177 @@
+@/******************************************************************************
+@ *
+@ * Copyright (C) 2015 The Android Open Source Project
+@ *
+@ * Licensed under the Apache License, Version 2.0 (the "License");
+@ * you may not use this file except in compliance with the License.
+@ * You may obtain a copy of the License at:
+@ *
+@ * http://www.apache.org/licenses/LICENSE-2.0
+@ *
+@ * Unless required by applicable law or agreed to in writing, software
+@ * distributed under the License is distributed on an "AS IS" BASIS,
+@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+@ * See the License for the specific language governing permissions and
+@ * limitations under the License.
+@ *
+@ *****************************************************************************
+@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+@*/
+
+@/*
+@//----------------------------------------------------------------------------
+@// File Name : impeg2_mem_func.s
+@//
+@// Description : This file has block memset functions (8-bit 8x8
+@// block fill and 16-bit 8x8 block clear) on Neon + CortexA-8 platform
+@//
+@// Reference Document :
+@//
+@// Revision History :
+@// Date Author Detail Description
+@// ------------ ---------------- ----------------------------------
+@// 18 jun 2010 S Hamsalekha Created
+@//
+@//-------------------------------------------------------------------------
+@*/
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Include Files
+@// ----------------------------------------------------------------------------
+@*/
+.text
+.p2align 2
+
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Struct/Union Types and Define
+@// ----------------------------------------------------------------------------
+@*/
+
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Static Global Data section variables
+@// ----------------------------------------------------------------------------
+@*/
+@// -------------------------- NONE --------------------------------------------
+
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Static Prototype Functions
+@// ----------------------------------------------------------------------------
+@*/
+@// -------------------------- NONE --------------------------------------------
+
+@/*
+@// ----------------------------------------------------------------------------
+@// Exported functions
+@// ----------------------------------------------------------------------------
+@*/
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_memset_8bit_8x8_block_a9q()
+@//
+@// Detail Description : This routine initialises an 8x8 block of bytes to a
+@// particular value. It always writes 8 rows of 8 bytes (64 bytes
+@// in total) and steps r2 bytes from one row to the next. The
+@// original note states that the block is assumed 64 bit aligned.
+@//
+@// Inputs : r0: pi2_blk_mat : Block Pointer (address of row 1)
+@// r1: u2_val : Value with which the block is initialized (low 8 bits are used)
+@// r2: u4_dst_width: Destination Width, applied as the row-to-row step in bytes
+@//
+@// Registers Used : d0 (low half of q0); r0 is advanced by 8 * r2
+@//
+@// Stack Usage : 4 bytes (lr is stored on entry and loaded into pc on exit)
+@//
+@// Outputs : Block Matrix Initialized to given value
+@//
+@// Return Data : None
+@//
+@// Programming Note : None
+@//-----------------------------------------------------------------------------
+@*/
+ .global impeg2_memset_8bit_8x8_block_a9q
+impeg2_memset_8bit_8x8_block_a9q:
+ str lr, [sp, #-4]! @//Save the return address on the stack
+
+ vdup.8 d0, r1 @//Replicate the low byte of r1 into all 8 byte lanes of d0
+
+ vst1.8 {d0}, [r0], r2 @//Store the row 1, then advance r0 by r2
+ vst1.8 {d0}, [r0], r2 @//Store the row 2, then advance r0 by r2
+ vst1.8 {d0}, [r0], r2 @//Store the row 3, then advance r0 by r2
+ vst1.8 {d0}, [r0], r2 @//Store the row 4, then advance r0 by r2
+ vst1.8 {d0}, [r0], r2 @//Store the row 5, then advance r0 by r2
+ vst1.8 {d0}, [r0], r2 @//Store the row 6, then advance r0 by r2
+ vst1.8 {d0}, [r0], r2 @//Store the row 7, then advance r0 by r2
+ vst1.8 {d0}, [r0], r2 @//Store the row 8, then advance r0 by r2
+
+ ldr pc, [sp], #4 @//Load the saved return address into pc (return)
+
+
+
+
+
+
+
+@/*
+@//---------------------------------------------------------------------------
+@// Function Name : impeg2_memset0_16bit_8x8_linear_block_a9q()
+@//
+@// Detail Description : memsets 128 byte long linear buf to 0 (8 stores of 16 bytes each)
+@//
+@// Inputs : r0 - Buffer (start address; r0 is advanced by 128 bytes)
+@// Registers Used : q0
+@//
+@//
+@// Stack Usage : 4 bytes
+@//
+@// Outputs : None
+@//
+@// Return Data : None
+@//
+@// Programming Note : The buffer is written contiguously; no row stride is applied
+@//-----------------------------------------------------------------------------
+@*/
+
+
+
+ .global impeg2_memset0_16bit_8x8_linear_block_a9q
+
+
+impeg2_memset0_16bit_8x8_linear_block_a9q:
+
+ stmfd r13!, {r14} @Save the return address on the stack
+
+ vmov.i16 q0, #0 @q0 = eight 16-bit zeros
+
+@Each store below writes 8 zeroed 16-bit values (16 bytes) and advances r0 by 16
+
+ vst1.16 {d0, d1} , [r0]! @row1
+
+ vst1.16 {d0, d1} , [r0]! @row2
+
+ vst1.16 {d0, d1} , [r0]! @row3
+
+ vst1.16 {d0, d1} , [r0]! @row4
+
+ vst1.16 {d0, d1} , [r0]! @row5
+
+ vst1.16 {d0, d1} , [r0]! @row6
+
+ vst1.16 {d0, d1} , [r0]! @row7
+
+ vst1.16 {d0, d1} , [r0]! @row8
+
+
+
+ ldmfd r13!, {pc} @Pop the saved return address into pc (return)
+
+
+
+
diff --git a/common/arm/impeg2_platform_macros.h b/common/arm/impeg2_platform_macros.h
new file mode 100644
index 0000000..11db302
--- /dev/null
+++ b/common/arm/impeg2_platform_macros.h
@@ -0,0 +1,75 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+#ifndef __IMPEG2_PLATFORM_MACROS_H__
+#define __IMPEG2_PLATFORM_MACROS_H__
+
+/* Byte-swaps u4_temp1 into u4_temp2 (assumes an unsigned 32-bit operand - TODO confirm); expansion ends in ';' and reads u4_temp1 four times. */
+#define CONV_LE_TO_BE(u4_temp2,u4_temp1) (u4_temp2) = \
+ ((u4_temp1) << 24) | \
+ (((u4_temp1) & 0xff00) << 8) | \
+ (((u4_temp1) & 0xff0000) >> 8) | \
+ ((u4_temp1) >> 24);
+
+static __inline UWORD32 CLZ(UWORD32 u4_word)
+{
+ /* __builtin_clz() is undefined for 0, so a zero word reports 32. */
+ if(0 == u4_word)
+ return 32;
+ return __builtin_clz(u4_word);
+}
+static __inline WORD32 CLIP_U8(WORD32 x) /* Returns x clamped to the unsigned 8-bit range [0, 255]. */
+{
+ asm("usat %0, #8, %1" : "=r"(x) : "r"(x)); /* ARM USAT: unsigned saturate to 8 bits */
+ return x;
+}
+
+static __inline WORD32 CLIP_S8(WORD32 x) /* Returns x clamped to the signed 8-bit range [-128, 127]. */
+{
+ asm("ssat %0, #8, %1" : "=r"(x) : "r"(x)); /* ARM SSAT: signed saturate to 8 bits */
+ return x;
+}
+
+static __inline WORD32 CLIP_U12(WORD32 x) /* Returns x clamped to the unsigned 12-bit range [0, 4095]. */
+{
+ asm("usat %0, #12, %1" : "=r"(x) : "r"(x)); /* ARM USAT: unsigned saturate to 12 bits */
+ return x;
+}
+
+static __inline WORD32 CLIP_S12(WORD32 x) /* Returns x clamped to the signed 12-bit range [-2048, 2047]. */
+{
+ asm("ssat %0, #12, %1" : "=r"(x) : "r"(x)); /* ARM SSAT: signed saturate to 12 bits */
+ return x;
+}
+
+static __inline WORD32 CLIP_U16(WORD32 x) /* Returns x clamped to the unsigned 16-bit range [0, 65535]. */
+{
+ asm("usat %0, #16, %1" : "=r"(x) : "r"(x)); /* ARM USAT: unsigned saturate to 16 bits */
+ return x;
+}
+static __inline WORD32 CLIP_S16(WORD32 x) /* Returns x clamped to the signed 16-bit range [-32768, 32767]. */
+{
+ asm("ssat %0, #16, %1" : "=r"(x) : "r"(x)); /* ARM SSAT: signed saturate to 16 bits */
+ return x;
+}
+
+#define INLINE /* expands to nothing in this platform header */
+#define PLD(x) __pld(x) /* forwards to __pld(), which this header does not define - presumably a toolchain preload intrinsic; verify */
+
+#endif /* __IMPEG2_PLATFORM_MACROS_H__ */
diff --git a/common/armv8/impeg2_format_conv.s b/common/armv8/impeg2_format_conv.s
new file mode 100644
index 0000000..48baf04
--- /dev/null
+++ b/common/armv8/impeg2_format_conv.s
@@ -0,0 +1,409 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+
+///*
+////----------------------------------------------------------------------------
+//// File Name : impeg2_format_conv.s
+////
+//// Description : This file has the YUV420P to YUV420SP format
+//// conversion functions for the neon platform.
+////
+//// Reference Document :
+////
+//// Revision History :
+//// Date Author Detail Description
+//// ------------ ---------------- ----------------------------------
+//// Jul 07, 2008 Naveen Kumar T Created
+////
+////-------------------------------------------------------------------------
+//*/
+
+///*
+//// ----------------------------------------------------------------------------
+//// Include Files
+//// ----------------------------------------------------------------------------
+//*/
+.set log2_16 , 4
+.set log2_2 , 1
+
+.text
+.include "impeg2_neon_macros.s"
+///*
+//// ----------------------------------------------------------------------------
+//// Struct/Union Types and Define
+//// ----------------------------------------------------------------------------
+//*/
+
+///*
+//// ----------------------------------------------------------------------------
+//// Static Global Data section variables
+//// ----------------------------------------------------------------------------
+//*/
+////--------------------------- NONE --------------------------------------------
+
+///*
+//// ----------------------------------------------------------------------------
+//// Static Prototype Functions
+//// ----------------------------------------------------------------------------
+//*/
+//// -------------------------- NONE --------------------------------------------
+
+///*
+//// ----------------------------------------------------------------------------
+//// Exported functions
+//// ----------------------------------------------------------------------------
+//*/
+
+
+///*****************************************************************************
+//* *
+//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8() *
+//* *
+//* Description : This function converts the image from YUV420P color *
+//* space to 420SP color space(UV interleaved). *
+//* Luma is skipped when convert_uv_only is 1. *
+//* Arguments : x0 pu1_y *
+//* x1 pu1_u *
+//* x2 pu1_v *
+//* x3 pu1_dest_y *
+//* x4 pu1_dest_uv *
+//* x5 u2_height *
+//* x6 u2_width *
+//* x7 u2_stridey *
+//* sp, #80 u2_strideu *
+//* sp, #88 u2_stridev *
+//* sp, #96 u2_dest_stride_y *
+//* sp, #104 u2_dest_stride_uv *
+//* sp, #112 convert_uv_only *
+//* (sp offsets are those seen after the 80 byte save) *
+//* Values Returned : None *
+//* *
+//* Register Usage : x8, x14, x16, x19, x20, v0, v1 *
+//* (u2_stridev is not read; u2_strideu steps both planes) *
+//* Stack Usage : 80 Bytes *
+//* *
+//* Interruptibility : Interruptible *
+//* *
+//* Known Limitations *
+//* Assumptions: Image Width: Assumed to be multiple of 16 and *
+//* greater than or equal to 16 *
+//* Image Height: Assumed to be even. *
+//* *
+//* Revision History : *
+//* DD MM YYYY Author(s) Changes (Describe the changes made) *
+//* 07 06 2010 Varshita Draft *
+//* 07 06 2010 Naveen Kr T Completed *
+//* *
+//*****************************************************************************/
+.global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
+impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:
+
+ //// push the registers on the stack
+ // pu1_y, - x0
+ // pu1_u, - x1
+ // pu1_v, - x2
+ // pu1_dest_y, - x3
+ // pu1_dest_uv, - x4
+ // u2_height, - x5
+ // u2_width, - x6
+ // u2_stridey, - x7
+ // u2_strideu, - sp, #80
+ // u2_stridev, - sp, #88
+ // u2_dest_stride_y, - sp, #96
+ // u2_dest_stride_uv, - sp, #104
+ // convert_uv_only - sp, #112
+ // STMFD sp!,{x4-x12,x14}
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ ldr w14, [sp, #112] //// Load convert_uv_only
+
+ cmp w14, #1
+ beq yuv420sp_uv_chroma //// convert_uv_only == 1: skip the luma copy
+ ///* Do the preprocessing before the main loops start */
+ //// Load the parameters from stack
+ //// NOTE(review): x5-x7 are used as 64-bit values without extension - confirm the caller leaves their upper halves clean
+ ldr w8, [sp, #96] //// Load u2_dest_stride_y from stack
+ uxtw x8, w8
+
+ sub x7, x7, x6 //// Source increment
+
+ sub x8, x8, x6 //// Destination increment
+ //// Luma rows are counted in x19 so that u2_height in x5 survives for the chroma pass
+ mov x19, x5
+yuv420sp_uv_row_loop_y:
+ mov x16, x6
+
+yuv420sp_uv_col_loop_y:
+ prfm pldl1keep, [x0, #128]
+ ld1 {v0.8b, v1.8b}, [x0], #16
+ st1 {v0.8b, v1.8b}, [x3], #16
+ sub x16, x16, #16
+ cmp x16, #15
+ bgt yuv420sp_uv_col_loop_y
+
+ cmp x16, #0
+ beq yuv420sp_uv_row_loop__y
+ ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+ ////Ex if width is 162, above loop will process 160 pixels. And
+ ////Both source and destination will point to 146th pixel and then 16 bytes will be read
+ //// and written using LD1 and ST1
+ sub x20, x16, #16
+ neg x16, x20
+ sub x0, x0, x16
+ sub x3, x3, x16
+
+ ld1 {v0.8b, v1.8b}, [x0], #16
+ st1 {v0.8b, v1.8b}, [x3], #16
+
+yuv420sp_uv_row_loop__y:
+ add x0, x0, x7
+ add x3, x3, x8
+ subs x19, x19, #1
+ bgt yuv420sp_uv_row_loop_y
+
+yuv420sp_uv_chroma:
+ ldr w7, [sp, #80] //// Load u2_strideu from stack
+ sxtw x7, w7
+
+ ldr w8, [sp, #104] //// Load u2_dest_stride_uv from stack
+ sxtw x8, w8
+
+ sub x7, x7, x6, lsr #1 //// Source increment
+
+ sub x8, x8, x6 //// Destination increment
+
+ lsr x6, x6, #1 //// Chroma width = width / 2
+ lsr x5, x5, #1 //// Chroma rows = height / 2
+yuv420sp_uv_row_loop_uv:
+ mov x16, x6
+
+
+yuv420sp_uv_col_loop_uv:
+ prfm pldl1keep, [x1, #128]
+ prfm pldl1keep, [x2, #128]
+
+ ld1 {v0.8b}, [x1], #8 //// 8 U samples
+ ld1 {v1.8b}, [x2], #8 //// 8 V samples
+ st2 {v0.8b, v1.8b}, [x4], #16 //// Interleave as U0 V0 U1 V1 ...
+
+ sub x16, x16, #8
+ cmp x16, #7
+ bgt yuv420sp_uv_col_loop_uv
+
+ cmp x16, #0
+ beq yuv420sp_uv_row_loop__uv
+ ////If chroma width is a non-multiple of 8, then go back by few bytes to ensure 8 bytes can be read
+ ////Ex if chroma width is 81, above loop will process 80 samples. And
+ ////both sources will point to the 73rd sample (destination to the 146th byte) and then 8 bytes
+ //// of U and of V will be read using LD1 and 16 interleaved bytes written using ST2
+ sub x20, x16, #8
+ neg x16, x20
+ sub x1, x1, x16
+ sub x2, x2, x16
+ sub x4, x4, x16, lsl #1
+
+ ld1 {v0.8b}, [x1], #8
+ ld1 {v1.8b}, [x2], #8
+ st2 {v0.8b, v1.8b}, [x4], #16
+
+yuv420sp_uv_row_loop__uv:
+ add x1, x1, x7
+ add x2, x2, x7
+ add x4, x4, x8
+ subs x5, x5, #1
+ bgt yuv420sp_uv_row_loop_uv
+ ////POP THE REGISTERS
+ // LDMFD sp!,{x4-x12,PC}
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
+
+
+///*****************************************************************************
+//* *
+//* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8() *
+//* *
+//* Description : This function converts the image from YUV420P color *
+//* space to 420SP color space(VU interleaved). *
+//* This function is similar to the function *
+//* impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8 with a *
+//* difference in LD1 for chroma - order of registers is different *
+//* Luma is skipped when convert_uv_only is 1. *
+//* Arguments : x0 pu1_y *
+//* x1 pu1_u *
+//* x2 pu1_v *
+//* x3 pu1_dest_y *
+//* x4 pu1_dest_uv *
+//* x5 u2_height *
+//* x6 u2_width *
+//* x7 u2_stridey *
+//* sp, #80 u2_strideu *
+//* sp, #88 u2_stridev *
+//* sp, #96 u2_dest_stride_y *
+//* sp, #104 u2_dest_stride_uv *
+//* sp, #112 convert_uv_only *
+//* (sp offsets are those seen after the 80 byte save) *
+//* Values Returned : None *
+//* *
+//* Register Usage : x8, x14, x16, x19, x20, v0, v1 *
+//* (u2_stridev is not read; u2_strideu steps both planes) *
+//* Stack Usage : 80 Bytes *
+//* *
+//* Interruptibility : Interruptible *
+//* *
+//* Known Limitations *
+//* Assumptions: Image Width: Assumed to be multiple of 16 and *
+//* greater than or equal to 16 *
+//* Image Height: Assumed to be even. *
+//* *
+//* Revision History : *
+//* DD MM YYYY Author(s) Changes (Describe the changes made) *
+//* 07 06 2010 Varshita Draft *
+//* 07 06 2010 Naveen Kr T Completed *
+//* *
+//*****************************************************************************/
+
+.global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
+impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:
+
+ //// push the registers on the stack
+ // pu1_y, - x0
+ // pu1_u, - x1
+ // pu1_v, - x2
+ // pu1_dest_y, - x3
+ // pu1_dest_uv, - x4
+ // u2_height, - x5
+ // u2_width, - x6
+ // u2_stridey, - x7
+ // u2_strideu, - sp, #80
+ // u2_stridev, - sp, #88
+ // u2_dest_stride_y, - sp, #96
+ // u2_dest_stride_uv, - sp, #104
+ // convert_uv_only - sp, #112
+ // STMFD sp!,{x4-x12,x14}
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ ldr w14, [sp, #112] //// Load convert_uv_only
+
+ cmp w14, #1
+ beq yuv420sp_vu_chroma //// convert_uv_only == 1: skip the luma copy
+
+ ///* Do the preprocessing before the main loops start */
+ //// Load the parameters from stack
+ //// NOTE(review): x5-x7 are used as 64-bit values without extension - confirm the caller leaves their upper halves clean
+ ldr w8, [sp, #96] //// Load u2_dest_stride_y from stack
+ uxtw x8, w8
+
+ sub x7, x7, x6 //// Source increment
+
+ sub x8, x8, x6 //// Destination increment
+ //// Luma rows are counted in x19 so that u2_height in x5 survives for the chroma pass
+ mov x19, x5
+yuv420sp_vu_row_loop_y:
+ mov x16, x6
+
+yuv420sp_vu_col_loop_y:
+ prfm pldl1keep, [x0, #128]
+ ld1 {v0.8b, v1.8b}, [x0], #16
+ st1 {v0.8b, v1.8b}, [x3], #16
+ sub x16, x16, #16
+ cmp x16, #15
+ bgt yuv420sp_vu_col_loop_y
+
+ cmp x16, #0
+ beq yuv420sp_vu_row_loop__y
+ ////If non-multiple of 16, then go back by few bytes to ensure 16 bytes can be read
+ ////Ex if width is 162, above loop will process 160 pixels. And
+ ////Both source and destination will point to 146th pixel and then 16 bytes will be read
+ //// and written using LD1 and ST1
+ sub x20, x16, #16
+ neg x16, x20
+ sub x0, x0, x16
+ sub x3, x3, x16
+
+ ld1 {v0.8b, v1.8b}, [x0], #16
+ st1 {v0.8b, v1.8b}, [x3], #16
+
+yuv420sp_vu_row_loop__y:
+ add x0, x0, x7
+ add x3, x3, x8
+ subs x19, x19, #1
+ bgt yuv420sp_vu_row_loop_y
+
+yuv420sp_vu_chroma:
+ ldr w7, [sp, #80] //// Load u2_strideu from stack
+ sxtw x7, w7
+
+ ldr w8, [sp, #104] //// Load u2_dest_stride_uv from stack
+ sxtw x8, w8
+
+ sub x7, x7, x6, lsr #1 //// Source increment
+
+ sub x8, x8, x6 //// Destination increment
+
+ lsr x6, x6, #1 //// Chroma width = width / 2
+ lsr x5, x5, #1 //// Chroma rows = height / 2
+yuv420sp_vu_row_loop_uv:
+ mov x16, x6
+
+
+yuv420sp_vu_col_loop_uv:
+ prfm pldl1keep, [x1, #128]
+ prfm pldl1keep, [x2, #128]
+ ld1 {v1.8b}, [x1], #8 //// 8 U samples go to the second ST2 register
+ ld1 {v0.8b}, [x2], #8 //// 8 V samples go to the first ST2 register
+ st2 {v0.8b, v1.8b}, [x4], #16 //// Interleave as V0 U0 V1 U1 ...
+ sub x16, x16, #8
+ cmp x16, #7
+ bgt yuv420sp_vu_col_loop_uv
+
+ cmp x16, #0
+ beq yuv420sp_vu_row_loop__uv
+ ////If chroma width is a non-multiple of 8, then go back by few bytes to ensure 8 bytes can be read
+ ////Ex if chroma width is 81, above loop will process 80 samples. And
+ ////both sources will point to the 73rd sample (destination to the 146th byte) and then 8 bytes
+ //// of U and of V will be read using LD1 and 16 interleaved bytes written using ST2
+ sub x20, x16, #8
+ neg x16, x20
+ sub x1, x1, x16
+ sub x2, x2, x16
+ sub x4, x4, x16, lsl #1
+
+ ld1 {v1.8b}, [x1], #8
+ ld1 {v0.8b}, [x2], #8
+ st2 {v0.8b, v1.8b}, [x4], #16
+
+yuv420sp_vu_row_loop__uv:
+ add x1, x1, x7
+ add x2, x2, x7
+ add x4, x4, x8
+ subs x5, x5, #1
+ bgt yuv420sp_vu_row_loop_uv
+ ////POP THE REGISTERS
+ // LDMFD sp!,{x4-x12,PC}
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
diff --git a/common/armv8/impeg2_idct.s b/common/armv8/impeg2_idct.s
new file mode 100644
index 0000000..4956e54
--- /dev/null
+++ b/common/armv8/impeg2_idct.s
@@ -0,0 +1,1247 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+// *******************************************************************************
+// * @file
+// * impeg2_idct.s
+// *
+// * @brief
+// * contains function definitions for single stage inverse transform
+// *
+// * @author
+// * anand s
+// *
+// * @par list of functions:
+// * - impeg2_idct_recon_dc_av8()
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+//*/
+
+///**
+// *******************************************************************************
+// *
+// * @brief
+// * this function performs inverse transform and reconstruction for 8x8
+// * input block
+// *
+// * @par description:
+// * performs inverse transform and adds the prediction data and clips output
+// * to 8 bit
+// *
+// * @param[in] pi2_src
+// * input 8x8 coefficients
+// *
+// * @param[in] pi2_tmp
+// * temporary 8x8 buffer for storing inverse
+// *
+// * transform
+// * 1st stage output
+// *
+// * @param[in] pu1_pred
+// * prediction 8x8 block
+// *
+// * @param[out] pu1_dst
+// * output 8x8 block
+// *
+// * @param[in] src_strd
+// * input stride
+// *
+// * @param[in] pred_strd
+// * prediction stride
+// *
+// * @param[in] dst_strd
+// * output stride
+// *
+// * @param[in] shift
+// * output shift
+// *
+// * @param[in] zero_cols
+// * zero columns in pi2_src
+// *
+// * @returns void
+// *
+// * @remarks
+// * none
+// *
+// *******************************************************************************
+// */
+
+//void impeg2_itrans_recon_8x8(word16 *pi2_src,
+// word16 *pi2_tmp,
+// uword8 *pu1_pred,
+// uword8 *pu1_dst,
+// word32 src_strd,
+// word32 pred_strd,
+// word32 dst_strd,
+// word32 zero_cols
+// word32 zero_rows )
+
+//**************variables vs registers*************************
+// x0 => *pi2_src
+// x1 => *pi2_tmp
+// x2 => *pu1_pred
+// x3 => *pu1_dst
+// src_strd
+// pred_strd
+// dst_strd
+// zero_cols
+
+
+
+.text
+.align 4
+.include "impeg2_neon_macros.s"
+
+.set idct_stg1_shift , 12
+.set idct_stg2_shift , 16
+.set idct_stg1_round , (1 << (idct_stg1_shift - 1))
+.set idct_stg2_round , (1 << (idct_stg2_shift - 1))
+
+.extern gai2_impeg2_idct_q15
+.extern gai2_impeg2_idct_q11
+.extern gai2_impeg2_idct_first_col_q15
+.extern gai2_impeg2_idct_first_col_q11
+.extern gai2_impeg2_mismatch_stg2_additive
+
+.global impeg2_idct_recon_dc_av8
+impeg2_idct_recon_dc_av8:
+ // STMFD sp!,{x4,x6,x12,x14}
+ push_v_regs //// macro from impeg2_neon_macros.s (not visible here); presumably saves the callee-saved SIMD registers - verify
+ ////x0: pi2_src - only the first (DC) coefficient is read
+ ////x1: pi2_tmp - not used
+ ////x2: pu1_pred
+ ////x3: pu1_dst
+ ////x4: used as scratch
+ ////x5: pred_strd
+ ////x6: dst_strd
+ //// DC-only path: one value derived from pi2_src[0] is added to all 64 prediction samples
+ ldrsh x4, [x0] //// x4 = pi2_src[0], sign extended
+ adrp x14, :got:gai2_impeg2_idct_q15
+ ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
+ ldrsh x12, [x14] //// x12 = gai2_impeg2_idct_q15[0]
+ //// Prediction row loads are interleaved with the scalar DC arithmetic below
+ ld1 {v0.8b}, [x2], x5 //// pred row 0
+ mul x4, x4, x12 //// stage 1: dc * q15[0]
+
+ ld1 {v1.8b}, [x2], x5 //// pred row 1
+ add x4, x4, #idct_stg1_round //// + (1 << 11)
+
+ ld1 {v2.8b}, [x2], x5 //// pred row 2
+ asr x4, x4, #idct_stg1_shift //// >> 12
+
+ adrp x14, :got:gai2_impeg2_idct_q11
+ ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
+ ldrsh x12, [x14] //// x12 = gai2_impeg2_idct_q11[0]
+
+ ld1 {v3.8b}, [x2], x5 //// pred row 3
+ mul x4, x4, x12 //// stage 2: stage 1 result * q11[0]
+
+ ld1 {v4.8b}, [x2], x5 //// pred row 4
+ add x4, x4, #idct_stg2_round //// + (1 << 15)
+
+ ld1 {v5.8b}, [x2], x5 //// pred row 5
+ asr x4, x4, #idct_stg2_shift //// >> 16
+
+ ld1 {v6.8b}, [x2], x5 //// pred row 6
+ dup v30.8h, w4 //// broadcast the DC term to 8 halfword lanes
+
+
+ ld1 {v7.8b}, [x2], x5 //// pred row 7
+ //// Per row: widen-add pred bytes to the DC term, saturate to unsigned 8 bit, store to dst
+ uaddw v8.8h, v30.8h , v0.8b
+
+ uaddw v10.8h, v30.8h , v1.8b
+ sqxtun v0.8b, v8.8h
+
+ uaddw v12.8h, v30.8h , v2.8b
+ sqxtun v1.8b, v10.8h
+ st1 {v0.8b}, [x3], x6 //// dst row 0
+
+ uaddw v14.8h, v30.8h , v3.8b
+ sqxtun v2.8b, v12.8h
+ st1 {v1.8b}, [x3], x6 //// dst row 1
+
+ uaddw v16.8h, v30.8h , v4.8b
+ sqxtun v3.8b, v14.8h
+ st1 {v2.8b}, [x3], x6 //// dst row 2
+
+ uaddw v18.8h, v30.8h , v5.8b
+ sqxtun v4.8b, v16.8h
+ st1 {v3.8b}, [x3], x6 //// dst row 3
+
+ uaddw v20.8h, v30.8h , v6.8b
+ sqxtun v5.8b, v18.8h
+ st1 {v4.8b}, [x3], x6 //// dst row 4
+
+ uaddw v22.8h, v30.8h , v7.8b
+ sqxtun v6.8b, v20.8h
+ st1 {v5.8b}, [x3], x6 //// dst row 5
+
+ sqxtun v7.8b, v22.8h
+ st1 {v6.8b}, [x3], x6 //// dst row 6
+
+
+ st1 {v7.8b}, [x3], x6 //// dst row 7
+
+ // LDMFD sp!,{x4,x6,x12,pc}
+ pop_v_regs //// counterpart of push_v_regs above
+ ret
+
+
+
+.global impeg2_idct_recon_dc_mismatch_av8
+.extern gai2_impeg2_idct_last_row_q11
+.extern gai2_impeg2_mismatch_stg1_outp
+impeg2_idct_recon_dc_mismatch_av8:
+ // STMFD sp!,{x4-x12,x14}
+ push_v_regs //// macro from impeg2_neon_macros.s (not visible here); presumably saves the callee-saved SIMD registers - verify
+ //// x0: pi2_src (only element 0 is read), x2: pu1_pred, x3: pu1_dst, x5: pred_strd, x6: dst_strd
+ ldrsh x4, [x0] //// x4 = pi2_src[0], sign extended
+ adrp x14, :got:gai2_impeg2_idct_q15
+ ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
+ ldrsh x12, [x14] //// x12 = gai2_impeg2_idct_q15[0]
+
+ mul x4, x4, x12 //// stage 1: dc * q15[0]
+ add x4, x4, #idct_stg1_round //// + (1 << 11)
+ asr x4, x4, #idct_stg1_shift //// >> 12
+
+ adrp x14, :got:gai2_impeg2_idct_q11
+ ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
+ ldrsh x12, [x14] //// x12 = gai2_impeg2_idct_q11[0]
+ mul x4, x4, x12 //// stage 2 product; its rounding and >> 16 are done per row by raddhn below
+ dup v0.4s, w4 //// broadcast the 32-bit product to 4 word lanes
+
+ mov x14, #16 ////Increment for table read
+ adrp x4, :got:gai2_impeg2_mismatch_stg2_additive
+ ldr x4, [x4, #:got_lo12:gai2_impeg2_mismatch_stg2_additive]
+ //// Row 1 (the same nine-instruction sequence is repeated for each of the 8 rows)
+ ld1 {v2.4h, v3.4h}, [x4], x14 //// 8 halfword additive terms for this row; table pointer += 16
+ ld1 {v30.8b}, [x2], x5 //// 8 prediction bytes; pred pointer += pred_strd
+ sxtl v8.4s, v2.4h //// sign extend additive terms 0-3 to 32 bit
+ sxtl v10.4s, v3.4h //// sign extend additive terms 4-7 to 32 bit
+ raddhn v12.4h, v0.4s, v8.4s //// (product + additive + (1 << 15)) >> 16 for columns 0-3
+ raddhn2 v12.8h, v0.4s, v10.4s //// same for columns 4-7, into the upper half of v12
+ uaddw v14.8h, v12.8h , v30.8b //// add the zero-extended prediction bytes
+ sqxtun v30.8b, v14.8h //// saturate to unsigned 8 bit
+ st1 {v30.8b}, [x3], x6 //// store the row; dst pointer += dst_strd
+ //// Row 2
+ ld1 {v2.4h, v3.4h}, [x4], x14
+ ld1 {v30.8b}, [x2], x5
+ sxtl v8.4s, v2.4h
+ sxtl v10.4s, v3.4h
+ raddhn v12.4h, v0.4s, v8.4s
+ raddhn2 v12.8h, v0.4s, v10.4s
+ uaddw v14.8h, v12.8h , v30.8b
+ sqxtun v30.8b, v14.8h
+ st1 {v30.8b}, [x3], x6
+ //// Row 3
+ ld1 {v2.4h, v3.4h}, [x4], x14
+ ld1 {v30.8b}, [x2], x5
+ sxtl v8.4s, v2.4h
+ sxtl v10.4s, v3.4h
+ raddhn v12.4h, v0.4s, v8.4s
+ raddhn2 v12.8h, v0.4s, v10.4s
+ uaddw v14.8h, v12.8h , v30.8b
+ sqxtun v30.8b, v14.8h
+ st1 {v30.8b}, [x3], x6
+ //// Row 4
+ ld1 {v2.4h, v3.4h}, [x4], x14
+ ld1 {v30.8b}, [x2], x5
+ sxtl v8.4s, v2.4h
+ sxtl v10.4s, v3.4h
+ raddhn v12.4h, v0.4s, v8.4s
+ raddhn2 v12.8h, v0.4s, v10.4s
+ uaddw v14.8h, v12.8h , v30.8b
+ sqxtun v30.8b, v14.8h
+ st1 {v30.8b}, [x3], x6
+ //// Row 5
+ ld1 {v2.4h, v3.4h}, [x4], x14
+ ld1 {v30.8b}, [x2], x5
+ sxtl v8.4s, v2.4h
+ sxtl v10.4s, v3.4h
+ raddhn v12.4h, v0.4s, v8.4s
+ raddhn2 v12.8h, v0.4s, v10.4s
+ uaddw v14.8h, v12.8h , v30.8b
+ sqxtun v30.8b, v14.8h
+ st1 {v30.8b}, [x3], x6
+ //// Row 6
+ ld1 {v2.4h, v3.4h}, [x4], x14
+ ld1 {v30.8b}, [x2], x5
+ sxtl v8.4s, v2.4h
+ sxtl v10.4s, v3.4h
+ raddhn v12.4h, v0.4s, v8.4s
+ raddhn2 v12.8h, v0.4s, v10.4s
+ uaddw v14.8h, v12.8h , v30.8b
+ sqxtun v30.8b, v14.8h
+ st1 {v30.8b}, [x3], x6
+ //// Row 7
+ ld1 {v2.4h, v3.4h}, [x4], x14
+ ld1 {v30.8b}, [x2], x5
+ sxtl v8.4s, v2.4h
+ sxtl v10.4s, v3.4h
+ raddhn v12.4h, v0.4s, v8.4s
+ raddhn2 v12.8h, v0.4s, v10.4s
+ uaddw v14.8h, v12.8h , v30.8b
+ sqxtun v30.8b, v14.8h
+ st1 {v30.8b}, [x3], x6
+ //// Row 8
+ ld1 {v2.4h, v3.4h}, [x4], x14
+ ld1 {v30.8b}, [x2], x5
+ sxtl v8.4s, v2.4h
+ sxtl v10.4s, v3.4h
+ raddhn v12.4h, v0.4s, v8.4s
+ raddhn2 v12.8h, v0.4s, v10.4s
+ uaddw v14.8h, v12.8h , v30.8b
+ sqxtun v30.8b, v14.8h
+ st1 {v30.8b}, [x3], x6
+
+
+ // LDMFD sp!,{x4-x12,pc}
+ pop_v_regs //// counterpart of push_v_regs above
+ ret
+
+.globl impeg2_idct_recon_av8
+
+.type impeg2_idct_recon_av8, %function
+
+impeg2_idct_recon_av8:
+////register usage.extern - loading and until idct of columns
+//// cosine constants - d0
+//// sine constants - d1
+//// row 0 first half - d2 - y0
+//// row 1 first half - d6 - y1
+//// row 2 first half - d3 - y2
+//// row 3 first half - d7 - y3
+//// row 4 first half - d10 - y4
+//// row 5 first half - d14 - y5
+//// row 6 first half - d11 - y6
+//// row 7 first half - d15 - y7
+
+//// row 0 second half - d4 - y0
+//// row 1 second half - d8 - y1
+//// row 2 second half - d5 - y2
+//// row 3 second half - d9 - y3
+//// row 4 second half - d12 - y4
+//// row 5 second half - d16 - y5
+//// row 6 second half - d13 - y6
+//// row 7 second half - d17 - y7
+
+ //// copy the input pointer to another register
+ //// step 1 : load all constants
+ // stmfd sp!,{x4-x12,x14}
+
+ ldr w11, [sp] // zero rows
+
+ push_v_regs
+ stp x19, x20, [sp, #-16]!
+
+ mov x12, x7 // zero columns
+ mov x8, x5 // prediction stride
+ mov x7, x6 // destination stride
+ mov x6, x4 // src stride
+ lsl x6, x6, #1 // x sizeof(word16)
+ add x9, x0, x6, lsl #1 // 2 rows
+
+ add x10, x6, x6, lsl #1 // 3 rows
+
+ sub x10, x10, #8 // - 4 cols * sizeof(word16)
+ sub x5, x6, #8 // src_strd - 4 cols * sizeof(word16)
+
+ adrp x14, :got:gai2_impeg2_idct_first_col_q15
+ ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
+ ld1 {v0.4h, v1.4h}, [x14] ////d0,d1 are used for storing the constant data
+
+ ////step 2 load all the input data
+ ////step 3 operate first 4 colums at a time
+
+ and x11, x11, #0xff
+ and x12, x12, #0xff
+
+ cmp x11, #0xf0
+ bge skip_last4_rows
+
+
+ ld1 {v2.4h}, [x0], #8
+ ld1 {v3.4h}, [x9], #8
+ ld1 {v4.4h}, [x0], x5
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ ld1 {v5.4h}, [x9], x5
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ ld1 {v6.4h}, [x0], #8
+ ld1 {v7.4h}, [x9], #8
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ ld1 {v8.4h}, [x0], x10
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ ld1 {v9.4h}, [x9], x10
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ ld1 {v10.4h}, [x0], #8
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ ld1 {v11.4h}, [x9], #8
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ ld1 {v12.4h}, [x0], x5
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ ld1 {v13.4h}, [x9], x5
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ ld1 {v14.4h}, [x0], #8
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ ld1 {v15.4h}, [x9], #8
+ smull v22.4s, v10.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ ld1 {v16.4h}, [x0], x10
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ ld1 {v17.4h}, [x9], x10
+
+ ///* this following was activated when alignment is not there */
+//// vld1.16 d2,[x0]!
+//// vld1.16 d3,[x2]!
+//// vld1.16 d4,[x0]!
+//// vld1.16 d5,[x2]!
+//// vld1.16 d6,[x0]!
+//// vld1.16 d7,[x2]!
+//// vld1.16 d8,[x0],x3
+//// vld1.16 d9,[x2],x3
+//// vld1.16 d10,[x0]!
+//// vld1.16 d11,[x2]!
+//// vld1.16 d12,[x0]!
+//// vld1.16 d13,[x2]!
+//// vld1.16 d14,[x0]!
+//// vld1.16 d15,[x2]!
+//// vld1.16 d16,[x0],x3
+//// vld1.16 d17,[x2],x3
+
+
+
+
+ smlal v24.4s, v14.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v14.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v14.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v14.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ smlsl v18.4s, v11.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v11.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ add v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ smlal v24.4s, v15.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v15.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v15.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v15.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+
+ add v14.4s, v10.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
+ sub v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+ add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0)
+ sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7)
+
+ add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2)
+ sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5)
+
+ add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1)
+ sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6)
+
+ add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3)
+ sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4)
+
+ sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+
+
+ b last4_cols
+
+
+
+skip_last4_rows:
+ adrp x14, :got:gai2_impeg2_idct_first_col_q15
+ ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
+ ld1 {v0.4h, v1.4h}, [x14]
+
+ ld1 {v2.4h}, [x0], #8
+ ld1 {v3.4h}, [x9], #8
+ ld1 {v4.4h}, [x0], x5
+ ld1 {v5.4h}, [x9], x5
+ ld1 {v6.4h}, [x0], #8
+ ld1 {v7.4h}, [x9], #8
+ ld1 {v8.4h}, [x0], x10
+ ld1 {v9.4h}, [x9], x10
+
+
+
+ movi v12.4h, #0
+ movi v13.4h, #0
+ movi v16.4h, #0
+ movi v17.4h, #0
+
+
+
+
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+
+
+ add v14.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
+ sub v10.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+ add v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0)
+ sub v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7)
+
+ add v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2)
+ sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5)
+
+ add v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1)
+ sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6)
+
+ add v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3)
+ sub v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4)
+
+ sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+
+
+last4_cols:
+ adrp x14, :got:gai2_impeg2_idct_first_col_q15
+ ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
+ ld1 {v0.4h, v1.4h}, [x14]
+
+
+ cmp x12, #0xf0
+ bge skip_last4cols
+
+ smull v24.4s, v8.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v8.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v8.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v8.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v9.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v9.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v9.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v9.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+ smull v18.4s, v5.4h, v1.4h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
+ smull v8.4s, v5.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+ smull v20.4s, v4.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+
+ smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v8.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
+ smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
+ smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
+ smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
+
+ add v16.4s, v12.4s , v8.4s //// a0 = c0 + d0(part of e0,e7)
+ sub v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4)
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of e2,e5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of e1,e6)
+
+ add v20.4s, v16.4s , v24.4s //// a0 + b0(part of e0)
+ sub v8.4s, v16.4s , v24.4s //// a0 - b0(part of e7)
+
+ add v24.4s, v22.4s , v28.4s //// a2 + b2(part of e2)
+ sub v22.4s, v22.4s , v28.4s //// a2 - b2(part of e5)
+
+ add v28.4s, v18.4s , v26.4s //// a1 + b1(part of e1)
+ sub v18.4s, v18.4s , v26.4s //// a1 - b1(part of e6)
+
+ add v26.4s, v12.4s , v30.4s //// a3 + b3(part of e3)
+ sub v30.4s, v12.4s , v30.4s //// a3 - b3(part of x4)
+
+ sqrshrn v4.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v17.4h, v8.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v5.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v16.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v8.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v13.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v9.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+ sqrshrn v12.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
+ b end_skip_last4cols
+
+
+
+skip_last4cols:
+ adrp x14, :got:gai2_impeg2_idct_first_col_q11
+ ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
+ ld1 {v0.4h, v1.4h}, [x14]
+
+ umov x15, v25.d[0]
+
+ trn1 v25.4h, v2.4h, v6.4h
+ trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first qudrant transposing
+
+ trn1 v27.4h, v3.4h, v7.4h
+ trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first qudrant transposing
+
+ trn1 v6.2s, v29.2s, v31.2s
+ trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first qudrant transposing continued.....
+ trn1 v2.2s, v25.2s, v27.2s
+ trn2 v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first qudrant transposing continued.....
+
+
+ trn1 v25.4h, v10.4h, v14.4h
+ trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third qudrant transposing
+
+ trn1 v27.4h, v11.4h, v15.4h
+ trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third qudrant transposing
+
+ trn1 v10.2s, v25.2s, v27.2s
+ trn2 v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third qudrant transposing continued.....
+ trn1 v14.2s, v29.2s, v31.2s
+ trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third qudrant transposing continued.....
+
+ mov v25.d[0], x15
+
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+// vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1)
+
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+
+
+
+ sub v22.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
+ add v4.4s, v20.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
+
+
+ add v2.4s, v4.4s , v24.4s
+
+ sub v6.4s, v4.4s , v24.4s
+
+ add v8.4s, v22.4s , v30.4s
+
+ sub v24.4s, v22.4s , v30.4s
+
+ sqrshrn v5.4h, v8.4s, #idct_stg2_shift
+ sqrshrn v2.4h, v2.4s, #idct_stg2_shift
+ sqrshrn v9.4h, v6.4s, #idct_stg2_shift
+ sqrshrn v6.4h, v24.4s, #idct_stg2_shift
+
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+
+ add v30.4s, v22.4s , v28.4s
+
+ sub v24.4s, v22.4s , v28.4s
+
+ add v28.4s, v18.4s , v26.4s
+
+ sub v22.4s, v18.4s , v26.4s
+ sqrshrn v4.4h, v30.4s, #idct_stg2_shift
+ sqrshrn v7.4h, v24.4s, #idct_stg2_shift
+ sqrshrn v3.4h, v28.4s, #idct_stg2_shift
+ sqrshrn v8.4h, v22.4s, #idct_stg2_shift
+
+
+
+ umov x19, v25.d[0]
+ umov x20, v25.d[1]
+
+ trn1 v27.4h, v2.4h, v3.4h
+ trn2 v29.4h, v2.4h, v3.4h
+ trn1 v25.4h, v4.4h, v5.4h
+ trn2 v31.4h, v4.4h, v5.4h
+
+ trn1 v2.2s, v27.2s, v25.2s
+ trn2 v4.2s, v27.2s, v25.2s
+ trn1 v3.2s, v29.2s, v31.2s
+ trn2 v5.2s, v29.2s, v31.2s
+
+ trn1 v27.4h, v6.4h, v7.4h
+ trn2 v29.4h, v6.4h, v7.4h
+ trn1 v25.4h, v8.4h, v9.4h
+ trn2 v31.4h, v8.4h, v9.4h
+
+ trn1 v6.2s, v27.2s, v25.2s
+ trn2 v8.2s, v27.2s, v25.2s
+ trn1 v7.2s, v29.2s, v31.2s
+ trn2 v9.2s, v29.2s, v31.2s
+
+ mov v25.d[0], x19
+ mov v25.d[1], x20
+
+ smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
+
+ smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+
+ add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
+
+
+ add x5, x8, x8, lsl #1 //
+
+
+ add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data
+
+
+ add x10, x7, x7, lsl #1 //
+
+ // swapping v3 and v6
+ mov v31.d[0], v3.d[0]
+ mov v3.d[0], v6.d[0]
+ mov v6.d[0], v31.d[0]
+
+ // swapping v5 and v8
+ mov v31.d[0], v5.d[0]
+ mov v5.d[0], v8.d[0]
+ mov v8.d[0], v31.d[0]
+
+
+ sub v22.4s, v20.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
+ add v12.4s, v20.4s , v14.4s //// a0 = c0 + d0(part of x0,x7)
+
+
+ add v0.4s, v12.4s , v24.4s
+
+
+ sub v24.4s, v12.4s , v24.4s
+
+
+ add v12.4s, v22.4s , v30.4s
+
+
+ sub v14.4s, v22.4s , v30.4s
+
+ sqrshrn v10.4h, v0.4s, #idct_stg2_shift
+ sqrshrn v17.4h, v24.4s, #idct_stg2_shift
+ sqrshrn v13.4h, v12.4s, #idct_stg2_shift
+ sqrshrn v14.4h, v14.4s, #idct_stg2_shift
+
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+
+ add v0.4s, v22.4s , v28.4s
+
+
+ sub v24.4s, v22.4s , v28.4s
+
+
+ add v28.4s, v18.4s , v26.4s
+
+
+ sub v26.4s, v18.4s , v26.4s
+ ld1 {v18.8b}, [x2], x8
+
+ sqrshrn v12.4h, v0.4s, #idct_stg2_shift
+ ld1 {v20.8b}, [x2], x5
+
+
+ sqrshrn v15.4h, v24.4s, #idct_stg2_shift
+ ld1 {v19.8b}, [x2], x8
+
+
+
+
+ sqrshrn v11.4h, v28.4s, #idct_stg2_shift
+ ld1 {v22.8b}, [x4], x8
+
+
+
+
+ sqrshrn v16.4h, v26.4s, #idct_stg2_shift
+ ld1 {v21.8b}, [x2], x5
+
+
+ b pred_buff_addition
+end_skip_last4cols:
+ adrp x14, :got:gai2_impeg2_idct_first_col_q11
+ ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
+ ld1 {v0.4h, v1.4h}, [x14]
+
+
+ umov x19, v25.d[0]
+ umov x20, v25.d[1]
+
+///* now the idct of columns is done, transpose so that row idct done efficiently(step5) */
+ trn1 v27.4h, v2.4h, v6.4h
+ trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first qudrant transposing
+ trn1 v25.4h, v3.4h, v7.4h
+ trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first qudrant transposing
+
+ trn1 v2.2s, v27.2s, v25.2s
+ trn2 v3.2s, v27.2s, v25.2s ////x0,x1,x2,x3 first qudrant transposing continued.....
+ trn1 v6.2s, v29.2s, v31.2s
+ trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first qudrant transposing continued.....
+
+ trn1 v27.4h, v4.4h, v8.4h
+ trn2 v29.4h, v4.4h, v8.4h ////[x3,x1],[x2,x0] second qudrant transposing
+ trn1 v25.4h, v5.4h, v9.4h
+ trn2 v31.4h, v5.4h, v9.4h ////[x3,x1],[x2,x0] second qudrant transposing
+
+ trn1 v4.2s, v27.2s, v25.2s
+ trn2 v5.2s, v27.2s, v25.2s ////x0,x1,x2,x3 second qudrant transposing continued.....
+ trn1 v8.2s, v29.2s, v31.2s
+ trn2 v9.2s, v29.2s, v31.2s ////x0,x1,x2,x3 second qudrant transposing continued.....
+
+ trn1 v27.4h, v10.4h, v14.4h
+ trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third qudrant transposing
+ trn1 v25.4h, v11.4h, v15.4h
+ trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third qudrant transposing
+
+ trn1 v10.2s, v27.2s, v25.2s
+ trn2 v11.2s, v27.2s, v25.2s ////x4,x5,x6,x7 third qudrant transposing continued.....
+ trn1 v14.2s, v29.2s, v31.2s
+ trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third qudrant transposing continued.....
+
+ trn1 v27.4h, v12.4h, v16.4h
+ trn2 v29.4h, v12.4h, v16.4h ////[x7,x5],[x6,x4] fourth qudrant transposing
+ trn1 v25.4h, v13.4h, v17.4h
+ trn2 v31.4h, v13.4h, v17.4h ////[x7,x5],[x6,x4] fourth qudrant transposing
+
+ trn1 v12.2s, v27.2s, v25.2s
+ trn2 v13.2s, v27.2s, v25.2s ////x4,x5,x6,x7 fourth qudrant transposing continued.....
+ trn1 v16.2s, v29.2s, v31.2s
+ trn2 v17.2s, v29.2s, v31.2s ////x4,x5,x6,x7 fourth qudrant transposing continued.....
+
+ mov v25.d[0], x19
+ mov v25.d[1], x20
+
+ ////step6 operate on first four rows and find their idct
+ ////register usage.extern - storing and idct of rows
+//// cosine constants - d0
+//// sine constants - d1
+//// element 0 first four - d2 - y0
+//// element 1 first four - d6 - y1
+//// element 2 first four - d3 - y2
+//// element 3 first four - d7 - y3
+//// element 4 first four - d4 - y4
+//// element 5 first four - d8 - y5
+//// element 6 first four - d5 - y6
+//// element 7 first four - d9 - y7
+//// element 0 second four - d10 - y0
+//// element 1 second four - d14 - y1
+//// element 2 second four - d11 - y2
+//// element 3 second four - d15 - y3
+//// element 4 second four - d12 - y4
+//// element 5 second four - d16 - y5
+//// element 6 second four - d13 - y6
+//// element 7 second four - d17 - y7
+
+ //// map between first kernel code seq and current
+//// d2 -> d2
+//// d6 -> d6
+//// d3 -> d3
+//// d7 -> d7
+//// d10 -> d4
+//// d14 -> d8
+//// d11 -> d5
+//// d15 -> d9
+//// q3 -> q3
+//// q5 -> q2
+//// q7 -> q4
+
+ smull v24.4s, v6.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v6.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v6.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v6.4h, v1.4h[3] //// y1 * sin1(part of b3)
+
+ smlal v24.4s, v7.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v7.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v7.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v7.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+
+ smull v20.4s, v2.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v4.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+
+ smull v18.4s, v3.4h, v1.4h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
+ smull v6.4s, v3.4h, v0.4h[2] //// y2 * cos2(part of d0)
+
+
+ smlal v24.4s, v8.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+ smlsl v26.4s, v8.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+ smlal v28.4s, v8.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+ smlal v30.4s, v8.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ smlsl v18.4s, v5.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+ smlal v6.4s, v5.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ add v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ smlal v24.4s, v9.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+ smlsl v26.4s, v9.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ smlal v28.4s, v9.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v9.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+
+ sub v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
+ add v4.4s, v2.4s , v6.4s //// a0 = c0 + d0(part of x0,x7)
+
+
+ add v2.4s, v4.4s , v24.4s
+
+ sub v6.4s, v4.4s , v24.4s
+
+ add v8.4s, v22.4s , v30.4s
+
+ sub v24.4s, v22.4s , v30.4s
+
+ sqrshrn v5.4h, v8.4s, #idct_stg2_shift
+ sqrshrn v2.4h, v2.4s, #idct_stg2_shift
+ sqrshrn v9.4h, v6.4s, #idct_stg2_shift
+ sqrshrn v6.4h, v24.4s, #idct_stg2_shift
+
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+
+ add v30.4s, v22.4s , v28.4s
+
+ sub v24.4s, v22.4s , v28.4s
+
+ add v28.4s, v18.4s , v26.4s
+
+ sub v22.4s, v18.4s , v26.4s
+ sqrshrn v4.4h, v30.4s, #idct_stg2_shift
+ sqrshrn v7.4h, v24.4s, #idct_stg2_shift
+ sqrshrn v3.4h, v28.4s, #idct_stg2_shift
+ sqrshrn v8.4h, v22.4s, #idct_stg2_shift
+
+
+
+ umov x19, v25.d[0]
+ umov x20, v25.d[1]
+
+ trn1 v27.4h, v2.4h, v3.4h
+ trn2 v29.4h, v2.4h, v3.4h
+ trn1 v25.4h, v4.4h, v5.4h
+ trn2 v31.4h, v4.4h, v5.4h
+
+ trn1 v2.2s, v27.2s, v25.2s
+ trn2 v4.2s, v27.2s, v25.2s
+ trn1 v3.2s, v29.2s, v31.2s
+ trn2 v5.2s, v29.2s, v31.2s
+
+ trn1 v27.4h, v6.4h, v7.4h
+ trn2 v29.4h, v6.4h, v7.4h
+ trn1 v25.4h, v8.4h, v9.4h
+ trn2 v31.4h, v8.4h, v9.4h
+
+ trn1 v6.2s, v27.2s, v25.2s
+ trn2 v8.2s, v27.2s, v25.2s
+ trn1 v7.2s, v29.2s, v31.2s
+ trn2 v9.2s, v29.2s, v31.2s
+
+ mov v25.d[0], x19
+ mov v25.d[1], x20
+
+
+
+ smull v24.4s, v14.4h, v0.4h[1] //// y1 * cos1(part of b0)
+ smull v26.4s, v14.4h, v0.4h[3] //// y1 * cos3(part of b1)
+ smull v28.4s, v14.4h, v1.4h[1] //// y1 * sin3(part of b2)
+ smull v30.4s, v14.4h, v1.4h[3] //// y1 * sin1(part of b3)
+ smlal v24.4s, v15.4h, v0.4h[3] //// y1 * cos1 + y3 * cos3(part of b0)
+ smlsl v26.4s, v15.4h, v1.4h[3] //// y1 * cos3 - y3 * sin1(part of b1)
+ smlsl v28.4s, v15.4h, v0.4h[1] //// y1 * sin3 - y3 * cos1(part of b2)
+ smlsl v30.4s, v15.4h, v1.4h[1] //// y1 * sin1 - y3 * sin3(part of b3)
+ smull v20.4s, v10.4h, v0.4h[0] //// y0 * cos4(part of c0 and c1)
+ smull v22.4s, v12.4h, v0.4h[0] //// y4 * cos4(part of c0 and c1)
+ smull v18.4s, v11.4h, v1.4h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
+ smull v14.4s, v11.4h, v0.4h[2] //// y2 * cos2(part of d0)
+ smlal v24.4s, v16.4h, v1.4h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
+
+ add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
+ smlsl v26.4s, v16.4h, v0.4h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
+
+ add x5, x8, x8, lsl #1 //
+ smlal v28.4s, v16.4h, v1.4h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
+
+ add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data
+ smlal v30.4s, v16.4h, v0.4h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
+
+ add x10, x7, x7, lsl #1 //
+ smlsl v18.4s, v13.4h, v0.4h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
+
+
+ smlal v14.4s, v13.4h, v1.4h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
+
+ add v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
+ sub v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
+
+ smlal v24.4s, v17.4h, v1.4h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
+
+ // swapping v3 and v6
+ mov v31.d[0], v3.d[0]
+ mov v3.d[0], v6.d[0]
+ mov v6.d[0], v31.d[0]
+
+ smlsl v26.4s, v17.4h, v1.4h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
+ // swapping v5 and v8
+ mov v31.d[0], v5.d[0]
+ mov v5.d[0], v8.d[0]
+ mov v8.d[0], v31.d[0]
+
+ smlal v28.4s, v17.4h, v0.4h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
+ smlsl v30.4s, v17.4h, v0.4h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
+
+ sub v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
+ add v12.4s, v12.4s , v14.4s //// a0 = c0 + d0(part of x0,x7)
+
+
+ add v0.4s, v12.4s , v24.4s
+
+
+ sub v24.4s, v12.4s , v24.4s
+
+
+ add v12.4s, v22.4s , v30.4s
+
+
+ sub v14.4s, v22.4s , v30.4s
+
+ sqrshrn v10.4h, v0.4s, #idct_stg2_shift
+ sqrshrn v17.4h, v24.4s, #idct_stg2_shift
+ sqrshrn v13.4h, v12.4s, #idct_stg2_shift
+ sqrshrn v14.4h, v14.4s, #idct_stg2_shift
+
+ sub v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
+ add v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
+
+
+ add v0.4s, v22.4s , v28.4s
+
+
+ sub v24.4s, v22.4s , v28.4s
+
+
+ add v28.4s, v18.4s , v26.4s
+
+
+ sub v26.4s, v18.4s , v26.4s
+ ld1 {v18.8b}, [x2], x8
+
+ sqrshrn v12.4h, v0.4s, #idct_stg2_shift
+ ld1 {v20.8b}, [x2], x5
+
+
+ sqrshrn v15.4h, v24.4s, #idct_stg2_shift
+ ld1 {v19.8b}, [x2], x8
+
+
+
+
+ sqrshrn v11.4h, v28.4s, #idct_stg2_shift
+ ld1 {v22.8b}, [x4], x8
+
+
+
+
+ sqrshrn v16.4h, v26.4s, #idct_stg2_shift
+ ld1 {v21.8b}, [x2], x5
+
+
+
+
+pred_buff_addition:
+
+ umov x19, v25.d[0]
+ umov x20, v25.d[1]
+
+ trn1 v27.4h, v10.4h, v11.4h
+ trn2 v29.4h, v10.4h, v11.4h
+ trn1 v25.4h, v12.4h, v13.4h
+ trn2 v31.4h, v12.4h, v13.4h
+
+ trn1 v10.2s, v27.2s, v25.2s
+ trn2 v12.2s, v27.2s, v25.2s
+ trn1 v11.2s, v29.2s, v31.2s
+ trn2 v13.2s, v29.2s, v31.2s
+
+ trn1 v27.4h, v14.4h, v15.4h
+ trn2 v29.4h, v14.4h, v15.4h
+ trn1 v25.4h, v16.4h, v17.4h
+ trn2 v31.4h, v16.4h, v17.4h
+
+ trn1 v14.2s, v27.2s, v25.2s
+ trn2 v16.2s, v27.2s, v25.2s
+ trn1 v15.2s, v29.2s, v31.2s
+ trn2 v17.2s, v29.2s, v31.2s
+
+
+ mov v25.d[0], x19
+ mov v25.d[1], x20
+
+
+ ld1 {v24.8b}, [x4], x5
+ ld1 {v23.8b}, [x4], x8
+ ld1 {v25.8b}, [x4], x5
+ mov v2.d[1], v3.d[0]
+ mov v4.d[1], v5.d[0]
+ mov v6.d[1], v7.d[0]
+ mov v8.d[1], v9.d[0]
+ uaddw v2.8h, v2.8h , v18.8b
+ uaddw v4.8h, v4.8h , v22.8b
+ uaddw v6.8h, v6.8h , v20.8b
+ uaddw v8.8h, v8.8h , v24.8b
+
+ // swapping v11 and v14
+ mov v31.d[0], v11.d[0]
+ mov v11.d[0], v14.d[0]
+ mov v14.d[0], v31.d[0]
+
+ // swapping v13 and v16
+ mov v31.d[0], v13.d[0]
+ mov v13.d[0], v16.d[0]
+ mov v16.d[0], v31.d[0]
+// row values stored in the q register.
+
+//q1 :x0
+//q3: x1
+//q2: x2
+//q4: x3
+//q5: x4
+//q7: x5
+//q6: x6
+//q8: x7
+
+
+
+///// adding the prediction buffer
+
+
+
+
+
+
+
+
+
+ // load prediction data
+
+
+
+
+
+ //adding recon with prediction
+
+
+
+
+ mov v10.d[1], v11.d[0]
+ mov v12.d[1], v13.d[0]
+ mov v14.d[1], v15.d[0]
+ mov v16.d[1], v17.d[0]
+ uaddw v10.8h, v10.8h , v19.8b
+ sqxtun v2.8b, v2.8h
+ uaddw v14.8h, v14.8h , v21.8b
+ sqxtun v4.8b, v4.8h
+ uaddw v12.8h, v12.8h , v23.8b
+ sqxtun v6.8b, v6.8h
+ uaddw v16.8h, v16.8h , v25.8b
+ sqxtun v8.8b, v8.8h
+
+
+
+
+
+
+
+ st1 {v2.8b}, [x3], x7
+ sqxtun v10.8b, v10.8h
+ st1 {v6.8b}, [x3], x10
+ sqxtun v14.8b, v14.8h
+ st1 {v4.8b}, [x0], x7
+ sqxtun v12.8b, v12.8h
+ st1 {v8.8b}, [x0], x10
+ sqxtun v16.8b, v16.8h
+
+
+
+
+
+
+
+ st1 {v10.8b}, [x3], x7
+ st1 {v14.8b}, [x3], x10
+ st1 {v12.8b}, [x0], x7
+ st1 {v16.8b}, [x0], x10
+
+
+
+
+ // ldmfd sp!,{x4-x12,pc}
+ ldp x19, x20, [sp], #16
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/armv8/impeg2_inter_pred.s b/common/armv8/impeg2_inter_pred.s
new file mode 100644
index 0000000..98ade45
--- /dev/null
+++ b/common/armv8/impeg2_inter_pred.s
@@ -0,0 +1,814 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+
+///*
+////----------------------------------------------------------------------------
+//// File Name : impeg2_inter_pred.s
+////
+//// Description : This file has motion compensation related
+//// interpolation functions for the ARMv8 (AArch64) NEON platform
+////
+//// Reference Document :
+////
+//// Revision History :
+//// Date Author Detail Description
+//// ------------ ---------------- ----------------------------------
+//// 18 jun 2010 S Hamsalekha Created
+////
+////-------------------------------------------------------------------------
+//*/
+
+///*
+//// ----------------------------------------------------------------------------
+//// Include Files
+//// ----------------------------------------------------------------------------
+//*/
+// PRESERVE8
+.text
+.include "impeg2_neon_macros.s"
+
+///*
+//// ----------------------------------------------------------------------------
+//// Struct/Union Types and Define
+//// ----------------------------------------------------------------------------
+//*/
+
+
+///*
+//// ----------------------------------------------------------------------------
+//// Static Global Data section variables
+//// ----------------------------------------------------------------------------
+//*/
+//// -------------------------- NONE --------------------------------------------
+
+
+///*
+//// ----------------------------------------------------------------------------
+//// Static Prototype Functions
+//// ----------------------------------------------------------------------------
+//*/
+//// -------------------------- NONE --------------------------------------------
+
+///*
+//// ----------------------------------------------------------------------------
+//// Exported functions
+//// ----------------------------------------------------------------------------
+//*/
+
+
+///*
+////---------------------------------------------------------------------------
+//// Function Name : impeg2_copy_mb_av8()
+////
+//// Detail Description : Copies one MB worth of data from src to the dst:
+//// 16 rows of 16 bytes (y), then 8 rows of 8 bytes each for u and v
+//// Inputs : x0 - pointer to src; the y, u and v plane pointers are
+//// loaded from offsets 0, 8 and 16 of this struct
+//// x1 - pointer to dst, same layout as src
+//// x2 - source width, used as the y row stride (halved for u, v)
+//// x3 - destination width, used as the y row stride (halved for u, v)
+//// Registers Used : x2, x3, x4, x5, v0, v1
+////
+//// Stack Usage : 64 bytes
+////
+//// Outputs : The copied MB in the dst y, u and v planes
+////
+//// Return Data : None
+//// Programming Note : NOTE(review): x2/x3 are used as 64-bit post-index offsets; assumes the caller passes them zero-extended - confirm
+////-----------------------------------------------------------------------------
+//*/
+
+
+
+.global impeg2_copy_mb_av8
+
+
+impeg2_copy_mb_av8:
+//NOTE(review): only v0/v1 are used below; confirm push_v_regs/pop_v_regs are still needed here
+//STMFD x13!,{x4,x5,x12,x14}
+ push_v_regs
+
+
+ ldr x4, [x0] //x4 = src->y (pointer at offset 0 of src)
+ ldr x5, [x1] //x5 = dst->y (pointer at offset 0 of dst)
+
+ //y row 0: copy 16 bytes (v0 and v1 carry 8 bytes each)
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+
+ ////Repeat 15 more times for y rows 1..15
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b, v1.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b, v1.8b}, [x5], x3 //Store and increment dst
+
+ lsr x2, x2, #1 //src_offset /= 2 (u and v rows step by half the y stride)
+ lsr x3, x3, #1 //dst_offset /= 2
+
+ ldr x4, [x0, #8] //src->u
+ ldr x5, [x1, #8] //dst->u
+
+ //u row 0: copy 8 bytes
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+
+ ////Repeat 7 more times for u rows 1..7
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+
+ ldr x4, [x0, #16] //src->v
+ ldr x5, [x1, #16] //dst->v
+
+ //v row 0: copy 8 bytes
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+
+ ////Repeat 7 more times for v rows 1..7
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+ ld1 {v0.8b}, [x4], x2 //Load and increment src
+ st1 {v0.8b}, [x5], x3 //Store and increment dst
+
+//LDMFD x13!,{x4,x5,x12,PC}
+ pop_v_regs
+ ret
+
+
+///*
+////---------------------------------------------------------------------------
+//// Function Name : impeg2_mc_fullx_halfy_8x8_av8()
+////
+//// Detail Description : This function pastes the reference block in the
+//// current frame buffer. This function is called for
+//// blocks that are not coded and have motion vectors
+//// with a half pel resolution.
+////
+//// Inputs : x0 - out : Current Block Pointer
+//// x1 - ref : Reference Block Pointer
+//// x2 - ref_wid : Reference Block Width (used as row stride)
+//// x3 - out_wid : Current Block Width (used as row stride)
+////
+//// Registers Used : x14, v0-v9 (x0, x1, x2, x3 are modified)
+////
+//// Stack Usage : 64 bytes
+////
+//// Outputs : The Motion Compensated Block
+////
+//// Return Data : None
+////
+//// Programming Note : Reads 9 reference rows; out row i = urhadd(ref row i, ref row i+1), 8 bytes per row
+////-----------------------------------------------------------------------------
+//*/
+
+.global impeg2_mc_fullx_halfy_8x8_av8
+
+impeg2_mc_fullx_halfy_8x8_av8:
+//NOTE(review): v8 and v9 are written below; presumably push_v_regs preserves them - confirm in impeg2_neon_macros.s
+//STMFD x13!,{x12,x14}
+ push_v_regs
+ add x14, x1, x2 //// x14 -> ref row 2 (x1 -> ref row 1)
+ lsl x2, x2, #1 //// step two rows at a time: x1 walks rows 1,3,5,7,9 and x14 rows 2,4,6,8
+
+///* Load 8 + 1 rows from reference block */
+///* urhadd yields (a + b + 1) >> 1, so the rounding value 1 needs no separate add */
+ ld1 {v0.8b}, [x1], x2 //// ref row 1 -> v0
+ ld1 {v2.8b}, [x14], x2 //// ref row 2 -> v2
+ ld1 {v4.8b}, [x1], x2 //// ref row 3 -> v4
+ ld1 {v6.8b}, [x14], x2 //// ref row 4 -> v6
+ ld1 {v1.8b}, [x1], x2 //// ref row 5 -> v1
+ ld1 {v3.8b}, [x14], x2 //// ref row 6 -> v3
+ urhadd v9.8b, v1.8b , v6.8b //// out row 4 = avg(ref rows 5, 4) -> v9
+ ld1 {v5.8b}, [x1], x2 //// ref row 7 -> v5
+ urhadd v0.16b, v0.16b , v2.16b //// out row 1 = avg(ref rows 1, 2) -> v0 (only the low 8 bytes are stored)
+ urhadd v1.16b, v1.16b , v3.16b //// out row 5 = avg(ref rows 5, 6) -> v1
+ ld1 {v7.8b}, [x14], x2 //// ref row 8 -> v7
+ urhadd v2.16b, v2.16b , v4.16b //// out row 2 = avg(ref rows 2, 3) -> v2
+ urhadd v3.16b, v3.16b , v5.16b //// out row 6 = avg(ref rows 6, 7) -> v3
+ ld1 {v8.8b}, [x1], x2 //// ref row 9 -> v8
+ urhadd v4.16b, v4.16b , v6.16b //// out row 3 = avg(ref rows 3, 4) -> v4
+ urhadd v5.16b, v5.16b , v7.16b //// out row 7 = avg(ref rows 7, 8) -> v5
+
+ add x14, x0, x3 //// x14 -> out row 2 (x0 -> out row 1)
+ lsl x3, x3, #1 //// step two output rows at a time
+
+///* Store the eight rows calculated above */
+ st1 {v2.8b}, [x14], x3 //// out row 2 (v2)
+ urhadd v7.8b, v7.8b , v8.8b //// out row 8 = avg(ref rows 8, 9) -> v7
+ st1 {v0.8b}, [x0], x3 //// out row 1 (v0)
+ st1 {v9.8b}, [x14], x3 //// out row 4 (v9)
+ st1 {v4.8b}, [x0], x3 //// out row 3 (v4)
+ st1 {v3.8b}, [x14], x3 //// out row 6 (v3)
+ st1 {v1.8b}, [x0], x3 //// out row 5 (v1)
+ st1 {v7.8b}, [x14], x3 //// out row 8 (v7)
+ st1 {v5.8b}, [x0], x3 //// out row 7 (v5)
+
+// LDMFD sp!,{x12,pc}
+ pop_v_regs
+ ret
+
+
+
+
+
+///*
+////---------------------------------------------------------------------------
+//// Function Name : impeg2_mc_halfx_fully_8x8_av8()
+////
+//// Detail Description : Writes an 8x8 block in which every output pixel is
+//// the rounded average (urhadd) of a reference pixel
+//// and its right-hand neighbour, i.e. half-pel in x
+//// and full-pel in y. Eight reference rows are read.
+////
+//// Inputs : x0 - out : Current Block Pointer
+//// x1 - ref : Reference Block Pointer
+//// x2 - ref_wid : Reference row stride in bytes
+//// x3 - out_wid : Current Block row stride in bytes
+////
+//// Registers Used : x12, x14, v0-v10, v12-v14, v16-v18, v20-v22
+
+////
+//// Stack Usage : 64 bytes (d8-d15 saved by push_v_regs)
+////
+//// Outputs : The Motion Compensated Block
+////
+//// Return Data : None
+////
+//// Programming Note : Each row load fetches 16 bytes; pixels 0-8 are used
+////-----------------------------------------------------------------------------
+//*/
+
+
+
+.global impeg2_mc_halfx_fully_8x8_av8
+
+
+
+impeg2_mc_halfx_fully_8x8_av8:
+
+ // STMFD sp!,{x12,x14}
+ push_v_regs
+
+ add x14, x1, x2, lsl #2 //x14 -> reference row5 (x1 + 4 * stride)
+
+ add x12, x0, x3, lsl#2 //x12 -> output row5 (x0 + 4 * stride)
+
+ ld1 {v0.8b, v1.8b}, [x1], x2 //load 16 pixels of row1 (v0 = pixels 0-7, v1 = pixels 8-15)
+
+ ld1 {v2.8b, v3.8b}, [x14], x2 //load 16 pixels of row5
+
+
+ ld1 {v4.8b, v5.8b}, [x1], x2 //load 16 pixels of row2
+
+ ld1 {v6.8b, v7.8b}, [x14], x2 //load 16 pixels of row6
+
+
+ ext v8.8b, v0.8b , v1.8b , #1 //row1 pixels 1-8 (right-hand neighbours)
+
+ ext v12.8b, v2.8b , v3.8b , #1 //row5 pixels 1-8
+
+ ext v16.8b, v4.8b , v5.8b , #1 //row2 pixels 1-8
+
+ ext v20.8b, v6.8b , v7.8b , #1 //row6 pixels 1-8
+
+
+ ld1 {v9.8b, v10.8b}, [x1], x2 //load row3
+
+ ld1 {v13.8b, v14.8b}, [x14], x2 //load row7
+
+ ld1 {v17.8b, v18.8b}, [x1], x2 //load row4
+
+ ld1 {v21.8b, v22.8b}, [x14], x2 //load row8
+
+
+ ext v1.8b, v9.8b , v10.8b , #1 //row3 pixels 1-8 (v1 is reused)
+
+ ext v3.8b, v13.8b , v14.8b , #1 //row7 pixels 1-8 (v3 is reused)
+
+
+
+ ext v5.8b, v17.8b , v18.8b , #1 //row4 pixels 1-8 (v5 is reused)
+
+ ext v7.8b, v21.8b , v22.8b , #1 //row8 pixels 1-8 (v7 is reused)
+
+
+ urhadd v0.16b, v0.16b , v8.16b //row1: avg(pixel, right neighbour); only the low 8 bytes are stored
+ urhadd v1.16b, v1.16b , v9.16b //row3: avg(right neighbour, pixel)
+
+ urhadd v2.16b, v2.16b , v12.16b //row5
+ urhadd v3.16b, v3.16b , v13.16b //row7
+
+
+ urhadd v4.16b, v4.16b , v16.16b //row2
+ urhadd v5.16b, v5.16b , v17.16b //row4
+
+
+ urhadd v6.16b, v6.16b , v20.16b //row6
+ urhadd v7.16b, v7.16b , v21.16b //row8
+
+ st1 {v0.8b}, [x0], x3 //store row1
+
+ st1 {v2.8b}, [x12], x3 //store row5
+
+ st1 {v4.8b}, [x0], x3 //store row2
+
+ st1 {v6.8b}, [x12], x3 //store row6
+
+ st1 {v1.8b}, [x0], x3 //store row3
+
+ st1 {v3.8b}, [x12], x3 //store row7
+
+ st1 {v5.8b}, [x0], x3 //store row4
+
+ st1 {v7.8b}, [x12], x3 //store row8
+
+
+
+ // LDMFD sp!,{x12,pc}
+ pop_v_regs
+ ret
+
+
+
+
+
+
+
+///*
+////---------------------------------------------------------------------------
+//// Function Name : impeg2_mc_halfx_halfy_8x8_av8()
+////
+//// Detail Description : Writes an 8x8 block in which every output pixel is
+//// (a + b + c + d + 2) >> 2 over the 2x2 reference
+//// neighbourhood starting at that pixel, i.e. half-pel
+//// in both x and y. Nine reference rows are read.
+////
+//// Inputs : x0 - out : Current Block Pointer
+//// x1 - ref : Reference Block Pointer
+//// x2 - ref_wid : Reference row stride in bytes
+//// x3 - out_wid : Current Block row stride in bytes
+////
+//// Registers Used : x14, v0-v18, v20, v22, v24, v26, v28, v30
+
+////
+//// Stack Usage : 64 bytes (d8-d15 saved by push_v_regs)
+////
+//// Outputs : The Motion Compensated Block
+////
+//// Return Data : None
+////
+//// Programming Note : Each row load fetches 16 bytes; pixels 0-8 are used
+////-----------------------------------------------------------------------------
+//*/
+
+
+.global impeg2_mc_halfx_halfy_8x8_av8
+
+impeg2_mc_halfx_halfy_8x8_av8:
+
+ // STMFD sp!,{x12,x14}
+ push_v_regs
+
+ add x14, x1, x2, lsl #2 //x14 -> reference row5 (x1 + 4 * stride)
+
+ ld1 {v0.8b, v1.8b}, [x1], x2 //load 16 pixels of row1 (v0 = pixels 0-7, v1 = pixels 8-15)
+
+ ld1 {v2.8b, v3.8b}, [x14], x2 //load 16 pixels of row5
+
+ ld1 {v4.8b, v5.8b}, [x1], x2 //load 16 pixels of row2
+
+ ld1 {v6.8b, v7.8b}, [x14], x2 //load 16 pixels of row6
+
+ ext v1.8b, v0.8b , v1.8b , #1 //row1 pixels 1-8 (right-hand neighbours)
+
+
+
+ ext v3.8b, v2.8b , v3.8b , #1 //row5 pixels 1-8
+
+
+
+ ext v5.8b, v4.8b , v5.8b , #1 //row2 pixels 1-8
+
+ ext v7.8b, v6.8b , v7.8b , #1 //row6 pixels 1-8
+
+
+
+
+ ld1 {v8.8b, v9.8b}, [x1], x2 //load row3
+
+
+
+ ld1 {v10.8b, v11.8b}, [x14], x2 //load row7
+
+ ld1 {v12.8b, v13.8b}, [x1], x2 //load row4
+
+ ld1 {v14.8b, v15.8b}, [x14], x2 //load row8
+
+ ext v9.8b, v8.8b , v9.8b , #1 //row3 pixels 1-8
+
+ ld1 {v16.8b, v17.8b}, [x14], x2 //load row9
+
+
+
+
+
+ ext v11.8b, v10.8b , v11.8b , #1 //row7 pixels 1-8
+
+
+
+ ext v13.8b, v12.8b , v13.8b , #1 //row4 pixels 1-8
+
+
+
+ ext v15.8b, v14.8b , v15.8b , #1 //row8 pixels 1-8
+
+ ext v17.8b, v16.8b , v17.8b , #1 //row9 pixels 1-8
+
+
+ //interpolation in x direction: widen to 16 bit and add each pixel to its right neighbour
+
+ uaddl v0.8h, v0.8b, v1.8b //horizontal sums of row1
+
+ uaddl v2.8h, v2.8b, v3.8b //horizontal sums of row5
+
+ uaddl v4.8h, v4.8b, v5.8b //horizontal sums of row2
+
+ uaddl v6.8h, v6.8b, v7.8b //horizontal sums of row6
+
+ uaddl v8.8h, v8.8b, v9.8b //horizontal sums of row3
+
+ uaddl v10.8h, v10.8b, v11.8b //horizontal sums of row7
+
+ uaddl v12.8h, v12.8b, v13.8b //horizontal sums of row4
+
+ uaddl v14.8h, v14.8b, v15.8b //horizontal sums of row8
+
+ uaddl v16.8h, v16.8b, v17.8b //horizontal sums of row9
+
+ //interpolation in y direction: add the sums of adjacent rows, then rshrn #2 = (sum + 2) >> 2
+
+ add x14, x0, x3, lsl #2 //x14 -> output row5 (x0 + 4 * stride)
+
+
+
+ add v18.8h, v0.8h , v4.8h //output row1 = row1 + row2 sums
+
+ add v26.8h, v2.8h , v6.8h //output row5 = row5 + row6 sums
+
+ add v20.8h, v4.8h , v8.8h //output row2 = row2 + row3 sums
+
+ add v28.8h, v6.8h , v10.8h //output row6 = row6 + row7 sums
+
+ rshrn v18.8b, v18.8h, #2 //round and narrow output row1
+
+ rshrn v26.8b, v26.8h, #2 //round and narrow output row5
+
+ rshrn v20.8b, v20.8h, #2 //round and narrow output row2
+
+ rshrn v28.8b, v28.8h, #2 //round and narrow output row6
+
+ add v22.8h, v8.8h , v12.8h //output row3 = row3 + row4 sums
+
+ st1 {v18.8b}, [x0], x3 //store row1
+
+ add v30.8h, v10.8h , v14.8h //output row7 = row7 + row8 sums
+
+ st1 {v26.8b}, [x14], x3 //store row5
+
+ add v24.8h, v12.8h , v2.8h //output row4 = row4 + row5 sums
+
+ st1 {v20.8b}, [x0], x3 //store row2
+
+ add v14.8h, v14.8h , v16.8h //output row8 = row8 + row9 sums
+
+ st1 {v28.8b}, [x14], x3 //store row6
+
+
+
+ rshrn v22.8b, v22.8h, #2 //round and narrow output row3
+
+ rshrn v30.8b, v30.8h, #2 //round and narrow output row7
+
+ rshrn v24.8b, v24.8h, #2 //round and narrow output row4
+
+ rshrn v14.8b, v14.8h, #2 //round and narrow output row8
+
+
+ st1 {v22.8b}, [x0], x3 //store row3
+ st1 {v30.8b}, [x14], x3 //store row7
+ st1 {v24.8b}, [x0], x3 //store row4
+ st1 {v14.8b}, [x14], x3 //store row8
+
+
+
+ // LDMFD sp!,{x12,pc}
+ pop_v_regs
+ ret
+
+
+
+
+///*
+////---------------------------------------------------------------------------
+//// Function Name : impeg2_mc_fullx_fully_8x8_av8()
+////
+//// Detail Description : Copies an 8x8 block of bytes from the reference
+//// block to the current block without any filtering,
+//// i.e. full-pel in both x and y. Eight rows of
+//// 8 bytes are read and written.
+////
+//// Inputs : x0 - out : Current Block Pointer
+//// x1 - ref : Reference Block Pointer
+//// x2 - ref_wid : Reference row stride in bytes
+//// x3 - out_wid : Current Block row stride in bytes
+////
+//// Registers Used : x12, x14, v0-v3
+
+////
+//// Stack Usage : 64 bytes (d8-d15 saved by push_v_regs)
+////
+//// Outputs : The Motion Compensated Block
+////
+//// Return Data : None
+////
+//// Programming Note : NOTE(review): only v0-v3 are written, so the d8-d15 save/restore looks unnecessary - confirm
+////-----------------------------------------------------------------------------
+//*/
+
+
+.global impeg2_mc_fullx_fully_8x8_av8
+impeg2_mc_fullx_fully_8x8_av8:
+
+
+ // STMFD sp!,{x12,x14}
+ push_v_regs
+
+ add x14, x1, x2, lsl #2 //x14 -> reference row5 (x1 + 4 * stride)
+
+ add x12, x0, x3, lsl #2 //x12 -> output row5 (x0 + 4 * stride)
+
+
+ ld1 {v0.8b}, [x1], x2 //load row1
+
+ ld1 {v1.8b}, [x14], x2 //load row5
+
+ ld1 {v2.8b}, [x1], x2 //load row2
+
+ ld1 {v3.8b}, [x14], x2 //load row6
+
+
+ st1 {v0.8b}, [x0], x3 //store row1
+
+ st1 {v1.8b}, [x12], x3 //store row5
+
+ st1 {v2.8b}, [x0], x3 //store row2
+
+ st1 {v3.8b}, [x12], x3 //store row6
+
+
+ ld1 {v0.8b}, [x1], x2 //load row3
+
+ ld1 {v1.8b}, [x14], x2 //load row7
+
+ ld1 {v2.8b}, [x1], x2 //load row4
+
+ ld1 {v3.8b}, [x14], x2 //load row8
+
+
+ st1 {v0.8b}, [x0], x3 //store row3
+
+ st1 {v1.8b}, [x12], x3 //store row7
+
+ st1 {v2.8b}, [x0], x3 //store row4
+
+ st1 {v3.8b}, [x12], x3 //store row8
+
+
+ // LDMFD sp!,{x12,pc}
+ pop_v_regs
+ ret
+
+
+
+
+///*
+////---------------------------------------------------------------------------
+//// Function Name : impeg2_interpolate_av8()
+////
+//// Detail Description : Averages (urhadd) src1 and src2 into dst: 16 rows of 16 bytes (y), then 8 rows of 8 bytes each for u and v
+////
+//// Inputs : x0 - src1: pointer to a table of plane pointers (y at offset 0, u at 8, v at 16)
+//// x1 - src2: same table layout; source rows are read back to back (16 bytes y, 8 bytes u/v)
+//// x2 - dest buf: same table layout
+//// x3 - dst stride in bytes for y; u and v use x3 >> 1
+//// Registers Used : x4, x5, x7, x12, v0-v15
+////
+//// Stack Usage : 64 bytes (d8-d15 saved by push_v_regs)
+////
+//// Outputs : The Motion Compensated Block
+////
+//// Return Data : None
+////
+//// Programming Note : x3 is halved in place before the chroma loop
+////-----------------------------------------------------------------------------
+//*/
+
+
+.global impeg2_interpolate_av8
+
+
+impeg2_interpolate_av8:
+
+//STMFD x13!,{x4-x7,x12,x14}
+ push_v_regs
+
+ ldr x4, [x0, #0] //ptr_y src1
+
+ ldr x5, [x1, #0] //ptr_y src2
+
+ ldr x7, [x2, #0] //ptr_y dst buf
+
+ mov x12, #4 //loop count: 4 passes of 4 rows = 16 luma rows
+
+
+interp_lumablocks_stride:
+ ld1 {v0.16b}, [x4], #16 //row1 src1
+
+ ld1 {v2.16b}, [x4], #16 //row2 src1
+
+ ld1 {v4.16b}, [x4], #16 //row3 src1
+
+ ld1 {v6.16b}, [x4], #16 //row4 src1
+
+
+ ld1 {v8.16b}, [x5], #16 //row1 src2
+
+ ld1 {v10.16b}, [x5], #16 //row2 src2
+
+ ld1 {v12.16b}, [x5], #16 //row3 src2
+
+ ld1 {v14.16b}, [x5], #16 //row4 src2
+
+ urhadd v0.16b, v0.16b , v8.16b //row1 = (src1 + src2 + 1) >> 1
+
+ urhadd v2.16b, v2.16b , v10.16b //row2 = (src1 + src2 + 1) >> 1
+
+ urhadd v4.16b, v4.16b , v12.16b //row3 = (src1 + src2 + 1) >> 1
+
+ urhadd v6.16b, v6.16b , v14.16b //row4 = (src1 + src2 + 1) >> 1
+ st1 {v0.16b}, [x7], x3 //store row1, advance dst by the stride
+
+ st1 {v2.16b}, [x7], x3 //store row2
+
+ st1 {v4.16b}, [x7], x3 //store row3
+
+ st1 {v6.16b}, [x7], x3 //store row4
+
+ subs x12, x12, #1
+
+ bne interp_lumablocks_stride
+
+
+ lsr x3, x3, #1 //chroma dst stride = luma stride >> 1
+
+ ldr x4, [x0, #8] //ptr_u src1
+
+ ldr x5, [x1, #8] //ptr_u src2
+
+ ldr x7 , [x2, #8] //ptr_u dst buf
+
+ mov x12, #2 //loop count: first pass handles u, second pass handles v
+
+
+
+//chroma blocks: each pass averages one full 8x8 plane
+
+interp_chromablocks_stride:
+ ld1 {v0.8b, v1.8b}, [x4], #16 //row1 & 2 src1
+
+ ld1 {v2.8b, v3.8b}, [x4], #16 //row3 & 4 src1
+
+ ld1 {v4.8b, v5.8b}, [x4], #16 //row5 & 6 src1
+
+ ld1 {v6.8b, v7.8b}, [x4], #16 //row7 & 8 src1
+
+
+ ld1 {v8.8b, v9.8b}, [x5], #16 //row1 & 2 src2
+
+ ld1 {v10.8b, v11.8b}, [x5], #16 //row3 & 4 src2
+
+ ld1 {v12.8b, v13.8b}, [x5], #16 //row5 & 6 src2
+
+ ld1 {v14.8b, v15.8b}, [x5], #16 //row7 & 8 src2
+
+ urhadd v0.16b, v0.16b , v8.16b //row1 (only the low 8 bytes are stored)
+ urhadd v1.16b, v1.16b , v9.16b //row2
+
+ urhadd v2.16b, v2.16b , v10.16b //row3
+ urhadd v3.16b, v3.16b , v11.16b //row4
+
+ urhadd v4.16b, v4.16b , v12.16b //row5
+ urhadd v5.16b, v5.16b , v13.16b //row6
+
+ urhadd v6.16b, v6.16b , v14.16b //row7
+ urhadd v7.16b, v7.16b , v15.16b //row8
+
+ st1 {v0.8b}, [x7], x3 //row1
+
+ st1 {v1.8b}, [x7], x3 //row2
+
+ st1 {v2.8b}, [x7], x3 //row3
+
+ st1 {v3.8b}, [x7], x3 //row4
+
+ st1 {v4.8b}, [x7], x3 //row5
+
+ st1 {v5.8b}, [x7], x3 //row6
+
+ st1 {v6.8b}, [x7], x3 //row7
+
+ st1 {v7.8b}, [x7], x3 //row8
+
+
+ ldr x4, [x0, #16] //ptr_v src1 (also reloaded, unused, after the v pass)
+
+ ldr x5, [x1, #16] //ptr_v src2
+
+ ldr x7, [x2, #16] //ptr_v dst buf
+
+ subs x12, x12, #1
+
+ bne interp_chromablocks_stride
+
+
+ //LDMFD x13!,{x4-x7,x12,PC}
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/armv8/impeg2_mem_func.s b/common/armv8/impeg2_mem_func.s
new file mode 100644
index 0000000..f0bb590
--- /dev/null
+++ b/common/armv8/impeg2_mem_func.s
@@ -0,0 +1,181 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+
+///*
+////----------------------------------------------------------------------------
+//// File Name : impeg2_mem_func.s
+////
+//// Description : This file has the 8x8 block memset functions
+//// for the ARMv8 (AArch64) NEON platform
+////
+//// Reference Document :
+////
+//// Revision History :
+//// Date Author Detail Description
+//// ------------ ---------------- ----------------------------------
+//// 18 jun 2010 S Hamsalekha Created
+////
+////-------------------------------------------------------------------------
+//*/
+
+///*
+//// ----------------------------------------------------------------------------
+//// Include Files
+//// ----------------------------------------------------------------------------
+//*/
+// PRESERVE8
+.text
+.include "impeg2_neon_macros.s"
+///*
+//// ----------------------------------------------------------------------------
+//// Struct/Union Types and Define
+//// ----------------------------------------------------------------------------
+//*/
+
+
+///*
+//// ----------------------------------------------------------------------------
+//// Static Global Data section variables
+//// ----------------------------------------------------------------------------
+//*/
+//// -------------------------- NONE --------------------------------------------
+
+
+///*
+//// ----------------------------------------------------------------------------
+//// Static Prototype Functions
+//// ----------------------------------------------------------------------------
+//*/
+//// -------------------------- NONE --------------------------------------------
+
+///*
+//// ----------------------------------------------------------------------------
+//// Exported functions
+//// ----------------------------------------------------------------------------
+//*/
+
+///*
+////---------------------------------------------------------------------------
+//// Function Name : impeg2_memset_8bit_8x8_block_av8()
+////
+//// Detail Description : Fills an 8x8 block of bytes with a single value.
+//// Eight rows of 8 bytes are written (64 bytes in all)
+//// and the destination pointer is advanced by the
+//// stride in x2 after each row.
+////
+//// Inputs : x0 : Block Pointer (destination)
+//// w1 : Value with which the block is initialized (low byte is used)
+//// x2 : Destination row stride in bytes
+//// Registers Used : v0
+////
+//// Stack Usage : 64 bytes (d8-d15 saved by push_v_regs)
+////
+//// Outputs : Block Matrix Initialized to given value
+////
+//// Return Data : None
+////
+//// Programming Note : NOTE(review): only v0 is written, so the d8-d15
+//// save/restore looks unnecessary here - confirm
+////-----------------------------------------------------------------------------
+//*/
+.global impeg2_memset_8bit_8x8_block_av8
+impeg2_memset_8bit_8x8_block_av8:
+ push_v_regs
+
+// ADD x3,x0,#WIDTH_X_SIZE @//x3 is another copy address offsetted
+
+ dup v0.8b, w1 ////replicate the low byte of w1 into all 8 byte lanes of v0
+
+ st1 {v0.8b}, [x0], x2 ////Store the row 1
+ st1 {v0.8b}, [x0], x2 ////Store the row 2
+ st1 {v0.8b}, [x0], x2 ////Store the row 3
+ st1 {v0.8b}, [x0], x2 ////Store the row 4
+ st1 {v0.8b}, [x0], x2 ////Store the row 5
+ st1 {v0.8b}, [x0], x2 ////Store the row 6
+ st1 {v0.8b}, [x0], x2 ////Store the row 7
+ st1 {v0.8b}, [x0], x2 ////Store the row 8
+
+ pop_v_regs
+ ret
+
+
+
+
+
+
+///*
+////---------------------------------------------------------------------------
+//// Function Name : impeg2_memset0_16bit_8x8_linear_block_av8()
+////
+//// Detail Description : Zeroes a contiguous 8x8 block of 16-bit values (8 stores of 16 bytes = 128 bytes)
+////
+//// Inputs : x0 - pointer to the buffer to clear (advanced by 128 on exit)
+//// x1 - not read by this routine
+//// x2 - not read by this routine
+//// Registers Used : v0
+
+////
+//// Stack Usage : 64 bytes (d8-d15 saved by push_v_regs)
+////
+//// Outputs : The 128 bytes starting at x0 are set to zero
+////
+//// Return Data : None
+////
+//// Programming Note : NOTE(review): only v0 is written, so the d8-d15 save/restore looks unnecessary - confirm
+////-----------------------------------------------------------------------------
+//*/
+
+
+
+.global impeg2_memset0_16bit_8x8_linear_block_av8
+
+
+impeg2_memset0_16bit_8x8_linear_block_av8:
+
+ push_v_regs
+
+ movi v0.8h, #0 //v0 = eight 16-bit zeros
+
+ //Each store writes one row of eight 16-bit zeros and advances x0 by 16 bytes
+
+ st1 {v0.8h} , [x0], #16 //row1
+
+ st1 {v0.8h} , [x0], #16 //row2
+
+ st1 {v0.8h} , [x0], #16 //row3
+
+ st1 {v0.8h} , [x0], #16 //row4
+
+ st1 {v0.8h} , [x0], #16 //row5
+
+ st1 {v0.8h} , [x0], #16 //row6
+
+ st1 {v0.8h} , [x0], #16 //row7
+
+ st1 {v0.8h} , [x0], #16 //row8
+
+
+
+ pop_v_regs
+ ret
+
+
+
+
diff --git a/common/armv8/impeg2_neon_macros.s b/common/armv8/impeg2_neon_macros.s
new file mode 100644
index 0000000..452ba45
--- /dev/null
+++ b/common/armv8/impeg2_neon_macros.s
@@ -0,0 +1,58 @@
+//******************************************************************************
+//*
+//* Copyright (C) 2015 The Android Open Source Project
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************
+//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+//*/
+///**
+//*******************************************************************************
+//* @file
+//* impeg2_neon_macros.s
+//*
+//* @brief
+//* Contains assembly macros
+//*
+//* @author
+//* Naveen SR
+//*
+//* @par List of Macros:
+//* push_v_regs, pop_v_regs, swp
+//*
+//* @remarks
+//* None
+//*
+//*******************************************************************************
+
+
+.macro push_v_regs
+ stp d8, d9, [sp, #-16]! // Saves d8-d15 on the stack in four pairs.
+ stp d10, d11, [sp, #-16]! // Every stp first lowers sp by 16 bytes,
+ stp d12, d13, [sp, #-16]! // so the macro uses 64 bytes of stack.
+ stp d14, d15, [sp, #-16]! // pop_v_regs undoes this in reverse order.
+.endm
+.macro pop_v_regs
+ ldp d14, d15, [sp], #16 // Restores d8-d15 in the reverse order of
+ ldp d12, d13, [sp], #16 // push_v_regs. Every ldp raises sp by 16
+ ldp d10, d11, [sp], #16 // bytes after loading, so sp ends up 64
+ ldp d8, d9, [sp], #16 // bytes higher than on entry.
+.endm
+
+.macro swp reg1, reg2
+ eor \reg1, \reg1, \reg2 // reg1 = a ^ b (XOR swap, no scratch register)
+ eor \reg2, \reg1, \reg2 // reg2 = (a ^ b) ^ b = a
+ eor \reg1, \reg1, \reg2 // reg1 = (a ^ b) ^ a = b; passing the same register twice leaves it 0
+.endm
+
diff --git a/common/armv8/impeg2_platform_macros.h b/common/armv8/impeg2_platform_macros.h
new file mode 100644
index 0000000..ff31034
--- /dev/null
+++ b/common/armv8/impeg2_platform_macros.h
@@ -0,0 +1,49 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+#ifndef __IMPEG2_PLATFORM_MACROS_H__
+#define __IMPEG2_PLATFORM_MACROS_H__
+
+#define CONV_LE_TO_BE(u4_temp2,u4_temp1) u4_temp2 = \
+ (u4_temp1 << 24) | \
+ ((u4_temp1 & 0xff00) << 8) | \
+ ((u4_temp1 & 0xff0000) >> 8) | \
+ (u4_temp1 >> 24);
+
+static __inline UWORD32 CLZ(UWORD32 u4_word)
+{
+ if(u4_word)
+ return (__builtin_clz(u4_word));
+ else
+ return 32;
+}
+
+#define CLIP_U8(x) ((x) > 255) ? (255) : (((x) < 0) ? (0) : (x))
+#define CLIP_S8(x) ((x) > 127) ? (127) : (((x) < -128) ? (-128) : (x))
+
+#define CLIP_U12(x) ((x) > 4095) ? (4095) : (((x) < 0) ? (0) : (x))
+#define CLIP_S12(x) ((x) > 2047) ? (2047) : (((x) < -2048) ? (-2048) : (x))
+
+#define CLIP_U16(x) ((x) > 65535) ? (65535) : (((x) < 0) ? (0) : (x))
+#define CLIP_S16(x) ((x) > 65535) ? (65535) : (((x) < -65536) ? (-65536) : (x))
+
+#define INLINE
+#define PLD(x) __pld(x)
+
+#endif /* __IMPEG2_PLATFORM_MACROS_H__ */
diff --git a/common/impeg2_buf_mgr.c b/common/impeg2_buf_mgr.c
new file mode 100644
index 0000000..c4aca4a
--- /dev/null
+++ b/common/impeg2_buf_mgr.c
@@ -0,0 +1,411 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+*******************************************************************************
+* @file
+* impeg2_buf_mgr.c
+*
+* @brief
+* Contains function definitions for buffer management
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - impeg2_buf_mgr_init()
+* - impeg2_buf_mgr_add()
+* - impeg2_buf_mgr_get_next_free()
+* - impeg2_buf_mgr_check_free()
+* - impeg2_buf_mgr_release()
+* - impeg2_buf_mgr_set_status()
+* - impeg2_buf_mgr_get_status()
+* - impeg2_buf_mgr_get_buf()
+* - impeg2_buf_mgr_get_num_active_buf()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include "iv_datatypedef.h"
+#include "impeg2_defs.h"
+#include "impeg2_buf_mgr.h"
+
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Buffer manager initialization function.
+*
+* @par Description:
+* Sets the capacity to BUF_MGR_MAX_CNT, zeroes the active count, clears all slots
+*
+* @param[in,out] ps_buf_mgr
+* Pointer to the buffer manager to be reset
+*
+* @returns None
+*
+* @remarks
+* ps_buf_mgr is dereferenced without a NULL check
+*
+*******************************************************************************
+*/
+
+void impeg2_buf_mgr_init(
+    buf_mgr_t *ps_buf_mgr)
+{
+    WORD32 i4_slot = BUF_MGR_MAX_CNT;
+
+    /* Clear every slot, walking from the last index down to 0 */
+    while(i4_slot-- > 0)
+    {
+        ps_buf_mgr->apv_ptr[i4_slot] = NULL;
+        ps_buf_mgr->au4_status[i4_slot] = 0;
+    }
+    ps_buf_mgr->u4_active_buf_cnt = 0;
+    ps_buf_mgr->u4_max_buf_cnt = BUF_MGR_MAX_CNT;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Registers a buffer under a given buffer ID.
+*
+* @par Description:
+* Stores pv_ptr in slot i4_buf_id unless that slot already holds a different
+* buffer. The active buffer count is not modified by this function
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] pv_ptr
+* Pointer to the buffer to be added
+*
+* @param[in] i4_buf_id
+* Slot index for the buffer; must be in 0 .. u4_max_buf_cnt - 1
+*
+* @returns 0 on success, -1 if the ID is out of range or the slot is taken
+*
+*******************************************************************************
+*/
+WORD32 impeg2_buf_mgr_add(
+    buf_mgr_t *ps_buf_mgr,
+    void *pv_ptr,
+    WORD32 i4_buf_id)
+{
+    /* Reject IDs outside [0, u4_max_buf_cnt); a negative ID would otherwise */
+    /* index before the start of apv_ptr[] */
+    if((i4_buf_id < 0) || (i4_buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt))
+    {
+        return (-1);
+    }
+
+    /* Check if the current ID is being used to hold some other buffer */
+    if((ps_buf_mgr->apv_ptr[i4_buf_id] != NULL) &&
+       (ps_buf_mgr->apv_ptr[i4_buf_id] != pv_ptr))
+    {
+        return (-1);
+    }
+    ps_buf_mgr->apv_ptr[i4_buf_id] = pv_ptr;
+
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the next free buffer.
+*
+* @par Description:
+* Finds the lowest-numbered slot that is free, sets its status to 1 (the DEC
+* bit) and hands back the buffer stored there
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[out] pi4_buf_id
+* Receives the ID of the buffer found; not written when none is free
+*
+* @returns Pointer to the free buffer, or NULL when no buffer is free
+*
+* @remarks
+* A slot is free when its status is 0 and its buffer pointer is non-NULL
+*
+*******************************************************************************
+*/
+void* impeg2_buf_mgr_get_next_free(
+    buf_mgr_t *ps_buf_mgr,
+    WORD32 *pi4_buf_id)
+{
+    const WORD32 i4_num_slots = (WORD32)ps_buf_mgr->u4_max_buf_cnt;
+    WORD32 i4_slot;
+
+    for(i4_slot = 0; i4_slot < i4_num_slots; i4_slot++)
+    {
+        /* Skip slots that are in use or have no buffer registered */
+        if((ps_buf_mgr->au4_status[i4_slot] != 0) ||
+           (NULL == ps_buf_mgr->apv_ptr[i4_slot]))
+        {
+            continue;
+        }
+
+        /* First free slot: report its ID and mark it with the DEC bit */
+        *pi4_buf_id = i4_slot;
+        ps_buf_mgr->au4_status[i4_slot] = 1;
+        return ps_buf_mgr->apv_ptr[i4_slot];
+    }
+    return NULL;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Checks the buffer manager for free buffers available.
+*
+* @par Description:
+* Scans the slots for one whose status is 0 and whose pointer is non-NULL
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @returns Returns 1 if a free buffer is available, 0 otherwise
+*
+* @remarks
+* Read-only: no slot is claimed or modified
+*
+*******************************************************************************
+*/
+WORD32 impeg2_buf_mgr_check_free(
+    buf_mgr_t *ps_buf_mgr)
+{
+    WORD32 i4_found = 0;
+    UWORD32 u4_slot = 0;
+    /* Stop at the first slot that is idle (status 0) and holds a buffer */
+    while((0 == i4_found) && (u4_slot < ps_buf_mgr->u4_max_buf_cnt))
+    {
+        if((0 == ps_buf_mgr->au4_status[u4_slot]) &&
+           (NULL != ps_buf_mgr->apv_ptr[u4_slot]))
+        {
+            i4_found = 1;
+        }
+        u4_slot++;
+    }
+    return i4_found;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Resets the status bits.
+*
+* @par Description:
+* Clears the bits given in u4_mask from the status of buffer i4_buf_id. If
+* only the DEC bit (value 1) is left afterwards, the status is reset to 0
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] i4_buf_id
+* ID of the buffer status to be released; valid range 0 .. u4_max_buf_cnt - 1
+*
+* @param[in] u4_mask
+* Contains the bits that are to be reset
+*
+* @returns 0 if success, -1 if the ID is out of range or no bit of u4_mask is set
+*
+* @remarks
+* Negative IDs are rejected so that au4_status[] is never indexed out of range
+*
+*******************************************************************************
+*/
+WORD32 impeg2_buf_mgr_release(
+    buf_mgr_t *ps_buf_mgr,
+    WORD32 i4_buf_id,
+    UWORD32 u4_mask)
+{
+    /* Reject IDs outside [0, u4_max_buf_cnt) before indexing au4_status[] */
+    if((i4_buf_id < 0) || (i4_buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt))
+    {
+        return (-1);
+    }
+    /* None of the requested bits is currently set: nothing to release */
+    if(0 == (ps_buf_mgr->au4_status[i4_buf_id] & u4_mask))
+    {
+        return (-1);
+    }
+
+    ps_buf_mgr->au4_status[i4_buf_id] &= ~u4_mask;
+
+    /* Only DEC is left (REF and DISP are both zero): the buffer becomes free */
+    if(ps_buf_mgr->au4_status[i4_buf_id] == 1)
+    {
+        ps_buf_mgr->au4_status[i4_buf_id] = 0;
+    }
+
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Sets the status bit.
+*
+* @par Description:
+* ORs the bits given in u4_mask into the status of buffer i4_buf_id. Fails
+* without changing anything if any of those bits is already set
+*
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] i4_buf_id
+* ID of the buffer whose status needs to be modified (0 .. u4_max_buf_cnt - 1)
+*
+*
+* @param[in] u4_mask
+* Contains the bits that are to be set
+*
+* @returns 0 if success, -1 if the ID is out of range or a bit is already set
+*
+* @remarks
+* Negative IDs are rejected so that au4_status[] is never indexed out of range
+*
+*******************************************************************************
+*/
+WORD32 impeg2_buf_mgr_set_status(
+    buf_mgr_t *ps_buf_mgr,
+    WORD32 i4_buf_id,
+    UWORD32 u4_mask)
+{
+    /* Reject IDs outside [0, u4_max_buf_cnt) before indexing au4_status[] */
+    if((i4_buf_id < 0) || (i4_buf_id >= (WORD32)ps_buf_mgr->u4_max_buf_cnt))
+    {
+        return (-1);
+    }
+    /* Fail if any of the requested bits is already set */
+    if((ps_buf_mgr->au4_status[i4_buf_id] & u4_mask) != 0)
+    {
+        return (-1);
+    }
+
+    ps_buf_mgr->au4_status[i4_buf_id] |= u4_mask;
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Returns the status of the buffer.
+*
+* @par Description:
+* Returns the status word stored for slot i4_buf_id (bit 0 DEC, bit 1 REF, bit 2 DISP)
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] i4_buf_id
+* ID of the buffer status required
+*
+* @returns Status of the buffer corresponding to the id
+*
+* @remarks
+* i4_buf_id is not range checked here; the caller must pass a valid ID
+*
+*******************************************************************************
+*/
+UWORD32 impeg2_buf_mgr_get_status(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 i4_buf_id)
+{
+ return ps_buf_mgr->au4_status[i4_buf_id];
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the buffer from the buffer manager
+*
+* @par Description:
+* Returns the pointer stored in slot i4_buf_id; the status is not changed
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @param[in] i4_buf_id
+* ID of the buffer required
+*
+* @returns Pointer stored for the ID (NULL if no buffer was added to that slot)
+*
+* @remarks
+* i4_buf_id is not range checked here; the caller must pass a valid ID
+*
+*******************************************************************************
+*/
+void* impeg2_buf_mgr_get_buf(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 i4_buf_id)
+{
+ return ps_buf_mgr->apv_ptr[i4_buf_id];
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Gets the no.of active buffer
+*
+* @par Description:
+* Returns u4_max_buf_cnt, i.e. the slot capacity set by impeg2_buf_mgr_init()
+*
+* @param[in] ps_buf_mgr
+* Pointer to the buffer manager
+*
+* @returns value of u4_max_buf_cnt
+*
+* @remarks
+* NOTE(review): u4_active_buf_cnt is not consulted despite the function name
+*
+*******************************************************************************
+*/
+UWORD32 impeg2_buf_mgr_get_num_active_buf(
+ buf_mgr_t *ps_buf_mgr)
+{
+ return ps_buf_mgr->u4_max_buf_cnt;
+}
diff --git a/common/impeg2_buf_mgr.h b/common/impeg2_buf_mgr.h
new file mode 100644
index 0000000..6b1cbef
--- /dev/null
+++ b/common/impeg2_buf_mgr.h
@@ -0,0 +1,115 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* impeg2_buf_mgr.h
+*
+* @brief
+* Function declarations used for buffer management
+*
+* @author
+* Srinivas T
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IMPEG2_BUF_MGR_H_
+#define _IMPEG2_BUF_MGR_H_
+
+#define BUF_MGR_MAX_CNT 64 /* Number of entries in the au4_status[] and apv_ptr[] tables */
+
+#define BUF_MGR_DEC 1 /* Status bit 0 */
+#define BUF_MGR_REF (1 << 1) /* Status bit 1 */
+#define BUF_MGR_DISP (1 << 2) /* Status bit 2 */
+
+typedef struct
+{
+ /**
+ * max_buf_cnt
+ * Limit on the number of buffers; per the comment on impeg2_buf_mgr_add(),
+ * adding fails once u4_active_buf_cnt has reached this value
+ */
+ UWORD32 u4_max_buf_cnt;
+
+ /**
+ * active_buf_cnt
+ * Count of buffers compared against u4_max_buf_cnt when adding
+ */
+ UWORD32 u4_active_buf_cnt;
+ /**
+ * au4_status[BUF_MGR_MAX_CNT]
+ * One status word per buffer, indexed by buffer id
+ */
+ UWORD32 au4_status[BUF_MGR_MAX_CNT];
+ /* The last three bits of status are: */
+ /* Bit 0 - DEC (BUF_MGR_DEC) */
+ /* Bit 1 - REF (BUF_MGR_REF) */
+ /* Bit 2 - DISP (BUF_MGR_DISP) */
+
+ void *apv_ptr[BUF_MGR_MAX_CNT]; /* One buffer pointer per buffer id */
+}buf_mgr_t;
+
+// Initializes the buffer manager structure
+void impeg2_buf_mgr_init(
+ buf_mgr_t *ps_buf_mgr);
+
+// Add buffer to buffer manager. 0: success, -1: fail (u4_active_buf_cnt has reached u4_max_buf_cnt)
+WORD32 impeg2_buf_mgr_add(
+ buf_mgr_t *ps_buf_mgr,
+ void *pv_ptr,
+ WORD32 buf_id);
+
+// Gets the next free buffer; this function will set the buffer status to DEC
+void* impeg2_buf_mgr_get_next_free(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 *pi4_id);
+
+// This function will check if there are any free buffers
+WORD32 impeg2_buf_mgr_check_free(
+ buf_mgr_t *ps_buf_mgr);
+
+// Releases a buffer; mask tells who released it: DISP:REF:DEC (BUF_MGR_* bits)
+WORD32 impeg2_buf_mgr_release(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 id,
+ UWORD32 mask);
+
+// Sets (ORs in) the status bits given in mask: one or all of DISP:REF:DEC
+WORD32 impeg2_buf_mgr_set_status(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 id,
+ UWORD32 mask);
+
+// Gets the status word of the buffer with the given id
+UWORD32 impeg2_buf_mgr_get_status(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 id);
+
+// Pass the ID - the buffer pointer stored for that ID will be returned
+void* impeg2_buf_mgr_get_buf(
+ buf_mgr_t *ps_buf_mgr,
+ WORD32 id);
+
+// Will return number of active buffers
+UWORD32 impeg2_buf_mgr_get_num_active_buf(
+ buf_mgr_t *ps_buf_mgr);
+
+
+
+#endif //_IMPEG2_BUF_MGR_H_
diff --git a/common/impeg2_defs.h b/common/impeg2_defs.h
new file mode 100644
index 0000000..f1523f2
--- /dev/null
+++ b/common/impeg2_defs.h
@@ -0,0 +1,331 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+#ifndef __IMPEG2_DEFS_H__
+#define __IMPEG2_DEFS_H__
+
+#include <assert.h>
+
+/* Decoder needs at least 4 reference buffers in order to support format conversion in a thread and
+to support B pictures. Because of format conversion in a thread, codec delay is now 2 frames instead of 1.
+To reduce this delay, format conversion has to wait for MB status before converting for B pictures.
+To avoid this check the delay is increased to 2 and hence number of reference frames minimum is 4 */
+#define NUM_INT_FRAME_BUFFERS       4
+
+
+/* Upper bounds on picture dimensions, in pixels */
+#define MAX_WIDTH                   4096
+#define MAX_HEIGHT                  2160
+
+/* Lower bounds on picture dimensions, in pixels */
+#define MIN_WIDTH                   16
+#define MIN_HEIGHT                  16
+
+
+/* Two bytes per pixel at the maximum dimensions */
+#define MAX_FRM_SIZE                (MAX_WIDTH * MAX_HEIGHT * 2)  /* Supports only 420P and 422ILE */
+
+#define DEC_ORDER                   0
+
+/* Parenthesized so the macro expands as a single value inside larger */
+/* expressions (e.g. x / MAX_BITSTREAM_BUFFER_SIZE)                   */
+#define MAX_BITSTREAM_BUFFER_SIZE   (2000 * 1024)
+
+
+/******************************************************************************
+* MPEG2 Start code and other code definitions
+* Each 32-bit start code below is START_CODE_PREFIX (0x000001) followed by an
+* 8-bit code value
+*******************************************************************************/
+#define START_CODE_PREFIX 0x000001
+#define SEQUENCE_HEADER_CODE 0x000001B3
+#define EXTENSION_START_CODE 0x000001B5
+#define USER_DATA_START_CODE 0x000001B2
+#define GOP_START_CODE 0x000001B8
+#define PICTURE_START_CODE 0x00000100
+#define SEQUENCE_END_CODE 0x000001B7
+#define RESERVED_START_CODE 0x000001B0
+#define MB_ESCAPE_CODE 0x008 /* MB_ESCAPE_CODE_LEN bits wide */
+
+/******************************************************************************
+* MPEG2 Length of various codes definitions
+* All lengths are in bits
+*******************************************************************************/
+#define START_CODE_LEN 32
+#define START_CODE_PREFIX_LEN 24
+#define MB_ESCAPE_CODE_LEN 11
+#define EXT_ID_LEN 4
+#define MB_QUANT_SCALE_CODE_LEN 5
+#define MB_DCT_TYPE_LEN 1
+#define MB_MOTION_TYPE_LEN 2
+#define BYTE_LEN 8
+
+/******************************************************************************
+* MPEG1 code definitions
+*******************************************************************************/
+#define MB_STUFFING_CODE 0x00F /* MB_STUFFING_CODE_LEN bits wide */
+
+/******************************************************************************
+* MPEG1 Length of various codes definitions
+* All lengths are in bits
+*******************************************************************************/
+#define MB_STUFFING_CODE_LEN 11
+
+/******************************************************************************
+* MPEG2 MB definitions
+* NOTE(review): bit-flag style values; the meaning of each bit is defined by
+* the code that uses them, not visible here
+*******************************************************************************/
+#define MPEG2_INTRA_MB 0x04
+#define MPEG2_INTRAQ_MB 0x44
+#define MPEG2_INTER_MB 0x28
+#define MB_MOTION_BIDIRECT 0x30
+#define MB_INTRA_OR_PATTERN 0x0C
+
+/******************************************************************************
+* Tools definitions
+* NOTE(review): presumably scalable_mode values of the sequence scalable
+* extension - confirm against the parser
+*******************************************************************************/
+#define SPATIAL_SCALABILITY 0x01
+#define TEMPORAL_SCALABILITY 0x03
+
+/******************************************************************************
+* Extension IDs definitions
+* Identifier values are EXT_ID_LEN (4) bits wide
+*******************************************************************************/
+#define SEQ_DISPLAY_EXT_ID 0x02
+#define SEQ_SCALABLE_EXT_ID 0x05
+#define QUANT_MATRIX_EXT_ID 0x03
+#define COPYRIGHT_EXT_ID 0x04
+#define PIC_DISPLAY_EXT_ID 0x07
+#define PIC_SPATIAL_SCALABLE_EXT_ID 0x09
+#define PIC_TEMPORAL_SCALABLE_EXT_ID 0x0A
+#define CAMERA_PARAM_EXT_ID 0x0B
+#define ITU_T_EXT_ID 0x0C
+/******************************************************************************
+* Extension IDs Length definitions
+* NOTE(review): presumably lengths in bits, like the code lengths above -
+* confirm against the parser
+*******************************************************************************/
+#define CAMERA_PARAMETER_EXTENSION_LEN 377
+#define COPYRIGHT_EXTENSION_LEN 88
+#define GROUP_OF_PICTURE_LEN 59
+
+
+/******************************************************************************
+* MPEG2 Picture structure definitions
+*******************************************************************************/
+#define TOP_FIELD               1
+#define BOTTOM_FIELD            2
+#define FRAME_PICTURE           3
+
+/******************************************************************************
+* MPEG2 Profile definitions
+*******************************************************************************/
+#define MPEG2_SIMPLE_PROFILE    0x05
+#define MPEG2_MAIN_PROFILE      0x04
+
+/******************************************************************************
+* MPEG2 Level definitions
+*******************************************************************************/
+#define MPEG2_LOW_LEVEL         0x0a
+#define MPEG2_MAIN_LEVEL        0x08
+
+/******************************************************************************
+* MPEG2 Prediction types
+* RESERVED is parenthesized so the negative constant always expands as a
+* single operand, whatever operators surround the macro
+*******************************************************************************/
+#define FIELD_PRED              0
+#define FRAME_PRED              1
+#define DUAL_PRED               2
+#define RESERVED                (-1)
+#define MC_16X8_PRED            3
+
+/*****************************************************************************
+* MPEG2 Motion vector format
+******************************************************************************/
+#define FIELD_MV                0
+#define FRAME_MV                1
+
+/******************************************************************************/
+/* General Video related definitions */
+/******************************************************************************/
+
+#define BLK_SIZE 8 /* Side of one block, in pixels */
+#define NUM_COEFFS ((BLK_SIZE)*(BLK_SIZE)) /* Coefficients in one BLK_SIZE x BLK_SIZE block */
+#define LUMA_BLK_SIZE (2 * (BLK_SIZE))
+#define CHROMA_BLK_SIZE (BLK_SIZE)
+#define BLOCKS_IN_MB 6 /* NUM_LUMA_BLKS + NUM_CHROMA_BLKS */
+#define MB_SIZE 16 /* Side of one luma macroblock, in pixels */
+#define MB_CHROMA_SIZE 8
+#define NUM_PELS_IN_BLOCK 64
+#define NUM_LUMA_BLKS 4
+#define NUM_CHROMA_BLKS 2
+#define MAX_COLR_COMPS 3 /* Y, U and V */
+#define Y_LUMA 0
+#define U_CHROMA 1
+#define V_CHROMA 2
+#define MB_LUMA_MEM_SIZE ((MB_SIZE) * (MB_SIZE)) /* Samples in one luma macroblock */
+#define MB_CHROMA_MEM_SIZE ((MB_SIZE/2) * (MB_SIZE/2)) /* Samples in one (MB_SIZE/2)-square chroma block */
+
+#define BITS_IN_INT 32
+/******************************************************************************/
+/* MPEG2 Motion compensation related definitions */
+/* NOTE(review): the 18/10 sizes look like a 16-wide (or 8-high field) MB */
+/* plus 2 extra samples for interpolation - confirm against the MC code */
+/******************************************************************************/
+#define REF_FRM_MB_WIDTH 18
+#define REF_FRM_MB_HEIGHT 18
+#define REF_FLD_MB_HEIGHT 10
+#define REF_FLD_MB_WIDTH 18
+
+/******************************************************************************/
+/* Maximum number of bits per MB */
+/******************************************************************************/
+#define I_MB_BIT_SIZE 90
+#define P_MB_BIT_SIZE 90
+#define B_MB_BIT_SIZE 150
+
+/******************************************************************************/
+/* Aspect ratio related definitions                                           */
+/******************************************************************************/
+#define MPG1_NTSC_4_3           0x8
+#define MPG1_PAL_4_3            0xc
+#define MPG1_NTSC_16_9          0x6
+#define MPG1_PAL_16_9           0x3
+#define MPG1_1_1                0x1
+
+#define MPG2_4_3                0x2
+#define MPG2_16_9               0x3
+#define MPG2_1_1                0x1
+
+/******************************************************************************/
+/* Inverse Quantizer Output range                                             */
+/* Negative limits are parenthesized so they always expand as one operand     */
+/******************************************************************************/
+#define IQ_OUTPUT_MAX           2047
+#define IQ_OUTPUT_MIN           (-2048)
+
+/******************************************************************************/
+/* IDCT Output range                                                          */
+/******************************************************************************/
+#define IDCT_OUTPUT_MAX         255
+#define IDCT_OUTPUT_MIN         (-256)
+
+/******************************************************************************/
+/* Output pixel range                                                         */
+/******************************************************************************/
+#define PEL_VALUE_MAX           255
+#define PEL_VALUE_MIN           0
+
+/******************************************************************************/
+/* inv scan types */
+/******************************************************************************/
+#define ZIG_ZAG_SCAN 0
+#define VERTICAL_SCAN 1
+
+/******************************************************************************/
+/* Related VLD codes */
+/******************************************************************************/
+#define ESC_CODE_VALUE 0x0058
+#define EOB_CODE_VALUE 0x07d0
+
+#define END_OF_BLOCK 0x01
+#define ESCAPE_CODE 0x06
+
+#define END_OF_BLOCK_ZERO 0x01ff
+#define END_OF_BLOCK_ONE 0x01ff /* NOTE(review): same value as END_OF_BLOCK_ZERO - confirm this is intended */
+
+/******************** Idct Specific ***************/
+#define TRANS_SIZE_8 8
+#define IDCT_STG1_SHIFT 12
+#define IDCT_STG2_SHIFT 16
+
+#define IDCT_STG1_ROUND ((1 << IDCT_STG1_SHIFT) >> 1) /* Half of 1 << IDCT_STG1_SHIFT */
+#define IDCT_STG2_ROUND ((1 << IDCT_STG2_SHIFT) >> 1) /* Half of 1 << IDCT_STG2_SHIFT */
+
+
+/******************************************************************************
+* Sample Version Definitions
+*******************************************************************************/
+#define SAMPLE_VERS_MAX_FRAMES_DECODE 999
+
+#define MAX_FRAME_BUFFER 7
+
+/* Picture coding type; numbering starts at 1 (I_PIC = 1 ... D_PIC = 4) */
+typedef enum
+{
+ I_PIC = 1,
+ P_PIC,
+ B_PIC,
+ D_PIC
+} e_pic_type_t;
+
+typedef enum /* Video standard of the stream */
+{
+ MPEG_2_VIDEO,
+ MPEG_1_VIDEO
+} e_video_type_t;
+
+typedef enum /* Prediction direction */
+{
+ FORW,
+ BACK,
+ BIDIRECT
+} e_pred_direction_t;
+
+typedef enum /* Field selector */
+{
+ TOP,
+ BOTTOM
+} e_field_t;
+
+/* Motion vectors (first/second/third/fourth); values 0..3 */
+enum
+{
+ FIRST,
+ SECOND,
+ THIRD,
+ FOURTH
+};
+
+enum /* Motion vector components: MV_X = 0, MV_Y = 1 */
+{
+ MV_X,
+ MV_Y
+};
+
+/* Enumeration defining the various kinds of interpolation possible in
+motion compensation: full- or half-sample position in X and in Y */
+typedef enum
+{
+ FULL_XFULL_Y,
+ FULL_XHALF_Y,
+ HALF_XFULL_Y,
+ HALF_XHALF_Y
+} e_sample_type_t;
+typedef enum
+{
+ /* Params of the reference buffer used as input to MC */
+ /* frame prediction in P frame picture */
+ MC_FRM_FW_OR_BK_1MV,
+ /* field prediction in P frame picture */
+ MC_FRM_FW_OR_BK_2MV,
+ /* frame prediction in B frame picture */
+ MC_FRM_FW_AND_BK_2MV,
+ /* field prediction in B frame picture */
+ MC_FRM_FW_AND_BK_4MV,
+ /* dual prime prediction in P frame picture */
+ MC_FRM_FW_DUAL_PRIME_1MV,
+ /* frame prediction in P field picture */
+ MC_FLD_FW_OR_BK_1MV,
+ /* 16x8 prediction in P field picture */
+ MC_FLD_FW_OR_BK_2MV,
+ /* field prediction in B field picture */
+ MC_FLD_FW_AND_BK_2MV,
+ /* 16x8 prediction in B field picture */
+ MC_FLD_FW_AND_BK_4MV,
+ /* dual prime prediction in P field picture */
+ MC_FLD_FW_DUAL_PRIME_1MV,
+} e_mb_type_t;
+
+#endif /* __IMPEG2_DEFS_H__ */
+
diff --git a/common/impeg2_disp_mgr.c b/common/impeg2_disp_mgr.c
new file mode 100644
index 0000000..f5ede84
--- /dev/null
+++ b/common/impeg2_disp_mgr.c
@@ -0,0 +1,172 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* impeg2_disp_mgr.c
+*
+* @brief
+* Contains function definitions for display management
+*
+* @author
+* Srinivas T
+*
+* @par List of Functions:
+* - impeg2_disp_mgr_init()
+* - impeg2_disp_mgr_add()
+* - impeg2_disp_mgr_get()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#include <stdio.h>
+#include <stdlib.h>
+#include "iv_datatypedef.h"
+#include "impeg2_defs.h"
+#include "impeg2_disp_mgr.h"
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Resets the display buffer manager to an empty state
+*
+* @par Description:
+*  Sets every entry of apv_ptr[] to NULL and sets both the read index and
+*  the write index to zero. ai4_buf_id[] is left untouched.
+*
+* @param[in] ps_disp_mgr
+*  Pointer to the display buffer management structure
+*
+* @returns none
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void impeg2_disp_mgr_init(
+                disp_mgr_t *ps_disp_mgr)
+{
+    WORD32 i4_slot = DISP_MGR_MAX_CNT;
+
+    /* Empty queue: read and write positions coincide */
+    ps_disp_mgr->i4_rd_idx = 0;
+    ps_disp_mgr->i4_wr_idx = 0;
+
+    /* Clear all slot pointers */
+    while(i4_slot-- > 0)
+    {
+        ps_disp_mgr->apv_ptr[i4_slot] = NULL;
+    }
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Adds a buffer to the display manager
+*
+* @par Description:
+*  Stores the buffer pointer and its id in the slot selected by the write
+*  index (modulo DISP_MGR_MAX_CNT) and advances the write index. The request
+*  is rejected when pv_ptr is NULL or when all DISP_MGR_MAX_CNT slots hold
+*  entries that have not been read yet.
+*
+* @param[in] ps_disp_mgr
+*  Pointer to the display buffer management structure
+*
+* @param[in] pv_ptr
+*  Pointer to the display buffer
+*
+* @param[in] i4_buf_id
+*  ID of the display buffer
+*
+* @returns 0 if success, -1 otherwise
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+WORD32 impeg2_disp_mgr_add(disp_mgr_t *ps_disp_mgr,
+                           void *pv_ptr,
+                           WORD32 i4_buf_id)
+{
+    WORD32 id;
+
+    /* impeg2_disp_mgr_get() returns NULL without advancing the read index  */
+    /* when it finds a NULL slot, so a NULL entry would block the queue     */
+    if(NULL == pv_ptr)
+    {
+        return -1;
+    }
+
+    /* Queue full: storing now would overwrite an entry not yet read */
+    if((ps_disp_mgr->i4_wr_idx - ps_disp_mgr->i4_rd_idx) >= DISP_MGR_MAX_CNT)
+    {
+        return -1;
+    }
+
+    id = ps_disp_mgr->i4_wr_idx % DISP_MGR_MAX_CNT;
+
+    ps_disp_mgr->apv_ptr[id] = pv_ptr;
+    ps_disp_mgr->ai4_buf_id[id] = i4_buf_id;
+    ps_disp_mgr->i4_wr_idx++;
+
+    return 0;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+*  Fetches the next buffer queued for display
+*
+* @par Description:
+*  If the read index is behind the write index and the slot at the read
+*  index holds a non-NULL pointer, returns that pointer, reports its id
+*  through pi4_buf_id and advances the read index. Otherwise nothing is
+*  consumed.
+*
+* @param[in] ps_disp_mgr
+*  Pointer to the display buffer structure
+*
+* @param[out] pi4_buf_id
+*  Pointer to hold buffer id of the display buffer being returned;
+*  set to -1 when no buffer is returned
+*
+* @returns Pointer to the next display buffer, or NULL if none is available
+*
+* @remarks
+*  None
+*
+*******************************************************************************
+*/
+void* impeg2_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id)
+{
+    WORD32 i4_slot;
+    void *pv_disp_buf;
+
+    /* Default answer: no buffer */
+    *pi4_buf_id = -1;
+
+    /* Nothing pending between the read and write positions */
+    if(ps_disp_mgr->i4_rd_idx >= ps_disp_mgr->i4_wr_idx)
+    {
+        return NULL;
+    }
+
+    i4_slot = ps_disp_mgr->i4_rd_idx % DISP_MGR_MAX_CNT;
+    pv_disp_buf = ps_disp_mgr->apv_ptr[i4_slot];
+
+    /* Empty slot: leave the read index where it is */
+    if(NULL == pv_disp_buf)
+    {
+        return NULL;
+    }
+
+    *pi4_buf_id = ps_disp_mgr->ai4_buf_id[i4_slot];
+    ps_disp_mgr->i4_rd_idx++;
+
+    return pv_disp_buf;
+}
diff --git a/common/impeg2_disp_mgr.h b/common/impeg2_disp_mgr.h
new file mode 100644
index 0000000..96b01b0
--- /dev/null
+++ b/common/impeg2_disp_mgr.h
@@ -0,0 +1,67 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* impeg2_disp_mgr.h
+*
+* @brief
+* Function declarations used for display management
+*
+* @author
+* Srinivas T
+*
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef _IMPEG2_DISP_MGR_H_
+#define _IMPEG2_DISP_MGR_H_
+
+#define DISP_MGR_MAX_CNT 64 /* Number of slots in apv_ptr[] and ai4_buf_id[] */
+#define DEFAULT_POC 0x7FFFFFFF /* NOTE(review): not referenced by the display manager functions - confirm it is still needed */
+
+typedef struct
+{
+ /**
+ * apv_ptr[DISP_MGR_MAX_CNT]
+ * Display buffer pointers; slot is (index % DISP_MGR_MAX_CNT)
+ */
+ void *apv_ptr[DISP_MGR_MAX_CNT];
+
+ WORD32 ai4_buf_id[DISP_MGR_MAX_CNT]; /* Buffer id stored alongside each apv_ptr[] slot */
+
+ WORD32 i4_wr_idx; /* Write index; incremented on every successful add, never wrapped */
+
+ WORD32 i4_rd_idx; /* Read index; incremented every time a buffer is returned by get */
+}disp_mgr_t;
+
+void impeg2_disp_mgr_init(
+ disp_mgr_t *ps_disp_mgr); /* Clears all slot pointers and zeroes the read/write indices */
+
+WORD32 impeg2_disp_mgr_add(
+ disp_mgr_t *ps_disp_mgr,
+ void *pv_ptr,
+ WORD32 i4_buf_id); /* Queues pv_ptr and i4_buf_id at the write index; returns 0 on success */
+
+void* impeg2_disp_mgr_get(disp_mgr_t *ps_disp_mgr, WORD32 *pi4_buf_id); /* Returns next queued buffer or NULL; *pi4_buf_id is -1 when NULL is returned */
+
+#endif //_IMPEG2_DISP_MGR_H_
diff --git a/common/impeg2_format_conv.c b/common/impeg2_format_conv.c
new file mode 100644
index 0000000..ec0bcfb
--- /dev/null
+++ b/common/impeg2_format_conv.c
@@ -0,0 +1,401 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : impeg2_format_conv.c */
+/* */
+/* Description : Contains functions needed to copy planar YUV420 images */
+/* or convert them to YUV422 interleaved / YUV420 */
+/* semi-planar layouts */
+/* List of Functions : impeg2_copy_frm_yuv420p() */
+/* impeg2_fmt_conv_yuv420p_to_yuv422ile() */
+/* impeg2_fmt_conv_yuv420p_to_yuv420sp_vu() */
+/* impeg2_fmt_conv_yuv420p_to_yuv420sp_uv() */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 28 08 2007 Naveen Kumar T Draft */
+/* */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+
+/* System include files */
+
+/* User include files */
+#include <stdio.h>
+#include <string.h>
+#include "iv_datatypedef.h"
+#include "iv.h"
+#include "ithread.h"
+
+#include "iv_datatypedef.h"
+#include "impeg2_macros.h"
+#include "impeg2_buf_mgr.h"
+#include "impeg2_disp_mgr.h"
+#include "impeg2_defs.h"
+#include "impeg2_platform_macros.h"
+
+#include "impeg2_job_queue.h"
+#include "impeg2_format_conv.h"
+
+
+/*****************************************************************************/
+/*                                                                           */
+/* Function Name : impeg2_copy_frm_yuv420p()                                 */
+/*                                                                           */
+/* Description   : Copies a planar YUV420 frame from the source planes to    */
+/*                 the destination planes. No format conversion is done.     */
+/*                                                                           */
+/* Inputs        : pu1_src_y       - UWORD8 pointer to source y plane.       */
+/*                 pu1_src_u       - UWORD8 pointer to source u plane.       */
+/*                 pu1_src_v       - UWORD8 pointer to source v plane.       */
+/*                 pu1_dst_y       - UWORD8 pointer to dest y plane.         */
+/*                 pu1_dst_u       - UWORD8 pointer to dest u plane.         */
+/*                 pu1_dst_v       - UWORD8 pointer to dest v plane.         */
+/*                 u4_width        - Width of image.                         */
+/*                 u4_height       - Height of image.                        */
+/*                 u4_src_stride_y - Stride in pixels of source Y plane.     */
+/*                 u4_src_stride_u - Stride in pixels of source U plane.     */
+/*                 u4_src_stride_v - Stride in pixels of source V plane.     */
+/*                 u4_dst_stride_y - Stride in pixels of dest Y plane.       */
+/*                 u4_dst_stride_u - Stride in pixels of dest U plane.       */
+/*                 u4_dst_stride_v - Stride in pixels of dest V plane.       */
+/*                                                                           */
+/* Globals       : None                                                      */
+/*                                                                           */
+/* Processing    : Each plane is copied one row at a time with memcpy.       */
+/*                 Luma rows are u4_width bytes, u4_height of them. Chroma   */
+/*                 rows are (u4_width >> 1) bytes, (u4_height >> 1) of them. */
+/*                                                                           */
+/* Outputs       : None                                                      */
+/*                                                                           */
+/* Returns       : None                                                      */
+/*                                                                           */
+/* Issues        : None                                                      */
+/*                                                                           */
+/* Revision History:                                                         */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
+/*         29 08 2007   Naveen Kumar T        Draft                          */
+/*                                                                           */
+/*****************************************************************************/
+void impeg2_copy_frm_yuv420p(UWORD8 *pu1_src_y,
+                             UWORD8 *pu1_src_u,
+                             UWORD8 *pu1_src_v,
+                             UWORD8 *pu1_dst_y,
+                             UWORD8 *pu1_dst_u,
+                             UWORD8 *pu1_dst_v,
+                             UWORD32 u4_width,
+                             UWORD32 u4_height,
+                             UWORD32 u4_src_stride_y,
+                             UWORD32 u4_src_stride_u,
+                             UWORD32 u4_src_stride_v,
+                             UWORD32 u4_dst_stride_y,
+                             UWORD32 u4_dst_stride_u,
+                             UWORD32 u4_dst_stride_v)
+{
+    WORD32 i4_rows_left;
+    WORD32 i4_luma_rows = (WORD32) u4_height;
+    WORD32 i4_chroma_rows = u4_height >> 1;
+    WORD32 i4_chroma_cols = u4_width >> 1;
+
+    /* Y plane */
+    for(i4_rows_left = i4_luma_rows; i4_rows_left > 0; i4_rows_left--)
+    {
+        memcpy(pu1_dst_y, pu1_src_y, u4_width);
+        pu1_src_y += u4_src_stride_y;
+        pu1_dst_y += u4_dst_stride_y;
+    }
+
+    /* U plane */
+    for(i4_rows_left = i4_chroma_rows; i4_rows_left > 0; i4_rows_left--)
+    {
+        memcpy(pu1_dst_u, pu1_src_u, i4_chroma_cols);
+        pu1_src_u += u4_src_stride_u;
+        pu1_dst_u += u4_dst_stride_u;
+    }
+
+    /* V plane */
+    for(i4_rows_left = i4_chroma_rows; i4_rows_left > 0; i4_rows_left--)
+    {
+        memcpy(pu1_dst_v, pu1_src_v, i4_chroma_cols);
+        pu1_src_v += u4_src_stride_v;
+        pu1_dst_v += u4_dst_stride_v;
+    }
+}
+
+/*****************************************************************************/
+/*                                                                           */
+/* Function Name : impeg2_fmt_conv_yuv420p_to_yuv422ile()                    */
+/*                                                                           */
+/* Description   : This function performs conversion from YUV420 planar to   */
+/*                 YUV422 interleaved color space.                           */
+/*                                                                           */
+/* Inputs        : pu1_y       - UWORD8 pointer to y plane.                  */
+/*                 pu1_u       - UWORD8 pointer to u plane.                  */
+/*                 pu1_v       - UWORD8 pointer to v plane.                  */
+/*                 pv_yuv422i  - pointer to the yuv422i output image; it is  */
+/*                               written through a UWORD32 pointer.          */
+/*                 u4_width    - Width of the Y plane.                       */
+/*                 u4_height   - Height of the Y plane.                      */
+/*                 u4_stride_y - Stride in pixels of Y plane.                */
+/*                 u4_stride_u - Stride in pixels of U plane.                */
+/*                 u4_stride_v - Stride in pixels of V plane.                */
+/*                 u4_stride_yuv422i - Stride in pixels of yuv422i image     */
+/*                               (two pixels per 32-bit output word).        */
+/*                                                                           */
+/* Globals       : None                                                      */
+/*                                                                           */
+/* Processing    : One row is processed at a time. Every pair of luma        */
+/*                 samples is packed with one U and one V sample into a      */
+/*                 32-bit word: U in bits 0-7, Y0 in bits 8-15, V in bits    */
+/*                 16-23 and Y1 in bits 24-31. For an odd width the last     */
+/*                 luma sample is used as both Y0 and Y1. Each chroma row    */
+/*                 is used for two consecutive luma rows.                    */
+/*                                                                           */
+/* Outputs       : None                                                      */
+/*                                                                           */
+/* Returns       : None                                                      */
+/*                                                                           */
+/* Issues        : None                                                      */
+/*                                                                           */
+/* Revision History:                                                         */
+/*                                                                           */
+/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
+/*         29 08 2007   Naveen Kumar T        Draft                          */
+/*                                                                           */
+/*****************************************************************************/
+
+void impeg2_fmt_conv_yuv420p_to_yuv422ile(register UWORD8 *pu1_y,
+                                          register UWORD8 *pu1_u,
+                                          register UWORD8 *pu1_v,
+                                          void *pv_yuv422i,
+                                          UWORD32 u4_width,
+                                          UWORD32 u4_height,
+                                          UWORD32 u4_stride_y,
+                                          UWORD32 u4_stride_u,
+                                          UWORD32 u4_stride_v,
+                                          UWORD32 u4_stride_yuv422i)
+{
+    /* Output is written one 32-bit word (two pixels) at a time */
+    UWORD32 *pu4_yuv422i = (UWORD32 *)pv_yuv422i;
+
+    /* Full luma pairs per row, and chroma samples consumed per row */
+    UWORD32 u4_width_cnt = u4_width >> 1;
+    UWORD32 u4_chroma_cnt = (u4_width + 1) >> 1;
+
+    /* Row-end adjustments. These are kept as signed 32-bit values so that   */
+    /* they are neither truncated to 16 bits nor turned into a large         */
+    /* positive number when a stride is smaller than the row length          */
+    WORD32 i4_offset_y = (WORD32)u4_stride_y - (WORD32)u4_width;
+    WORD32 i4_offset_u = (WORD32)u4_stride_u - (WORD32)u4_chroma_cnt;
+    WORD32 i4_offset_v = (WORD32)u4_stride_v - (WORD32)u4_chroma_cnt;
+    WORD32 i4_offset_yuv422i = (WORD32)(u4_stride_yuv422i >> 1) - (WORD32)u4_chroma_cnt;
+
+    UWORD32 u4_row, u4_col;
+    UWORD32 u4_pixel;
+    UWORD8 u1_y0, u1_y1, u1_u0, u1_v0;
+
+    /* 0 on the first row of each row pair, 1 on the second */
+    UWORD8 u1_flag = 0x0;
+
+    /* Run the loop for height of input buffer */
+    for(u4_row = 0; u4_row < u4_height; u4_row++)
+    {
+        /* Run the loop for width/2 */
+        for(u4_col = 0; u4_col < u4_width_cnt; u4_col++)
+        {
+            u1_y0 = *pu1_y++;
+            u1_y1 = *pu1_y++;
+            u1_v0 = *pu1_v++;
+            u1_u0 = *pu1_u++;
+
+            u4_pixel = ((UWORD32)u1_y1 << 24) | ((UWORD32)u1_v0 << 16) |
+                       ((UWORD32)u1_y0 << 8) | (UWORD32)u1_u0;
+            *pu4_yuv422i++ = u4_pixel;
+        }
+
+        /* In case width is an odd number take care of the last pixel */
+        if(u4_width & 0x1)
+        {
+            u1_y0 = *pu1_y++;
+            u1_v0 = *pu1_v++;
+            u1_u0 = *pu1_u++;
+
+            /* Take Y0 as Y1 */
+            u4_pixel = ((UWORD32)u1_y0 << 24) | ((UWORD32)u1_v0 << 16) |
+                       ((UWORD32)u1_y0 << 8) | (UWORD32)u1_u0;
+            *pu4_yuv422i++ = u4_pixel;
+        }
+
+        /* Make the luma pointer point to the next row */
+        pu1_y = pu1_y + i4_offset_y;
+
+        if(!u1_flag)
+        {
+            /* Rewind the u and v pointers so that the next luma row is      */
+            /* processed with the same row of u and v values again           */
+            pu1_u = pu1_u - u4_chroma_cnt;
+            pu1_v = pu1_v - u4_chroma_cnt;
+        }
+        else
+        {
+            /* Advance the u and v pointers to the next chroma row */
+            pu1_u = pu1_u + i4_offset_u;
+            pu1_v = pu1_v + i4_offset_v;
+        }
+
+        /* Adjust the output buffer pointer for next row */
+        pu4_yuv422i = pu4_yuv422i + i4_offset_yuv422i;
+
+        /* Toggle between first and second row of the pair */
+        u1_flag = u1_flag ^ 0x1;
+    }
+}
+
+
+
+
+/*****************************************************************************/
+/*                                                                           */
+/* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu()                  */
+/*                                                                           */
+/* Description   : Converts planar YUV420 to semi-planar YUV420 with the     */
+/*                 chroma plane interleaved in V,U order.                    */
+/*                                                                           */
+/* Inputs        : pu1_y, pu1_u, pu1_v - source Y, U and V planes.           */
+/*                 pu1_dest_y          - destination Y plane.                */
+/*                 pu1_dest_uv         - destination interleaved plane.      */
+/*                 u4_height, u4_width - luma size (height comes first).     */
+/*                 u4_stridey, u4_strideu, u4_stridev - source strides.      */
+/*                 u4_dest_stride_y, u4_dest_stride_uv - dest strides, in    */
+/*                                       bytes.                              */
+/*                 u4_convert_uv_only  - when non-zero the Y plane is not    */
+/*                                       copied.                             */
+/*                                                                           */
+/* Processing    : The Y plane is copied row by row. Then for each of the    */
+/*                 (u4_height + 1) >> 1 chroma rows, (u4_width + 1) >> 1     */
+/*                 V,U byte pairs are written. Every destination row starts  */
+/*                 exactly one stride after the previous one, also when      */
+/*                 u4_width is odd.                                          */
+/*                                                                           */
+/* Returns       : None                                                      */
+/*                                                                           */
+/*****************************************************************************/
+void impeg2_fmt_conv_yuv420p_to_yuv420sp_vu(UWORD8 *pu1_y, UWORD8 *pu1_u, UWORD8 *pu1_v,
+                                            UWORD8 *pu1_dest_y, UWORD8 *pu1_dest_uv,
+                                            UWORD32 u4_height, UWORD32 u4_width, UWORD32 u4_stridey,
+                                            UWORD32 u4_strideu, UWORD32 u4_stridev,
+                                            UWORD32 u4_dest_stride_y, UWORD32 u4_dest_stride_uv,
+                                            UWORD32 u4_convert_uv_only
+                                            )
+
+{
+    /* 32-bit counters: a 16-bit counter can never reach heights > 65535 */
+    UWORD32 u4_row, u4_col;
+    UWORD32 u4_height_uv = (u4_height + 1) >> 1;
+    UWORD32 u4_width_uv = (u4_width + 1) >> 1;
+
+    /* Copy Y buffer */
+    if(0 == u4_convert_uv_only)
+    {
+        UWORD8 *pu1_src = pu1_y;
+        UWORD8 *pu1_dst = pu1_dest_y;
+
+        for(u4_row = 0; u4_row < u4_height; u4_row++)
+        {
+            memcpy((void *)pu1_dst, (void *)pu1_src, u4_width);
+            pu1_dst += u4_dest_stride_y;
+            pu1_src += u4_stridey;
+        }
+    }
+
+    /* Interleave Cr and Cb buffers. Rows are addressed from their start,   */
+    /* so the 2 * u4_width_uv bytes written per row (u4_width + 1 for odd   */
+    /* widths) do not shift the following rows                              */
+    for(u4_row = 0; u4_row < u4_height_uv; u4_row++)
+    {
+        for(u4_col = 0; u4_col < u4_width_uv; u4_col++)
+        {
+            pu1_dest_uv[2 * u4_col] = pu1_v[u4_col];
+            pu1_dest_uv[(2 * u4_col) + 1] = pu1_u[u4_col];
+        }
+
+        pu1_dest_uv += u4_dest_stride_uv;
+        pu1_u += u4_strideu;
+        pu1_v += u4_stridev;
+    }
+}
+
+/*****************************************************************************/
+/*                                                                           */
+/* Function Name : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv()                  */
+/*                                                                           */
+/* Description   : Converts planar YUV420 to semi-planar YUV420 with the     */
+/*                 chroma plane interleaved in U,V order.                    */
+/*                                                                           */
+/* Inputs        : pu1_y, pu1_u, pu1_v - source Y, U and V planes.           */
+/*                 pu1_dest_y          - destination Y plane.                */
+/*                 pu1_dest_uv         - destination interleaved plane.      */
+/*                 u4_height, u4_width - luma size (height comes first).     */
+/*                 u4_stridey, u4_strideu, u4_stridev - source strides.      */
+/*                 u4_dest_stride_y, u4_dest_stride_uv - dest strides, in    */
+/*                                       bytes.                              */
+/*                 u4_convert_uv_only  - when non-zero the Y plane is not    */
+/*                                       copied.                             */
+/*                                                                           */
+/* Processing    : The Y plane is copied row by row. Then for each of the    */
+/*                 (u4_height + 1) >> 1 chroma rows, (u4_width + 1) >> 1     */
+/*                 U,V byte pairs are written. Every destination row starts  */
+/*                 exactly one stride after the previous one, also when      */
+/*                 u4_width is odd.                                          */
+/*                                                                           */
+/* Returns       : None                                                      */
+/*                                                                           */
+/*****************************************************************************/
+void impeg2_fmt_conv_yuv420p_to_yuv420sp_uv(UWORD8 *pu1_y, UWORD8 *pu1_u, UWORD8 *pu1_v,
+                                            UWORD8 *pu1_dest_y, UWORD8 *pu1_dest_uv,
+                                            UWORD32 u4_height, UWORD32 u4_width, UWORD32 u4_stridey,
+                                            UWORD32 u4_strideu, UWORD32 u4_stridev,
+                                            UWORD32 u4_dest_stride_y, UWORD32 u4_dest_stride_uv,
+                                            UWORD32 u4_convert_uv_only)
+
+{
+    /* 32-bit counters: a 16-bit counter can never reach heights > 65535 */
+    UWORD32 u4_row, u4_col;
+    UWORD32 u4_height_uv = (u4_height + 1) >> 1;
+    UWORD32 u4_width_uv = (u4_width + 1) >> 1;
+
+    /* Copy Y buffer */
+    if(0 == u4_convert_uv_only)
+    {
+        UWORD8 *pu1_src = pu1_y;
+        UWORD8 *pu1_dst = pu1_dest_y;
+
+        for(u4_row = 0; u4_row < u4_height; u4_row++)
+        {
+            memcpy((void *)pu1_dst, (void *)pu1_src, u4_width);
+            pu1_dst += u4_dest_stride_y;
+            pu1_src += u4_stridey;
+        }
+    }
+
+    /* Interleave Cb and Cr buffers. Rows are addressed from their start,   */
+    /* so the 2 * u4_width_uv bytes written per row (u4_width + 1 for odd   */
+    /* widths) do not shift the following rows                              */
+    for(u4_row = 0; u4_row < u4_height_uv; u4_row++)
+    {
+        for(u4_col = 0; u4_col < u4_width_uv; u4_col++)
+        {
+            pu1_dest_uv[2 * u4_col] = pu1_u[u4_col];
+            pu1_dest_uv[(2 * u4_col) + 1] = pu1_v[u4_col];
+        }
+
+        pu1_dest_uv += u4_dest_stride_uv;
+        pu1_u += u4_strideu;
+        pu1_v += u4_stridev;
+    }
+
+}
+
+
diff --git a/common/impeg2_format_conv.h b/common/impeg2_format_conv.h
new file mode 100644
index 0000000..52400d3
--- /dev/null
+++ b/common/impeg2_format_conv.h
@@ -0,0 +1,133 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : impeg2_format_conv.h */
+/* */
+/* Description : Contains coefficients and constants required for */
+/* converting from rgb and gray color spaces to yuv422i */
+/* color space */
+/* */
+/* List of Functions : None */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 27 08 2007 Naveen Kumar T Draft */
+/* */
+/*****************************************************************************/
+
+#ifndef __IMPEG2_FORMAT_CONV_H__
+#define __IMPEG2_FORMAT_CONV_H__
+
+/*****************************************************************************/
+/* Constants                                                                 */
+/*****************************************************************************/
+
+/* Integer colour-conversion coefficients, named COEFF_<index>_<component>.  */
+/* Negative values are parenthesised so that each macro expands to a single  */
+/* primary expression wherever it is used.                                   */
+#define COEFF_0_Y 66
+#define COEFF_1_Y 129
+#define COEFF_2_Y 25
+#define COEFF_0_U (-38)
+#define COEFF_1_U (-75)
+#define COEFF_2_U 112
+#define COEFF_0_V 112
+#define COEFF_1_V (-94)
+#define COEFF_2_V (-18)
+#define CONST_RGB_YUV1 4096
+#define CONST_RGB_YUV2 32768
+#define CONST_GRAY_YUV 128
+
+/* Two 16-bit coefficients packed into one 32-bit word: the first name in    */
+/* each macro is the high half, the second the low half, both stored as      */
+/* 16-bit two's complement (e.g. 0xFFEE == -18 == COEFF_2_V, 0x0070 == 112   */
+/* == COEFF_2_U).                                                            */
+#define COEF_2_V2_U 0xFFEE0070
+
+#define COF_2Y_0Y 0x00190042
+#define COF_1U_0U 0xFFB5FFDA
+#define COF_1V_0V 0xFFA20070
+
+
+/*****************************************************************************/
+/* Enums */
+/*****************************************************************************
+ * input_format_t: identifiers for pixel formats, numbered explicitly from
+ * 0 to 7. Nothing in this header uses the type; NOTE(review): the exact
+ * layout behind each name (e.g. RGB24 vs RGB24i) is not visible here -
+ * confirm against the users of this enum.
+ *****************************************************************************/
+typedef enum {
+GRAY_SCALE = 0,
+YUV444 = 1,
+YUV420 = 2,
+YUV422H = 3,
+YUV422V = 4,
+YUV411 = 5,
+RGB24 = 6,
+RGB24i = 7
+}input_format_t;
+
+/*****************************************************************************/
+/* Function Declarations */
+/*****************************************************************************
+ * pf_copy_yuv420p_buf_t: function type for a routine taking three source
+ * plane pointers, three destination plane pointers, the picture size and one
+ * stride per source and destination plane. Note the argument order: width
+ * comes before height here, whereas pf_fmt_conv_yuv420p_to_yuv420sp_t below
+ * takes height before width.
+ *****************************************************************************/
+typedef void pf_copy_yuv420p_buf_t(UWORD8 *pu1_src_y,
+ UWORD8 *pu1_src_u,
+ UWORD8 *pu1_src_v,
+ UWORD8 *pu1_dst_y,
+ UWORD8 *pu1_dst_u,
+ UWORD8 *pu1_dst_v,
+ UWORD32 u4_width,
+ UWORD32 u4_height,
+ UWORD32 u4_src_stride_y,
+ UWORD32 u4_src_stride_u,
+ UWORD32 u4_src_stride_v,
+ UWORD32 u4_dst_stride_y,
+ UWORD32 u4_dst_stride_u,
+ UWORD32 u4_dst_stride_v);
+
+typedef void pf_fmt_conv_yuv420p_to_yuv422ile_t(UWORD8 *pu1_y, /* function type of impeg2_fmt_conv_yuv420p_to_yuv422ile */
+ UWORD8 *pu1_u,
+ UWORD8 *pu1_v,
+ void *pv_yuv422i, /* untyped output buffer; presumably packed 4:2:2 - TODO confirm layout */
+ UWORD32 u4_width, /* width precedes height in this signature */
+ UWORD32 u4_height,
+ UWORD32 u4_stride_y,
+ UWORD32 u4_stride_u,
+ UWORD32 u4_stride_v,
+ UWORD32 u4_stride_yuv422i); /* NOTE(review): unit of this stride (bytes vs pixels) is not visible here */
+
+/* Function type shared by every yuv420p -> yuv420sp converter declared in  */
+/* this header (the _uv and _vu variants and their suffixed versions).       */
+/* Height is passed before width. All size and stride arguments are 32-bit,  */
+/* so they carry the u4_ prefix used by the C definitions.                   */
+/* A non-zero u4_convert_uv_only makes the C implementation skip the luma    */
+/* copy and write only the interleaved chroma plane.                         */
+typedef void pf_fmt_conv_yuv420p_to_yuv420sp_t(UWORD8 *pu1_y,
+                                               UWORD8 *pu1_u,
+                                               UWORD8 *pu1_v,
+                                               UWORD8 *pu1_dest_y,
+                                               UWORD8 *pu1_dest_uv,
+                                               UWORD32 u4_height,
+                                               UWORD32 u4_width,
+                                               UWORD32 u4_stridey,
+                                               UWORD32 u4_strideu,
+                                               UWORD32 u4_stridev,
+                                               UWORD32 u4_dest_stride_y,
+                                               UWORD32 u4_dest_stride_uv,
+                                               UWORD32 u4_convert_uv_only);
+
+pf_copy_yuv420p_buf_t impeg2_copy_frm_yuv420p; /* prototypes are declared through the function typedefs above */
+pf_fmt_conv_yuv420p_to_yuv422ile_t impeg2_fmt_conv_yuv420p_to_yuv422ile;
+pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_vu; /* chroma written Cr first, then Cb */
+pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_uv; /* chroma written Cb first, then Cr */
+
+pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_a9q; /* _a9q: presumably the common/arm assembly versions - TODO confirm */
+pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_a9q;
+
+pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8; /* _av8: presumably the common/armv8 assembly versions - TODO confirm */
+pf_fmt_conv_yuv420p_to_yuv420sp_t impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8;
+
+
+#endif /* __IMPEG2_FORMAT_CONV_H__ */
diff --git a/common/impeg2_globals.c b/common/impeg2_globals.c
new file mode 100644
index 0000000..9193ef7
--- /dev/null
+++ b/common/impeg2_globals.c
@@ -0,0 +1,351 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+#include <stdio.h>
+#include "iv_datatypedef.h"
+#include "iv.h"
+#include "impeg2_buf_mgr.h"
+#include "impeg2_disp_mgr.h"
+#include "impeg2_defs.h"
+#include "impeg2_platform_macros.h"
+#include "impeg2_globals.h"
+
+/* Table for converting the quantizer_scale_code to quantizer_scale.
+ * 32 entries, one per code value 0..31. The step between neighbouring
+ * entries doubles every eight codes: 1, then 2, then 4, then 8. */
+const UWORD8 gau1_impeg2_non_linear_quant_scale[] =
+{
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8,10,12,14,16,18,20,22,
+ 24,28,32,36,40,44,48,52,
+ 56,64,72,80,88,96,104,112
+};
+
+
+/* Default quantizer matrix to be used for intra blocks.
+ * 64 entries, written here as 8 rows of 8. */
+const UWORD8 gau1_impeg2_intra_quant_matrix_default[] =
+{
+ 8, 16, 19, 22, 26, 27, 29, 34,
+ 16, 16, 22, 24, 27, 29, 34, 37,
+ 19, 22, 26, 27, 29, 34, 34, 38,
+ 22, 22, 26, 27, 29, 34, 37, 40,
+ 22, 26, 27, 29, 32, 35, 40, 48,
+ 26, 27, 29, 32, 35, 40, 48, 58,
+ 26, 27, 29, 34, 38, 46, 56, 69,
+ 27, 29, 35, 38, 46, 56, 69, 83
+};
+
+/* Default quantizer matrix to be used for inter blocks.
+ * 64 entries, all equal to 16 (a flat matrix). */
+const UWORD8 gau1_impeg2_inter_quant_matrix_default[] =
+{
+ 16,16,16,16,16,16,16,16,
+ 16,16,16,16,16,16,16,16,
+ 16,16,16,16,16,16,16,16,
+ 16,16,16,16,16,16,16,16,
+ 16,16,16,16,16,16,16,16,
+ 16,16,16,16,16,16,16,16,
+ 16,16,16,16,16,16,16,16,
+ 16,16,16,16,16,16,16,16
+};
+
+/* Table to perform inverse scan when the scan direction is zigzag.
+ * Entry n is the position (row * 8 + column) inside the 8x8 block of the
+ * n-th coefficient in scan order; the table is a permutation of 0..63. */
+const UWORD8 gau1_impeg2_inv_scan_zig_zag[] =
+{
+ 0, 1, 8, 16, 9, 2, 3, 10,
+ 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34,
+ 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36,
+ 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46,
+ 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+/* Table to perform inverse scan when the direction of scanning is vertical.
+ * Same convention as the zigzag table: entry n is the position
+ * (row * 8 + column) of the n-th coefficient in scan order. Presumably this
+ * is the MPEG-2 "alternate scan" - TODO confirm against the standard. */
+const UWORD8 gau1_impeg2_inv_scan_vertical[] =
+{
+ 0, 8, 16, 24, 1, 9, 2, 10,
+ 17, 25, 32, 40, 48, 56, 57, 49,
+ 41, 33, 26, 18, 3, 11, 4, 12,
+ 19, 27, 34, 42, 50, 58, 35, 43,
+ 51, 59, 20, 28, 5, 13, 6, 14,
+ 21, 29, 36, 44, 52, 60, 37, 45,
+ 53, 61, 22, 30, 7, 15, 23, 31,
+ 38, 46, 54, 62, 39, 47, 55, 63
+};
+
+/*****************************************************************************/
+/* Tables that indicate which interpolation type is to be used */
+/*****************************************************************************/
+/* Chroma: four 16-entry sub-tables, one per sign combination of the motion
+ * vector components, in the order (+X,+Y), (-X,+Y), (+X,-Y), (-X,-Y).
+ * Every entry is an interpolation type in the range 0..3; presumably
+ * 0 = FXFY, 1 = HXFY, 2 = FXHY, 3 = HXHY, matching the order of
+ * gau2_impeg2_chroma_interp_inp2 - TODO confirm against the users.
+ * NOTE(review): how the 16 entries of a sub-table are indexed is not visible
+ * in this file. */
+const UWORD16 gau2_impeg2_chroma_interp_mv[][16] =
+{
+ /* Pos X Pos Y */
+ {
+ 0, 0, 1, 1,
+ 0, 0, 1, 1,
+ 2, 2, 3, 3,
+ 2, 2, 3, 3
+ },
+ /* Neg X Pos Y */
+ {
+ 0, 1, 1, 0,
+ 0, 1, 1, 0,
+ 2, 3, 3, 2,
+ 2, 3, 3, 2
+ },
+ /* Pos X Neg Y */
+ {
+ 0, 0, 1, 1,
+ 2, 2, 3, 3,
+ 2, 2, 3, 3,
+ 0, 0, 1, 1
+ },
+ /* Neg X Neg Y */
+ {
+ 0, 1, 1, 0,
+ 2, 3, 3, 2,
+ 2, 3, 3, 2,
+ 0, 1, 1, 0
+ }
+};
+/*****************************************************************************/
+/* Input #1 Offset in bytes */
+/*****************************************************************************/
+/* Chroma: same four-way split by motion vector sign as
+ * gau2_impeg2_chroma_interp_mv. The only non-zero offsets are 4, 72 and
+ * 76 (= 72 + 4); 4 and 72 are also the non-zero values of
+ * gau2_impeg2_chroma_interp_inp2. */
+const UWORD16 gau2_impeg2_chroma_interp_inp1[][16] =
+{
+ /* Pos X Pos Y */
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0
+ },
+ /* Neg X Pos Y */
+ {
+ 0, 0, 0, 4,
+ 0, 0, 0, 4,
+ 0, 0, 0, 4,
+ 0, 0, 0, 4
+ },
+ /* Pos X Neg Y */
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 72, 72, 72, 72
+ },
+ /* Neg X Neg Y */
+ {
+ 0, 0, 0, 4,
+ 0, 0, 0, 4,
+ 0, 0, 0, 4,
+ 72, 72, 72, 76
+ }
+};
+/* Luma: a single 16-entry table of input #1 offsets (no split by motion
+ * vector sign, unlike the chroma table). NOTE(review): the values 1, 3, 37
+ * and 39 are not derived anywhere in this file - confirm their meaning
+ * against the inter-prediction code. */
+const UWORD16 gau2_impeg2_luma_interp_inp1[] =
+{
+ 1, 1, 3, 3,
+ 1, 1, 3, 3,
+ 37, 37, 39, 39,
+ 37, 37, 39, 39
+};
+/*****************************************************************************/
+/* Input #2 Offset from Input #1 in bytes */
+/*****************************************************************************/
+/*
+ Offsets per interpolation type (the luma table below holds 16 entries built
+ from these three distinct values):
+ FXFY 0,
+ HXFY 2,
+ FXHY 36,
+ HXHY 36
+*/
+const UWORD16 gau2_impeg2_luma_interp_inp2[] =
+{
+ 0, 2, 0, 2,
+ 36, 36, 36, 36,
+ 0, 2, 0, 2,
+ 36, 36, 36, 36
+};
+const UWORD16 gau2_impeg2_chroma_interp_inp2[] = /* chroma input #2 offsets, one per interpolation type; each is twice the luma value listed above */
+{
+ /* FXFY */
+ 0,
+ /* HXFY */
+ 4,
+ /* FXHY */
+ 72,
+ /* HXHY */
+ 72
+};
+
+/*****************************************************************************/
+/* Corresponds to Table 6-4 frame_rate_value of the standard */
+/*****************************************************************************/
+/*
+ frame_rate_code frame_rate_value
+
+ 0000 Forbidden
+ 0001 24 000 ÷ 1001
+ 0010 24
+ 0011 25
+ 0100 30 000 ÷ 1001
+ 0101 30
+ 0110 50
+ 0111 60 000 ÷ 1001
+ 1000 60
+ 1001 Reserved
+ ....
+ 1111 Reserved
+
+ Each row below is a { numerator, denominator } pair; integer rates are
+ stored scaled by 1000 (e.g. 25 is { 25000, 1000 }).
+ NOTE(review): only codes 0..8 have a row. The reserved codes 9..15 have no
+ entry, so any caller indexing with the raw 4-bit code must range-check it
+ first - verify against the callers.
+*/
+const UWORD16 gau2_impeg2_frm_rate_code[][2] =
+{
+ {1 , 1}, /* Forbidden */
+ {24000, 1001},
+ {24000, 1000},
+ {25000, 1000},
+ {30000, 1001},
+ {30000, 1000},
+ {50000, 1000},
+ {60000, 1001},
+ {60000, 1000}
+ /* Rest reserved */
+};
+
+const WORD16 gai2_impeg2_idct_q15[] = /* 8x8 inverse-transform coefficients; impeg2_idct_recon() reads entry [u * 8 + k] as the weight of input sample u, for k = 0..3 */
+{
+ 23170, 23170, 23170, 23170, 23170, 23170, 23170, 23170,
+ 32138, 27246, 18205, 6393, -6393, -18205, -27246, -32138,
+ 30274, 12540, -12540, -30274, -30274, -12540, 12540, 30274,
+ 27246, -6393, -32138, -18205, 18205, 32138, 6393, -27246,
+ 23170, -23170, -23170, 23170, 23170, -23170, -23170, 23170,
+ 18205, -32138, 6393, 27246, -27246, -6393, 32138, -18205,
+ 12540, -30274, 30274, -12540, -12540, 30274, -30274, 12540,
+ 6393, -18205, 27246, -32138, 32138, -27246, 18205, -6393,
+};
+
+const WORD16 gai2_impeg2_idct_q11[] = /* same 8x8 layout as gai2_impeg2_idct_q15 with values about 1/16 as large; used by the second pass of impeg2_idct_recon() */
+{
+ 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448,
+ 2009, 1703, 1138, 400, -400, -1138, -1703, -2009,
+ 1892, 784, -784, -1892, -1892, -784, 784, 1892,
+ 1703, -400, -2009, -1138, 1138, 2009, 400, -1703,
+ 1448, -1448, -1448, 1448, 1448, -1448, -1448, 1448,
+ 1138, -2009, 400, 1703, -1703, -400, 2009, -1138,
+ 784, -1892, 1892, -784, -784, 1892, -1892, 784,
+ 400, -1138, 1703, -2009, 2009, -1703, 1138, -400,
+};
+
+const WORD16 gai2_impeg2_idct_even_8_q15[][8] = /* 4 rows; each row repeats one pair of Q15 coefficients four times. Not referenced by impeg2_idct.c; presumably consumed by a SIMD implementation - TODO confirm */
+{
+ { 23170, 23170, 23170, 23170, 23170, 23170, 23170, 23170 },
+ { 12540, -30274, 12540, -30274, 12540, -30274, 12540, -30274 },
+ { 30274, 12540, 30274, 12540, 30274, 12540, 30274, 12540 },
+ { 23170, -23170, 23170, -23170, 23170, -23170, 23170, -23170 }
+};
+const WORD16 gai2_impeg2_idct_odd_8_q15[][8] = /* 8 rows of repeated Q15 coefficient pairs, same idea as the even table above */
+{
+ { 32138, 27246, 32138, 27246, 32138, 27246, 32138, 27246 },
+ { 18205, 6393, 18205, 6393, 18205, 6393, 18205, 6393 },
+ { 27246, -6393, 27246, -6393, 27246, -6393, 27246, -6393 },
+ { 32138, 18205, 32138, 18205, 32138, 18205, 32138, 18205 },
+ { 18205, -32138, 18205, -32138, 18205, -32138, 18205, -32138 },
+ { 6393, 27246, 6393, 27246, 6393, 27246, 6393, 27246 },
+ { 6393, -18205, 6393, -18205, 6393, -18205, 6393, -18205 },
+ { 27246, -32138, 27246, -32138, 27246, -32138, 27246, -32138 },
+};
+
+const WORD16 gai2_impeg2_idct_even_8_q11[][8] = /* Q11 counterpart of gai2_impeg2_idct_even_8_q15: 4 rows of one coefficient pair repeated four times */
+{
+ { 1448, 1448, 1448, 1448, 1448, 1448, 1448, 1448 },
+ { 784, -1892, 784, -1892, 784, -1892, 784, -1892 },
+ { 1892, 784, 1892, 784, 1892, 784, 1892, 784 },
+ { 1448, -1448, 1448, -1448, 1448, -1448, 1448, -1448 }
+};
+const WORD16 gai2_impeg2_idct_odd_8_q11[][8] = /* Q11 counterpart of gai2_impeg2_idct_odd_8_q15: 8 rows of repeated coefficient pairs */
+{
+ { 2009, 1703, 2009, 1703, 2009, 1703, 2009, 1703 },
+ { 1138, 400, 1138, 400, 1138, 400, 1138, 400 },
+ { 1703, -400, 1703, -400, 1703, -400, 1703, -400 },
+ { 2009, 1138, 2009, 1138, 2009, 1138, 2009, 1138 },
+ { 1138, -2009, 1138, -2009, 1138, -2009, 1138, -2009 },
+ { 400, 1703, 400, 1703, 400, 1703, 400, 1703 },
+ { 400, -1138, 400, -1138, 400, -1138, 400, -1138 },
+ { 1703, -2009, 1703, -2009, 1703, -2009, 1703, -2009 },
+};
+
+
+
+/*****************************************************************************/
+/* Last row IDCT Coefficients in Q11 format */
+/*****************************************************************************/
+const WORD16 gai2_impeg2_idct_last_row_q11[] = /* identical to the eighth (last) row of gai2_impeg2_idct_q11 */
+{
+ 400, -1138, 1703, -2009, 2009, -1703, 1138, -400,
+};
+
+const WORD16 gai2_impeg2_idct_first_col_q15[] = /* first entry of each of the eight rows of gai2_impeg2_idct_q15 */
+{
+ 23170, 32138, 30274, 27246, 23170, 18205, 12540, 6393,
+};
+
+const WORD16 gai2_impeg2_idct_first_col_q11[] = /* first entry of each of the eight rows of gai2_impeg2_idct_q11 */
+{
+ 1448, 2009, 1892, 1703, 1448, 1138, 784, 400,
+};
+
+/*****************************************************************************/
+/* Output of first stage idct (using gai2_impeg2_idct_q15 as coeffs) */
+/* for a 1D data (0, 0, 0, 0, 0, 0, 0, 1) */
+/* i.e. the last row of gai2_impeg2_idct_q15 after the stage-1 rounding */
+/* shift; the shift amount itself is defined outside this file. */
+/*****************************************************************************/
+
+const WORD16 gai2_impeg2_mismatch_stg1_outp[] =
+{
+ 2, -4, 7, -8, 8, -7, 4, -2
+};
+
+const WORD16 gai2_impeg2_mismatch_stg2_additive[] = /* 8x8 table with entry [r * 8 + c] == gai2_impeg2_mismatch_stg1_outp[r] * gai2_impeg2_idct_last_row_q11[c]; impeg2_idct_recon_dc_mismatch() adds one entry per output pixel before the stage-2 rounding shift */
+{
+ 800, -2276, 3406, -4018, 4018, -3406, 2276, -800,
+ -1600, 4552, -6812, 8036, -8036, 6812, -4552, 1600,
+ 2800, -7966, 11921, -14063, 14063, -11921, 7966, -2800,
+ -3200, 9104, -13624, 16072, -16072, 13624, -9104, 3200,
+ 3200, -9104, 13624, -16072, 16072, -13624, 9104, -3200,
+ -2800, 7966, -11921, 14063, -14063, 11921, -7966, 2800,
+ 1600, -4552, 6812, -8036, 8036, -6812, 4552, -1600,
+ -800, 2276, -3406, 4018, -4018, 3406, -2276, 800,
+};
+
+
+const UWORD8 gau1_impeg2_zerobuf[] = /* 64 zero bytes (8 x 8); NOTE(review): its users are outside this file - presumably an all-zero stand-in block, TODO confirm */
+{
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+/*****************************************************************************/
+/* Tables of offset needed to address block in an MB */
+/* Four entries each, one per block index 0..3. The _fld and _frm tables */
+/* differ only in the vertical offset of blocks 2 and 3 (1 versus 8). */
+/* NOTE(review): units (rows/pixels) are not visible here - confirm at users. */
+/*****************************************************************************/
+const WORD16 gai2_impeg2_blk_y_off_fld[] = {0,0,1,1};
+const WORD16 gai2_impeg2_blk_y_off_frm[] = {0,0,8,8};
+const WORD16 gai2_impeg2_blk_x_off[] = {0,8,0,8};
diff --git a/common/impeg2_globals.h b/common/impeg2_globals.h
new file mode 100755
index 0000000..e8c6865
--- /dev/null
+++ b/common/impeg2_globals.h
@@ -0,0 +1,57 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+#ifndef __IMPEG2_GLOBALS_H__
+#define __IMPEG2_GLOBALS_H__
+
+extern const UWORD8 gau1_impeg2_non_linear_quant_scale[]; /* this header includes nothing itself: UWORD8/UWORD16/WORD16 must already be defined (impeg2_globals.c includes iv_datatypedef.h first) */
+extern const UWORD8 gau1_impeg2_intra_quant_matrix_default[];
+extern const UWORD8 gau1_impeg2_inter_quant_matrix_default[];
+extern const UWORD8 gau1_impeg2_inv_scan_vertical[];
+extern const UWORD8 gau1_impeg2_inv_scan_zig_zag[];
+extern const UWORD16 gau2_impeg2_frm_rate_code[][2]; /* { numerator, denominator } rows; defined with 9 rows (codes 0..8) */
+
+extern const UWORD16 gau2_impeg2_chroma_interp_mv[][16]; /* interpolation type and offset tables */
+extern const UWORD16 gau2_impeg2_chroma_interp_inp1[][16];
+extern const UWORD16 gau2_impeg2_luma_interp_inp1[];
+extern const UWORD16 gau2_impeg2_luma_interp_inp2[];
+extern const UWORD16 gau2_impeg2_chroma_interp_inp2[];
+
+extern const WORD16 gai2_impeg2_idct_q15[]; /* 64-entry (8x8) inverse-transform coefficient tables */
+extern const WORD16 gai2_impeg2_idct_q11[];
+
+extern const WORD16 gai2_impeg2_mismatch_stg1_outp[];
+extern const WORD16 gai2_impeg2_idct_last_row_q11[];
+extern const WORD16 gai2_impeg2_idct_first_col_q15[];
+extern const WORD16 gai2_impeg2_idct_first_col_q11[];
+extern const WORD16 gai2_impeg2_mismatch_stg2_additive[];
+
+extern const WORD16 gai2_impeg2_blk_y_off_fld[]; /* per-block offsets inside a macroblock, 4 entries each */
+extern const WORD16 gai2_impeg2_blk_y_off_frm[];
+extern const WORD16 gai2_impeg2_blk_x_off[];
+
+extern const UWORD8 gau1_impeg2_zerobuf[];
+
+extern const WORD16 gai2_impeg2_idct_odd_8_q15[8][8]; /* explicit bounds match the row counts of the definitions in impeg2_globals.c */
+extern const WORD16 gai2_impeg2_idct_odd_8_q11[8][8];
+
+extern const WORD16 gai2_impeg2_idct_even_8_q11[4][8];
+extern const WORD16 gai2_impeg2_idct_even_8_q15[4][8];
+
+#endif /* __IMPEG2_GLOBALS_H__ */
diff --git a/common/impeg2_idct.c b/common/impeg2_idct.c
new file mode 100644
index 0000000..6834260
--- /dev/null
+++ b/common/impeg2_idct.c
@@ -0,0 +1,500 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : impeg2_idct.c */
+/* */
+/* Description : Contains 2d idct and inverse quantization functions */
+/* */
+/* List of Functions : impeg2_idct_recon_dc() */
+/* impeg2_idct_recon_dc_mismatch() */
+/* impeg2_idct_recon() */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 10 09 2005 Hairsh M First Version */
+/* */
+/*****************************************************************************/
+/*
+ IEEE - 1180 results for this IDCT
+ L 256 256 5 5 300 300 384 384 Thresholds
+ H 255 255 5 5 300 300 383 383
+ sign 1 -1 1 -1 1 -1 1 -1
+ Peak Error 1 1 1 1 1 1 1 1 1
+ Peak Mean Square Error 0.0191 0.0188 0.0108 0.0111 0.0176 0.0188 0.0165 0.0177 0.06
+ Overall Mean Square Error 0.01566406 0.01597656 0.0091875 0.00908906 0.01499063 0.01533281 0.01432344 0.01412344 0.02
+ Peak Mean Error 0.0027 0.0026 0.0028 0.002 0.0017 0.0033 0.0031 0.0025 0.015
+ Overall Mean Error 0.00002656 -0.00031406 0.00016875 0.00005469 -0.00003125 0.00011406 0.00009219 0.00004219 0.0015
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "iv_datatypedef.h"
+#include "iv.h"
+#include "impeg2_defs.h"
+#include "impeg2_platform_macros.h"
+
+#include "impeg2_macros.h"
+#include "impeg2_globals.h"
+#include "impeg2_idct.h"
+
+
+/**
+ * Reconstructs an 8x8 block from its DC coefficient alone.
+ *
+ * pi2_src[0] is scaled by the first Q15 and Q11 transform coefficients, with
+ * the stage-1 and stage-2 rounding shifts in between, giving a single value
+ * that is added to every prediction sample; each sum is clipped to 8 bits.
+ *
+ * pi2_tmp, i4_src_strd, i4_zero_cols and i4_zero_rows are accepted only so
+ * that the function matches pf_idct_recon_t; they are not used.
+ */
+void impeg2_idct_recon_dc(WORD16 *pi2_src,
+                          WORD16 *pi2_tmp,
+                          UWORD8 *pu1_pred,
+                          UWORD8 *pu1_dst,
+                          WORD32 i4_src_strd,
+                          WORD32 i4_pred_strd,
+                          WORD32 i4_dst_strd,
+                          WORD32 i4_zero_cols,
+                          WORD32 i4_zero_rows)
+{
+    WORD32 i4_dc;
+    WORD32 i4_row, i4_col;
+
+    UNUSED(pi2_tmp);
+    UNUSED(i4_src_strd);
+    UNUSED(i4_zero_cols);
+    UNUSED(i4_zero_rows);
+
+    /* First pass: DC term through the Q15 coefficient */
+    i4_dc = pi2_src[0] * gai2_impeg2_idct_q15[0];
+    i4_dc = (i4_dc + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT;
+
+    /* Second pass: same again with the Q11 coefficient */
+    i4_dc *= gai2_impeg2_idct_q11[0];
+    i4_dc = (i4_dc + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT;
+
+    /* The residual is constant over the block */
+    for(i4_row = 0; i4_row < TRANS_SIZE_8; i4_row++)
+    {
+        for(i4_col = 0; i4_col < TRANS_SIZE_8; i4_col++)
+        {
+            pu1_dst[i4_col] = CLIP_U8(i4_dc + pu1_pred[i4_col]);
+        }
+        pu1_pred += i4_pred_strd;
+        pu1_dst += i4_dst_strd;
+    }
+}
+/**
+ * DC-only reconstruction of an 8x8 block with a per-pixel correction term.
+ *
+ * The DC coefficient goes through the first transform stage exactly as in
+ * impeg2_idct_recon_dc(). Before the stage-2 rounding shift, the matching
+ * entry of gai2_impeg2_mismatch_stg2_additive (walked in row-major order) is
+ * added for every output pixel. The result is added to the prediction and
+ * clipped to 8 bits.
+ *
+ * pi2_tmp, i4_src_strd, i4_zero_cols and i4_zero_rows are not used.
+ */
+void impeg2_idct_recon_dc_mismatch(WORD16 *pi2_src,
+                                   WORD16 *pi2_tmp,
+                                   UWORD8 *pu1_pred,
+                                   UWORD8 *pu1_dst,
+                                   WORD32 i4_src_strd,
+                                   WORD32 i4_pred_strd,
+                                   WORD32 i4_dst_strd,
+                                   WORD32 i4_zero_cols,
+                                   WORD32 i4_zero_rows)
+{
+    const WORD16 *pi2_additive = gai2_impeg2_mismatch_stg2_additive;
+    WORD32 i4_dc;
+    WORD32 i4_row, i4_col;
+
+    UNUSED(pi2_tmp);
+    UNUSED(i4_src_strd);
+    UNUSED(i4_zero_cols);
+    UNUSED(i4_zero_rows);
+
+    /* Stage 1 on the DC term, then pre-multiply for stage 2 */
+    i4_dc = pi2_src[0] * gai2_impeg2_idct_q15[0];
+    i4_dc = (i4_dc + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT;
+    i4_dc *= gai2_impeg2_idct_q11[0];
+
+    for(i4_row = 0; i4_row < TRANS_SIZE_8; i4_row++)
+    {
+        for(i4_col = 0; i4_col < TRANS_SIZE_8; i4_col++)
+        {
+            /* One table entry per output pixel, consumed in order */
+            WORD32 i4_pix = i4_dc + *pi2_additive++;
+
+            i4_pix = (i4_pix + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT;
+            i4_pix += pu1_pred[i4_col];
+            pu1_dst[i4_col] = CLIP_U8(i4_pix);
+        }
+
+        pu1_pred += i4_pred_strd;
+        pu1_dst += i4_dst_strd;
+    }
+}
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  One 8-point 1-D inverse transform pass, shared by both stages of
+ *  impeg2_idct_recon()
+ *
+ * @par Description:
+ *  Uses the even/odd symmetry of the coefficient matrix, so only the first
+ *  four columns of each matrix row are multiplied. Every output is rounded,
+ *  shifted right by i4_shift and clipped with CLIP_S16.
+ *
+ * @param[in] pi2_coef
+ *  8x8 coefficient matrix (gai2_impeg2_idct_q15 or gai2_impeg2_idct_q11)
+ *
+ * @param[in] pi2_in
+ *  First of the eight input samples
+ *
+ * @param[in] i4_in_strd
+ *  Distance between consecutive input samples
+ *
+ * @param[in] i4_lower_half_only
+ *  Non-zero when input samples 4..7 are to be taken as zero; they are then
+ *  not read at all
+ *
+ * @param[in] i4_shift
+ *  Right shift applied to every output (rounding term is 1 << (shift - 1))
+ *
+ * @param[out] pi4_out
+ *  The eight transformed samples
+ *
+ * @returns Void
+ *
+ *******************************************************************************
+ */
+static void impeg2_idct_8pt_1d(const WORD16 *pi2_coef,
+                               const WORD16 *pi2_in,
+                               WORD32 i4_in_strd,
+                               WORD32 i4_lower_half_only,
+                               WORD32 i4_shift,
+                               WORD32 *pi4_out)
+{
+    WORD32 ai4_e[4], ai4_o[4];
+    WORD32 ai4_ee[2], ai4_eo[2];
+    WORD32 i4_add = 1 << (i4_shift - 1);
+    WORD32 k;
+
+    /* Utilizing symmetry properties to minimize the number of multiplications */
+    /* Contribution of input samples 0..3 */
+    for(k = 0; k < 4; k++)
+    {
+        ai4_o[k] = pi2_coef[1 * 8 + k] * pi2_in[i4_in_strd]
+                 + pi2_coef[3 * 8 + k] * pi2_in[3 * i4_in_strd];
+    }
+    ai4_eo[0] = pi2_coef[2 * 8 + 0] * pi2_in[2 * i4_in_strd];
+    ai4_eo[1] = pi2_coef[2 * 8 + 1] * pi2_in[2 * i4_in_strd];
+    ai4_ee[0] = pi2_coef[0 * 8 + 0] * pi2_in[0];
+    ai4_ee[1] = pi2_coef[0 * 8 + 1] * pi2_in[0];
+
+    /* Contribution of input samples 4..7, skipped when they are known zero */
+    if(!i4_lower_half_only)
+    {
+        for(k = 0; k < 4; k++)
+        {
+            ai4_o[k] += pi2_coef[5 * 8 + k] * pi2_in[5 * i4_in_strd];
+            ai4_o[k] += pi2_coef[7 * 8 + k] * pi2_in[7 * i4_in_strd];
+        }
+        ai4_eo[0] += pi2_coef[6 * 8 + 0] * pi2_in[6 * i4_in_strd];
+        ai4_eo[1] += pi2_coef[6 * 8 + 1] * pi2_in[6 * i4_in_strd];
+        ai4_ee[0] += pi2_coef[4 * 8 + 0] * pi2_in[4 * i4_in_strd];
+        ai4_ee[1] += pi2_coef[4 * 8 + 1] * pi2_in[4 * i4_in_strd];
+    }
+
+    /* Combining e and o terms at each hierarchy level to calculate the final spatial domain vector */
+    ai4_e[0] = ai4_ee[0] + ai4_eo[0];
+    ai4_e[3] = ai4_ee[0] - ai4_eo[0];
+    ai4_e[1] = ai4_ee[1] + ai4_eo[1];
+    ai4_e[2] = ai4_ee[1] - ai4_eo[1];
+    for(k = 0; k < 4; k++)
+    {
+        pi4_out[k] =
+            CLIP_S16(((ai4_e[k] + ai4_o[k] + i4_add) >> i4_shift));
+        pi4_out[k + 4] =
+            CLIP_S16(((ai4_e[3 - k] - ai4_o[3 - k] + i4_add) >> i4_shift));
+    }
+}
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ *  This function performs Inverse transform and reconstruction for 8x8
+ *  input block
+ *
+ * @par Description:
+ *  Performs inverse transform and adds the prediction data and clips output
+ *  to 8 bit
+ *
+ * @param[in] pi2_src
+ *  Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ *  Temporary 8x8 buffer for storing inverse transform 1st stage output
+ *  (stored transposed: input column j becomes row j of the buffer)
+ *
+ * @param[in] pu1_pred
+ *  Prediction 8x8 block
+ *
+ * @param[out] pu1_dst
+ *  Output 8x8 block
+ *
+ * @param[in] i4_src_strd
+ *  Input stride
+ *
+ * @param[in] i4_pred_strd
+ *  Prediction stride
+ *
+ * @param[in] i4_dst_strd
+ *  Output Stride
+ *
+ * @param[in] i4_zero_cols
+ *  Zero columns in pi2_src: when bit j is set the first stage writes zeros
+ *  for column j instead of transforming it. When bits 4..7 are all set,
+ *  columns 4..7 are skipped entirely and the second stage uses only the
+ *  first four rows of pi2_tmp.
+ *
+ * @param[in] i4_zero_rows
+ *  Zero rows in pi2_src: when bits 4..7 are all set the first stage does not
+ *  read rows 4..7 of pi2_src. Other bit patterns get no special handling.
+ *
+ * @returns Void
+ *
+ * @remarks
+ *  None
+ *
+ *******************************************************************************
+ */
+
+void impeg2_idct_recon(WORD16 *pi2_src,
+                       WORD16 *pi2_tmp,
+                       UWORD8 *pu1_pred,
+                       UWORD8 *pu1_dst,
+                       WORD32 i4_src_strd,
+                       WORD32 i4_pred_strd,
+                       WORD32 i4_dst_strd,
+                       WORD32 i4_zero_cols,
+                       WORD32 i4_zero_rows)
+{
+    WORD32 j, k;
+    WORD32 ai4_out[TRANS_SIZE_8];
+    WORD16 *pi2_tmp_row = pi2_tmp;
+
+    /* Rows 4..7 of the input are all zero: stage 1 needs rows 0..3 only */
+    WORD32 i4_src_rows_4_7_zero = ((i4_zero_rows & 0xF0) == 0xF0);
+    /* Columns 4..7 of the input are all zero: stage 1 handles four columns */
+    /* only and stage 2 needs rows 0..3 of the stage-1 output only          */
+    WORD32 i4_src_cols_4_7_zero = ((i4_zero_cols & 0xF0) == 0xF0);
+    WORD32 i4_num_cols = i4_src_cols_4_7_zero ? 4 : TRANS_SIZE_8;
+
+    /************************************************************************************************/
+    /**********************************START - IT_RECON_8x8******************************************/
+    /************************************************************************************************/
+
+    /* Inverse Transform 1st stage: one input column per iteration, written */
+    /* out as one row of pi2_tmp                                            */
+    for(j = 0; j < i4_num_cols; j++)
+    {
+        /* Checking for Zero Cols */
+        if((i4_zero_cols & 1) == 1)
+        {
+            memset(pi2_tmp_row, 0, TRANS_SIZE_8 * sizeof(WORD16));
+        }
+        else
+        {
+            impeg2_idct_8pt_1d(gai2_impeg2_idct_q15, pi2_src, i4_src_strd,
+                               i4_src_rows_4_7_zero, IDCT_STG1_SHIFT, ai4_out);
+            for(k = 0; k < TRANS_SIZE_8; k++)
+            {
+                pi2_tmp_row[k] = ai4_out[k];
+            }
+        }
+        pi2_src++;
+        pi2_tmp_row += TRANS_SIZE_8;
+        i4_zero_cols = i4_zero_cols >> 1;
+    }
+
+    /* Inverse Transform 2nd stage: one output row per iteration, read down */
+    /* column j of pi2_tmp, then prediction add and clip to 8 bits          */
+    for(j = 0; j < TRANS_SIZE_8; j++)
+    {
+        impeg2_idct_8pt_1d(gai2_impeg2_idct_q11, pi2_tmp + j, TRANS_SIZE_8,
+                           i4_src_cols_4_7_zero, IDCT_STG2_SHIFT, ai4_out);
+        for(k = 0; k < TRANS_SIZE_8; k++)
+        {
+            pu1_dst[k] = CLIP_U8((ai4_out[k] + pu1_pred[k]));
+        }
+        pu1_pred += i4_pred_strd;
+        pu1_dst += i4_dst_strd;
+    }
+
+    /************************************************************************************************/
+    /************************************END - IT_RECON_8x8******************************************/
+    /************************************************************************************************/
+}
+
diff --git a/common/impeg2_idct.h b/common/impeg2_idct.h
new file mode 100644
index 0000000..80defde
--- /dev/null
+++ b/common/impeg2_idct.h
@@ -0,0 +1,66 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+#ifndef __IMPEG2_IDCT_H__
+#define __IMPEG2_IDCT_H__
+
+
+/*****************************************************************************/
+/* Function Declarations */
+/*****************************************************************************/
+
+typedef void pf_idct_recon_t(WORD16 *pi2_src, /* 16-bit input block (presumably the coefficients - confirm) */
+ WORD16 *pi2_tmp, /* 16-bit scratch buffer (presumably intermediate results - confirm) */
+ UWORD8 *pu1_pred, /* 8-bit prediction samples */
+ UWORD8 *pu1_dst, /* 8-bit destination samples */
+ WORD32 src_strd, /* stride used with pi2_src */
+ WORD32 pred_strd, /* stride used with pu1_pred */
+ WORD32 dst_strd, /* stride used with pu1_dst */
+ WORD32 zero_cols, /* NOTE(review): meaning/encoding is not visible in this header - confirm */
+ WORD32 zero_rows); /* NOTE(review): meaning/encoding is not visible in this header - confirm */
+/* All entry points declared below share the pf_idct_recon_t signature */
+/* ARM assembly modules currently ignore non_zero_cols argument (NOTE(review): the typedef names it zero_cols) */
+pf_idct_recon_t impeg2_idct_recon_dc;
+
+pf_idct_recon_t impeg2_idct_recon_dc_mismatch;
+
+pf_idct_recon_t impeg2_idct_recon;
+
+/* _a9q variants (presumably the ARM builds under common/arm - confirm) */
+pf_idct_recon_t impeg2_idct_recon_dc_a9q;
+
+pf_idct_recon_t impeg2_idct_recon_dc_mismatch_a9q;
+
+pf_idct_recon_t impeg2_idct_recon_a9q;
+
+/* _av8 variants (presumably the ARMv8 builds under common/armv8 - confirm) */
+pf_idct_recon_t impeg2_idct_recon_dc_av8;
+
+pf_idct_recon_t impeg2_idct_recon_dc_mismatch_av8;
+
+pf_idct_recon_t impeg2_idct_recon_av8;
+/* _sse42 variants (presumably the x86 SSE4.2 builds under common/x86 - confirm) */
+pf_idct_recon_t impeg2_idct_recon_sse42;
+
+pf_idct_recon_t impeg2_idct_recon_dc_mismatch_sse42;
+
+pf_idct_recon_t impeg2_idct_recon_dc_sse42;
+
+#endif /* #ifndef __IMPEG2_IDCT_H__ */
+
diff --git a/common/impeg2_inter_pred.c b/common/impeg2_inter_pred.c
new file mode 100644
index 0000000..019fa5c
--- /dev/null
+++ b/common/impeg2_inter_pred.c
@@ -0,0 +1,467 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* impeg2_inter_pred.c
+*
+* @brief
+* Contains MC function definitions for MPEG2 decoder
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+* - impeg2_copy_mb()
+* - impeg2_interpolate()
+* - impeg2_mc_halfx_halfy_8x8()
+* - impeg2_mc_halfx_fully_8x8()
+* - impeg2_mc_fullx_halfy_8x8()
+* - impeg2_mc_fullx_fully_8x8()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include "iv_datatypedef.h"
+#include "iv.h"
+#include "impeg2_buf_mgr.h"
+#include "impeg2_disp_mgr.h"
+#include "impeg2_defs.h"
+#include "impeg2_platform_macros.h"
+
+#include "impeg2_inter_pred.h"
+#include "impeg2_globals.h"
+#include "impeg2_macros.h"
+#include "impeg2_idct.h"
+
+/*******************************************************************************
+* Function Name : impeg2_copy_mb
+*
+* Description : copies 3 components to the frame from mc_buf
+*
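+* Arguments :
+* ps_src_buf : Source buffer (Y, U and V plane pointers)
+* ps_dst_buf : Destination buffer (Y, U and V plane pointers)
+* u4_src_wd : Source luma width; half of it is used for the chroma planes
+* u4_dst_wd : Destination luma width; half of it is used for the chroma planes
+*
+* Processing :
+* Copies MB_SIZE x MB_SIZE luma samples, then (MB_SIZE / 2) x (MB_SIZE / 2)
+* samples from each of the U and V planes. The copy dimensions are fixed;
+* the function takes no offset, row or column arguments.
+*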
+*
+* Values Returned : None
+*******************************************************************************/
+void impeg2_copy_mb(yuv_buf_t *ps_src_buf,
+                    yuv_buf_t *ps_dst_buf,
+                    UWORD32 u4_src_wd,
+                    UWORD32 u4_dst_wd)
+{
+    UWORD8 *pu1_from;
+    UWORD8 *pu1_to;
+    UWORD32 u4_row;
+
+    /* Luma is copied as MB_SIZE rows of MB_SIZE bytes. Each chroma plane */
+    /* is copied as half as many rows of half as many bytes, and is       */
+    /* stepped with half of the corresponding luma width.                 */
+    const UWORD32 u4_luma_dim = MB_SIZE;
+    const UWORD32 u4_chroma_dim = u4_luma_dim >> 1;
+    const UWORD32 u4_src_wd_c = u4_src_wd >> 1;
+    const UWORD32 u4_dst_wd_c = u4_dst_wd >> 1;
+
+    /*******************************************************/
+    /* Y plane                                             */
+    /*******************************************************/
+    pu1_from = ps_src_buf->pu1_y;
+    pu1_to = ps_dst_buf->pu1_y;
+    for(u4_row = u4_luma_dim; u4_row > 0; u4_row--)
+    {
+        memcpy(pu1_to, pu1_from, u4_luma_dim);
+        pu1_to += u4_dst_wd;
+        pu1_from += u4_src_wd;
+    }
+
+    /*******************************************************/
+    /* U plane                                             */
+    /*******************************************************/
+    pu1_from = ps_src_buf->pu1_u;
+    pu1_to = ps_dst_buf->pu1_u;
+    for(u4_row = u4_chroma_dim; u4_row > 0; u4_row--)
+    {
+        memcpy(pu1_to, pu1_from, u4_chroma_dim);
+        pu1_to += u4_dst_wd_c;
+        pu1_from += u4_src_wd_c;
+    }
+
+    /*******************************************************/
+    /* V plane                                             */
+    /*******************************************************/
+    pu1_from = ps_src_buf->pu1_v;
+    pu1_to = ps_dst_buf->pu1_v;
+    for(u4_row = u4_chroma_dim; u4_row > 0; u4_row--)
+    {
+        memcpy(pu1_to, pu1_from, u4_chroma_dim);
+        pu1_to += u4_dst_wd_c;
+        pu1_from += u4_src_wd_c;
+    }
+
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_interpolate */
+/* */
+/* Description : averages the contents of buf_src1 and buf_src2 and stores*/
+/* result in buf_dst */
+/* */
+/* Inputs : buf_src1 - First Source */
+/* buf_src2 - Second Source */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Avg the values from two sources and store the result in */
+/* destination buffer */
+/* */
+/* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */
+/* */
+/* Returns : None */
+/* */
+/* Issues : Assumes that all 3 buffers are of same size */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 14 09 2005 Harish M First Version */
+/* 15 09 2010 Venkat Added stride */
+/* */
+/*****************************************************************************/
+void impeg2_interpolate(yuv_buf_t *ps_buf_src1,
+                        yuv_buf_t *ps_buf_src2,
+                        yuv_buf_t *ps_buf_dst,
+                        UWORD32 u4_stride)
+{
+    UWORD8 *pu1_a, *pu1_b, *pu1_avg;
+    UWORD32 u4_row, u4_col;
+
+    /* Luma: rounded average of MB_SIZE x MB_SIZE samples. The two sources */
+    /* are walked contiguously; only the destination is stepped by stride. */
+    pu1_a = ps_buf_src1->pu1_y;
+    pu1_b = ps_buf_src2->pu1_y;
+    pu1_avg = ps_buf_dst->pu1_y;
+    for(u4_row = 0; u4_row < MB_SIZE; u4_row++)
+    {
+        for(u4_col = 0; u4_col < MB_SIZE; u4_col++)
+        {
+            *pu1_avg++ = (*pu1_a++ + *pu1_b++ + 1) >> 1;
+        }
+        pu1_avg += u4_stride - MB_SIZE;
+    }
+
+    /* Both chroma planes use half of the luma stride */
+    u4_stride >>= 1;
+
+    /* U plane: MB_CHROMA_SIZE x MB_CHROMA_SIZE samples */
+    pu1_a = ps_buf_src1->pu1_u;
+    pu1_b = ps_buf_src2->pu1_u;
+    pu1_avg = ps_buf_dst->pu1_u;
+    for(u4_row = 0; u4_row < MB_CHROMA_SIZE; u4_row++)
+    {
+        for(u4_col = 0; u4_col < MB_CHROMA_SIZE; u4_col++)
+        {
+            *pu1_avg++ = (*pu1_a++ + *pu1_b++ + 1) >> 1;
+        }
+        pu1_avg += u4_stride - MB_CHROMA_SIZE;
+    }
+
+    /* V plane: MB_CHROMA_SIZE x MB_CHROMA_SIZE samples */
+    pu1_a = ps_buf_src1->pu1_v;
+    pu1_b = ps_buf_src2->pu1_v;
+    pu1_avg = ps_buf_dst->pu1_v;
+    for(u4_row = 0; u4_row < MB_CHROMA_SIZE; u4_row++)
+    {
+        for(u4_col = 0; u4_col < MB_CHROMA_SIZE; u4_col++)
+        {
+            *pu1_avg++ = (*pu1_a++ + *pu1_b++ + 1) >> 1;
+        }
+        pu1_avg += u4_stride - MB_CHROMA_SIZE;
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_mc_halfx_halfy_8x8() */
+/* */
+/* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */
+/* and the above block of size 8 x 8 will be placed as a */
+/* block from the current position of out_buf */
+/* */
+/* Inputs : pu1_ref - Position in the reference frame from which */
+/* the block will be extracted. */
+/* u4_ref_wid - Width of reference frame */
+/* u4_out_wid - Width of the output frame */
+/* The block is always BLK_SIZE x BLK_SIZE; there are */
+/* no block width or block height arguments */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */
+/* the ref frame.Interpolate these four values to get the */
+/* value at(0.5,0.5).Repeat this to get an 8 x 8 block */
+/* using 9 x 9 block from reference frame */
+/* */
+/* Outputs : out - Output containing the extracted block */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 05 09 2005 Harish M First Version */
+/* */
+/*****************************************************************************/
+void impeg2_mc_halfx_halfy_8x8(UWORD8 *pu1_out,
+                               UWORD8 *pu1_ref,
+                               UWORD32 u4_ref_wid,
+                               UWORD32 u4_out_wid)
+{
+    /* Every output sample is the rounded mean of a 2x2 group of        */
+    /* reference samples:                                               */
+    /*                                                                  */
+    /*        top[0]   top[1]                                           */
+    /*        bot[0]   bot[1]                                           */
+    /*                                                                  */
+    /* where bot is the reference row immediately below top.            */
+    UWORD8 *pu1_top = pu1_ref;
+    UWORD8 *pu1_bot = pu1_ref + u4_ref_wid;
+
+    /* Step from the end of one BLK_SIZE wide row to the start of the next */
+    const UWORD32 u4_ref_skip = u4_ref_wid - BLK_SIZE;
+    const UWORD32 u4_out_skip = u4_out_wid - BLK_SIZE;
+    UWORD32 u4_row;
+    UWORD32 u4_col;
+
+    for(u4_row = BLK_SIZE; u4_row > 0; u4_row--)
+    {
+        for(u4_col = BLK_SIZE; u4_col > 0; u4_col--)
+        {
+            const UWORD32 u4_sum = pu1_top[0] + pu1_top[1]
+                                 + pu1_bot[0] + pu1_bot[1] + 2;
+
+            *pu1_out++ = u4_sum >> 2;
+            pu1_top++;
+            pu1_bot++;
+        }
+
+        pu1_top += u4_ref_skip;
+        pu1_bot += u4_ref_skip;
+        pu1_out += u4_out_skip;
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_mc_halfx_fully_8x8() */
+/* */
+/* Description : Gets the buffer from (0.5,0) to (8.5,8) */
+/* and the above block of size 8 x 8 will be placed as a */
+/* block from the current position of out_buf */
+/* */
+/* Inputs : pu1_ref - Position in the reference frame from which */
+/* the block will be extracted. */
+/* u4_ref_wid - Width of reference frame */
+/* u4_out_wid - Width of the output frame */
+/* The block is always BLK_SIZE x BLK_SIZE; there are */
+/* no block width or block height arguments */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Point to the (0,0) and (1,0) position in the ref frame */
+/* Interpolate these two values to get the value at(0.5,0) */
+/* Repeat this to get an 8 x 8 block using 9 x 8 block from */
+/* reference frame */
+/* */
+/* Outputs : out - Output containing the extracted block */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 05 09 2005 Harish M First Version */
+/* */
+/*****************************************************************************/
+void impeg2_mc_halfx_fully_8x8(UWORD8 *pu1_out,
+                               UWORD8 *pu1_ref,
+                               UWORD32 u4_ref_wid,
+                               UWORD32 u4_out_wid)
+{
+    /* Every output sample Q is the rounded mean of two horizontally    */
+    /* adjacent reference samples:                                      */
+    /*                                                                  */
+    /*        ref[0]   Q   ref[1]                                       */
+    /*                                                                  */
+
+    const UWORD32 u4_ref_skip = u4_ref_wid - BLK_SIZE;
+    const UWORD32 u4_out_skip = u4_out_wid - BLK_SIZE;
+    UWORD32 u4_row;
+    UWORD32 u4_col;
+
+    for(u4_row = BLK_SIZE; u4_row > 0; u4_row--)
+    {
+        for(u4_col = BLK_SIZE; u4_col > 0; u4_col--)
+        {
+            *pu1_out++ = (pu1_ref[0] + pu1_ref[1] + 1) >> 1;
+            pu1_ref++;
+        }
+
+        /* Move both pointers on to the first sample of the next row */
+        pu1_ref += u4_ref_skip;
+        pu1_out += u4_out_skip;
+    }
+
+    return;
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_mc_fullx_halfy_8x8() */
+/* */
+/* Description : Gets the buffer from (0,0.5) to (8,8.5) */
+/* and the above block of size 8 x 8 will be placed as a */
+/* block from the current position of out_buf */
+/* */
+/* Inputs : pu1_ref - Position in the reference frame from which */
+/* the block will be extracted. */
+/* u4_ref_wid - Width of reference frame */
+/* u4_out_wid - Width of the output frame */
+/* The block is always BLK_SIZE x BLK_SIZE; there are */
+/* no block width or block height arguments */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Point to the (0,0) and (0,1) position in the ref frame */
+/* Interpolate these two values to get the value at(0,0.5) */
+/* Repeat this to get an 8 x 8 block using 8 x 9 block from */
+/* reference frame */
+/* */
+/* Outputs : out - Output containing the extracted block */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 05 09 2005 Harish M First Version */
+/* */
+/*****************************************************************************/
+void impeg2_mc_fullx_halfy_8x8(UWORD8 *pu1_out,
+                               UWORD8 *pu1_ref,
+                               UWORD32 u4_ref_wid,
+                               UWORD32 u4_out_wid)
+{
+    /* Every output sample Q is the rounded mean of two vertically      */
+    /* adjacent reference samples, one reference row apart:             */
+    /*                                                                  */
+    /*        ref[0]                                                    */
+    /*          Q                                                       */
+    /*        ref[u4_ref_wid]                                           */
+    /*                                                                  */
+
+    const UWORD32 u4_ref_skip = u4_ref_wid - BLK_SIZE;
+    const UWORD32 u4_out_skip = u4_out_wid - BLK_SIZE;
+    UWORD32 u4_row;
+    UWORD32 u4_col;
+
+    for(u4_row = BLK_SIZE; u4_row > 0; u4_row--)
+    {
+        for(u4_col = BLK_SIZE; u4_col > 0; u4_col--)
+        {
+            *pu1_out++ = (pu1_ref[0] + pu1_ref[u4_ref_wid] + 1) >> 1;
+            pu1_ref++;
+        }
+
+        /* Move both pointers on to the first sample of the next row */
+        pu1_ref += u4_ref_skip;
+        pu1_out += u4_out_skip;
+    }
+
+    return;
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_mc_fullx_fully_8x8() */
+/* */
+/* Description : Gets the buffer from (x,y) to (x+8,y+8) */
+/* and the above block of size 8 x 8 will be placed as a */
+/* block from the current position of out_buf */
+/* */
+/* Inputs : pu1_ref - Position in the reference frame from which */
+/* the block will be extracted. */
+/* u4_ref_wid - Width of reference frame */
+/* u4_out_wid - Width of the output frame */
+/* The block is always BLK_SIZE x BLK_SIZE; there are */
+/* no block width or block height arguments */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Point to the (0,0) position in the ref frame */
+/* Get an 8 x 8 block from reference frame */
+/* */
+/* Outputs : out - Output containing the extracted block */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/* Revision History: */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 05 09 2005 Harish M First Version */
+/* */
+/*****************************************************************************/
+void impeg2_mc_fullx_fully_8x8(UWORD8 *pu1_out,
+                               UWORD8 *pu1_ref,
+                               UWORD32 u4_ref_wid,
+                               UWORD32 u4_out_wid)
+{
+    UWORD32 u4_rows_left = BLK_SIZE;
+
+    /* No interpolation: copy BLK_SIZE rows of BLK_SIZE bytes each */
+    while(u4_rows_left-- > 0)
+    {
+        memcpy(pu1_out, pu1_ref, BLK_SIZE);
+        pu1_out += u4_out_wid;
+        pu1_ref += u4_ref_wid;
+    }
+    return;
+}
diff --git a/common/impeg2_inter_pred.h b/common/impeg2_inter_pred.h
new file mode 100644
index 0000000..be3b0e5
--- /dev/null
+++ b/common/impeg2_inter_pred.h
@@ -0,0 +1,103 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+#ifndef __IMPEG2_INTER_PRED_H__
+#define __IMPEG2_INTER_PRED_H__
+
+
+typedef struct /* Three 8-bit plane pointers, one each for Y, U and V */
+{
+ UWORD8 *pu1_y; /* Y plane */
+ UWORD8 *pu1_u; /* U plane */
+ UWORD8 *pu1_v; /* V plane */
+}yuv_buf_t;
+
+typedef struct /* Three 16-bit plane pointers, one each for Y, U and V */
+{
+ WORD16 *pi2_y; /* Y plane */
+ WORD16 *pi2_u; /* U plane */
+ WORD16 *pi2_v; /* V plane */
+}yuv_buf16_t;
+
+/**
+ * Picture buffer: Y, U and V plane pointers plus per-buffer bookkeeping
+ */
+typedef struct
+{
+ UWORD8 *pu1_y; /* Y plane */
+ UWORD8 *pu1_u; /* U plane */
+ UWORD8 *pu1_v; /* V plane */
+
+ /** Used to store display Timestamp for current buffer */
+ WORD32 u4_ts; /* NOTE(review): u4_ prefix, but the declared type is the signed WORD32 */
+ UWORD8 u1_used_as_ref; /* presumably non-zero while the buffer is used as a reference - confirm */
+
+ /**
+ * buffer ID from buffer manager
+ */
+ WORD32 i4_buf_id;
+
+}pic_buf_t;
+
+typedef void pf_copy_mb_t (yuv_buf_t *src_buf, /* source Y/U/V plane pointers */
+ yuv_buf_t *dst_buf, /* destination Y/U/V plane pointers */
+ UWORD32 src_wd, /* source luma width; the C version halves it for chroma */
+ UWORD32 dst_wd); /* destination luma width; the C version halves it for chroma */
+
+typedef void pf_interpred_t(UWORD8 *out,UWORD8 *ref, UWORD32 ref_wid, UWORD32 out_wid); /* 8x8 prediction: ref -> out, each with its own width */
+
+typedef void pf_interpolate_t(yuv_buf_t *buf_src1, /* first source */
+ yuv_buf_t *buf_src2, /* second source */
+ yuv_buf_t *buf_dst, /* destination for the average of the two sources */
+ UWORD32 stride); /* destination luma stride; the C version halves it for chroma */
+
+pf_interpolate_t impeg2_interpolate; /* C version, defined in impeg2_inter_pred.c */
+pf_interpolate_t impeg2_interpolate_a9q;
+pf_interpolate_t impeg2_interpolate_av8;
+
+pf_copy_mb_t impeg2_copy_mb; /* C version, defined in impeg2_inter_pred.c */
+pf_copy_mb_t impeg2_copy_mb_a9q;
+pf_copy_mb_t impeg2_copy_mb_av8;
+/* C versions, defined in impeg2_inter_pred.c */
+pf_interpred_t impeg2_mc_halfx_halfy_8x8;
+pf_interpred_t impeg2_mc_halfx_fully_8x8;
+pf_interpred_t impeg2_mc_fullx_halfy_8x8;
+pf_interpred_t impeg2_mc_fullx_fully_8x8;
+/* A9Q Declarations */
+pf_interpred_t impeg2_mc_halfx_halfy_8x8_a9q;
+pf_interpred_t impeg2_mc_halfx_fully_8x8_a9q;
+pf_interpred_t impeg2_mc_fullx_halfy_8x8_a9q;
+pf_interpred_t impeg2_mc_fullx_fully_8x8_a9q;
+
+/* AV8 Declarations */
+pf_interpred_t impeg2_mc_halfx_halfy_8x8_av8;
+pf_interpred_t impeg2_mc_halfx_fully_8x8_av8;
+pf_interpred_t impeg2_mc_fullx_halfy_8x8_av8;
+pf_interpred_t impeg2_mc_fullx_fully_8x8_av8;
+
+
+/* SSE4.2 Declarations */
+pf_copy_mb_t impeg2_copy_mb_sse42;
+pf_interpolate_t impeg2_interpolate_sse42;
+pf_interpred_t impeg2_mc_halfx_halfy_8x8_sse42;
+pf_interpred_t impeg2_mc_halfx_fully_8x8_sse42;
+pf_interpred_t impeg2_mc_fullx_halfy_8x8_sse42;
+pf_interpred_t impeg2_mc_fullx_fully_8x8_sse42;
+
+#endif /* #ifndef __IMPEG2_INTER_PRED_H__ */
diff --git a/common/impeg2_job_queue.c b/common/impeg2_job_queue.c
new file mode 100644
index 0000000..d36ce7c
--- /dev/null
+++ b/common/impeg2_job_queue.c
@@ -0,0 +1,530 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* impeg2_job_queue.c
+*
+* @brief
+* Contains functions for job queue
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "iv_datatypedef.h"
+#include "iv.h"
+#include "ithread.h"
+#include "impeg2_macros.h"
+#include "impeg2_job_queue.h"
+
+/**
+*******************************************************************************
+*
+* @brief Returns size for job queue context. Does not include job queue buffer
+* requirements
+*
+* @par Description
+* Returns size for job queue context. Does not include job queue buffer
+* requirements. Buffer size required to store the jobs should be allocated in
+* addition to the value returned here.
+*
+* @returns Size of the job queue context
+*
+* @remarks
+*
+*******************************************************************************
+*/
+WORD32 impeg2_jobq_ctxt_size(void)
+{
+    /* Size of the jobq_t structure plus the size reported for one mutex; */
+    /* (void) matches the prototype in impeg2_job_queue.h                 */
+    WORD32 i4_size = sizeof(jobq_t);
+    i4_size += ithread_get_mutex_lock_size();
+    return i4_size;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Locks the jobq context
+*
+* @par Description
+* Locks the jobq context by calling ithread_mutex_lock()
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IV_FAIL if mutex lock fails else IV_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T impeg2_jobq_lock(jobq_t *ps_jobq)
+{
+    WORD32 i4_status;
+
+    /* ithread_mutex_lock() result: zero is treated as success, any */
+    /* other value is reported to the caller as IV_FAIL             */
+    i4_status = ithread_mutex_lock(ps_jobq->pv_mutex);
+
+    return (0 == i4_status) ? IV_SUCCESS : IV_FAIL;
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Unlocks the jobq context
+*
+* @par Description
+* Unlocks the jobq context by calling ithread_mutex_unlock()
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IV_FAIL if mutex unlock fails else IV_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IV_API_CALL_STATUS_T impeg2_jobq_unlock(jobq_t *ps_jobq)
+{
+    WORD32 i4_status;
+
+    /* ithread_mutex_unlock() result: zero is treated as success, */
+    /* any other value is reported to the caller as IV_FAIL; the  */
+    /* jobq fields themselves are not touched here                */
+    i4_status = ithread_mutex_unlock(ps_jobq->pv_mutex);
+
+    return (0 == i4_status) ? IV_SUCCESS : IV_FAIL;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Yields the thread
+*
+* @par Description
+* Calls impeg2_jobq_unlock(), ithread_yield() and then
+* impeg2_jobq_lock().
+* The jobq is unlocked first so that it can be accessed by other threads.
+* If unlock is not done before calling yield then no other thread can access
+* the jobq functions and update jobq.
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IV_FAIL if the mutex unlock or lock fails, else IV_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T impeg2_jobq_yield(jobq_t *ps_jobq)
+{
+    IV_API_CALL_STATUS_T e_status;
+
+    /* Give up the jobq mutex first so that other threads can use the queue */
+    e_status = impeg2_jobq_unlock(ps_jobq);
+    RETURN_IF((e_status != IV_SUCCESS), e_status);
+
+    /* The result of the yield itself is not checked */
+    ithread_yield();
+
+    /* Take the mutex back before returning to the caller */
+    e_status = impeg2_jobq_lock(ps_jobq);
+    RETURN_IF((e_status != IV_SUCCESS), e_status);
+    return IV_SUCCESS;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief Destroys the job queue mutex
+*
+* @par Description
+* Destroys the mutex of the jobq context; no memory is released here
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IV_SUCCESS if the mutex was destroyed, else IV_FAIL
+*
+* @remarks
+* Since it will be called only once by master thread this is not thread safe.
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T impeg2_jobq_free(jobq_t *ps_jobq)
+{
+    /* Only the mutex is destroyed here; a non-zero result from */
+    /* ithread_mutex_destroy() is reported as IV_FAIL           */
+    if(0 != ithread_mutex_destroy(ps_jobq->pv_mutex))
+    {
+        return IV_FAIL;
+    }
+    return IV_SUCCESS;
+}
+
+/**
+*******************************************************************************
+*
+* @brief Initialize the job queue
+*
+* @par Description
+* Initializes the jobq context and sets write and read pointers to start of
+* job queue buffer
+*
+* @param[in] pv_buf
+* Memory for job queue buffer and job queue context
+*
+* @param[in] i4_buf_size
+* Size of the total memory allocated
+*
+* @returns Pointer to job queue context, or NULL if the memory is too small
+*
+* @remarks
+* Since it will be called only once by master thread this is not thread safe.
+*
+*******************************************************************************
+*/
+void* impeg2_jobq_init(void *pv_buf, WORD32 i4_buf_size)
+{
+    jobq_t *ps_jobq;
+    UWORD8 *pu1_buf = (UWORD8 *)pv_buf;
+    WORD32 i4_ctxt_size;
+
+    /* The context is a jobq_t immediately followed by the mutex object */
+    i4_ctxt_size = impeg2_jobq_ctxt_size();
+
+    /* Reject unusable input before writing anything into the caller's     */
+    /* memory: the buffer must exist and must leave room for job storage   */
+    if((NULL == pu1_buf) || (i4_buf_size <= i4_ctxt_size))
+    {
+        return NULL;
+    }
+
+    ps_jobq = (jobq_t *)pu1_buf;
+    ps_jobq->pv_mutex = pu1_buf + sizeof(jobq_t);
+    ithread_mutex_init(ps_jobq->pv_mutex);
+
+    /* Whatever follows the context is job storage; the queue starts empty */
+    pu1_buf += i4_ctxt_size;
+    ps_jobq->pv_buf_base = pu1_buf;
+    ps_jobq->pv_buf_wr = pu1_buf;
+    ps_jobq->pv_buf_rd = pu1_buf;
+    ps_jobq->pv_buf_end = pu1_buf + (i4_buf_size - i4_ctxt_size);
+    ps_jobq->i4_terminate = 0;
+    return ps_jobq;
+}
+/**
+*******************************************************************************
+*
+* @brief
+* Resets the jobq context
+*
+* @par Description
+* Resets the jobq context by initializing job queue context elements
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IV_FAIL if lock or unlock fails else IV_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T impeg2_jobq_reset(jobq_t *ps_jobq)
+{
+    IV_API_CALL_STATUS_T e_status;
+
+    e_status = impeg2_jobq_lock(ps_jobq);
+    RETURN_IF((e_status != IV_SUCCESS), e_status);
+
+    /* Rewind both cursors to the buffer base and clear the terminate flag */
+    ps_jobq->pv_buf_rd = ps_jobq->pv_buf_base;
+    ps_jobq->pv_buf_wr = ps_jobq->pv_buf_base;
+    ps_jobq->i4_terminate = 0;
+
+    return impeg2_jobq_unlock(ps_jobq);
+}
+
+/**
+*******************************************************************************
+*
+* @brief
+* Deinitializes the jobq context
+*
+* @par Description
+* Deinitializes the jobq context by calling impeg2_jobq_reset()
+* and then destroying the mutex created
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IV_FAIL if reset or mutex destroy fails else IV_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T impeg2_jobq_deinit(jobq_t *ps_jobq)
+{
+    IV_API_CALL_STATUS_T e_status;
+
+    /* Put the queue back into its empty state first */
+    e_status = impeg2_jobq_reset(ps_jobq);
+    RETURN_IF((e_status != IV_SUCCESS), e_status);
+
+    /* e_status is IV_SUCCESS here; only a failed destroy changes it */
+    if(0 != ithread_mutex_destroy(ps_jobq->pv_mutex))
+    {
+        e_status = IV_FAIL;
+    }
+
+    return e_status;
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief
+* Terminates the jobq
+*
+* @par Description
+* Terminates the jobq by setting a flag in context.
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @returns IV_FAIL if lock or unlock fails else IV_SUCCESS
+*
+* @remarks
+*
+*******************************************************************************
+*/
+
+IV_API_CALL_STATUS_T impeg2_jobq_terminate(jobq_t *ps_jobq)
+{
+    IV_API_CALL_STATUS_T e_status = impeg2_jobq_lock(ps_jobq);
+
+    RETURN_IF((e_status != IV_SUCCESS), e_status);
+
+    /* Raise the flag that impeg2_jobq_dequeue() checks when it finds */
+    /* no job to hand out                                             */
+    ps_jobq->i4_terminate = 1;
+
+    return impeg2_jobq_unlock(ps_jobq);
+}
+
+
+/**
+*******************************************************************************
+*
+* @brief Adds a job to the queue
+*
+* @par Description
+* Adds a job to the queue and updates wr address to next location.
+* Format/content of the job structure is abstracted and hence size of the job
+* buffer is being passed.
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @param[in] pv_job
+* Pointer to the location that contains details of the job to be added
+*
+* @param[in] i4_job_size
+* Size of the job buffer
+*
+* @param[in] i4_blocking, i4_lock
+* i4_blocking is unused; a non-zero i4_lock takes the jobq mutex for the call
+*
+* @returns IV_SUCCESS if the job was stored, else IV_FAIL
+*
+* @remarks
+* Job Queue buffer is assumed to be allocated to handle worst case number of jobs
+* Wrap around is not supported
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T impeg2_jobq_queue(jobq_t *ps_jobq,
+                                       void *pv_job,
+                                       WORD32 i4_job_size,
+                                       WORD32 i4_blocking,
+                                       WORD32 i4_lock)
+{
+    IV_API_CALL_STATUS_T e_status;
+    IV_API_CALL_STATUS_T e_lock_status;
+    UWORD8 *pu1_wr;
+    UNUSED(i4_blocking);
+
+    if(i4_lock)
+    {
+        e_lock_status = impeg2_jobq_lock(ps_jobq);
+        RETURN_IF((e_lock_status != IV_SUCCESS), e_lock_status);
+    }
+
+    pu1_wr = (UWORD8 *)ps_jobq->pv_buf_wr;
+    if((pu1_wr + i4_job_size) > (UWORD8 *)ps_jobq->pv_buf_end)
+    {
+        /* Job does not fit in the remaining space; wrap around is not supported */
+        e_status = IV_FAIL;
+    }
+    else
+    {
+        /* Append the job and move the write cursor past it */
+        memcpy(pu1_wr, pv_job, i4_job_size);
+        ps_jobq->pv_buf_wr = pu1_wr + i4_job_size;
+        e_status = IV_SUCCESS;
+    }
+
+    /* The terminate flag is cleared on every call, whether or not the job fit */
+    ps_jobq->i4_terminate = 0;
+
+    if(i4_lock)
+    {
+        e_lock_status = impeg2_jobq_unlock(ps_jobq);
+        RETURN_IF((e_lock_status != IV_SUCCESS), e_lock_status);
+    }
+
+    return e_status;
+}
+/**
+*******************************************************************************
+*
+* @brief Gets next from the Job queue
+*
+* @par Description
+* Gets next job from the job queue and updates rd address to next location.
+* Format/content of the job structure is abstracted and hence size of the job
+* buffer is being passed. If it is a blocking call and if there is no new job
+* then this function unlocks the mutex, calls yield and then locks it back,
+* and continues till a job is available or terminate is set
+*
+* @param[in] ps_jobq
+* Job Queue context
+*
+* @param[out] pv_job
+* Pointer to the location that contains details of the job to be written
+*
+* @param[in] job_size
+* Size of the job buffer
+*
+* @param[in] blocking
+* To signal if the read is blocking or non-blocking.
+*
+* @returns
+*
+* @remarks
+* Job Queue buffer is assumed to be allocated to handle worst case number of jobs
+* Wrap around is not supported
+*
+*******************************************************************************
+*/
+IV_API_CALL_STATUS_T impeg2_jobq_dequeue(jobq_t *ps_jobq,
+                                         void *pv_job,
+                                         WORD32 i4_job_size,
+                                         WORD32 i4_blocking,
+                                         WORD32 i4_lock)
+{
+    IV_API_CALL_STATUS_T e_ret;
+    IV_API_CALL_STATUS_T e_ret_tmp;
+    volatile UWORD8 *pu1_buf;
+    if(i4_lock)
+    {
+        e_ret_tmp = impeg2_jobq_lock(ps_jobq);
+        RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp);
+    }
+    pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+
+    /* A job can only be read if it lies completely inside the job buffer */
+    if((UWORD8 *)ps_jobq->pv_buf_end >= (pu1_buf + i4_job_size))
+    {
+        while(1)
+        {
+            pu1_buf = (UWORD8 *)ps_jobq->pv_buf_rd;
+            if((UWORD8 *)ps_jobq->pv_buf_wr >= (pu1_buf + i4_job_size))
+            {
+                memcpy(pv_job, ps_jobq->pv_buf_rd, i4_job_size);
+                ps_jobq->pv_buf_rd = (UWORD8 *)ps_jobq->pv_buf_rd + i4_job_size;
+                e_ret = IV_SUCCESS;
+                break;
+            }
+            else
+            {
+                /* No complete job is pending: give up if terminate is set */
+                if(1 == ps_jobq->i4_terminate)
+                {
+                    e_ret = IV_FAIL;
+                    break;
+                }
+
+                if((1 == i4_blocking) && (1 == i4_lock))
+                {
+                    impeg2_jobq_yield(ps_jobq);
+                }
+                else
+                {
+                    /* If there is no job available, and this is non blocking
+                     * call then return fail; leave the loop instead of spinning */
+                    e_ret = IV_FAIL;
+                    break;
+                }
+            }
+        }
+    }
+    else
+    {
+        /* The read position is too close to the end of the job buffer
+         * for a job of i4_job_size bytes. Wrap around is not supported,
+         * so the call fails.
+         */
+        e_ret = IV_FAIL;
+    }
+    if(i4_lock)
+    {
+        e_ret_tmp = impeg2_jobq_unlock(ps_jobq);
+        RETURN_IF((e_ret_tmp != IV_SUCCESS), e_ret_tmp);
+    }
+
+    return e_ret;
+}
diff --git a/common/impeg2_job_queue.h b/common/impeg2_job_queue.h
new file mode 100644
index 0000000..46d8bb9
--- /dev/null
+++ b/common/impeg2_job_queue.h
@@ -0,0 +1,72 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* impeg2_job_queue.h
+*
+* @brief
+* Contains functions for job queue
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#ifndef _IMPEG2_JOB_QUEUE_H_
+#define _IMPEG2_JOB_QUEUE_H_
+
+/**
+ * Job queue context.
+ *
+ * Jobs are stored back to back in one linear buffer. impeg2_jobq_dequeue()
+ * copies a job from pv_buf_rd and advances pv_buf_rd by the job size; it
+ * treats a job as available once pv_buf_wr is at least one job size ahead
+ * of pv_buf_rd, and refuses to read past pv_buf_end.
+ */
+typedef struct
+{
+ /** Pointer to buffer base which contains the jobs */
+ void *pv_buf_base;
+
+ /** Pointer to current address where new job can be added */
+ void *pv_buf_wr;
+
+ /** Pointer to current address from where next job can be obtained */
+ void *pv_buf_rd;
+
+ /** Pointer to end of job buffer */
+ void *pv_buf_end;
+
+ /** Mutex used to keep the functions thread-safe */
+ void *pv_mutex;
+
+ /** Flag to indicate jobq has to be terminated; a value of 1 makes a
+  *  dequeue that finds no job return IV_FAIL instead of waiting */
+ WORD32 i4_terminate;
+}jobq_t;
+
+/*
+ * Job queue API.
+ * This header uses WORD32 and IV_API_CALL_STATUS_T without including their
+ * definitions, so the including file must provide them first.
+ */
+WORD32 impeg2_jobq_ctxt_size(void);
+void* impeg2_jobq_init(void *pv_buf, WORD32 buf_size);
+IV_API_CALL_STATUS_T impeg2_jobq_free(jobq_t *ps_jobq);
+IV_API_CALL_STATUS_T impeg2_jobq_reset(jobq_t *ps_jobq);
+IV_API_CALL_STATUS_T impeg2_jobq_deinit(jobq_t *ps_jobq);
+IV_API_CALL_STATUS_T impeg2_jobq_terminate(jobq_t *ps_jobq);
+/* pv_job is the source of the job data; job_size is its size.
+ * NOTE(review): blocking/lock presumably mirror the dequeue flags - confirm */
+IV_API_CALL_STATUS_T impeg2_jobq_queue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking, WORD32 lock);
+/* pv_job receives a copy of the next job; lock selects whether the queue
+ * mutex is taken, blocking whether the call waits for a job to appear */
+IV_API_CALL_STATUS_T impeg2_jobq_dequeue(jobq_t *ps_jobq, void *pv_job, WORD32 job_size, WORD32 blocking, WORD32 lock);
+
+#endif /* _IMPEG2_JOB_QUEUE_H_ */
diff --git a/common/impeg2_macros.h b/common/impeg2_macros.h
new file mode 100644
index 0000000..366510f
--- /dev/null
+++ b/common/impeg2_macros.h
@@ -0,0 +1,60 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+#ifndef __IMPEG2_MACROS_H__
+#define __IMPEG2_MACROS_H__
+
+#define ABS(x) ((x) < 0 ? (-1 * (x)) : (x))
+
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+
+#define CLIP(Number,Max,Min) if((Number) > (Max)) (Number) = (Max); \
+else if((Number) < (Min)) (Number) = (Min)
+
+#define SIGN(Number) (((Number) < 0) ? -1 : 1)
+
+
+#define BITS(val,msb,lsb) (UWORD16)((((val) >> (lsb)) & ((1 << ((msb) - (lsb) + 1)) - 1)))
+
+#define BIT(val,bit) (UWORD16)(((val) >> (bit)) & 0x1)
+
+#define IS_VAL_IN_RANGE(val,upperLimit,lowerLimit) ((val) >= (lowerLimit) && (val) <= (upperLimit))
+
+#define MSW(dword) (dword >> 16)
+#define LSW(dword) (dword & 0xFFFF)
+#define DIV_2_RND(mv) (((mv) + ((mv) > 0)) >> 1)
+#define IS_NEG(Number) (((Number) < 0) ? 1 : 0)
+
+#define ALIGN128(x) ((((x) + 127) >> 7) << 7)
+#define ALIGN64(x) ((((x) + 63) >> 6) << 6)
+#define ALIGN32(x) ((((x) + 31) >> 5) << 5)
+#define ALIGN16(x) ((((x) + 15) >> 4) << 4)
+#define ALIGN8(x) ((((x) + 7) >> 3) << 3)
+
+
+#define RETURN_IF(cond, retval) if(cond) {return (retval);}
+#define UNUSED(x) ((void)(x))
+
+
+#define ASSERT(x) assert(x)
+
+
+#endif /* __IMPEG2_IT_MACROS_H__ */
diff --git a/common/impeg2_mem_func.c b/common/impeg2_mem_func.c
new file mode 100644
index 0000000..9268c01
--- /dev/null
+++ b/common/impeg2_mem_func.c
@@ -0,0 +1,87 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* impeg2_mem_func.c
+*
+* @brief
+* Contains utility function definitions for MPEG2 codec
+*
+* @author
+* Harish
+*
+* @par List of Functions:
+* - impeg2_memset0_16bit_8x8_linear_block()
+* - impeg2_memset_8bit_8x8_block()
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include "iv_datatypedef.h"
+#include "impeg2_defs.h"
+
+/*******************************************************************************
+* Function Name : impeg2_memset0_16bit_8x8_linear_block
+*
+* Description : Sets a linear buffer of 64 16-bit values (one 8x8 block)
+*               to zero
+*
+* Arguments : pi2_buf - start of the buffer to be cleared
+*
+* Values Returned : None
+*******************************************************************************/
+
+
+void impeg2_memset0_16bit_8x8_linear_block (WORD16 *pi2_buf)
+{
+    /* 64 elements, each the size of one buffer entry */
+    const size_t num_bytes = sizeof(*pi2_buf) * 64;
+
+    memset(pi2_buf, 0, num_bytes);
+}
+
+
+
+/*******************************************************************************
+* Function Name : impeg2_memset_8bit_8x8_block
+*
+* Description : Fills BLK_SIZE rows of BLK_SIZE bytes each with one value
+*
+* Arguments : pu1_dst   - first byte of the first row
+*             u4_dc_val - value handed to memset() for every byte
+*             u4_dst_wd - distance in bytes between the starts of two rows
+*
+* Values Returned : None
+*******************************************************************************/
+
+
+void impeg2_memset_8bit_8x8_block(UWORD8 *pu1_dst, WORD32 u4_dc_val, WORD32 u4_dst_wd)
+{
+    UWORD8 *pu1_row = pu1_dst;
+    WORD32 i4_row;
+
+    for(i4_row = 0; i4_row < BLK_SIZE; i4_row++)
+    {
+        memset(pu1_row, u4_dc_val, BLK_SIZE);
+        /* Step to the same column of the next row */
+        pu1_row += u4_dst_wd;
+    }
+}
+
+
+
diff --git a/common/impeg2_mem_func.h b/common/impeg2_mem_func.h
new file mode 100644
index 0000000..f73702c
--- /dev/null
+++ b/common/impeg2_mem_func.h
@@ -0,0 +1,41 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+#ifndef IMPEG2_MEM_FUNC_H_
+#define IMPEG2_MEM_FUNC_H_
+
+/* Function types (not pointer types) shared by all variants of the memset
+ * helpers; each declaration below declares one function of that type */
+typedef void pf_memset0_one_16bit_buf_t (WORD16 *buf);
+typedef void pf_memset_8bit_t (UWORD8 *dst, WORD32 dc_val, WORD32 dst_wd);
+
+/* Zero-fill of a 16-bit 8x8 block. The unsuffixed name is the C version.
+ * NOTE(review): the _a9q / _sse42 / _av8 suffixes presumably denote the ARM,
+ * x86 SSE4.2 and ARMv8 implementations - confirm against those sources */
+pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block;
+pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block_a9q;
+
+pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block_sse42;
+
+pf_memset0_one_16bit_buf_t impeg2_memset0_16bit_8x8_linear_block_av8;
+
+/* Fill of an 8-bit 8x8 block with a single value, same set of variants */
+pf_memset_8bit_t impeg2_memset_8bit_8x8_block;
+pf_memset_8bit_t impeg2_memset_8bit_8x8_block_a9q;
+
+pf_memset_8bit_t impeg2_memset_8bit_8x8_block_sse42;
+
+pf_memset_8bit_t impeg2_memset_8bit_8x8_block_av8;
+
+#endif /* IMPEG2_MEM_FUNC_H_ */
diff --git a/common/ithread.c b/common/ithread.c
new file mode 100644
index 0000000..76fdad3
--- /dev/null
+++ b/common/ithread.c
@@ -0,0 +1,453 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : ithread.c */
+/* */
+/* Description : Contains abstraction for threads, mutex and semaphores*/
+/* */
+/* List of Functions : */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes */
+/* 07 09 2012 Harish Initial Version */
+/*****************************************************************************/
+/*****************************************************************************/
+/* File Includes */
+/*****************************************************************************/
+#include <string.h>
+#include "iv_datatypedef.h"
+#include "ithread.h"
+#include <sys/types.h>
+
+#ifndef X86_MSVC
+//#define PTHREAD_AFFINITY
+//#define SYSCALL_AFFINITY
+
+#ifdef PTHREAD_AFFINITY
+#define _GNU_SOURCE
+#define __USE_GNU
+#endif
+
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <unistd.h>
+
+
+#endif
+#if 0
+#include <sys/syscall.h>
+#endif
+
+#ifdef X86_MSVC
+
+#include <windows.h>
+#define SEM_MAX_COUNT 100
+#define SEM_INCREMENT_COUNT 1
+
+/* Size of the storage a caller must provide for one thread handle */
+UWORD32 ithread_get_handle_size(void)
+{
+    const UWORD32 u4_handle_size = sizeof(HANDLE);
+
+    return u4_handle_size;
+}
+
+/* Size of the storage a caller must provide for one mutex; on this platform
+ * a mutex is stored as a HANDLE */
+UWORD32 ithread_get_mutex_lock_size(void)
+{
+    const UWORD32 u4_mutex_size = sizeof(HANDLE);
+
+    return u4_mutex_size;
+}
+
+/*
+ * Creates a thread that runs strt(argument) and stores its HANDLE in the
+ * storage pointed to by thread_handle.
+ *
+ * attribute is not used. Returns 0 on success and -1 when thread_handle is
+ * NULL or the thread could not be created.
+ */
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
+{
+    HANDLE *ppv_thread_handle = (HANDLE *)thread_handle;
+    HANDLE thread_handle_value;
+
+    ((void)(attribute));
+
+    if(NULL == ppv_thread_handle)
+        return -1;
+
+    thread_handle_value = CreateThread
+                    (NULL,                          /* Attributes */
+                     1024*128,                      /* Stack size */
+                     (LPTHREAD_START_ROUTINE)strt,  /* Thread function */
+                     argument,                      /* Parameters */
+                     0,                             /* Creation flags */
+                     NULL);                         /* Thread ID */
+    *ppv_thread_handle = thread_handle_value;
+
+    /* CreateThread() reports failure with a NULL handle; tell the caller
+     * instead of claiming success for a thread that does not exist */
+    if(NULL == thread_handle_value)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Waits without timeout for the thread whose HANDLE is stored at
+ * thread_handle and closes the handle once the wait succeeds.
+ *
+ * val_ptr is not used. Returns 0 on success and -1 when thread_handle is
+ * NULL or the wait did not succeed (the handle is then left open).
+ */
+WORD32 ithread_join(void *thread_handle, void ** val_ptr)
+{
+    HANDLE *ppv_thread_handle = (HANDLE *)thread_handle;
+    HANDLE thread_handle_value;
+
+    ((void)(val_ptr));
+
+    if(NULL == ppv_thread_handle)
+        return -1;
+
+    thread_handle_value = *ppv_thread_handle;
+
+    /* Any result other than WAIT_OBJECT_0 means the thread was not joined */
+    if(WAIT_OBJECT_0 != WaitForSingleObject(thread_handle_value, INFINITE))
+        return -1;
+
+    CloseHandle(thread_handle_value);
+    return 0;
+}
+
+/*
+ * Terminates the thread whose HANDLE is stored at thread_handle; does
+ * nothing when thread_handle is NULL.
+ * Note that ithread.h declares the parameter as val_ptr and the non-MSVC
+ * implementation passes it to pthread_exit() as the exit value, so the
+ * argument means something different on the two platforms.
+ */
+void ithread_exit(void *thread_handle)
+{
+ HANDLE *ppv_thread_handle;
+ HANDLE thread_handle_value;
+ DWORD thread_exit_code;
+
+ if(0 == thread_handle)
+ return;
+
+ ppv_thread_handle = (HANDLE *)thread_handle;
+ thread_handle_value = *ppv_thread_handle;
+ /* Query the thread's exit code; the thread is terminated only when the
+  * query itself returned non-zero.
+  * NOTE(review): per the Win32 documentation a non-zero return means the
+  * query succeeded, not that the thread is idle - confirm the intent */
+ if( 0 != GetExitCodeThread(thread_handle_value, &thread_exit_code))
+ {
+ TerminateThread(thread_handle_value, thread_exit_code);
+ }
+
+ return;
+}
+
+/* Size of the storage a caller must provide for one mutex object */
+WORD32 ithread_get_mutex_struct_size(void)
+{
+    const WORD32 i4_mutex_size = sizeof(HANDLE);
+
+    return i4_mutex_size;
+}
+
+/*
+ * Initialises the mutex stored at mutex. The mutex is implemented as a
+ * semaphore with initial and maximum count 1.
+ *
+ * Returns 0 on success and -1 when mutex is NULL or the semaphore could not
+ * be created.
+ */
+WORD32 ithread_mutex_init(void *mutex)
+{
+    HANDLE *ppv_mutex_handle = (HANDLE *)mutex;
+    HANDLE mutex_handle_value;
+
+    if(NULL == ppv_mutex_handle)
+        return -1;
+
+    mutex_handle_value = CreateSemaphore(NULL, 1, 1, NULL);
+    *ppv_mutex_handle = mutex_handle_value;
+
+    /* CreateSemaphore() returns NULL on failure; report it so that callers
+     * do not go on to lock an invalid handle */
+    if(NULL == mutex_handle_value)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Releases the mutex stored at mutex by closing its handle.
+ *
+ * Returns 0 on success and -1 when mutex is NULL or the handle could not be
+ * closed (same convention as ithread_sem_destroy()).
+ */
+WORD32 ithread_mutex_destroy(void *mutex)
+{
+    HANDLE *ppv_mutex_handle = (HANDLE *)mutex;
+
+    if(NULL == ppv_mutex_handle)
+        return -1;
+
+    if(FALSE == CloseHandle(*ppv_mutex_handle))
+    {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Acquires the mutex stored at mutex, waiting without timeout.
+ * Returns 0 when acquired, -1 when mutex is NULL and 1 for any other
+ * outcome of the wait.
+ */
+WORD32 ithread_mutex_lock(void *mutex)
+{
+    HANDLE *ph_mutex = (HANDLE *)mutex;
+    DWORD wait_status;
+
+    if(NULL == ph_mutex)
+    {
+        return -1;
+    }
+
+    wait_status = WaitForSingleObject(*ph_mutex, INFINITE);
+
+    return (WAIT_OBJECT_0 == wait_status) ? 0 : 1;
+}
+
+/*
+ * Releases the mutex stored at mutex by incrementing its semaphore by one.
+ * Returns 0 on success and -1 when mutex is NULL or the release failed.
+ */
+WORD32 ithread_mutex_unlock(void *mutex)
+{
+    HANDLE *ph_mutex = (HANDLE *)mutex;
+
+    if(NULL == ph_mutex)
+    {
+        return -1;
+    }
+
+    return ReleaseSemaphore(*ph_mutex, 1, NULL) ? 0 : -1;
+}
+
+/* Yield is a no-op in this build: the body is intentionally empty, so
+ * callers that loop on ithread_yield() do not give up the processor here */
+void ithread_yield(void) { }
+
+/* Sleeps for u4_time_us microseconds, truncated to whole milliseconds
+ * because Sleep() takes milliseconds */
+void ithread_usleep(UWORD32 u4_time_us)
+{
+    Sleep(u4_time_us / 1000);
+}
+
+/* Sleeps for u4_time_ms milliseconds */
+void ithread_msleep(UWORD32 u4_time_ms)
+{
+    const UWORD32 u4_duration_ms = u4_time_ms;
+
+    Sleep(u4_duration_ms);
+}
+
+/* Sleeps for u4_time seconds; the value is converted to milliseconds in
+ * 32-bit unsigned arithmetic before being handed to Sleep() */
+void ithread_sleep(UWORD32 u4_time)
+{
+    Sleep(u4_time * 1000);
+}
+
+/* Size of the storage a caller must provide for one semaphore object */
+UWORD32 ithread_get_sem_struct_size(void)
+{
+    const UWORD32 u4_sem_size = sizeof(HANDLE);
+
+    return u4_sem_size;
+}
+
+/*
+ * Initialises the semaphore stored at sem with the initial count value and
+ * a maximum count of SEM_MAX_COUNT.
+ *
+ * pshared is not used. Returns 0 on success and -1 when sem is NULL or the
+ * semaphore could not be created.
+ */
+WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value)
+{
+    HANDLE *sem_handle = (HANDLE *)sem;
+    HANDLE sem_handle_value;
+
+    ((void)(pshared));
+
+    if(NULL == sem_handle)
+        return -1;
+
+    sem_handle_value = CreateSemaphore(NULL,          /* Security Attribute*/
+                                       value,         /* Initial count     */
+                                       SEM_MAX_COUNT, /* Max value         */
+                                       NULL);         /* Name, not used    */
+    *sem_handle = sem_handle_value;
+
+    /* CreateSemaphore() returns NULL on failure, e.g. when value is larger
+     * than SEM_MAX_COUNT; do not report that as success */
+    if(NULL == sem_handle_value)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Increments the semaphore stored at sem by SEM_INCREMENT_COUNT.
+ * Returns 0 on success and -1 when sem is NULL or the release failed.
+ */
+WORD32 ithread_sem_post(void *sem)
+{
+    HANDLE *ph_sem = (HANDLE *)sem;
+
+    if(NULL == ph_sem)
+    {
+        return -1;
+    }
+
+    if(!ReleaseSemaphore(*ph_sem, SEM_INCREMENT_COUNT, NULL))
+    {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Waits without timeout on the semaphore stored at sem.
+ *
+ * Returns 0 once the semaphore has been acquired and -1 when sem is NULL or
+ * the wait ended in any other way.
+ */
+WORD32 ithread_sem_wait(void *sem)
+{
+    HANDLE *sem_handle = (HANDLE *)sem;
+    DWORD result;
+
+    if(NULL == sem_handle)
+        return -1;
+
+    /* Wait on Semaphore object infinitely */
+    result = WaitForSingleObject(*sem_handle, INFINITE);
+
+    /* Only WAIT_OBJECT_0 means the semaphore was acquired. Every other
+     * result (time-out, failed or abandoned wait) is a failure and must not
+     * be reported as success */
+    return (WAIT_OBJECT_0 == result) ? 0 : -1;
+}
+
+/*
+ * Destroys the semaphore stored at sem by closing its handle.
+ * Returns 0 on success and -1 when sem is NULL or the handle could not be
+ * closed.
+ */
+WORD32 ithread_sem_destroy(void *sem)
+{
+    HANDLE *ph_sem = (HANDLE *)sem;
+
+    if(NULL == ph_sem)
+    {
+        return -1;
+    }
+
+    return (FALSE == CloseHandle(*ph_sem)) ? -1 : 0;
+}
+
+/* Thread affinity is not implemented in this build: core_id is ignored and
+ * the function always returns 1 */
+WORD32 ithread_set_affinity(WORD32 core_id)
+{
+    ((void)(core_id));
+
+    return 1;
+}
+
+#else
+/* Size of the storage a caller must provide for one thread handle */
+UWORD32 ithread_get_handle_size(void)
+{
+    const UWORD32 u4_handle_size = sizeof(pthread_t);
+
+    return u4_handle_size;
+}
+
+/* Size of the storage a caller must provide for one mutex */
+UWORD32 ithread_get_mutex_lock_size(void)
+{
+    const UWORD32 u4_mutex_size = sizeof(pthread_mutex_t);
+
+    return u4_mutex_size;
+}
+
+
+/*
+ * Starts a thread running strt(argument) with default attributes and stores
+ * its id in the pthread_t pointed to by thread_handle.
+ * attribute is not used. Returns the result of pthread_create().
+ */
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument)
+{
+    pthread_t *ps_thread = (pthread_t *)thread_handle;
+    void *(*pf_entry)(void *) = (void *(*)(void *))strt;
+
+    ((void)(attribute));
+
+    return pthread_create(ps_thread, NULL, pf_entry, argument);
+}
+
+/*
+ * Waits for the thread whose id is stored at thread_handle to finish.
+ * The thread's exit value is discarded and val_ptr is not used.
+ * Returns the result of pthread_join().
+ */
+WORD32 ithread_join(void *thread_handle, void ** val_ptr)
+{
+    const pthread_t thread_id = *(pthread_t *)thread_handle;
+
+    ((void)(val_ptr));
+
+    return pthread_join(thread_id, NULL);
+}
+
+/*
+ * Terminates the calling thread and makes val_ptr available as its exit
+ * value. Does not return.
+ */
+void ithread_exit(void *val_ptr)
+{
+    /* No "return <expr>;" here: a return statement with an expression is not
+     * allowed in a function returning void */
+    pthread_exit(val_ptr);
+}
+
+/* Size of the storage a caller must provide for one mutex object */
+WORD32 ithread_get_mutex_struct_size(void)
+{
+    const WORD32 i4_mutex_size = sizeof(pthread_mutex_t);
+
+    return i4_mutex_size;
+}
+/* Initialises the mutex at mutex with default attributes.
+ * Returns the result of pthread_mutex_init() */
+WORD32 ithread_mutex_init(void *mutex)
+{
+    pthread_mutex_t *ps_mutex = (pthread_mutex_t *)mutex;
+
+    return pthread_mutex_init(ps_mutex, NULL);
+}
+
+/* Destroys the mutex at mutex.
+ * Returns the result of pthread_mutex_destroy() */
+WORD32 ithread_mutex_destroy(void *mutex)
+{
+    pthread_mutex_t *ps_mutex = (pthread_mutex_t *)mutex;
+
+    return pthread_mutex_destroy(ps_mutex);
+}
+
+/* Locks the mutex at mutex.
+ * Returns the result of pthread_mutex_lock() */
+WORD32 ithread_mutex_lock(void *mutex)
+{
+    pthread_mutex_t *ps_mutex = (pthread_mutex_t *)mutex;
+
+    return pthread_mutex_lock(ps_mutex);
+}
+
+/* Unlocks the mutex at mutex.
+ * Returns the result of pthread_mutex_unlock() */
+WORD32 ithread_mutex_unlock(void *mutex)
+{
+    pthread_mutex_t *ps_mutex = (pthread_mutex_t *)mutex;
+
+    return pthread_mutex_unlock(ps_mutex);
+}
+
+/* Gives up the processor through sched_yield(); its result is ignored */
+void ithread_yield(void)
+{
+    (void)sched_yield();
+}
+
+/*
+ * Suspends the calling thread for u4_time seconds.
+ *
+ * sleep() is used instead of usleep(u4_time * 1000 * 1000): that product
+ * wraps around in 32 bits for u4_time above 4294, and POSIX allows usleep()
+ * to reject arguments of 1,000,000 or more.
+ */
+void ithread_sleep(UWORD32 u4_time)
+{
+    (void)sleep(u4_time);
+}
+
+/*
+ * Suspends the calling thread for u4_time_ms milliseconds.
+ *
+ * The delay is split into whole seconds (sleep()) and a remainder below one
+ * second (usleep()). This keeps the usleep() argument under 1,000,000, the
+ * limit POSIX allows implementations to enforce, and avoids the 32-bit
+ * wrap-around of u4_time_ms * 1000 for large delays.
+ */
+void ithread_msleep(UWORD32 u4_time_ms)
+{
+    const UWORD32 u4_seconds = u4_time_ms / 1000;
+    const UWORD32 u4_remainder_us = (u4_time_ms % 1000) * 1000;
+
+    if(u4_seconds > 0)
+    {
+        (void)sleep(u4_seconds);
+    }
+    usleep(u4_remainder_us);
+}
+
+/* Suspends the calling thread for u4_time_us microseconds via usleep() */
+void ithread_usleep(UWORD32 u4_time_us)
+{
+    const UWORD32 u4_duration_us = u4_time_us;
+
+    usleep(u4_duration_us);
+}
+
+/* Size of the storage a caller must provide for one semaphore object */
+UWORD32 ithread_get_sem_struct_size(void)
+{
+    const UWORD32 u4_sem_size = sizeof(sem_t);
+
+    return u4_sem_size;
+}
+
+
+/* Initialises the semaphore at sem with the given sharing mode and initial
+ * count. Returns the result of sem_init() */
+WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value)
+{
+    sem_t *ps_sem = (sem_t *)sem;
+
+    return sem_init(ps_sem, pshared, value);
+}
+
+/* Increments the semaphore at sem. Returns the result of sem_post() */
+WORD32 ithread_sem_post(void *sem)
+{
+    sem_t *ps_sem = (sem_t *)sem;
+
+    return sem_post(ps_sem);
+}
+
+
+/* Waits on the semaphore at sem. Returns the result of sem_wait() */
+WORD32 ithread_sem_wait(void *sem)
+{
+    sem_t *ps_sem = (sem_t *)sem;
+
+    return sem_wait(ps_sem);
+}
+
+
+/* Destroys the semaphore at sem. Returns the result of sem_destroy() */
+WORD32 ithread_sem_destroy(void *sem)
+{
+    sem_t *ps_sem = (sem_t *)sem;
+
+    return sem_destroy(ps_sem);
+}
+
+
+/*
+ * Requests that the calling thread run on core core_id.
+ *
+ * Both PTHREAD_AFFINITY and SYSCALL_AFFINITY are commented out at the top of
+ * this file, so as written neither branch below is compiled: core_id is
+ * discarded and the function returns 1.
+ */
+WORD32 ithread_set_affinity(WORD32 core_id)
+{
+#ifdef PTHREAD_AFFINITY
+ /* Pin the calling thread to core_id; -1 if core_id is not below the
+  * number of online processors, otherwise the pthread_setaffinity_np()
+  * result */
+ cpu_set_t cpuset;
+ int num_cores = sysconf(_SC_NPROCESSORS_ONLN);
+ pthread_t cur_thread = pthread_self();
+
+ if (core_id >= num_cores)
+ return -1;
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(core_id, &cpuset);
+
+ return pthread_setaffinity_np(cur_thread, sizeof(cpu_set_t), &cpuset);
+
+#elif SYSCALL_AFFINITY
+ /* NOTE(review): this branch cannot compile as written - i4_mask is not
+  * declared in this function and <sys/syscall.h> is only included under
+  * "#if 0" above. Confirm before enabling SYSCALL_AFFINITY */
+ WORD32 i4_sys_res;
+
+ pid_t pid = gettid();
+
+
+ i4_sys_res = syscall(__NR_sched_setaffinity, pid, sizeof(i4_mask), &i4_mask);
+ if (i4_sys_res)
+ {
+ //WORD32 err;
+ //err = errno;
+ //perror("Error in setaffinity syscall PERROR : ");
+ //LOG_ERROR("Error in the syscall setaffinity: mask=0x%x err=0x%x", i4_mask, i4_sys_res);
+ return -1;
+ }
+#endif
+ /* Reached when no affinity method is compiled in, or after a successful
+  * syscall */
+ ((void)(core_id));
+ return 1;
+
+}
+#endif
diff --git a/common/ithread.h b/common/ithread.h
new file mode 100644
index 0000000..eb75d20
--- /dev/null
+++ b/common/ithread.h
@@ -0,0 +1,80 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* ithread.h
+*
+* @brief
+* This file contains all the necessary structure and enumeration
+* definitions needed for the Application Program Interface(API) of the
+* Thread Abstraction Layer
+*
+* @author
+* Harish
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+#ifndef __ITHREAD_H__
+#define __ITHREAD_H__
+
+/*
+ * Thread abstraction API.
+ * This header uses WORD32/UWORD32 without including their definitions;
+ * ithread.c includes iv_datatypedef.h before this header.
+ * All objects (thread handles, mutexes, semaphores) live in caller-provided
+ * storage passed as void *; the *_size() functions report how much storage
+ * each kind of object needs.
+ */
+
+/* Storage sizes for a thread handle and for a mutex */
+UWORD32 ithread_get_handle_size(void);
+
+UWORD32 ithread_get_mutex_lock_size(void);
+
+/* Thread life cycle; strt is the entry function, argument is passed to it */
+WORD32 ithread_create(void *thread_handle, void *attribute, void *strt, void *argument);
+
+void ithread_exit(void *val_ptr);
+
+WORD32 ithread_join(void *thread_id, void ** val_ptr);
+
+/* Mutexes */
+WORD32 ithread_get_mutex_struct_size(void);
+
+WORD32 ithread_mutex_init(void *mutex);
+
+WORD32 ithread_mutex_destroy(void *mutex);
+
+WORD32 ithread_mutex_lock(void *mutex);
+
+WORD32 ithread_mutex_unlock(void *mutex);
+
+/* Scheduling helpers: sleep takes seconds, msleep milliseconds and usleep
+ * microseconds */
+void ithread_yield(void);
+
+void ithread_sleep(UWORD32 u4_time);
+
+void ithread_msleep(UWORD32 u4_time_ms);
+
+void ithread_usleep(UWORD32 u4_time_us);
+
+/* Semaphores; value is the initial count */
+UWORD32 ithread_get_sem_struct_size(void);
+
+WORD32 ithread_sem_init(void *sem,WORD32 pshared,UWORD32 value);
+
+WORD32 ithread_sem_post(void *sem);
+
+WORD32 ithread_sem_wait(void *sem);
+
+WORD32 ithread_sem_destroy(void *sem);
+
+/* Core affinity of the calling thread */
+WORD32 ithread_set_affinity(WORD32 core_id);
+#endif /* __ITHREAD_H__ */
diff --git a/common/iv.h b/common/iv.h
new file mode 100644
index 0000000..3941497
--- /dev/null
+++ b/common/iv.h
@@ -0,0 +1,420 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/**
+*******************************************************************************
+* @file
+* iv.h
+*
+* @brief
+* This file contains all the necessary structure and enumeration
+* definitions needed for the Application Program Interface(API) of the
+* Ittiam Video and Image codecs
+*
+* @author
+* 100239(RCY)
+*
+* @par List of Functions:
+*
+* @remarks
+* None
+*
+*******************************************************************************
+*/
+
+
+#ifndef _IV_H
+#define _IV_H
+
+/*****************************************************************************/
+/* Constant Macros */
+/*****************************************************************************/
+
+
+/*****************************************************************************/
+/* Typedefs */
+/*****************************************************************************/
+
+/*****************************************************************************/
+/* Enums */
+/*****************************************************************************/
+
+
+/* IV_API_CALL_STATUS_T: Pass/fail status returned to the application for */
+/* the current API call. IV_SUCCESS is 0 and IV_FAIL is 1. */
+
+typedef enum{
+ IV_STATUS_NA = 0x7FFFFFFF,
+ IV_SUCCESS = 0x0,
+ IV_FAIL = 0x1,
+}IV_API_CALL_STATUS_T;
+
+/* IV_MEM_TYPE_T: This enumeration defines the type of memory (internal or */
+/* external) along with its cacheable/non-cacheable and */
+/* persistent/scratch attributes */
+
+typedef enum {
+ IV_NA_MEM_TYPE = 0x7FFFFFFF,
+ IV_INTERNAL_CACHEABLE_PERSISTENT_MEM = 0x1,
+ IV_INTERNAL_CACHEABLE_SCRATCH_MEM = 0x2,
+ IV_EXTERNAL_CACHEABLE_PERSISTENT_MEM = 0x3,
+ IV_EXTERNAL_CACHEABLE_SCRATCH_MEM = 0x4,
+ IV_INTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x5,
+ IV_INTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x6,
+ IV_EXTERNAL_NONCACHEABLE_PERSISTENT_MEM = 0x7,
+ IV_EXTERNAL_NONCACHEABLE_SCRATCH_MEM = 0x8
+}IV_MEM_TYPE_T;
+
+/* IV_COLOR_FORMAT_T: This enumeration lists all the color formats which */
+/* find usage in video/image codecs. Valid formats are numbered from 0x1; */
+/* IV_CHROMA_NA (0x7FFFFFFF) is the "not applicable" value. */
+
+typedef enum {
+ IV_CHROMA_NA = 0x7FFFFFFF,
+ IV_YUV_420P = 0x1,
+ IV_YUV_422P = 0x2,
+ IV_420_UV_INTL = 0x3,
+ IV_YUV_422IBE = 0x4,
+ IV_YUV_422ILE = 0x5,
+ IV_YUV_444P = 0x6,
+ IV_YUV_411P = 0x7,
+ IV_GRAY = 0x8,
+ IV_RGB_565 = 0x9,
+ IV_RGB_24 = 0xa,
+ IV_YUV_420SP_UV = 0xb,
+ IV_YUV_420SP_VU = 0xc,
+ IV_RGBA_8888 = 0xd
+}IV_COLOR_FORMAT_T;
+
+/* IV_PICTURE_CODING_TYPE_T: VOP/Frame coding type enumeration. */
+/* IV_FRAMETYPE_DEFAULT is an alias for IV_I_FRAME. */
+
+typedef enum {
+ IV_NA_FRAME = 0x7FFFFFFF,
+ IV_I_FRAME = 0x0,
+ IV_P_FRAME = 0x1,
+ IV_B_FRAME = 0x2,
+ IV_IDR_FRAME = 0x3,
+ IV_II_FRAME = 0x4,
+ IV_IP_FRAME = 0x5,
+ IV_IB_FRAME = 0x6,
+ IV_PI_FRAME = 0x7,
+ IV_PP_FRAME = 0x8,
+ IV_PB_FRAME = 0x9,
+ IV_BI_FRAME = 0xa,
+ IV_BP_FRAME = 0xb,
+ IV_BB_FRAME = 0xc,
+ IV_MBAFF_I_FRAME = 0xd,
+ IV_MBAFF_P_FRAME = 0xe,
+ IV_MBAFF_B_FRAME = 0xf,
+ IV_MBAFF_IDR_FRAME = 0x10,
+ IV_NOT_CODED_FRAME = 0x11,
+ IV_FRAMETYPE_DEFAULT = IV_I_FRAME
+}IV_PICTURE_CODING_TYPE_T;
+
+/* IV_FLD_TYPE_T: Field type enumeration (top or bottom field). */
+/* IV_FLD_TYPE_DEFAULT is an alias for IV_TOP_FLD. */
+
+typedef enum {
+ IV_NA_FLD = 0x7FFFFFFF,
+ IV_TOP_FLD = 0x0,
+ IV_BOT_FLD = 0x1,
+ IV_FLD_TYPE_DEFAULT = IV_TOP_FLD
+}IV_FLD_TYPE_T;
+
+/* IV_CONTENT_TYPE_T: Video content type. */
+/* IV_CONTENTTYPE_DEFAULT is an alias for IV_PROGRESSIVE. */
+
+typedef enum {
+ IV_CONTENTTYPE_NA = 0x7FFFFFFF,
+ IV_PROGRESSIVE = 0x0,
+ IV_INTERLACED = 0x1,
+ IV_PROGRESSIVE_FRAME = 0x2,
+ IV_INTERLACED_FRAME = 0x3,
+ IV_INTERLACED_TOPFIELD = 0x4,
+ IV_INTERLACED_BOTTOMFIELD = 0x5,
+ IV_CONTENTTYPE_DEFAULT = IV_PROGRESSIVE,
+}IV_CONTENT_TYPE_T;
+
+/* IV_API_COMMAND_TYPE_T: API command type. The value is carried in the */
+/* e_cmd field of the input structures declared later in this file. */
+typedef enum {
+ IV_CMD_NA = 0x7FFFFFFF,
+ IV_CMD_GET_NUM_MEM_REC = 0x0,
+ IV_CMD_FILL_NUM_MEM_REC = 0x1,
+ IV_CMD_RETRIEVE_MEMREC = 0x2,
+ IV_CMD_INIT = 0x3,
+ IV_CMD_DUMMY_ELEMENT = 0x4,
+}IV_API_COMMAND_TYPE_T;
+
+/*****************************************************************************/
+/* Structure */
+/*****************************************************************************/
+
+/* iv_obj_t: This structure defines the handle for the codec instance */
+
+typedef struct{
+ /**
+ * Size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Pointer to the API function pointer table of the codec
+ */
+ void *pv_fxns;
+
+ /**
+ * Pointer to the handle of the codec
+ */
+ void *pv_codec_handle;
+}iv_obj_t;
+
+/* iv_mem_rec_t: This structure defines the memory record holder which will */
+/* be used by the codec to communicate its memory requirements to the */
+/* application through appropriate API functions. One record describes */
+/* one memory block: its size, alignment and type, plus the base pointer */
+/* of the memory the application allocated for it. */
+
+typedef struct {
+ /**
+ * Size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Pointer to the memory allocated by the application
+ */
+ void *pv_base;
+
+ /**
+ * Size of the memory to be allocated
+ */
+ UWORD32 u4_mem_size;
+
+ /**
+ * Alignment of the memory pointer
+ */
+ UWORD32 u4_mem_alignment;
+ /**
+ * Nature of the memory to be allocated
+ */
+ IV_MEM_TYPE_T e_mem_type;
+}iv_mem_rec_t;
+
+/* iv_yuv_buf_t: This structure defines attributes for the yuv buffer: */
+/* a base pointer plus width, height and stride for each of the Y, Cb (u) */
+/* and Cr (v) buffers */
+
+typedef struct {
+ /**
+ * Size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Pointer to Luma (Y) Buffer
+ */
+
+ void *pv_y_buf;
+ /**
+ * Pointer to Chroma (Cb) Buffer
+ */
+ void *pv_u_buf;
+
+ /**
+ * Pointer to Chroma (Cr) Buffer
+ */
+ void *pv_v_buf;
+
+ /**
+ * Width of the Luma (Y) Buffer
+ */
+ UWORD32 u4_y_wd;
+
+ /**
+ * Height of the Luma (Y) Buffer
+ */
+ UWORD32 u4_y_ht;
+
+ /**
+ * Stride/Pitch of the Luma (Y) Buffer
+ */
+ UWORD32 u4_y_strd;
+
+ /**
+ * Width of the Chroma (Cb) Buffer
+ */
+ UWORD32 u4_u_wd;
+
+ /**
+ * Height of the Chroma (Cb) Buffer
+ */
+ UWORD32 u4_u_ht;
+
+ /**
+ * Stride/Pitch of the Chroma (Cb) Buffer
+ */
+ UWORD32 u4_u_strd;
+
+ /**
+ * Width of the Chroma (Cr) Buffer
+ */
+ UWORD32 u4_v_wd;
+
+ /**
+ * Height of the Chroma (Cr) Buffer
+ */
+ UWORD32 u4_v_ht;
+
+ /**
+ * Stride/Pitch of the Chroma (Cr) Buffer
+ */
+ UWORD32 u4_v_strd;
+}iv_yuv_buf_t;
+
+/*****************************************************************************/
+/* Get Number of Memory Records */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_GET_NUM_MEM_REC */
+
+
+/* Input structure of the IV_CMD_GET_NUM_MEM_REC command */
+typedef struct {
+ /**
+ * Size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Command to be executed
+ */
+ IV_API_COMMAND_TYPE_T e_cmd;
+}iv_num_mem_rec_ip_t;
+
+
+/* Output structure of the IV_CMD_GET_NUM_MEM_REC command */
+typedef struct{
+ /**
+ * Size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Error code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * Number of memory records
+ */
+ UWORD32 u4_num_mem_rec;
+}iv_num_mem_rec_op_t;
+
+
+/*****************************************************************************/
+/* Fill Memory Records */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_FILL_NUM_MEM_REC */
+
+
+/* Input structure of the IV_CMD_FILL_NUM_MEM_REC command */
+typedef struct {
+ /**
+ * Size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Command to be executed
+ */
+ IV_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * Pointer to the array of memory record structures that the codec fills
+ * with the details of its memory resource requirements
+ */
+ iv_mem_rec_t *pv_mem_rec_location;
+
+ /**
+ * Maximum width for which the codec should request memory
+ */
+ UWORD32 u4_max_frm_wd;
+
+ /**
+ * Maximum height for which the codec should request memory
+ */
+ UWORD32 u4_max_frm_ht;
+}iv_fill_mem_rec_ip_t;
+
+
+/* Output structure of the IV_CMD_FILL_NUM_MEM_REC command */
+typedef struct{
+ /**
+ * Size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Error code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * Number of memory record structures which are filled by the codec
+ */
+ UWORD32 u4_num_mem_rec_filled;
+}iv_fill_mem_rec_op_t;
+
+
+/*****************************************************************************/
+/* Retrieve Memory Records */
+/*****************************************************************************/
+
+/* IV_API_COMMAND_TYPE_T::e_cmd = IV_CMD_RETRIEVE_MEMREC */
+
+
+
+/* Input structure of the IV_CMD_RETRIEVE_MEMREC command */
+typedef struct {
+ /**
+ * Size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Command to be executed
+ */
+ IV_API_COMMAND_TYPE_T e_cmd;
+
+ /**
+ * Array of memory record structures that the codec fills with all the
+ * (memory) resources it holds
+ */
+ iv_mem_rec_t *pv_mem_rec_location;
+}iv_retrieve_mem_rec_ip_t;
+
+
+/* Output structure of the IV_CMD_RETRIEVE_MEMREC command */
+typedef struct{
+ /**
+ * Size of the structure
+ */
+ UWORD32 u4_size;
+
+ /**
+ * Error code
+ */
+ UWORD32 u4_error_code;
+
+ /**
+ * Number of memory records filled by the codec
+ */
+ UWORD32 u4_num_mem_rec_filled;
+}iv_retrieve_mem_rec_op_t;
+
+
+
+#endif /* _IV_H */
+
diff --git a/common/iv_datatypedef.h b/common/iv_datatypedef.h
new file mode 100644
index 0000000..3c45942
--- /dev/null
+++ b/common/iv_datatypedef.h
@@ -0,0 +1,81 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+/*****************************************************************************/
+/* */
+/* File Name : iv_datatypedef.h */
+/* */
+/* Description : This file contains all the necessary data type */
+/* definitions. */
+/* */
+/* List of Functions : None */
+/* */
+/* Issues / Problems : None */
+/* */
+/* Revision History : */
+/* */
+/* DD MM YYYY Author(s) Changes (Describe the changes made) */
+/* 29 12 2006 Rajendra C Y Draft */
+/* */
+/*****************************************************************************/
+
+#ifndef __IV_DATATYPEDEF_H__
+#define __IV_DATATYPEDEF_H__
+
+/*****************************************************************************/
+/* Typedefs */
+/*****************************************************************************/
+
+typedef int WORD32; /* signed word; the name assumes int is 32 bits wide on the target */
+typedef unsigned int UWORD32; /* unsigned counterpart of WORD32 */
+
+typedef short WORD16; /* signed half-word; the name assumes short is 16 bits wide */
+typedef unsigned short UWORD16; /* unsigned counterpart of WORD16 */
+
+typedef char WORD8; /* NOTE(review): plain char has implementation-defined signedness - confirm callers do not rely on it being signed */
+typedef unsigned char UWORD8; /* unsigned byte */
+
+typedef char CHAR; /* plain char; presumably meant for text - confirm against callers */
+#ifndef NULL
+#define NULL ((void *)0) /* fallback definition, used only when NULL is not already defined */
+
+#endif
+/* Two-valued flag type: IT_FALSE is 0 and IT_TRUE is 1. */
+typedef enum
+{
+ IT_FALSE = 0,
+ IT_TRUE = 1
+} IT_BOOL;
+
+/* Status code type: IT_OK is 0 and IT_ERROR is -1. */
+typedef enum
+{
+ IT_OK = 0,
+ IT_ERROR = -1
+} IT_STATUS;
+
+/*****************************************************************************/
+/* Input and Output Parameter identifiers */
+/*****************************************************************************/
+#define IT_IN /* expands to nothing; annotation for input parameters */
+#define IT_OUT /* expands to nothing; annotation for output parameters */
+
+
+#endif /* __IV_DATATYPEDEF_H__ */
+
diff --git a/common/mips/impeg2_platform_macros.h b/common/mips/impeg2_platform_macros.h
new file mode 100644
index 0000000..05ff6da
--- /dev/null
+++ b/common/mips/impeg2_platform_macros.h
@@ -0,0 +1,49 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+#ifndef __IMPEG2_PLATFORM_MACROS_H__
+#define __IMPEG2_PLATFORM_MACROS_H__
+/* Byte-reverse the 32-bit value u4_temp1 and assign the result to u4_temp2. Every argument use is */
+/* parenthesized; u4_temp1 is evaluated four times and the trailing ';' is part of the expansion. */
+#define CONV_LE_TO_BE(u4_temp2,u4_temp1) (u4_temp2) = ((u4_temp1) << 24) | \
+ (((u4_temp1) & 0xff00) << 8) | \
+ (((u4_temp1) & 0xff0000) >> 8) | \
+ ((u4_temp1) >> 24);
+/* Count the leading zero bits of u4_word; a zero input yields 32. */
+static __inline UWORD32 CLZ(UWORD32 u4_word)
+{
+ /* __builtin_clz() has an undefined result for 0, so that case is answered directly */
+ return u4_word ? (UWORD32)__builtin_clz(u4_word) : 32;
+}
+
+
+/* Saturate x to the named range. Each expansion is one parenthesized expression; x may be evaluated more than once. */
+#define CLIP_U8(x) (((x) > 255) ? (255) : (((x) < 0) ? (0) : (x)))
+#define CLIP_S8(x) (((x) > 127) ? (127) : (((x) < -128) ? (-128) : (x)))
+
+#define CLIP_U12(x) (((x) > 4095) ? (4095) : (((x) < 0) ? (0) : (x)))
+#define CLIP_S12(x) (((x) > 2047) ? (2047) : (((x) < -2048) ? (-2048) : (x)))
+/* CLIP_S16 saturates to the signed 16-bit range [-32768, 32767]. */
+#define CLIP_U16(x) (((x) > 65535) ? (65535) : (((x) < 0) ? (0) : (x)))
+#define CLIP_S16(x) (((x) > 32767) ? (32767) : (((x) < -32768) ? (-32768) : (x)))
+#define PLD(x) /* expands to nothing in this header; presumably a prefetch hint on other platforms - confirm */
+
+#define INLINE /* expands to nothing in this header */
+
+#endif /* __IMPEG2_PLATFORM_MACROS_H__ */
diff --git a/common/x86/impeg2_idct_recon_sse42_intr.c b/common/x86/impeg2_idct_recon_sse42_intr.c
new file mode 100755
index 0000000..4142032
--- /dev/null
+++ b/common/x86/impeg2_idct_recon_sse42_intr.c
@@ -0,0 +1,2205 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+ *******************************************************************************
+ * @file
+ * impeg2_idct_recon_sse42_intr.c
+ *
+ * @brief
+ * Contains function definitions for inverse quantization, inverse
+ * transform and reconstruction
+ *
+ * @author
+ * 100470
+ * 100592 (edited by)
+ *
+ * @par List of Functions:
+ * - impeg2_itrans_recon_8x8_sse42()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "iv_datatypedef.h"
+#include "impeg2_macros.h"
+#include "impeg2_defs.h"
+#include "impeg2_globals.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+
+/**
+ *******************************************************************************
+ *
+ * @brief
+ * This function performs inverse quantization, inverse transform and
+ * reconstruction for 8x8 input block
+ *
+ * @par Description:
+ * Performs inverse quantization , inverse transform and adds the
+ * prediction data and clips output to 8 bit
+ *
+ * @param[in] pi2_src
+ * Input 8x8 coefficients
+ *
+ * @param[in] pi2_tmp
+ * Temporary 8x8 buffer for storing inverse
+ * transform 1st stage output
+ *
+ * @param[in] pu1_pred
+ * Prediction 8x8 block
+ *
+ * @param[in] pi2_dequant_coeff
+ * Dequant Coeffs
+ *
+ * @param[out] pu1_dst
+ * Output 8x8 block
+ *
+ * @param[in] src_strd
+ * Input stride
+ *
+ * @param[in] qp_div
+ * Quantization parameter / 6
+ *
+ * @param[in] qp_rem
+ * Quantization parameter % 6
+ *
+ * @param[in] pred_strd
+ * Prediction stride
+ *
+ * @param[in] dst_strd
+ * Output Stride
+ *
+ * @param[in] zero_cols
+ * Zero columns in pi2_src
+ *
+ * @returns Void
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+
+
+void impeg2_idct_recon_sse42(WORD16 *pi2_src,
+ WORD16 *pi2_tmp,
+ UWORD8 *pu1_pred,
+ UWORD8 *pu1_dst,
+ WORD32 src_strd,
+ WORD32 pred_strd,
+ WORD32 dst_strd,
+ WORD32 zero_cols,
+ WORD32 zero_rows)
+{
+ __m128i m_temp_reg_0;
+ __m128i m_temp_reg_1;
+ __m128i m_temp_reg_2;
+ __m128i m_temp_reg_3;
+ __m128i m_temp_reg_5;
+ __m128i m_temp_reg_6;
+ __m128i m_temp_reg_7;
+ __m128i m_temp_reg_4;
+ __m128i m_temp_reg_10;
+ __m128i m_temp_reg_11;
+ __m128i m_temp_reg_12;
+ __m128i m_temp_reg_13;
+ __m128i m_temp_reg_14;
+ __m128i m_temp_reg_15;
+ __m128i m_temp_reg_16;
+ __m128i m_temp_reg_17;
+ __m128i m_temp_reg_20;
+ __m128i m_temp_reg_21;
+ __m128i m_temp_reg_22;
+ __m128i m_temp_reg_23;
+ __m128i m_temp_reg_24;
+ __m128i m_temp_reg_25;
+ __m128i m_temp_reg_26;
+ __m128i m_temp_reg_27;
+ __m128i m_temp_reg_30;
+ __m128i m_temp_reg_31;
+ __m128i m_temp_reg_32;
+ __m128i m_temp_reg_33;
+ __m128i m_temp_reg_34;
+ __m128i m_temp_reg_35;
+ __m128i m_temp_reg_36;
+ __m128i m_temp_reg_37;
+ __m128i m_temp_reg_40;
+ __m128i m_temp_reg_41;
+ __m128i m_temp_reg_42;
+ __m128i m_temp_reg_43;
+ __m128i m_temp_reg_44;
+ __m128i m_temp_reg_45;
+ __m128i m_temp_reg_46;
+ __m128i m_temp_reg_47;
+ __m128i m_temp_reg_50;
+ __m128i m_temp_reg_51;
+ __m128i m_temp_reg_52;
+ __m128i m_temp_reg_53;
+ __m128i m_temp_reg_54;
+ __m128i m_temp_reg_55;
+ __m128i m_temp_reg_56;
+ __m128i m_temp_reg_57;
+ __m128i m_temp_reg_60;
+ __m128i m_temp_reg_61;
+ __m128i m_temp_reg_62;
+ __m128i m_temp_reg_63;
+ __m128i m_temp_reg_64;
+ __m128i m_temp_reg_65;
+ __m128i m_temp_reg_66;
+ __m128i m_temp_reg_67;
+ __m128i m_temp_reg_70;
+ __m128i m_temp_reg_71;
+ __m128i m_temp_reg_72;
+ __m128i m_temp_reg_73;
+ __m128i m_temp_reg_74;
+ __m128i m_temp_reg_75;
+ __m128i m_temp_reg_76;
+ __m128i m_temp_reg_77;
+ __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
+
+ WORD32 check_row_stage_1; /* Lokesh */
+ WORD32 check_row_stage_2; /* Lokesh */
+
+ __m128i m_rdng_factor;
+ WORD32 i4_shift = IDCT_STG1_SHIFT;
+ UNUSED(pi2_tmp);
+ check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
+ check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
+
+ m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+
+ m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
+ pi2_src += src_strd;
+ m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
+
+ if(!check_row_stage_2)
+ {
+ if(!check_row_stage_1)
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ //Interleaving 0,4 row in 0 , 1 Rishab
+ /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+ /* Combining instructions to eliminate them based on zero_rows : Lokesh */
+ //Interleaving 2,6 row in 4, 5 Rishab
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
+
+
+
+ /* e */
+
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ //o0:1B*89+3B*75,5B*50+7B*18
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ /* Upper 8 bytes of both registers are zero due to zero_cols*/
+
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_setzero_si128();
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o1:1B*75-3B*18,5B*89+7B*50
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o2:1B*50-3B*89,5B*18+7B*75
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
+
+
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o3:1B*18-3B*50,5B*75-7B*89
+ m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_setzero_si128();
+ m_temp_reg_55 = _mm_setzero_si128();
+ m_temp_reg_56 = _mm_setzero_si128();
+ m_temp_reg_57 = _mm_setzero_si128();
+ }
+ }
+ else
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ //Interleaving 0,4 row in 0 , 1 Rishab
+ /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+ /* Combining instructions to eliminate them based on zero_rows : Lokesh */
+ //Interleaving 2,6 row in 4, 5 Rishab
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
+
+
+
+ /* e */
+
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ //o0:1B*89+3B*75,5B*50+7B*18
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ /* Upper 8 bytes of both registers are zero due to zero_cols*/
+
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_setzero_si128();
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o1:1B*75-3B*18,5B*89+7B*50
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o2:1B*50-3B*89,5B*18+7B*75
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+ //o3:1B*18-3B*50,5B*75-7B*89
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_setzero_si128();
+ m_temp_reg_55 = _mm_setzero_si128();
+ m_temp_reg_56 = _mm_setzero_si128();
+ m_temp_reg_57 = _mm_setzero_si128();
+ }
+ }
+
+ /* Stage 2 */
+ i4_shift = IDCT_STG2_SHIFT;
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+ /* Loading coeff for computing o0 in the next block */
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
+
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+
+
+
+ /* e */
+
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+ //o0:1B*89+3B*75,1T*89+3T*75
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+ /* Loading coeff for computing o1 in the next block */
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
+
+
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ //o1:1B*75-3B*18,1T*75-3T*18
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+ /* Loading coeff for computing o2 in the next block */
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
+
+
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ //o2:1B*50-3B*89,5T*18+7T*75.
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
+
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ //o3:1B*18-3B*50,1T*18-3T*50
+ m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+ m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+ m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+ m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+
+ /* Recon and store */
+ {
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+ m_temp_reg_50 = _mm_setzero_si128();
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+ m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+ m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+ m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
+ m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+ m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+ m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+ m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+ m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+ m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+ m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+ m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
+ m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+ m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+ m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+ m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+ m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+ m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+ m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+ m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+ pu1_dst += dst_strd;
+ }
+ }
+ }
+ else
+
+ {
+
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ if(!check_row_stage_1)
+ {
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ //Interleaving 0,4 row in 0 , 1 Rishab
+ /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+ /* Combining instructions to eliminate them based on zero_rows : Lokesh */
+ //Interleaving 2,6 row in 4, 5 Rishab
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
+ //m_coeff4 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[3][0]);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
+ //m_coeff2 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[1][0]);
+
+ }
+
+ /* e */
+ {
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ //o0:1B*89+3B*75,1T*89+3T*75
+ m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+ }
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ {
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o1:1B*75-3B*18,1T*75-3T*18
+ m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
+
+ }
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o2:1B*50-3B*89,1T*50-3T*89
+ m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
+
+ }
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o3:1B*18-3B*50,1T*18-3T*50
+ m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ }
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+ }
+ else
+ {
+
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ //Interleaving 0,4 row in 0 , 1 Rishab
+ /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+
+
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
+
+ /* Combining instructions to eliminate them based on zero_rows : Lokesh */
+ //Interleaving 2,6 row in 4, 5 Rishab
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
+ m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+
+
+ /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
+
+ }
+
+ /* e */
+ {
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+
+ m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
+ m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
+ m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
+ //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ {
+
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+ /* Loading coeff for computing o2 in the next block */
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+ {
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
+ m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
+ m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
+ m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
+
+ m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
+ m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
+ m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
+ m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+
+
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+
+ m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+ }
+ /* Stage 2 */
+
+ i4_shift = IDCT_STG2_SHIFT;
+
+ {
+
+ /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
+ /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
+ {
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
+
+ m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
+ }
+
+
+ /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
+ /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
+ {
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
+
+
+ m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
+ m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
+
+ /* Loading coeff for computing o0 in the next block */
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[1][0]);
+
+
+ m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
+ m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
+ }
+
+ /* e */
+ {
+ /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
+ /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
+ /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
+ /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
+ m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
+ m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
+
+ m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
+ m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
+
+ m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
+ m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
+
+ m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
+ m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
+
+ }
+
+ /* o */
+ {
+ m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
+ m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
+
+ /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
+ {
+ //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+ m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
+ m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
+ /* Loading coeff for computing o1 in the next block */
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[3][0]);
+
+ m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 0 of destination computed here */
+ /* It is stored in m_temp_reg_50 */
+ /* Column 7 of destination computed here */
+ /* It is stored in m_temp_reg_57 */
+ {
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+ m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
+
+
+ /* Loading coeff for computing o2 in the next block */
+ m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
+ m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[5][0]);
+
+ m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 1 of destination computed here */
+ /* It is stored in m_temp_reg_51 */
+ /* Column 6 of destination computed here */
+ /* It is stored in m_temp_reg_56 */
+ {
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
+ m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
+ m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
+ m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
+ m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
+
+ m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+ /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
+
+ /* Loading coeff for computing o3 in the next block */
+
+ m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
+ m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[7][0]);
+
+ m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
+ m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
+ }
+
+ /* Column 2 of destination computed here */
+ /* It is stored in m_temp_reg_52 */
+ /* Column 5 of destination computed here */
+ /* It is stored in m_temp_reg_55 */
+ {
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
+ m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
+
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
+ m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
+
+ m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
+ m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
+ m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
+ m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
+
+ m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
+ m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
+ m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
+ m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
+
+ //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
+ m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
+ m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
+ m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
+ m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
+
+ m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
+ m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
+
+
+
+ /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
+
+
+ m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
+ m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
+ }
+
+ /* Column 3 of destination computed here */
+ /* It is stored in m_temp_reg_53 */
+ /* Column 4 of destination computed here */
+ /* It is stored in m_temp_reg_54 */
+ {
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
+ m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
+
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
+ m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
+
+ m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
+ m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
+ m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
+ m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
+
+ m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
+ m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
+ m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
+ m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
+
+ m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
+ m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
+ }
+ }
+
+ /* Transpose of the destination 8x8 matrix done here */
+ /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
+ /* respectively */
+ {
+ m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
+ m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
+ m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
+ m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
+ m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
+
+ m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
+ m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
+ m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
+ m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
+ m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
+ m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
+ m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
+
+ m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
+ m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
+ m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
+ }
+
+ /* Recon and store */
+ {
+ m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
+ pu1_pred += pred_strd;
+ m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
+
+
+ m_temp_reg_50 = _mm_setzero_si128();
+ m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
+ m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
+ m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
+ m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
+ m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
+ m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
+ m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
+ m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
+
+ m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
+ m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
+ m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
+ m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
+ m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
+ m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
+ m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
+ m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
+
+ m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
+ m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
+ m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
+ m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
+ m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
+ m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
+ m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
+ m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
+
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
+ pu1_dst += dst_strd;
+ _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
+ pu1_dst += dst_strd;
+
+ }
+
+
+ }
+
+
+ }
+}
+
+/**
+ * DC-only inverse transform and reconstruction of an 8x8 block, with a
+ * per-position additive term applied before the second stage rounding.
+ *
+ * Only pi2_src[0] is read. It is scaled by gai2_impeg2_idct_q15[0], rounded
+ * and shifted with IDCT_STG1_ROUND / IDCT_STG1_SHIFT, then scaled by
+ * gai2_impeg2_idct_q11[0]. For each of the 8 rows, 8 entries of
+ * gai2_impeg2_mismatch_stg2_additive are added to that value, the sum is
+ * rounded and shifted with IDCT_STG2_ROUND / IDCT_STG2_SHIFT, the 8
+ * prediction bytes of the row are added, and the result is written to the
+ * destination row through saturating packs.
+ *
+ * pi2_src    : coefficient buffer; only element 0 is used
+ * pi2_tmp    : unused
+ * pu1_pred   : prediction, 8 rows of 8 bytes, rows pred_strd bytes apart
+ * pu1_dst    : destination, 8 rows of 8 bytes, rows dst_strd bytes apart
+ * src_strd   : unused
+ * pred_strd  : prediction row stride in bytes
+ * dst_strd   : destination row stride in bytes
+ * zero_cols  : unused
+ * zero_rows  : unused
+ */
+void impeg2_idct_recon_dc_mismatch_sse42(WORD16 *pi2_src,
+                                         WORD16 *pi2_tmp,
+                                         UWORD8 *pu1_pred,
+                                         UWORD8 *pu1_dst,
+                                         WORD32 src_strd,
+                                         WORD32 pred_strd,
+                                         WORD32 dst_strd,
+                                         WORD32 zero_cols,
+                                         WORD32 zero_rows)
+{
+    WORD32 dc_term;
+    WORD32 row;
+    __m128i dc_plus_round;
+
+    UNUSED(pi2_tmp);
+    UNUSED(src_strd);
+    UNUSED(zero_cols);
+    UNUSED(zero_rows);
+
+    /* Stage 1 on the DC coefficient, then the stage 2 multiply */
+    dc_term = pi2_src[0] * gai2_impeg2_idct_q15[0];
+    dc_term = ((dc_term + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
+    dc_term *= gai2_impeg2_idct_q11[0];
+
+    /* 32-bit lane additions wrap, so folding the DC term and the stage 2 */
+    /* rounding constant into one vector up front yields the same lanes   */
+    /* as adding them to each row one after the other                     */
+    dc_plus_round = _mm_add_epi32(_mm_set1_epi32(dc_term),
+                                  _mm_set1_epi32(IDCT_STG2_ROUND));
+
+    for(row = 0; row < 8; row++)
+    {
+        __m128i additive_8x16b, pred_8x16b;
+        __m128i res_lo, res_hi;
+        __m128i pred_lo, pred_hi;
+        __m128i packed;
+
+        /* 8 additive terms and 8 prediction pixels of this row */
+        additive_8x16b = _mm_loadu_si128(
+                        (__m128i *)(gai2_impeg2_mismatch_stg2_additive + 8 * row));
+        pred_8x16b = _mm_loadl_epi64((__m128i *)(pu1_pred + row * pred_strd));
+        pred_8x16b = _mm_cvtepu8_epi16(pred_8x16b);
+
+        /* Widen to 32 bit: additive terms are signed, prediction unsigned */
+        res_lo = _mm_cvtepi16_epi32(additive_8x16b);
+        res_hi = _mm_cvtepi16_epi32(_mm_srli_si128(additive_8x16b, 8));
+        pred_lo = _mm_cvtepu16_epi32(pred_8x16b);
+        pred_hi = _mm_cvtepu16_epi32(_mm_srli_si128(pred_8x16b, 8));
+
+        /* Stage 2 rounding and shift */
+        res_lo = _mm_srai_epi32(_mm_add_epi32(res_lo, dc_plus_round), IDCT_STG2_SHIFT);
+        res_hi = _mm_srai_epi32(_mm_add_epi32(res_hi, dc_plus_round), IDCT_STG2_SHIFT);
+
+        /* Recon: add prediction */
+        res_lo = _mm_add_epi32(res_lo, pred_lo);
+        res_hi = _mm_add_epi32(res_hi, pred_hi);
+
+        /* Narrow 32 -> 16 -> 8 bit; only the low 8 bytes are stored, so */
+        /* the second operand of the final pack does not matter          */
+        packed = _mm_packus_epi32(res_lo, res_hi);
+        packed = _mm_packus_epi16(packed, packed);
+
+        _mm_storel_epi64((__m128i *)(pu1_dst + row * dst_strd), packed);
+    }
+}
+
+/**
+ * DC-only inverse transform and reconstruction of an 8x8 block.
+ *
+ * Only pi2_src[0] is read. It is scaled by gai2_impeg2_idct_q15[0], rounded
+ * and shifted with IDCT_STG1_ROUND / IDCT_STG1_SHIFT, scaled by
+ * gai2_impeg2_idct_q11[0], then rounded and shifted with IDCT_STG2_ROUND /
+ * IDCT_STG2_SHIFT. The resulting single value is added to every one of the
+ * 8x8 prediction bytes and the sums are written to the destination through
+ * saturating packs.
+ *
+ * pi2_src    : coefficient buffer; only element 0 is used
+ * pi2_tmp    : unused
+ * pu1_pred   : prediction, 8 rows of 8 bytes, rows pred_strd bytes apart
+ * pu1_dst    : destination, 8 rows of 8 bytes, rows dst_strd bytes apart
+ * src_strd   : unused
+ * pred_strd  : prediction row stride in bytes
+ * dst_strd   : destination row stride in bytes
+ * zero_cols  : unused
+ * zero_rows  : unused
+ */
+void impeg2_idct_recon_dc_sse42(WORD16 *pi2_src,
+                                WORD16 *pi2_tmp,
+                                UWORD8 *pu1_pred,
+                                UWORD8 *pu1_dst,
+                                WORD32 src_strd,
+                                WORD32 pred_strd,
+                                WORD32 dst_strd,
+                                WORD32 zero_cols,
+                                WORD32 zero_rows)
+{
+    WORD32 dc_val;
+    WORD32 row;
+    __m128i dc_4x32b;
+
+    UNUSED(pi2_tmp);
+    UNUSED(src_strd);
+    UNUSED(zero_cols);
+    UNUSED(zero_rows);
+
+    /* Both transform stages applied to the DC coefficient in scalar code */
+    dc_val = pi2_src[0] * gai2_impeg2_idct_q15[0];
+    dc_val = ((dc_val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
+    dc_val = dc_val * gai2_impeg2_idct_q11[0];
+    dc_val = ((dc_val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT);
+
+    dc_4x32b = _mm_set1_epi32(dc_val);
+
+    /* Two rows per iteration; both rows are loaded before either is stored */
+    for(row = 0; row < 8; row += 2)
+    {
+        UWORD8 *pu1_pred_row = pu1_pred + row * pred_strd;
+        UWORD8 *pu1_dst_row = pu1_dst + row * dst_strd;
+        __m128i top_8x16b, bot_8x16b;
+        __m128i top_lo, top_hi, bot_lo, bot_hi;
+
+        top_8x16b = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)pu1_pred_row));
+        bot_8x16b = _mm_cvtepu8_epi16(
+                        _mm_loadl_epi64((__m128i *)(pu1_pred_row + pred_strd)));
+
+        /* Widen each row to two vectors of four 32-bit lanes */
+        top_lo = _mm_cvtepu16_epi32(top_8x16b);
+        top_hi = _mm_cvtepu16_epi32(_mm_srli_si128(top_8x16b, 8));
+        bot_lo = _mm_cvtepu16_epi32(bot_8x16b);
+        bot_hi = _mm_cvtepu16_epi32(_mm_srli_si128(bot_8x16b, 8));
+
+        /* Recon: prediction + DC value */
+        top_lo = _mm_add_epi32(top_lo, dc_4x32b);
+        top_hi = _mm_add_epi32(top_hi, dc_4x32b);
+        bot_lo = _mm_add_epi32(bot_lo, dc_4x32b);
+        bot_hi = _mm_add_epi32(bot_hi, dc_4x32b);
+
+        /* Narrow 32 -> 16 -> 8 bit; only the low 8 bytes of each result */
+        /* are stored, so the second operand of the final pack is unused */
+        top_8x16b = _mm_packus_epi32(top_lo, top_hi);
+        bot_8x16b = _mm_packus_epi32(bot_lo, bot_hi);
+        top_8x16b = _mm_packus_epi16(top_8x16b, top_8x16b);
+        bot_8x16b = _mm_packus_epi16(bot_8x16b, bot_8x16b);
+
+        _mm_storel_epi64((__m128i *)pu1_dst_row, top_8x16b);
+        _mm_storel_epi64((__m128i *)(pu1_dst_row + dst_strd), bot_8x16b);
+    }
+}
diff --git a/common/x86/impeg2_inter_pred_sse42_intr.c b/common/x86/impeg2_inter_pred_sse42_intr.c
new file mode 100644
index 0000000..4599afa
--- /dev/null
+++ b/common/x86/impeg2_inter_pred_sse42_intr.c
@@ -0,0 +1,899 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+ *******************************************************************************
+ * @file
+ * impeg2_inter_pred_sse42_intr.c
+ *
+ * @brief
+ * Contains Motion compensation function definitions for MPEG2 decoder
+ *
+ * @author
+ * Mohit [100664]
+ *
+ * - impeg2_copy_mb_sse42()
+ * - impeg2_interpolate_sse42()
+ * - impeg2_mc_halfx_halfy_8x8_sse42()
+ * - impeg2_mc_halfx_fully_8x8_sse42()
+ * - impeg2_mc_fullx_halfy_8x8_sse42()
+ * - impeg2_mc_fullx_fully_8x8_sse42()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "iv_datatypedef.h"
+#include "impeg2_macros.h"
+#include "impeg2_defs.h"
+#include "impeg2_inter_pred.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+/*******************************************************************************
+* Copies four rows of 16 bytes from src to dst. Rows are src_wd bytes apart in
+* the source and dst_wd bytes apart in the destination. All four rows are
+* loaded before any of them is stored.
+*******************************************************************************/
+static void impeg2_copy_4x16_sse42(UWORD8 *src,
+                                   UWORD8 *dst,
+                                   UWORD32 src_wd,
+                                   UWORD32 dst_wd)
+{
+    __m128i row0 = _mm_loadu_si128((__m128i *)(src));
+    __m128i row1 = _mm_loadu_si128((__m128i *)(src + src_wd));
+    __m128i row2 = _mm_loadu_si128((__m128i *)(src + 2 * src_wd));
+    __m128i row3 = _mm_loadu_si128((__m128i *)(src + 3 * src_wd));
+
+    _mm_storeu_si128((__m128i *)(dst), row0);
+    _mm_storeu_si128((__m128i *)(dst + dst_wd), row1);
+    _mm_storeu_si128((__m128i *)(dst + 2 * dst_wd), row2);
+    _mm_storeu_si128((__m128i *)(dst + 3 * dst_wd), row3);
+}
+
+/*******************************************************************************
+* Copies four rows of 8 bytes from src to dst. Rows are src_wd bytes apart in
+* the source and dst_wd bytes apart in the destination. All four rows are
+* loaded before any of them is stored.
+*******************************************************************************/
+static void impeg2_copy_4x8_sse42(UWORD8 *src,
+                                  UWORD8 *dst,
+                                  UWORD32 src_wd,
+                                  UWORD32 dst_wd)
+{
+    __m128i row0 = _mm_loadl_epi64((__m128i *)(src));
+    __m128i row1 = _mm_loadl_epi64((__m128i *)(src + src_wd));
+    __m128i row2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
+    __m128i row3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
+
+    _mm_storel_epi64((__m128i *)(dst), row0);
+    _mm_storel_epi64((__m128i *)(dst + dst_wd), row1);
+    _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), row2);
+    _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), row3);
+}
+
+/*******************************************************************************
+* Function Name : impeg2_copy_mb_sse42
+*
+* Description : Copies the three components of one block set from src_buf to
+*               dst_buf: 16 rows of 16 bytes for Y, then 8 rows of 8 bytes
+*               each for U and V. The U and V planes use half of the given
+*               row widths as their row pitch.
+*
+* Arguments :
+* src_buf : Source buffer (pu1_y, pu1_u and pu1_v are read)
+* dst_buf : Destination buffer (pu1_y, pu1_u and pu1_v are written)
+* src_wd : Row pitch of the source Y plane, in bytes
+* dst_wd : Row pitch of the destination Y plane, in bytes
+*
+* Values Returned : None
+*******************************************************************************/
+void impeg2_copy_mb_sse42(yuv_buf_t *src_buf,
+                          yuv_buf_t *dst_buf,
+                          UWORD32 src_wd,
+                          UWORD32 dst_wd)
+{
+    UWORD8 *src;
+    UWORD8 *dst;
+    UWORD32 grp;
+    UWORD32 chroma_src_wd = src_wd >> 1;
+    UWORD32 chroma_dst_wd = dst_wd >> 1;
+
+    /*******************************************************/
+    /* copy Y : 4 groups of 4 rows, 16 bytes per row       */
+    /*******************************************************/
+    src = src_buf->pu1_y;
+    dst = dst_buf->pu1_y;
+    for(grp = 0; grp < 4; grp++)
+    {
+        impeg2_copy_4x16_sse42(src + grp * 4 * src_wd,
+                               dst + grp * 4 * dst_wd,
+                               src_wd,
+                               dst_wd);
+    }
+
+    /*******************************************************/
+    /* copy U : 2 groups of 4 rows, 8 bytes per row        */
+    /*******************************************************/
+    src = src_buf->pu1_u;
+    dst = dst_buf->pu1_u;
+    for(grp = 0; grp < 2; grp++)
+    {
+        impeg2_copy_4x8_sse42(src + grp * 4 * chroma_src_wd,
+                              dst + grp * 4 * chroma_dst_wd,
+                              chroma_src_wd,
+                              chroma_dst_wd);
+    }
+
+    /*******************************************************/
+    /* copy V : 2 groups of 4 rows, 8 bytes per row        */
+    /*******************************************************/
+    src = src_buf->pu1_v;
+    dst = dst_buf->pu1_v;
+    for(grp = 0; grp < 2; grp++)
+    {
+        impeg2_copy_4x8_sse42(src + grp * 4 * chroma_src_wd,
+                              dst + grp * 4 * chroma_dst_wd,
+                              chroma_src_wd,
+                              chroma_dst_wd);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_interpolate_sse42 */
+/* */
+/* Description : averages the contents of buf_src1 and buf_src2 and stores*/
+/* result in buf_dst */
+/* */
+/* Inputs : buf_src1 - First Source */
+/* buf_src2 - Second Source */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Avg the values from two sources and store the result in */
+/* destination buffer */
+/* */
+/* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */
+/* */
+/* Returns : None */
+/* */
+/* Issues : Assumes that all 3 buffers are of same size */
+/* */
+/*****************************************************************************/
+/* Averages four rows of 16 bytes. Both sources are read at a fixed pitch of */
+/* 16 bytes; the per-byte averages (_mm_avg_epu8) are written to dst at a    */
+/* pitch of 'stride' bytes. All loads are issued before the first store.     */
+static void impeg2_avg_16x4_sse42(UWORD8 *src1,
+                                  UWORD8 *src2,
+                                  UWORD8 *dst,
+                                  UWORD32 stride)
+{
+    __m128i src1_r0, src1_r1, src1_r2, src1_r3;
+    __m128i src2_r0, src2_r1, src2_r2, src2_r3;
+
+    src1_r0 = _mm_loadu_si128((__m128i *) (src1));
+    src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
+    src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
+    src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
+
+    src2_r0 = _mm_loadu_si128((__m128i *) (src2));
+    src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
+    src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
+    src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
+
+    src1_r0 = _mm_avg_epu8(src1_r0, src2_r0);
+    src1_r1 = _mm_avg_epu8(src1_r1, src2_r1);
+    src1_r2 = _mm_avg_epu8(src1_r2, src2_r2);
+    src1_r3 = _mm_avg_epu8(src1_r3, src2_r3);
+
+    _mm_storeu_si128((__m128i *) dst, src1_r0);
+    _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
+    _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
+    _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
+}
+
+/* Averages four rows of 8 bytes. Both sources are read at a fixed pitch of  */
+/* 8 bytes; the per-byte averages are written to dst at a pitch of 'stride'  */
+/* bytes. All loads are issued before the first store.                       */
+static void impeg2_avg_8x4_sse42(UWORD8 *src1,
+                                 UWORD8 *src2,
+                                 UWORD8 *dst,
+                                 UWORD32 stride)
+{
+    __m128i src1_r0, src1_r1, src1_r2, src1_r3;
+    __m128i src2_r0, src2_r1, src2_r2, src2_r3;
+
+    src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
+    src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
+    src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
+    src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
+
+    src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
+    src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
+    src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
+    src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
+
+    src1_r0 = _mm_avg_epu8(src1_r0, src2_r0);
+    src1_r1 = _mm_avg_epu8(src1_r1, src2_r1);
+    src1_r2 = _mm_avg_epu8(src1_r2, src2_r2);
+    src1_r3 = _mm_avg_epu8(src1_r3, src2_r3);
+
+    _mm_storel_epi64((__m128i *) dst, src1_r0);
+    _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
+    _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
+    _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
+}
+
+/* Writes the per-byte average of buf_src1 and buf_src2 into buf_dst.        */
+/*                                                                           */
+/*   buf_src1, buf_src2 - sources; pu1_y is read as 16 rows of 16 bytes at   */
+/*                        pitch 16, pu1_u and pu1_v as 8 rows of 8 bytes at  */
+/*                        pitch 8                                            */
+/*   buf_dst            - destination; pu1_y is written at pitch 'stride',   */
+/*                        pu1_u and pu1_v at pitch 'stride >> 1'             */
+/*   stride             - luma pitch of the destination in bytes             */
+void impeg2_interpolate_sse42(yuv_buf_t *buf_src1,
+                              yuv_buf_t *buf_src2,
+                              yuv_buf_t *buf_dst,
+                              UWORD32 stride)
+{
+    UWORD8 *src1, *src2;
+    UWORD8 *dst;
+    UWORD32 grp;
+
+    /*******************************************************/
+    /* interpolate Y : 16 rows, four rows per helper call  */
+    /*******************************************************/
+    src1 = buf_src1->pu1_y;
+    src2 = buf_src2->pu1_y;
+    dst = buf_dst->pu1_y;
+    for(grp = 0; grp < 4; grp++)
+    {
+        impeg2_avg_16x4_sse42(src1 + grp * 4 * 16,
+                              src2 + grp * 4 * 16,
+                              dst + grp * 4 * stride,
+                              stride);
+    }
+
+    /* Chroma planes are written at half the luma pitch */
+    stride >>= 1;
+
+    /*******************************************************/
+    /* interpolate U : 8 rows, four rows per helper call   */
+    /*******************************************************/
+    src1 = buf_src1->pu1_u;
+    src2 = buf_src2->pu1_u;
+    dst = buf_dst->pu1_u;
+    for(grp = 0; grp < 2; grp++)
+    {
+        impeg2_avg_8x4_sse42(src1 + grp * 4 * 8,
+                             src2 + grp * 4 * 8,
+                             dst + grp * 4 * stride,
+                             stride);
+    }
+
+    /*******************************************************/
+    /* interpolate V : 8 rows, four rows per helper call   */
+    /*******************************************************/
+    src1 = buf_src1->pu1_v;
+    src2 = buf_src2->pu1_v;
+    dst = buf_dst->pu1_v;
+    for(grp = 0; grp < 2; grp++)
+    {
+        impeg2_avg_8x4_sse42(src1 + grp * 4 * 8,
+                             src2 + grp * 4 * 8,
+                             dst + grp * 4 * stride,
+                             stride);
+    }
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_mc_halfx_halfy_8x8_sse42() */
+/* */
+/* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */
+/* and the above block of size 8 x 8 will be placed as a */
+/* block from the current position of out_buf */
+/* */
+/* Inputs : ref - Reference frame from which the block will be */
+/* extracted. */
+/* ref_wid - Width of reference frame */
+/* out_wid - Width of the output frame */
+/* blk_width - width of the block (fixed at 8, not an argument) */
+/* blk_height - height of the block (fixed at 8, not an argument) */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */
+/* the ref frame.Interpolate these four values to get the */
+/* value at(0.5,0.5).Repeat this to get an 8 x 8 block */
+/* using 9 x 9 block from reference frame */
+/* */
+/* Outputs : out - Output containing the extracted block */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/*****************************************************************************/
+void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out,
+                                     UWORD8 *ref,
+                                     UWORD32 ref_wid,
+                                     UWORD32 out_wid)
+{
+    /* Each output pixel Q is built from its four reference neighbours:     */
+    /*      P0  P1                                                          */
+    /*        Q            Q = (P0 + P1 + P2 + P3 + 2) >> 2                 */
+    /*      P2  P3                                                          */
+    /* The horizontal sum of a reference row is shared by the output rows   */
+    /* above and below it, so it is computed once and carried forward.      */
+    const __m128i round_2 = _mm_set1_epi16(2);
+    __m128i px_left, px_right;
+    __m128i hsum_above, hsum_below, quad;
+    UWORD32 rows_left = 8;
+
+    /* Reference row 0: widen to 16 bits and add horizontal neighbours */
+    px_left = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (ref)));
+    px_right = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (ref + 1)));
+    hsum_above = _mm_add_epi16(px_left, px_right);
+
+    for(;;)
+    {
+        /* Next reference row (rows 1..8 over the eight passes) */
+        ref += ref_wid;
+        px_left = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (ref)));
+        px_right = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (ref + 1)));
+        hsum_below = _mm_add_epi16(px_left, px_right);
+
+        /* Vertical combine, round, scale back and narrow to bytes */
+        quad = _mm_add_epi16(hsum_above, hsum_below);
+        quad = _mm_add_epi16(quad, round_2);
+        quad = _mm_srli_epi16(quad, 2);
+        /* The upper eight bytes are filler; only the low 64 bits are stored */
+        quad = _mm_packus_epi16(quad, round_2);
+
+        _mm_storel_epi64((__m128i *)out, quad);
+
+        if(0 == --rows_left)
+        {
+            break;
+        }
+
+        out += out_wid;
+        hsum_above = hsum_below;
+    }
+
+    return;
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_mc_halfx_fully_8x8_sse42() */
+/* */
+/* Description : Gets the buffer from (0.5,0) to (8.5,8) */
+/* and the above block of size 8 x 8 will be placed as a */
+/* block from the current position of out_buf */
+/* */
+/* Inputs : ref - Reference frame from which the block will be */
+/* extracted. */
+/* ref_wid - Width of reference frame */
+/* out_wid - Width of the output frame */
+/* blk_width - width of the block (fixed at 8, not an argument) */
+/* blk_height - height of the block (fixed at 8, not an argument) */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Point to the (0,0) and (1,0) position in the ref frame */
+/* Interpolate these two values to get the value at(0.5,0) */
+/* Repeat this to get an 8 x 8 block using 9 x 8 block from */
+/* reference frame */
+/* */
+/* Outputs : out - Output containing the extracted block */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/*****************************************************************************/
+void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out,
+                                     UWORD8 *ref,
+                                     UWORD32 ref_wid,
+                                     UWORD32 out_wid)
+{
+    /* Q is the byte average (_mm_avg_epu8) of two horizontal neighbours:   */
+    /*      P0 Q P1                                                         */
+    UWORD8 *px_left = ref;
+    UWORD8 *px_right = ref + 1;
+    __m128i top, top_next, bot, bot_next;
+    UWORD32 pairs_left = 4;
+
+    /* Two output rows per pass, four passes */
+    for(;;)
+    {
+        top = _mm_loadl_epi64((__m128i *) (px_left));
+        top_next = _mm_loadl_epi64((__m128i *) (px_right));
+        bot = _mm_loadl_epi64((__m128i *) (px_left + ref_wid));
+        bot_next = _mm_loadl_epi64((__m128i *) (px_right + ref_wid));
+
+        top = _mm_avg_epu8(top, top_next);
+        bot = _mm_avg_epu8(bot, bot_next);
+
+        _mm_storel_epi64((__m128i *)out, top);
+        _mm_storel_epi64((__m128i *)(out + out_wid), bot);
+
+        if(0 == --pairs_left)
+        {
+            break;
+        }
+
+        px_left += 2*ref_wid;
+        px_right += 2*ref_wid;
+        out += 2*out_wid;
+    }
+
+    return;
+}
+
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_mc_fullx_halfy_8x8_sse42() */
+/* */
+/* Description : Gets the buffer from (0,0.5) to (8,8.5) */
+/* and the above block of size 8 x 8 will be placed as a */
+/* block from the current position of out_buf */
+/* */
+/* Inputs : ref - Reference frame from which the block will be */
+/* extracted. */
+/* ref_wid - Width of reference frame */
+/* out_wid - Width of the output frame */
+/* blk_width - width of the block (fixed at 8, not an argument) */
+/* blk_height - height of the block (fixed at 8, not an argument) */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Point to the (0,0) and (0,1) position in the ref frame */
+/* Interpolate these two values to get the value at(0,0.5) */
+/* Repeat this to get an 8 x 8 block using 8 x 9 block from */
+/* reference frame */
+/* */
+/* Outputs : out - Output containing the extracted block */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/*****************************************************************************/
+void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out,
+                                     UWORD8 *ref,
+                                     UWORD32 ref_wid,
+                                     UWORD32 out_wid)
+{
+    /* Each output pixel is the byte average (_mm_avg_epu8) of the two      */
+    /* reference pixels directly above and below it:                        */
+    /*      P0                                                              */
+    /*      x                                                               */
+    /*      P1                                                              */
+    __m128i row_top, row_mid, row_bot;
+    UWORD32 pairs_left = 4;
+
+    row_top = _mm_loadl_epi64((__m128i *)ref); /* reference row 0 */
+
+    /* Each pass reads two new reference rows and emits two output rows; */
+    /* the last row read is reused as the top row of the next pass.      */
+    for(;;)
+    {
+        row_mid = _mm_loadl_epi64((__m128i *)(ref + ref_wid));
+        row_bot = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));
+
+        _mm_storel_epi64((__m128i *)out, _mm_avg_epu8(row_top, row_mid));
+        _mm_storel_epi64((__m128i *)(out + out_wid),
+                         _mm_avg_epu8(row_mid, row_bot));
+
+        if(0 == --pairs_left)
+        {
+            break;
+        }
+
+        row_top = row_bot;
+        ref += 2*ref_wid;
+        out += 2*out_wid;
+    }
+
+    return;
+}
+
+/*****************************************************************************/
+/* */
+/* Function Name : impeg2_mc_fullx_fully_8x8_sse42() */
+/* */
+/* Description : Gets the buffer from (x,y) to (x+8,y+8) */
+/* and the above block of size 8 x 8 will be placed as a */
+/* block from the current position of out_buf */
+/* */
+/* Inputs : ref - Reference frame from which the block will be */
+/* extracted. */
+/* ref_wid - Width of reference frame */
+/* out_wid - Width of the output frame */
+/* blk_width - width of the block (fixed at 8, not an argument) */
+/* blk_height - height of the block (fixed at 8, not an argument) */
+/* */
+/* Globals : None */
+/* */
+/* Processing : Point to the (0,0) position in the ref frame */
+/* Get an 8 x 8 block from reference frame */
+/* */
+/* Outputs : out - Output containing the extracted block */
+/* */
+/* Returns : None */
+/* */
+/* Issues : None */
+/* */
+/*****************************************************************************/
+void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out,
+                                     UWORD8 *ref,
+                                     UWORD32 ref_wid,
+                                     UWORD32 out_wid)
+{
+    /* Straight copy of eight 8-byte rows from ref (pitch ref_wid) to out */
+    /* (pitch out_wid), handled as two groups of four rows; within a      */
+    /* group all four loads precede the four stores.                      */
+    __m128i row_a, row_b, row_c, row_d;
+    UWORD8 *src_grp, *dst_grp;
+    UWORD32 grp;
+
+    for(grp = 0; grp < 2; grp++)
+    {
+        src_grp = ref + grp * 4 * ref_wid;
+        dst_grp = out + grp * 4 * out_wid;
+
+        row_a = _mm_loadl_epi64((__m128i *)src_grp);
+        row_b = _mm_loadl_epi64((__m128i *)(src_grp + ref_wid));
+        row_c = _mm_loadl_epi64((__m128i *)(src_grp + 2 * ref_wid));
+        row_d = _mm_loadl_epi64((__m128i *)(src_grp + 3 * ref_wid));
+
+        _mm_storel_epi64((__m128i *)dst_grp, row_a);
+        _mm_storel_epi64((__m128i *)(dst_grp + out_wid), row_b);
+        _mm_storel_epi64((__m128i *)(dst_grp + 2 * out_wid), row_c);
+        _mm_storel_epi64((__m128i *)(dst_grp + 3 * out_wid), row_d);
+    }
+    return;
+}
diff --git a/common/x86/impeg2_mem_func_sse42_intr.c b/common/x86/impeg2_mem_func_sse42_intr.c
new file mode 100644
index 0000000..de7de8f
--- /dev/null
+++ b/common/x86/impeg2_mem_func_sse42_intr.c
@@ -0,0 +1,100 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+
+/**
+ *******************************************************************************
+ * @file
+ * impeg2_mem_func_sse42_intr.c
+ *
+ * @brief
+ * Contains utility function definitions for MPEG2 codec
+ *
+ * @author
+ * Mohit [100664]
+ *
+* @par List of Functions:
+* - impeg2_memset0_16bit_8x8_linear_block_sse42()
+* - impeg2_memset_8bit_8x8_block_sse42()
+ *
+ * @remarks
+ * None
+ *
+ *******************************************************************************
+ */
+#include <stdio.h>
+#include <string.h>
+#include "iv_datatypedef.h"
+#include "impeg2_defs.h"
+
+#include <immintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+
+/*******************************************************************************
+* Function Name : impeg2_memset0_16bit_8x8_linear_block_sse42
+*
+* Description : memsets residual buf to 0
+*
+* Arguments : destination buffer
+*
+* Values Returned : None
+*******************************************************************************/
+
+
+void impeg2_memset0_16bit_8x8_linear_block_sse42 (WORD16 *buf)
+{
+    /* Zeroes 64 consecutive 16-bit values starting at buf, eight values */
+    /* (one 128-bit unaligned store) at a time.                          */
+    const __m128i zeros = _mm_setzero_si128();
+    WORD32 idx;
+
+    for(idx = 0; idx < 64; idx += 8)
+    {
+        _mm_storeu_si128((__m128i *) (buf + idx), zeros);
+    }
+}
+
+
+
+/*******************************************************************************
+* Function Name : impeg2_memset_8bit_8x8_block_sse42
+*
+* Description : memsets residual buf to value
+*
+* Arguments : destination buffer, value and stride
+*
+* Values Returned : None
+*******************************************************************************/
+
+
+void impeg2_memset_8bit_8x8_block_sse42(UWORD8 *dst, WORD32 dc_val, WORD32 dst_wd)
+{
+    /* dc_val is truncated to 8 bits and replicated into every byte lane; */
+    /* each store writes the low 8 bytes, i.e. one row of the block.      */
+    const __m128i fill = _mm_set1_epi8((WORD8)dc_val);
+    WORD32 row;
+
+    /* Eight rows, dst_wd bytes apart */
+    for(row = 0; row < 8; row++)
+    {
+        _mm_storel_epi64((__m128i *) (dst + row * dst_wd), fill);
+    }
+}
diff --git a/common/x86/impeg2_platform_macros.h b/common/x86/impeg2_platform_macros.h
new file mode 100644
index 0000000..05ff6da
--- /dev/null
+++ b/common/x86/impeg2_platform_macros.h
@@ -0,0 +1,49 @@
+/******************************************************************************
+ *
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *****************************************************************************
+ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
+*/
+#ifndef __IMPEG2_PLATFORM_MACROS_H__
+#define __IMPEG2_PLATFORM_MACROS_H__
+
+
+/* Reverses the byte order of the 32-bit value u4_temp1 and assigns the     */
+/* result to u4_temp2. u4_temp1 is evaluated four times, so it must not     */
+/* have side effects. Arguments are parenthesized so that expression        */
+/* arguments (e.g. a | b) expand correctly. The expansion deliberately      */
+/* still ends in ';' because existing call sites may rely on it.            */
+#define CONV_LE_TO_BE(u4_temp2,u4_temp1) (u4_temp2) = ((u4_temp1) << 24) | \
+ (((u4_temp1) & 0xff00) << 8) | \
+ (((u4_temp1) & 0xff0000) >> 8) | \
+ ((u4_temp1) >> 24);
+/* Returns the number of leading zero bits in u4_word; 32 when u4_word is 0. */
+static __inline UWORD32 CLZ(UWORD32 u4_word)
+{
+    /* __builtin_clz has an undefined result for 0, so answer that case here */
+    if(0 == u4_word)
+    {
+        return 32;
+    }
+
+    return (UWORD32)__builtin_clz(u4_word);
+}
+
+
+/* Saturating clamps. Each expansion is fully parenthesized so it can be    */
+/* used inside a larger expression. The argument is evaluated more than     */
+/* once, so it must not have side effects.                                  */
+#define CLIP_U8(x) (((x) > 255) ? (255) : (((x) < 0) ? (0) : (x)))
+#define CLIP_S8(x) (((x) > 127) ? (127) : (((x) < -128) ? (-128) : (x)))
+
+#define CLIP_U12(x) (((x) > 4095) ? (4095) : (((x) < 0) ? (0) : (x)))
+#define CLIP_S12(x) (((x) > 2047) ? (2047) : (((x) < -2048) ? (-2048) : (x)))
+
+#define CLIP_U16(x) (((x) > 65535) ? (65535) : (((x) < 0) ? (0) : (x)))
+/* Signed 16-bit range is [-32768, 32767] */
+#define CLIP_S16(x) (((x) > 32767) ? (32767) : (((x) < -32768) ? (-32768) : (x)))
+/* PLD(x) expands to nothing in this header. */
+#define PLD(x)
+
+/* INLINE expands to nothing in this header. */
+#define INLINE
+
+#endif /* __IMPEG2_PLATFORM_MACROS_H__ */