summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Harish Mahendrakar <harish.mahendrakar@ittiam.com> 2015-04-20 15:33:05 +0530
committer Lajos Molnar <lajos@google.com> 2015-05-05 17:51:53 +0000
commit 7497191460a9504f8b4f64df169ab633f0b74353 (patch)
tree 216431d3e98141f6dcf4ac957e4e2f98e34d077c
parent a2b49e5f0574dee76f81507f288143d83a4b7c1a (diff)
download android_external_libavc-7497191460a9504f8b4f64df169ab633f0b74353.tar.gz
android_external_libavc-7497191460a9504f8b4f64df169ab633f0b74353.tar.bz2
android_external_libavc-7497191460a9504f8b4f64df169ab633f0b74353.zip
Resolved warnings and fixed alignment of few assemblies
Resolved warnings seen in x86 modules
Fixed alignment of few modules
Updated comments in few arm modules for consistency
Fixed warnings seen in clang build

Change-Id: I0623169b5e84a6a6f09c3d2212e754101272f5e9
-rw-r--r--common/arm/ih264_arm_memory_barrier.s3
-rw-r--r--common/arm/ih264_deblk_chroma_a9.s48
-rw-r--r--common/arm/ih264_deblk_luma_a9.s24
-rw-r--r--common/arm/ih264_default_weighted_pred_a9q.s9
-rw-r--r--common/arm/ih264_ihadamard_scaling_a9.s14
-rw-r--r--common/arm/ih264_inter_pred_chroma_a9q.s18
-rw-r--r--common/arm/ih264_inter_pred_filters_luma_horz_a9q.s21
-rw-r--r--common/arm/ih264_inter_pred_filters_luma_vert_a9q.s18
-rw-r--r--common/arm/ih264_inter_pred_luma_bilinear_a9q.s22
-rw-r--r--common/arm/ih264_inter_pred_luma_copy_a9q.s10
-rw-r--r--common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s14
-rw-r--r--common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s16
-rw-r--r--common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s20
-rw-r--r--common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s14
-rw-r--r--common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s11
-rw-r--r--common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s16
-rw-r--r--common/arm/ih264_intra_pred_chroma_a9q.s32
-rw-r--r--common/arm/ih264_intra_pred_luma_16x16_a9q.s42
-rw-r--r--common/arm/ih264_intra_pred_luma_4x4_a9q.s55
-rw-r--r--common/arm/ih264_intra_pred_luma_8x8_a9q.s59
-rw-r--r--common/arm/ih264_iquant_itrans_recon_a9.s16
-rw-r--r--common/arm/ih264_iquant_itrans_recon_dc_a9.s17
-rw-r--r--common/arm/ih264_itrans_recon_a9.s8
-rw-r--r--common/arm/ih264_mem_fns_neon.s14
-rw-r--r--common/arm/ih264_padding_neon.s25
-rw-r--r--common/arm/ih264_resi_trans_a9.s604
-rw-r--r--common/arm/ih264_resi_trans_quant_a9.s2
-rw-r--r--common/arm/ih264_weighted_bi_pred_a9q.s8
-rw-r--r--common/arm/ih264_weighted_pred_a9q.s8
-rw-r--r--common/armv8/ih264_default_weighted_pred_av8.s1
-rw-r--r--common/armv8/ih264_inter_pred_filters_luma_vert_av8.s8
-rw-r--r--common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s4
-rw-r--r--common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s2
-rw-r--r--common/armv8/ih264_intra_pred_chroma_av8.s2
-rw-r--r--common/armv8/ih264_weighted_bi_pred_av8.s1
-rw-r--r--common/armv8/ih264_weighted_pred_av8.s1
-rw-r--r--common/ih264_dpb_mgr.c2
-rw-r--r--common/ithread.c5
-rw-r--r--common/x86/ih264_deblk_luma_ssse3.c2
-rw-r--r--common/x86/ih264_ihadamard_scaling_sse42.c31
-rw-r--r--common/x86/ih264_ihadamard_scaling_ssse3.c19
-rw-r--r--common/x86/ih264_inter_pred_filters_ssse3.c5
-rw-r--r--common/x86/ih264_iquant_itrans_recon_dc_ssse3.c12
-rw-r--r--common/x86/ih264_iquant_itrans_recon_sse42.c3
-rw-r--r--common/x86/ih264_iquant_itrans_recon_ssse3.c5
-rw-r--r--common/x86/ih264_resi_trans_quant_sse42.c9
-rw-r--r--decoder/ih264d_thread_parse_decode.c11
-rw-r--r--encoder/arm/ime_distortion_metrics_a9q.s1350
-rw-r--r--encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s2
-rw-r--r--encoder/armv8/ih264e_half_pel_av8.s7
-rw-r--r--encoder/armv8/ime_distortion_metrics_av8.s1
-rw-r--r--encoder/x86/ih264e_intra_modes_eval_ssse3.c2
-rw-r--r--encoder/x86/ime_distortion_metrics_sse42.c3
-rw-r--r--test/encoder/main.c20
54 files changed, 1051 insertions, 1625 deletions
diff --git a/common/arm/ih264_arm_memory_barrier.s b/common/arm/ih264_arm_memory_barrier.s
index 523218f..3816409 100644
--- a/common/arm/ih264_arm_memory_barrier.s
+++ b/common/arm/ih264_arm_memory_barrier.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@*******************************************************************************
@* @file
@* ih264_arm_memory_barrier.s
@@ -39,7 +39,6 @@
.text
.p2align 2
-
@*****************************************************************************
@*
@* Function Name : ih264_arm_dsb
diff --git a/common/arm/ih264_deblk_chroma_a9.s b/common/arm/ih264_deblk_chroma_a9.s
index 66102a7..8c9960a 100644
--- a/common/arm/ih264_deblk_chroma_a9.s
+++ b/common/arm/ih264_deblk_chroma_a9.s
@@ -54,7 +54,7 @@
.text
.p2align 2
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -84,7 +84,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_horz_bs4_bp_a9
@@ -130,7 +130,7 @@ ih264_deblk_chroma_horz_bs4_bp_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -160,7 +160,7 @@ ih264_deblk_chroma_horz_bs4_bp_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_vert_bs4_bp_a9
@@ -224,7 +224,7 @@ ih264_deblk_chroma_vert_bs4_bp_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -260,7 +260,7 @@ ih264_deblk_chroma_vert_bs4_bp_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_horz_bslt4_bp_a9
@@ -326,7 +326,7 @@ ih264_deblk_chroma_horz_bslt4_bp_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -362,7 +362,7 @@ ih264_deblk_chroma_horz_bslt4_bp_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_vert_bslt4_bp_a9
@@ -465,7 +465,7 @@ ih264_deblk_chroma_vert_bslt4_bp_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -495,7 +495,7 @@ ih264_deblk_chroma_vert_bslt4_bp_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9
@@ -543,7 +543,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -579,7 +579,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9
@@ -656,7 +656,7 @@ ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -692,7 +692,7 @@ ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_horz_bs4_a9
@@ -743,7 +743,7 @@ ih264_deblk_chroma_horz_bs4_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -779,7 +779,7 @@ ih264_deblk_chroma_horz_bs4_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_vert_bs4_a9
@@ -848,7 +848,7 @@ ih264_deblk_chroma_vert_bs4_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -893,7 +893,7 @@ ih264_deblk_chroma_vert_bs4_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_horz_bslt4_a9
@@ -968,7 +968,7 @@ ih264_deblk_chroma_horz_bslt4_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -1013,7 +1013,7 @@ ih264_deblk_chroma_horz_bslt4_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_vert_bslt4_a9
@@ -1119,7 +1119,7 @@ ih264_deblk_chroma_vert_bslt4_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -1155,7 +1155,7 @@ ih264_deblk_chroma_vert_bslt4_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_vert_bs4_mbaff_a9
@@ -1206,7 +1206,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -1251,7 +1251,7 @@ ih264_deblk_chroma_vert_bs4_mbaff_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_chroma_vert_bslt4_mbaff_a9
diff --git a/common/arm/ih264_deblk_luma_a9.s b/common/arm/ih264_deblk_luma_a9.s
index 3e6a4d9..9217ed2 100644
--- a/common/arm/ih264_deblk_luma_a9.s
+++ b/common/arm/ih264_deblk_luma_a9.s
@@ -47,7 +47,7 @@
.text
.p2align 2
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -83,7 +83,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_luma_horz_bslt4_a9
@@ -187,7 +187,7 @@ ih264_deblk_luma_horz_bslt4_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -217,7 +217,7 @@ ih264_deblk_luma_horz_bslt4_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_luma_horz_bs4_a9
@@ -353,7 +353,7 @@ ih264_deblk_luma_horz_bs4_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -389,7 +389,7 @@ ih264_deblk_luma_horz_bs4_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_luma_vert_bslt4_a9
@@ -574,7 +574,7 @@ ih264_deblk_luma_vert_bslt4_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -604,7 +604,7 @@ ih264_deblk_luma_vert_bslt4_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_luma_vert_bs4_a9
@@ -800,7 +800,7 @@ ih264_deblk_luma_vert_bs4_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -830,7 +830,7 @@ ih264_deblk_luma_vert_bs4_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_luma_vert_bs4_mbaff_a9
@@ -942,7 +942,7 @@ ih264_deblk_luma_vert_bs4_mbaff_a9:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -978,7 +978,7 @@ ih264_deblk_luma_vert_bs4_mbaff_a9:
@* None
@*
@*******************************************************************************
-@*/
+@*
.global ih264_deblk_luma_vert_bslt4_mbaff_a9
diff --git a/common/arm/ih264_default_weighted_pred_a9q.s b/common/arm/ih264_default_weighted_pred_a9q.s
index 94cda46..a4688f2 100644
--- a/common/arm/ih264_default_weighted_pred_a9q.s
+++ b/common/arm/ih264_default_weighted_pred_a9q.s
@@ -17,14 +17,13 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_default_weighted_pred_a9q.s
@*
@* @brief
@* Contains function definitions for default weighted prediction.
-@* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
@*
@* @author
@* Kaushik Senthoor R
@@ -38,7 +37,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@*******************************************************************************
@* @function
@* ih264_default_weighted_pred_luma_a9q()
@@ -82,7 +81,7 @@
@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
@*
@*******************************************************************************
-@*/
+@*
@void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
@ UWORD8 *pu1_src2,
@ UWORD8 *pu1_dst,
@@ -256,7 +255,7 @@ end_loops:
@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
@*
@*******************************************************************************
-@*/
+@*
@void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
@ UWORD8 *pu1_src2,
@ UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_ihadamard_scaling_a9.s b/common/arm/ih264_ihadamard_scaling_a9.s
index 687099a..c7feddd 100644
--- a/common/arm/ih264_ihadamard_scaling_a9.s
+++ b/common/arm/ih264_ihadamard_scaling_a9.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@ *******************************************************************************
@ * @file
@ * ih264_ihadamard_scaling_a9.s
@@ -37,7 +37,7 @@
@ * None
@ *
@ *******************************************************************************
-@ */
+@ *
@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
@ * of a 16x16 intra prediction macroblock, and then performs scaling.
@ * prediction buffer
@@ -69,10 +69,10 @@
@ * @remarks none
@ *
@ *******************************************************************************
-@ */
+@ *
@ *
@ *******************************************************************************
-@ */
+@ *
@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
@ WORD16* pi2_out,
@ const UWORD16 *pu2_iscal_mat,
@@ -161,7 +161,7 @@ ih264_ihadamard_scaling_4x4_a9:
@ *******************************************************************************
-@ */
+@ *
@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
@ *
@ * @par Description:
@@ -189,10 +189,10 @@ ih264_ihadamard_scaling_4x4_a9:
@ * @remarks none
@ *
@ *******************************************************************************
-@ */
+@ *
@ *
@ *******************************************************************************
-@ */
+@ *
@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
@ WORD16* pi2_out,
@ const UWORD16 *pu2_iscal_mat,
diff --git a/common/arm/ih264_inter_pred_chroma_a9q.s b/common/arm/ih264_inter_pred_chroma_a9q.s
index afd2860..6681a7c 100644
--- a/common/arm/ih264_inter_pred_chroma_a9q.s
+++ b/common/arm/ih264_inter_pred_chroma_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_chroma_a9q.s
@@ -36,16 +36,16 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
-@/**
+@**
+@**
+@**
@
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -85,7 +85,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@@ -112,8 +112,6 @@
ih264_inter_pred_chroma_a9q:
-
-
stmfd sp!, {r4-r12, r14} @store register values to stack
vstmdb sp!, {d8-d15} @push neon registers to stack
ldr r4, [sp, #104]
diff --git a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
index ea6bba0..62b4b94 100644
--- a/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
+++ b/common/arm/ih264_inter_pred_filters_luma_horz_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_horz_a9q.s
@@ -36,13 +36,13 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
+@**
+@**
@*******************************************************************************
@*
@* @brief
@@ -76,7 +76,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_inter_pred_luma_horz (
@ UWORD8 *pu1_src,
@@ -102,6 +102,9 @@
ih264_inter_pred_luma_horz_a9q:
+
+
+
stmfd sp!, {r4-r12, r14} @store register values to stack
vstmdb sp!, {d8-d15} @push neon registers to stack
ldr r5, [sp, #104] @Loads ht
@@ -116,7 +119,7 @@ ih264_inter_pred_luma_horz_a9q:
beq loop_4
loop_16: @when wd=16
- @// Processing row0 and row1
+ @ Processing row0 and row1
vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0 ;for checking loop
vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1
@@ -173,7 +176,7 @@ loop_16: @when wd=16
b loop_16 @ loop if height == 8 or 16
loop_8:
-@// Processing row0 and row1
+@ Processing row0 and row1
vld1.8 {d5, d6}, [r0], r2 @// Load row1
vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
vld1.8 {d2, d3}, [r0], r2 @// Load row0
@@ -204,7 +207,7 @@ loop_8:
beq end_func @ Branch if height==4
- b loop_8 @looping if height =8 or 16
+ b loop_8 @looping if height =8 or 16
loop_4:
vld1.8 {d5, d6}, [r0], r2 @// Load row1
diff --git a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
index 5b29e02..65c40a6 100644
--- a/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
+++ b/common/arm/ih264_inter_pred_filters_luma_vert_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_vert_a9q.s
@@ -36,14 +36,14 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
-@/**
+@**
+@**
+@**
@ *******************************************************************************
@ *
@ * @brief
@@ -195,10 +195,10 @@ loop_16: @when wd=16
subne r0, r0, r2
beq end_func @ Branch if height==4
- b loop_16 @ looping if height = 8 or 16
+ b loop_16 @ looping if height = 8 or 16
loop_8:
-@// Processing row0 and row1
+@ Processing row0 and row1
vld1.u32 d0, [r0], r2 @ Vector load from src[0_0]
vld1.u32 d1, [r0], r2 @ Vector load from src[1_0]
@@ -248,7 +248,7 @@ loop_8:
loop_4:
-@// Processing row0 and row1
+@ Processing row0 and row1
vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0]
vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0]
diff --git a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
index 6a3c83d..8f049f8 100644
--- a/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_bilinear_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_bilinear_a9q.s
@@ -36,14 +36,14 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
-@/**
+@**
+@**
+@**
@ *******************************************************************************
@ * function:ih264_inter_pred_luma_bilinear
@ *
@@ -89,7 +89,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
@ UWORD8 *pu1_src2,
@@ -192,7 +192,7 @@ loop_16: @when wd=16
subs r12, r6, #8
vst1.8 {q15}, [r2], r5 @//Store dest row7
- beq end_func @ end function if ht=8
+ beq end_func @ end function if ht=8
vld1.8 {q0}, [r0], r3 @// Load row8 ;src1
vaddl.u8 q10, d0, d4
@@ -275,7 +275,7 @@ loop_8: @wd=8;
vqrshrun.s16 d31, q13, #1
subs r12, r6, #4
vst1.8 {d31}, [r2], r5 @//Store dest row3
- beq end_func @ end function if ht=4
+ beq end_func @ end function if ht=4
vld1.8 {d12}, [r1], r4 @// Load row4 ;src2
vld1.8 {d8}, [r0], r3 @// Load row4 ;src1
@@ -298,7 +298,7 @@ loop_8: @wd=8;
vqrshrun.s16 d31, q11, #1
subs r12, r6, #8
vst1.8 {d31}, [r2], r5 @//Store dest row7
- beq end_func @ end function if ht=8
+ beq end_func @ end function if ht=8
vld1.8 {d0}, [r0], r3 @// Load row8 ;src1
vld1.8 {d4}, [r1], r4 @// Load row8 ;src2
@@ -367,7 +367,7 @@ loop_4:
vqrshrun.s16 d31, q13, #1
subs r12, r6, #4
vst1.32 d31[0], [r2], r5 @//Store dest row3
- beq end_func @ end function if ht=4
+ beq end_func @ end function if ht=4
vld1.32 d12[0], [r1], r4 @// Load row4 ;src2
vld1.32 d8[0], [r0], r3 @// Load row4 ;src1
diff --git a/common/arm/ih264_inter_pred_luma_copy_a9q.s b/common/arm/ih264_inter_pred_luma_copy_a9q.s
index 8ba2fbf..c0b0568 100644
--- a/common/arm/ih264_inter_pred_luma_copy_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_copy_a9q.s
@@ -17,8 +17,8 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
-@/**
+@**
+@**
@*******************************************************************************
@*
@* @brief
@@ -53,7 +53,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_inter_pred_luma_copy (
@ UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@@ -182,7 +182,7 @@ end_inner_loop_wd_16:
ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
-@ /*
+@ *
@ ********************************************************************************
@ *
@ * @brief This function copies a 4x4 block to destination
@@ -208,7 +208,7 @@ end_inner_loop_wd_16:
@ * Currently wd and height is not used, ie a 4x4 block is always copied
@ *
@ *******************************************************************************
-@ */
+@ *
@ void ih264_interleave_copy(WORD16 *pi2_src,
@ UWORD8 *pu1_out,
@ WORD32 pred_strd,
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
index 43321a8..54183f0 100644
--- a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_horz_hpel_vert_hpel_a9q.s
@@ -36,14 +36,14 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
-@/**
+@**
+@**
+@**
@*******************************************************************************
@*
@* @brief
@@ -88,7 +88,7 @@
@* None
@*
@*******************************************************************************
-@*/;
+@*;
@void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
index 65a6de7..c8edf38 100644
--- a/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_horz_hpel_vert_qpel_a9q.s
@@ -36,14 +36,14 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
-@/**
+@**
+@**
+@**
@*******************************************************************************
@*
@* @brief
@@ -91,7 +91,7 @@
@* None
@*
@*******************************************************************************
-@*/;
+@*;
@void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@@ -835,7 +835,7 @@ loop_8:
vmov q7, q14
vst1.32 d30, [r1], r3 @ store row 3
- bgt loop_8 @if height =8 or 16 loop
+ bgt loop_8 @if height =8 or 16 loop
b end_func
loop_4_start:
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
index c39ae01..ab1d1d1 100644
--- a/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_horz_qpel_a9q.s
@@ -30,19 +30,19 @@
@*
@* @par List of Functions:
@*
-@* - ih264_inter_pred_luma_horz_qpe_a9ql()
+@* - ih264_inter_pred_luma_horz_qpel_a9q()
@*
@* @remarks
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
+@**
+@**
@*******************************************************************************
@*
@* @brief
@@ -79,7 +79,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_inter_pred_luma_horz (
@ UWORD8 *pu1_src,
@@ -126,7 +126,7 @@ ih264_inter_pred_luma_horz_qpel_a9q:
beq loop_4
loop_16: @when wd=16
- @// Processing row0 and row1
+ @ Processing row0 and row1
vld1.8 {d2, d3, d4}, [r0], r2 @// Load row0
vext.8 d31, d2, d3, #5 @//extract a[5] (column1,row0)
vld1.8 {d5, d6, d7}, [r0], r2 @// Load row1
@@ -187,7 +187,7 @@ loop_16: @when wd=16
b loop_16
loop_8:
-@// Processing row0 and row1
+@ Processing row0 and row1
vld1.8 {d5, d6}, [r0], r2 @// Load row1
vext.8 d28, d5, d6, #5 @//extract a[5] (column1,row1)
@@ -221,7 +221,7 @@ loop_8:
subs r5, r5, #2 @ 2 rows done, decrement by 2
beq end_func @ Branch if height==4
- b loop_8 @looping if height == 8 or 16
+ b loop_8 @looping if height == 8 or 16
loop_4:
vld1.8 {d5, d6}, [r0], r2 @// Load row1
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
index 565cc80..3c63ca3 100644
--- a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_horz_qpel_vert_hpel_a9q.s
@@ -36,14 +36,14 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
-@/**
+@**
+@**
+@**
@*******************************************************************************
@*
@* @brief
@@ -91,7 +91,7 @@
@* None
@*
@*******************************************************************************
-@*/;
+@*;
@void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
index 3c8b60a..cfe03a0 100644
--- a/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_horz_qpel_vert_qpel_a9q.s
@@ -36,14 +36,11 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
-@/**
@*******************************************************************************
@*
@* @brief
@@ -90,7 +87,7 @@
@* None
@*
@*******************************************************************************
-@*/;
+@*;
@void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
index d45055e..e2c68ef 100644
--- a/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
+++ b/common/arm/ih264_inter_pred_luma_vert_qpel_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_inter_pred_luma_vert_qpel_a9q.s
@@ -36,13 +36,11 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_inter_pred_filters.c
+@* All the functions here are replicated from ih264_inter_pred_filters.c
@
-@/**
-@/**
@*******************************************************************************
@*
@* @brief
@@ -79,7 +77,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_inter_pred_luma_vert (
@ UWORD8 *pu1_src,
@@ -211,12 +209,12 @@ loop_16: @when wd=16
subne r0, r0, r2
beq end_func @ Branch if height==4
- b loop_16 @ looping if height = 8 or 16
+ b loop_16 @ looping if height = 8 or 16
loop_8:
- @// Processing row0 and row1
+ @ Processing row0 and row1
vld1.u32 d0, [r0], r2 @ Vector load from src[0_0]
vld1.u32 d1, [r0], r2 @ Vector load from src[1_0]
vld1.u32 d2, [r0], r2 @ Vector load from src[2_0]
@@ -270,7 +268,7 @@ loop_8:
b loop_8 @looping if height == 8 or 16
loop_4:
-@// Processing row0 and row1
+@ Processing row0 and row1
vld1.u32 d0[0], [r0], r2 @ Vector load from src[0_0]
vld1.u32 d1[0], [r0], r2 @ Vector load from src[1_0]
diff --git a/common/arm/ih264_intra_pred_chroma_a9q.s b/common/arm/ih264_intra_pred_chroma_a9q.s
index d03fc55..ccd5c0d 100644
--- a/common/arm/ih264_intra_pred_chroma_a9q.s
+++ b/common/arm/ih264_intra_pred_chroma_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_intra_pred_chroma_a9q.s
@@ -39,15 +39,11 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
+@* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
@
-@/**
-@/**
-@/**
-@
.text
.p2align 2
@@ -60,7 +56,7 @@ scratch_chroma_intrapred_addr1:
scratch_intrapred_chroma_plane_addr1:
.long ih264_gai1_intrapred_chroma_plane_coeffs2 - scrlblc2 - 8
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_dc
@@ -91,7 +87,7 @@ scratch_intrapred_chroma_plane_addr1:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -105,8 +101,6 @@ scratch_intrapred_chroma_plane_addr1:
@ r3 => dst_strd
@ r4 => ui_neighboravailability
-
-
.global ih264_intra_pred_chroma_8x8_mode_dc_a9q
ih264_intra_pred_chroma_8x8_mode_dc_a9q:
@@ -191,10 +185,10 @@ str_pred:
-@/******************************************************************************
+@******************************************************************************
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_horz
@@ -226,7 +220,7 @@ str_pred:
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -270,7 +264,7 @@ loop_8x8_horz:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_vert
@@ -339,10 +333,10 @@ ih264_intra_pred_chroma_8x8_mode_vert_a9q:
-@/******************************************************************************
+@******************************************************************************
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_plane
@@ -373,7 +367,7 @@ ih264_intra_pred_chroma_8x8_mode_vert_a9q:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -393,7 +387,6 @@ ih264_intra_pred_chroma_8x8_mode_plane_a9q:
stmfd sp!, {r4-r10, r12, lr}
vpush {d8-d15}
-
vld1.32 d0, [r0]
add r10, r0, #10
vld1.32 d1, [r10]
@@ -542,7 +535,6 @@ scrlblc2:
end_func_plane:
-
vpop {d8-d15}
ldmfd sp!, {r4-r10, r12, pc}
diff --git a/common/arm/ih264_intra_pred_luma_16x16_a9q.s b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
index e38e203..0dd82f3 100644
--- a/common/arm/ih264_intra_pred_luma_16x16_a9q.s
+++ b/common/arm/ih264_intra_pred_luma_16x16_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_intra_pred_luma_16x16_a9q.s
@@ -39,14 +39,14 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
-@/* All the functions here are replicated from ih264_intra_pred_filters.c
+@* All the functions here are replicated from ih264_intra_pred_filters.c
@
-@/**
-@/**
-@/**
+@**
+@**
+@**
@
.text
@@ -57,10 +57,10 @@
.hidden ih264_gai1_intrapred_luma_plane_coeffs
scratch_intrapred_addr1:
.long ih264_gai1_intrapred_luma_plane_coeffs - scrlbl1 - 8
-@/**
+@**
@*******************************************************************************
@*
-@*ih264_intra_pred_luma_16x16_mode_vert_a9q
+@*ih264_intra_pred_luma_16x16_mode_vert
@*
@* @brief
@* Perform Intra prediction for luma_16x16 mode:vertical
@@ -135,13 +135,13 @@ ih264_intra_pred_luma_16x16_mode_vert_a9q:
-@/******************************************************************************
+@******************************************************************************
-@/**
+@**
@*******************************************************************************
@*
-@*ih264_intra_pred_luma_16x16_mode_horz_a9q
+@*ih264_intra_pred_luma_16x16_mode_horz
@*
@* @brief
@* Perform Intra prediction for luma_16x16 mode:horizontal
@@ -170,7 +170,7 @@ ih264_intra_pred_luma_16x16_mode_vert_a9q:
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -213,13 +213,13 @@ loop_16x16_horz:
-@/******************************************************************************
+@******************************************************************************
-@/**
+@**
@*******************************************************************************
@*
-@*ih264_intra_pred_luma_16x16_mode_dc_a9q
+@*ih264_intra_pred_luma_16x16_mode_dc
@*
@* @brief
@* Perform Intra prediction for luma_16x16 mode:DC
@@ -247,7 +247,7 @@ loop_16x16_horz:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -300,7 +300,7 @@ top_available: @ONLY TOP AVAILABLE
vdup.u8 q0, d0[0]
b str_pred
-left_available: @ONLY LEFT AVAILABLE
+left_available: @ONLY LEFT AVAILABLE
vld1.u8 {q0}, [r0]
vpaddl.u8 q0, q0
vadd.u16 d0, d0, d1
@@ -337,13 +337,13 @@ str_pred:
-@/******************************************************************************
+@******************************************************************************
-@/**
+@**
@*******************************************************************************
@*
-@*ih264_intra_pred_luma_16x16_mode_plane_a9q
+@*ih264_intra_pred_luma_16x16_mode_plane
@*
@* @brief
@* Perform Intra prediction for luma_16x16 mode:PLANE
@@ -371,7 +371,7 @@ str_pred:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
diff --git a/common/arm/ih264_intra_pred_luma_4x4_a9q.s b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
index cb386ea..5cc7e23 100644
--- a/common/arm/ih264_intra_pred_luma_4x4_a9q.s
+++ b/common/arm/ih264_intra_pred_luma_4x4_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_intra_pred_luma_4x4_a9q.s
@@ -44,21 +44,16 @@
@* None
@*
@*******************************************************************************
-@*/
-
-@/* All the functions here are replicated from ih264_intra_pred_filters.c
-@
+@*
-@/**
-@/**
-@/**
+@* All the functions here are replicated from ih264_intra_pred_filters.c
@
.text
.p2align 2
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_4x4_mode_vert
@@ -128,10 +123,10 @@ ih264_intra_pred_luma_4x4_mode_vert_a9q:
-@/******************************************************************************
+@******************************************************************************
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_4x4_mode_horz
@@ -163,7 +158,7 @@ ih264_intra_pred_luma_4x4_mode_vert_a9q:
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_intra_pred_luma_4x4_mode_horz(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -210,10 +205,10 @@ ih264_intra_pred_luma_4x4_mode_horz_a9q:
-@/******************************************************************************
+@******************************************************************************
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_4x4_mode_dc
@@ -244,7 +239,7 @@ ih264_intra_pred_luma_4x4_mode_horz_a9q:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_4x4_mode_dc(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -352,7 +347,7 @@ end_func:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_4x4_mode_diag_dl
@@ -383,7 +378,7 @@ end_func:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_4x4_mode_diag_dl(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -434,7 +429,7 @@ end_func_diag_dl:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_4x4_mode_diag_dr
@@ -465,7 +460,7 @@ end_func_diag_dl:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_4x4_mode_diag_dr(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -514,7 +509,7 @@ end_func_diag_dr:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_4x4_mode_vert_r
@@ -545,7 +540,7 @@ end_func_diag_dr:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_4x4_mode_vert_r(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -596,7 +591,7 @@ end_func_vert_r:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_4x4_mode_horz_d
@@ -627,7 +622,7 @@ end_func_vert_r:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_4x4_mode_horz_d(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -659,7 +654,7 @@ ih264_intra_pred_luma_4x4_mode_horz_d_a9q:
vqrshrun.s16 d5, q12, #2
sub r5, r3, #2
vmov.8 d6, d5
- vtrn.8 d4, d5 @
+ vtrn.8 d4, d5 @
vst1.u16 {d5[1]}, [r1]!
vst1.16 {d6[2]}, [r1], r5
vst1.u16 {d4[1]}, [r1]!
@@ -678,7 +673,7 @@ end_func_horz_d:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_4x4_mode_vert_l
@@ -709,7 +704,7 @@ end_func_horz_d:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_4x4_mode_vert_l(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -759,7 +754,7 @@ end_func_vert_l:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_4x4_mode_horz_u
@@ -790,7 +785,7 @@ end_func_vert_l:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_4x4_mode_horz_u(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -825,9 +820,9 @@ ih264_intra_pred_luma_4x4_mode_horz_u_a9q:
vext.8 d6, d5, d4, #1
vst1.8 {d4[2]}, [r1]!
vst1.8 {d6[0]}, [r1]!
- vtrn.8 d6, d5 @
+ vtrn.8 d6, d5 @
sub r5, r3, #2
- vtrn.8 d4, d6 @
+ vtrn.8 d4, d6 @
vdup.8 d7, r9
vst1.16 {d6[0]}, [r1], r5
vst1.16 {d6[0]}, [r1]!
diff --git a/common/arm/ih264_intra_pred_luma_8x8_a9q.s b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
index 6da1c95..352d29d 100644
--- a/common/arm/ih264_intra_pred_luma_8x8_a9q.s
+++ b/common/arm/ih264_intra_pred_luma_8x8_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_intra_pred_luma_8x8_a9q.s
@@ -45,17 +45,11 @@
@* None
@*
@*******************************************************************************
-@*/
-
-@/* All the functions here are replicated from ih264_intra_pred_filters.c
-@
+@*
-@/**
-@/**
-@/**
+@* All the functions here are replicated from ih264_intra_pred_filters.c
@
-
.text
.p2align 2
@@ -64,7 +58,7 @@
scratch_intrapred_addr_8x8:
.long ih264_gai1_intrapred_luma_8x8_horz_u - scrlb8x8l2 - 8
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_ref_filtering
@@ -95,7 +89,7 @@ scratch_intrapred_addr_8x8:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_8x8_mode_ref_filtering(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst)
@@ -111,7 +105,6 @@ ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q:
stmfd sp!, {r4-r12, r14} @store register values to stack
vpush {d8-d15}
-
vld1.u8 {q0}, [r0]! @
vld1.u8 {q1}, [r0]
add r0, r0, #8 @
@@ -141,6 +134,7 @@ ih264_intra_pred_luma_8x8_mode_ref_filtering_a9q:
end_func_ref_filt:
+
vpop {d8-d15}
ldmfd sp!, {r4-r12, pc} @Restoring registers from stack
@@ -149,7 +143,7 @@ end_func_ref_filt:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_vert
@@ -219,10 +213,10 @@ ih264_intra_pred_luma_8x8_mode_vert_a9q:
-@/******************************************************************************
+@******************************************************************************
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_horz
@@ -254,7 +248,7 @@ ih264_intra_pred_luma_8x8_mode_vert_a9q:
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_intra_pred_luma_8x8_mode_horz(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -299,10 +293,10 @@ loop_8x8_horz:
-@/******************************************************************************
+@******************************************************************************
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_dc
@@ -333,7 +327,7 @@ loop_8x8_horz:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_8x8_mode_dc(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -413,7 +407,7 @@ str_pred:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_diag_dl
@@ -444,7 +438,7 @@ str_pred:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_8x8_mode_diag_dl(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -506,7 +500,7 @@ end_func_diag_dl:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_diag_dr
@@ -537,7 +531,7 @@ end_func_diag_dl:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_8x8_mode_diag_dr(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -597,7 +591,7 @@ end_func_diag_dr:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_vert_r
@@ -628,7 +622,7 @@ end_func_diag_dr:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_8x8_mode_vert_r(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -717,7 +711,7 @@ end_func_vert_r:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_horz_d
@@ -748,7 +742,7 @@ end_func_vert_r:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_8x8_mode_horz_d(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -791,7 +785,7 @@ ih264_intra_pred_luma_8x8_mode_horz_d_a9q:
vmov.8 q4, q2
vmov.8 q5, q3
sub r6, r3, #6
- vtrn.8 q4, q5 @
+ vtrn.8 q4, q5 @
vmov.8 q6, q4
vmov.8 q7, q5
sub r5, r3, #4
@@ -835,7 +829,7 @@ end_func_horz_d:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_vert_l
@@ -866,7 +860,7 @@ end_func_horz_d:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_8x8_mode_vert_l(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -887,6 +881,7 @@ ih264_intra_pred_luma_8x8_mode_vert_l_a9q:
stmfd sp!, {r4-r12, r14} @Storing registers to stack
vpush {d8-d15}
+
add r0, r0, #9
vld1.u8 {q0}, [r0]
add r0, r0, #1
@@ -935,7 +930,7 @@ end_func_vert_l:
-@/**
+@**
@*******************************************************************************
@*
@*ih264_intra_pred_luma_8x8_mode_horz_u
@@ -966,7 +961,7 @@ end_func_vert_l:
@* @remarks
@* None
@*
-@*******************************************************************************/
+@*******************************************************************************
@void ih264_intra_pred_luma_8x8_mode_horz_u(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
diff --git a/common/arm/ih264_iquant_itrans_recon_a9.s b/common/arm/ih264_iquant_itrans_recon_a9.s
index f71ca69..4e49f6a 100644
--- a/common/arm/ih264_iquant_itrans_recon_a9.s
+++ b/common/arm/ih264_iquant_itrans_recon_a9.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@ *******************************************************************************
@ * @file
@ * ih264_iquant_itrans_recon_a9.s
@@ -38,8 +38,8 @@
@ * None
@ *
@ *******************************************************************************
-@*/
-@/**
+@*
+@**
@ *******************************************************************************
@ *
@ * @brief
@@ -82,7 +82,7 @@
@ * None
@ *
@ *******************************************************************************
-@ */
+@ *
@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
@ UWORD8 *pu1_pred,
@ UWORD8 *pu1_out,
@@ -225,7 +225,7 @@ ih264_iquant_itrans_recon_4x4_a9:
ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
- @/**
+@**
@ *******************************************************************************
@ *
@ * @brief
@@ -268,7 +268,7 @@ ih264_iquant_itrans_recon_4x4_a9:
@ * None
@ *
@ *******************************************************************************
-@ */
+@ *
@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
@ UWORD8 *pu1_pred,
@ UWORD8 *pu1_out,
@@ -416,7 +416,7 @@ ih264_iquant_itrans_recon_chroma_4x4_a9:
ldmfd sp!, {r4-r12, r15} @Reload the registers from SP
-@/*
+@*
@ *******************************************************************************
@ *
@ * @brief
@@ -459,7 +459,7 @@ ih264_iquant_itrans_recon_chroma_4x4_a9:
@ * None
@ *
@ *******************************************************************************
-@ */
+@ *
@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
@ UWORD8 *pu1_pred,
@ UWORD8 *pu1_out,
diff --git a/common/arm/ih264_iquant_itrans_recon_dc_a9.s b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
index 8d71bdb..97c4724 100644
--- a/common/arm/ih264_iquant_itrans_recon_dc_a9.s
+++ b/common/arm/ih264_iquant_itrans_recon_dc_a9.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@ *******************************************************************************
@ * @file
@ * ih264_iquant_itrans_recon_dc_a9.s
@@ -37,8 +37,8 @@
@ * None
@ *
@ *******************************************************************************
-@*/
-@/**
+@*
+@**
@ *******************************************************************************
@ *
@ * @brief
@@ -83,7 +83,7 @@
@ * None
@ *
@ *******************************************************************************
-@ */
+@ *
@void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
@ UWORD8 *pu1_pred,
@ UWORD8 *pu1_out,
@@ -167,7 +167,7 @@ ih264_iquant_itrans_recon_4x4_dc_a9:
-@/*
+@*
@ *******************************************************************************
@ *
@ * @brief
@@ -212,7 +212,7 @@ ih264_iquant_itrans_recon_4x4_dc_a9:
@ * None
@ *
@ *******************************************************************************
-@ */
+@ *
@void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
@ UWORD8 *pu1_pred,
@ UWORD8 *pu1_out,
@@ -300,7 +300,7 @@ ih264_iquant_itrans_recon_8x8_dc_a9:
ldmfd sp!, {r4-r8, r15}
-@ /*
+@ *
@ ********************************************************************************
@ *
@ * @brief This function reconstructs a 4x4 sub block from quantized residue and
@@ -328,7 +328,7 @@ ih264_iquant_itrans_recon_8x8_dc_a9:
@ * @remarks none
@ *
@ *******************************************************************************
-@ */
+@ *
@ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
@ UWORD8 *pu1_pred,
@ UWORD8 *pu1_out,
@@ -368,6 +368,7 @@ ih264_iquant_itrans_recon_chroma_4x4_dc_a9:
vmov.u16 q15, #0x00ff
+
vld1.u8 d18, [r2], r0 @load out (8 bit size) -8 coeffs
vaddw.u8 q1, q0, d2 @Add pred
vld1.u8 d19, [r2], r0
diff --git a/common/arm/ih264_itrans_recon_a9.s b/common/arm/ih264_itrans_recon_a9.s
index 1d74da5..769d5d7 100644
--- a/common/arm/ih264_itrans_recon_a9.s
+++ b/common/arm/ih264_itrans_recon_a9.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@ *******************************************************************************
@ * @file
@ * ih264_itrans_recon_neon_a9.s
@@ -33,8 +33,8 @@
@ * None
@ *
@ *******************************************************************************
-@*/
-@/**
+@*
+@**
@ *******************************************************************************
@ *
@ * @brief
@@ -72,7 +72,7 @@
@ *
@ *
@ *******************************************************************************
-@ */
+@ *
@void ih264_itrans_recon_4x4(
@ WORD16 *pi2_src,
@ UWORD8 *pu1_pred,
diff --git a/common/arm/ih264_mem_fns_neon.s b/common/arm/ih264_mem_fns_neon.s
index 2808897..39ad9b3 100644
--- a/common/arm/ih264_mem_fns_neon.s
+++ b/common/arm/ih264_mem_fns_neon.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@ *******************************************************************************
@ * @file
@ * ih264_mem_fns_neon.s
@@ -40,9 +40,9 @@
@ * None
@ *
@ *******************************************************************************
-@*/
+@*
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -65,7 +65,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
@ UWORD8 *pu1_src,
@ UWORD8 num_bytes)
@@ -94,7 +94,7 @@ loop_neon_memcpy_mul_8:
@*******************************************************************************
-@*/
+@*
@void ih264_memcpy(UWORD8 *pu1_dst,
@ UWORD8 *pu1_src,
@ UWORD8 num_bytes)
@@ -143,6 +143,8 @@ loop_memcpy:
+
+
.global ih264_memset_mul_8_a9q
ih264_memset_mul_8_a9q:
@@ -208,6 +210,8 @@ loop_memset:
+
+
.global ih264_memset_16bit_mul_8_a9q
ih264_memset_16bit_mul_8_a9q:
diff --git a/common/arm/ih264_padding_neon.s b/common/arm/ih264_padding_neon.s
index 9bab268..e7a1f91 100644
--- a/common/arm/ih264_padding_neon.s
+++ b/common/arm/ih264_padding_neon.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@*
@ *******************************************************************************
@ * @file
@ * ih264_padding_neon.s
@@ -39,10 +39,10 @@
@ * None
@ *
@ *******************************************************************************
-@*/
+@*
-@/**
+@**
@*******************************************************************************
@*
@* @brief pad at the top of a 2d array
@@ -67,7 +67,7 @@
@* @remarks none
@*
@*******************************************************************************
-@*/
+@*
@void ih264_pad_top(UWORD8 *pu1_src,
@ WORD32 src_strd,
@ WORD32 wd,
@@ -110,7 +110,7 @@ loop_neon_pad_top:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -147,7 +147,7 @@ loop_neon_pad_top:
@* None
@*
@*******************************************************************************
-@*/
+@*
@#if PAD_LEFT_LUMA == C
@void ih264_pad_left_luma(UWORD8 *pu1_src,
@ WORD32 src_strd,
@@ -160,6 +160,7 @@ loop_neon_pad_top:
@ r3 => pad_size
+
.global ih264_pad_left_luma_a9q
ih264_pad_left_luma_a9q:
@@ -245,7 +246,7 @@ end_func:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -282,7 +283,7 @@ end_func:
@* None
@*
@*******************************************************************************
-@*/
+@*
@#if PAD_LEFT_CHROMA == C
@void ih264_pad_left_chroma(UWORD8 *pu1_src,
@ WORD32 src_strd,
@@ -373,7 +374,7 @@ end_func_l_c:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -410,7 +411,7 @@ end_func_l_c:
@* None
@*
@*******************************************************************************
-@*/
+@*
@#if PAD_RIGHT_LUMA == C
@void ih264_pad_right_luma(UWORD8 *pu1_src,
@ WORD32 src_strd,
@@ -519,7 +520,7 @@ end_func_r:
-@/**
+@**
@*******************************************************************************
@*
@* @brief
@@ -556,7 +557,7 @@ end_func_r:
@* None
@*
@*******************************************************************************
-@*/
+@*
@#if PAD_RIGHT_CHROMA == C
@void ih264_pad_right_chroma(UWORD8 *pu1_src,
@ WORD32 src_strd,
diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s
deleted file mode 100644
index 08821f5..0000000
--- a/common/arm/ih264_resi_trans_a9.s
+++ /dev/null
@@ -1,604 +0,0 @@
-@/******************************************************************************
-@ *
-@ * Copyright (C) 2015 The Android Open Source Project
-@ *
-@ * Licensed under the Apache License, Version 2.0 (the "License");
-@ * you may not use this file except in compliance with the License.
-@ * You may obtain a copy of the License at:
-@ *
-@ * http://www.apache.org/licenses/LICENSE-2.0
-@ *
-@ * Unless required by applicable law or agreed to in writing, software
-@ * distributed under the License is distributed on an "AS IS" BASIS,
-@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ * See the License for the specific language governing permissions and
-@ * limitations under the License.
-@ *
-@ *****************************************************************************
-@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
-@*/
-@/**
-@*******************************************************************************
-@* @file
-@* ih264_resi_trans_a9.s
-@*
-@* @brief
-@* Contains function definitions for residual and forward trans
-@*
-@* @author
-@* Ittiam
-@*
-@* @par List of Functions:
-@* ih264_resi_trans_4x4_a9
-@* ih264_resi_trans_8x8_a9
-@* @remarks
-@* None
-@*
-@*******************************************************************************
-
-
-.text
-.p2align 2
-@*****************************************************************************
-@*
-@* Function Name : ih264_resi_trans_4x4_a9
-@* Description : This function does cf4 of H264 followed by and approximate scaling
-@*
-@* Arguments :
-@ R0 :pointer to src buffer
-@ R1 :pointer to pred buffer
-@ R2 :pointer to dst buffer
-@ R3 :src_stride
-@ STACk :pred_stride,dst_stride
-
-@* Values Returned : NONE
-@*
-@* Register Usage :
-@* Stack Usage :
-@* Cycles : Around
-@* Interruptiaility : Interruptable
-@*
-@* Known Limitations
-@* \Assumptions :
-@*
-@* Revision History :
-@* DD MM YYYY Author(s) Changes
-@* 30 12 2009 100633 First version
-@*
-@*****************************************************************************
-
-
- .global ih264_resi_trans_4x4_a9
- .extern g_scal_coff_h264_4x4
-g_scal_coff_h264_4x4_addr:
- .long g_scal_coff_h264_4x4 - 4x4lbl - 8
-
-ih264_resi_trans_4x4_a9:
-
- @R0 :pointer to src buffer
- @R1 :pointer to pred buffer
- @R2 :pointer to dst buffer
- @R3 :src_stride
- @STACk :pred_stride,dst_stride
-
- push {r4-r12, lr} @push all the variables first
-
- mov r6, sp
- add r6, r6, #40 @skip the 10 registers just pushed (40 bytes) to reach the two stack arguments
- ldmfd r6, {r4-r5} @load the strides into registers
- @R4 pred_stride
- @R5 dst_stride
-
-
- @we have to give the stride as post inrement in VLDR1
- @but since thr stride is from end of row 1 to start of row 2,
- @we need to add the size of the curent row to strides ie we need to add 4 to it (4 bytes)
- @ADD R3,#4
- @ADD R4,#4
- @ADD R5,#4
- @in case of dst the stride represnts 16 bit ie 2*8bits
- @hence we need to add #4 to it and thenm multiply by 2
- @--------------------function loading done------------------------
-
- @lets find residual
- @data is like 1a -> d0[1:31] d0[32:64]
- @ a b c d # # # #
- vld1.u8 d30, [r0], r3 @load 4 pixels of row1 current buffer
- vld1.u8 d31, [r1], r4 @load 4 pixels of row1 pred buffer
- @ data is like 1a -> q4[1:63] q4[64:148]
- @ d8[1:63] d9[1:63]
- @ a b c d # # # #
-
- vld1.u8 d28, [r0], r3 @load row 2 of src to d28[0]
- vld1.u8 d29, [r1], r4 @load row2 of pred to d29[0]
-
- vld1.u8 d26, [r0], r3 @load row 3 of src to d26[0]
- vsubl.u8 q0, d30, d31 @curr - pred for row one
-
- vld1.u8 d27, [r1], r4 @load row 3of pred t0 d27[0]
- vsubl.u8 q1, d28, d29 @row 2 of src - pred into q1
-
- vld1.u8 d24, [r0], r3 @load row 4 of src to d24[0]
-
- vld1.u8 d25, [r1], r4 @load row 4 of pred to d25[0]
- vsubl.u8 q2, d26, d27 @load src-pred row 3 to d[2]
-
- lsl r5, r5, #2 @ multiply dst stride by 4 since we are storing 32 bit values
- ldr r6, g_scal_coff_h264_4x4_addr
-4x4lbl:
- add r6, r6, pc @ load the address of global array
-
- vsubl.u8 q3, d24, d25 @row 4 of src - pred into q3
-
- @after this
- @D0 -> 1a
- @D2 -> 2a
- @D4 -> 3a
- @D6 -> 4a
-
- @transpose the matrix so that we can do the horizontal transform first
- @#1 #2 #3 #4
- @a b c d ---- D0
- @e f g h -----D2
- @i j k l -----D4
- @m n o p -----D6
- @transpose the inner 2x2 blocks
- vtrn.16 d0, d2
- vld1.s16 {q10}, [r6]! @ load the scaling values 0-7;
- vtrn.16 d4, d6
- @a e c g
- @b f d h
- @i m k o
- @j n l p
- vtrn.32 d0, d4
- vtrn.32 d2, d6
- @a e i m #1 -- D0 --- x4
- @b f j n #2 -- D2 --- x5
- @c g k o #3 -- D4 ----x6
- @d h l p #4 -- D6 ----x7
-
- @we have loaded the residuals into the registers , now we need to add and subtract them
- @let us do the horiz transform first
-
- vsub.s16 d5, d2, d4 @x2 = x5-x6
- vsub.s16 d7, d0, d6 @x3 = x4-x7;
-
- vadd.s16 d3, d2, d4 @x1 = x5+x6
- vadd.s16 d1, d0, d6 @x0 = x4+x7
-
-
- vshl.s16 d31, d7, #1 @
- vshl.s16 d30, d5, #1 @
-
- vadd.s16 d0, d1, d3 @x0 + x1;
- vsub.s16 d4, d1, d3 @x0 - x1;
-
- vadd.s16 d2, d31, d5 @U_SHIFT(x3,1,shft) + x2;
- vsub.s16 d6, d7, d30 @x3 - U_SHIFT(x2,1,shft);
-
- @taking transform again so as to make do vert transform
- vtrn.16 d0, d2
- vtrn.16 d4, d6
-
- vtrn.32 d0, d4
- vtrn.32 d2, d6
-
- @let us do vertical transform
- @same code as horiz
-
- vadd.s16 d1, d0, d6 @x0 = x4+x7
- vadd.s16 d3, d2, d4 @x1 = x5+x6
- vsub.s16 d7, d0, d6 @x3 = x4-x7;
- vsub.s16 d5, d2, d4 @x2 = x5-x6
-
-
-@Since we are going to do scal / quant or whatever, we are going to divide by
-@a 32 bit number. So we have to expand the values
-
- @VADDL.S16 Q12,D1,D3;x0 + x1
- @VSUBL.S16 Q14,D1,D3;x0 - x1
-
- @VSHL.S16 D8,D5,#1;
- @VSHL.S16 D9,D7,#1;
-
- @VADDL.S16 Q13,D9,D5 ; + x2
- @VSUBL.S16 Q15,D7,D8 ;x3 - U_SHIFT(x2,1,shft)
-
-@scaling follows
-
-@now we need to do the scaling,so load the scaling matrix
-@multiplying by the scaling coefficient; the results go to q4-q7
-
- vadd.s16 d24, d3, d1 @x4 = x0 + x1
- vsub.s16 d28, d1, d3 @x6 = x0 - x1
-
- vshl.s16 d0, d7, #1 @ U_SHIFT(x3,1,shft)
- vmull.s16 q4, d24, d20 @x4*s0
-
- vshl.s16 d2, d5, #1 @ U_SHIFT(x2,1,shft)
-
- vadd.s16 d26, d0, d5 @x5 = U_SHIFT(x3,1,shft) + x2
- vmull.s16 q5, d26, d21 @x5*s1
-
- vst1.s32 {q4}, [r2], r5 @save 4 pixels of row1 current buffer and increment pointer by stride
-
- vld1.s16 {q10}, [r6] @load 8-16 scaling coeffcients
-
- vsub.s16 d30, d7, d2 @x7 = x3 - U_SHIFT(x2,1,shft)
-
- vmull.s16 q6, d28, d20 @x6*s2
- vst1.s32 {q5}, [r2], r5
-
- vmull.s16 q7, d30, d21 @x7*s3
-
-
- vst1.s32 {q6}, [r2], r5
- vst1.s32 {q7}, [r2]
-
- pop {r4-r12, pc} @pop back all variables
-
-
-
-
-@*****************************************************************************
-@* Function Name : ih264_resi_trans_8x8_a9
-@* Description : This function does cf8 followd by an approximate normalization of H264
-@*
-@* Arguments :
-@* R0 :pointer to src buffer
-@ R1 :pointer to pred buffer
-@ R2 :pointer to dst buffer
-@ R3 :src_stride
-@ STACk :pred_stride,dst_st
-@*
-@*
-@* Values Returned : NONE
-@*
-@* Register Usage :
-@* Stack Usage :
-@* Cycles : Around
-@* Interruptiaility : Interruptable
-@*
-@* Known Limitations
-@* \Assumptions :
-@*
-@* Revision History :
-@* DD MM YYYY Author(s) Changes
-@* 30 12 2009 100633 First version
-@*
-@*****************************************************************************
-
-
- .global ih264_resi_trans_8x8_a9
- .extern g_scal_coff_h264_8x8
-g_scal_coff_h264_8x8_addr:
- .long g_scal_coff_h264_8x8 - 8x8lbl - 8
-
-
-ih264_resi_trans_8x8_a9:
-
- @R0 :pointer to src buffer
- @R1 :pointer to pred buffer
- @R2 :pointer to dst buffer
- @R3 :src_stride
- @STACk :pred_stride,dst_stride
-
- push {r4-r12, lr} @push all the variables first
-
- mov r6, sp
- add r6, r6, #40 @skip the 10 registers just pushed (40 bytes) to reach the two stack arguments
- ldmfd r6, {r4-r5} @load the strides into registers
- @R4 pred_stride
- @R5 dst_stride
-
- @we have to give the stride as post inrement in vst1
- @in case of dst the stride represnts 16 bit ie 2*8bits
- @hence we need to add #4 to it and thenm multiply by 2
- @--------------------function loading done------------------------
-
- @lets find residual
- @data is like 1a -> d0[1:31] d0[32:64]
- @ a b c d # # # #
- vld1.u8 d30, [r0], r3 @load 8 pixels of row1 current buffer
- vld1.u8 d31, [r1], r4 @load 8 pixels of row1 pred buffer
-
- vld1.u8 d28, [r0], r3 @src rw2
- vld1.u8 d29, [r1], r4 @pred rw2
- vsubl.u8 q0, d30, d31 @src-pred rw1
-
- vld1.u8 d26, [r0], r3
- vld1.u8 d27, [r1], r4
- vsubl.u8 q1, d28, d29
-
- vld1.u8 d24, [r0], r3
- vld1.u8 d25, [r1], r4
- vsubl.u8 q2, d26, d27
-
- vld1.u8 d22, [r0], r3
- vld1.u8 d23, [r1], r4
- vsubl.u8 q3, d24, d25
-
- vld1.u8 d20, [r0], r3
- vld1.u8 d21, [r1], r4
- vsubl.u8 q4, d22, d23
-
- vld1.u8 d18, [r0], r3
- vld1.u8 d19, [r1], r4
- vsubl.u8 q5, d20, d21
-
- vld1.u8 d16, [r0], r3
- vld1.u8 d17, [r1], r4
- vsubl.u8 q6, d18, d19
-
- lsl r5, r5, #2
-
-
- vsubl.u8 q7, d16, d17
-
- @after this
- @Q0 -> 1a
- @Q1 -> 2a
- @Q2 -> 3a
- @Q3 -> 4a
- @Q4 -> 5a
- @Q5 -> 6a
- @Q6 -> 7a
- @Q7 -> 8a
-
- @transpose the matrix so that we can do the horizontal transform first
-
- @transpose the inner 2x2 blocks
- vtrn.16 q0, q1
- vtrn.16 q2, q3
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- @transpose the inner 4x4 blocks
- vtrn.32 q0, q2
- vtrn.32 q1, q3
-
- vtrn.32 q4, q6
- vtrn.32 q5, q7
-
- @transpose the outer 8x8 blocks
- vswp d1, d8
- vswp d7, d14
- vswp d3, d10
- vswp d5, d12
- @transpose done
-
-@@this point we will have data in Q0-Q7
-@Q7 will be populated within 2 clock cycle
-@all others are availabe @ this clock cycle
-
- @we have loaded the residuals into the registers , now we need to add and subtract them
- @let us do the horiz transform first
-
- vadd.s16 q8, q0, q7 @ a0 = r0 + r7;
- vadd.s16 q9, q1, q6 @ a1 = r1 + r6;
- vadd.s16 q10, q2, q5 @ a2 = r2 + r5;
- vadd.s16 q11, q3, q4 @ a3 = r3 + r4;
-
- vsub.s16 q12, q0, q7 @ b0 = r0 - r7;
- vsub.s16 q13, q1, q6 @ b1 = r1 - r6;
- vsub.s16 q15, q3, q4 @ b3 = r3 - r4;
- vsub.s16 q14, q2, q5 @ b2 = r2 - r5;
-
- vadd.s16 q1, q8, q11 @ a4 = a0 + a3;
- vadd.s16 q3, q9, q10 @ a5 = a1 + a2;
- vsub.s16 q7, q9, q10 @ a7 = a1 - a2;
- vsub.s16 q5, q8, q11 @ a6 = a0 - a3;
-
- ldr r6, g_scal_coff_h264_8x8_addr
-8x8lbl:
- add r6, r6, pc @ load the address of global array
-
- vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5;
- vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
-
- vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5;
-
- vadd.s16 q2, q5, q8 @
-
-
- vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
- vsub.s16 q6, q9, q7 @
-
-@do not change Q0,Q2.Q4,Q6 they contain results
-@Q1,Q3,Q5,Q7 TO STORE RESULTS
-@Q8 Q9 Q10 Q11 USE @WILL
-
- vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft)
- vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft)
- vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft)
- vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft)
-
- vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0);
- vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1);
- vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2);
- vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3);
-
- vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0);
- vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1);
- vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2);
- vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3);
-
- vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
- vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
- vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
- vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
-
- vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft)
- vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft);
- vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft);
- vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft);
-
-
- vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
- vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
- vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
- vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
-
- @------------horiz transform done-------------------------
- @results are in Q0-Q7
- @all other neon registes can be used at will
-
-@doing vertical transform
-@code exact copy of horiz transform above
-
- @transpose the inner 2x2 blocks
- vtrn.16 q0, q1
- vtrn.16 q2, q3
- vtrn.16 q4, q5
- vtrn.16 q6, q7
-
- @transpose the inner 4x4 blocks
- vtrn.32 q0, q2
- vtrn.32 q1, q3
-
- vtrn.32 q4, q6
- vtrn.32 q5, q7
-
- @transpose the outer 8x8 blocks
- vswp d1, d8
- vswp d3, d10
- vswp d5, d12
- vswp d7, d14
-
- @transpose done
-
- vadd.s16 q8, q0, q7 @ a0 = r0 + r7;
- vadd.s16 q9, q1, q6 @ a1 = r1 + r6;
- vadd.s16 q10, q2, q5 @ a2 = r2 + r5;
- vadd.s16 q11, q3, q4 @ a3 = r3 + r4;
-
- vsub.s16 q12, q0, q7 @ b0 = r0 - r7;
- vsub.s16 q13, q1, q6 @ b1 = r1 - r6;
- vsub.s16 q14, q2, q5 @ b2 = r2 - r5;
- vsub.s16 q15, q3, q4 @ b3 = r3 - r4;
-
- vadd.s16 q1, q8, q11 @ a4 = a0 + a3;
- vadd.s16 q3, q9, q10 @ a5 = a1 + a2;
- vsub.s16 q5, q8, q11 @ a6 = a0 - a3;
- vsub.s16 q7, q9, q10 @ a7 = a1 - a2;
-
-
- vadd.s16 q0, q1, q3 @ pi2_res[0] = a4 + a5;
-
- vshr.s16 q8, q7, #1 @ pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
- @DSHIFT_TO_0 Q8,Q7,#1,#0
- vadd.s16 q2, q5, q8 @
-
- vsub.s16 q4, q1, q3 @ pi2_res[4] = a4 - a5;
-
- vshr.s16 q9, q5, #1 @ pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
- vsub.s16 q6, q9, q7 @
-
-@do not change Q0,Q2.Q4,Q6 they contain results
-@Q1,Q3,Q5,Q7 TO STORE RESULTS
-@Q8 Q9 Q10 Q11 USE @WILL
-
- vshr.s16 q1, q12, #1 @ D_SHIFT(b0,1,shft)
- vshr.s16 q3, q13, #1 @ D_SHIFT(b1,1,shft)
- vshr.s16 q5, q14, #1 @ D_SHIFT(b2,1,shft)
- vshr.s16 q7, q15, #1 @ D_SHIFT(b3,1,shft)
-
-
- vadd.s16 q8, q1, q12 @ (D_SHIFT(b0,1,shft) + b0);
- vadd.s16 q9, q3, q13 @ (D_SHIFT(b1,1,shft) + b1);
- vadd.s16 q10, q5, q14 @ (D_SHIFT(b2,1,shft) + b2);
- vadd.s16 q11, q7, q15 @ (D_SHIFT(b3,1,shft) + b3);
-
- vadd.s16 q1, q14, q8 @ b2 + (D_SHIFT(b0,1,shft) + b0);
- vadd.s16 q3, q15, q10 @ b3 + (D_SHIFT(b2,1,shft) + b2);
- vsub.s16 q5, q15, q9 @ b3 - (D_SHIFT(b1,1,shft) + b1);
- vsub.s16 q7, q11, q14 @ -b2 + (D_SHIFT(b3,1,shft) + b3);
-
- vadd.s16 q8, q13, q1 @ b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
- vsub.s16 q9, q12, q3 @ b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
- vadd.s16 q10, q12, q5 @ b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
- vadd.s16 q11, q13, q7 @ b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
-
- vshr.s16 q15, q8, #2 @ D_SHIFT(b4,2,shft)
- vshr.s16 q14, q9, #2 @ D_SHIFT(b5,2,shft);
- vshr.s16 q13, q10, #2 @ D_SHIFT(b6,2,shft);
- vshr.s16 q12, q11, #2 @ D_SHIFT(b7,2,shft);
-
-
-@since we are going to scal by small values, we need not expand the guys to 32 bit bit values
- vsub.s16 q5, q10, q14 @ pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
- vsub.s16 q7, q15, q11 @ pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
- vadd.s16 q3, q9, q13 @ pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
- vadd.s16 q1, q8, q12 @ pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
-
- @------------vert transform done-------------------------
- @results are in Q0-Q7
- @all other neon registes can be used at will
-
- @scaling
- @since the 8x8 scaling matrix repeats in 1x4,1x4 block ,
- @we need only load 4 values for each row and in total 4 rows
- vld1.s16 {q14-q15}, [r6] @
-
- @since we need to get a 32 bit o/p for two 16 bit multiplications
- @we need a VMULL instruction
-@-----------------------------first and second row
-
- vmull.s16 q8, d0, d28 @scale the first row first 4 elem
- vmull.s16 q9, d28, d1 @scale the first row last 4 elems
-
- vmull.s16 q10, d2, d29 @ scale second row first 4 elem
- vmull.s16 q11, d29, d3 @scale the second row last 4 elem
- vmull.s16 q12, d4, d30 @scale third row first 4 elem
-
- vst1.s32 {q8, q9}, [r2], r5 @ write the first row complete
-
- vmull.s16 q13, d30, d5 @scale the third row last 4 elem
- vmull.s16 q8, d6, d31 @scale the fourth row first 4 elem
-
-
- vst1.s32 {q10, q11}, [r2], r5 @store the second row complete
-
-@------------------------------- 3rd and 4th row
-
- vmull.s16 q9, d31, d7 @scale the fourth row second column
-
- vst1.s32 {q12, q13}, [r2], r5 @store the third row complete
-
- vmull.s16 q10, d8, d28 @scale the 5th row fisrst 4 elms
- vmull.s16 q11, d28, d9 @scale the 5th row second 4 elems
-
- vmull.s16 q12, d10, d29 @scale the 6th row first4 elements
-
-
- vst1.s32 {q8, q9}, [r2], r5 @store the fourth row complete
-
-@--------------------------------5th and 6th row
-
- vmull.s16 q13, d29, d11 @scale 6th row sendond 4 elems
-
- vmull.s16 q8, d12, d30 @scale 7th rw first 4 elms
-
- vst1.s32 {q10, q11}, [r2], r5 @store the 5th row complete
-
- vmull.s16 q9, d30, d13 @scale 7th rw second 4 elms
- vmull.s16 q10, d14, d31 @scale 8th rw forst 4 elms
-
-
- vst1.s32 {q12, q13}, [r2], r5 @store 6th row
-
-@----------------------------------7th and 8th row
- vmull.s16 q11, d31, d15 @scale 8th row second 4 elms
-
- vst1.s32 {q8, q9}, [r2], r5 @store 7th row
- vst1.s32 {q10, q11}, [r2], r5 @store 8th row
-
-@----------------------------------done writing
-
- pop {r4-r12, pc} @pop back all variables
-
-
-
-
-
-
diff --git a/common/arm/ih264_resi_trans_quant_a9.s b/common/arm/ih264_resi_trans_quant_a9.s
index caf362e..bb836bd 100644
--- a/common/arm/ih264_resi_trans_quant_a9.s
+++ b/common/arm/ih264_resi_trans_quant_a9.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@*******************************************************************************
@* @file
@* ih264_resi_trans_quant_a9.s
diff --git a/common/arm/ih264_weighted_bi_pred_a9q.s b/common/arm/ih264_weighted_bi_pred_a9q.s
index ccae779..33859e6 100644
--- a/common/arm/ih264_weighted_bi_pred_a9q.s
+++ b/common/arm/ih264_weighted_bi_pred_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_weighted_bi_pred_a9q.s
@@ -37,7 +37,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@*******************************************************************************
@* @function
@* ih264_weighted_bi_pred_luma_a9q()
@@ -96,7 +96,7 @@
@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
@*
@*******************************************************************************
-@*/
+@*
@void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
@ UWORD8 *pu1_src2,
@ UWORD8 *pu1_dst,
@@ -411,7 +411,7 @@ end_loops:
@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
@*
@*******************************************************************************
-@*/
+@*
@void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
@ UWORD8 *pu1_src2,
@ UWORD8 *pu1_dst,
diff --git a/common/arm/ih264_weighted_pred_a9q.s b/common/arm/ih264_weighted_pred_a9q.s
index 1ce94d0..81d26d4 100644
--- a/common/arm/ih264_weighted_pred_a9q.s
+++ b/common/arm/ih264_weighted_pred_a9q.s
@@ -17,7 +17,7 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
@******************************************************************************
@* @file
@* ih264_weighted_pred_a9q.s
@@ -37,7 +37,7 @@
@* None
@*
@*******************************************************************************
-@*/
+@*
@*******************************************************************************
@* @function
@* ih264_weighted_pred_luma_a9q()
@@ -84,7 +84,7 @@
@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
@*
@*******************************************************************************
-@*/
+@*
@void ih264_weighted_pred_luma_a9q(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@@ -314,7 +314,7 @@ end_loops:
@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
@*
@*******************************************************************************
-@*/
+@*
@void ih264_weighted_pred_chroma_a9q(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
diff --git a/common/armv8/ih264_default_weighted_pred_av8.s b/common/armv8/ih264_default_weighted_pred_av8.s
index aefb902..6823015 100644
--- a/common/armv8/ih264_default_weighted_pred_av8.s
+++ b/common/armv8/ih264_default_weighted_pred_av8.s
@@ -24,7 +24,6 @@
//*
//* @brief
//* Contains function definitions for default weighted prediction.
-//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
//*
//* @author
//* Kaushik Senthoor R
diff --git a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
index 38934c9..9564f99 100644
--- a/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
+++ b/common/armv8/ih264_inter_pred_filters_luma_vert_av8.s
@@ -247,8 +247,8 @@ loop_16: //when wd=16
st1 {v30.2s, v31.2s}, [x1], x3 // store row 6
sqrshrun v30.8b, v28.8h, #5 // dst[0_0] = CLIP_U8((temp +16) >> 5)
- swp v0.8b v4.8b
- swp v1.8b v5.8b
+ swp v0.8b, v4.8b
+ swp v1.8b, v5.8b
@@ -257,8 +257,8 @@ loop_16: //when wd=16
mov v7.8b, v11.8b
subs x12, x14, #1 // if height==16 - looping
- swp v4.8b v8.8b
- swp v5.8b v9.8b
+ swp v4.8b, v8.8b
+ swp v5.8b, v9.8b
sqrshrun v31.8b, v20.8h, #5 // dst[0_8] = CLIP_U8((temp4 +16) >> 5)
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
index ea7645e..202c516 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_hpel_av8.s
@@ -68,7 +68,7 @@
ih264_inter_pred_luma_horz_hpel_vert_hpel_av8:
- //store register values to stack
+ //store register values to stack
push_v_regs
stp x19, x20, [sp, #-16]!
@@ -811,7 +811,7 @@ loop_4:
bgt loop_4
end_func:
- //Restoring registers from stack
+ //Restoring registers from stack
ldp x19, x20, [sp], #16
pop_v_regs
ret
diff --git a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
index 3737e3f..38f971b 100644
--- a/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
+++ b/common/armv8/ih264_inter_pred_luma_horz_hpel_vert_qpel_av8.s
@@ -1111,7 +1111,7 @@ loop_4:
bgt loop_4
end_func:
- //Restoring registers from stack
+ //Restoring registers from stack
ldp x19, x20, [sp], #16
pop_v_regs
ret
diff --git a/common/armv8/ih264_intra_pred_chroma_av8.s b/common/armv8/ih264_intra_pred_chroma_av8.s
index 62edfdc..2c5efb3 100644
--- a/common/armv8/ih264_intra_pred_chroma_av8.s
+++ b/common/armv8/ih264_intra_pred_chroma_av8.s
@@ -262,7 +262,7 @@ ih264_intra_pred_chroma_8x8_mode_horz_av8:
- push_v_regs
+ push_v_regs
ld1 {v0.8h}, [x0]
dup v10.8h, v0.h[7]
diff --git a/common/armv8/ih264_weighted_bi_pred_av8.s b/common/armv8/ih264_weighted_bi_pred_av8.s
index f7d0846..96ef50a 100644
--- a/common/armv8/ih264_weighted_bi_pred_av8.s
+++ b/common/armv8/ih264_weighted_bi_pred_av8.s
@@ -24,7 +24,6 @@
//*
//* @brief
//* Contains function definitions for weighted biprediction.
-//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
//*
//* @author
//* Kaushik Senthoor R
diff --git a/common/armv8/ih264_weighted_pred_av8.s b/common/armv8/ih264_weighted_pred_av8.s
index 6a03875..ec5bb7a 100644
--- a/common/armv8/ih264_weighted_pred_av8.s
+++ b/common/armv8/ih264_weighted_pred_av8.s
@@ -24,7 +24,6 @@
//*
//* @brief
//* Contains function definitions for weighted prediction.
-//* Functions are coded using NEON intrinsics and can be compiled using ARM RVCT
//*
//* @author
//* Kaushik Senthoor R
diff --git a/common/ih264_dpb_mgr.c b/common/ih264_dpb_mgr.c
index 8e087d3..9380b7e 100644
--- a/common/ih264_dpb_mgr.c
+++ b/common/ih264_dpb_mgr.c
@@ -536,7 +536,7 @@ WORD32 ih264_dpb_mgr_alternate_ref_fields(dpb_mgr_t *ps_dpb_mgr,
BOTTOM_FIELD:TOP_FIELD;
}
- if((reference_type == SHORT_TERM_REF))
+ if(reference_type == SHORT_TERM_REF)
{
ps_dpb_mgr->ps_dpb_short_term_head = ps_dpb_head->ps_prev_dpb;
}
diff --git a/common/ithread.c b/common/ithread.c
index 4ffb98a..25a8cd0 100644
--- a/common/ithread.c
+++ b/common/ithread.c
@@ -327,6 +327,11 @@ WORD32 ithread_set_affinity(WORD32 core_id)
return 1;
}
+void ithread_set_name(CHAR *pc_thread_name)
+{
+ return;
+}
+
#else
UWORD32 ithread_get_handle_size(void)
diff --git a/common/x86/ih264_deblk_luma_ssse3.c b/common/x86/ih264_deblk_luma_ssse3.c
index 440d5f0..e29bebb 100644
--- a/common/x86/ih264_deblk_luma_ssse3.c
+++ b/common/x86/ih264_deblk_luma_ssse3.c
@@ -856,7 +856,7 @@ void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
{
UWORD8 u1_Bs, u1_Bs1;
- UWORD32 j = 0;
+ WORD32 j = 0;
__m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
__m128i int1, int2, int3, int4, high1, high2;
diff --git a/common/x86/ih264_ihadamard_scaling_sse42.c b/common/x86/ih264_ihadamard_scaling_sse42.c
index 895291b..d68d105 100644
--- a/common/x86/ih264_ihadamard_scaling_sse42.c
+++ b/common/x86/ih264_ihadamard_scaling_sse42.c
@@ -86,14 +86,19 @@
*
*******************************************************************************
*/
-void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out,
- const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
- UWORD32 u4_qp_div_6, WORD32* pi4_tmp) {
+void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src,
+ WORD16* pi2_out,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD32* pi4_tmp)
+{
__m128i src_r0_r1, src_r2_r3;
__m128i src_r0, src_r1, src_r2, src_r3;
__m128i temp0, temp1, temp2, temp3;
__m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6)));
__m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
+ UNUSED (pi4_tmp);
src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
@@ -171,12 +176,15 @@ void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out,
src_r3 = _mm_mullo_epi32(src_r3, mult_val);
//Scaling
- if (u4_qp_div_6 >= 6) {
+ if(u4_qp_div_6 >= 6)
+ {
src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6);
src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6);
src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6);
src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6);
- } else {
+ }
+ else
+ {
temp0 = _mm_add_epi32(src_r0, add_rshift);
temp1 = _mm_add_epi32(src_r1, add_rshift);
temp2 = _mm_add_epi32(src_r2, add_rshift);
@@ -194,16 +202,17 @@ void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, WORD16* pi2_out,
}
void ih264_ihadamard_scaling_2x2_uv_sse42(WORD16* pi2_src,
- WORD16* pi2_out,
- const UWORD16 *pu2_iscal_mat,
- const UWORD16 *pu2_weigh_mat,
- UWORD32 u4_qp_div_6,
- WORD32* pi4_tmp)
+ WORD16* pi2_out,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD32* pi4_tmp)
{
- UNUSED(pi4_tmp);
__m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
__m128i zero_8x16b = _mm_setzero_si128();
__m128i scale_val = _mm_set1_epi32((WORD32)(pu2_iscal_mat[0] * pu2_weigh_mat[0]));
+ UNUSED(pi4_tmp);
+
src = _mm_loadu_si128((__m128i *) pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3
sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits
diff --git a/common/x86/ih264_ihadamard_scaling_ssse3.c b/common/x86/ih264_ihadamard_scaling_ssse3.c
index 232d9fa..1b940ea 100644
--- a/common/x86/ih264_ihadamard_scaling_ssse3.c
+++ b/common/x86/ih264_ihadamard_scaling_ssse3.c
@@ -85,9 +85,13 @@
*
*******************************************************************************
*/
-void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out,
- const UWORD16 *pu2_iscal_mat, const UWORD16 *pu2_weigh_mat,
- UWORD32 u4_qp_div_6, WORD32* pi4_tmp) {
+void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src,
+ WORD16* pi2_out,
+ const UWORD16 *pu2_iscal_mat,
+ const UWORD16 *pu2_weigh_mat,
+ UWORD32 u4_qp_div_6,
+ WORD32* pi4_tmp)
+{
int val = 0xFFFF;
__m128i src_r0_r1, src_r2_r3, sign_reg, zero_8x16b = _mm_setzero_si128();
__m128i src_r0, src_r1, src_r2, src_r3;
@@ -96,6 +100,8 @@ void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out,
__m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);
__m128i mask = _mm_set1_epi32(val);
+ UNUSED (pi4_tmp);
+
mult_val = _mm_and_si128(mult_val, mask);
src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
@@ -177,12 +183,15 @@ void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src, WORD16* pi2_out,
src_r3 = _mm_madd_epi16(src_r3, mult_val);
//Scaling
- if (u4_qp_div_6 >= 6) {
+ if(u4_qp_div_6 >= 6)
+ {
src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6);
src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6);
src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6);
src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6);
- } else {
+ }
+ else
+ {
temp0 = _mm_add_epi32(src_r0, add_rshift);
temp1 = _mm_add_epi32(src_r1, add_rshift);
temp2 = _mm_add_epi32(src_r2, add_rshift);
diff --git a/common/x86/ih264_inter_pred_filters_ssse3.c b/common/x86/ih264_inter_pred_filters_ssse3.c
index 64e364e..6d318c9 100644
--- a/common/x86/ih264_inter_pred_filters_ssse3.c
+++ b/common/x86/ih264_inter_pred_filters_ssse3.c
@@ -98,11 +98,10 @@ void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
{
__m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
+ WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
UNUSED(pu1_tmp);
UNUSED(dydx);
- WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
-
src_strd2 = src_strd << 1;
dst_strd2 = dst_strd << 1;
src_strd4 = src_strd << 2;
@@ -1825,7 +1824,6 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
WORD32 y_offset;
UWORD8 *pu1_pred1;
- UNUSED(pu1_tmp);
__m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
__m128i src_r5_16x8b, src_r6_16x8b;
@@ -1835,6 +1833,7 @@ void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
__m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
__m128i const_val16_8x16b;
+ UNUSED(pu1_tmp);
y_offset = dydx & 0xf;
coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
diff --git a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
index d43c8e2..565cc75 100644
--- a/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
+++ b/common/x86/ih264_iquant_itrans_recon_dc_ssse3.c
@@ -113,6 +113,8 @@ void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src,
UWORD32 *pu4_out = (UWORD32 *)pu1_out;
WORD32 q0 = pi2_src[0];
WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;
+ UNUSED (pi2_tmp);
+
INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);
if (iq_start_idx != 0 )
@@ -233,6 +235,10 @@ void ih264_iquant_itrans_recon_8x8_dc_ssse3 (WORD16 *pi2_src,
{
WORD32 q0 = pi2_src[0];
WORD16 i_macro, rnd_fact = (qp_div < 6) ? 1 << (5 - qp_div) : 0;
+ UNUSED (pi2_tmp);
+ UNUSED (iq_start_idx);
+ UNUSED (pi2_dc_ld_addr);
+
INV_QUANT(q0, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
i_macro = ((q0 + 32) >> 6);
@@ -392,6 +398,12 @@ void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src,
__m128i chroma_mask = _mm_set1_epi16 (0xFF);
__m128i value_add = _mm_set1_epi16(i_macro);
+ UNUSED (pi2_src);
+ UNUSED (pu2_iscal_mat);
+ UNUSED (pu2_weigh_mat);
+ UNUSED (u4_qp_div_6);
+ UNUSED (pi2_tmp);
+
//Load pred buffer
pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
diff --git a/common/x86/ih264_iquant_itrans_recon_sse42.c b/common/x86/ih264_iquant_itrans_recon_sse42.c
index 2a4ea3f..6399b65 100644
--- a/common/x86/ih264_iquant_itrans_recon_sse42.c
+++ b/common/x86/ih264_iquant_itrans_recon_sse42.c
@@ -120,6 +120,7 @@ void ih264_iquant_itrans_recon_4x4_sse42(WORD16 *pi2_src,
__m128i resq_r0, resq_r1, resq_r2, resq_r3;
__m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
__m128i value_32 = _mm_set1_epi32(32);
+ UNUSED (pi2_tmp);
/*************************************************************/
/* Dequantization of coefficients. Will be replaced by SIMD */
@@ -369,6 +370,8 @@ void ih264_iquant_itrans_recon_chroma_4x4_sse42(WORD16 *pi2_src,
__m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
__m128i value_32 = _mm_set1_epi32(32);
__m128i chroma_mask = _mm_set1_epi16 (0xFF);
+ UNUSED (pi2_tmp);
+
/*************************************************************/
/* Dequantization of coefficients. Will be replaced by SIMD */
/* operations on platform */
diff --git a/common/x86/ih264_iquant_itrans_recon_ssse3.c b/common/x86/ih264_iquant_itrans_recon_ssse3.c
index ca1397e..388cafe 100644
--- a/common/x86/ih264_iquant_itrans_recon_ssse3.c
+++ b/common/x86/ih264_iquant_itrans_recon_ssse3.c
@@ -120,6 +120,8 @@ void ih264_iquant_itrans_recon_4x4_ssse3(WORD16 *pi2_src,
__m128i resq_r0, resq_r1, resq_r2, resq_r3;
__m128i add_rshift = _mm_set1_epi32((1 << (3 - u4_qp_div_6)));
__m128i value_32 = _mm_set1_epi32(32);
+ UNUSED (pi2_tmp);
+ UNUSED (pi2_dc_ld_addr);
/*************************************************************/
/* Dequantization of coefficients. Will be replaced by SIMD */
@@ -397,6 +399,9 @@ void ih264_iquant_itrans_recon_8x8_ssse3(WORD16 *pi2_src,
__m128i resq_r0_1, resq_r0_2, resq_r1_1, resq_r1_2, resq_r2_1, resq_r2_2,
resq_r3_1, resq_r3_2, resq_r4_1, resq_r4_2, resq_r5_1, resq_r5_2,
resq_r6_1, resq_r6_2, resq_r7_1, resq_r7_2;
+ UNUSED (pi2_tmp);
+ UNUSED (iq_start_idx);
+ UNUSED (pi2_dc_ld_addr);
/*************************************************************/
/* Dequantization of coefficients. Will be replaced by SIMD */
diff --git a/common/x86/ih264_resi_trans_quant_sse42.c b/common/x86/ih264_resi_trans_quant_sse42.c
index c267651..eca43ed 100644
--- a/common/x86/ih264_resi_trans_quant_sse42.c
+++ b/common/x86/ih264_resi_trans_quant_sse42.c
@@ -121,6 +121,9 @@ void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred,
__m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
__m128i sign_reg0, sign_reg2;
__m128i scalemat_r0_r1, scalemat_r2_r3;
+
+ UNUSED (pu2_threshold_matrix);
+
scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
@@ -394,6 +397,8 @@ void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WO
__m128i scalemat_r0_r1, scalemat_r2_r3;
__m128i chroma_mask = _mm_set1_epi16 (0xFF);
+ UNUSED (pu2_threshold_matrix);
+
scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
@@ -676,6 +681,8 @@ void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
__m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
__m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
+ UNUSED (pu2_threshold_matrix);
+
src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
@@ -902,6 +909,8 @@ void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
__m128i temp_1 = _mm_set1_epi16(1);
__m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
+ UNUSED (pu2_threshold_matrix);
+
src = _mm_loadu_si128((__m128i *)pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3
sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits
diff --git a/decoder/ih264d_thread_parse_decode.c b/decoder/ih264d_thread_parse_decode.c
index be3cb01..1c9eb68 100644
--- a/decoder/ih264d_thread_parse_decode.c
+++ b/decoder/ih264d_thread_parse_decode.c
@@ -582,13 +582,9 @@ WORD32 ih264d_decode_slice_thread(dec_struct_t *ps_dec /* Decoder parameters */
void ih264d_decode_picture_thread(dec_struct_t *ps_dec )
{
- volatile WORD32 i4_err_status;
-
ithread_set_name("ih264d_decode_picture_thread");
-
-
// run the loop till all slices are decoded
while(1)
@@ -644,13 +640,6 @@ void ih264d_decode_picture_thread(dec_struct_t *ps_dec )
DEBUG_THREADS_PRINTF("Waiting for next slice or end of frame\n");
NOP(32);
- if(i4_err_status != 0)
- {
- /*In the case of error set decode Mb number ,so that the
- parse thread does not wait because of mb difference being
- greated the 32*/
- ps_dec->cur_dec_mb_num = ps_dec->u2_cur_mb_addr - 1;
- }
}
DEBUG_THREADS_PRINTF("Got next slice/end of frame signal \n ");
diff --git a/encoder/arm/ime_distortion_metrics_a9q.s b/encoder/arm/ime_distortion_metrics_a9q.s
index b58911e..27fbe3d 100644
--- a/encoder/arm/ime_distortion_metrics_a9q.s
+++ b/encoder/arm/ime_distortion_metrics_a9q.s
@@ -17,9 +17,9 @@
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
-@/**
+@**
-@/**
+@**
@******************************************************************************
@*
@*
@@ -48,7 +48,7 @@
@
-@/**
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
@@ -79,59 +79,62 @@
@* @remarks
@*
@******************************************************************************
-@*/
+@*
.text
.p2align 2
+
.global ime_compute_sad_16x16_fast_a9q
+
ime_compute_sad_16x16_fast_a9q:
- stmfd sp!, {r12, lr}
- lsl r2, r2, #1
- lsl r3, r3, #1
+ stmfd sp!, {r12, lr}
+ vpush {d8-d15}
+ lsl r2, r2, #1
+ lsl r3, r3, #1
@for bringing buffer2 into cache..., dummy load instructions
- @ LDR r12,[r1]
+ @LDR r12,[r1]
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
- mov r12, #6
- vld1.8 {d8, d9}, [r0], r2
- vabdl.u8 q0, d6, d4
- vabdl.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ mov r12, #6
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d6, d4
+ vabdl.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
loop_sad_16x16_fast:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
- vld1.8 {d6, d7}, [r1], r3
- subs r12, #2
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
-
- bne loop_sad_16x16_fast
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r12, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
+ bne loop_sad_16x16_fast
- vadd.i16 q0, q0, q1
- vadd.i16 d0, d1, d0
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
- ldr r12, [sp, #12]
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
- vshl.u32 d0, d0, #1
- vst1.32 {d0[0]}, [r12]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+ vpop {d8-d15}
+ ldr r12, [sp, #12]
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vshl.u32 d0, d0, #1
+ vst1.32 {d0[0]}, [r12]
- ldmfd sp!, {r12, pc}
+ ldmfd sp!, {r12, pc}
-@/**
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x8 blocks
@@ -163,56 +166,57 @@ loop_sad_16x16_fast:
@* @remarks
@*
@******************************************************************************
-@*/
+@*
@
.global ime_compute_sad_16x8_a9q
+
ime_compute_sad_16x8_a9q:
- stmfd sp!, {r12, lr}
+ stmfd sp!, {r12, lr}
@for bringing buffer2 into cache..., dummy load instructions
@LDR r12,[r1]
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
- mov r12, #6
- vld1.8 {d8, d9}, [r0], r2
- vabdl.u8 q0, d6, d4
- vabdl.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ mov r12, #6
+ vpush {d8-d15}
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d6, d4
+ vabdl.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
loop_sad_16x8:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
- vld1.8 {d6, d7}, [r1], r3
- subs r12, #2
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
-
- bne loop_sad_16x8
-
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r12, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
- vadd.i16 q0, q0, q1
- vadd.i16 d0, d1, d0
+ bne loop_sad_16x8
- ldr r12, [sp, #12]
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
- vst1.32 {d0[0]}, [r12]
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
- ldmfd sp!, {r12, pc}
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+ vpop {d8-d15}
+ ldr r12, [sp, #12]
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vst1.32 {d0[0]}, [r12]
+ ldmfd sp!, {r12, pc}
-@/**
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
@@ -243,100 +247,103 @@ loop_sad_16x8:
@* @remarks
@*
@******************************************************************************
-@*/
+@*
+
.global ime_compute_sad_16x16_ea8_a9q
ime_compute_sad_16x16_ea8_a9q:
- stmfd sp!, {r5-r7, lr}
- lsl r2, r2, #1
- lsl r3, r3, #1
+ stmfd sp!, {r5-r7, lr}
+ lsl r2, r2, #1
+ lsl r3, r3, #1
@for bringing buffer2 into cache..., dummy load instructions
@LDR r12,[r1]
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
- mov r5, #6
- vld1.8 {d8, d9}, [r0], r2
- vabdl.u8 q0, d6, d4
- vabdl.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
- ldrd r6, r7, [sp, #16]
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ mov r5, #6
+ ldrd r6, r7, [sp, #16]
+ vpush {d8-d15}
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d6, d4
+ vabdl.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
@r6 = i4_max_sad, r7 = pi4_mb_distortion
loop_sad_16x16_ea8_1:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
- vld1.8 {d6, d7}, [r1], r3
- subs r5, #2
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
-
- bne loop_sad_16x16_ea8_1
-
- vabal.u8 q0, d10, d8
- sub r0, r0, r2, lsl #3
- vabal.u8 q1, d11, d9
- sub r1, r1, r3, lsl #3
-
- vadd.i16 q6, q0, q1
- add r0, r0, r2, asr #1
- vadd.i16 d12, d12, d13
- add r1, r1, r3, asr #1
-
- vpaddl.u16 d12, d12
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
- vpaddl.u32 d12, d12
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
-
- vst1.32 {d12[0]}, [r7]
- ldr r5, [r7]
- cmp r5, r6
- bgt end_func_16x16_ea8
-
- vld1.8 {d10, d11}, [r1], r3
- mov r5, #6
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r5, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
+
+ bne loop_sad_16x16_ea8_1
+
+ vabal.u8 q0, d10, d8
+ sub r0, r0, r2, lsl #3
+ vabal.u8 q1, d11, d9
+ sub r1, r1, r3, lsl #3
+
+ vadd.i16 q6, q0, q1
+ add r0, r0, r2, asr #1
+ vadd.i16 d12, d12, d13
+ add r1, r1, r3, asr #1
+
+ vpaddl.u16 d12, d12
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ vpaddl.u32 d12, d12
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+
+ vst1.32 {d12[0]}, [r7]
+ ldr r5, [r7]
+ cmp r5, r6
+ bgt end_func_16x16_ea8
+
+ vld1.8 {d10, d11}, [r1], r3
+ mov r5, #6
loop_sad_16x16_ea8_2:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
- vld1.8 {d6, d7}, [r1], r3
- subs r5, #2
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d6, d4
- vabal.u8 q1, d7, d5
- vld1.8 {d10, d11}, [r1], r3
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
+ vld1.8 {d6, d7}, [r1], r3
+ subs r5, #2
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d6, d4
+ vabal.u8 q1, d7, d5
+ vld1.8 {d10, d11}, [r1], r3
- bne loop_sad_16x16_ea8_2
+ bne loop_sad_16x16_ea8_2
- vabal.u8 q0, d10, d8
- vabal.u8 q1, d11, d9
+ vabal.u8 q0, d10, d8
+ vabal.u8 q1, d11, d9
- vadd.i16 q0, q0, q1
- vadd.i16 d0, d1, d0
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
- vst1.32 {d0[0]}, [r7]
+ vst1.32 {d0[0]}, [r7]
end_func_16x16_ea8:
-
- ldmfd sp!, {r5-r7, pc}
+ vpop {d8-d15}
+ ldmfd sp!, {r5-r7, pc}
-@/*
+@*
@//---------------------------------------------------------------------------
@// Function Name : Calculate_Mad2_prog()
@//
@@ -346,7 +353,7 @@ end_func_16x16_ea8:
@// Platform : CortexA8/NEON .
@//
@//-----------------------------------------------------------------------------
-@*/
+@*
.global ime_calculate_sad2_prog_a9q
@@ -358,72 +365,72 @@ ime_calculate_sad2_prog_a9q:
@ r3 = RefBufferWidth <UWORD32>
@ stack = CurBufferWidth <UWORD32>, psad <UWORD32 *>
- stmfd sp!, {r4-r5, lr}
-
- ldr r4, [sp, #8] @ load src stride to r4
- mov r5, #14
+ stmfd sp!, {r4-r5, lr}
+ ldr r4, [sp, #12] @ load src stride to r4 (first stack arg lies above the 12 bytes pushed by stmfd)
+ mov r5, #14
+ vpush {d8-d15} @ d8-d15 are callee-saved (AAPCS); sp drops by a further 64 bytes
@Row 1
- vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
- vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
- vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
+ vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
+ vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
+ vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
@Row 2
- vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
- vabdl.u8 q6, d2, d0
- vabdl.u8 q7, d3, d1
- vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
- vabdl.u8 q8, d4, d0
- vabdl.u8 q9, d5, d1
- vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
+ vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
+ vabdl.u8 q6, d2, d0
+ vabdl.u8 q7, d3, d1
+ vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
+ vabdl.u8 q8, d4, d0
+ vabdl.u8 q9, d5, d1
+ vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
loop_sad2_prog:
- subs r5, #2
+ subs r5, #2
@Row 1
- vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
- vabal.u8 q6, d8, d6
- vabal.u8 q7, d9, d7
- vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
- vabal.u8 q8, d10, d6
- vabal.u8 q9, d11, d7
- vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
+ vld1.8 {d0, d1}, [r2], r4 @ load src Row 1
+ vabal.u8 q6, d8, d6
+ vabal.u8 q7, d9, d7
+ vld1.8 {d2, d3}, [r0], r3 @ load ref1 Row 1
+ vabal.u8 q8, d10, d6
+ vabal.u8 q9, d11, d7
+ vld1.8 {d4, d5}, [r1], r3 @ load ref2 Row 1
@Row 2
- vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
- vabal.u8 q6, d2, d0
- vabal.u8 q7, d3, d1
- vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
- vabal.u8 q8, d4, d0
- vabal.u8 q9, d5, d1
- vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
+ vld1.8 {d6, d7}, [r2], r4 @ load src Row 2
+ vabal.u8 q6, d2, d0
+ vabal.u8 q7, d3, d1
+ vld1.8 {d8, d9}, [r0], r3 @ load ref1 Row 2
+ vabal.u8 q8, d4, d0
+ vabal.u8 q9, d5, d1
+ vld1.8 {d10, d11}, [r1], r3 @ load ref2 Row 2
- bne loop_sad2_prog
+ bne loop_sad2_prog
- vabal.u8 q6, d8, d6
- vabal.u8 q7, d9, d7
- vabal.u8 q8, d10, d6
- vabal.u8 q9, d11, d7
+ vabal.u8 q6, d8, d6
+ vabal.u8 q7, d9, d7
+ vabal.u8 q8, d10, d6
+ vabal.u8 q9, d11, d7
@ Compute SAD
- vadd.u16 q6, q6, q7 @ Q6 : sad_ref1
- vadd.u16 q8, q8, q9 @ Q8 : sad_ref2
+ vadd.u16 q6, q6, q7 @ Q6 : sad_ref1
+ vadd.u16 q8, q8, q9 @ Q8 : sad_ref2
- vadd.u16 d12, d12, d13
- ldr r5, [sp, #16] @ loading pi4_sad to r5
- vadd.u16 d16, d16, d17
+ vadd.u16 d12, d12, d13
+ ldr r5, [sp, #80] @ loading pi4_sad to r5 (16 + the 64 bytes still occupied by the vpush'd d8-d15)
+ vadd.u16 d16, d16, d17
- vpadd.u16 d12, d12, d16
- vpaddl.u16 d12, d12
+ vpadd.u16 d12, d12, d16
+ vpaddl.u16 d12, d12
- vst1.64 {d12}, [r5]!
+ vst1.64 {d12}, [r5]!
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r5, pc}
- ldmfd sp!, {r4-r5, pc}
-
-@/*
+@*
@//---------------------------------------------------------------------------
@// Function Name : Calculate_Mad3_prog()
@//
@@ -433,7 +440,7 @@ loop_sad2_prog:
@// Platform : CortexA8/NEON .
@//
@//-----------------------------------------------------------------------------
-@*/
+@*
.global ime_calculate_sad3_prog_a9q
@@ -446,90 +453,90 @@ ime_calculate_sad3_prog_a9q:
@ stack = RefBufferWidth <UWORD32>, CurBufferWidth <UWORD32>, psad <UWORD32 *>
- stmfd sp!, {r4-r6, lr}
-
- ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5
- mov r6, #14
-
- @ Row 1
- vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
- vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
- vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
- vabdl.u8 q8, d2, d0
- vabdl.u8 q9, d3, d1
- vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
- vabdl.u8 q10, d4, d0
- vabdl.u8 q11, d5, d1
-
- @ Row 2
- vld1.8 {d8, d9}, [r3], r5 @ load src Row 1
- vabdl.u8 q12, d6, d0
- vabdl.u8 q13, d7, d1
- vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1
- vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1
- vabal.u8 q8, d10, d8
- vabal.u8 q9, d11, d9
- vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1
- vabal.u8 q10, d12, d8
- vabal.u8 q11, d13, d9
+ stmfd sp!, {r4-r6, lr}
+
+ ldrd r4, r5, [sp, #16] @ load ref stride to r4, src stride to r5 (done before vpush moves sp)
+ mov r6, #14
+ vpush {d8-d15} @ d8-d15 are callee-saved (AAPCS); sp drops by a further 64 bytes
+ @Row 1
+ vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
+ vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
+ vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
+ vabdl.u8 q8, d2, d0
+ vabdl.u8 q9, d3, d1
+ vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
+ vabdl.u8 q10, d4, d0
+ vabdl.u8 q11, d5, d1
+
+ @Row 2
+ vld1.8 {d8, d9}, [r3], r5 @ load src Row 2
+ vabdl.u8 q12, d6, d0
+ vabdl.u8 q13, d7, d1
+ vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 2
+ vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 2
+ vabal.u8 q8, d10, d8
+ vabal.u8 q9, d11, d9
+ vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 2
+ vabal.u8 q10, d12, d8
+ vabal.u8 q11, d13, d9
loop_sad3_prog:
@Row 1
- vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
- vabal.u8 q12, d14, d8
- vabal.u8 q13, d15, d9
- vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
- vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
- vabal.u8 q8, d2, d0
- vabal.u8 q9, d3, d1
- vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
- vabal.u8 q10, d4, d0
- vabal.u8 q11, d5, d1
+ vld1.8 {d0, d1}, [r3], r5 @ load src Row 1
+ vabal.u8 q12, d14, d8
+ vabal.u8 q13, d15, d9
+ vld1.8 {d2, d3}, [r0], r4 @ load ref1 Row 1
+ vld1.8 {d4, d5}, [r1], r4 @ load ref2 Row 1
+ vabal.u8 q8, d2, d0
+ vabal.u8 q9, d3, d1
+ vld1.8 {d6, d7}, [r2], r4 @ load ref3 Row 1
+ vabal.u8 q10, d4, d0
+ vabal.u8 q11, d5, d1
@Row 2
- vld1.8 {d8, d9}, [r3], r5 @ load src Row 1
- vabal.u8 q12, d6, d0
- vabal.u8 q13, d7, d1
- vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 1
- subs r6, #2
- vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 1
- vabal.u8 q8, d10, d8
- vabal.u8 q9, d11, d9
- vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 1
- vabal.u8 q10, d12, d8
- vabal.u8 q11, d13, d9
-
- bne loop_sad3_prog
-
- vabal.u8 q12, d14, d8
- vabal.u8 q13, d15, d9
+ vld1.8 {d8, d9}, [r3], r5 @ load src Row 2
+ vabal.u8 q12, d6, d0
+ vabal.u8 q13, d7, d1
+ vld1.8 {d10, d11}, [r0], r4 @ load ref1 Row 2
+ subs r6, #2
+ vld1.8 {d12, d13}, [r1], r4 @ load ref2 Row 2
+ vabal.u8 q8, d10, d8
+ vabal.u8 q9, d11, d9
+ vld1.8 {d14, d15}, [r2], r4 @ load ref3 Row 2
+ vabal.u8 q10, d12, d8
+ vabal.u8 q11, d13, d9
+
+ bne loop_sad3_prog
+
+ vabal.u8 q12, d14, d8
+ vabal.u8 q13, d15, d9
@ Compute SAD
- vadd.u16 q8, q8, q9 @ Q8 : sad_ref1
- vadd.u16 q10, q10, q11 @ Q10 : sad_ref2
- vadd.u16 q12, q12, q13 @ Q12 : sad_ref3
+ vadd.u16 q8, q8, q9 @ Q8 : sad_ref1
+ vadd.u16 q10, q10, q11 @ Q10 : sad_ref2
+ vadd.u16 q12, q12, q13 @ Q12 : sad_ref3
- vadd.u16 d16, d16, d17
- vadd.u16 d20, d20, d21
- vadd.u16 d24, d24, d25
+ vadd.u16 d16, d16, d17
+ vadd.u16 d20, d20, d21
+ vadd.u16 d24, d24, d25
- vpadd.u16 d16, d16, d20
- vpadd.u16 d24, d24, d24
+ vpadd.u16 d16, d16, d20
+ vpadd.u16 d24, d24, d24
- ldr r6, [sp, #24] @ loading pi4_sad to r6
- vpaddl.u16 d16, d16
- vpaddl.u16 d24, d24
+ ldr r6, [sp, #88] @ loading pi4_sad to r6 (24 + the 64 bytes still occupied by the vpush'd d8-d15)
+ vpaddl.u16 d16, d16
+ vpaddl.u16 d24, d24
- vst1.64 {d16}, [r6]!
- vst1.32 {d24[0]}, [r6]
+ vst1.64 {d16}, [r6]!
+ vst1.32 {d24[0]}, [r6]
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r6, pc}
- ldmfd sp!, {r4-r6, pc}
-
-@/**
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) for sub-pel motion estimation
@@ -551,7 +558,7 @@ loop_sad3_prog:
@* @remarks
@*
@******************************************************************************
-@*/
+@*
.text
.p2align 2
@@ -560,115 +567,116 @@ loop_sad3_prog:
ime_sub_pel_compute_sad_16x16_a9q:
- stmfd sp!, {r4-r11, lr} @store register values to stack
+ stmfd sp!, {r4-r11, lr} @store register values to stack
- ldr r9, [sp, #36]
- ldr r10, [sp, #40]
+ ldr r9, [sp, #36]
+ ldr r10, [sp, #40]
+ vpush {d8-d15}
+ sub r4, r1, #1 @ x left
+ sub r5, r2, r10 @ y top
- sub r4, r1, #1 @ x left
- sub r5, r2, r10 @ y top
+ sub r6, r3, #1 @ xy left
+ sub r7, r3, r10 @ xy top
- sub r6, r3, #1 @ xy left
- sub r7, r3, r10 @ xy top
-
- sub r8, r7, #1 @ xy top-left
- mov r11, #15
+ sub r8, r7, #1 @ xy top-left
+ mov r11, #15
@for bringing buffer2 into cache..., dummy load instructions
@ LDR r12,[r1]
@ LDR r12,[sp,#12]
- vld1.8 {d0, d1}, [r0], r9 @ src
- vld1.8 {d2, d3}, [r5], r10 @ y top LOAD
- vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD
- vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD
-
- vabdl.u8 q6, d2, d0 @ y top ABS1
- vabdl.u8 q7, d4, d0 @ xy top ABS1
- vld1.8 {d8, d9}, [r1], r10 @ x LOAD
- vabdl.u8 q8, d6, d0 @ xy top-left ABS1
- vabdl.u8 q9, d8, d0 @ x ABS1
- vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
-
- vabal.u8 q6, d3, d1 @ y top ABS2
- vabal.u8 q7, d5, d1 @ xy top ABS2
- vld1.8 {d2, d3}, [r2], r10 @ y LOAD
- vabal.u8 q8, d7, d1 @ xy top-left ABS2
- vabal.u8 q9, d9, d1 @ x ABS2
- vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
-
- vabdl.u8 q10, d10, d0 @ x left ABS1
- vabdl.u8 q11, d2, d0 @ y ABS1
- vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
- vabdl.u8 q12, d4, d0 @ xy ABS1
- vabdl.u8 q13, d6, d0 @ xy left ABS1
+ vld1.8 {d0, d1}, [r0], r9 @ src
+ vld1.8 {d2, d3}, [r5], r10 @ y top LOAD
+ vld1.8 {d4, d5}, [r7], r10 @ xy top LOAD
+ vld1.8 {d6, d7}, [r8], r10 @ xy top-left LOAD
+
+ vabdl.u8 q6, d2, d0 @ y top ABS1
+ vabdl.u8 q7, d4, d0 @ xy top ABS1
+ vld1.8 {d8, d9}, [r1], r10 @ x LOAD
+ vabdl.u8 q8, d6, d0 @ xy top-left ABS1
+ vabdl.u8 q9, d8, d0 @ x ABS1
+ vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
+
+ vabal.u8 q6, d3, d1 @ y top ABS2
+ vabal.u8 q7, d5, d1 @ xy top ABS2
+ vld1.8 {d2, d3}, [r2], r10 @ y LOAD
+ vabal.u8 q8, d7, d1 @ xy top-left ABS2
+ vabal.u8 q9, d9, d1 @ x ABS2
+ vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
+
+ vabdl.u8 q10, d10, d0 @ x left ABS1
+ vabdl.u8 q11, d2, d0 @ y ABS1
+ vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
+ vabdl.u8 q12, d4, d0 @ xy ABS1
+ vabdl.u8 q13, d6, d0 @ xy left ABS1
loop_sub_pel_16x16:
- vabal.u8 q10, d11, d1 @ x left ABS2
- vabal.u8 q11, d3, d1 @ y ABS2
- subs r11, #1
- vabal.u8 q12, d5, d1 @ xy ABS2
- vabal.u8 q13, d7, d1 @ xy left ABS2
-
- vld1.8 {d0, d1}, [r0], r9 @ src
- vabal.u8 q6, d2, d0 @ y top ABS1
- vabal.u8 q7, d4, d0 @ xy top ABS1
- vld1.8 {d8, d9}, [r1], r10 @ x LOAD
- vabal.u8 q8, d6, d0 @ xy top-left ABS1
- vabal.u8 q9, d8, d0 @ x ABS1
- vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
-
- vabal.u8 q6, d3, d1 @ y top ABS2
- vabal.u8 q7, d5, d1 @ xy top ABS2
- vld1.8 {d2, d3}, [r2], r10 @ y LOAD
- vabal.u8 q8, d7, d1 @ xy top-left ABS2
- vabal.u8 q9, d9, d1 @ x ABS2
- vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
-
- vabal.u8 q10, d10, d0 @ x left ABS1
- vabal.u8 q11, d2, d0 @ y ABS1
- vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
- vabal.u8 q12, d4, d0 @ xy ABS1
- vabal.u8 q13, d6, d0 @ xy left ABS1
-
- bne loop_sub_pel_16x16
-
- vabal.u8 q10, d11, d1 @ x left ABS2
- vabal.u8 q11, d3, d1 @ y ABS2
- vabal.u8 q12, d5, d1 @ xy ABS2
- vabal.u8 q13, d7, d1 @ xy left ABS2
-
- vadd.i16 d0, d18, d19 @ x
- vadd.i16 d3, d12, d13 @ y top
- vadd.i16 d6, d14, d15 @ xy top
- vadd.i16 d5, d26, d27 @ xy left
- vadd.i16 d1, d20, d21 @ x left
- vadd.i16 d2, d22, d23 @ y
- vadd.i16 d4, d24, d25 @ xy
- vadd.i16 d7, d16, d17 @ xy top left
-
- vpadd.i16 d0, d0, d1
- vpadd.i16 d2, d2, d3
- vpadd.i16 d4, d4, d5
- vpadd.i16 d6, d6, d7
-
- vpaddl.u16 d0, d0
- vpaddl.u16 d2, d2
- ldr r11, [sp, #44]
- vpaddl.u16 d4, d4
- vpaddl.u16 d6, d6
-
- vst1.32 {d0}, [r11]!
- vst1.32 {d2}, [r11]!
- vst1.32 {d4}, [r11]!
- vst1.32 {d6}, [r11]!
-
- ldmfd sp!, {r4-r11, pc} @Restoring registers from stack
-
-
-
-@/**
+ vabal.u8 q10, d11, d1 @ x left ABS2
+ vabal.u8 q11, d3, d1 @ y ABS2
+ subs r11, #1
+ vabal.u8 q12, d5, d1 @ xy ABS2
+ vabal.u8 q13, d7, d1 @ xy left ABS2
+
+ vld1.8 {d0, d1}, [r0], r9 @ src
+ vabal.u8 q6, d2, d0 @ y top ABS1
+ vabal.u8 q7, d4, d0 @ xy top ABS1
+ vld1.8 {d8, d9}, [r1], r10 @ x LOAD
+ vabal.u8 q8, d6, d0 @ xy top-left ABS1
+ vabal.u8 q9, d8, d0 @ x ABS1
+ vld1.8 {d10, d11}, [r4], r10 @ x left LOAD
+
+ vabal.u8 q6, d3, d1 @ y top ABS2
+ vabal.u8 q7, d5, d1 @ xy top ABS2
+ vld1.8 {d2, d3}, [r2], r10 @ y LOAD
+ vabal.u8 q8, d7, d1 @ xy top-left ABS2
+ vabal.u8 q9, d9, d1 @ x ABS2
+ vld1.8 {d4, d5}, [r3], r10 @ xy LOAD
+
+ vabal.u8 q10, d10, d0 @ x left ABS1
+ vabal.u8 q11, d2, d0 @ y ABS1
+ vld1.8 {d6, d7}, [r6], r10 @ xy left LOAD
+ vabal.u8 q12, d4, d0 @ xy ABS1
+ vabal.u8 q13, d6, d0 @ xy left ABS1
+
+ bne loop_sub_pel_16x16
+
+ vabal.u8 q10, d11, d1 @ x left ABS2
+ vabal.u8 q11, d3, d1 @ y ABS2
+ vabal.u8 q12, d5, d1 @ xy ABS2
+ vabal.u8 q13, d7, d1 @ xy left ABS2
+
+ vadd.i16 d0, d18, d19 @ x
+ vadd.i16 d3, d12, d13 @ y top
+ vadd.i16 d6, d14, d15 @ xy top
+ vadd.i16 d5, d26, d27 @ xy left
+ vadd.i16 d1, d20, d21 @ x left
+ vadd.i16 d2, d22, d23 @ y
+ vadd.i16 d4, d24, d25 @ xy
+ vadd.i16 d7, d16, d17 @ xy top left
+
+ vpadd.i16 d0, d0, d1
+ vpadd.i16 d2, d2, d3
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d6, d6, d7
+
+ vpaddl.u16 d0, d0
+ vpaddl.u16 d2, d2
+ vpop {d8-d15}
+ ldr r11, [sp, #44]
+ vpaddl.u16 d4, d4
+ vpaddl.u16 d6, d6
+
+ vst1.32 {d0}, [r11]!
+ vst1.32 {d2}, [r11]!
+ vst1.32 {d4}, [r11]!
+ vst1.32 {d6}, [r11]!
+
+ ldmfd sp!, {r4-r11, pc} @Restoring registers from stack
+
+
+
+@**
@******************************************************************************
@*
@* @brief computes distortion (SAD) between 2 16x16 blocks
@@ -699,7 +707,7 @@ loop_sub_pel_16x16:
@* @remarks
@*
@******************************************************************************
-@*/
+@*
.text
.p2align 2
@@ -710,51 +718,52 @@ ime_compute_sad_16x16_a9q:
@STMFD sp!,{r12,lr}
- stmfd sp!, {r12, r14} @store register values to stack
+ stmfd sp!, {r12, r14} @store register values to stack
@for bringing buffer2 into cache..., dummy load instructions
@ LDR r12,[r1]
@ LDR r12,[sp,#12]
- vld1.8 {d4, d5}, [r0], r2
- vld1.8 {d6, d7}, [r1], r3
-
- mov r12, #14
- vld1.8 {d8, d9}, [r0], r2
- vabdl.u8 q0, d4, d6
- vld1.8 {d10, d11}, [r1], r3
- vabdl.u8 q1, d5, d7
+ vld1.8 {d4, d5}, [r0], r2
+ vld1.8 {d6, d7}, [r1], r3
+ vpush {d8-d15}
+ mov r12, #14
+ vld1.8 {d8, d9}, [r0], r2
+ vabdl.u8 q0, d4, d6
+ vld1.8 {d10, d11}, [r1], r3
+ vabdl.u8 q1, d5, d7
loop_sad_16x16:
- vld1.8 {d4, d5}, [r0], r2
- vabal.u8 q0, d8, d10
- vld1.8 {d6, d7}, [r1], r3
- vabal.u8 q1, d9, d11
+ vld1.8 {d4, d5}, [r0], r2
+ vabal.u8 q0, d8, d10
+ vld1.8 {d6, d7}, [r1], r3
+ vabal.u8 q1, d9, d11
- vld1.8 {d8, d9}, [r0], r2
- vabal.u8 q0, d4, d6
- subs r12, #2
- vld1.8 {d10, d11}, [r1], r3
- vabal.u8 q1, d5, d7
+ vld1.8 {d8, d9}, [r0], r2
+ vabal.u8 q0, d4, d6
+ subs r12, #2
+ vld1.8 {d10, d11}, [r1], r3
+ vabal.u8 q1, d5, d7
- bne loop_sad_16x16
+ bne loop_sad_16x16
- vabal.u8 q0, d8, d10
- vabal.u8 q1, d9, d11
+ vabal.u8 q0, d8, d10
+ vabal.u8 q1, d9, d11
- vadd.i16 q0, q0, q1
- vadd.i16 d0, d1, d0
- ldr r12, [sp, #12]
+ vadd.i16 q0, q0, q1
+ vadd.i16 d0, d1, d0
+ vpop {d8-d15}
+ ldr r12, [sp, #12]
- vpaddl.u16 d0, d0
- vpaddl.u32 d0, d0
- vst1.32 {d0[0]}, [r12]
+ vpaddl.u16 d0, d0
+ vpaddl.u32 d0, d0
+ vst1.32 {d0[0]}, [r12]
- ldmfd sp!, {r12, pc} @Restoring registers from stack
+ ldmfd sp!, {r12, pc} @Restoring registers from stack
-@/*
+@*
@//---------------------------------------------------------------------------
@// Function Name : Calculate_Mad4_prog()
@//
@@ -764,7 +773,7 @@ loop_sad_16x16:
@// Platform : CortexA8/NEON .
@//
@//-----------------------------------------------------------------------------
-@*/
+@*
.global ime_calculate_sad4_prog_a9q
@@ -775,20 +784,20 @@ ime_calculate_sad4_prog_a9q:
@ r3 = CurBufferWidth <UWORD32>
@ stack = psad <UWORD32 *> {at 0x34}
- stmfd sp!, {r4-r7, lr}
+ stmfd sp!, {r4-r7, lr}
@UWORD8 *left_ptr = temp_frame - 1;
@UWORD8 *right_ptr = temp_frame + 1;
@UWORD8 *top_ptr = temp_frame - RefBufferWidth;
@UWORD8 *bot_ptr = temp_frame + RefBufferWidth;
- mov r7, #14
- sub r4, r0, #0x01 @r4 = left_ptr
- add r5, r0, #0x1 @r5 = right_ptr
- sub r6, r0, r2 @r6 = top_ptr
- add r0, r0, r2 @r0 = bot_ptr
+ mov r7, #14
+ sub r4, r0, #0x01 @r4 = left_ptr
+ add r5, r0, #0x1 @r5 = right_ptr
+ sub r6, r0, r2 @r6 = top_ptr
+ add r0, r0, r2 @r0 = bot_ptr
@r1 = buffer_ptr
-
+ vpush {d8-d15}
@D0:D1 : buffer
@D2:D3 : top
@D4:D5 : left
@@ -796,94 +805,93 @@ ime_calculate_sad4_prog_a9q:
@D8:D9 : bottom
@Row 1
- vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
- vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
- vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
+ vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
- vabdl.u8 q5, d2, d0
- vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
- vabdl.u8 q6, d3, d1
+ vabdl.u8 q5, d2, d0
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
+ vabdl.u8 q6, d3, d1
- vabdl.u8 q7, d0, d4
- vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
- vabdl.u8 q8, d1, d5
+ vabdl.u8 q7, d0, d4
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
+ vabdl.u8 q8, d1, d5
@Row 2
- vabdl.u8 q9, d0, d6
- vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
- vabdl.u8 q10, d1, d7
+ vabdl.u8 q9, d0, d6
+ vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
+ vabdl.u8 q10, d1, d7
- vabdl.u8 q11, d0, d8
- vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
- vabdl.u8 q12, d1, d9
+ vabdl.u8 q11, d0, d8
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
+ vabdl.u8 q12, d1, d9
loop_sad4_prog:
- vabal.u8 q5, d26, d2
- vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
- vabal.u8 q6, d27, d3
+ vabal.u8 q5, d26, d2
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
+ vabal.u8 q6, d27, d3
- vabal.u8 q7, d26, d4
- vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
- vabal.u8 q8, d27, d5
+ vabal.u8 q7, d26, d4
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
+ vabal.u8 q8, d27, d5
- vabal.u8 q9, d26, d6
- vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
- vabal.u8 q10, d27, d7
+ vabal.u8 q9, d26, d6
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
+ vabal.u8 q10, d27, d7
@Row 1
- vabal.u8 q11, d26, d8
- vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
- vabal.u8 q12, d27, d9
-
- vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
- subs r7, #2
- vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
+ vabal.u8 q11, d26, d8
+ vld1.8 {d0, d1}, [r1], r3 @ load src Row 1
+ vabal.u8 q12, d27, d9
- vabal.u8 q5, d0, d2
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 1
+ subs r7, #2
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 1
- vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
- vabal.u8 q6, d1, d3
+ vabal.u8 q5, d0, d2
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 1
+ vabal.u8 q6, d1, d3
- vabal.u8 q7, d0, d4
- vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
- vabal.u8 q8, d1, d5
+ vabal.u8 q7, d0, d4
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 1
+ vabal.u8 q8, d1, d5
@Row 2
- vabal.u8 q9, d0, d6
- vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
- vabal.u8 q10, d1, d7
+ vabal.u8 q9, d0, d6
+ vld1.8 {d26, d27}, [r1], r3 @ load src Row 2
+ vabal.u8 q10, d1, d7
- vabal.u8 q11, d0, d8
- vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
- vabal.u8 q12, d1, d9
+ vabal.u8 q11, d0, d8
+ vld1.8 {d2, d3}, [r6], r2 @ load top Row 2
+ vabal.u8 q12, d1, d9
- bne loop_sad4_prog
+ bne loop_sad4_prog
- vabal.u8 q5, d26, d2
- vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
- vabal.u8 q6, d27, d3
+ vabal.u8 q5, d26, d2
+ vld1.8 {d4, d5}, [r4], r2 @ load left Row 2
+ vabal.u8 q6, d27, d3
- vabal.u8 q7, d26, d4
- vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
- vabal.u8 q8, d27, d5
+ vabal.u8 q7, d26, d4
+ vld1.8 {d6, d7}, [r5], r2 @ load right Row 2
+ vabal.u8 q8, d27, d5
- vabal.u8 q9, d26, d6
- vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
- vabal.u8 q10, d27, d7
+ vabal.u8 q9, d26, d6
+ vld1.8 {d8, d9}, [r0], r2 @ load bottom Row 2
+ vabal.u8 q10, d27, d7
- vabal.u8 q11, d26, d8
- vabal.u8 q12, d27, d9
+ vabal.u8 q11, d26, d8
+ vabal.u8 q12, d27, d9
@;Q5:Q6 : sad_top
@;Q7:Q8 : sad_left
@;Q9:Q10 : sad_right
@;Q11:Q12 : sad_bot
- vadd.u16 q5, q5, q6
- vadd.u16 q7, q7, q8
- vadd.u16 q9, q9, q10
- vadd.u16 q11, q11, q12
+ vadd.u16 q5, q5, q6
+ vadd.u16 q7, q7, q8
+ vadd.u16 q9, q9, q10
+ vadd.u16 q11, q11, q12
@; Free :-
@; Q6,Q8,Q10,Q12
@@ -893,10 +901,10 @@ loop_sad4_prog:
@;Q9 -> D18:D19
@;Q11 -> D22:D23
- vadd.u16 d10, d10, d11
- vadd.u16 d14, d14, d15
- vadd.u16 d18, d18, d19
- vadd.u16 d22, d22, d23
+ vadd.u16 d10, d10, d11
+ vadd.u16 d14, d14, d15
+ vadd.u16 d18, d18, d19
+ vadd.u16 d22, d22, d23
@;D10 : sad_top
@;D14 : sad_left
@@ -904,35 +912,35 @@ loop_sad4_prog:
@;D22 : sad_bot
- vpaddl.u16 d11, d10
- vpaddl.u16 d15, d14
- vpaddl.u16 d19, d18
- vpaddl.u16 d23, d22
+ vpaddl.u16 d11, d10
+ vpaddl.u16 d15, d14
+ vpaddl.u16 d19, d18
+ vpaddl.u16 d23, d22
@;D11 : sad_top
@;D15 : sad_left
@;D19 : sad_right
@;D23 : sad_bot
- vpaddl.u32 d10, d11
- vpaddl.u32 d22, d23
- vpaddl.u32 d14, d15
- vpaddl.u32 d18, d19
+ vpaddl.u32 d10, d11
+ vpaddl.u32 d22, d23
+ vpaddl.u32 d14, d15
+ vpaddl.u32 d18, d19
@;D10 : sad_top
@;D14 : sad_left
@;D18 : sad_right
@;D22 : sad_bot
- ldr r4, [sp, #20] @;Can be rearranged
-
- vsli.64 d10, d22, #32
- vsli.64 d14, d18, #32
+ ldr r4, [sp, #84] @;Can be rearranged
- vst1.64 {d14}, [r4]!
- vst1.64 {d10}, [r4]!
+ vsli.64 d10, d22, #32
+ vsli.64 d14, d18, #32
- ldmfd sp!, {r4-r7, pc}
+ vst1.64 {d14}, [r4]!
+ vst1.64 {d10}, [r4]!
+ vpop {d8-d15}
+ ldmfd sp!, {r4-r7, pc}
@@ -974,37 +982,37 @@ ime_compute_satqd_16x16_lumainter_a9q:
@R5 :Distortion,ie SAD
@R6 :is nonzero
- push {r4-r12, lr} @push all the variables first
+ push {r4-r12, lr} @push all the variables first
@ADD SP,SP,#40 ;decrement stack pointer,to accomodate two variables
- ldr r4, [sp, #40] @load the threshold address
-
- mov r8, #8 @Number of 4x8 blocks to be processed
- mov r10, #0 @Sad
- mov r7, #0 @Nonzero info
+ ldr r4, [sp, #40] @load the threshold address
+ vpush {d8-d15}
+ mov r8, #8 @Number of 4x8 blocks to be processed
+ mov r10, #0 @Sad
+ mov r7, #0 @Nonzero info
@----------------------------------------------------
- vld1.u8 d30, [r0], r2 @I load 8 pix src row 1
+ vld1.u8 d30, [r0], r2 @I load 8 pix src row 1
- vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1
+ vld1.u8 d31, [r1], r3 @I load 8 pix pred row 1
- vld1.u8 d28, [r0], r2 @I load 8 pix src row 2
+ vld1.u8 d28, [r0], r2 @I load 8 pix src row 2
- vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2
+ vld1.u8 d29, [r1], r3 @I load 8 pix pred row 2
- vld1.u8 d26, [r0], r2 @I load 8 pix src row 3
- vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12
+ vld1.u8 d26, [r0], r2 @I load 8 pix src row 3
+ vabdl.u8 q0, d30, d31 @I Abs diff r1 blk 12
- vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3
+ vld1.u8 d27, [r1], r3 @I load 8 pix pred row 3
- vld1.u8 d24, [r0], r2 @I load 8 pix src row 4
+ vld1.u8 d24, [r0], r2 @I load 8 pix src row 4
- vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4
- vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12
+ vld1.u8 d25, [r1], r3 @I load 8 pix pred row 4
+ vabdl.u8 q1, d28, d29 @I Abs diff r1 blk 12
- vld1.u16 {q11}, [r4] @I load the threhold
- vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12
+ vld1.u16 {q11}, [r4] @I load the threhold
+ vabdl.u8 q2, d26, d27 @I Abs diff r1 blk 12
- vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12
+ vabdl.u8 q3, d24, d25 @I Abs diff r1 blk 12
@@ -1013,128 +1021,128 @@ core_loop:
@S5 S6 S7 S8 A5 A6 A7 A8
@S9 S10 S11 S12 A9 A10 A11 A12
@S13 S14 S15 S16 A13 A14 A15 A16
- ands r11, r8, #1 @II See if we are at even or odd block
- vadd.u16 q4 , q0, q3 @I Add r1 r4
- lsl r11, r2, #2 @II Move back src 4 rows
+ ands r11, r8, #1 @II See if we are at even or odd block
+ vadd.u16 q4 , q0, q3 @I Add r1 r4
+ lsl r11, r2, #2 @II Move back src 4 rows
- subeq r0, r0, r11 @II Move back src 4 rows if we are at even block
- vadd.u16 q5 , q1, q2 @I Add r2 r3
- addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block
+ subeq r0, r0, r11 @II Move back src 4 rows if we are at even block
+ vadd.u16 q5 , q1, q2 @I Add r2 r3
+ addeq r0, r0, #8 @II Move src 8 cols forward if we are at even block
- lsl r11, r3, #2 @II Move back pred 4 rows
- vtrn.16 d8 , d10 @I trnspse 1
- subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block
+ lsl r11, r3, #2 @II Move back pred 4 rows
+ vtrn.16 d8 , d10 @I trnspse 1
+ subeq r1, r1, r11 @II Move back pred 4 rows if we are at even block
- addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block
- vtrn.16 d9 , d11 @I trnspse 2
- subne r0, r0, #8 @II Src 8clos back for odd rows
+ addeq r1, r1, #8 @II Move pred 8 cols forward if we are at even block
+ vtrn.16 d9 , d11 @I trnspse 2
+ subne r0, r0, #8 @II Src 8clos back for odd rows
- subne r1, r1, #8 @II Pred 8 cols back for odd rows
- vtrn.32 d10, d11 @I trnspse 4
+ subne r1, r1, #8 @II Pred 8 cols back for odd rows
+ vtrn.32 d10, d11 @I trnspse 4
- vtrn.32 d8 , d9 @I trnspse 3
- vswp d10, d11 @I rearrange so that the q4 and q5 add properly
+ vtrn.32 d8 , d9 @I trnspse 3
+ vswp d10, d11 @I rearrange so that the q4 and q5 add properly
@D8 S1 S4 A1 A4
@D9 S2 S3 A2 A3
@D11 S1 S4 A1 A4
@D10 S2 S3 A2 A3
- vadd.s16 q6, q4, q5 @I Get s1 s4
- vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1
+ vadd.s16 q6, q4, q5 @I Get s1 s4
+ vld1.u8 d30, [r0], r2 @II load first 8 pix src row 1
- vtrn.s16 d12, d13 @I Get s2 s3
+ vtrn.s16 d12, d13 @I Get s2 s3
@D12 S1 S4 A1 A4
@D13 S2 S3 A2 A3
- vshl.s16 q7, q6 , #1 @I si = si<<1
- vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1
+ vshl.s16 q7, q6 , #1 @I si = si<<1
+ vld1.u8 d31, [r1], r3 @II load first 8 pix pred row 1
- vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3)
- vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2
+ vpadd.s16 d16, d12, d13 @I (s1 + s4) (s2 + s3)
+ vld1.u8 d28, [r0], r2 @II load first 8 pix src row 2
@ D16 S14 A14 S23 A23
- vrev32.16 d0, d16 @I
- vuzp.s16 d16, d0 @I
+ vrev32.16 d0, d16 @I
+ vuzp.s16 d16, d0 @I
@D16 S14 S23 A14 A23
- vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4)
- vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2
+ vadd.s16 d17, d12, d13 @I (s1 + s2) (s3 + s4)
+ vld1.u8 d29, [r1], r3 @II load first 8 pix pred row 2
@D17 S12 S34 A12 A34
- vrev32.16 q9, q7 @I Rearrange si's
+ vrev32.16 q9, q7 @I Rearrange si's
@Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
@D12 S1 S4 A1 A4
@D19 Z3 Z2 Y3 Y2
- vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1))
- vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3
+ vsub.s16 d8, d12, d19 @I (s1 - (s3<<1)) (s4 - (s2<<1))
+ vld1.u8 d26, [r0], r2 @II load first 8 pix src row 3
@D13 S2 S3 A2 A3
@D18 Z4 Z1 Y4 Y1
- vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1))
- vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3
+ vsub.s16 d9, d13, d18 @I (s2 - (s4<<1)) (s3 - (s1<<1))
+ vld1.u8 d27, [r1], r3 @II load first 8 pix pred row 3
@Q10 S8 S5 A8 A5 S7 S4 A7 A4
@D16 S14 S23 A14 A23
- vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
- vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4
+ vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
+ vld1.u8 d24, [r0], r2 @II load first 8 pix src row 4
@D10 SAD1 SAD2 junk junk
@Q8 S2 S1 A2 A1 S6 S3 A6 A3
@Q10 S8 S5 A8 A5 S7 S4 A7 A4
- vtrn.32 q8, q4 @I Rearrange to make ls of each block togather
+ vtrn.32 q8, q4 @I Rearrange to make ls of each block togather
@Q8 S2 S1 S8 S5 S6 S3 S7 S4
@Q10 A2 A1 A8 A5 A6 A3 A7 A4
- ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1
- vdup.s16 q6, d10[0] @I Get the sad blk 1
- vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12
+ ldrh r11, [r4, #16] @I Load the threshold for DC val blk 1
+ vdup.s16 q6, d10[0] @I Get the sad blk 1
+ vabdl.u8 q0, d30, d31 @II Abs diff r1 blk 12
- vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1
- vmov.s16 r9, d10[0] @I Get the sad for block 1
+ vshl.s16 q7, q6, #1 @I sad_2 = sad_1<<1
+ vmov.s16 r9, d10[0] @I Get the sad for block 1
- vsub.s16 q9, q7, q8 @I Add to the lss
- vmov.s16 r5, d10[1] @I Get the sad for block 2
+ vsub.s16 q9, q7, q8 @I Add to the lss
+ vmov.s16 r5, d10[1] @I Get the sad for block 2
- vcle.s16 q7, q11, q9 @I Add to the lss
- vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4
+ vcle.s16 q7, q11, q9 @I Add to the lss
+ vld1.u8 d25, [r1], r3 @II load first 8 pix pred row 4
- vdup.s16 q15, d10[1] @I Get the sad blk 1
- vabdl.u8 q1, d28, d29 @II Abs diff r1 blk 12
+ vdup.s16 q15, d10[1] @I Get the sad blk 1
+ vabdl.u8 q1, d28, d29 @II Abs diff r1 blk 12
- vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1
- vsub.s16 q3, q14, q4 @I Add to the lss
- vcle.s16 q15, q11, q3 @I Add to the lss
+ vshl.s16 q14, q15, #1 @I sad_2 = sad_1<<1
+ vsub.s16 q3, q14, q4 @I Add to the lss
+ vcle.s16 q15, q11, q3 @I Add to the lss
- ADD R10, R10, R9 @I Add to the global sad blk 1
- vtrn.u8 q15, q7 @I get all comparison bits to one reg
- vabdl.u8 q2, d26, d27 @II Abs diff r1 blk 12
+ ADD R10, R10, R9 @I Add to the global sad blk 1
+ vtrn.u8 q15, q7 @I get all comparison bits to one reg
+ vabdl.u8 q2, d26, d27 @II Abs diff r1 blk 12
- ADD R10, R10, R5 @I Add to the global sad blk 2
- vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs
- cmp r11, r9
+ ADD R10, R10, R5 @I Add to the global sad blk 2
+ vshr.u8 q14, q15, #7 @I Shift the bits so that no overflow occurs
+ cmp r11, r9
- movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 1 ;I Compare with threshold blk 1
- vadd.u8 d28, d28, d29 @I Add the bits
- cmp r11, r5 @I Compare with threshold blk 2
+ movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 1 ;I Compare with threshold blk 1
+ vadd.u8 d28, d28, d29 @I Add the bits
+ cmp r11, r5 @I Compare with threshold blk 2
- movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 2
- vpadd.u8 d28, d28, d29 @I Add the bits
+ movle r7, #0xf @I If not met mark it by mvoing non zero val to R7 blk 2
+ vpadd.u8 d28, d28, d29 @I Add the bits
- vmov.u32 r11, d28[0] @I Since a set bit now represents a unstatisofrd contifon store it in r11
- vabdl.u8 q3, d24, d25 @II Abs diff r1 blk 12
+ vmov.u32 r11, d28[0] @I Since a set bit now represents a unstatisofrd contifon store it in r11
+ vabdl.u8 q3, d24, d25 @II Abs diff r1 blk 12
- orr r7, r7, r11 @I get the guy to r11
+ orr r7, r7, r11 @I get the guy to r11
- sub r8, r8, #1 @I Decremrnt block count
+ sub r8, r8, #1 @I Decremrnt block count
- cmp r7, #0 @I If we have atlest one non zero block
- bne compute_sad_only @I if a non zero block is der,From now on compute sad only
+ cmp r7, #0 @I If we have atlest one non zero block
+ bne compute_sad_only @I if a non zero block is der,From now on compute sad only
- cmp r8, #1 @I See if we are at the last block
- bne core_loop @I If the blocks are zero, lets continue the satdq
+ cmp r8, #1 @I See if we are at the last block
+ bne core_loop @I If the blocks are zero, lets continue the satdq
@EPILOUGE for core loop
@@ -1142,94 +1150,94 @@ core_loop:
@S5 S6 S7 S8 A5 A6 A7 A8
@S9 S10 S11 S12 A9 A10 A11 A12
@S13 S14 S15 S16 A13 A14 A15 A16
- vadd.u16 q4 , q0, q3 @Add r1 r4
- vadd.u16 q5 , q1, q2 @Add r2 r3
+ vadd.u16 q4 , q0, q3 @Add r1 r4
+ vadd.u16 q5 , q1, q2 @Add r2 r3
@D8 S1 S2 S2 S1
@D10 S4 S3 S3 S4
@D9 A1 A2 A2 A1
@D11 A4 A3 A3 A4
- vtrn.16 d8 , d10 @I trnspse 1
- vtrn.16 d9 , d11 @I trnspse 2
- vtrn.32 d8 , d9 @I trnspse 3
- vtrn.32 d10, d11 @I trnspse 4
+ vtrn.16 d8 , d10 @I trnspse 1
+ vtrn.16 d9 , d11 @I trnspse 2
+ vtrn.32 d8 , d9 @I trnspse 3
+ vtrn.32 d10, d11 @I trnspse 4
- vswp d10, d11 @I rearrange so that the q4 and q5 add properly
+ vswp d10, d11 @I rearrange so that the q4 and q5 add properly
@D8 S1 S4 A1 A4
@D9 S2 S3 A2 A3
@D11 S1 S4 A1 A4
@D10 S2 S3 A2 A3
- vadd.s16 q6, q4, q5 @Get s1 s4
- vtrn.s16 d12, d13 @Get s2 s3
+ vadd.s16 q6, q4, q5 @Get s1 s4
+ vtrn.s16 d12, d13 @Get s2 s3
@D12 S1 S4 A1 A4
@D13 S2 S3 A2 A3
- vshl.s16 q7, q6 , #1 @si = si<<1
- vmov.s16 r9, d10[0] @Get the sad for block 1
+ vshl.s16 q7, q6 , #1 @si = si<<1
+ vmov.s16 r9, d10[0] @Get the sad for block 1
- vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3)
- vmov.s16 r5, d10[1] @Get the sad for block 2
+ vpadd.s16 d16, d12, d13 @(s1 + s4) (s2 + s3)
+ vmov.s16 r5, d10[1] @Get the sad for block 2
@D16 S14 A14 S23 A23
- vrev32.16 d30, d16 @
- vuzp.s16 d16, d30 @
+ vrev32.16 d30, d16 @
+ vuzp.s16 d16, d30 @
@D16 S14 S23 A14 A23
- vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4)
+ vadd.s16 d17, d12, d13 @(s1 + s2) (s3 + s4)
@D17 S12 S34 A12 A34
- vrev32.16 q9, q7 @Rearrange si's
+ vrev32.16 q9, q7 @Rearrange si's
@Q9 Z4,Z1,Y4,Y1,Z3,Z2,Y3,Y2
@D12 S1 S4 A1 A4
@D19 Z3 Z2 Y3 Y2
- vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1))
+ vsub.s16 d8, d12, d19 @(s1 - (s3<<1)) (s4 - (s2<<1))
@D13 S2 S3 A2 A3
@D18 Z4 Z1 Y4 Y1
- vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1))
+ vsub.s16 d9, d13, d18 @(s2 - (s4<<1)) (s3 - (s1<<1))
@Q10 S8 S5 A8 A5 S7 S4 A7 A4
@D16 S14 S23 A14 A23
- vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
+ vpadd.s16 d10, d16, d17 @I Get sad by adding s1 s2 s3 s4
@D10 SAD1 SAD2 junk junk
- vmov.u16 r9, d10[0] @Get the sad for block 1
- vmov.u16 r5, d10[1] @Get the sad for block 2
+ vmov.u16 r9, d10[0] @Get the sad for block 1
+ vmov.u16 r5, d10[1] @Get the sad for block 2
@Q8 S2 S1 A2 A1 S6 S3 A6 A3
@Q10 S8 S5 A8 A5 S7 S4 A7 A4
- ldrh r11, [r4, #16] @Load the threshold for DC val blk 1
- vtrn.32 q8, q4 @Rearrange to make ls of each block togather
- ADD R10, R10, R9 @Add to the global sad blk 1
+ ldrh r11, [r4, #16] @Load the threshold for DC val blk 1
+ vtrn.32 q8, q4 @Rearrange to make ls of each block togather
+ ADD R10, R10, R9 @Add to the global sad blk 1
@Q8 S2 S1 S8 S5 S6 S3 S7 S4
@Q10 A2 A1 A8 A5 A6 A3 A7 A4
- vld1.u16 {q11}, [r4] @load the threhold
- ADD R10, R10, R5 @Add to the global sad blk 2
+ vld1.u16 {q11}, [r4] @load the threhold
+ ADD R10, R10, R5 @Add to the global sad blk 2
- vdup.u16 q6, d10[0] @Get the sad blk 1
+ vdup.u16 q6, d10[0] @Get the sad blk 1
- cmp r11, r9 @Compare with threshold blk 1
- vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1
+ cmp r11, r9 @Compare with threshold blk 1
+ vshl.u16 q7, q6, #1 @sad_2 = sad_1<<1
- vsub.s16 q9, q7, q8 @Add to the lss
+ vsub.s16 q9, q7, q8 @Add to the lss
- vcle.s16 q15, q11, q9 @Add to the lss
- movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 1
+ vcle.s16 q15, q11, q9 @Add to the lss
+ movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 1
- cmp r11, r5 @Compare with threshold blk 2
- vdup.u16 q14, d10[1] @Get the sad blk 1
+ cmp r11, r5 @Compare with threshold blk 2
+ vdup.u16 q14, d10[1] @Get the sad blk 1
- vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1
- vsub.s16 q12, q13, q4 @Add to the lss
- vcle.s16 q14, q11, q12 @Add to the lss
- movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 2
+ vshl.u16 q13, q14, #1 @sad_2 = sad_1<<1
+ vsub.s16 q12, q13, q4 @Add to the lss
+ vcle.s16 q14, q11, q12 @Add to the lss
+ movle r7, #0xf @If not met mark it by mvoing non zero val to R7 blk 2
- vtrn.u8 q14, q15 @get all comparison bits to one reg
- vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs
- vadd.u8 d28, d28, d29 @Add the bits
- vpadd.u8 d28, d28, d29 @Add the bits
- vmov.u32 r11, d28[0] @Since a set bit now represents a unstatisofrd contifon store it in r11
- orr r7, r7, r11 @get the guy to r11
+ vtrn.u8 q14, q15 @get all comparison bits to one reg
+ vshr.u8 q14, q14, #7 @Shift the bits so that no overflow occurs
+ vadd.u8 d28, d28, d29 @Add the bits
+ vpadd.u8 d28, d28, d29 @Add the bits
+ vmov.u32 r11, d28[0] @Since a set bit now represents a unstatisofrd contifon store it in r11
+ orr r7, r7, r11 @get the guy to r11
- b funcend_sad_16x16 @Since all blocks ar processed nw, got to end
+ b funcend_sad_16x16 @Since all blocks ar processed nw, got to end
compute_sad_only: @This block computes SAD only, so will be lighter
@It will start processing at an odd block
@@ -1237,117 +1245,119 @@ compute_sad_only: @This block computes SAD only, so will b
@and then for two blocks at a time
@The counter is r8, hence r8 blocks will be processed
- and r11, r8, #1 @Get the last bit of counter
- cmp r11, #0 @See if we are at even or odd block
+ and r11, r8, #1 @Get the last bit of counter
+ cmp r11, #0 @See if we are at even or odd block
@If the blk is even we just have to set the pointer to the
@start of current row
- lsleq r11, r2, #2 @I Move back src 4 rows
- subeq r0, r0, r11 @I Move back src 4 rows if we are at even block
+ lsleq r11, r2, #2 @I Move back src 4 rows
+ subeq r0, r0, r11 @I Move back src 4 rows if we are at even block
- lsleq r11, r3, #2 @I Move back pred 4 rows
- subeq r1, r1, r11 @I Move back pred 4 rows if we are at even block
+ lsleq r11, r3, #2 @I Move back pred 4 rows
+ subeq r1, r1, r11 @I Move back pred 4 rows if we are at even block
@ADDEQ R8,R8,#2 ;Inc counter
- beq skip_odd_blk @If the blk is odd we have to compute sad
+ beq skip_odd_blk @If the blk is odd we have to compute sad
- vadd.u16 q4, q0, q1 @Add SAD of row1 and row2
- vadd.u16 q5, q2, q3 @Add SAD of row3 and row4
- vadd.u16 q6, q4, q5 @Add SAD of row 1-4
- vadd.u16 d14, d12, d13 @Add Blk1 and blk2
- vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4
- vpadd.u16 d18, d16, d17 @Add col 12-34
+ vadd.u16 q4, q0, q1 @Add SAD of row1 and row2
+ vadd.u16 q5, q2, q3 @Add SAD of row3 and row4
+ vadd.u16 q6, q4, q5 @Add SAD of row 1-4
+ vadd.u16 d14, d12, d13 @Add Blk1 and blk2
+ vpadd.u16 d16, d14, d15 @Add col 1-2 and 3-4
+ vpadd.u16 d18, d16, d17 @Add col 12-34
- vmov.u16 r9, d18[0] @Move sad to arm
- ADD R10, R10, R9 @Add to the global sad
+ vmov.u16 r9, d18[0] @Move sad to arm
+ ADD R10, R10, R9 @Add to the global sad
- sub r8, r8, #1 @Dec counter
- cmp r8, #0 @See if we processed last block
- beq funcend_sad_16x16 @if lprocessed last block goto end of func
+ sub r8, r8, #1 @Dec counter
+ cmp r8, #0 @See if we processed last block
+ beq funcend_sad_16x16 @if lprocessed last block goto end of func
- sub r0, r0, #8 @Since we processed od block move back src by 8 cols
- sub r1, r1, #8 @Since we processed od block move back pred by 8 cols
+ sub r0, r0, #8 @Since we processed od block move back src by 8 cols
+ sub r1, r1, #8 @Since we processed od block move back pred by 8 cols
skip_odd_blk:
- vmov.s16 q0, #0 @Initialize the accumulator
- vmov.s16 q1, #0 @Initialize the accumulator
+ vmov.s16 q0, #0 @Initialize the accumulator
+ vmov.s16 q1, #0 @Initialize the accumulator
- vld1.u8 {q15}, [r0], r2 @load src r1
- vld1.u8 {q14}, [r1], r3 @load pred r1
+ vld1.u8 {q15}, [r0], r2 @load src r1
+ vld1.u8 {q14}, [r1], r3 @load pred r1
- vld1.u8 {q13}, [r0], r2 @load src r2
- vld1.u8 {q12}, [r1], r3 @load pred r2
+ vld1.u8 {q13}, [r0], r2 @load src r2
+ vld1.u8 {q12}, [r1], r3 @load pred r2
- vld1.u8 {q11}, [r0], r2 @load src r3
- vld1.u8 {q10}, [r1], r3 @load pred r2
+ vld1.u8 {q11}, [r0], r2 @load src r3
+ vld1.u8 {q10}, [r1], r3 @load pred r2
- vld1.u8 {q9}, [r0], r2 @load src r4
- vld1.u8 {q8}, [r1], r3 @load pred r4
+ vld1.u8 {q9}, [r0], r2 @load src r4
+ vld1.u8 {q8}, [r1], r3 @load pred r4
- cmp r8, #2
- beq sad_epilouge
+ cmp r8, #2
+ beq sad_epilouge
sad_loop:
- vabal.u8 q0, d30, d28 @I accumulate Abs diff R1
- vabal.u8 q1, d31, d29 @I accumulate Abs diff R1
+ vabal.u8 q0, d30, d28 @I accumulate Abs diff R1
+ vabal.u8 q1, d31, d29 @I accumulate Abs diff R1
- vld1.u8 {q15}, [r0], r2 @II load r1 src
- vabal.u8 q0, d26, d24 @I accumulate Abs diff R2
+ vld1.u8 {q15}, [r0], r2 @II load r1 src
+ vabal.u8 q0, d26, d24 @I accumulate Abs diff R2
- vld1.u8 {q14}, [r1], r3 @II load r1 pred
- vabal.u8 q1, d27, d25 @I accumulate Abs diff R2
+ vld1.u8 {q14}, [r1], r3 @II load r1 pred
+ vabal.u8 q1, d27, d25 @I accumulate Abs diff R2
- vld1.u8 {q13}, [r0], r2 @II load r3 src
- vabal.u8 q0, d22, d20 @I accumulate Abs diff R3
+ vld1.u8 {q13}, [r0], r2 @II load r3 src
+ vabal.u8 q0, d22, d20 @I accumulate Abs diff R3
- vld1.u8 {q12}, [r1], r3 @II load r2 pred
- vabal.u8 q1, d23, d21 @I accumulate Abs diff R3
+ vld1.u8 {q12}, [r1], r3 @II load r2 pred
+ vabal.u8 q1, d23, d21 @I accumulate Abs diff R3
- vld1.u8 {q11}, [r0], r2 @II load r3 src
- vabal.u8 q0, d18, d16 @I accumulate Abs diff R4
+ vld1.u8 {q11}, [r0], r2 @II load r3 src
+ vabal.u8 q0, d18, d16 @I accumulate Abs diff R4
- sub r8, r8, #2 @Since we processe 16 pix @a time, dec by 2
- vld1.u8 {q10}, [r1], r3 @II load r3 pred
- vabal.u8 q1, d19, d17 @I accumulate Abs diff R4
+ sub r8, r8, #2 @Since we processe 16 pix @a time, dec by 2
+ vld1.u8 {q10}, [r1], r3 @II load r3 pred
+ vabal.u8 q1, d19, d17 @I accumulate Abs diff R4
- cmp r8, #2 @Check if last loop
- vld1.u8 {q9}, [r0], r2 @II load r4 src
- vld1.u8 {q8}, [r1], r3 @II load r4 pred
+ cmp r8, #2 @Check if last loop
+ vld1.u8 {q9}, [r0], r2 @II load r4 src
+ vld1.u8 {q8}, [r1], r3 @II load r4 pred
- bne sad_loop @Go back to SAD computation
+ bne sad_loop @Go back to SAD computation
sad_epilouge:
- vabal.u8 q0, d30, d28 @Accumulate Abs diff R1
- vabal.u8 q1, d31, d29 @Accumulate Abs diff R1
+ vabal.u8 q0, d30, d28 @Accumulate Abs diff R1
+ vabal.u8 q1, d31, d29 @Accumulate Abs diff R1
- vabal.u8 q0, d26, d24 @Accumulate Abs diff R2
- vabal.u8 q1, d27, d25 @Accumulate Abs diff R2
+ vabal.u8 q0, d26, d24 @Accumulate Abs diff R2
+ vabal.u8 q1, d27, d25 @Accumulate Abs diff R2
- vabal.u8 q0, d22, d20 @Accumulate Abs diff R3
- vabal.u8 q1, d23, d21 @Aaccumulate Abs diff R3
+ vabal.u8 q0, d22, d20 @Accumulate Abs diff R3
+ vabal.u8 q1, d23, d21 @Aaccumulate Abs diff R3
- vabal.u8 q0, d18, d16 @Accumulate Abs diff R4
- vabal.u8 q1, d19, d17 @Accumulate Abs diff R4
+ vabal.u8 q0, d18, d16 @Accumulate Abs diff R4
+ vabal.u8 q1, d19, d17 @Accumulate Abs diff R4
- vadd.u16 q2, q0, q1 @ADD two accumulators
- vadd.u16 d6, d4, d5 @Add two blk sad
- vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad
- vpadd.u16 d10, d8, d9 @Add col 12-34 sad
+ vadd.u16 q2, q0, q1 @ADD two accumulators
+ vadd.u16 d6, d4, d5 @Add two blk sad
+ vpadd.u16 d8, d6, d7 @Add col 1-2 and 3-4 sad
+ vpadd.u16 d10, d8, d9 @Add col 12-34 sad
- vmov.u16 r9, d10[0] @move SAD to ARM
- ADD R10, R10, R9 @Add to the global sad
+ vmov.u16 r9, d10[0] @move SAD to ARM
+ ADD R10, R10, R9 @Add to the global sad
funcend_sad_16x16: @End of fucntion process
- ldr r5, [sp, #44]
- ldr r6, [sp, #48]
- str r7, [r6] @Store the is zero reg
- str r10, [r5] @Store sad
+ vpop {d8-d15}
+ ldr r5, [sp, #44]
+ ldr r6, [sp, #48]
+
+ str r7, [r6] @Store the is zero reg
+ str r10, [r5] @Store sad
@SUB SP,SP,#40
- pop {r4-r12, pc}
+ pop {r4-r12, pc}
diff --git a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
index c442077..e768c21 100644
--- a/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
+++ b/encoder/armv8/ih264e_evaluate_intra16x16_modes_av8.s
@@ -76,7 +76,7 @@
.p2align 2
.include "ih264_neon_macros.s"
-.globl ih264e_evaluate_intra16x16_modes_av8
+.global ih264e_evaluate_intra16x16_modes_av8
ih264e_evaluate_intra16x16_modes_av8:
diff --git a/encoder/armv8/ih264e_half_pel_av8.s b/encoder/armv8/ih264e_half_pel_av8.s
index 6dbd8f8..817faa6 100644
--- a/encoder/armv8/ih264e_half_pel_av8.s
+++ b/encoder/armv8/ih264e_half_pel_av8.s
@@ -1015,10 +1015,3 @@ filter_2dvh_skip_row:
///*****************************************
-
-
-
-
-
-
- .section .note.gnu-stack,"",%progbits
diff --git a/encoder/armv8/ime_distortion_metrics_av8.s b/encoder/armv8/ime_distortion_metrics_av8.s
index 99ebc8a..47c3425 100644
--- a/encoder/armv8/ime_distortion_metrics_av8.s
+++ b/encoder/armv8/ime_distortion_metrics_av8.s
@@ -975,4 +975,3 @@ satdq_end_func:
ldp d8, d9, [sp], #16
pop_v_regs
ret
- .section .note.gnu-stack,"",%progbits
diff --git a/encoder/x86/ih264e_intra_modes_eval_ssse3.c b/encoder/x86/ih264e_intra_modes_eval_ssse3.c
index 657921f..0f4a9ad 100644
--- a/encoder/x86/ih264e_intra_modes_eval_ssse3.c
+++ b/encoder/x86/ih264e_intra_modes_eval_ssse3.c
@@ -487,7 +487,7 @@ void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src,
INT_MAX, INT_MAX, INT_MAX, INT_MAX };
WORD32 min_cost;
- WORD32 lambda4 = u4_lambda << 2;
+ UWORD32 lambda4 = u4_lambda << 2;
WORD32 dst_strd2, dst_strd3;
__m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b;
diff --git a/encoder/x86/ime_distortion_metrics_sse42.c b/encoder/x86/ime_distortion_metrics_sse42.c
index 0876788..baf18a4 100644
--- a/encoder/x86/ime_distortion_metrics_sse42.c
+++ b/encoder/x86/ime_distortion_metrics_sse42.c
@@ -110,6 +110,7 @@ void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src,
__m128i res_r0, res_r1, res_r2, res_r3;
__m128i sad_val;
int val1, val2;
+ UNUSED (i4_max_sad);
// Row 0-3 sad calculation
src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
@@ -248,6 +249,7 @@ void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
WORD32 i4_max_sad,
WORD32 *pi4_mb_distortion)
{
+ UNUSED (i4_max_sad);
__m128i src_r0, src_r1, src_r2, src_r3;
__m128i est_r0, est_r1, est_r2, est_r3;
__m128i res_r0, res_r1, res_r2, res_r3;
@@ -498,6 +500,7 @@ void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
WORD32 i4_max_sad,
WORD32 *pi4_mb_distortion)
{
+ UNUSED (i4_max_sad);
__m128i src_r0, src_r1, src_r2, src_r3;
__m128i est_r0, est_r1, est_r2, est_r3;
__m128i res_r0, res_r1, res_r2, res_r3;
diff --git a/test/encoder/main.c b/test/encoder/main.c
index 2a9635d..bb9cabf 100644
--- a/test/encoder/main.c
+++ b/test/encoder/main.c
@@ -29,7 +29,10 @@
#include <assert.h>
#include <string.h>
#include <sys/time.h>
+
+#ifndef IOS
#include <malloc.h>
+#endif
#ifdef WINDOWS_TIMER
#include "windows.h"
@@ -1989,7 +1992,7 @@ void synchronous_encode(iv_obj_t *ps_enc, app_ctxt_t *ps_app_ctxt)
/* 20 11 2013 100189 Initial Version */
/*****************************************************************************/
#ifdef IOS
-int h264enc_main(char * homedir)
+int h264enc_main(char * homedir,char *documentdir, int screen_wd, int screen_ht)
#else
int main(int argc, char *argv[])
#endif
@@ -2036,6 +2039,9 @@ int main(int argc, char *argv[])
strcpy(ac_cfg_fname, argv[1]);
}
+#else
+ strcpy(ac_cfg_fname, "test.cfg");
+
#endif
/*************************************************************************/
@@ -2406,22 +2412,22 @@ int main(int argc, char *argv[])
#ifdef IOS
/* Correct file paths */
- sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_ip_fname);
+ sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_ip_fname);
strcpy (s_app_ctxt.ac_ip_fname, filename_with_path);
- sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_op_fname);
+ sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_op_fname);
strcpy (s_app_ctxt.ac_op_fname, filename_with_path);
- sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_recon_fname);
+ sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_recon_fname);
strcpy (s_app_ctxt.ac_recon_fname, filename_with_path);
- sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_chksum_fname);
+ sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_chksum_fname);
strcpy (s_app_ctxt.ac_chksum_fname, filename_with_path);
- sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_mb_info_fname);
+ sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_mb_info_fname);
strcpy (s_app_ctxt.ac_mb_info_fname, filename_with_path);
- sprintf(filename_with_path, "%s/%s", homedir, s_app_ctxt.ac_pic_info_fname);
+ sprintf(filename_with_path, "%s/%s", documentdir, s_app_ctxt.ac_pic_info_fname);
strcpy (s_app_ctxt.ac_pic_info_fname, filename_with_path);
#endif