/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
 *******************************************************************************
 * @file
 *  ih264_inter_pred_filters.c
 *
 * @brief
 *  Contains function definitions for inter prediction interpolation filters
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - ih264_inter_pred_luma_copy
 *  - ih264_interleave_copy
 *  - ih264_inter_pred_luma_horz
 *  - ih264_inter_pred_luma_vert
 *  - ih264_inter_pred_luma_horz_hpel_vert_hpel
 *  - ih264_inter_pred_luma_horz_qpel
 *  - ih264_inter_pred_luma_vert_qpel
 *  - ih264_inter_pred_luma_horz_qpel_vert_qpel
 *  - ih264_inter_pred_luma_horz_hpel_vert_qpel
 *  - ih264_inter_pred_luma_horz_qpel_vert_hpel
 *  - ih264_inter_pred_luma_bilinear
 *  - ih264_inter_pred_chroma
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_inter_pred_filters.h"


/*****************************************************************************/
/* Constant Data variables                                                   */
/*****************************************************************************/

/* Unique coefficients of the symmetric 6 tap filter (1, -5, 20, 20, -5, 1). */
/* As used by the filters in this file: index 0 weights the outermost pair   */
/* of samples, index 1 the middle pair and index 2 the innermost pair.       */
const WORD32 ih264_g_six_tap[3] ={1,-5,20};


/*****************************************************************************/
/*  Function definitions .                                                   */
/*****************************************************************************/
/**
 *******************************************************************************
 *
 * @brief
 *  Inter prediction luma function for copy (no filtering)
 *
 * @par Description:
 *  Copies a block of 'wd' bytes by 'ht' rows, one byte at a time, from
 *  'pu1_src' to 'pu1_dst'. Successive rows are 'src_strd' bytes apart in the
 *  source and 'dst_strd' bytes apart in the destination.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ht
 *  integer height of the array
 *
 * @param[in] wd
 *  integer width of the array
 *
 * @param[in] pu1_tmp
 *  temporary buffer: UNUSED in this function
 *
 * @param[in] dydx
 *  x and y reference offset: UNUSED in this function
 *
 * @returns
 *  None
 *
 * @remarks
 *  Nothing is copied when 'ht' or 'wd' is not positive.
 *
 *******************************************************************************
 */

void ih264_inter_pred_luma_copy(UWORD8 *pu1_src,
                                UWORD8 *pu1_dst,
                                WORD32 src_strd,
                                WORD32 dst_strd,
                                WORD32 ht,
                                WORD32 wd,
                                UWORD8* pu1_tmp,
                                WORD32 dydx)
{
    WORD32 rows_left, x;
    UNUSED(pu1_tmp);
    UNUSED(dydx);

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        /* Copy one row, left to right */
        for(x = 0; x < wd; x++)
        {
            pu1_dst[x] = pu1_src[x];
        }

        /* Step both pointers to the start of the next row */
        pu1_dst += dst_strd;
        pu1_src += src_strd;
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *  Function for copying to an interleaved destination
 *
 * @par Description:
 *  For each of 'ht' rows, copies 'wd' bytes located at even byte offsets
 *  (0, 2, 4, ...) of the source row to the same offsets of the destination
 *  row.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ht
 *  integer height of the array
 *
 * @param[in] wd
 *  integer width of the array, counted in copied elements; 2 * wd bytes of
 *  each row are spanned
 *
 * @returns
 *  None
 *
 * @remarks
 *  The alternate elements of src will be copied to alternate locations in dst
 *  Other locations are not touched
 *
 *******************************************************************************
 */
void ih264_interleave_copy(UWORD8 *pu1_src,
                           UWORD8 *pu1_dst,
                           WORD32 src_strd,
                           WORD32 dst_strd,
                           WORD32 ht,
                           WORD32 wd)
{
    WORD32 y, x;
    /* Each copied element is followed by one byte that is skipped */
    WORD32 row_bytes = wd * 2;

    for(y = 0; y < ht; y++)
    {
        x = 0;
        while(x < row_bytes)
        {
            pu1_dst[x] = pu1_src[x];
            x += 2;
        }

        pu1_dst += dst_strd;
        pu1_src += src_strd;
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *  Inter prediction luma filter for horizontal half-pel positions
 *
 * @par Description:
 *  Applies the 6 tap filter (1, -5, 20, 20, -5, 1) along each row. The
 *  filtered sum is rounded, divided by 32 and clipped to 8 bits.
 *  sec 8.4.2.2.1 titled "Luma sample interpolation process"
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source. For every output column 'c' the samples at
 *  columns c - 2 .. c + 3 of the same row are read.
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ht
 *  integer height of the array
 *
 * @param[in] wd
 *  integer width of the array
 *
 * @param[in] pu1_tmp
 *  temporary buffer: UNUSED in this function
 *
 * @param[in] dydx
 *  x and y reference offset: UNUSED in this function
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_inter_pred_luma_horz(UWORD8 *pu1_src,
                                UWORD8 *pu1_dst,
                                WORD32 src_strd,
                                WORD32 dst_strd,
                                WORD32 ht,
                                WORD32 wd,
                                UWORD8* pu1_tmp,
                                WORD32 dydx)
{
    WORD32 y, x;
    UNUSED(pu1_tmp);
    UNUSED(dydx);

    for(y = 0; y < ht; y++, pu1_src += src_strd, pu1_dst += dst_strd)
    {
        for(x = 0; x < wd; x++)
        {
            /* Sums of the symmetric sample pairs, outermost to innermost */
            WORD32 outer = pu1_src[x - 2] + pu1_src[x + 3];
            WORD32 middle = pu1_src[x - 1] + pu1_src[x + 2];
            WORD32 inner = pu1_src[x] + pu1_src[x + 1];
            WORD16 i2_sum;

            /* ih264_g_six_tap[] holds the filter coefficients */
            i2_sum = ih264_g_six_tap[0] * outer
                     + ih264_g_six_tap[1] * middle
                     + ih264_g_six_tap[2] * inner;

            /* Round, scale down by 32 and clip to 8 bits */
            i2_sum = (i2_sum + 16) >> 5;
            pu1_dst[x] = CLIP_U8(i2_sum);
        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *  Inter prediction luma filter for vertical half-pel positions
 *
 * @par Description:
 *  Applies the 6 tap filter (1, -5, 20, 20, -5, 1) along each column. The
 *  filtered sum is rounded, divided by 32 and clipped to 8 bits.
 *  sec 8.4.2.2.1 titled "Luma sample interpolation process"
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source. For every output row 'r' the samples at
 *  rows r - 2 .. r + 3 of the same column are read.
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ht
 *  integer height of the array
 *
 * @param[in] wd
 *  integer width of the array
 *
 * @param[in] pu1_tmp
 *  temporary buffer: UNUSED in this function
 *
 * @param[in] dydx
 *  x and y reference offset: UNUSED in this function
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_inter_pred_luma_vert(UWORD8 *pu1_src,
                                UWORD8 *pu1_dst,
                                WORD32 src_strd,
                                WORD32 dst_strd,
                                WORD32 ht,
                                WORD32 wd,
                                UWORD8* pu1_tmp,
                                WORD32 dydx)
{
    WORD32 y, x;
    UNUSED(pu1_tmp);
    UNUSED(dydx);

    for(y = 0; y < ht; y++, pu1_src += src_strd, pu1_dst += dst_strd)
    {
        for(x = 0; x < wd; x++)
        {
            /* Sums of the symmetric sample pairs, outermost to innermost */
            WORD32 outer = pu1_src[x - 2 * src_strd] + pu1_src[x + 3 * src_strd];
            WORD32 middle = pu1_src[x - 1 * src_strd] + pu1_src[x + 2 * src_strd];
            WORD32 inner = pu1_src[x] + pu1_src[x + 1 * src_strd];
            WORD16 i2_sum;

            /* ih264_g_six_tap[] holds the filter coefficients */
            i2_sum = ih264_g_six_tap[0] * outer
                     + ih264_g_six_tap[1] * middle
                     + ih264_g_six_tap[2] * inner;

            /* Round, scale down by 32 and clip to 8 bits */
            i2_sum = (i2_sum + 16) >> 5;
            pu1_dst[x] = CLIP_U8(i2_sum);
        }
    }
}

/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_hpel \endif
 *
 * \brief
 *    This function implements a two stage cascaded six tap filter. It
 *    first applies the six tap filter in the vertical direction on the
 *    predictor values and keeps the unrounded 16 bit sums in pu1_tmp. It
 *    then applies the same filter in the horizontal direction on those
 *    sums, rounds, divides by 1024 and clips to 8 bits. The six tap
 *    filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
 *    interpolation process"
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 *     Rows -2 .. ht + 2 and columns -2 .. wd + 2 relative to pu1_src are
 *     read.
 * \param pu1_dst: Pointer to the destination buffer where the output of
 *     the six tap filter is stored.
 * \param src_strd: Stride of the buffer pointed to by pu1_src.
 * \param dst_strd: Stride of the destination buffer
 * \param ht: Height of the rectangular pixel grid to be interpolated
 * \param wd: Width of the rectangular pixel grid to be interpolated
 * \param pu1_tmp: temporary buffer. It is accessed as an array of WORD16
 *     and ht * (wd + 5) elements of it are written.
 * \param dydx: x and y reference offset for qpel calculations: UNUSED in this function.
 *
 * \return
 *    None.
 *
 * \note
 *    All source samples are consumed by the first stage before the first
 *    byte of pu1_dst is written.
 *
 **************************************************************************
 */
void ih264_inter_pred_luma_horz_hpel_vert_hpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 y, x;
    WORD32 sum;
    /* Each intermediate row holds wd outputs plus 2 left and 3 right taps */
    WORD32 tmp_strd = wd + 5;
    WORD16 *pi2_stage1;
    WORD16 *pi2_row;
    UNUSED(dydx);

    /* Skip two elements so that index -2 of a row is its first element */
    pi2_stage1 = ((WORD16 *)pu1_tmp) + 2;

    /* Stage 1: vertical six tap filter, unrounded sums kept in 16 bits */
    pi2_row = pi2_stage1;
    for(y = 0; y < ht; y++)
    {
        for(x = -2; x < wd + 3; x++)
        {
            /* ih264_g_six_tap[] holds the filter coefficients */
            sum = ih264_g_six_tap[0] *
                            (pu1_src[x - 2 * src_strd] + pu1_src[x + 3 * src_strd])
                  + ih264_g_six_tap[1] *
                            (pu1_src[x - 1 * src_strd] + pu1_src[x + 2 * src_strd])
                  + ih264_g_six_tap[2] *
                            (pu1_src[x] + pu1_src[x + 1 * src_strd]);
            pi2_row[x] = sum;
        }
        pi2_row += tmp_strd;
        pu1_src += src_strd;
    }

    /* Stage 2: horizontal six tap filter on the intermediate sums */
    pi2_row = pi2_stage1;
    for(y = 0; y < ht; y++)
    {
        for(x = 0; x < wd; x++)
        {
            sum = ih264_g_six_tap[0] * (pi2_row[x - 2] + pi2_row[x + 3])
                  + ih264_g_six_tap[1] * (pi2_row[x - 1] + pi2_row[x + 2])
                  + ih264_g_six_tap[2] * (pi2_row[x] + pi2_row[x + 1]);
            /* Two cascaded filters: round and scale down by 32 * 32 */
            sum = (sum + 512) >> 10;
            pu1_dst[x] = CLIP_U8(sum);
        }
        pu1_dst += dst_strd;
        pi2_row += tmp_strd;
    }
}

/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_qpel \endif
 *
 * \brief
 *    This routine applies the six tap filter to the predictors in the
 *    horizontal direction and averages the clipped result with a
 *    neighbouring full-pel sample to obtain horizontal quarter-pel
 *    samples. The six tap filtering operation is described in
 *    sec 8.4.2.2.1 titled "Luma sample interpolation process"
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 *     For every output column 'c', columns c - 2 .. c + 3 of the same row
 *     are read.
 * \param pu1_dst: Pointer to the destination buffer where the output of
 *     the six tap filter followed by averaging is stored.
 * \param src_strd: Stride of the buffer pointed to by pu1_src.
 * \param dst_strd: Stride of the destination buffer
 * \param ht: Height of the rectangular pixel grid to be interpolated
 * \param wd: Width of the rectangular pixel grid to be interpolated
 * \param pu1_tmp: temporary buffer: UNUSED in this function
 * \param dydx: x and y reference offset for qpel calculations. Only the
 *     two least significant bits (x offset) are used: for x offsets 0 and 1
 *     the half-pel value is averaged with the sample in the same column,
 *     for x offsets 2 and 3 with the sample in the next column.
 *
 * \return
 *    None.
 *
 * \note
 *    Each output sample is written as soon as it is computed, while later
 *    outputs still read neighbouring samples from pu1_src. pu1_dst should
 *    therefore not overlap the source samples that are still to be read.
 *
 **************************************************************************
 */
void ih264_inter_pred_luma_horz_qpel(UWORD8 *pu1_src,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd,
                                     WORD32 dst_strd,
                                     WORD32 ht,
                                     WORD32 wd,
                                     UWORD8* pu1_tmp,
                                     WORD32 dydx)
{
    WORD32 row, col;
    UWORD8 *pu1_pred1;
    WORD32 x_offset = dydx & 0x3;
    UNUSED(pu1_tmp);

    /* Full-pel sample that is averaged with the half-pel value */
    pu1_pred1 = pu1_src + (x_offset >> 1);

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD16 i2_temp;

            /* Six tap filter (1, -5, 20, 20, -5, 1). Plain multiplications */
            /* are used because the pair differences can be negative and    */
            /* left shifting a negative value is undefined behaviour in C.  */
            i2_temp = pu1_src[col - 2] + pu1_src[col + 3]
                      - 5 * (pu1_src[col - 1] + pu1_src[col + 2])
                      + 20 * (pu1_src[col] + pu1_src[col + 1]);

            /* Round, scale down by 32 and clip to 8 bits */
            i2_temp = (i2_temp + 16) >> 5;
            i2_temp = CLIP_U8(i2_temp);

            /* Rounded average of the half-pel and full-pel samples */
            pu1_dst[col] = (i2_temp + pu1_pred1[col] + 1) >> 1;
        }
        pu1_dst += dst_strd;
        pu1_src += src_strd;
        pu1_pred1 += src_strd;
    }
}

/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_vert_qpel \endif
 *
 * \brief
 *    This routine applies the six tap filter to the predictors in the
 *    vertical direction and interpolates them to obtain pixels at quarter vertical
 *    positions (0, 1/4) and (0, 3/4). The six tap filtering operation is
 *    described in sec 8.4.2.2.1 titled "Luma sample interpolation process"
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 *     For every output row 'r', rows r - 2 .. r + 3 of the same column
 *     are read.
 * \param pu1_dst: Pointer to the destination buffer where the output of
 *     the six tap filter followed by averaging is stored.
 * \param src_strd: Stride of the buffer pointed to by pu1_src.
 * \param dst_strd: Stride of the destination buffer
 * \param ht: Height of the rectangular pixel grid to be interpolated
 * \param wd: Width of the rectangular pixel grid to be interpolated
 * \param pu1_tmp: temporary buffer: UNUSED in this function
 * \param dydx: x and y reference offset for qpel calculations. Only bits
 *     2 and 3 (y offset) are used: for y offsets 0 and 1 the half-pel value
 *     is averaged with the sample in the same row, for y offsets 2 and 3
 *     with the sample in the next row.
 *
 * \return
 *    void
 *
 * \note
 *    Each output sample is written as soon as it is computed, while later
 *    outputs still read neighbouring rows from pu1_src. pu1_dst should
 *    therefore not overlap the source samples that are still to be read.
 *
 **************************************************************************
 */
void ih264_inter_pred_luma_vert_qpel(UWORD8 *pu1_src,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd,
                                     WORD32 dst_strd,
                                     WORD32 ht,
                                     WORD32 wd,
                                     UWORD8* pu1_tmp,
                                     WORD32 dydx)
{
    WORD32 row, col;
    WORD32 y_offset = (dydx >> 2) & 0x3;
    WORD32 off1, off2, off3;
    UWORD8 *pu1_pred1;
    UNUSED(pu1_tmp);

    /* Byte offsets of the rows one, two and three lines away */
    off1 = src_strd;
    off2 = src_strd << 1;
    off3 = off1 + off2;

    /* Full-pel sample that is averaged with the half-pel value */
    pu1_pred1 = pu1_src + (y_offset >> 1) * src_strd;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD16 i2_temp;

            /* Six tap filter (1, -5, 20, 20, -5, 1). Plain multiplications */
            /* are used because the pair differences can be negative and    */
            /* left shifting a negative value is undefined behaviour in C.  */
            i2_temp = pu1_src[col - off2] + pu1_src[col + off3]
                      - 5 * (pu1_src[col - off1] + pu1_src[col + off2])
                      + 20 * (pu1_src[col] + pu1_src[col + off1]);

            /* Round, scale down by 32 and clip to 8 bits */
            i2_temp = (i2_temp + 16) >> 5;
            i2_temp = CLIP_U8(i2_temp);

            /* Rounded average of the half-pel and full-pel samples */
            pu1_dst[col] = (i2_temp + pu1_pred1[col] + 1) >> 1;
        }
        pu1_src += src_strd;
        pu1_pred1 += src_strd;
        pu1_dst += dst_strd;
    }
}

/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_qpel_vert_qpel \endif
 *
 * \brief
 *    This routine applies the six tap filter to the predictors in the
 *    vertical and horizontal direction and averages them to get pixels at locations
 *    (1/4,1/4), (1/4, 3/4), (3/4, 1/4) & (3/4, 3/4). The six tap filtering operation
 *    is described in sec 8.4.2.2.1 titled "Luma sample interpolation process"
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 *     pu1_src could point to the frame buffer or the predictor buffer.
 * \param pu1_dst: Pointer to the destination buffer where the average of
 *     the two clipped six tap filter outputs is stored.
 * \param src_strd: Stride of the buffer pointed to by pu1_src.
 * \param dst_strd: Stride of the destination buffer
 * \param ht: Height of the rectangular pixel grid to be interpolated
 * \param wd: Width of the rectangular pixel grid to be interpolated
 * \param pu1_tmp: temporary buffer, UNUSED in this function
 * \param dydx: x and y reference offset for qpel calculations. Bits 0-1
 *     hold the x offset and bits 2-3 the y offset. The vertical filter runs
 *     on the next column for x offsets 2 and 3, and the horizontal filter
 *     runs on the next row for y offsets 2 and 3.
 *
 * \return
 *    void
 *
 * \note
 *    Each output sample is written as soon as it is computed, while later
 *    outputs still read neighbouring samples from the source. pu1_dst
 *    should therefore not overlap the source samples that are still to be
 *    read.
 *
 **************************************************************************
 */
void ih264_inter_pred_luma_horz_qpel_vert_qpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 row, col;
    WORD32 x_offset = dydx & 0x3;
    WORD32 y_offset = (dydx >> 2) & 0x3;
    WORD32 off1, off2, off3;
    UWORD8 *pu1_pred_vert, *pu1_pred_horz;
    UNUSED(pu1_tmp);

    /* Byte offsets of the rows one, two and three lines away */
    off1 = src_strd;
    off2 = src_strd << 1;
    off3 = off1 + off2;

    /* Row on which the horizontal filter is centred */
    pu1_pred_horz = pu1_src + (y_offset >> 1) * src_strd;
    /* Column on which the vertical filter is centred */
    pu1_pred_vert = pu1_src + (x_offset >> 1);

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD16 i2_temp_vert, i2_temp_horz;

            /* Six tap filters (1, -5, 20, 20, -5, 1). Plain multiplications */
            /* are used because the pair differences can be negative and     */
            /* left shifting a negative value is undefined behaviour in C.   */

            /* Vertical half-pel sample */
            i2_temp_vert = pu1_pred_vert[col - off2] + pu1_pred_vert[col + off3]
                           - 5 * (pu1_pred_vert[col - off1]
                                           + pu1_pred_vert[col + off2])
                           + 20 * (pu1_pred_vert[col]
                                           + pu1_pred_vert[col + off1]);
            i2_temp_vert = (i2_temp_vert + 16) >> 5;
            i2_temp_vert = CLIP_U8(i2_temp_vert);

            /* Horizontal half-pel sample */
            i2_temp_horz = pu1_pred_horz[col - 2] + pu1_pred_horz[col + 3]
                           - 5 * (pu1_pred_horz[col - 1]
                                           + pu1_pred_horz[col + 2])
                           + 20 * (pu1_pred_horz[col]
                                           + pu1_pred_horz[col + 1]);
            i2_temp_horz = (i2_temp_horz + 16) >> 5;
            i2_temp_horz = CLIP_U8(i2_temp_horz);

            /* Rounded average of the two half-pel samples */
            pu1_dst[col] = (i2_temp_vert + i2_temp_horz + 1) >> 1;
        }
        pu1_pred_vert += src_strd;
        pu1_pred_horz += src_strd;
        pu1_dst += dst_strd;
    }
}

/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_qpel_vert_hpel \endif
 *
 * \brief
 *    This routine applies the six tap filter to the predictors in the vertical
 *    and horizontal direction to obtain the pixel at (1/2,1/2). It then interpolates
 *    pixel at (0,1/2) and (1/2,1/2) to obtain pixel at (1/4,1/2). Similarly for (3/4,1/2).
 *    The six tap filtering operation is described in sec 8.4.2.2.1 titled
 *    "Luma sample interpolation process"
 *
 * \param pu1_src: Pointer to the buffer containing the predictor values.
 *     Rows -2 .. ht + 2 and columns -2 .. wd + 2 relative to pu1_src are
 *     read.
 * \param pu1_dst: Pointer to the destination buffer where the output of
 *     the six tap filter followed by interpolation is stored.
 * \param src_strd: Stride of the buffer pointed to by pu1_src.
 * \param dst_strd: Stride of the destination buffer
 * \param ht: Height of the rectangular pixel grid to be interpolated
 * \param wd: Width of the rectangular pixel grid to be interpolated
 * \param pu1_tmp: buffer to store temporary output after 1st 6-tap filter.
 *     It is accessed as an array of WORD16 and ht * (wd + 5) elements of
 *     it are written.
 * \param dydx: x and y reference offset for qpel calculations. Only the
 *     two least significant bits (x offset) are used: the vertically
 *     filtered column used for the final average is the same column for
 *     x offsets 0 and 1 and the next column for x offsets 2 and 3.
 *
 * \return
 *    void
 *
 * \note
 *    The function works in three passes: a vertical six tap filter into
 *    pu1_tmp, a horizontal six tap filter on pu1_tmp into pu1_dst, and a
 *    final pass that averages pu1_dst with the rounded and clipped vertical
 *    filter output. The final pass overwrites the entries of pu1_tmp it
 *    reads with their rounded values.
 *
 **************************************************************************
 */
void ih264_inter_pred_luma_horz_qpel_vert_hpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 y, x;
    WORD32 sum;
    WORD32 x_offset = dydx & 0x3;
    /* Each intermediate row holds wd outputs plus 2 left and 3 right taps */
    WORD32 tmp_strd = wd + 5;
    WORD16 *pi2_stage1;
    WORD16 *pi2_row;
    UWORD8 *pu1_out;

    /* Skip two elements so that index -2 of a row is its first element */
    pi2_stage1 = ((WORD16 *)pu1_tmp) + 2;

    /* Pass 1: vertical six tap filter, unrounded sums kept in 16 bits */
    pi2_row = pi2_stage1;
    for(y = 0; y < ht; y++)
    {
        for(x = -2; x < wd + 3; x++)
        {
            /* ih264_g_six_tap[] holds the filter coefficients */
            sum = ih264_g_six_tap[0] *
                            (pu1_src[x - 2 * src_strd] + pu1_src[x + 3 * src_strd])
                  + ih264_g_six_tap[1] *
                            (pu1_src[x - 1 * src_strd] + pu1_src[x + 2 * src_strd])
                  + ih264_g_six_tap[2] *
                            (pu1_src[x] + pu1_src[x + 1 * src_strd]);
            pi2_row[x] = sum;
        }
        pi2_row += tmp_strd;
        pu1_src += src_strd;
    }

    /* Pass 2: horizontal six tap filter on the intermediate sums gives */
    /* the (1/2,1/2) samples                                             */
    pi2_row = pi2_stage1;
    pu1_out = pu1_dst;
    for(y = 0; y < ht; y++)
    {
        for(x = 0; x < wd; x++)
        {
            sum = ih264_g_six_tap[0] * (pi2_row[x - 2] + pi2_row[x + 3])
                  + ih264_g_six_tap[1] * (pi2_row[x - 1] + pi2_row[x + 2])
                  + ih264_g_six_tap[2] * (pi2_row[x] + pi2_row[x + 1]);
            /* Two cascaded filters: round and scale down by 32 * 32 */
            sum = (sum + 512) >> 10;
            pu1_out[x] = CLIP_U8(sum);
        }
        pu1_out += dst_strd;
        pi2_row += tmp_strd;
    }

    /* Pass 3: average the (1/2,1/2) samples with the vertical half-pel */
    /* samples of the column selected by the x offset                    */
    pi2_row = pi2_stage1 + (x_offset >> 1);
    pu1_out = pu1_dst;
    for(y = ht; y != 0; y--)
    {
        for(x = 0; x != wd; x++)
        {
            WORD16 i2_vert;
            UWORD8 u1_vert;

            /* Round and scale the first stage output; the rounded value */
            /* is stored back into the temporary buffer                   */
            pi2_row[x] = (pi2_row[x] + 16) >> 5;

            /* CLIP_U8 is a macro, so it is given a plain local variable */
            i2_vert = pi2_row[x];
            u1_vert = CLIP_U8(i2_vert);

            pu1_out[x] = (pu1_out[x] + u1_vert + 1) >> 1;
        }
        pi2_row += tmp_strd;
        pu1_out += dst_strd;
    }
}

/*!
 **************************************************************************
 * \if Function name : ih264_inter_pred_luma_horz_hpel_vert_qpel \endif
 *
 * \brief
 *    Produces the luma prediction for a position that is half-sample in the
 *    horizontal direction and quarter-sample in the vertical direction,
 *    i.e. (1/2,1/4) or (1/2,3/4). The six tap filtering operation is
 *    described in sec 8.4.2.2.1 titled "Luma sample interpolation process".
 *
 *    The result is built in three passes:
 *      1. Every source row from 2 above to 3 below the block is filtered
 *         horizontally with the six tap filter. The un-normalised sums are
 *         kept at 16 bit precision in pu1_tmp.
 *      2. The six tap filter is applied vertically to those sums, the result
 *         is rounded with (x + 512) >> 10, clipped to 8 bit and written to
 *         pu1_dst. This is the (1/2,1/2) sample.
 *      3. One row of horizontal sums per output row is rounded with
 *         (x + 16) >> 5, clipped to 8 bit and averaged (with rounding) into
 *         pu1_dst. The row used is the current row when bit 3 of dydx is
 *         clear and the row below when it is set, i.e. the (1/2,0) or the
 *         (1/2,1) sample.
 *
 * \param pu1_src: Pointer to the top-left integer sample of the block in the
 *     reference buffer. Samples from 2 rows above to 3 rows below, and from
 *     2 columns left to 3 columns right of the ht x wd block are read.
 * \param pu1_dst: Pointer to the destination buffer that receives the
 *     interpolated block.
 * \param src_strd: Stride of the buffer pointed to by pu1_src.
 * \param dst_strd: Stride of the destination buffer.
 * \param ht: Height of the rectangular pixel grid to be interpolated.
 * \param wd: Width of the rectangular pixel grid to be interpolated.
 * \param pu1_tmp: Scratch buffer, accessed as WORD16, that holds the output
 *     of the horizontal six tap filter. (ht + 5) * wd WORD16 entries are
 *     written. NOTE(review): presumably the caller guarantees WORD16
 *     alignment - confirm.
 * \param dydx: x and y reference offset for qpel calculations. Only the
 *     vertical offset, (dydx >> 2) & 3, is used by this function.
 *
 * \return
 *    void
 *
 * \note
 *    As a side effect of pass 3, the rows of pu1_tmp that take part in the
 *    final average are overwritten with their rounded ((x + 16) >> 5) values.
 *
 **************************************************************************
 */
void ih264_inter_pred_luma_horz_hpel_vert_qpel(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ht,
                                               WORD32 wd,
                                               UWORD8* pu1_tmp,
                                               WORD32 dydx)
{
    WORD32 row, col;
    WORD32 i4_acc;
    /* Vertical quarter-sample offset */
    WORD32 y_frac = (dydx >> 2) & 0x3;
    /* Horizontal filter output: row r of the block lives at (r + 2) * wd */
    WORD16 *pi2_hpel = (WORD16 *)pu1_tmp;
    WORD16 *pi2_hpel_row0 = pi2_hpel + 2 * wd;
    WORD16 *pi2_line;
    UWORD8 *pu1_line;

    /* Pass 1: horizontal six tap filter on rows -2 .. ht + 2 */
    pu1_line = pu1_src - 2 * src_strd;
    pi2_line = pi2_hpel;
    for(row = -2; row < ht + 3; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* ih264_g_six_tap[] is the array containing the filter coeffs */
            i4_acc = ih264_g_six_tap[0] * (pu1_line[col - 2] + pu1_line[col + 3])
                     + ih264_g_six_tap[1] * (pu1_line[col - 1] + pu1_line[col + 2])
                     + ih264_g_six_tap[2] * (pu1_line[col] + pu1_line[col + 1]);
            pi2_line[col] = i4_acc;
        }

        pu1_line += src_strd;
        pi2_line += wd;
    }

    /* Pass 2: vertical six tap filter on the horizontal sums -> (1/2,1/2) */
    pi2_line = pi2_hpel_row0;
    pu1_line = pu1_dst;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            i4_acc = ih264_g_six_tap[0] * (pi2_line[col - 2 * wd] + pi2_line[col + 3 * wd])
                     + ih264_g_six_tap[1] * (pi2_line[col - 1 * wd] + pi2_line[col + 2 * wd])
                     + ih264_g_six_tap[2] * (pi2_line[col] + pi2_line[col + 1 * wd]);
            i4_acc = (i4_acc + 512) >> 10;
            pu1_line[col] = CLIP_U8(i4_acc);
        }

        pi2_line += wd;
        pu1_line += dst_strd;
    }

    /* Pass 3: average with the horizontal half-sample row picked by y_frac
     * (current row for y_frac < 2, next row otherwise) */
    pi2_line = pi2_hpel_row0 + (y_frac >> 1) * wd;
    pu1_line = pu1_dst;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD16 i2_hpel;
            UWORD8 u1_hpel;

            /* Normalise the first stage output and store it back in the
             * scratch buffer before clipping it to 8 bit */
            i2_hpel = (pi2_line[col] + 16) >> 5;
            pi2_line[col] = i2_hpel;
            u1_hpel = CLIP_U8(i2_hpel);

            pu1_line[col] = (pu1_line[col] + u1_hpel + 1) >> 1;
        }

        pi2_line += wd;
        pu1_line += dst_strd;
    }
}

/**
 *******************************************************************************
 *  function:ih264_inter_pred_luma_bilinear
 *
 * @brief
 *    This routine applies the bilinear filter to the predictors.
 *    The filtering operation is described in
 *    sec 8.4.2.2.1 titled "Luma sample interpolation process"
 *
 * @par Description:
 *    Every output sample is the rounded average, (a + b + 1) >> 1, of the
 *    two samples found at the same row and column of the two input arrays.
 *    This function is called to obtain pixels lying at the following
 *    locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2),
 *    (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
 *
 * @param[in] pu1_src1
 *  UWORD8 pointer to the buffer containing the first input array.
 *
 * @param[in] pu1_src2
 *  UWORD8 pointer to the buffer containing the second input array.
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination where the output of bilinear filter is stored.
 *
 * @param[in] src_strd1
 *  Stride of the first input buffer
 *
 * @param[in] src_strd2
 *  Stride of the second input buffer
 *
 * @param[in] dst_strd
 *  integer destination stride of pu1_dst
 *
 * @param[in] ht
 *  integer height of the array
 *
 * @param[in] wd
 *  integer width of the array
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
                                    UWORD8 *pu1_src2,
                                    UWORD8 *pu1_dst,
                                    WORD32 src_strd1,
                                    WORD32 src_strd2,
                                    WORD32 dst_strd,
                                    WORD32 ht,
                                    WORD32 wd)
{
    WORD32 rows_left, x;
    WORD16 i2_avg;

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        for(x = 0; x < wd; x++)
        {
            /* Rounded mean of the co-located samples of both inputs */
            i2_avg = (pu1_src1[x] + pu1_src2[x] + 1) >> 1;
            pu1_dst[x] = CLIP_U8(i2_avg);
        }

        /* Step each buffer to its next row using its own stride */
        pu1_dst += dst_strd;
        pu1_src2 += src_strd2;
        pu1_src1 += src_strd1;
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *    Interprediction chroma filter
 *
 * @par Description:
 *   Applies filtering to chroma samples as mentioned in
 *    sec 8.4.2.2.2 titled "chroma sample interpolation process".
 *   Each output sample is a weighted sum of four source samples: the sample
 *   at the same position, the one 2 bytes to its right, and the two samples
 *   at those positions in the next row. The weights are (8 - dx) * (8 - dy),
 *   dx * (8 - dy), (8 - dx) * dy and dx * dy. The sum is rounded with
 *   (x + 32) >> 6 and clipped to 8 bit.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source containing alternate U and V samples.
 *  Row ht and bytes up to 2 * wd + 1 of every row are read, whatever the
 *  values of dx and dy.
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] dx
 *  dx value where the sample is to be produced(refer sec 8.4.2.2.2 ).
 *  Not range checked here; presumably 0..7 - verify against caller.
 *
 * @param[in] dy
 *  dy value where the sample is to be produced(refer sec 8.4.2.2.2 ).
 *  Not range checked here; presumably 0..7 - verify against caller.
 *
 * @param[in] ht
 *  integer height of the array
 *
 * @param[in] wd
 *  integer width of the array; 2 * wd bytes are produced per row
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_inter_pred_chroma(UWORD8 *pu1_src,
                             UWORD8 *pu1_dst,
                             WORD32 src_strd,
                             WORD32 dst_strd,
                             WORD32 dx,
                             WORD32 dy,
                             WORD32 ht,
                             WORD32 wd)
{
    WORD32 y, x;
    /* Weights of equation (8-266) in section 8.4.2.2.2; constant per call */
    WORD32 i4_wt_cur = (8 - dx) * (8 - dy);
    WORD32 i4_wt_right = (dx) * (8 - dy);
    WORD32 i4_wt_below = (8 - dx) * (dy);
    WORD32 i4_wt_diag = (dx) * (dy);
    /* Two bytes produced per unit of wd */
    WORD32 i4_row_bytes = 2 * wd;
    WORD16 i2_val;

    for(y = 0; y < ht; y++)
    {
        UWORD8 *pu1_below = pu1_src + src_strd;

        for(x = 0; x < i4_row_bytes; x++)
        {
            /* Horizontal neighbour is 2 bytes away, not 1 */
            i2_val = i4_wt_cur * pu1_src[x]
                     + i4_wt_right * pu1_src[x + 2]
                     + i4_wt_below * pu1_below[x]
                     + i4_wt_diag * pu1_below[x + 2];
            i2_val = (i2_val + 32) >> 6;
            pu1_dst[x] = CLIP_U8(i2_val);
        }

        pu1_src = pu1_below;
        pu1_dst += dst_strd;
    }
}