/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_x86_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*
*
* @par List of Functions:
*   - ihevc_weighted_pred_uni_sse42()
*   - ihevc_weighted_pred_bi_sse42()
*   - ihevc_weighted_pred_bi_default_sse42()
*   - ihevc_weighted_pred_chroma_uni_sse42()
*   - ihevc_weighted_pred_chroma_bi_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_inter_pred.h"

#include <immintrin.h>

/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed by pi2_src and stores
* it at the location pointed by pu1_dst
*
* @par Description:
*  dst = CLIP_U8( ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
* + off0 )
*  The block is processed four rows at a time. Eight columns are handled per
* step when wd is a multiple of 8, otherwise four columns per step.
*
* @param[in] pi2_src
*  Pointer to the source (16-bit samples)
*
* @param[out] pu1_dst
*  Pointer to the destination (8-bit samples)
*
* @param[in] src_strd
*  Source stride, in 16-bit samples
*
* @param[in] dst_strd
*  Destination stride, in bytes
*
* @param[in] wgt0
*  weight to be multiplied to the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  right shift applied after weighting; noted by the original author as
*  (14 Bit depth) + log2_weight_denominator. Must be at least 1 because the
*  rounding term is computed as 1 << (shift - 1)
*
* @param[in] lvl_shift
*  added to the source before the multiplication by wgt0
*
* @param[in] ht
*  height of the source (asserted to be a multiple of 4)
*
* @param[in] wd
*  width of the source (asserted to be a multiple of 4)
*
* @returns
*  None
*
* @remarks
*  Only the samples that are actually processed are read from pi2_src, and
* the 4-byte destination stores make no alignment assumption on pu1_dst.
*
*******************************************************************************
*/

void ihevc_weighted_pred_uni_sse42(WORD16 *pi2_src,
                                   UWORD8 *pu1_dst,
                                   WORD32 src_strd,
                                   WORD32 dst_strd,
                                   WORD32 wgt0,
                                   WORD32 off0,
                                   WORD32 shift,
                                   WORD32 lvl_shift,
                                   WORD32 ht,
                                   WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 dst0, dst1, dst2, dst3;

    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */
    __m128i src_temp0_4x32b, src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;

    ASSERT(wd % 4 == 0); /* checking assumption*/
    ASSERT(ht % 4 == 0); /* checking assumption*/

    /* rounding term that is added before the right shift */
    temp = 1 << (shift - 1);

    /* broadcast the scalar parameters to all four 32-bit lanes */
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
    wgt0_4x32b = _mm_set1_epi32(wgt0);
    off0_4x32b = _mm_set1_epi32(off0);

    if(0 == (wd & 7)) /* wd multiple of 8 case */
    {
        __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;

        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 8)
            {   /* for row = 0, 1, 2, 3 */

                /* Load the 8 samples (7:0 relative to the current position)  */
                /* of each row with a single 128-bit load, so that nothing    */
                /* beyond the samples being processed is read from the source */
                /* row = 0 */
                src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                /* Bring samples 7:4 of each row down to the low 64 bits */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_srli_si128(src_temp0_4x32b, 8);
                src_temp5_4x32b = _mm_srli_si128(src_temp1_4x32b, 8);
                src_temp6_4x32b = _mm_srli_si128(src_temp2_4x32b, 8);
                src_temp7_4x32b = _mm_srli_si128(src_temp3_4x32b, 8);

                /* sign extend samples 3:0 from 16 to 32 bit */ /* First 4 pixels */
                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                /* (pi2_src[col] + lvl_shift)*/ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);

                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* First 4 pixels */
                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                /* sign extend samples 7:4 from 16 to 32 bit */ /* Last 4 pixels */
                src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);

                /* (pi2_src[col] + lvl_shift)*/ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);

                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* Last 4 pixels */
                src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
                src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
                src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);

                /* i4_tmp += 1 << (shift - 1) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
                src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);

                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);

                /* merge first and last 4 pixels of each row into 8 16-bit values */
                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b);
                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                /* store eight 8-bit output values per row */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 2*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3*/

                /* To update pointer */
                pi2_src += 8;
                pu1_dst += 8;

            } /* inner loop ends here(8-output values per row in single iteration) */

            /* rewind to the first column and step down by four rows */
            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
    else  /* wd multiple of 4 case */
    {
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 4)
            {   /* for row = 0, 1, 2, 3 */

                /* Load only the 4 samples (3:0 relative to the current       */
                /* position) of each row; a 64-bit load avoids reading past   */
                /* the samples being processed                                */
                /* row = 0 */
                src_temp0_4x32b = _mm_loadl_epi64((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_4x32b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_4x32b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_4x32b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd));

                /* sign extend samples 3:0 from 16 to 32 bit */
                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                /* (pi2_src[col] + lvl_shift)*/
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);

                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                /* i4_tmp += 1 << (shift - 1) */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                /*i4_tmp = (i4_tmp >> shift) + off0; */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                /* rows 0,1 and rows 2,3 packed to 16 bit */
                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                /* after this pack, 32-bit lane k holds the 4 output bytes of row k */
                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                dst1 = _mm_extract_epi32(src_temp0_4x32b, 1);
                dst2 = _mm_extract_epi32(src_temp0_4x32b, 2);
                dst3 = _mm_extract_epi32(src_temp0_4x32b, 3);

                /* store four 8-bit output values per row; memcpy is used     */
                /* instead of a WORD32 pointer cast so that the byte buffer   */
                /* is not accessed through an incompatible, possibly          */
                /* misaligned, pointer type                                   */
                memcpy(pu1_dst + 0 * dst_strd, &dst0, sizeof(dst0)); /* row = 0*/
                memcpy(pu1_dst + 1 * dst_strd, &dst1, sizeof(dst1)); /* row = 1*/
                memcpy(pu1_dst + 2 * dst_strd, &dst2, sizeof(dst2)); /* row = 2*/
                memcpy(pu1_dst + 3 * dst_strd, &dst3, sizeof(dst3)); /* row = 3*/

                /* To update pointer */
                pi2_src += 4;
                pu1_dst += 4;

            } /* inner loop ends here(4-output values per row in single iteration) */

            /* rewind to the first column and step down by four rows */
            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
}

/**
*******************************************************************************
*
* @brief
* Does chroma uni-weighted prediction on array pointed by pi2_src and stores
* it at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) )  >> shift +
* offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_uni_sse42(WORD16 *pi2_src,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd,
                                          WORD32 dst_strd,
                                          WORD32 wgt0_cb,
                                          WORD32 wgt0_cr,
                                          WORD32 off0_cb,
                                          WORD32 off0_cr,
                                          WORD32 shift,
                                          WORD32 lvl_shift,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp, wdx2;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */

    __m128i src_temp0_4x32b, src_temp1_4x32b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;

    ASSERT(wd % 2 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    temp = 1 << (shift - 1);
    wdx2 = 2 * wd;

    // seting values in register
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);

#if 0 /* Enable this for ht%4=0 case. But was degrading performance for lower sizes and improving for higher sizes!!! */
    if( 0 == (ht & 3)) /* ht multiple of 4 case */
    {
        if( 0 == (wdx2 & 15)) /* 2*wd multiple of 168 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
            __m128i src_temp8_4x32b, src_temp9_4x32b, src_temp10_4x32b, src_temp11_4x32b;
            __m128i src_temp12_4x32b, src_temp13_4x32b, src_temp14_4x32b, src_temp15_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row +=4)
            {
                for(col = 0; col < wdx2; col +=16)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
                    /* row = 0 */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
                    /* row = 1 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
                    /* row = 0 */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+8));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+8));
                    /* row = 0 */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+12));
                    /* row = 1 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+12));

                    /* considering pix. 4:0 by converting 16-into 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp4_4x32b  = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp6_4x32b  = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp3_4x32b);
                    src_temp4_4x32b = _mm_packs_epi32 (src_temp4_4x32b, src_temp6_4x32b);
                    src_temp5_4x32b = _mm_packs_epi32 (src_temp5_4x32b, src_temp7_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp5_4x32b);

                    /* store 16 8-bit output values  */
                    _mm_storeu_si128((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
                    _mm_storeu_si128((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/

                    /* row = 2 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp8_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
                    /* row = 3 */
                    src_temp9_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
                    /* row = 2 */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
                    /* row = 3 */
                    src_temp11_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));
                    /* row = 2 */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+8));
                    /* row = 3 */
                    src_temp13_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+8));
                    /* row = 2 */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+12));
                    /* row = 3 */
                    src_temp15_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+12));

                    /* considering pix. 4:0 by converting 16-into 32 bit */
                    src_temp8_4x32b  = _mm_cvtepi16_epi32(src_temp8_4x32b);
                    src_temp9_4x32b  = _mm_cvtepi16_epi32(src_temp9_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, lvl_shift_4x32b);
                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp8_4x32b  = _mm_mullo_epi32 (src_temp8_4x32b, wgt0_4x32b);
                    src_temp9_4x32b  = _mm_mullo_epi32 (src_temp9_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
                    src_temp10_4x32b  = _mm_cvtepi16_epi32(src_temp10_4x32b);
                    src_temp11_4x32b  = _mm_cvtepi16_epi32(src_temp11_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, lvl_shift_4x32b);
                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp10_4x32b  = _mm_mullo_epi32 (src_temp10_4x32b, wgt0_4x32b);
                    src_temp11_4x32b  = _mm_mullo_epi32 (src_temp11_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
                    src_temp12_4x32b  = _mm_cvtepi16_epi32(src_temp12_4x32b);
                    src_temp13_4x32b  = _mm_cvtepi16_epi32(src_temp13_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, lvl_shift_4x32b);
                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp12_4x32b  = _mm_mullo_epi32 (src_temp12_4x32b, wgt0_4x32b);
                    src_temp13_4x32b  = _mm_mullo_epi32 (src_temp13_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
                    src_temp14_4x32b  = _mm_cvtepi16_epi32(src_temp14_4x32b);
                    src_temp15_4x32b  = _mm_cvtepi16_epi32(src_temp15_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, lvl_shift_4x32b);
                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp14_4x32b  = _mm_mullo_epi32 (src_temp14_4x32b, wgt0_4x32b);
                    src_temp15_4x32b  = _mm_mullo_epi32 (src_temp15_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, const_temp_4x32b);
                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp8_4x32b = _mm_srai_epi32(src_temp8_4x32b,  shift);
                    src_temp9_4x32b = _mm_srai_epi32(src_temp9_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, const_temp_4x32b);
                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp10_4x32b = _mm_srai_epi32(src_temp10_4x32b,  shift);
                    src_temp11_4x32b = _mm_srai_epi32(src_temp11_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, const_temp_4x32b);
                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp12_4x32b = _mm_srai_epi32(src_temp12_4x32b,  shift);
                    src_temp13_4x32b = _mm_srai_epi32(src_temp13_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, const_temp_4x32b);
                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp14_4x32b = _mm_srai_epi32(src_temp14_4x32b,  shift);
                    src_temp15_4x32b = _mm_srai_epi32(src_temp15_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, off0_4x32b);
                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, off0_4x32b);
                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, off0_4x32b);
                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, off0_4x32b);
                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, off0_4x32b);

                    src_temp8_4x32b = _mm_packs_epi32 (src_temp8_4x32b, src_temp10_4x32b);
                    src_temp9_4x32b = _mm_packs_epi32 (src_temp9_4x32b, src_temp11_4x32b);
                    src_temp12_4x32b = _mm_packs_epi32 (src_temp12_4x32b, src_temp14_4x32b);
                    src_temp13_4x32b = _mm_packs_epi32 (src_temp13_4x32b, src_temp15_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp8_4x32b = _mm_packus_epi16 (src_temp8_4x32b, src_temp12_4x32b);
                    src_temp9_4x32b = _mm_packus_epi16 (src_temp9_4x32b, src_temp13_4x32b);

                    /* store 16 8-bit output values  */
                    _mm_storeu_si128((__m128i*)(pu1_dst+2*dst_strd), src_temp8_4x32b); /* row = 2*/
                    _mm_storeu_si128((__m128i*)(pu1_dst+3*dst_strd), src_temp9_4x32b); /* row = 3*/

                    pi2_src += 16;  /* Pointer update */
                    pu1_dst += 16; /* Pointer update */

                } /* inner loop ends here(4-output values in single iteration) */
                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
            }
        }
        else if( 0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            __m128i src_temp2_4x32b,src_temp3_4x32b;
            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row +=4)
            {
                for(col = 0; col < wdx2; col +=8)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
                    /* row = 2 */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
                    /* row = 3 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));

                    /* row = 0 */ /* Last 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
                    /* row = 2 */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
                    /* row = 3 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));

                    /* considering pix. 4:0 by converting 16-into 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp4_4x32b  = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp6_4x32b  = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp5_4x32b);
                    src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp6_4x32b);
                    src_temp3_4x32b = _mm_packs_epi32 (src_temp3_4x32b, src_temp7_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp0_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp1_4x32b);
                    src_temp2_4x32b = _mm_packus_epi16 (src_temp2_4x32b, src_temp2_4x32b);
                    src_temp3_4x32b = _mm_packus_epi16 (src_temp3_4x32b, src_temp3_4x32b);

                    /* store four 8-bit output values  */
                    _mm_storel_epi64((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
                    _mm_storel_epi64((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/
                    _mm_storel_epi64((__m128i*)(pu1_dst+2*dst_strd), src_temp2_4x32b); /* row = 0*/
                    _mm_storel_epi64((__m128i*)(pu1_dst+3*dst_strd), src_temp3_4x32b); /* row = 1*/

                    pi2_src += 8;   /* Pointer update */
                    pu1_dst += 8; /* Pointer update */

                } /* inner loop ends here(4-output values in single iteration) */
                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1, dst2, dst3;
            __m128i src_temp2_4x32b,src_temp3_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row +=4)
            {
                for(col = 0; col < wdx2; col +=4)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+1*src_strd));
                    /* row = 2 */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
                    /* row = 3 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));

                    /* considering pix. 4:0 by converting 16-into 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);

                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp1_4x32b);
                    src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp3_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp2_4x32b);

                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                    /* dst row = 1 to 3 */
                    src_temp1_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 1);
                    src_temp2_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 2);
                    src_temp3_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 3);

                    /* store four 8-bit output values  */
                    *(WORD32 *) (&pu1_dst[0*dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                    dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
                    dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
                    /* row = 1 */
                    *(WORD32 *) (&pu1_dst[1*dst_strd]) = dst1;
                    /* row = 2 */
                    *(WORD32 *) (&pu1_dst[2*dst_strd]) = dst2;
                    /* row = 3 */
                    *(WORD32 *) (&pu1_dst[3*dst_strd]) = dst3;

                    pi2_src += 4;   /* Pointer update */
                    pu1_dst += 4; /* Pointer update */

                } /* inner loop ends here(4-output values in single iteration) */
                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
            }
        }
    }
    else /* ht multiple of 2 case */
#endif

    {
        if(0 == (wdx2 & 15)) /* 2*wd multiple of 168 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 16)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));

                    /* row = 0 */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                    /* row = 1 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
                    /* row = 0 */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
                    /* row = 0 */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12));
                    /* row = 1 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12));

                    /* considering pix. 4:0 by converting 16-into 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
                    src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b);
                    src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b);

                    /* store 16 8-bit output values  */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/

                    pi2_src += 16;  /* Pointer update */
                    pu1_dst += 16; /* Pointer update */

                } /* inner loop ends here(4-output values in single iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));

                    /* row = 0 */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                    /* row = 1 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));

                    /* considering pix. 4:0 by converting 16-into 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                    /* store four 8-bit output values  */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/

                    pi2_src += 8;   /* Pointer update */
                    pu1_dst += 8; /* Pointer update */

                } /* inner loop ends here(4-output values in single iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));

                    /* considering pix. 4:0 by converting 16-into 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);

                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);

                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);

                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);

                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                    /* dst row = 1 to 3 */
                    src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);

                    /* store four 8-bit output values  */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                    pi2_src += 4;   /* Pointer update */
                    pu1_dst += 4; /* Pointer update */

                } /* inner loop ends here(4-output values in single iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
* pi2_src2 and stores the result at the location pointed to by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied to source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_sse42(WORD16 *pi2_src1,
                                  WORD16 *pi2_src2,
                                  UWORD8 *pu1_dst,
                                  WORD32 src_strd1,
                                  WORD32 src_strd2,
                                  WORD32 dst_strd,
                                  WORD32 wgt0,
                                  WORD32 off0,
                                  WORD32 wgt1,
                                  WORD32 off1,
                                  WORD32 shift,
                                  WORD32 lvl_shift1,
                                  WORD32 lvl_shift2,
                                  WORD32 ht,
                                  WORD32 wd)
{
    WORD32 row, col, temp;

    __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;


    ASSERT(wd % 4 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    temp = (off0 + off1 + 1) << (shift - 1);

    // seting values in register
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
    lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
    wgt0_4x32b = _mm_set1_epi32(wgt0);
    wgt1_4x32b = _mm_set1_epi32(wgt1);

    if(0 == (wd & 7)) /* wd multiple of 8 case */
    {
        __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 8)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
                /* Next 4 pixels */
                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
                src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */

                /* considering pix. 4:0 by converting 16-into 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
                src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
                src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
                src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
                src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
                src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
                src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                /* store four 8-bit output values  */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/

                pi2_src1 += 8;  /* Pointer update */
                pi2_src2 += 8;  /* Pointer update */
                pu1_dst  += 8;  /* Pointer update */

            } /* inner loop ends here(4-output values in single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }
    else /* wd multiple of 4 case */
    {
        WORD32 dst0, dst1;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /* considering pix. 4:0 by converting 16-into 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);

                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);

                /* dst row = 1 to 3 */
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);

                /* row = 1 to 3 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                pi2_src1 += 4;  /* Pointer update */
                pi2_src2 += 4;  /* Pointer update */
                pu1_dst  += 4;  /* Pointer update */

            } /* inner loop ends here(4-output values in single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }

}

/**
*******************************************************************************
*
* @brief
*  Bi-weighted prediction of four interleaved chroma samples (private helper
* of ihevc_weighted_pred_chroma_bi_sse42())
*
* @par Description:
*  Sign-extends the four 16-bit samples held in the low 64 bits of each
* source register to 32 bit and returns, per 32-bit lane,
*  (src1 * wgt0 + src2 * wgt1 + bias) >> shift   (arithmetic shift).
* The upper 64 bits of both source registers are ignored. Weights and bias
* are per lane, so the caller can supply alternating Cb/Cr values.
*
* @param[in] src1_8x16b
*  Register whose low four 16-bit lanes hold the source 1 samples
*
* @param[in] src2_8x16b
*  Register whose low four 16-bit lanes hold the source 2 samples
*
* @param[in] wgt0_4x32b
*  Per-lane weight for source 1
*
* @param[in] wgt1_4x32b
*  Per-lane weight for source 2
*
* @param[in] bias_4x32b
*  Per-lane constant added before the shift
*
* @param[in] shift
*  Arithmetic right shift applied to every lane
*
* @returns
*  Four 32-bit results, not yet clipped
*
*******************************************************************************
*/
static __m128i ihevc_wp_chroma_bi_4pix_sse42(__m128i src1_8x16b,
                                             __m128i src2_8x16b,
                                             __m128i wgt0_4x32b,
                                             __m128i wgt1_4x32b,
                                             __m128i bias_4x32b,
                                             WORD32 shift)
{
    __m128i prod1_4x32b, prod2_4x32b, sum_4x32b;

    /* src1 * wgt0 and src2 * wgt1 on sign-extended samples */
    prod1_4x32b = _mm_mullo_epi32(_mm_cvtepi16_epi32(src1_8x16b), wgt0_4x32b);
    prod2_4x32b = _mm_mullo_epi32(_mm_cvtepi16_epi32(src2_8x16b), wgt1_4x32b);

    /* add both products and the pre-computed constant, then scale down */
    sum_4x32b = _mm_add_epi32(prod1_4x32b, prod2_4x32b);
    sum_4x32b = _mm_add_epi32(sum_4x32b, bias_4x32b);

    return _mm_srai_epi32(sum_4x32b, shift);
}

/**
*******************************************************************************
*
* @brief
* Does chroma bi-weighted prediction on the arrays pointed by  pi2_src1 and
* pi2_src2 and stores it at location pointed  by pu1_dst
*
* @par Description:
*  dst = CLIP_U8( ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +
* ((off0 + off1 + 1) << (shift - 1)) ) >> shift )
*
*  Samples at even positions of a row use the _cb weights and offsets,
* samples at odd positions use the _cr ones. Each row holds 2 * wd samples.
* Two rows are produced per outer iteration. Exactly 2 * wd samples are read
* from each source row and exactly 2 * wd bytes are written to each
* destination row.
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1 (in 16-bit samples)
*
* @param[in] src_strd2
*  Source stride 2 (in 16-bit samples)
*
* @param[in] dst_strd
*  Destination stride (in bytes)
*
* @param[in] wgt0_cb
*  weight to be multiplied to source 1, even (Cb) positions
*
* @param[in] wgt0_cr
*  weight to be multiplied to source 1, odd (Cr) positions
*
* @param[in] off0_cb
*  offset 0, even (Cb) positions
*
* @param[in] off0_cr
*  offset 0, odd (Cr) positions
*
* @param[in] wgt1_cb
*  weight to be multiplied to source 2, even (Cb) positions
*
* @param[in] wgt1_cr
*  weight to be multiplied to source 2, odd (Cr) positions
*
* @param[in] off1_cb
*  offset 1, even (Cb) positions
*
* @param[in] off1_cr
*  offset 1, odd (Cr) positions
*
* @param[in] shift
*  Right shift applied to the weighted sum (originally documented as
* "(14 Bit depth) + log2_weight_denominator"). Must be at least 1 because
* the rounding term uses (shift - 1)
*
* @param[in] lvl_shift1
*  added to every source 1 sample before weighting
*
* @param[in] lvl_shift2
*  added to every source 2 sample before weighting
*
* @param[in] ht
*  height of the source (asserted to be a multiple of 2)
*
* @param[in] wd
*  width of the source (each colour component, asserted to be a multiple
* of 2)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_sse42(WORD16 *pi2_src1,
                                         WORD16 *pi2_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 wgt0_cb,
                                         WORD32 wgt0_cr,
                                         WORD32 off0_cb,
                                         WORD32 off0_cr,
                                         WORD32 wgt1_cb,
                                         WORD32 wgt1_cr,
                                         WORD32 off1_cb,
                                         WORD32 off1_cr,
                                         WORD32 shift,
                                         WORD32 lvl_shift1,
                                         WORD32 lvl_shift2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    WORD32 row, col, rnd_cb, rnd_cr;
    WORD32 wdx2;
    __m128i wgt0_4x32b, wgt1_4x32b, bias_4x32b;

    ASSERT(wd % 2 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    /* Rounding/offset terms. Written as multiplications because left      */
    /* shifting a negative offset sum is undefined behaviour in C.         */
    rnd_cb = (off0_cb + off1_cb + 1) * (1 << (shift - 1));
    rnd_cr = (off0_cr + off1_cr + 1) * (1 << (shift - 1));

    /* _mm_set_epi32 lists lanes from 3 down to 0, so lanes 0 and 2 carry  */
    /* the Cb values and lanes 1 and 3 the Cr values                       */
    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    wgt1_4x32b = _mm_set_epi32(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);

    /* (s1 + l1) * w0 + (s2 + l2) * w1 + rnd                               */
    /*      == s1 * w0 + s2 * w1 + (l1 * w0 + l2 * w1 + rnd)               */
    /* The identity is exact in wrapping 32-bit arithmetic, so the level   */
    /* shifts are folded into one loop-invariant per-lane constant.        */
    bias_4x32b = _mm_add_epi32(
                    _mm_mullo_epi32(_mm_set1_epi32(lvl_shift1), wgt0_4x32b),
                    _mm_mullo_epi32(_mm_set1_epi32(lvl_shift2), wgt1_4x32b));
    bias_4x32b = _mm_add_epi32(bias_4x32b,
                               _mm_set_epi32(rnd_cr, rnd_cb, rnd_cr, rnd_cb));

    /* number of samples per row (Cb and Cr together) */
    wdx2 = wd * 2;

    if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
    {
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wdx2; col += 8)
            {
                __m128i r0_src1_8x16b, r0_src2_8x16b, r1_src1_8x16b, r1_src2_8x16b;
                __m128i r0_lo_4x32b, r0_hi_4x32b, r1_lo_4x32b, r1_hi_4x32b;
                __m128i r0_out, r1_out;

                /* one 16-byte load per row and source: exactly the 8      */
                /* samples consumed, nothing beyond them                   */
                r0_src1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + col));
                r0_src2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + col));
                r1_src1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + col));
                r1_src2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + col));

                /* samples 3:0 of both rows */
                r0_lo_4x32b = ihevc_wp_chroma_bi_4pix_sse42(r0_src1_8x16b, r0_src2_8x16b,
                                                            wgt0_4x32b, wgt1_4x32b,
                                                            bias_4x32b, shift);
                r1_lo_4x32b = ihevc_wp_chroma_bi_4pix_sse42(r1_src1_8x16b, r1_src2_8x16b,
                                                            wgt0_4x32b, wgt1_4x32b,
                                                            bias_4x32b, shift);

                /* samples 7:4 of both rows: move the upper 64 bits down.  */
                /* Sample 4 is again at an even position, so the same      */
                /* Cb/Cr lane pattern applies.                             */
                r0_hi_4x32b = ihevc_wp_chroma_bi_4pix_sse42(_mm_srli_si128(r0_src1_8x16b, 8),
                                                            _mm_srli_si128(r0_src2_8x16b, 8),
                                                            wgt0_4x32b, wgt1_4x32b,
                                                            bias_4x32b, shift);
                r1_hi_4x32b = ihevc_wp_chroma_bi_4pix_sse42(_mm_srli_si128(r1_src1_8x16b, 8),
                                                            _mm_srli_si128(r1_src2_8x16b, 8),
                                                            wgt0_4x32b, wgt1_4x32b,
                                                            bias_4x32b, shift);

                /* 32 -> 16 bit with signed saturation, samples in order 7:0 */
                r0_out = _mm_packs_epi32(r0_lo_4x32b, r0_hi_4x32b);
                r1_out = _mm_packs_epi32(r1_lo_4x32b, r1_hi_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                r0_out = _mm_packus_epi16(r0_out, r0_out);
                r1_out = _mm_packus_epi16(r1_out, r1_out);

                /* store eight 8-bit output values per row */
                _mm_storel_epi64((__m128i *)(pu1_dst + col), r0_out);            /* row = 0 */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + col), r1_out); /* row = 1 */

            } /* inner loop ends here (8 output values per row in single iteration) */

            pi2_src1 += 2 * src_strd1;  /* Pointer update */
            pi2_src2 += 2 * src_strd2;  /* Pointer update */
            pu1_dst  += 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }
    else /* wdx2 multiple of 4 case */
    {
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wdx2; col += 4)
            {
                __m128i r0_src1_8x16b, r0_src2_8x16b, r1_src1_8x16b, r1_src2_8x16b;
                __m128i r0_4x32b, r1_4x32b, out;

                /* 8-byte loads: only the 4 samples that are consumed, so  */
                /* the last column never reads past the end of the row     */
                r0_src1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + col));
                r0_src2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + col));
                r1_src1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1 + col));
                r1_src2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2 + col));

                r0_4x32b = ihevc_wp_chroma_bi_4pix_sse42(r0_src1_8x16b, r0_src2_8x16b,
                                                         wgt0_4x32b, wgt1_4x32b,
                                                         bias_4x32b, shift);
                r1_4x32b = ihevc_wp_chroma_bi_4pix_sse42(r1_src1_8x16b, r1_src2_8x16b,
                                                         wgt0_4x32b, wgt1_4x32b,
                                                         bias_4x32b, shift);

                /* row 0 in 16-bit lanes 3:0, row 1 in lanes 7:4 */
                out = _mm_packs_epi32(r0_4x32b, r1_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift);                */
                /* afterwards bytes 3:0 hold row 0 and bytes 7:4 row 1     */
                out = _mm_packus_epi16(out, out);

                /* store four 8-bit output values per row                  */
                /* NOTE(review): type-punned, possibly unaligned 32-bit    */
                /* stores kept from the original implementation            */
                *(WORD32 *)(&pu1_dst[col]) = _mm_cvtsi128_si32(out);               /* row = 0 */
                *(WORD32 *)(&pu1_dst[dst_strd + col]) = _mm_extract_epi32(out, 1); /* row = 1 */

            } /* inner loop ends here (4 output values per row in single iteration) */

            pi2_src1 += 2 * src_strd1;  /* Pointer update */
            pi2_src2 += 2 * src_strd2;  /* Pointer update */
            pu1_dst  += 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }

}

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores it at location  pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  (1 << (shift - 1)) )
* >> shift  where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
* Assumption : ht%2 == 0, wd%4 == 0
* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
* final result will match even if intermediate precision is in 16 bit.
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_default_sse42(WORD16 *pi2_src1,
                                          WORD16 *pi2_src2,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd1,
                                          WORD32 src_strd2,
                                          WORD32 dst_strd,
                                          WORD32 lvl_shift1,
                                          WORD32 lvl_shift2,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 shift;

    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
    __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

    ASSERT(wd % 4 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    temp = 1 << (shift - 1);

    // seting values in register
    lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
    lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
    const_temp_8x16b = _mm_set1_epi16(temp);

    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);

    if(0 == (ht & 3)) /* ht multiple of 4*/
    {
        if(0 == (wd & 15)) /* wd multiple of 16 case */
        {
            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 16)
                {
                    /*load 8 pixel values */ /* First 8 Values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /*load 8 pixel values */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
                    /* row = 1 */
                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
                    /* row = 2 */
                    src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
                    src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /*load 8 pixel values */ /* Second 8 Values */
                    /* row = 3 */
                    src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
                    src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);

                    /* (i4_tmp >> shift) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
                    src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
                    src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8 Values */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);

                    /* store four 8-bit output values  */ /* 16 8 Values */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
                    _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
                    _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/

                    /* To update pointer */
                    pi2_src1 += 16;
                    pi2_src2 += 16;
                    pu1_dst  += 16;

                } /* inner loop ends here(8-output values in single iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
                pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */

            }
        }
        else if(0 == (wd & 7)) /* multiple of 8 case */
        {
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 8)
                {
                    /*load 8 pixel values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);

                    /* store four 8-bit output values  */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/

                    /* To update pointer */
                    pi2_src1 += 8;
                    pi2_src2 += 8;
                    pu1_dst  += 8;

                } /* inner loop ends here(8-output values in single iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
                pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */

            }
        }
        else /* wd multiple of 4 case*/
        {
            WORD32 dst0, dst1, dst2, dst3;

            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 4)
                {
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                    /* dst row = 1 to 3 */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);

                    /* store four 8-bit output values  */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);

                    /* row = 1 to row = 3 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst  += 4;

                } /* inner loop ends here(4-output values in single iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
                pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* Pointer update */

            }
        }
    }
    else /* ht multiple of 2 case and wd multiple of 4 case*/
    {

        WORD32 dst0, dst1;

        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                /* row = 1 */
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                /* Pack two rows together */
                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                /* (pi2_src1[col] + pi2_src2[col]) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);

                /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);

                /* (i4_tmp >> shift) */
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                /* dst row = 1 to 3 */
                src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);

                /* row = 1 to row = 3 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                /* To update pointer */
                pi2_src1 += 4;
                pi2_src2 += 4;
                pu1_dst  += 4;

            } /* inner loop ends here(4-output values in single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* Pointer update */

        }

    }

}