/*! * \copy * Copyright (c) 2009-2018, Cisco Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * * \file satd_sad_lasx.c * * \brief Loongson optimization * * \date 12/10/2021 Created * ************************************************************************************* */ #include #include "loongson_intrinsics.h" #define HORISUM(in0, in1, out0) \ out0 = __lasx_xvabsd_bu(in0, in1); \ out0 = __lasx_xvhaddw_hu_bu(out0, out0); \ out0 = __lasx_xvhaddw_wu_hu(out0, out0); \ out0 = __lasx_xvhaddw_du_wu(out0, out0); \ int32_t WelsSampleSad4x4_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { uint8_t *pSrc1 = pSample1; uint8_t *pSrc2 = pSample2; int32_t iStride0 = 0; int32_t iStride1_tmp = iStride1 << 1; int32_t iStride2_tmp = iStride2 << 1; __m256i src1_0, src1_1, src1_2, src1_3; __m256i src2_0, src2_1, src2_2, src2_3; DUP4_ARG2(__lasx_xvldx, pSrc1, iStride0, pSrc1, iStride1, pSrc1, iStride1_tmp, pSrc1, iStride1_tmp + iStride1, src1_0, src1_1, src1_2, src1_3); DUP4_ARG2(__lasx_xvldx, pSrc2, iStride0, pSrc2, iStride2, pSrc2, iStride2_tmp, pSrc2, iStride2_tmp + iStride2, src2_0, src2_1, src2_2, src2_3); DUP2_ARG2(__lasx_xvpackev_w, src1_0, src1_1, src1_2, src1_3, src1_0, src1_2); DUP2_ARG2(__lasx_xvpackev_w, src2_0, src2_1, src2_2, src2_3, src2_0, src2_2); DUP2_ARG2(__lasx_xvpackev_d, src1_0, src1_2, src2_0, src2_2, src1_0, src2_0); HORISUM(src1_0, src2_0, src1_0); src1_0 = __lasx_xvhaddw_qu_du(src1_0, src1_0); return __lasx_xvpickve2gr_d(src1_0, 0); } static inline int32_t WelsSampleSad8x8x2_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { uint8_t* pSrc1 = pSample1; uint8_t* pSrc2 = pSample2; int32_t iStride0 = 0; int32_t iStride1_tmp2 = iStride1 << 1; int32_t iStride1_tmp3 = iStride1_tmp2 + iStride1; int32_t iStride1_tmp4 = iStride1 << 2; int32_t iStride1_tmp5 = iStride1_tmp4 + iStride1; int32_t iStride1_tmp6 = iStride1_tmp5 + iStride1; int32_t iStride1_tmp7 = iStride1_tmp6 + iStride1; int32_t iStride2_tmp2 = iStride2 << 1; int32_t iStride2_tmp3 = iStride2_tmp2 + iStride2; int32_t iStride2_tmp4 = iStride2 << 2; int32_t iStride2_tmp5 = iStride2_tmp4 + iStride2; int32_t iStride2_tmp6 = iStride2_tmp5 + iStride2; int32_t iStride2_tmp7 = iStride2_tmp6 + iStride2; __m256i src1_0, src1_1, src1_2, src1_3, src1_4, src1_5, src1_6, src1_7; __m256i src2_0, src2_1, src2_2, src2_3, src2_4, src2_5, src2_6, src2_7; DUP4_ARG2(__lasx_xvldx, pSrc1, iStride0, pSrc1, iStride1, pSrc1, iStride1_tmp2, pSrc1, iStride1_tmp3, src1_0, src1_1, src1_2, src1_3); DUP4_ARG2(__lasx_xvldx, pSrc1, iStride1_tmp4, pSrc1, iStride1_tmp5, pSrc1, iStride1_tmp6, pSrc1, iStride1_tmp7, src1_4, src1_5, src1_6, src1_7); DUP4_ARG2(__lasx_xvldx, pSrc2, iStride0, pSrc2, iStride2, pSrc2, iStride2_tmp2, pSrc2, iStride2_tmp3, src2_0, src2_1, src2_2, src2_3); DUP4_ARG2(__lasx_xvldx, pSrc2, iStride2_tmp4, pSrc2, iStride2_tmp5, pSrc2, iStride2_tmp6, pSrc2, iStride2_tmp7, src2_4, src2_5, src2_6, src2_7); DUP4_ARG3(__lasx_xvpermi_q, src1_0, src1_1, 0x20, src1_2, src1_3, 0x20, src1_4, src1_5, 0x20, src1_6, src1_7, 0x20, src1_0, src1_2, src1_4, src1_6); DUP4_ARG3(__lasx_xvpermi_q, src2_0, src2_1, 0x20, src2_2, src2_3, 0x20, src2_4, src2_5, 0x20, src2_6, src2_7, 0x20, src2_0, src2_2, src2_4, src2_6); HORISUM(src1_0, src2_0, src1_0); HORISUM(src1_2, src2_2, src1_2); HORISUM(src1_4, src2_4, src1_4); HORISUM(src1_6, src2_6, src1_6); src1_0 = __lasx_xvadd_d(src1_0, src1_2); src1_0 = __lasx_xvadd_d(src1_0, src1_4); src1_0 = __lasx_xvadd_d(src1_0, src1_6); src1_0 = __lasx_xvhaddw_qu_du(src1_0, src1_0); return (__lasx_xvpickve2gr_d(src1_0, 0) + __lasx_xvpickve2gr_d(src1_0, 2)); } int32_t WelsSampleSad8x8_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { uint8_t* pSrc1 = pSample1; uint8_t* pSrc2 = pSample2; int32_t iStride0 = 0; int32_t iStride1_tmp2 = iStride1 << 1; int32_t iStride1_tmp3 = iStride1_tmp2 + iStride1; int32_t iStride1_tmp4 = iStride1 << 2; int32_t iStride1_tmp5 = iStride1_tmp4 + iStride1; int32_t iStride1_tmp6 = iStride1_tmp5 + iStride1; int32_t iStride1_tmp7 = iStride1_tmp6 + iStride1; int32_t iStride2_tmp2 = iStride2 << 1; int32_t iStride2_tmp3 = iStride2_tmp2 + iStride2; int32_t iStride2_tmp4 = iStride2 << 2; int32_t iStride2_tmp5 = iStride2_tmp4 + iStride2; int32_t iStride2_tmp6 = iStride2_tmp5 + iStride2; int32_t iStride2_tmp7 = iStride2_tmp6 + iStride2; __m256i src1_0, src1_1, src1_2, src1_3, src1_4, src1_5, src1_6, src1_7; __m256i src2_0, src2_1, src2_2, src2_3, src2_4, src2_5, src2_6, src2_7; DUP4_ARG2(__lasx_xvldx, pSrc1, iStride0, pSrc1, iStride1, pSrc1, iStride1_tmp2, pSrc1, iStride1_tmp3, src1_0, src1_1, src1_2, src1_3); DUP4_ARG2(__lasx_xvldx, pSrc1, iStride1_tmp4, pSrc1, iStride1_tmp5, pSrc1, iStride1_tmp6, pSrc1, iStride1_tmp7, src1_4, src1_5, src1_6, src1_7); DUP4_ARG2(__lasx_xvldx, pSrc2, iStride0, pSrc2, iStride2, pSrc2, iStride2_tmp2, pSrc2, iStride2_tmp3, src2_0, src2_1, src2_2, src2_3); DUP4_ARG2(__lasx_xvldx, pSrc2, iStride2_tmp4, pSrc2, iStride2_tmp5, pSrc2, iStride2_tmp6, pSrc2, iStride2_tmp7, src2_4, src2_5, src2_6, src2_7); DUP4_ARG2(__lasx_xvpackev_d, src1_0, src1_1, src1_2, src1_3, src1_4, src1_5, src1_6, src1_7, src1_0, src1_2, src1_4, src1_6); DUP2_ARG3(__lasx_xvpermi_q, src1_0, src1_2, 0x20, src1_4, src1_6, 0x20, src1_0, src1_4); DUP4_ARG2(__lasx_xvpackev_d, src2_0, src2_1, src2_2, src2_3, src2_4, src2_5, src2_6, src2_7, src2_0, src2_2, src2_4, src2_6); DUP2_ARG3(__lasx_xvpermi_q, src2_0, src2_2, 0x20, src2_4, src2_6, 0x20, src2_0, src2_4); HORISUM(src1_0, src2_0, src1_0); HORISUM(src1_4, src2_4, src1_4); src1_0 = __lasx_xvadd_d(src1_0, src1_4); src1_0 = __lasx_xvhaddw_qu_du(src1_0, src1_0); return (__lasx_xvpickve2gr_d(src1_0, 0) + __lasx_xvpickve2gr_d(src1_0, 2)); } int32_t WelsSampleSatd4x4_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { int32_t iSatdSum; uint8_t* pSrc1 = pSample1; uint8_t* pSrc2 = pSample2; int32_t iStride0 = 0; int32_t iStride1_tmp = iStride1 << 1; int32_t iStride2_tmp = iStride2 << 1; __m256i src1_0, src1_1, src1_2, src1_3; __m256i src2_0, src2_1, src2_2, src2_3; __m256i iSample01, iSample23; __m256i tmp0, tmp1, tmp2, tmp3; __m256i zero = __lasx_xvldi(0); v16i16 mask= {1, 0, 3, 2, 5, 4, 7, 6, 1, 0, 3, 2, 5, 4, 7, 6}; DUP4_ARG2(__lasx_xvldx, pSrc1, iStride0, pSrc1, iStride1, pSrc1, iStride1_tmp, pSrc1, iStride1_tmp + iStride1, src1_0, src1_1, src1_2, src1_3); DUP4_ARG2(__lasx_xvldx, pSrc2, iStride0, pSrc2, iStride2, pSrc2, iStride2_tmp, pSrc2, iStride2_tmp + iStride2, src2_0, src2_1, src2_2, src2_3); DUP4_ARG2(__lasx_xvpackev_w, src1_0, src1_1, src1_2, src1_3, src2_0, src2_1, src2_2, src2_3, src1_0, src1_2, src2_0, src2_2); DUP2_ARG2(__lasx_xvpackev_d, src1_0, src1_2, src2_0, src2_2, src1_0, src2_0); tmp0 = __lasx_xvsubwev_h_bu(src1_0, src2_0); tmp1 = __lasx_xvsubwod_h_bu(src1_0, src2_0); tmp2 = __lasx_xvilvl_w(tmp0, tmp1); tmp3 = __lasx_xvilvh_w(tmp0, tmp1); tmp0 = __lasx_xvpermi_q(tmp3, tmp2, 0x20); tmp0 = __lasx_xvshuf_h((__m256i)mask, tmp0, tmp0); iSample01 = __lasx_xvhaddw_w_h(tmp0, tmp0); iSample23 = __lasx_xvhsubw_w_h(tmp0, tmp0); tmp0 = __lasx_xvhaddw_d_w(iSample01, iSample01); tmp1 = __lasx_xvhaddw_d_w(iSample23, iSample23); tmp2 = __lasx_xvhsubw_d_w(iSample23, iSample23); tmp3 = __lasx_xvhsubw_d_w(iSample01, iSample01); tmp1 = __lasx_xvpackev_w(tmp1, tmp0); tmp3 = __lasx_xvpackev_w(tmp3, tmp2); tmp0 = __lasx_xvpermi_q(tmp3, tmp1, 0x20); tmp2 = __lasx_xvpermi_q(tmp3, tmp1, 0x31); tmp0 = __lasx_xvpermi_w(tmp0, tmp0, 0x72); tmp2 = __lasx_xvpermi_w(tmp2, tmp2, 0x72); iSample01 = __lasx_xvadd_w(tmp0, tmp2); iSample23 = __lasx_xvsub_w(tmp0, tmp2); tmp0 = __lasx_xvhaddw_d_w(iSample01, iSample01); tmp1 = __lasx_xvhaddw_d_w(iSample23, iSample23); tmp2 = __lasx_xvhsubw_d_w(iSample23, iSample23); tmp3 = __lasx_xvhsubw_d_w(iSample01, iSample01); tmp0 = __lasx_xvpackev_w(tmp0, tmp1); tmp2 = __lasx_xvpackev_w(tmp2, tmp3); tmp0 = __lasx_xvabsd_w(tmp0, zero); tmp2 = __lasx_xvabsd_w(tmp2, zero); tmp0 = __lasx_xvadd_w(tmp0, tmp2); tmp0 = __lasx_xvhaddw_d_w(tmp0, tmp0); tmp0 = __lasx_xvhaddw_q_d(tmp0, tmp0); iSatdSum = __lasx_xvpickve2gr_d(tmp0, 0) + __lasx_xvpickve2gr_d(tmp0, 2); return ((iSatdSum + 1) >> 1); } int32_t WelsSampleSad16x8_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { return WelsSampleSad8x8x2_lasx (pSample1, iStride1, pSample2, iStride2); } int32_t WelsSampleSad8x16_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { int32_t iSadSum = 0; iSadSum += WelsSampleSad8x8_lasx (pSample1, iStride1, pSample2, iStride2); iSadSum += WelsSampleSad8x8_lasx (pSample1 + (iStride1 << 3), iStride1, pSample2 + (iStride2 << 3), iStride2); return iSadSum; } int32_t WelsSampleSad16x16_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { int32_t iSadSum = 0; iSadSum += WelsSampleSad8x8x2_lasx (pSample1, iStride1, pSample2, iStride2); iSadSum += WelsSampleSad8x8x2_lasx (pSample1 + (iStride1 << 3), iStride1, pSample2 + (iStride2 << 3), iStride2); return iSadSum; } void WelsSampleSadFour4x4_lasx (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) { uint8_t *pSrc1 = iSample1; uint8_t *pSrc2 = iSample2 - iStride2; uint8_t *pSrc3 = iSample2 + iStride2; uint8_t *pSrc4 = iSample2 - 1; uint8_t *pSrc5 = iSample2 + 1; int32_t iStride0 = 0; int32_t iStride1_tmp = iStride1 << 1; int32_t iStride2_tmp = iStride2 << 1; __m256i src1_0, src1_1, src1_2, src1_3; __m256i src2_0, src2_1, src2_2, src2_3; __m256i cb0, cb1, cb2, cb3, cb4, cb5, cb6, cb7; DUP4_ARG2(__lasx_xvldx, pSrc1, iStride0, pSrc1, iStride1, pSrc1, iStride1_tmp, pSrc1, iStride1_tmp + iStride1, src1_0, src1_1, src1_2, src1_3); DUP4_ARG2(__lasx_xvldx, pSrc2, iStride0, pSrc2, iStride2, pSrc2, iStride2_tmp, pSrc2, iStride2_tmp + iStride2, src2_0, src2_1, src2_2, src2_3); DUP4_ARG2(__lasx_xvpackev_w, src1_0, src1_1, src1_2, src1_3, src2_0, src2_1, src2_2, src2_3, src1_0, src1_2, src2_0, src2_2); DUP2_ARG2(__lasx_xvpackev_d, src1_0, src1_2, src2_0, src2_2, cb0, cb1); //16 16 DUP4_ARG2(__lasx_xvldx, pSrc1, iStride0, pSrc1, iStride1, pSrc1, iStride1_tmp, pSrc1, iStride1_tmp + iStride1, src1_0, src1_1, src1_2, src1_3); DUP4_ARG2(__lasx_xvldx, pSrc3, iStride0, pSrc3, iStride2, pSrc3, iStride2_tmp, pSrc3, iStride2_tmp + iStride2, src2_0, src2_1, src2_2, src2_3); DUP4_ARG2(__lasx_xvpackev_w, src1_0, src1_1, src1_2, src1_3, src2_0, src2_1, src2_2, src2_3, src1_0, src1_2, src2_0, src2_2); DUP2_ARG2(__lasx_xvpackev_d, src1_0, src1_2, src2_0, src2_2, cb2, cb3); //16 16 DUP4_ARG2(__lasx_xvldx, pSrc1, iStride0, pSrc1, iStride1, pSrc1, iStride1_tmp, pSrc1, iStride1_tmp + iStride1, src1_0, src1_1, src1_2, src1_3); DUP4_ARG2(__lasx_xvldx, pSrc4, iStride0, pSrc4, iStride2, pSrc4, iStride2_tmp, pSrc4, iStride2_tmp + iStride2, src2_0, src2_1, src2_2, src2_3); DUP4_ARG2(__lasx_xvpackev_w, src1_0, src1_1, src1_2, src1_3, src2_0, src2_1, src2_2, src2_3, src1_0, src1_2, src2_0, src2_2); DUP2_ARG2(__lasx_xvpackev_d, src1_0, src1_2, src2_0, src2_2, cb4, cb5); //16 16 DUP4_ARG2(__lasx_xvldx, pSrc1, iStride0, pSrc1, iStride1, pSrc1, iStride1_tmp, pSrc1, iStride1_tmp + iStride1, src1_0, src1_1, src1_2, src1_3); DUP4_ARG2(__lasx_xvldx, pSrc5, iStride0, pSrc5, iStride2, pSrc5, iStride2_tmp, pSrc5, iStride2_tmp + iStride2, src2_0, src2_1, src2_2, src2_3); DUP4_ARG2(__lasx_xvpackev_w, src1_0, src1_1, src1_2, src1_3, src2_0, src2_1, src2_2, src2_3, src1_0, src1_2, src2_0, src2_2); DUP2_ARG2(__lasx_xvpackev_d, src1_0, src1_2, src2_0, src2_2, cb6, cb7); //16 16 cb0 = __lasx_xvpermi_q(cb2, cb0, 0x20); cb1 = __lasx_xvpermi_q(cb3, cb1, 0x20); cb4 = __lasx_xvpermi_q(cb6, cb4, 0x20); cb5 = __lasx_xvpermi_q(cb7, cb5, 0x20); HORISUM(cb0, cb1, cb0); HORISUM(cb4, cb5, cb4); DUP2_ARG2(__lasx_xvhaddw_qu_du, cb0, cb0, cb4, cb4, cb0, cb4); * (pSad) = __lasx_xvpickve2gr_d(cb0, 0); * (pSad + 1) = __lasx_xvpickve2gr_d(cb0, 2); * (pSad + 2) = __lasx_xvpickve2gr_d(cb4, 0); * (pSad + 3) = __lasx_xvpickve2gr_d(cb4, 2); } void WelsSampleSadFour8x8_lasx (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) { * (pSad) = WelsSampleSad8x8_lasx (iSample1, iStride1, (iSample2 - iStride2), iStride2); * (pSad + 1) = WelsSampleSad8x8_lasx (iSample1, iStride1, (iSample2 + iStride2), iStride2); * (pSad + 2) = WelsSampleSad8x8_lasx (iSample1, iStride1, (iSample2 - 1), iStride2); * (pSad + 3) = WelsSampleSad8x8_lasx (iSample1, iStride1, (iSample2 + 1), iStride2); } void WelsSampleSadFour8x16_lasx (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) { * (pSad) = WelsSampleSad8x16_lasx (iSample1, iStride1, (iSample2 - iStride2), iStride2); * (pSad + 1) = WelsSampleSad8x16_lasx (iSample1, iStride1, (iSample2 + iStride2), iStride2); * (pSad + 2) = WelsSampleSad8x16_lasx (iSample1, iStride1, (iSample2 - 1), iStride2); * (pSad + 3) = WelsSampleSad8x16_lasx (iSample1, iStride1, (iSample2 + 1), iStride2); } void WelsSampleSadFour16x8_lasx (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) { * (pSad) = WelsSampleSad16x8_lasx (iSample1, iStride1, (iSample2 - iStride2), iStride2); * (pSad + 1) = WelsSampleSad16x8_lasx (iSample1, iStride1, (iSample2 + iStride2), iStride2); * (pSad + 2) = WelsSampleSad16x8_lasx (iSample1, iStride1, (iSample2 - 1), iStride2); * (pSad + 3) = WelsSampleSad16x8_lasx (iSample1, iStride1, (iSample2 + 1), iStride2); } void WelsSampleSadFour16x16_lasx (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) { * (pSad) = WelsSampleSad16x16_lasx (iSample1, iStride1, (iSample2 - iStride2), iStride2); * (pSad + 1) = WelsSampleSad16x16_lasx (iSample1, iStride1, (iSample2 + iStride2), iStride2); * (pSad + 2) = WelsSampleSad16x16_lasx (iSample1, iStride1, (iSample2 - 1), iStride2); * (pSad + 3) = WelsSampleSad16x16_lasx (iSample1, iStride1, (iSample2 + 1), iStride2); } int32_t WelsSampleSatd8x8_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { int32_t iSatdSum = 0; iSatdSum += WelsSampleSatd4x4_lasx (pSample1, iStride1, pSample2, iStride2); iSatdSum += WelsSampleSatd4x4_lasx (pSample1 + 4, iStride1, pSample2 + 4, iStride2); iSatdSum += WelsSampleSatd4x4_lasx (pSample1 + (iStride1 << 2), iStride1, pSample2 + (iStride2 << 2), iStride2); iSatdSum += WelsSampleSatd4x4_lasx (pSample1 + (iStride1 << 2) + 4, iStride1, pSample2 + (iStride2 << 2) + 4, iStride2); return iSatdSum; } int32_t WelsSampleSatd16x8_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { int32_t iSatdSum = 0; iSatdSum += WelsSampleSatd8x8_lasx (pSample1, iStride1, pSample2, iStride2); iSatdSum += WelsSampleSatd8x8_lasx (pSample1 + 8, iStride1, pSample2 + 8, iStride2); return iSatdSum; } int32_t WelsSampleSatd8x16_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { int32_t iSatdSum = 0; iSatdSum += WelsSampleSatd8x8_lasx (pSample1, iStride1, pSample2, iStride2); iSatdSum += WelsSampleSatd8x8_lasx (pSample1 + (iStride1 << 3), iStride1, pSample2 + (iStride2 << 3), iStride2); return iSatdSum; } int32_t WelsSampleSatd16x16_lasx (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) { int32_t iSatdSum = 0; iSatdSum += WelsSampleSatd8x8_lasx (pSample1, iStride1, pSample2, iStride2); iSatdSum += WelsSampleSatd8x8_lasx (pSample1 + 8, iStride1, pSample2 + 8, iStride2); iSatdSum += WelsSampleSatd8x8_lasx (pSample1 + (iStride1 << 3), iStride1, pSample2 + (iStride2 << 3), iStride2); iSatdSum += WelsSampleSatd8x8_lasx (pSample1 + (iStride1 << 3) + 8, iStride1, pSample2 + (iStride2 << 3) + 8, iStride2); return iSatdSum; }