/*!
 **********************************************************************************
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Lu Wang
 *                Jin Bo
 *
 * \copy
 *     Copyright (c) 2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 * \file    deblock_lsx.c
 *
 * \brief   Loongson optimization
 *
 * \date    22/2/2022 Created
 *
 **********************************************************************************
 */

#include <stdint.h> /* fixed-width integer types (uint8_t, int32_t, int8_t) */
#include "loongson_intrinsics.h"

void DeblockLumaLt4V_lsx (uint8_t* pPix, int32_t iStrideX, int32_t iAlpha,
                          int32_t iBeta, int8_t* pTc) {
  __m128i p0, p1, p2, q0, q1, q2;
  __m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
  __m128i p0_h, p1_h, p2_h, q0_h, q1_h, q2_h;
  __m128i t0, t1, t2, t3, t;
  __m128i t0_l, t0_h, t1_l, t1_h, t2_l, t2_h;
  __m128i iTc, iTc0, negiTc, negiTc0, f, flags;
  __m128i iTc_l, iTc_h, negiTc_l, negiTc_h;
  __m128i iTc0_l, iTc0_h, negiTc0_l, negiTc0_h;
  __m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
  __m128i beta = __lsx_vreplgr2vr_b(iBeta);
  __m128i shuf = {0x0101010100000000, 0x0303030302020202};
  __m128i not_255 = {0xff00ff00ff00ff00, 0xff00ff00ff00ff00};
  int32_t iStrideX_x0 = 0;
  int32_t iStrideX_x2 = iStrideX << 1;
  int32_t iStrideX_x3 = iStrideX_x2 + iStrideX;

  iTc0 = __lsx_vldx(pTc, 0);
  iTc0 = __lsx_vshuf_b(iTc0, iTc0, shuf);
  negiTc0 = __lsx_vneg_b(iTc0);
  iTc = iTc0;

  // Load data from pPix: three rows above (p0..p2) and below (q0..q2) the edge
  DUP4_ARG2(__lsx_vldx, pPix, -iStrideX, pPix, -iStrideX_x2, pPix, -iStrideX_x3,
            pPix, iStrideX_x0, p0, p1, p2, q0);
  DUP2_ARG2(__lsx_vldx, pPix, iStrideX, pPix, iStrideX_x2, q1, q2);

  // Calculate condition mask
  DUP4_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, q1, q0, p2, p0,
            bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0);
  bDetaQ2Q0 = __lsx_vabsd_bu(q2, q0);
  DUP4_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta, bDetaQ1Q0, beta,
            bDetaP2P0, beta, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0);
  bDetaQ2Q0 = __lsx_vslt_bu(bDetaQ2Q0, beta);

  // Unsigned extend p0..q2 from 8 bits to 16 bits
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, p2, zero, q0,
            p0_l, p1_l, p2_l, q0_l);
  DUP2_ARG2(__lsx_vilvl_b, zero, q1, zero, q2, q1_l, q2_l);
  DUP4_ARG2(__lsx_vilvh_b, zero, p0, zero, p1, zero, p2, zero, q0,
            p0_h, p1_h, p2_h, q0_h);
  DUP2_ARG2(__lsx_vilvh_b, zero, q1, zero, q2, q1_h, q2_h);

  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, f, bDetaQ1Q0, f, f);
  flags = __lsx_vsle_b(zero, iTc0);
  DUP2_ARG2(__lsx_vand_v, f, flags, flags, bDetaP2P0, flags, flags);
  flags = __lsx_vandi_b(flags, 1);
  iTc = __lsx_vadd_b(iTc, flags);
  flags = __lsx_vsle_b(zero, iTc0);
  DUP2_ARG2(__lsx_vand_v, f, flags, flags, bDetaQ2Q0, flags, flags);
  flags = __lsx_vandi_b(flags, 1);
  iTc = __lsx_vadd_b(iTc, flags);
  negiTc = __lsx_vneg_b(iTc);

  flags = __lsx_vslt_b(iTc0, zero);
  iTc0_l = __lsx_vilvl_b(flags, iTc0);
  iTc0_h = __lsx_vilvh_b(flags, iTc0);
  flags = __lsx_vslt_b(negiTc0, zero);
  negiTc0_l = __lsx_vilvl_b(flags, negiTc0);
  negiTc0_h = __lsx_vilvh_b(flags, negiTc0);
  flags = __lsx_vslt_b(iTc, zero);
  iTc_l = __lsx_vilvl_b(flags, iTc);
  iTc_h = __lsx_vilvh_b(flags, iTc);
  flags = __lsx_vslt_b(negiTc, zero);
  negiTc_l = __lsx_vilvl_b(flags, negiTc);
  negiTc_h = __lsx_vilvh_b(flags, negiTc);

  t0_l = __lsx_vadd_h(p0_l, q0_l);
  t0_l = __lsx_vaddi_hu(t0_l, 1);
  t0_l = __lsx_vsrai_h(t0_l, 1);
  t0_l = __lsx_vadd_h(p2_l, t0_l);
  t = __lsx_vslli_h(p1_l, 1);
  t0_l = __lsx_vsub_h(t0_l, t);
  t0_l = __lsx_vsrai_h(t0_l, 1);
  t0_l = __lsx_vmin_h(iTc0_l, t0_l);
  t0_l = __lsx_vmax_h(negiTc0_l, t0_l);
  t0_l = __lsx_vadd_h(p1_l, t0_l);

  t1_l = __lsx_vadd_h(p0_l, q0_l);
  t1_l = __lsx_vaddi_hu(t1_l, 1);
  t1_l = __lsx_vsrai_h(t1_l, 1);
  t1_l = __lsx_vadd_h(q2_l, t1_l);
  t = __lsx_vslli_h(q1_l, 1);
  t1_l = __lsx_vsub_h(t1_l, t);
  t1_l = __lsx_vsrai_h(t1_l, 1);
  t1_l = __lsx_vmin_h(iTc0_l, t1_l);
  t1_l = __lsx_vmax_h(negiTc0_l, t1_l);
  t1_l = __lsx_vadd_h(q1_l, t1_l);

  t0_h = __lsx_vadd_h(p0_h, q0_h);
  t0_h = __lsx_vaddi_hu(t0_h, 1);
  t0_h = __lsx_vsrai_h(t0_h, 1);
  t0_h = __lsx_vadd_h(p2_h, t0_h);
  t = __lsx_vslli_h(p1_h, 1);
  t0_h = __lsx_vsub_h(t0_h, t);
  t0_h = __lsx_vsrai_h(t0_h, 1);
  t0_h = __lsx_vmin_h(iTc0_h, t0_h);
  t0_h = __lsx_vmax_h(negiTc0_h, t0_h);
  t0_h = __lsx_vadd_h(p1_h, t0_h);

  t1_h = __lsx_vadd_h(p0_h, q0_h);
  t1_h = __lsx_vaddi_hu(t1_h, 1);
  t1_h = __lsx_vsrai_h(t1_h, 1);
  t1_h = __lsx_vadd_h(q2_h, t1_h);
  t = __lsx_vslli_h(q1_h, 1);
  t1_h = __lsx_vsub_h(t1_h, t);
  t1_h = __lsx_vsrai_h(t1_h, 1);
  t1_h = __lsx_vmin_h(iTc0_h, t1_h);
  t1_h = __lsx_vmax_h(negiTc0_h, t1_h);
  t1_h = __lsx_vadd_h(q1_h, t1_h);

  t2_l = __lsx_vsub_h(q0_l, p0_l);
  t2_l = __lsx_vslli_h(t2_l, 2);
  t2_l = __lsx_vadd_h(t2_l, p1_l);
  t2_l = __lsx_vsub_h(t2_l, q1_l);
  t2_l = __lsx_vaddi_hu(t2_l, 4);
  t2_l = __lsx_vsrai_h(t2_l, 3);
  t2_l = __lsx_vmin_h(iTc_l, t2_l);
  t2_l = __lsx_vmax_h(negiTc_l, t2_l);

  t2_h = __lsx_vsub_h(q0_h, p0_h);
  t2_h = __lsx_vslli_h(t2_h, 2);
  t2_h = __lsx_vadd_h(t2_h, p1_h);
  t2_h = __lsx_vsub_h(t2_h, q1_h);
  t2_h = __lsx_vaddi_hu(t2_h, 4);
  t2_h = __lsx_vsrai_h(t2_h, 3);
  t2_h = __lsx_vmin_h(iTc_h, t2_h);
  t2_h = __lsx_vmax_h(negiTc_h, t2_h);

  p0_l = __lsx_vadd_h(p0_l, t2_l);
  p1_l = __lsx_vand_v(p0_l, not_255);
  p2_l = __lsx_vsle_h(zero, p0_l);
  flags = __lsx_vseq_h(p1_l, zero);
  p0_l = __lsx_vand_v(p0_l, flags);
  flags = __lsx_vnor_v(flags, flags);
  p2_l = __lsx_vand_v(p2_l, flags);
  p0_l = __lsx_vadd_h(p0_l, p2_l);

  q0_l = __lsx_vsub_h(q0_l, t2_l);
  q1_l = __lsx_vand_v(q0_l, not_255);
  q2_l = __lsx_vsle_h(zero, q0_l);
  flags = __lsx_vseq_h(q1_l, zero);
  q0_l = __lsx_vand_v(q0_l, flags);
  flags = __lsx_vnor_v(flags, flags);
  q2_l = __lsx_vand_v(q2_l, flags);
  q0_l = __lsx_vadd_h(q0_l, q2_l);

  p0_h = __lsx_vadd_h(p0_h, t2_h);
  p1_h = __lsx_vand_v(p0_h, not_255);
  p2_h = __lsx_vsle_h(zero, p0_h);
  flags = __lsx_vseq_h(p1_h, zero);
  p0_h = __lsx_vand_v(p0_h, flags);
  flags = __lsx_vnor_v(flags, flags);
  p2_h = __lsx_vand_v(p2_h, flags);
  p0_h = __lsx_vadd_h(p0_h, p2_h);

  q0_h = __lsx_vsub_h(q0_h, t2_h);
  q1_h = __lsx_vand_v(q0_h, not_255);
  q2_h = __lsx_vsle_h(zero, q0_h);
  flags = __lsx_vseq_h(q1_h, zero);
  q0_h = __lsx_vand_v(q0_h, flags);
  flags = __lsx_vnor_v(flags, flags);
  q2_h = __lsx_vand_v(q2_h, flags);
  q0_h = __lsx_vadd_h(q0_h, q2_h);

  DUP4_ARG2(__lsx_vpickev_b, t0_h, t0_l, t1_h, t1_l, p0_h, p0_l, q0_h, q0_l,
            t0, t1, t2, t3);

  flags = __lsx_vsle_b(zero, iTc0);
  flags = __lsx_vand_v(flags, f);
  t2 = __lsx_vand_v(t2, flags);
  t = __lsx_vnor_v(flags, flags);
  p0 = __lsx_vand_v(p0, t);
  p0 = __lsx_vadd_b(t2, p0);
  t3 = __lsx_vand_v(t3, flags);
  t = __lsx_vnor_v(flags, flags);
  q0 = __lsx_vand_v(q0, t);
  q0 = __lsx_vadd_b(t3, q0);

  DUP2_ARG2(__lsx_vand_v, flags, bDetaP2P0, t0, t, t, t0);
  t = __lsx_vnor_v(t, t);
  p1 = __lsx_vand_v(p1, t);
  p1 = __lsx_vadd_b(t0, p1);
  DUP2_ARG2(__lsx_vand_v, flags, bDetaQ2Q0, t1, t, t, t1);
  t = __lsx_vnor_v(t, t);
  q1 = __lsx_vand_v(q1, t);
  q1 = __lsx_vadd_b(t1, q1);

  // Store data to pPix
  __lsx_vstx(p1, pPix, -iStrideX_x2);
  __lsx_vstx(p0, pPix, -iStrideX);
  __lsx_vstx(q0, pPix, iStrideX_x0);
  __lsx_vstx(q1, pPix, iStrideX);
}

void DeblockLumaLt4H_lsx (uint8_t* pPix, int32_t iStrideY, int32_t iAlpha,
                          int32_t iBeta, int8_t* pTc) {
  __m128i p0, p1, p2, q0, q1, q2;
  __m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
  __m128i p0_h, p1_h, p2_h, q0_h, q1_h, q2_h;
  __m128i t0, t1, t2, t3, t;
  __m128i t0_l, t0_h, t1_l, t1_h, t2_l, t2_h;
  __m128i iTc, iTc0, negiTc, negiTc0, f, flags;
  __m128i iTc_l, iTc_h, negiTc_l, negiTc_h;
  __m128i iTc0_l, iTc0_h, negiTc0_l, negiTc0_h;
  __m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
  __m128i beta = __lsx_vreplgr2vr_b(iBeta);
  __m128i shuf = {0x0101010100000000, 0x0303030302020202};
  __m128i not_255 = {0xff00ff00ff00ff00, 0xff00ff00ff00ff00};
  int32_t iStrideY_x0 = 0;
  int32_t iStrideY_x2 = iStrideY << 1;
  int32_t iStrideY_x3 = iStrideY_x2 + iStrideY;
  int32_t iStrideY_x4 = iStrideY << 2;

  iTc0 = __lsx_vldx(pTc, 0);
  iTc0 = __lsx_vshuf_b(iTc0, iTc0, shuf);
  negiTc0 = __lsx_vneg_b(iTc0);
  iTc = iTc0;

  // Load data from pPix and transpose to get p2..q2 columns
  pPix -= 3;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
            pPix, iStrideY_x3, p0_l, p1_l, p2_l, q0_l);
  pPix += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
            pPix, iStrideY_x3, p0_h, p1_h, p2_h, q0_h);
  pPix += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
            pPix, iStrideY_x3, q1_l, q2_l, t0_l, t1_l);
  pPix += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
            pPix, iStrideY_x3, q1_h, q2_h, t0_h, t1_h);
  LSX_TRANSPOSE16x8_B(p0_l, p1_l, p2_l, q0_l, p0_h, p1_h, p2_h, q0_h,
                      q1_l, q2_l, t0_l, t1_l, q1_h, q2_h, t0_h, t1_h,
                      p2, p1, p0, q0, q1, q2, t, f);

  // Calculate condition mask
  DUP4_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, q1, q0, p2, p0,
            bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0);
  bDetaQ2Q0 = __lsx_vabsd_bu(q2, q0);
  DUP4_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta, bDetaQ1Q0, beta,
            bDetaP2P0, beta, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0);
  bDetaQ2Q0 = __lsx_vslt_bu(bDetaQ2Q0, beta);

  // Unsigned extend p0..q2 from 8 bits to 16 bits
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, p2, zero, q0,
            p0_l, p1_l, p2_l, q0_l);
  DUP2_ARG2(__lsx_vilvl_b, zero, q1, zero, q2, q1_l, q2_l);
  DUP4_ARG2(__lsx_vilvh_b, zero, p0, zero, p1, zero, p2, zero, q0,
            p0_h, p1_h, p2_h, q0_h);
  DUP2_ARG2(__lsx_vilvh_b, zero, q1, zero, q2, q1_h, q2_h);

  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, f, bDetaQ1Q0, f, f);
  flags = __lsx_vsle_b(zero, iTc0);
  DUP2_ARG2(__lsx_vand_v, f, flags, flags, bDetaP2P0, flags, flags);
  flags = __lsx_vandi_b(flags, 1);
  iTc = __lsx_vadd_b(iTc, flags);
  flags = __lsx_vsle_b(zero, iTc0);
  DUP2_ARG2(__lsx_vand_v, f, flags, flags, bDetaQ2Q0, flags, flags);
  flags = __lsx_vandi_b(flags, 1);
  iTc = __lsx_vadd_b(iTc, flags);
  negiTc = __lsx_vneg_b(iTc);

  flags = __lsx_vslt_b(iTc0, zero);
  iTc0_l = __lsx_vilvl_b(flags, iTc0);
  iTc0_h = __lsx_vilvh_b(flags, iTc0);
  flags = __lsx_vslt_b(negiTc0, zero);
  negiTc0_l = __lsx_vilvl_b(flags, negiTc0);
  negiTc0_h = __lsx_vilvh_b(flags, negiTc0);
  flags = __lsx_vslt_b(iTc, zero);
  iTc_l = __lsx_vilvl_b(flags, iTc);
  iTc_h = __lsx_vilvh_b(flags, iTc);
  flags = __lsx_vslt_b(negiTc, zero);
  negiTc_l = __lsx_vilvl_b(flags, negiTc);
  negiTc_h = __lsx_vilvh_b(flags, negiTc);

  t0_l = __lsx_vadd_h(p0_l, q0_l);
  t0_l = __lsx_vaddi_hu(t0_l, 1);
  t0_l = __lsx_vsrai_h(t0_l, 1);
  t0_l = __lsx_vadd_h(p2_l, t0_l);
  t = __lsx_vslli_h(p1_l, 1);
  t0_l = __lsx_vsub_h(t0_l, t);
  t0_l = __lsx_vsrai_h(t0_l, 1);
  t0_l = __lsx_vmin_h(iTc0_l, t0_l);
  t0_l = __lsx_vmax_h(negiTc0_l, t0_l);
  t0_l = __lsx_vadd_h(p1_l, t0_l);

  t1_l = __lsx_vadd_h(p0_l, q0_l);
  t1_l = __lsx_vaddi_hu(t1_l, 1);
  t1_l = __lsx_vsrai_h(t1_l, 1);
  t1_l = __lsx_vadd_h(q2_l, t1_l);
  t = __lsx_vslli_h(q1_l, 1);
  t1_l = __lsx_vsub_h(t1_l, t);
  t1_l = __lsx_vsrai_h(t1_l, 1);
  t1_l = __lsx_vmin_h(iTc0_l, t1_l);
  t1_l = __lsx_vmax_h(negiTc0_l, t1_l);
  t1_l = __lsx_vadd_h(q1_l, t1_l);

  t0_h = __lsx_vadd_h(p0_h, q0_h);
  t0_h = __lsx_vaddi_hu(t0_h, 1);
  t0_h = __lsx_vsrai_h(t0_h, 1);
  t0_h = __lsx_vadd_h(p2_h, t0_h);
  t = __lsx_vslli_h(p1_h, 1);
  t0_h = __lsx_vsub_h(t0_h, t);
  t0_h = __lsx_vsrai_h(t0_h, 1);
  t0_h = __lsx_vmin_h(iTc0_h, t0_h);
  t0_h = __lsx_vmax_h(negiTc0_h, t0_h);
  t0_h = __lsx_vadd_h(p1_h, t0_h);

  t1_h = __lsx_vadd_h(p0_h, q0_h);
  t1_h = __lsx_vaddi_hu(t1_h, 1);
  t1_h = __lsx_vsrai_h(t1_h, 1);
  t1_h = __lsx_vadd_h(q2_h, t1_h);
  t = __lsx_vslli_h(q1_h, 1);
  t1_h = __lsx_vsub_h(t1_h, t);
  t1_h = __lsx_vsrai_h(t1_h, 1);
  t1_h = __lsx_vmin_h(iTc0_h, t1_h);
  t1_h = __lsx_vmax_h(negiTc0_h, t1_h);
  t1_h = __lsx_vadd_h(q1_h, t1_h);

  t2_l = __lsx_vsub_h(q0_l, p0_l);
  t2_l = __lsx_vslli_h(t2_l, 2);
  t2_l = __lsx_vadd_h(t2_l, p1_l);
  t2_l = __lsx_vsub_h(t2_l, q1_l);
  t2_l = __lsx_vaddi_hu(t2_l, 4);
  t2_l = __lsx_vsrai_h(t2_l, 3);
  t2_l = __lsx_vmin_h(iTc_l, t2_l);
  t2_l = __lsx_vmax_h(negiTc_l, t2_l);

  t2_h = __lsx_vsub_h(q0_h, p0_h);
  t2_h = __lsx_vslli_h(t2_h, 2);
  t2_h = __lsx_vadd_h(t2_h, p1_h);
  t2_h = __lsx_vsub_h(t2_h, q1_h);
  t2_h = __lsx_vaddi_hu(t2_h, 4);
  t2_h = __lsx_vsrai_h(t2_h, 3);
  t2_h = __lsx_vmin_h(iTc_h, t2_h);
  t2_h = __lsx_vmax_h(negiTc_h, t2_h);

  p0_l = __lsx_vadd_h(p0_l, t2_l);
  p1_l = __lsx_vand_v(p0_l, not_255);
  p2_l = __lsx_vsle_h(zero, p0_l);
  flags = __lsx_vseq_h(p1_l, zero);
  p0_l = __lsx_vand_v(p0_l, flags);
  flags = __lsx_vnor_v(flags, flags);
  p2_l = __lsx_vand_v(p2_l, flags);
  p0_l = __lsx_vadd_h(p0_l, p2_l);

  q0_l = __lsx_vsub_h(q0_l, t2_l);
  q1_l = __lsx_vand_v(q0_l, not_255);
  q2_l = __lsx_vsle_h(zero, q0_l);
  flags = __lsx_vseq_h(q1_l, zero);
  q0_l = __lsx_vand_v(q0_l, flags);
  flags = __lsx_vnor_v(flags, flags);
  q2_l = __lsx_vand_v(q2_l, flags);
  q0_l = __lsx_vadd_h(q0_l, q2_l);

  p0_h = __lsx_vadd_h(p0_h, t2_h);
  p1_h = __lsx_vand_v(p0_h, not_255);
  p2_h = __lsx_vsle_h(zero, p0_h);
  flags = __lsx_vseq_h(p1_h, zero);
  p0_h = __lsx_vand_v(p0_h, flags);
  flags = __lsx_vnor_v(flags, flags);
  p2_h = __lsx_vand_v(p2_h, flags);
  p0_h = __lsx_vadd_h(p0_h, p2_h);

  q0_h = __lsx_vsub_h(q0_h, t2_h);
  q1_h = __lsx_vand_v(q0_h, not_255);
  q2_h = __lsx_vsle_h(zero, q0_h);
  flags = __lsx_vseq_h(q1_h, zero);
  q0_h = __lsx_vand_v(q0_h, flags);
  flags = __lsx_vnor_v(flags, flags);
  q2_h = __lsx_vand_v(q2_h, flags);
  q0_h = __lsx_vadd_h(q0_h, q2_h);

  DUP4_ARG2(__lsx_vpickev_b, t0_h, t0_l, t1_h, t1_l, p0_h, p0_l, q0_h, q0_l,
            t0, t1, t2, t3);

  flags = __lsx_vsle_b(zero, iTc0);
  flags = __lsx_vand_v(flags, f);
  t2 = __lsx_vand_v(t2, flags);
  t = __lsx_vnor_v(flags, flags);
  p0 = __lsx_vand_v(p0, t);
  p0 = __lsx_vadd_b(t2, p0);
  t3 = __lsx_vand_v(t3, flags);
  t = __lsx_vnor_v(flags, flags);
  q0 = __lsx_vand_v(q0, t);
  q0 = __lsx_vadd_b(t3, q0);

  DUP2_ARG2(__lsx_vand_v, flags, bDetaP2P0, t0, t, t, t0);
  t = __lsx_vnor_v(t, t);
  p1 = __lsx_vand_v(p1, t);
  p1 = __lsx_vadd_b(t0, p1);
  DUP2_ARG2(__lsx_vand_v, flags, bDetaQ2Q0, t1, t, t, t1);
  t = __lsx_vnor_v(t, t);
  q1 = __lsx_vand_v(q1, t);
  q1 = __lsx_vadd_b(t1, q1);

  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, t0, t2);
  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, t1, t3);
  DUP2_ARG2(__lsx_vilvl_h, t2, t0, t3, t1, p0, p2);
  DUP2_ARG2(__lsx_vilvh_h, t2, t0, t3, t1, p1, q0);

  pPix -= iStrideY_x4;
  pPix -= iStrideY_x4;
  pPix -= iStrideY_x4 - 1;
  __lsx_vstelm_w(p0, pPix, 0, 0);
  __lsx_vstelm_w(p0, pPix + iStrideY, 0, 1);
  __lsx_vstelm_w(p0, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_w(p0, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_w(p1, pPix, 0, 0);
  __lsx_vstelm_w(p1, pPix + iStrideY, 0, 1);
  __lsx_vstelm_w(p1, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_w(p1, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_w(p2, pPix, 0, 0);
  __lsx_vstelm_w(p2, pPix + iStrideY, 0, 1);
  __lsx_vstelm_w(p2, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_w(p2, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_w(q0, pPix, 0, 0);
  __lsx_vstelm_w(q0, pPix + iStrideY, 0, 1);
  __lsx_vstelm_w(q0, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_w(q0, pPix + iStrideY_x3, 0, 3);
}

void DeblockLumaEq4V_lsx(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
                         int32_t iBeta) {
  int32_t iStride0 = 0;
  int32_t iStride_x2 = iStride << 1;
  int32_t iStride_x3 = iStride + iStride_x2;
  int32_t iStride_x4 = iStride << 2;
  __m128i p0, p1, p2, p3, q0, q1, q2, q3;
  __m128i p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
  __m128i p0_h, p1_h, p2_h, p3_h, q0_h, q1_h, q2_h, q3_h;
  __m128i t0, t1, t2, t0_con1, s0, s1, s2, s0_con1;
  __m128i alpha, beta;
  __m128i iDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
  __m128i mask0, mask1;

  DUP4_ARG2(__lsx_vldx, pPix, -iStride_x4, pPix, -iStride_x3, pPix,
            -iStride_x2, pPix, -iStride, p3, p2, p1, p0);
  DUP4_ARG2(__lsx_vldx, pPix, iStride_x3, pPix, iStride_x2, pPix, iStride,
            pPix, iStride0, q3, q2, q1, q0);
  alpha = __lsx_vreplgr2vr_b(iAlpha);
  beta = __lsx_vreplgr2vr_b(iBeta);

  iDetaP0Q0 = __lsx_vabsd_bu(p0, q0);
  DUP4_ARG2(__lsx_vabsd_bu, p1, p0, q1, q0, p2, p0, q2, q0,
            bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0);
  DUP4_ARG2(__lsx_vslt_bu, bDetaP1P0, beta, bDetaQ1Q0, beta, bDetaP2P0, beta,
            bDetaQ2Q0, beta, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0);

  DUP4_ARG2(__lsx_vsllwil_hu_bu, p0, 0, p1, 0, p2, 0, p3, 0,
            p0_l, p1_l, p2_l, p3_l);
  DUP4_ARG1(__lsx_vexth_hu_bu, p0, p1, p2, p3, p0_h, p1_h, p2_h, p3_h);
  DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0,
            q0_l, q1_l, q2_l, q3_l);
  DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);

  //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0
  mask0 = __lsx_vslt_bu(iDetaP0Q0, alpha);
  mask0 &= bDetaP1P0;
  mask0 &= bDetaQ1Q0;
  //iDetaP0Q0 < ((iAlpha >> 2) + 2)
  mask1 = __lsx_vsrli_b(alpha, 2);
  mask1 = __lsx_vaddi_bu(mask1, 2);
  mask1 = __lsx_vslt_bu(iDetaP0Q0, mask1);

  //low part
  //p0
  t0 = __lsx_vadd_h(__lsx_vslli_h(p1_l, 1), p2_l);
  t0 = __lsx_vadd_h(__lsx_vslli_h(p0_l, 1), t0);
  t0 = __lsx_vadd_h(__lsx_vslli_h(q0_l, 1), t0);
  t0 = __lsx_vadd_h(q1_l, t0);
  t0 = __lsx_vsrari_h(t0, 3);
  //p1
  t1 = __lsx_vadd_h(p2_l, p1_l);
  t1 = __lsx_vadd_h(p0_l, t1);
  t1 = __lsx_vadd_h(q0_l, t1);
  t1 = __lsx_vsrari_h(t1, 2);
  //p2
  t2 = __lsx_vadd_h(__lsx_vslli_h(p3_l, 1), p2_l);
  t2 = __lsx_vadd_h(__lsx_vslli_h(p2_l, 1), t2);
  t2 = __lsx_vadd_h(p1_l, t2);
  t2 = __lsx_vadd_h(p0_l, t2);
  t2 = __lsx_vadd_h(q0_l, t2);
  t2 = __lsx_vsrari_h(t2, 3);
  //p0 condition 1
  t0_con1 = __lsx_vadd_h(__lsx_vslli_h(p1_l, 1), p0_l);
  t0_con1 = __lsx_vadd_h(q1_l, t0_con1);
  t0_con1 = __lsx_vsrari_h(t0_con1, 2);
  //q0
  s0 = __lsx_vadd_h(__lsx_vslli_h(p0_l, 1), p1_l);
  s0 = __lsx_vadd_h(__lsx_vslli_h(q0_l, 1), s0);
  s0 = __lsx_vadd_h(__lsx_vslli_h(q1_l, 1), s0);
  s0 = __lsx_vadd_h(q2_l, s0);
  s0 = __lsx_vsrari_h(s0, 3);
  //q1
  s1 = __lsx_vadd_h(p0_l, q0_l);
  s1 = __lsx_vadd_h(q1_l, s1);
  s1 = __lsx_vadd_h(q2_l, s1);
  s1 = __lsx_vsrari_h(s1, 2);
  //q2
  s2 = __lsx_vadd_h(__lsx_vslli_h(q3_l, 1), q2_l);
  s2 = __lsx_vadd_h(__lsx_vslli_h(q2_l, 1), s2);
  s2 = __lsx_vadd_h(q1_l, s2);
  s2 = __lsx_vadd_h(q0_l, s2);
  s2 = __lsx_vadd_h(p0_l, s2);
  s2 = __lsx_vsrari_h(s2, 3);
  //q0 condition 1
  s0_con1 = __lsx_vadd_h(__lsx_vslli_h(q1_l, 1), q0_l);
  s0_con1 = __lsx_vadd_h(p1_l, s0_con1);
  s0_con1 = __lsx_vsrari_h(s0_con1, 2);
  //move back
  p0_l = t0;
  p1_l = t1;
  p2_l = t2;
  q0_l = s0;
  q1_l = s1;
  q2_l = s2;
  p3_l = t0_con1;
  q3_l = s0_con1;

  //high part
  //p0
  t0 = __lsx_vadd_h(__lsx_vslli_h(p1_h, 1), p2_h);
  t0 = __lsx_vadd_h(__lsx_vslli_h(p0_h, 1), t0);
  t0 = __lsx_vadd_h(__lsx_vslli_h(q0_h, 1), t0);
  t0 = __lsx_vadd_h(q1_h, t0);
  t0 = __lsx_vsrari_h(t0, 3);
  //p1
  t1 = __lsx_vadd_h(p2_h, p1_h);
  t1 = __lsx_vadd_h(p0_h, t1);
  t1 = __lsx_vadd_h(q0_h, t1);
  t1 = __lsx_vsrari_h(t1, 2);
  //p2
  t2 = __lsx_vadd_h(__lsx_vslli_h(p3_h, 1), p2_h);
  t2 = __lsx_vadd_h(__lsx_vslli_h(p2_h, 1), t2);
  t2 = __lsx_vadd_h(p1_h, t2);
  t2 = __lsx_vadd_h(p0_h, t2);
  t2 = __lsx_vadd_h(q0_h, t2);
  t2 = __lsx_vsrari_h(t2, 3);
  //p0 condition 1
  t0_con1 = __lsx_vadd_h(__lsx_vslli_h(p1_h, 1), p0_h);
  t0_con1 = __lsx_vadd_h(q1_h, t0_con1);
  t0_con1 = __lsx_vsrari_h(t0_con1, 2);
  //q0
  s0 = __lsx_vadd_h(__lsx_vslli_h(p0_h, 1), p1_h);
  s0 = __lsx_vadd_h(__lsx_vslli_h(q0_h, 1), s0);
  s0 = __lsx_vadd_h(__lsx_vslli_h(q1_h, 1), s0);
  s0 = __lsx_vadd_h(q2_h, s0);
  s0 = __lsx_vsrari_h(s0, 3);
  //q1
  s1 = __lsx_vadd_h(p0_h, q0_h);
  s1 = __lsx_vadd_h(q1_h, s1);
  s1 = __lsx_vadd_h(q2_h, s1);
  s1 = __lsx_vsrari_h(s1, 2);
  //q2
  s2 = __lsx_vadd_h(__lsx_vslli_h(q3_h, 1), q2_h);
  s2 = __lsx_vadd_h(__lsx_vslli_h(q2_h, 1), s2);
  s2 = __lsx_vadd_h(q1_h, s2);
  s2 = __lsx_vadd_h(q0_h, s2);
  s2 = __lsx_vadd_h(p0_h, s2);
  s2 = __lsx_vsrari_h(s2, 3);
  //q0 condition 1
  s0_con1 = __lsx_vadd_h(__lsx_vslli_h(q1_h, 1), q0_h);
  s0_con1 = __lsx_vadd_h(p1_h, s0_con1);
  s0_con1 = __lsx_vsrari_h(s0_con1, 2);
  //move back
  p0_h = t0;
  p1_h = t1;
  p2_h = t2;
  q0_h = s0;
  q1_h = s1;
  q2_h = s2;
  p3_h = t0_con1;
  q3_h = s0_con1;

  //pack low part and high part
  DUP4_ARG2(__lsx_vpickev_b, p0_h, p0_l, p1_h, p1_l, p2_h, p2_l, q0_h, q0_l,
            t0, t1, t2, s0);
  DUP4_ARG2(__lsx_vpickev_b, q1_h, q1_l, q2_h, q2_l, p3_h, p3_l, q3_h, q3_l,
            s1, s2, t0_con1, s0_con1);

  t0 = t0 & mask0 & mask1 & bDetaP2P0;
  t0 = __lsx_vadd_b(t0, t0_con1 & mask0 & mask1 & (~bDetaP2P0));
  t0 = __lsx_vadd_b(t0, t0_con1 & mask0 & (~mask1));
  t1 = t1 & mask0 & mask1 & bDetaP2P0;
  t2 = t2 & mask0 & mask1 & bDetaP2P0;
  s0 = s0 & mask0 & mask1 & bDetaQ2Q0;
  s0 = __lsx_vadd_b(s0, s0_con1 & mask0 & mask1 & (~bDetaQ2Q0));
  s0 = __lsx_vadd_b(s0, s0_con1 & mask0 & (~mask1));
  s1 = s1 & mask0 & mask1 & bDetaQ2Q0;
  s2 = s2 & mask0 & mask1 & bDetaQ2Q0;

  p0 = __lsx_vadd_b(t0, p0 & (~mask0));
  p1 = __lsx_vadd_b(t1, p1 & ~(mask0 & mask1 & bDetaP2P0));
  p2 = __lsx_vadd_b(t2, p2 & ~(mask0 & mask1 & bDetaP2P0));
  q0 = __lsx_vadd_b(s0, q0 & (~mask0));
  q1 = __lsx_vadd_b(s1, q1 & ~(mask0 & mask1 & bDetaQ2Q0));
  q2 = __lsx_vadd_b(s2, q2 & ~(mask0 & mask1 & bDetaQ2Q0));

  //Store back
  __lsx_vstx(p2, pPix, -iStride_x3);
  __lsx_vstx(p1, pPix, -iStride_x2);
  __lsx_vstx(p0, pPix, -iStride);
  __lsx_vstx(q0, pPix, iStride0);
  __lsx_vstx(q1, pPix, iStride);
  __lsx_vstx(q2, pPix, iStride_x2);
}

void DeblockLumaEq4H_lsx (uint8_t* pPix, int32_t iStrideY, int32_t iAlpha,
                          int32_t iBeta) {
  __m128i p0, p1, p2, p3, q0, q1, q2, q3;
  __m128i p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
  __m128i p0_h, p1_h, p2_h, p3_h, q0_h, q1_h, q2_h, q3_h;
  __m128i t0, t1, t2, t3, t4, t5, t6, t7, temp;
  __m128i t0_l, t0_h, t1_l, t1_h, t2_l, t2_h;
  __m128i t3_l, t3_h, t4_l, t4_h, t5_l, t5_h;
  __m128i t6_l, t6_h, t7_l, t7_h;
  __m128i f0, f1, f2, f3, fn;
  __m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
  __m128i beta = __lsx_vreplgr2vr_b(iBeta);
  int32_t iStrideY_x0 = 0;
  int32_t iStrideY_x2 = iStrideY << 1;
  int32_t iStrideY_x3 = iStrideY_x2 + iStrideY;
  int32_t iStrideY_x4 = iStrideY << 2;

  // Load data from pPix
  pPix -= 4;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
            pPix, iStrideY_x3, p0_l, p1_l, p2_l, q0_l);
  pPix += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
            pPix, iStrideY_x3, p0_h, p1_h, p2_h, q0_h);
  pPix += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
            pPix, iStrideY_x3, q1_l, q2_l, t0_l, t1_l);
  pPix += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPix, iStrideY_x0, pPix, iStrideY, pPix, iStrideY_x2,
            pPix, iStrideY_x3, q1_h, q2_h, t0_h, t1_h);
  LSX_TRANSPOSE16x8_B(p0_l, p1_l, p2_l, q0_l, p0_h, p1_h, p2_h, q0_h,
                      q1_l, q2_l, t0_l, t1_l, q1_h, q2_h, t0_h, t1_h,
                      p3, p2, p1, p0, q0, q1, q2, q3);

  // Calculate condition mask
  bDetaP0Q0 = __lsx_vabsd_bu(p0, q0);
  DUP4_ARG2(__lsx_vabsd_bu, p1, p0, q1, q0, p2, p0, q2, q0,
            bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0);
  DUP4_ARG2(__lsx_vslt_bu, bDetaP1P0, beta, bDetaQ1Q0, beta, bDetaP2P0, beta,
            bDetaQ2Q0, beta, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0);

  // Unsigned extend p0, p1, p2, p3, q0, q1, q2, q3 from 8 bits to 16 bits
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, p2, zero, q0,
            p0_l, p1_l, p2_l, q0_l);
  DUP4_ARG2(__lsx_vilvh_b, zero, p0, zero, p1, zero, p2, zero, q0,
            p0_h, p1_h, p2_h, q0_h);
  DUP2_ARG2(__lsx_vilvl_b, zero, q1, zero, q2, q1_l, q2_l);
  DUP2_ARG2(__lsx_vilvh_b, zero, q1, zero, q2, q1_h, q2_h);
  DUP2_ARG2(__lsx_vilvl_b, zero, p3, zero, q3, p3_l, q3_l);
  DUP2_ARG2(__lsx_vilvh_b, zero, p3, zero, q3, p3_h, q3_h);

  // Calculate the low part
  // (p2 + (p1 * (1 << 1)) + (p0 * (1 << 1)) + (q0 * (1 << 1)) + q1 + 4) >> 3
  t0_l = __lsx_vslli_h(p1_l, 1);
  t0_l = __lsx_vadd_h(t0_l, p2_l);
  temp = __lsx_vslli_h(p0_l, 1);
  t0_l = __lsx_vadd_h(t0_l, temp);
  temp = __lsx_vslli_h(q0_l, 1);
  t0_l = __lsx_vadd_h(t0_l, temp);
  t0_l = __lsx_vadd_h(t0_l, q1_l);
  t0_l = __lsx_vaddi_hu(t0_l, 4);
  t0_l = __lsx_vsrai_h(t0_l, 3);
  // (p2 + p1 + p0 + q0 + 2) >> 2
  t1_l = __lsx_vadd_h(p2_l, p1_l);
  t1_l = __lsx_vadd_h(t1_l, p0_l);
  t1_l = __lsx_vadd_h(t1_l, q0_l);
  t1_l = __lsx_vaddi_hu(t1_l, 2);
  t1_l = __lsx_vsrai_h(t1_l, 2);
  // ((p3 * (1 << 1)) + p2 + (p2 * (1 << 1)) + p1 + p0 + q0 + 4) >> 3
  t2_l = __lsx_vslli_h(p3_l, 1);
  t2_l = __lsx_vadd_h(t2_l, p2_l);
  temp = __lsx_vslli_h(p2_l, 1);
  t2_l = __lsx_vadd_h(t2_l, temp);
  t2_l = __lsx_vadd_h(t2_l, p1_l);
  t2_l = __lsx_vadd_h(t2_l, p0_l);
  t2_l = __lsx_vadd_h(t2_l, q0_l);
  t2_l = __lsx_vaddi_hu(t2_l, 4);
  t2_l = __lsx_vsrai_h(t2_l, 3);
  // ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2
  t3_l = __lsx_vslli_h(p1_l, 1);
  t3_l = __lsx_vadd_h(t3_l, p0_l);
  t3_l = __lsx_vadd_h(t3_l, q1_l);
  t3_l = __lsx_vaddi_hu(t3_l, 2);
  t3_l = __lsx_vsrai_h(t3_l, 2);
  // (p1 + (p0 * (1 << 1)) + (q0 * (1 << 1)) + (q1 * (1 << 1)) + q2 + 4) >> 3
  t4_l = __lsx_vslli_h(p0_l, 1);
  t4_l = __lsx_vadd_h(t4_l, p1_l);
  temp = __lsx_vslli_h(q0_l, 1);
  t4_l = __lsx_vadd_h(t4_l, temp);
  temp = __lsx_vslli_h(q1_l, 1);
  t4_l = __lsx_vadd_h(t4_l, temp);
  t4_l = __lsx_vadd_h(t4_l, q2_l);
  t4_l = __lsx_vaddi_hu(t4_l, 4);
  t4_l = __lsx_vsrai_h(t4_l, 3);
  // (p0 + q0 + q1 + q2 + 2) >> 2
  t5_l = __lsx_vadd_h(p0_l, q0_l);
  t5_l = __lsx_vadd_h(t5_l, q1_l);
  t5_l = __lsx_vadd_h(t5_l, q2_l);
  t5_l = __lsx_vaddi_hu(t5_l, 2);
  t5_l = __lsx_vsrai_h(t5_l, 2);
  // ((q3 * (1 << 1)) + q2 + (q2 * (1 << 1)) + q1 + q0 + p0 + 4) >> 3
  t6_l = __lsx_vslli_h(q3_l, 1);
  t6_l = __lsx_vadd_h(t6_l, q2_l);
  temp = __lsx_vslli_h(q2_l, 1);
  t6_l = __lsx_vadd_h(t6_l, temp);
  t6_l = __lsx_vadd_h(t6_l, q1_l);
  t6_l = __lsx_vadd_h(t6_l, q0_l);
  t6_l = __lsx_vadd_h(t6_l, p0_l);
  t6_l = __lsx_vaddi_hu(t6_l, 4);
  t6_l = __lsx_vsrai_h(t6_l, 3);
  // ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2
  t7_l = __lsx_vslli_h(q1_l, 1);
  t7_l = __lsx_vadd_h(t7_l, q0_l);
  t7_l = __lsx_vadd_h(t7_l, p1_l);
  t7_l = __lsx_vaddi_hu(t7_l, 2);
  t7_l = __lsx_vsrai_h(t7_l, 2);

  // Calculate the high part
  // (p2 + (p1 * (1 << 1)) + (p0 * (1 << 1)) + (q0 * (1 << 1)) + q1 + 4) >> 3
  t0_h = __lsx_vslli_h(p1_h, 1);
  t0_h = __lsx_vadd_h(t0_h, p2_h);
  temp = __lsx_vslli_h(p0_h, 1);
  t0_h = __lsx_vadd_h(t0_h, temp);
  temp = __lsx_vslli_h(q0_h, 1);
  t0_h = __lsx_vadd_h(t0_h, temp);
  t0_h = __lsx_vadd_h(t0_h, q1_h);
  t0_h = __lsx_vaddi_hu(t0_h, 4);
  t0_h = __lsx_vsrai_h(t0_h, 3);
  // (p2 + p1 + p0 + q0 + 2) >> 2
  t1_h = __lsx_vadd_h(p2_h, p1_h);
  t1_h = __lsx_vadd_h(t1_h, p0_h);
  t1_h = __lsx_vadd_h(t1_h, q0_h);
  t1_h = __lsx_vaddi_hu(t1_h, 2);
  t1_h = __lsx_vsrai_h(t1_h, 2);
  // ((p3 * (1 << 1)) + p2 + (p2 * (1 << 1)) + p1 + p0 + q0 + 4) >> 3
  t2_h = __lsx_vslli_h(p3_h, 1);
  t2_h = __lsx_vadd_h(t2_h, p2_h);
  temp = __lsx_vslli_h(p2_h, 1);
  t2_h = __lsx_vadd_h(t2_h, temp);
  t2_h = __lsx_vadd_h(t2_h, p1_h);
  t2_h = __lsx_vadd_h(t2_h, p0_h);
  t2_h = __lsx_vadd_h(t2_h, q0_h);
  t2_h = __lsx_vaddi_hu(t2_h, 4);
  t2_h = __lsx_vsrai_h(t2_h, 3);
  // ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2
  t3_h = __lsx_vslli_h(p1_h, 1);
  t3_h = __lsx_vadd_h(t3_h, p0_h);
  t3_h = __lsx_vadd_h(t3_h, q1_h);
  t3_h = __lsx_vaddi_hu(t3_h, 2);
  t3_h = __lsx_vsrai_h(t3_h, 2);
  // (p1 + (p0 * (1 << 1)) + (q0 * (1 << 1)) + (q1 * (1 << 1)) + q2 + 4) >> 3
  t4_h = __lsx_vslli_h(p0_h, 1);
  t4_h = __lsx_vadd_h(t4_h, p1_h);
  temp = __lsx_vslli_h(q0_h, 1);
  t4_h = __lsx_vadd_h(t4_h, temp);
  temp = __lsx_vslli_h(q1_h, 1);
  t4_h = __lsx_vadd_h(t4_h, temp);
  t4_h = __lsx_vadd_h(t4_h, q2_h);
  t4_h = __lsx_vaddi_hu(t4_h, 4);
  t4_h = __lsx_vsrai_h(t4_h, 3);
  // (p0 + q0 + q1 + q2 + 2) >> 2
  t5_h = __lsx_vadd_h(p0_h, q0_h);
  t5_h = __lsx_vadd_h(t5_h, q1_h);
  t5_h = __lsx_vadd_h(t5_h, q2_h);
  t5_h = __lsx_vaddi_hu(t5_h, 2);
  t5_h = __lsx_vsrai_h(t5_h, 2);
  // ((q3 * (1 << 1)) + q2 + (q2 * (1 << 1)) + q1 + q0 + p0 + 4) >> 3
  t6_h = __lsx_vslli_h(q3_h, 1);
  t6_h = __lsx_vadd_h(t6_h, q2_h);
  temp = __lsx_vslli_h(q2_h, 1);
  t6_h = __lsx_vadd_h(t6_h, temp);
  t6_h = __lsx_vadd_h(t6_h, q1_h);
  t6_h = __lsx_vadd_h(t6_h, q0_h);
  t6_h = __lsx_vadd_h(t6_h, p0_h);
  t6_h = __lsx_vaddi_hu(t6_h, 4);
  t6_h = __lsx_vsrai_h(t6_h, 3);
  // ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2
  t7_h = __lsx_vslli_h(q1_h, 1);
  t7_h = __lsx_vadd_h(t7_h, q0_h);
  t7_h = __lsx_vadd_h(t7_h, p1_h);
  t7_h = __lsx_vaddi_hu(t7_h, 2);
  t7_h = __lsx_vsrai_h(t7_h, 2);

  // Combined low and high
  DUP4_ARG2(__lsx_vpickev_b, t0_h, t0_l, t1_h, t1_l, t2_h, t2_l, t3_h, t3_l,
            t0, t1, t2, t3);
  DUP4_ARG2(__lsx_vpickev_b, t4_h, t4_l, t5_h, t5_l, t6_h, t6_l, t7_h, t7_l,
            t4, t5, t6, t7);

  f0 = __lsx_vslt_bu(bDetaP0Q0, alpha);
  f0 = __lsx_vand_v(f0, bDetaP1P0);
  f0 = __lsx_vand_v(f0, bDetaQ1Q0);
  f1 = __lsx_vsrli_b(alpha, 2);
  f1 = __lsx_vaddi_bu(f1, 2);
  f1 = __lsx_vslt_bu(bDetaP0Q0, f1);

  // t0
  f2 = __lsx_vand_v(f0, f1);
  fn = __lsx_vand_v(f2, bDetaP2P0);
  f3 = __lsx_vand_v(fn, t0);
  f2 = __lsx_vnor_v(bDetaP2P0, bDetaP2P0);
  fn = __lsx_vand_v(f0, f2);
  fn = __lsx_vand_v(fn, f1);
  t0 = __lsx_vand_v(fn, t3);
  t0 = __lsx_vadd_b(f3, t0);
  fn = __lsx_vnor_v(f1, f1);
  fn = __lsx_vand_v(fn, f0);
  f3 = __lsx_vand_v(fn, t3);
  t0 = __lsx_vadd_b(f3, t0);
  // t1
  f2 = __lsx_vand_v(f0, f1);
  f2 = __lsx_vand_v(f2, bDetaP2P0);
  t1 = __lsx_vand_v(f2, t1);
  // t2
  f2 = __lsx_vand_v(f0, f1);
  f2 = __lsx_vand_v(f2, bDetaP2P0);
  t2 = __lsx_vand_v(f2, t2);
  // t3
  f2 = __lsx_vand_v(f0, f1);
  fn = __lsx_vand_v(f2, bDetaQ2Q0);
  f3 = __lsx_vand_v(fn, t4);
  fn = __lsx_vnor_v(bDetaQ2Q0, bDetaQ2Q0);
  fn = __lsx_vand_v(fn, f2);
  t3 = __lsx_vand_v(fn, t7);
  t3 = __lsx_vadd_b(f3, t3);
  fn = __lsx_vnor_v(f1, f1);
  fn = __lsx_vand_v(fn, f0);
  f3 = __lsx_vand_v(fn, t7);
  t3 = __lsx_vadd_b(f3, t3);
  // t4
  f2 = __lsx_vand_v(f0, f1);
  f2 = __lsx_vand_v(f2, bDetaQ2Q0);
  t4 = __lsx_vand_v(f2, t5);
  // t5
  f2 = __lsx_vand_v(f0, f1);
  f2 = __lsx_vand_v(f2, bDetaQ2Q0);
  t5 = __lsx_vand_v(f2, t6);

  // p0
  fn = __lsx_vnor_v(f0, f0);
  p0 = __lsx_vand_v(fn, p0);
  p0 = __lsx_vadd_b(p0, t0);
  // p1
  f2 = __lsx_vand_v(f0, f1);
  f2 = __lsx_vand_v(f2, bDetaP2P0);
  fn = __lsx_vnor_v(f2, f2);
  p1 = __lsx_vand_v(fn, p1);
  p1 = __lsx_vadd_b(t1, p1);
  // p2
  f2 = __lsx_vand_v(f0, f1);
  f2 = __lsx_vand_v(f2, bDetaP2P0);
  fn = __lsx_vnor_v(f2, f2);
  p2 = __lsx_vand_v(fn, p2);
  p2 = __lsx_vadd_b(t2, p2);
  // q0
  fn = __lsx_vnor_v(f0, f0);
  q0 = __lsx_vand_v(fn, q0);
  q0 = __lsx_vadd_b(q0, t3);
  // q1
  f2 = __lsx_vand_v(f0, f1);
  f2 = __lsx_vand_v(f2, bDetaQ2Q0);
  fn = __lsx_vnor_v(f2, f2);
  q1 = __lsx_vand_v(fn, q1);
  q1 = __lsx_vadd_b(q1, t4);
  // q2
  f2 = __lsx_vand_v(f0, f1);
  f2 = __lsx_vand_v(f2, bDetaQ2Q0);
  fn = __lsx_vnor_v(f2, f2);
  q2 = __lsx_vand_v(fn, q2);
  q2 = __lsx_vadd_b(q2, t5);

  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, t0, t1);
  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, t2, t3);
  DUP2_ARG2(__lsx_vilvl_h, t1, t0, t3, t2, p0, p1);
  DUP2_ARG2(__lsx_vilvh_h, t1, t0, t3, t2, p2, p3);
  t1 = __lsx_vilvl_b(q2, q1);
  t2 = __lsx_vilvh_b(q2, q1);

  // Store data to pPix
  pPix -= iStrideY_x4;
  pPix -= iStrideY_x4;
  pPix -= iStrideY_x4;
  pPix += 1;
  __lsx_vstelm_w(p0, pPix, 0, 0);
  __lsx_vstelm_w(p0, pPix + iStrideY, 0, 1);
  __lsx_vstelm_w(p0, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_w(p0, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_w(p2, pPix, 0, 0);
  __lsx_vstelm_w(p2, pPix + iStrideY, 0, 1);
  __lsx_vstelm_w(p2, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_w(p2, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_w(p1, pPix, 0, 0);
  __lsx_vstelm_w(p1, pPix + iStrideY, 0, 1);
  __lsx_vstelm_w(p1, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_w(p1, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_w(p3, pPix, 0, 0);
  __lsx_vstelm_w(p3, pPix + iStrideY, 0, 1);
  __lsx_vstelm_w(p3, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_w(p3, pPix + iStrideY_x3, 0, 3);

  pPix -= iStrideY_x4;
  pPix -= iStrideY_x4;
  pPix -= iStrideY_x4;
  pPix += 4;
  __lsx_vstelm_h(t1, pPix, 0, 0);
  __lsx_vstelm_h(t1, pPix + iStrideY, 0, 1);
  __lsx_vstelm_h(t1, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_h(t1, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_h(t1, pPix, 0, 4);
  __lsx_vstelm_h(t1, pPix + iStrideY, 0, 5);
  __lsx_vstelm_h(t1, pPix + iStrideY_x2, 0, 6);
  __lsx_vstelm_h(t1, pPix + iStrideY_x3, 0, 7);
  pPix += iStrideY_x4;
  __lsx_vstelm_h(t2, pPix, 0, 0);
  __lsx_vstelm_h(t2, pPix + iStrideY, 0, 1);
  __lsx_vstelm_h(t2, pPix + iStrideY_x2, 0, 2);
  __lsx_vstelm_h(t2, pPix + iStrideY_x3, 0, 3);
  pPix += iStrideY_x4;
  __lsx_vstelm_h(t2, pPix, 0, 4);
  __lsx_vstelm_h(t2, pPix + iStrideY, 0, 5);
  __lsx_vstelm_h(t2, pPix + iStrideY_x2, 0, 6);
  __lsx_vstelm_h(t2, pPix + iStrideY_x3, 0, 7);
}

void DeblockChromaLt4V_lsx (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX,
                            int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
  __m128i p0, p1, q0, q1, t0, t1, tp;
  __m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
  __m128i iTc0, negiTc0, iTc0_l, negiTc0_l;
  __m128i flags, flag, iDeta_l;
  __m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
  __m128i beta = __lsx_vreplgr2vr_b(iBeta);
  __m128i shuf = {0x0303020201010000, 0x0};
  __m128i not_255 = {0xff00ff00ff00ff00, 0xff00ff00ff00ff00};
  int32_t iStrideX_x0 = 0;
  int32_t iStrideX_x2 = iStrideX << 1;

  iTc0 = __lsx_vldx(pTc, 0);
  iTc0 = __lsx_vshuf_b(iTc0, iTc0, shuf);
  negiTc0 = __lsx_vneg_b(iTc0);
  flag = __lsx_vslt_b(iTc0, zero);
  iTc0_l = __lsx_vilvl_b(flag, iTc0);
  flag = __lsx_vslt_b(negiTc0, zero);
  negiTc0_l = __lsx_vilvl_b(flag, negiTc0);

  // Load data from pPixCb
  DUP4_ARG2(__lsx_vldx, pPixCb, -iStrideX, pPixCb, -iStrideX_x2, pPixCb,
            iStrideX_x0, pPixCb, iStrideX, p0, p1, q0, q1);
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, q0, zero, q1,
            p0_l, p1_l, q0_l, q1_l);

  // Calculate condition mask
  DUP2_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vabsd_bu(q1, q0);
  DUP2_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta,
            bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vslt_bu(bDetaQ1Q0, beta);
  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags,
            flags, flags);

  // Calculate the low part
  // WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0)
  iDeta_l = __lsx_vsub_h(q0_l, p0_l);
  iDeta_l = __lsx_vslli_h(iDeta_l, 2);
  iDeta_l = __lsx_vadd_h(iDeta_l, p1_l);
  iDeta_l = __lsx_vsub_h(iDeta_l, q1_l);
  iDeta_l = __lsx_vaddi_hu(iDeta_l, 4);
  iDeta_l = __lsx_vsrai_h(iDeta_l, 3);
  iDeta_l = __lsx_vmin_h(iTc0_l, iDeta_l);
  iDeta_l = __lsx_vmax_h(negiTc0_l, iDeta_l);
  // WelsClip1 (p0 + iDeta)
  p0_l = __lsx_vadd_h(p0_l, iDeta_l);
  p1_l = __lsx_vand_v(p0_l, not_255);
  p2_l = __lsx_vsle_h(zero, p0_l);
  flag = __lsx_vseq_h(p1_l, zero);
  p0_l = __lsx_vand_v(p0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  p2_l = __lsx_vand_v(p2_l, flag);
  p0_l = __lsx_vadd_h(p0_l, p2_l);
  // WelsClip1 (q0 - iDeta)
  q0_l = __lsx_vsub_h(q0_l, iDeta_l);
  q1_l = __lsx_vand_v(q0_l, not_255);
  q2_l = __lsx_vsle_h(zero, q0_l);
  flag = __lsx_vseq_h(q1_l, zero);
  q0_l = __lsx_vand_v(q0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  q2_l = __lsx_vand_v(q2_l, flag);
  q0_l = __lsx_vadd_h(q0_l, q2_l);

  DUP2_ARG2(__lsx_vpickev_b, zero, p0_l, zero, q0_l, t0, t1);
  flag = __lsx_vsle_b(zero, iTc0);
  flag = __lsx_vand_v(flag, flags);
  t0 = __lsx_vand_v(t0, flag);
  tp = __lsx_vnor_v(flag, flag);
  p0 = __lsx_vand_v(p0, tp);
  p0 = __lsx_vadd_b(t0, p0);
  t1 = __lsx_vand_v(t1, flag);
  tp = __lsx_vnor_v(flag, flag);
  q0 = __lsx_vand_v(q0, tp);
  q0 = __lsx_vadd_b(t1, q0);

  // Store data to pPixCb
  __lsx_vstelm_d(p0, pPixCb - iStrideX, 0, 0);
  __lsx_vstelm_d(q0, pPixCb, 0, 0);

  // Load data from pPixCr
  DUP4_ARG2(__lsx_vldx, pPixCr, -iStrideX, pPixCr, -iStrideX_x2, pPixCr,
            iStrideX_x0, pPixCr, iStrideX, p0, p1, q0, q1);
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, q0, zero, q1,
            p0_l, p1_l, q0_l, q1_l);

  // Calculate condition mask
  DUP2_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vabsd_bu(q1, q0);
  DUP2_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta,
            bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vslt_bu(bDetaQ1Q0, beta);
  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags,
            flags, flags);

  // Calculate the low part
  // WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0)
  iDeta_l = __lsx_vsub_h(q0_l, p0_l);
  iDeta_l = __lsx_vslli_h(iDeta_l, 2);
  iDeta_l = __lsx_vadd_h(iDeta_l, p1_l);
  iDeta_l = __lsx_vsub_h(iDeta_l, q1_l);
  iDeta_l = __lsx_vaddi_hu(iDeta_l, 4);
  iDeta_l = __lsx_vsrai_h(iDeta_l, 3);
  iDeta_l = __lsx_vmin_h(iTc0_l, iDeta_l);
  iDeta_l = __lsx_vmax_h(negiTc0_l, iDeta_l);
  // WelsClip1 (p0 + iDeta)
  p0_l = __lsx_vadd_h(p0_l, iDeta_l);
  p1_l = __lsx_vand_v(p0_l, not_255);
  p2_l = __lsx_vsle_h(zero, p0_l);
  flag = __lsx_vseq_h(p1_l, zero);
  p0_l = __lsx_vand_v(p0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  p2_l = __lsx_vand_v(p2_l, flag);
  p0_l = __lsx_vadd_h(p0_l, p2_l);
  // WelsClip1 (q0 - iDeta)
  q0_l = __lsx_vsub_h(q0_l, iDeta_l);
  q1_l = __lsx_vand_v(q0_l, not_255);
  q2_l = __lsx_vsle_h(zero, q0_l);
  flag = __lsx_vseq_h(q1_l, zero);
  q0_l = __lsx_vand_v(q0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  q2_l = __lsx_vand_v(q2_l, flag);
  q0_l = __lsx_vadd_h(q0_l, q2_l);

  DUP2_ARG2(__lsx_vpickev_b, zero, p0_l, zero, q0_l, t0, t1);
  flag = __lsx_vsle_b(zero, iTc0);
  flag = __lsx_vand_v(flag, flags);
  t0 = __lsx_vand_v(t0, flag);
  tp = __lsx_vnor_v(flag, flag);
  p0 = __lsx_vand_v(p0, tp);
  p0 = __lsx_vadd_b(t0, p0);
  t1 = __lsx_vand_v(t1, flag);
  tp = __lsx_vnor_v(flag, flag);
  q0 = __lsx_vand_v(q0, tp);
  q0 = __lsx_vadd_b(t1, q0);

  // Store data to pPixCr
  __lsx_vstelm_d(p0, pPixCr - iStrideX, 0, 0);
  __lsx_vstelm_d(q0, pPixCr, 0, 0);
}

void DeblockChromaLt4H_lsx (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideY,
                            int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
  __m128i p0, p1, q0, q1, t0, t1, t2, t3, tp;
  __m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
  __m128i iTc0, negiTc0, iTc0_l, negiTc0_l;
  __m128i flags, flag, iDeta_l;
  __m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
  __m128i beta = __lsx_vreplgr2vr_b(iBeta);
  __m128i shuf = {0x0303020201010000, 0x0};
  __m128i not_255 = {0xff00ff00ff00ff00, 0xff00ff00ff00ff00};
  int32_t iStrideY_x0 = 0;
  int32_t iStrideY_x2 = iStrideY << 1;
  int32_t iStrideY_x3 = iStrideY_x2 + iStrideY;
  int32_t iStrideY_x4 = iStrideY << 2;

  iTc0 = __lsx_vldx(pTc, 0);
  iTc0 = __lsx_vshuf_b(iTc0, iTc0, shuf);
  negiTc0 = __lsx_vneg_b(iTc0);
  flag = __lsx_vslt_b(iTc0, zero);
  iTc0_l = __lsx_vilvl_b(flag, iTc0);
  flag = __lsx_vslt_b(negiTc0, zero);
  negiTc0_l = __lsx_vilvl_b(flag, negiTc0);

  // Load data from pPixCb
  pPixCb -= 2;
  DUP4_ARG2(__lsx_vldx, pPixCb, iStrideY_x0, pPixCb, iStrideY, pPixCb,
            iStrideY_x2, pPixCb, iStrideY_x3, p1, p0, q0, q1);
  pPixCb += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPixCb, iStrideY_x0, pPixCb, iStrideY, pPixCb,
            iStrideY_x2, pPixCb, iStrideY_x3, t0, t1, t2, t3);
  LSX_TRANSPOSE8x4_B(p1, p0, q0, q1, t0, t1, t2, t3, p1, p0, q0, q1);
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, q0, zero, q1,
            p0_l, p1_l, q0_l, q1_l);

  // Calculate condition mask
  DUP2_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vabsd_bu(q1, q0);
  DUP2_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta,
            bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vslt_bu(bDetaQ1Q0, beta);
  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags,
            flags, flags);

  // Calculate the low part
  // WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0)
  iDeta_l = __lsx_vsub_h(q0_l, p0_l);
  iDeta_l = __lsx_vslli_h(iDeta_l, 2);
  iDeta_l = __lsx_vadd_h(iDeta_l, p1_l);
  iDeta_l = __lsx_vsub_h(iDeta_l, q1_l);
  iDeta_l = __lsx_vaddi_hu(iDeta_l, 4);
  iDeta_l = __lsx_vsrai_h(iDeta_l, 3);
  iDeta_l = __lsx_vmin_h(iTc0_l, iDeta_l);
  iDeta_l = __lsx_vmax_h(negiTc0_l, iDeta_l);
  // WelsClip1 (p0 + iDeta)
  p0_l = __lsx_vadd_h(p0_l, iDeta_l);
  p1_l = __lsx_vand_v(p0_l, not_255);
  p2_l = __lsx_vsle_h(zero, p0_l);
  flag = __lsx_vseq_h(p1_l, zero);
  p0_l = __lsx_vand_v(p0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  p2_l = __lsx_vand_v(p2_l, flag);
  p0_l = __lsx_vadd_h(p0_l, p2_l);
  // WelsClip1 (q0 - iDeta)
  q0_l = __lsx_vsub_h(q0_l, iDeta_l);
  q1_l = __lsx_vand_v(q0_l, not_255);
  q2_l = __lsx_vsle_h(zero, q0_l);
  flag = __lsx_vseq_h(q1_l, zero);
  q0_l = __lsx_vand_v(q0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  q2_l = __lsx_vand_v(q2_l, flag);
  q0_l = __lsx_vadd_h(q0_l, q2_l);

  DUP2_ARG2(__lsx_vpickev_b, zero, p0_l, zero, q0_l, t0, t1);
  flag = __lsx_vsle_b(zero, iTc0);
  flag = __lsx_vand_v(flag, flags);
  t0 = __lsx_vand_v(t0, flag);
  tp = __lsx_vnor_v(flag, flag);
  p0 = __lsx_vand_v(p0, tp);
  p0 = __lsx_vadd_b(t0, p0);
  t1 = __lsx_vand_v(t1, flag);
  tp = __lsx_vnor_v(flag, flag);
  q0 = __lsx_vand_v(q0, tp);
  q0 = __lsx_vadd_b(t1, q0);
  p0 = __lsx_vilvl_b(q0, p0);

  // Store data to pPixCb
  pPixCb -= iStrideY_x4 - 1;
  __lsx_vstelm_h(p0, pPixCb, 0, 0);
  __lsx_vstelm_h(p0, pPixCb + iStrideY, 0, 1);
  __lsx_vstelm_h(p0, pPixCb + iStrideY_x2, 0, 2);
  __lsx_vstelm_h(p0, pPixCb + iStrideY_x3, 0, 3);
  pPixCb += iStrideY_x4;
  __lsx_vstelm_h(p0, pPixCb, 0, 4);
  __lsx_vstelm_h(p0, pPixCb + iStrideY, 0, 5);
  __lsx_vstelm_h(p0, pPixCb + iStrideY_x2, 0, 6);
  __lsx_vstelm_h(p0, pPixCb + iStrideY_x3, 0, 7);

  // Load data from pPixCr
  pPixCr -= 2;
  DUP4_ARG2(__lsx_vldx, pPixCr, iStrideY_x0, pPixCr, iStrideY, pPixCr,
            iStrideY_x2, pPixCr, iStrideY_x3, p1, p0, q0, q1);
  pPixCr += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPixCr, iStrideY_x0, pPixCr, iStrideY, pPixCr,
            iStrideY_x2, pPixCr, iStrideY_x3, t0, t1, t2, t3);
  LSX_TRANSPOSE8x4_B(p1, p0, q0, q1, t0, t1, t2, t3, p1, p0, q0, q1);
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, q0, zero, q1,
            p0_l, p1_l, q0_l, q1_l);
  DUP2_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vabsd_bu(q1, q0);
  DUP2_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta,
            bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vslt_bu(bDetaQ1Q0, beta);
  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags,
            flags, flags);

  // Calculate the low part
  // WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0)
  iDeta_l = __lsx_vsub_h(q0_l, p0_l);
  iDeta_l = __lsx_vslli_h(iDeta_l, 2);
  iDeta_l = __lsx_vadd_h(iDeta_l, p1_l);
  iDeta_l = __lsx_vsub_h(iDeta_l, q1_l);
  iDeta_l = __lsx_vaddi_hu(iDeta_l, 4);
  iDeta_l = __lsx_vsrai_h(iDeta_l, 3);
  iDeta_l = __lsx_vmin_h(iTc0_l, iDeta_l);
  iDeta_l = __lsx_vmax_h(negiTc0_l, iDeta_l);
  // WelsClip1 (p0 + iDeta)
  p0_l = __lsx_vadd_h(p0_l, iDeta_l);
  p1_l = __lsx_vand_v(p0_l, not_255);
  p2_l = __lsx_vsle_h(zero, p0_l);
  flag = __lsx_vseq_h(p1_l, zero);
  p0_l = __lsx_vand_v(p0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  p2_l = __lsx_vand_v(p2_l, flag);
  p0_l = __lsx_vadd_h(p0_l, p2_l);
  // WelsClip1 (q0 - iDeta)
  q0_l = __lsx_vsub_h(q0_l, iDeta_l);
  q1_l = __lsx_vand_v(q0_l, not_255);
  q2_l = __lsx_vsle_h(zero, q0_l);
  flag = __lsx_vseq_h(q1_l, zero);
  q0_l = __lsx_vand_v(q0_l, flag);
  flag = __lsx_vnor_v(flag, flag);
  q2_l = __lsx_vand_v(q2_l, flag);
  q0_l = __lsx_vadd_h(q0_l, q2_l);

  DUP2_ARG2(__lsx_vpickev_b, zero, p0_l, zero, q0_l, t0, t1);
  flag = __lsx_vsle_b(zero, iTc0);
  flag = __lsx_vand_v(flag, flags);
  t0 = __lsx_vand_v(t0, flag);
  tp = __lsx_vnor_v(flag, flag);
  p0 = __lsx_vand_v(p0, tp);
  p0 = __lsx_vadd_b(t0, p0);
  t1 = __lsx_vand_v(t1, flag);
  tp = __lsx_vnor_v(flag, flag);
  q0 = __lsx_vand_v(q0, tp);
  q0 = __lsx_vadd_b(t1, q0);
  p0 = __lsx_vilvl_b(q0, p0);

  // Store data to pPixCr
  pPixCr -= iStrideY_x4 - 1;
  __lsx_vstelm_h(p0, pPixCr, 0, 0);
  __lsx_vstelm_h(p0, pPixCr + iStrideY, 0, 1);
  __lsx_vstelm_h(p0, pPixCr + iStrideY_x2, 0, 2);
  __lsx_vstelm_h(p0, pPixCr + iStrideY_x3, 0, 3);
  pPixCr += iStrideY_x4;
  __lsx_vstelm_h(p0, pPixCr, 0, 4);
  __lsx_vstelm_h(p0, pPixCr + iStrideY, 0, 5);
  __lsx_vstelm_h(p0, pPixCr + iStrideY_x2, 0, 6);
  __lsx_vstelm_h(p0, pPixCr + iStrideY_x3, 0, 7);
}

void DeblockChromaEq4H_lsx (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideY,
                            int32_t iAlpha, int32_t iBeta) {
  __m128i p0, p1, q0, q1, t0, t1, t2, t3, tp;
  __m128i p0_l, p1_l, p2_l, q0_l, q1_l, q2_l;
  __m128i bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags;
  __m128i zero = __lsx_vldi(0);
  __m128i alpha = __lsx_vreplgr2vr_b(iAlpha);
  __m128i beta = __lsx_vreplgr2vr_b(iBeta);
  int32_t iStrideY_x0 = 0;
  int32_t iStrideY_x2 = iStrideY << 1;
  int32_t iStrideY_x3 = iStrideY_x2 + iStrideY;
  int32_t iStrideY_x4 = iStrideY << 2;

  // Load data from pPixCb
  pPixCb -= 2;
  DUP4_ARG2(__lsx_vldx, pPixCb, iStrideY_x0, pPixCb, iStrideY, pPixCb,
            iStrideY_x2, pPixCb, iStrideY_x3, p1, p0, q0, q1);
  pPixCb += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPixCb, iStrideY_x0, pPixCb, iStrideY, pPixCb,
            iStrideY_x2, pPixCb, iStrideY_x3, t0, t1, t2, t3);
  LSX_TRANSPOSE8x4_B(p1, p0, q0, q1, t0, t1, t2, t3, p1, p0, q0, q1);
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, q0, zero, q1,
            p0_l, p1_l, q0_l, q1_l);

  // Calculate condition mask
  DUP2_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vabsd_bu(q1, q0);
  DUP2_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta,
            bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vslt_bu(bDetaQ1Q0, beta);
  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags,
            flags, flags);

  // ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2
  p2_l = __lsx_vslli_h(p1_l, 1);
  p2_l = __lsx_vadd_h(p2_l, p0_l);
  p2_l = __lsx_vadd_h(p2_l, q1_l);
  p2_l = __lsx_vaddi_hu(p2_l, 2);
  p2_l = __lsx_vsrai_h(p2_l, 2);
  // ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2
  q2_l = __lsx_vslli_h(q1_l, 1);
  q2_l = __lsx_vadd_h(q2_l, q0_l);
  q2_l = __lsx_vadd_h(q2_l, p1_l);
  q2_l = __lsx_vaddi_hu(q2_l, 2);
  q2_l = __lsx_vsrai_h(q2_l, 2);

  DUP2_ARG2(__lsx_vpickev_b, zero, p2_l, zero, q2_l, t0, t1);
  t0 = __lsx_vand_v(t0, flags);
  tp = __lsx_vnor_v(flags, flags);
  p0 = __lsx_vand_v(p0, tp);
  p0 = __lsx_vadd_b(t0, p0);
  t1 = __lsx_vand_v(t1, flags);
  tp = __lsx_vnor_v(flags, flags);
  q0 = __lsx_vand_v(q0, tp);
  q0 = __lsx_vadd_b(t1, q0);
  p0 = __lsx_vilvl_b(q0, p0);

  // Store data to pPixCb
  pPixCb -= iStrideY_x4 - 1;
  __lsx_vstelm_h(p0, pPixCb, 0, 0);
  __lsx_vstelm_h(p0, pPixCb + iStrideY, 0, 1);
  __lsx_vstelm_h(p0, pPixCb + iStrideY_x2, 0, 2);
  __lsx_vstelm_h(p0, pPixCb + iStrideY_x3, 0, 3);
  pPixCb += iStrideY_x4;
  __lsx_vstelm_h(p0, pPixCb, 0, 4);
  __lsx_vstelm_h(p0, pPixCb + iStrideY, 0, 5);
  __lsx_vstelm_h(p0, pPixCb + iStrideY_x2, 0, 6);
  __lsx_vstelm_h(p0, pPixCb + iStrideY_x3, 0, 7);

  // Load data from pPixCr
  pPixCr -= 2;
  DUP4_ARG2(__lsx_vldx, pPixCr, iStrideY_x0, pPixCr, iStrideY, pPixCr,
            iStrideY_x2, pPixCr, iStrideY_x3, p1, p0, q0, q1);
  pPixCr += iStrideY_x4;
  DUP4_ARG2(__lsx_vldx, pPixCr, iStrideY_x0, pPixCr, iStrideY, pPixCr,
            iStrideY_x2, pPixCr, iStrideY_x3, t0, t1, t2, t3);
  LSX_TRANSPOSE8x4_B(p1, p0, q0, q1, t0, t1, t2, t3, p1, p0, q0, q1);
  DUP4_ARG2(__lsx_vilvl_b, zero, p0, zero, p1, zero, q0, zero, q1,
            p0_l, p1_l, q0_l, q1_l);

  // Calculate condition mask
  DUP2_ARG2(__lsx_vabsd_bu, p0, q0, p1, p0, bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vabsd_bu(q1, q0);
  DUP2_ARG2(__lsx_vslt_bu, bDetaP0Q0, alpha, bDetaP1P0, beta,
            bDetaP0Q0, bDetaP1P0);
  bDetaQ1Q0 = __lsx_vslt_bu(bDetaQ1Q0, beta);
  DUP2_ARG2(__lsx_vand_v, bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, flags,
            flags, flags);

  // ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2
  p2_l = __lsx_vslli_h(p1_l, 1);
  p2_l = __lsx_vadd_h(p2_l, p0_l);
  p2_l = __lsx_vadd_h(p2_l, q1_l);
  p2_l = __lsx_vaddi_hu(p2_l, 2);
  p2_l = __lsx_vsrai_h(p2_l, 2);
  // ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2
  q2_l = __lsx_vslli_h(q1_l, 1);
  q2_l = __lsx_vadd_h(q2_l, q0_l);
  q2_l = __lsx_vadd_h(q2_l, p1_l);
  q2_l = __lsx_vaddi_hu(q2_l, 2);
  q2_l = __lsx_vsrai_h(q2_l, 2);

  DUP2_ARG2(__lsx_vpickev_b, zero, p2_l, zero, q2_l, t0, t1);
  t0 = __lsx_vand_v(t0, flags);
  tp = __lsx_vnor_v(flags, flags);
  p0 = __lsx_vand_v(p0, tp);
  p0 = __lsx_vadd_b(t0, p0);
  t1 = __lsx_vand_v(t1, flags);
  tp = __lsx_vnor_v(flags, flags);
  q0 = __lsx_vand_v(q0, tp);
  q0 = __lsx_vadd_b(t1, q0);
  p0 = __lsx_vilvl_b(q0, p0);

  // Store data to pPixCr
  pPixCr -= iStrideY_x4 - 1;
  __lsx_vstelm_h(p0, pPixCr, 0, 0);
  __lsx_vstelm_h(p0, pPixCr + iStrideY, 0, 1);
  __lsx_vstelm_h(p0, pPixCr + iStrideY_x2, 0, 2);
  __lsx_vstelm_h(p0, pPixCr + iStrideY_x3, 0, 3);
  pPixCr += iStrideY_x4;
  __lsx_vstelm_h(p0, pPixCr, 0, 4);
  __lsx_vstelm_h(p0, pPixCr + iStrideY, 0, 5);
  __lsx_vstelm_h(p0, pPixCr + iStrideY_x2, 0, 6);
  __lsx_vstelm_h(p0, pPixCr + iStrideY_x3, 0, 7);
}