• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*!
 **********************************************************************************
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 * \file    quant_lsx.c
 *
 * \brief   Loongson optimization
 *
 * \date    12/10/2021 Created
 *
 **********************************************************************************
 */

#include "stdint.h"
#include "loongson_intrinsics.h"

/*!
 * \brief   Quantize four consecutive 4x4 DCT blocks in place and record the
 *          maximum absolute quantized coefficient of each block (LSX version).
 *
 * For every coefficient x the function computes
 *     q = sign(x) * ((|x| + pFF[i]) * pMF[i] >> 16)
 * writing q back over x, and stores the per-block maximum |q| in pMax[0..3].
 *
 * \param pDct  in/out: 4 * 16 int16_t DCT coefficients, quantized in place
 * \param pFF   rounding offsets, 8 int16_t (reused for both halves of a block)
 * \param pMF   multiplication factors, 8 int16_t (reused likewise)
 * \param pMax  out: pMax[k] = max |quantized coefficient| of 4x4 block k
 */
void WelsQuantFour4x4Max_lsx (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax) {
  int32_t k;
  int16_t iMaxAbs;
  __m128i vec_pDct1, vec_pDct2, vec_pDct3, vec_pDct4;
  __m128i vec_pFF, vec_pMF, vec_iMaxAbs, tmp_iMaxAbs;
  __m128i vec_pFF0, vec_pFF1, vec_pFF2, vec_pMF0, vec_pMF1, vec_pMF2;
  __m128i vec_pDct10, vec_pDct11, vec_pDct12, vec_pDct20, vec_pDct21, vec_pDct22;
  __m128i vec_iSign11, vec_iSign12, vec_iSign21, vec_iSign22;
  __m128i vec_iSign31, vec_iSign32, vec_iSign41, vec_iSign42;

  /* Load the 8 offsets and 8 multipliers once (shared by all four blocks)
   * and widen them from int16 to int32: vsrai_h by 15 produces a per-halfword
   * sign mask, and interleaving each value with its mask (low half via vilvl,
   * high half via vilvh) yields sign-extended 32-bit lanes in
   * vec_pFF1/vec_pFF2 and vec_pMF1/vec_pMF2. */
  DUP2_ARG2(__lsx_vld, pFF, 0, pMF, 0, vec_pFF, vec_pMF);
  DUP2_ARG2(__lsx_vsrai_h, vec_pFF, 15, vec_pMF, 15, vec_pFF0, vec_pMF0);
  DUP2_ARG2(__lsx_vilvl_h, vec_pFF0, vec_pFF, vec_pMF0, vec_pMF, vec_pFF1, vec_pMF1);
  DUP2_ARG2(__lsx_vilvh_h, vec_pFF0, vec_pFF, vec_pMF0, vec_pMF, vec_pFF2, vec_pMF2);

  for (k = 0; k < 4; k++) {  /* one iteration per 4x4 block (16 coefficients) */
    iMaxAbs = 0;
    vec_iMaxAbs = __lsx_vreplgr2vr_h(0);
    /* Load the block's 16 coefficients as two vectors of 8 int16 and
     * sign-extend them to four vectors of 4 int32 lanes using the same
     * shift-and-interleave idiom as above. */
    DUP2_ARG2(__lsx_vld, pDct, 0, pDct + 8, 0, vec_pDct1, vec_pDct2);
    DUP2_ARG2(__lsx_vsrai_h, vec_pDct1, 15, vec_pDct2, 15, vec_pDct10, vec_pDct20);
    DUP2_ARG2(__lsx_vilvl_h, vec_pDct10, vec_pDct1, vec_pDct20, vec_pDct2, vec_pDct11,
              vec_pDct21);
    DUP2_ARG2(__lsx_vilvh_h, vec_pDct10, vec_pDct1, vec_pDct20, vec_pDct2, vec_pDct12,
              vec_pDct22);

    /* Per-lane 32-bit sign masks (all-ones where the coefficient is negative),
     * then |x| = (x ^ s) - s, branch-free absolute value. */
    DUP4_ARG2(__lsx_vsrai_w, vec_pDct11, 31, vec_pDct12, 31, vec_pDct21, 31, vec_pDct22,
              31, vec_iSign11, vec_iSign12, vec_iSign21, vec_iSign22);
    vec_iSign31 =  __lsx_vsub_w(__lsx_vxor_v(vec_iSign11, vec_pDct11), vec_iSign11);
    vec_iSign32 =  __lsx_vsub_w(__lsx_vxor_v(vec_iSign12, vec_pDct12), vec_iSign12);
    vec_iSign41 =  __lsx_vsub_w(__lsx_vxor_v(vec_iSign21, vec_pDct21), vec_iSign21);
    vec_iSign42 =  __lsx_vsub_w(__lsx_vxor_v(vec_iSign22, vec_pDct22), vec_iSign22);

    /* Quantize magnitudes: (|x| + FF) * MF >> 16, all in 32-bit lanes. */
    DUP4_ARG2(__lsx_vadd_w, vec_pFF1, vec_iSign31, vec_pFF2, vec_iSign32, vec_pFF1,
              vec_iSign41, vec_pFF2, vec_iSign42, vec_iSign31, vec_iSign32, vec_iSign41,
              vec_iSign42);
    DUP4_ARG2(__lsx_vmul_w, vec_pMF1, vec_iSign31, vec_pMF2, vec_iSign32, vec_pMF1,
              vec_iSign41, vec_pMF2, vec_iSign42, vec_pDct11, vec_pDct12, vec_pDct21,
              vec_pDct22);
    DUP4_ARG2(__lsx_vsrai_w, vec_pDct11, 16, vec_pDct12, 16, vec_pDct21, 16, vec_pDct22,
              16, vec_pDct11, vec_pDct12, vec_pDct21, vec_pDct22);
    /* Horizontal max: first fold the four vectors into vec_iMaxAbs, then
     * reduce its four 32-bit lanes into lane 0 by byte-shifting the vector
     * right by 8 and 4 bytes and taking the max each time. */
    DUP4_ARG2(__lsx_vmax_w, vec_iMaxAbs, vec_pDct11, vec_iMaxAbs, vec_pDct12, vec_iMaxAbs,
              vec_pDct21, vec_iMaxAbs, vec_pDct22, vec_iMaxAbs, vec_iMaxAbs, vec_iMaxAbs,
              vec_iMaxAbs);
    tmp_iMaxAbs = __lsx_vbsrl_v(vec_iMaxAbs, 8);
    vec_iMaxAbs = __lsx_vmax_w(vec_iMaxAbs, tmp_iMaxAbs);
    tmp_iMaxAbs = __lsx_vbsrl_v(vec_iMaxAbs, 4);
    vec_iMaxAbs = __lsx_vmax_w(vec_iMaxAbs, tmp_iMaxAbs);
    /* Read the low halfword of the reduced 32-bit max; the quantized
     * magnitude is assumed to fit in 16 bits for valid encoder input. */
    iMaxAbs = __lsx_vpickve2gr_h(vec_iMaxAbs, 0);

    /* Restore the original coefficient signs with the same (q ^ s) - s trick
     * (s == -1 negates, s == 0 is the identity). */
    vec_pDct1 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign11, vec_pDct11), vec_iSign11);
    vec_pDct2 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign12, vec_pDct12), vec_iSign12);
    vec_pDct3 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign21, vec_pDct21), vec_iSign21);
    vec_pDct4 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign22, vec_pDct22), vec_iSign22);
    /* Narrow back to int16 by picking the even (low) halfword of each
     * 32-bit lane, restoring the original in-memory coefficient order. */
    DUP2_ARG2(__lsx_vpickev_h, vec_pDct2, vec_pDct1, vec_pDct4, vec_pDct3, vec_pDct1,
              vec_pDct2);

    __lsx_vst(vec_pDct1, pDct, 0);
    __lsx_vst(vec_pDct2, pDct + 8, 0);

    pDct += 16;  /* advance to the next 4x4 block */
    pMax[k] = iMaxAbs;
  }
}
110 
111