/*!
 **********************************************************************************
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Lu Wang <wanglu@loongson.cn>
 *
 * \copy
 *     Copyright (c) 2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 * \file    quant_lsx.c
 *
 * \brief   Loongson optimization
 *
 * \date    12/10/2021 Created
 *
 **********************************************************************************
 */

#include <stdint.h>
#include "loongson_intrinsics.h"

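// Quantize four consecutive 4x4 DCT blocks in place using LSX vectors:
// each coefficient becomes sign(x) * (((|x| + ff) * mf) >> 16), where the
// eight pFF/pMF constants are applied cyclically to every run of 8 coefficients.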
void WelsQuantFour4x4_lsx (int16_t* pDct, const int16_t* pFF, const int16_t* pMF) {
  int32_t i;
  __m128i vec_pFF0, vec_pFF1, vec_pFF2, vec_pMF0, vec_pMF1, vec_pMF2;
  __m128i vec_pDct, vec_pDct0, vec_pDct1, vec_pDct2, vec_pFF, vec_pMF;
  __m128i vec_pDct10, vec_pDct11, vec_pDct12, vec_pDct20, vec_pDct21, vec_pDct22;
  __m128i vec_iSign1, vec_iSign2;

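  // Load the eight 16-bit pFF (rounding offset) and pMF (multiplier) constants
  // once and widen them to 32 bits by interleaving with their sign halfwords.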
  DUP2_ARG2(__lsx_vld, pFF, 0, pMF, 0, vec_pFF, vec_pMF);
  DUP2_ARG2(__lsx_vsrai_h, vec_pFF, 15, vec_pMF, 15, vec_pFF0, vec_pMF0);
  DUP2_ARG2(__lsx_vilvl_h, vec_pFF0, vec_pFF, vec_pMF0, vec_pMF, vec_pFF1, vec_pMF1);
  DUP2_ARG2(__lsx_vilvh_h, vec_pFF0, vec_pFF, vec_pMF0, vec_pMF, vec_pFF2, vec_pMF2);

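  // Eight iterations of eight coefficients cover the 64 coefficients of the four
  // 4x4 blocks; the widened constants are reused for every group of eight.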
  for (i = 0; i < 8; i++) {
    vec_pDct = __lsx_vld(pDct, 0);
    vec_pDct0 = __lsx_vsrai_h(vec_pDct, 15);
    vec_pDct1 = __lsx_vilvl_h(vec_pDct0, vec_pDct);
    vec_pDct2 = __lsx_vilvh_h(vec_pDct0, vec_pDct);

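    // Per-lane sign masks: all ones for negative coefficients, zero otherwise.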
    vec_iSign1 = __lsx_vsrai_w(vec_pDct1, 31);
    vec_iSign2 = __lsx_vsrai_w(vec_pDct2, 31);

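    // Quantize the low four and then the high four coefficients:
    // |x| plus the offset, multiplied by the factor, shifted right by 16,
    // with the original sign restored afterwards.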
    vec_pDct10 = __lsx_vxor_v(vec_iSign1, vec_pDct1);
    vec_pDct10 = __lsx_vsub_w(vec_pDct10, vec_iSign1);
    vec_pDct11 = __lsx_vadd_w(vec_pFF1, vec_pDct10);
    vec_pDct11 = __lsx_vmul_w(vec_pDct11, vec_pMF1);
    vec_pDct11 = __lsx_vsrai_w(vec_pDct11, 16);
    vec_pDct12 = __lsx_vxor_v(vec_iSign1, vec_pDct11);
    vec_pDct12 = __lsx_vsub_w(vec_pDct12, vec_iSign1);

    vec_pDct20 = __lsx_vxor_v(vec_iSign2, vec_pDct2);
    vec_pDct20 = __lsx_vsub_w(vec_pDct20, vec_iSign2);
    vec_pDct21 = __lsx_vadd_w(vec_pFF2, vec_pDct20);
    vec_pDct21 = __lsx_vmul_w(vec_pDct21, vec_pMF2);
    vec_pDct21 = __lsx_vsrai_w(vec_pDct21, 16);
    vec_pDct22 = __lsx_vxor_v(vec_iSign2, vec_pDct21);
    vec_pDct22 = __lsx_vsub_w(vec_pDct22, vec_iSign2);

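    // Narrow the eight 32-bit results back to 16 bits and store them.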
    vec_pDct = __lsx_vpickev_h(vec_pDct22, vec_pDct12);
    __lsx_vst(vec_pDct, pDct, 0);
    pDct += 8;
  }
}

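// Same quantization as WelsQuantFour4x4_lsx, but additionally records the
// maximum quantized magnitude of each 4x4 block in pMax[0..3].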
void WelsQuantFour4x4Max_lsx (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax) {
  int32_t k;
  int16_t iMaxAbs;
  __m128i vec_pDct1, vec_pDct2, vec_pDct3, vec_pDct4;
  __m128i vec_pFF, vec_pMF, vec_iMaxAbs, tmp_iMaxAbs;
  __m128i vec_pFF0, vec_pFF1, vec_pFF2, vec_pMF0, vec_pMF1, vec_pMF2;
  __m128i vec_pDct10, vec_pDct11, vec_pDct12, vec_pDct20, vec_pDct21, vec_pDct22;
  __m128i vec_iSign11, vec_iSign12, vec_iSign21, vec_iSign22;
  __m128i vec_iSign31, vec_iSign32, vec_iSign41, vec_iSign42;

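  // Widen the shared pFF/pMF constants to 32 bits once, outside the block loop.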
  DUP2_ARG2(__lsx_vld, pFF, 0, pMF, 0, vec_pFF, vec_pMF);
  DUP2_ARG2(__lsx_vsrai_h, vec_pFF, 15, vec_pMF, 15, vec_pFF0, vec_pMF0);
  DUP2_ARG2(__lsx_vilvl_h, vec_pFF0, vec_pFF, vec_pMF0, vec_pMF, vec_pFF1, vec_pMF1);
  DUP2_ARG2(__lsx_vilvh_h, vec_pFF0, vec_pFF, vec_pMF0, vec_pMF, vec_pFF2, vec_pMF2);

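  // Each iteration quantizes one 4x4 block (16 coefficients) and computes the
  // block's maximum absolute quantized value.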
  for (k = 0; k < 4; k++) {
    iMaxAbs = 0;
    vec_iMaxAbs = __lsx_vreplgr2vr_h(0);
    DUP2_ARG2(__lsx_vld, pDct, 0, pDct + 8, 0, vec_pDct1, vec_pDct2);
    DUP2_ARG2(__lsx_vsrai_h, vec_pDct1, 15, vec_pDct2, 15, vec_pDct10, vec_pDct20);
    DUP2_ARG2(__lsx_vilvl_h, vec_pDct10, vec_pDct1, vec_pDct20, vec_pDct2, vec_pDct11,
              vec_pDct21);
    DUP2_ARG2(__lsx_vilvh_h, vec_pDct10, vec_pDct1, vec_pDct20, vec_pDct2, vec_pDct12,
              vec_pDct22);

    DUP4_ARG2(__lsx_vsrai_w, vec_pDct11, 31, vec_pDct12, 31, vec_pDct21, 31, vec_pDct22,
              31, vec_iSign11, vec_iSign12, vec_iSign21, vec_iSign22);
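    // Absolute values via (x ^ sign) - sign.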
    vec_iSign31 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign11, vec_pDct11), vec_iSign11);
    vec_iSign32 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign12, vec_pDct12), vec_iSign12);
    vec_iSign41 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign21, vec_pDct21), vec_iSign21);
    vec_iSign42 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign22, vec_pDct22), vec_iSign22);

    DUP4_ARG2(__lsx_vadd_w, vec_pFF1, vec_iSign31, vec_pFF2, vec_iSign32, vec_pFF1,
              vec_iSign41, vec_pFF2, vec_iSign42, vec_iSign31, vec_iSign32, vec_iSign41,
              vec_iSign42);
    DUP4_ARG2(__lsx_vmul_w, vec_pMF1, vec_iSign31, vec_pMF2, vec_iSign32, vec_pMF1,
              vec_iSign41, vec_pMF2, vec_iSign42, vec_pDct11, vec_pDct12, vec_pDct21,
              vec_pDct22);
    DUP4_ARG2(__lsx_vsrai_w, vec_pDct11, 16, vec_pDct12, 16, vec_pDct21, 16, vec_pDct22,
              16, vec_pDct11, vec_pDct12, vec_pDct21, vec_pDct22);
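    // Keep a running per-lane maximum of the quantized magnitudes.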
    DUP4_ARG2(__lsx_vmax_w, vec_iMaxAbs, vec_pDct11, vec_iMaxAbs, vec_pDct12, vec_iMaxAbs,
              vec_pDct21, vec_iMaxAbs, vec_pDct22, vec_iMaxAbs, vec_iMaxAbs, vec_iMaxAbs,
              vec_iMaxAbs);
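    // Shift-and-max reduction: the block maximum ends up in element 0.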
    tmp_iMaxAbs = __lsx_vbsrl_v(vec_iMaxAbs, 8);
    vec_iMaxAbs = __lsx_vmax_w(vec_iMaxAbs, tmp_iMaxAbs);
    tmp_iMaxAbs = __lsx_vbsrl_v(vec_iMaxAbs, 4);
    vec_iMaxAbs = __lsx_vmax_w(vec_iMaxAbs, tmp_iMaxAbs);
    iMaxAbs = __lsx_vpickve2gr_h(vec_iMaxAbs, 0);

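    // Restore the signs, narrow back to 16 bits and store the quantized block.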
    vec_pDct1 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign11, vec_pDct11), vec_iSign11);
    vec_pDct2 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign12, vec_pDct12), vec_iSign12);
    vec_pDct3 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign21, vec_pDct21), vec_iSign21);
    vec_pDct4 = __lsx_vsub_w(__lsx_vxor_v(vec_iSign22, vec_pDct22), vec_iSign22);
    DUP2_ARG2(__lsx_vpickev_h, vec_pDct2, vec_pDct1, vec_pDct4, vec_pDct3, vec_pDct1,
              vec_pDct2);

    __lsx_vst(vec_pDct1, pDct, 0);
    __lsx_vst(vec_pDct2, pDct + 8, 0);

    pDct += 16;
    pMax[k] = iMaxAbs;
  }
}