/*!
 * \copy
 * Copyright (C) 2019 Loongson Technology Co. Ltd.
 * Contributed by Gu Xiwei (guxiwei-hf@loongson.cn)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 *
 * \file deblock_msa.c
 *
 * \brief MIPS MSA optimizations
 *
 * \date 15/05/2020 Created
 *
 *************************************************************************************
 */

#include <stdint.h>
#include "msa_macros.h"

void DeblockLumaLt4V_msa(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
                         int32_t iBeta, int8_t *pTc) {
    v16u8 p0, p1, p2, q0, q1, q2;
    v16i8 iTc, negiTc, negTc, flags, f;
    v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r;
    v8i16 tc_l, tc_r, negTc_l, negTc_r;
    v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r;
    // Temporary variables
    v8i16 t0, t1, t2, t3;
    v16u8 alpha, beta;
    v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
    v16i8 const_1_b = __msa_ldi_b(1);
    v8i16 const_1_h = __msa_ldi_h(1);
    v8i16 const_4_h = __msa_ldi_h(4);
    v8i16 const_not_255_h = __msa_ldi_h(~255);
    v16i8 zero = { 0 };
    v16i8 tc = { pTc[0 >> 2], pTc[1 >> 2], pTc[2 >> 2], pTc[3 >> 2],
                 pTc[4 >> 2], pTc[5 >> 2], pTc[6 >> 2], pTc[7 >> 2],
                 pTc[8 >> 2], pTc[9 >> 2], pTc[10 >> 2], pTc[11 >> 2],
                 pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] };
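    // Note: the i >> 2 indexing replicates each of the four pTc entries four
    // times, so every 4-pixel segment of the 16-pixel edge shares one tc value.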
    negTc = zero - tc;
    iTc = tc;

    // Load data from pPix
    MSA_LD_V4(v16u8, pPix - 3 * iStride, iStride, p2, p1, p0, q0);
    MSA_LD_V2(v16u8, pPix + iStride, iStride, q1, q2);
    alpha = (v16u8)__msa_fill_b(iAlpha);
    beta = (v16u8)__msa_fill_b(iBeta);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP2P0 = __msa_asub_u_b(p2, p0);
    bDetaQ2Q0 = __msa_asub_u_b(q2, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
    bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
    bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);
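    // The five masks above realize the per-pixel H.264 filter conditions
    // |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta, |p2-p0| < beta and
    // |q2-q0| < beta (0xFF where true, 0x00 otherwise).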

    // Unsigned extend p0, p1, p2, q0, q1, q2 from 8 bits to 16 bits
    MSA_ILVRL_B4(v8i16, zero, p0, zero, p1,
                 p0_r, p0_l, p1_r, p1_l);
    MSA_ILVRL_B4(v8i16, zero, p2, zero, q0,
                 p2_r, p2_l, q0_r, q0_l);
    MSA_ILVRL_B4(v8i16, zero, q1, zero, q2,
                 q1_r, q1_l, q2_r, q2_l);
    // Signed extend tc, negTc from 8 bits to 16 bits
    flags = __msa_clt_s_b(tc, zero);
    MSA_ILVRL_B2(v8i16, flags, tc, tc_r, tc_l);
    flags = __msa_clt_s_b(negTc, zero);
    MSA_ILVRL_B2(v8i16, flags, negTc, negTc_r, negTc_l);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;
    flags = f & (v16i8)bDetaP2P0;
    flags = __msa_ceq_b(flags, zero);
    iTc += ((~flags) & const_1_b);
    flags = f & (v16i8)bDetaQ2Q0;
    flags = __msa_ceq_b(flags, zero);
    iTc += ((~flags) & const_1_b);
    negiTc = zero - iTc;
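    // Per the H.264 weak-filter rule, tc starts at tc0 and grows by 1 for each
    // of the p2/q2 beta conditions that holds on a filtered edge; the two
    // masked adds above apply that adjustment branchlessly.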
    // Signed extend iTc, negiTc from 8 bits to 16 bits
    flags = __msa_clt_s_b(iTc, zero);
    MSA_ILVRL_B2(v8i16, flags, iTc, iTc_r, iTc_l);
    flags = __msa_clt_s_b(negiTc, zero);
    MSA_ILVRL_B2(v8i16, flags, negiTc, negiTc_r, negiTc_l);

    // Calculate the left part
    // p1
    t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1;
    t0 = __msa_max_s_h(negTc_l, t0);
    t0 = __msa_min_s_h(tc_l, t0);
    t1 = p1_l + t0;
    // q1
    t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1;
    t0 = __msa_max_s_h(negTc_l, t0);
    t0 = __msa_min_s_h(tc_l, t0);
    t2 = q1_l + t0;
    // iDeta
    t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3;
    t0 = __msa_max_s_h(negiTc_l, t0);
    t0 = __msa_min_s_h(iTc_l, t0);
    p1_l = t1;
    q1_l = t2;
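    // Branchless clip to [0, 255] below: t1 & ~255 is zero iff t1 already fits
    // in 8 bits. Out-of-range lanes take t3 (all ones if t1 >= 0, else zero),
    // so negatives become 0 and values above 255 become 0xFFFF, whose low byte
    // packs to 255.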
    // p0
    t1 = p0_l + t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    p0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
    // q0
    t1 = q0_l - t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    q0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));

    // Calculate the right part
    // p1
    t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1;
    t0 = __msa_max_s_h(negTc_r, t0);
    t0 = __msa_min_s_h(tc_r, t0);
    t1 = p1_r + t0;
    // q1
    t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1;
    t0 = __msa_max_s_h(negTc_r, t0);
    t0 = __msa_min_s_h(tc_r, t0);
    t2 = q1_r + t0;
    // iDeta
    t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3;
    t0 = __msa_max_s_h(negiTc_r, t0);
    t0 = __msa_min_s_h(iTc_r, t0);
    p1_r = t1;
    q1_r = t2;
    // p0
    t1 = p0_r + t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    p0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
    // q0
    t1 = q0_r - t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    q0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));

    // Combine the left and right halves
    MSA_PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r,
                 t0, t1, t2, t3);
    flags = (v16i8)__msa_cle_s_b(zero, tc);
    flags &= f;
    p0 = (v16u8)(((v16i8)t1 & flags) + (p0 & (~flags)));
    q0 = (v16u8)(((v16i8)t2 & flags) + (q0 & (~flags)));
    // Use t1, t2 as temporary flags
    t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero))));
    p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1));
    t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero))));
    q1 = (v16u8)(t3 & t2) + (q1 & (v16u8)(~t2));

    // Store data to pPix
    MSA_ST_V4(v16u8, p1, p0, q0, q1, pPix - 2 * iStride, iStride);
}

void DeblockLumaEq4V_msa(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
                         int32_t iBeta) {
    v16u8 p0, p1, p2, p3, q0, q1, q2, q3;
    v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p3_l, p3_r,
          q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q3_l, q3_r;
    v8i16 t0, t1, t2, t0_con1;
    v8i16 s0, s1, s2, s0_con1;
    v16u8 alpha, beta;
    v16u8 iDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
    // Condition masks
    v16u8 mask0, mask1;
    v16i8 const_2_b = __msa_ldi_b(2);
    v8i16 const_2_h = __msa_ldi_h(2);
    v8i16 const_4_h = __msa_ldi_h(4);
    v16i8 zero = { 0 };

    // Load data from pPix
    MSA_LD_V8(v16u8, pPix - 4 * iStride, iStride, p3, p2, p1, p0,
              q0, q1, q2, q3);
    // iAlpha and iBeta fit in uint8_t, so splatting them bytewise is safe
    alpha = (v16u8)__msa_fill_b(iAlpha);
    beta = (v16u8)__msa_fill_b(iBeta);

    // iDetaP0Q0 keeps the raw absolute difference (not a boolean mask);
    // it is compared against two thresholds below
    iDetaP0Q0 = __msa_asub_u_b(p0, q0);

    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP2P0 = __msa_asub_u_b(p2, p0);
    bDetaQ2Q0 = __msa_asub_u_b(q2, q0);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
    bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
    bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);

    // Unsigned extend p0, p1, p2, p3, q0, q1, q2, q3 from 8 bits to 16 bits
    MSA_ILVRL_B4(v8i16, zero, p0, zero, p1,
                 p0_r, p0_l, p1_r, p1_l);
    MSA_ILVRL_B4(v8i16, zero, p2, zero, p3,
                 p2_r, p2_l, p3_r, p3_l);
    MSA_ILVRL_B4(v8i16, zero, q0, zero, q1,
                 q0_r, q0_l, q1_r, q1_l);
    MSA_ILVRL_B4(v8i16, zero, q2, zero, q3,
                 q2_r, q2_l, q3_r, q3_l);

    // Calculate condition masks
    // (iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0
    mask0 = (v16u8)__msa_clt_u_b(iDetaP0Q0, alpha);
    mask0 &= bDetaP1P0;
    mask0 &= bDetaQ1Q0;
    // iDetaP0Q0 < ((iAlpha >> 2) + 2)
    mask1 = (v16u8)((alpha >> 2) + const_2_b);
    mask1 = (v16u8)__msa_clt_u_b(iDetaP0Q0, mask1);
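    // mask0 is the basic filter-on condition; mask1 is the strong-filter
    // threshold |p0 - q0| < (iAlpha >> 2) + 2 from the H.264 spec.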

    // Calculate the left part
    // p0
    t0 = (p2_l + (p1_l << 1) + (p0_l << 1) + (q0_l << 1) + q1_l + const_4_h) >> 3;
    // p1
    t1 = (p2_l + p1_l + p0_l + q0_l + const_2_h) >> 2;
    // p2
    t2 = ((p3_l << 1) + p2_l + (p2_l << 1) + p1_l + p0_l + q0_l + const_4_h) >> 3;
    // p0 condition 1
    t0_con1 = ((p1_l << 1) + p0_l + q1_l + const_2_h) >> 2;
    // q0
    s0 = (p1_l + (p0_l << 1) + (q0_l << 1) + (q1_l << 1) + q2_l + const_4_h) >> 3;
    // q1
    s1 = (p0_l + q0_l + q1_l + q2_l + const_2_h) >> 2;
    // q2
    s2 = ((q3_l << 1) + q2_l + (q2_l << 1) + q1_l + q0_l + p0_l + const_4_h) >> 3;
    // q0 condition 1
    s0_con1 = ((q1_l << 1) + q0_l + p1_l + const_2_h) >> 2;
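    // t0_con1/s0_con1 are the short fallback filters (2*p1 + p0 + q1 + 2) >> 2
    // and (2*q1 + q0 + p1 + 2) >> 2, selected when the strong-filter
    // conditions do not hold.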
    // Move back
    p0_l = t0;
    p1_l = t1;
    p2_l = t2;
    q0_l = s0;
    q1_l = s1;
    q2_l = s2;
    // Use p3_l, q3_l as tmp
    p3_l = t0_con1;
    q3_l = s0_con1;

    // Calculate the right part
    // p0
    t0 = (p2_r + (p1_r << 1) + (p0_r << 1) + (q0_r << 1) + q1_r + const_4_h) >> 3;
    // p1
    t1 = (p2_r + p1_r + p0_r + q0_r + const_2_h) >> 2;
    // p2
    t2 = ((p3_r << 1) + p2_r + (p2_r << 1) + p1_r + p0_r + q0_r + const_4_h) >> 3;
    // p0 condition 1
    t0_con1 = ((p1_r << 1) + p0_r + q1_r + const_2_h) >> 2;
    // q0
    s0 = (p1_r + (p0_r << 1) + (q0_r << 1) + (q1_r << 1) + q2_r + const_4_h) >> 3;
    // q1
    s1 = (p0_r + q0_r + q1_r + q2_r + const_2_h) >> 2;
    // q2
    s2 = ((q3_r << 1) + q2_r + (q2_r << 1) + q1_r + q0_r + p0_r + const_4_h) >> 3;
    // q0 condition 1
    s0_con1 = ((q1_r << 1) + q0_r + p1_r + const_2_h) >> 2;
    // Move back
    p0_r = t0;
    p1_r = t1;
    p2_r = t2;
    q0_r = s0;
    q1_r = s1;
    q2_r = s2;
    // Use p3_r, q3_r as tmp
    p3_r = t0_con1;
    q3_r = s0_con1;

    // Combine the left and right halves
    MSA_PCKEV_B4(v8i16, p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r,
                 t0, t1, t2, s0);
    MSA_PCKEV_B4(v8i16, q1_l, q1_r, q2_l, q2_r, p3_l, p3_r, q3_l, q3_r,
                 s1, s2, t0_con1, s0_con1);
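    // Per-pixel three-way select: the strong filter where
    // mask0 & mask1 & bDeta* holds, the short fallback where only mask0
    // holds, and (below) the original sample where mask0 is false.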
    t0 = (v8i16)(((v16u8)t0 & mask0 & mask1 & bDetaP2P0) + ((v16u8)t0_con1 &
         mask0 & mask1 & (~bDetaP2P0)) + ((v16u8)t0_con1 & mask0 & (~mask1)));
    t1 = (v8i16)((v16u8)t1 & mask0 & mask1 & bDetaP2P0);
    t2 = (v8i16)((v16u8)t2 & mask0 & mask1 & bDetaP2P0);
    s0 = (v8i16)(((v16u8)s0 & mask0 & mask1 & bDetaQ2Q0) + ((v16u8)s0_con1 &
         mask0 & mask1 & (~bDetaQ2Q0)) + ((v16u8)s0_con1 & mask0 & (~mask1)));
    s1 = (v8i16)((v16u8)s1 & mask0 & mask1 & bDetaQ2Q0);
    s2 = (v8i16)((v16u8)s2 & mask0 & mask1 & bDetaQ2Q0);
    p0 = (v16u8)t0 + (p0 & (~mask0));
    p1 = (v16u8)t1 + (p1 & ~(mask0 & mask1 & bDetaP2P0));
    p2 = (v16u8)t2 + (p2 & ~(mask0 & mask1 & bDetaP2P0));
    q0 = (v16u8)s0 + (q0 & (~mask0));
    q1 = (v16u8)s1 + (q1 & ~(mask0 & mask1 & bDetaQ2Q0));
    q2 = (v16u8)s2 + (q2 & ~(mask0 & mask1 & bDetaQ2Q0));

    // Store data to pPix
    MSA_ST_V4(v16u8, p2, p1, p0, q0, pPix - 3 * iStride, iStride);
    MSA_ST_V2(v16u8, q1, q2, pPix + iStride, iStride);
}

void DeblockLumaLt4H_msa(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
                         int32_t iBeta, int8_t *pTc) {
    v16u8 p0, p1, p2, q0, q1, q2;
    v16i8 iTc, negiTc, negTc, flags, f;
    v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r;
    v8i16 tc_l, tc_r, negTc_l, negTc_r;
    v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r;
    // Temporary variables
    v8i16 t0, t1, t2, t3;
    v16u8 alpha, beta;
    v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
    v16i8 const_1_b = __msa_ldi_b(1);
    v8i16 const_1_h = __msa_ldi_h(1);
    v8i16 const_4_h = __msa_ldi_h(4);
    v8i16 const_not_255_h = __msa_ldi_h(~255);
    v16i8 zero = { 0 };
    v16i8 tc = { pTc[0 >> 2], pTc[1 >> 2], pTc[2 >> 2], pTc[3 >> 2],
                 pTc[4 >> 2], pTc[5 >> 2], pTc[6 >> 2], pTc[7 >> 2],
                 pTc[8 >> 2], pTc[9 >> 2], pTc[10 >> 2], pTc[11 >> 2],
                 pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] };
    negTc = zero - tc;
    iTc = tc;

    // Load data from pPix
    MSA_LD_V8(v8i16, pPix - 3, iStride, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r);
    MSA_LD_V8(v8i16, pPix + 8 * iStride - 3, iStride, p0_l, p0_r, p1_l, p1_r,
              p2_l, p2_r, q0_l, q0_r);
    // Transpose 16x8 to 8x16; only p0, p1, p2, q0, q1, q2 are needed
    MSA_TRANSPOSE16x8_B(v16u8, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r,
                        p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r,
                        p2, p1, p0, q0, q1, q2, alpha, beta);
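    // The transpose macro yields eight output rows; alpha and beta just catch
    // the two unused ones and are overwritten immediately below.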

    alpha = (v16u8)__msa_fill_b(iAlpha);
    beta = (v16u8)__msa_fill_b(iBeta);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP2P0 = __msa_asub_u_b(p2, p0);
    bDetaQ2Q0 = __msa_asub_u_b(q2, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
    bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
    bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);

    // Unsigned extend p0, p1, p2, q0, q1, q2 from 8 bits to 16 bits
    MSA_ILVRL_B4(v8i16, zero, p0, zero, p1,
                 p0_r, p0_l, p1_r, p1_l);
    MSA_ILVRL_B4(v8i16, zero, p2, zero, q0,
                 p2_r, p2_l, q0_r, q0_l);
    MSA_ILVRL_B4(v8i16, zero, q1, zero, q2,
                 q1_r, q1_l, q2_r, q2_l);
    // Signed extend tc, negTc from 8 bits to 16 bits
    flags = __msa_clt_s_b(tc, zero);
    MSA_ILVRL_B2(v8i16, flags, tc, tc_r, tc_l);
    flags = __msa_clt_s_b(negTc, zero);
    MSA_ILVRL_B2(v8i16, flags, negTc, negTc_r, negTc_l);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;
    flags = f & (v16i8)bDetaP2P0;
    flags = __msa_ceq_b(flags, zero);
    iTc += ((~flags) & const_1_b);
    flags = f & (v16i8)bDetaQ2Q0;
    flags = __msa_ceq_b(flags, zero);
    iTc += ((~flags) & const_1_b);
    negiTc = zero - iTc;
    // Signed extend iTc, negiTc from 8 bits to 16 bits
    flags = __msa_clt_s_b(iTc, zero);
    MSA_ILVRL_B2(v8i16, flags, iTc, iTc_r, iTc_l);
    flags = __msa_clt_s_b(negiTc, zero);
    MSA_ILVRL_B2(v8i16, flags, negiTc, negiTc_r, negiTc_l);

    // Calculate the left part
    // p1
    t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1;
    t0 = __msa_max_s_h(negTc_l, t0);
    t0 = __msa_min_s_h(tc_l, t0);
    t1 = p1_l + t0;
    // q1
    t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1;
    t0 = __msa_max_s_h(negTc_l, t0);
    t0 = __msa_min_s_h(tc_l, t0);
    t2 = q1_l + t0;
    // iDeta
    t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3;
    t0 = __msa_max_s_h(negiTc_l, t0);
    t0 = __msa_min_s_h(iTc_l, t0);
    p1_l = t1;
    q1_l = t2;
    // p0
    t1 = p0_l + t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    p0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
    // q0
    t1 = q0_l - t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    q0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));

    // Calculate the right part
    // p1
    t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1;
    t0 = __msa_max_s_h(negTc_r, t0);
    t0 = __msa_min_s_h(tc_r, t0);
    t1 = p1_r + t0;
    // q1
    t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1;
    t0 = __msa_max_s_h(negTc_r, t0);
    t0 = __msa_min_s_h(tc_r, t0);
    t2 = q1_r + t0;
    // iDeta
    t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3;
    t0 = __msa_max_s_h(negiTc_r, t0);
    t0 = __msa_min_s_h(iTc_r, t0);
    p1_r = t1;
    q1_r = t2;
    // p0
    t1 = p0_r + t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    p0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
    // q0
    t1 = q0_r - t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    q0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));

    // Combine the left and right halves
    MSA_PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r,
                 t0, t1, t2, t3);
    flags = (v16i8)__msa_cle_s_b(zero, tc);
    flags &= f;
    p0 = (v16u8)(((v16i8)t1 & flags) + (p0 & (~flags)));
    q0 = (v16u8)(((v16i8)t2 & flags) + (q0 & (~flags)));
    // Use t1, t2 as temporary flags
    t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero))));
    p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1));
    t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero))));
    q1 = (v16u8)(t3 & t2) + (q1 & (v16u8)(~t2));

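    // Interleave the four filtered columns back into 32-bit groups so each
    // 4-byte store writes p1 p0 q0 q1 of one row at pPix - 2.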
    MSA_ILVRL_B4(v8i16, p0, p1, q1, q0, t0, t1, t2, t3);
    MSA_ILVRL_H4(v16u8, t2, t0, t3, t1, p1, p0, q0, q1);
    // Store data to pPix
    MSA_ST_W8(p1, p0, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 2, iStride);
    MSA_ST_W8(q0, q1, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 2, iStride);
}

void DeblockLumaEq4H_msa(uint8_t *pPix, int32_t iStride, int32_t iAlpha,
                         int32_t iBeta) {
    v16u8 p0, p1, p2, p3, q0, q1, q2, q3;
    v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p3_l, p3_r,
          q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q3_l, q3_r;
    v8i16 t0, t1, t2, t0_con1;
    v8i16 s0, s1, s2, s0_con1;
    v16u8 alpha, beta;
    v16u8 iDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
    // Condition masks
    v16u8 mask0, mask1;
    v16i8 const_2_b = __msa_ldi_b(2);
    v8i16 const_2_h = __msa_ldi_h(2);
    v8i16 const_4_h = __msa_ldi_h(4);
    v16i8 zero = { 0 };

    // Load data from pPix
    MSA_LD_V8(v8i16, pPix - 4, iStride, p0_l, p0_r, p1_l, p1_r,
              p2_l, p2_r, p3_l, p3_r);
    MSA_LD_V8(v8i16, pPix + 8 * iStride - 4, iStride,
              q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q3_l, q3_r);
    // Transpose 16x8 to 8x16 to obtain the column vectors p3..p0, q0..q3
    MSA_TRANSPOSE16x8_B(v16u8, p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p3_l, p3_r,
                        q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q3_l, q3_r,
                        p3, p2, p1, p0, q0, q1, q2, q3);
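    // After the transpose, each of p3..q3 holds one pixel column (distance
    // from the vertical edge) across the 16 rows.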
    // iAlpha and iBeta fit in uint8_t, so splatting them bytewise is safe
    alpha = (v16u8)__msa_fill_b(iAlpha);
    beta = (v16u8)__msa_fill_b(iBeta);

    // iDetaP0Q0 keeps the raw absolute difference (not a boolean mask);
    // it is compared against two thresholds below
    iDetaP0Q0 = __msa_asub_u_b(p0, q0);

    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP2P0 = __msa_asub_u_b(p2, p0);
    bDetaQ2Q0 = __msa_asub_u_b(q2, q0);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
    bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
    bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);

    // Unsigned extend p0, p1, p2, p3, q0, q1, q2, q3 from 8 bits to 16 bits
    MSA_ILVRL_B4(v8i16, zero, p0, zero, p1,
                 p0_r, p0_l, p1_r, p1_l);
    MSA_ILVRL_B4(v8i16, zero, p2, zero, p3,
                 p2_r, p2_l, p3_r, p3_l);
    MSA_ILVRL_B4(v8i16, zero, q0, zero, q1,
                 q0_r, q0_l, q1_r, q1_l);
    MSA_ILVRL_B4(v8i16, zero, q2, zero, q3,
                 q2_r, q2_l, q3_r, q3_l);

    // Calculate condition masks
    // (iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0
    mask0 = (v16u8)__msa_clt_u_b(iDetaP0Q0, alpha);
    mask0 &= bDetaP1P0;
    mask0 &= bDetaQ1Q0;
    // iDetaP0Q0 < ((iAlpha >> 2) + 2)
    mask1 = (v16u8)((alpha >> 2) + const_2_b);
    mask1 = (v16u8)__msa_clt_u_b(iDetaP0Q0, mask1);

    // Calculate the left part
    // p0
    t0 = (p2_l + (p1_l << 1) + (p0_l << 1) + (q0_l << 1) + q1_l + const_4_h) >> 3;
    // p1
    t1 = (p2_l + p1_l + p0_l + q0_l + const_2_h) >> 2;
    // p2
    t2 = ((p3_l << 1) + p2_l + (p2_l << 1) + p1_l + p0_l + q0_l + const_4_h) >> 3;
    // p0 condition 1
    t0_con1 = ((p1_l << 1) + p0_l + q1_l + const_2_h) >> 2;
    // q0
    s0 = (p1_l + (p0_l << 1) + (q0_l << 1) + (q1_l << 1) + q2_l + const_4_h) >> 3;
    // q1
    s1 = (p0_l + q0_l + q1_l + q2_l + const_2_h) >> 2;
    // q2
    s2 = ((q3_l << 1) + q2_l + (q2_l << 1) + q1_l + q0_l + p0_l + const_4_h) >> 3;
    // q0 condition 1
    s0_con1 = ((q1_l << 1) + q0_l + p1_l + const_2_h) >> 2;
    // Move back
    p0_l = t0;
    p1_l = t1;
    p2_l = t2;
    q0_l = s0;
    q1_l = s1;
    q2_l = s2;
    // Use p3_l, q3_l as tmp
    p3_l = t0_con1;
    q3_l = s0_con1;

    // Calculate the right part
    // p0
    t0 = (p2_r + (p1_r << 1) + (p0_r << 1) + (q0_r << 1) + q1_r + const_4_h) >> 3;
    // p1
    t1 = (p2_r + p1_r + p0_r + q0_r + const_2_h) >> 2;
    // p2
    t2 = ((p3_r << 1) + p2_r + (p2_r << 1) + p1_r + p0_r + q0_r + const_4_h) >> 3;
    // p0 condition 1
    t0_con1 = ((p1_r << 1) + p0_r + q1_r + const_2_h) >> 2;
    // q0
    s0 = (p1_r + (p0_r << 1) + (q0_r << 1) + (q1_r << 1) + q2_r + const_4_h) >> 3;
    // q1
    s1 = (p0_r + q0_r + q1_r + q2_r + const_2_h) >> 2;
    // q2
    s2 = ((q3_r << 1) + q2_r + (q2_r << 1) + q1_r + q0_r + p0_r + const_4_h) >> 3;
    // q0 condition 1
    s0_con1 = ((q1_r << 1) + q0_r + p1_r + const_2_h) >> 2;
    // Move back
    p0_r = t0;
    p1_r = t1;
    p2_r = t2;
    q0_r = s0;
    q1_r = s1;
    q2_r = s2;
    // Use p3_r, q3_r as tmp
    p3_r = t0_con1;
    q3_r = s0_con1;

    // Combine the left and right halves
    MSA_PCKEV_B4(v8i16, p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r,
                 t0, t1, t2, s0);
    MSA_PCKEV_B4(v8i16, q1_l, q1_r, q2_l, q2_r, p3_l, p3_r, q3_l, q3_r,
                 s1, s2, t0_con1, s0_con1);
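    // Same per-pixel three-way select as in DeblockLumaEq4V_msa: strong
    // filter, short fallback, or the original sample.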
    t0 = (v8i16)(((v16u8)t0 & mask0 & mask1 & bDetaP2P0) + ((v16u8)t0_con1 &
         mask0 & mask1 & (~bDetaP2P0)) + ((v16u8)t0_con1 & mask0 & (~mask1)));
    t1 = (v8i16)((v16u8)t1 & mask0 & mask1 & bDetaP2P0);
    t2 = (v8i16)((v16u8)t2 & mask0 & mask1 & bDetaP2P0);
    s0 = (v8i16)(((v16u8)s0 & mask0 & mask1 & bDetaQ2Q0) + ((v16u8)s0_con1 &
         mask0 & mask1 & (~bDetaQ2Q0)) + ((v16u8)s0_con1 & mask0 & (~mask1)));
    s1 = (v8i16)((v16u8)s1 & mask0 & mask1 & bDetaQ2Q0);
    s2 = (v8i16)((v16u8)s2 & mask0 & mask1 & bDetaQ2Q0);
    p0 = (v16u8)t0 + (p0 & (~mask0));
    p1 = (v16u8)t1 + (p1 & ~(mask0 & mask1 & bDetaP2P0));
    p2 = (v16u8)t2 + (p2 & ~(mask0 & mask1 & bDetaP2P0));
    q0 = (v16u8)s0 + (q0 & (~mask0));
    q1 = (v16u8)s1 + (q1 & ~(mask0 & mask1 & bDetaQ2Q0));
    q2 = (v16u8)s2 + (q2 & ~(mask0 & mask1 & bDetaQ2Q0));

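    // Rebuild rows for storing: p2 p1 p0 q0 of each row go out as one 32-bit
    // word at pPix - 3, and q1 q2 as a 16-bit pair at pPix + 1.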
    MSA_ILVRL_B4(v8i16, p1, p2, q0, p0, t0, s0, t1, s1);
    MSA_ILVRL_B2(v8i16, q2, q1, t2, s2);
    MSA_ILVRL_H4(v16u8, t1, t0, s1, s0, p2, p1, p0, q0);
    // Store data to pPix
    MSA_ST_W8(p2, p1, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 3, iStride);
    MSA_ST_W8(p0, q0, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 3, iStride);
    MSA_ST_H8(t2, 0, 1, 2, 3, 4, 5, 6, 7, pPix + 1, iStride);
    MSA_ST_H8(s2, 0, 1, 2, 3, 4, 5, 6, 7, pPix + 8 * iStride + 1, iStride);
}

void DeblockChromaLt4V_msa(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
                           int32_t iAlpha, int32_t iBeta, int8_t *pTc) {
    v16u8 p0, p1, q0, q1;
    v8i16 p0_e, p1_e, q0_e, q1_e;
    v16i8 negTc, flags, f;
    v8i16 tc_e, negTc_e;
    // Temporary variables
    v8i16 t0, t1, t2, t3;
    v16u8 alpha, beta;
    v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
    v8i16 const_4_h = __msa_ldi_h(4);
    v8i16 const_not_255_h = __msa_ldi_h(~255);
    v16i8 zero = { 0 };
    v16i8 tc = { pTc[0 >> 1], pTc[1 >> 1], pTc[2 >> 1], pTc[3 >> 1],
                 pTc[4 >> 1], pTc[5 >> 1], pTc[6 >> 1], pTc[7 >> 1] };
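    // Note: the i >> 1 indexing replicates each of the four pTc entries twice,
    // one tc value per 2-pixel chroma edge segment; only the low 8 lanes are
    // used.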
    negTc = zero - tc;

    alpha = (v16u8)__msa_fill_b(iAlpha);
    beta = (v16u8)__msa_fill_b(iBeta);
    // Signed extend tc, negTc from 8 bits to 16 bits
    flags = __msa_clt_s_b(tc, zero);
    MSA_ILVR_B(v8i16, flags, tc, tc_e);
    flags = __msa_clt_s_b(negTc, zero);
    MSA_ILVR_B(v8i16, flags, negTc, negTc_e);

    // Cb
    // Load data from pPixCb
    MSA_LD_V4(v16u8, pPixCb - 2 * iStride, iStride, p1, p0, q0, q1);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);

    // Unsigned extend p0, p1, q0, q1 from 8 bits to 16 bits
    MSA_ILVR_B4(v8i16, zero, p0, zero, p1, zero, q0, zero, q1,
                p0_e, p1_e, q0_e, q1_e);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;

    // iDeta
    t0 = (((q0_e - p0_e) << 2) + (p1_e - q1_e) + const_4_h) >> 3;
    t0 = __msa_max_s_h(negTc_e, t0);
    t0 = __msa_min_s_h(tc_e, t0);
    // p0
    t1 = p0_e + t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    p0_e = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
    // q0
    t1 = q0_e - t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    q0_e = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));

    MSA_PCKEV_B2(v8i16, p0_e, p0_e, q0_e, q0_e, t0, t1);
    flags = (v16i8)__msa_cle_s_b(zero, tc);
    flags &= f;
    p0 = (v16u8)(((v16i8)t0 & flags) + (p0 & (~flags)));
    q0 = (v16u8)(((v16i8)t1 & flags) + (q0 & (~flags)));
    // Store data to pPixCb
    MSA_ST_D(p0, 0, pPixCb - iStride);
    MSA_ST_D(q0, 0, pPixCb);

    // Cr
    // Load data from pPixCr
    MSA_LD_V4(v16u8, pPixCr - 2 * iStride, iStride, p1, p0, q0, q1);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);

    // Unsigned extend p0, p1, q0, q1 from 8 bits to 16 bits
    MSA_ILVR_B4(v8i16, zero, p0, zero, p1, zero, q0, zero, q1,
                p0_e, p1_e, q0_e, q1_e);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;

    // iDeta
    t0 = (((q0_e - p0_e) << 2) + (p1_e - q1_e) + const_4_h) >> 3;
    t0 = __msa_max_s_h(negTc_e, t0);
    t0 = __msa_min_s_h(tc_e, t0);
    // p0
    t1 = p0_e + t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    p0_e = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
    // q0
    t1 = q0_e - t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    q0_e = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));

    MSA_PCKEV_B2(v8i16, p0_e, p0_e, q0_e, q0_e, t0, t1);
    flags = (v16i8)__msa_cle_s_b(zero, tc);
    flags &= f;
    p0 = (v16u8)(((v16i8)t0 & flags) + (p0 & (~flags)));
    q0 = (v16u8)(((v16i8)t1 & flags) + (q0 & (~flags)));
    // Store data to pPixCr
    MSA_ST_D(p0, 0, pPixCr - iStride);
    MSA_ST_D(q0, 0, pPixCr);
}

void DeblockChromaEq4V_msa(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
                           int32_t iAlpha, int32_t iBeta) {
    v16u8 p0, p1, q0, q1;
    v8i16 p0_e, p1_e, q0_e, q1_e;
    v16i8 f;
    // Temporary variables
    v8i16 t0, t1;
    v16u8 alpha, beta;
    v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
    v8i16 const_2_h = __msa_ldi_h(2);
    v16i8 zero = { 0 };

    alpha = (v16u8)__msa_fill_b(iAlpha);
    beta = (v16u8)__msa_fill_b(iBeta);

    // Cb
    // Load data from pPixCb
    MSA_LD_V4(v16u8, pPixCb - 2 * iStride, iStride, p1, p0, q0, q1);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);

    // Unsigned extend p0, p1, q0, q1 from 8 bits to 16 bits
    MSA_ILVR_B4(v8i16, zero, p0, zero, p1, zero, q0, zero, q1,
                p0_e, p1_e, q0_e, q1_e);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;

    // p0
    p0_e = ((p1_e << 1) + p0_e + q1_e + const_2_h) >> 2;
    // q0
    q0_e = ((q1_e << 1) + q0_e + p1_e + const_2_h) >> 2;

    MSA_PCKEV_B2(v8i16, p0_e, p0_e, q0_e, q0_e, t0, t1);
    p0 = (v16u8)(((v16i8)t0 & f) + (p0 & (~f)));
    q0 = (v16u8)(((v16i8)t1 & f) + (q0 & (~f)));
    // Store data to pPixCb
    MSA_ST_D(p0, 0, pPixCb - iStride);
    MSA_ST_D(q0, 0, pPixCb);

    // Cr
    // Load data from pPixCr
    MSA_LD_V4(v16u8, pPixCr - 2 * iStride, iStride, p1, p0, q0, q1);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);

    // Unsigned extend p0, p1, q0, q1 from 8 bits to 16 bits
    MSA_ILVR_B4(v8i16, zero, p0, zero, p1, zero, q0, zero, q1,
                p0_e, p1_e, q0_e, q1_e);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;

    // p0
    p0_e = ((p1_e << 1) + p0_e + q1_e + const_2_h) >> 2;
    // q0
    q0_e = ((q1_e << 1) + q0_e + p1_e + const_2_h) >> 2;

    MSA_PCKEV_B2(v8i16, p0_e, p0_e, q0_e, q0_e, t0, t1);
    p0 = (v16u8)(((v16i8)t0 & f) + (p0 & (~f)));
    q0 = (v16u8)(((v16i8)t1 & f) + (q0 & (~f)));
    // Store data to pPixCr
    MSA_ST_D(p0, 0, pPixCr - iStride);
    MSA_ST_D(q0, 0, pPixCr);
}

void DeblockChromaLt4H_msa(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
                           int32_t iAlpha, int32_t iBeta, int8_t *pTc) {
    v16u8 p0, p1, q0, q1;
    v8i16 p0_e, p1_e, q0_e, q1_e;
    v16i8 negTc, flags, f;
    v8i16 tc_e, negTc_e;
    // Temporary variables
    v8i16 t0, t1, t2, t3;
    v16u8 alpha, beta;
    v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
    v8i16 const_4_h = __msa_ldi_h(4);
    v8i16 const_not_255_h = __msa_ldi_h(~255);
    v16i8 zero = { 0 };
    v16i8 tc = { pTc[0 >> 1], pTc[1 >> 1], pTc[2 >> 1], pTc[3 >> 1],
                 pTc[4 >> 1], pTc[5 >> 1], pTc[6 >> 1], pTc[7 >> 1] };
    negTc = zero - tc;

    alpha = (v16u8)__msa_fill_b(iAlpha);
    beta = (v16u8)__msa_fill_b(iBeta);
    // Signed extend tc, negTc from 8 bits to 16 bits
    flags = __msa_clt_s_b(tc, zero);
    MSA_ILVR_B(v8i16, flags, tc, tc_e);
    flags = __msa_clt_s_b(negTc, zero);
    MSA_ILVR_B(v8i16, flags, negTc, negTc_e);

    // Cb
    // Load data from pPixCb
    MSA_LD_V8(v8i16, pPixCb - 2, iStride, p1_e, p0_e, q0_e, q1_e,
              t0, t1, t2, t3);
    // Transpose 8x4 to 4x8; only p0, p1, q0, q1 are needed
    MSA_TRANSPOSE8x4_B(v16u8, p1_e, p0_e, q0_e, q1_e, t0, t1, t2, t3,
                       p1, p0, q0, q1);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);

    // Unsigned extend p0, p1, q0, q1 from 8 bits to 16 bits
    MSA_ILVR_B4(v8i16, zero, p0, zero, p1, zero, q0, zero, q1,
                p0_e, p1_e, q0_e, q1_e);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;

    // iDeta
    t0 = (((q0_e - p0_e) << 2) + (p1_e - q1_e) + const_4_h) >> 3;
    t0 = __msa_max_s_h(negTc_e, t0);
    t0 = __msa_min_s_h(tc_e, t0);
    // p0
    t1 = p0_e + t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    p0_e = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
    // q0
    t1 = q0_e - t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    q0_e = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));

    MSA_PCKEV_B2(v8i16, p0_e, p0_e, q0_e, q0_e, t0, t1);
    flags = (v16i8)__msa_cle_s_b(zero, tc);
    flags &= f;
    p0 = (v16u8)(((v16i8)t0 & flags) + (p0 & (~flags)));
    q0 = (v16u8)(((v16i8)t1 & flags) + (q0 & (~flags)));
    // Store data to pPixCb
    MSA_ILVR_B(v16u8, q0, p0, p0);
    MSA_ST_H8(p0, 0, 1, 2, 3, 4, 5, 6, 7, pPixCb - 1, iStride);

    // Cr
    // Load data from pPixCr
    MSA_LD_V8(v8i16, pPixCr - 2, iStride, p1_e, p0_e, q0_e, q1_e,
              t0, t1, t2, t3);
    // Transpose 8x4 to 4x8; only p0, p1, q0, q1 are needed
    MSA_TRANSPOSE8x4_B(v16u8, p1_e, p0_e, q0_e, q1_e, t0, t1, t2, t3,
                       p1, p0, q0, q1);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);

    // Unsigned extend p0, p1, q0, q1 from 8 bits to 16 bits
    MSA_ILVR_B4(v8i16, zero, p0, zero, p1, zero, q0, zero, q1,
                p0_e, p1_e, q0_e, q1_e);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;

    // iDeta
    t0 = (((q0_e - p0_e) << 2) + (p1_e - q1_e) + const_4_h) >> 3;
    t0 = __msa_max_s_h(negTc_e, t0);
    t0 = __msa_min_s_h(tc_e, t0);
    // p0
    t1 = p0_e + t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    p0_e = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
    // q0
    t1 = q0_e - t0;
    t2 = t1 & const_not_255_h;
    t3 = __msa_cle_s_h((v8i16)zero, t1);
    flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
    q0_e = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));

    MSA_PCKEV_B2(v8i16, p0_e, p0_e, q0_e, q0_e, t0, t1);
    flags = (v16i8)__msa_cle_s_b(zero, tc);
    flags &= f;
    p0 = (v16u8)(((v16i8)t0 & flags) + (p0 & (~flags)));
    q0 = (v16u8)(((v16i8)t1 & flags) + (q0 & (~flags)));
    // Store data to pPixCr
    MSA_ILVR_B(v16u8, q0, p0, p0);
    MSA_ST_H8(p0, 0, 1, 2, 3, 4, 5, 6, 7, pPixCr - 1, iStride);
}

void DeblockChromaEq4H_msa(uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride,
                           int32_t iAlpha, int32_t iBeta) {
    v16u8 p0, p1, q0, q1;
    v8i16 p0_e, p1_e, q0_e, q1_e;
    v16i8 f;
    // Temporary variables
    v8i16 t0, t1, t2, t3;
    v16u8 alpha, beta;
    v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
    v8i16 const_2_h = __msa_ldi_h(2);
    v16i8 zero = { 0 };

    alpha = (v16u8)__msa_fill_b(iAlpha);
    beta = (v16u8)__msa_fill_b(iBeta);

    // Cb
    // Load data from pPixCb
    MSA_LD_V8(v8i16, pPixCb - 2, iStride, p1_e, p0_e, q0_e, q1_e,
              t0, t1, t2, t3);
    // Transpose 8x4 to 4x8; only p0, p1, q0, q1 are needed
    MSA_TRANSPOSE8x4_B(v16u8, p1_e, p0_e, q0_e, q1_e, t0, t1, t2, t3,
                       p1, p0, q0, q1);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);

    // Unsigned extend p0, p1, q0, q1 from 8 bits to 16 bits
    MSA_ILVR_B4(v8i16, zero, p0, zero, p1, zero, q0, zero, q1,
                p0_e, p1_e, q0_e, q1_e);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;

    // p0
    p0_e = ((p1_e << 1) + p0_e + q1_e + const_2_h) >> 2;
    // q0
    q0_e = ((q1_e << 1) + q0_e + p1_e + const_2_h) >> 2;

    MSA_PCKEV_B2(v8i16, p0_e, p0_e, q0_e, q0_e, t0, t1);
    p0 = (v16u8)(((v16i8)t0 & f) + (p0 & (~f)));
    q0 = (v16u8)(((v16i8)t1 & f) + (q0 & (~f)));
    // Store data to pPixCb
    MSA_ILVR_B(v16u8, q0, p0, p0);
    MSA_ST_H8(p0, 0, 1, 2, 3, 4, 5, 6, 7, pPixCb - 1, iStride);

    // Cr
    // Load data from pPixCr
    MSA_LD_V8(v8i16, pPixCr - 2, iStride, p1_e, p0_e, q0_e, q1_e,
              t0, t1, t2, t3);
    // Transpose 8x4 to 4x8; only p0, p1, q0, q1 are needed
    MSA_TRANSPOSE8x4_B(v16u8, p1_e, p0_e, q0_e, q1_e, t0, t1, t2, t3,
                       p1, p0, q0, q1);

    bDetaP0Q0 = __msa_asub_u_b(p0, q0);
    bDetaP1P0 = __msa_asub_u_b(p1, p0);
    bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
    bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
    bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
    bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);

    // Unsigned extend p0, p1, q0, q1 from 8 bits to 16 bits
    MSA_ILVR_B4(v8i16, zero, p0, zero, p1, zero, q0, zero, q1,
                p0_e, p1_e, q0_e, q1_e);

    f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;

    // p0
    p0_e = ((p1_e << 1) + p0_e + q1_e + const_2_h) >> 2;
    // q0
    q0_e = ((q1_e << 1) + q0_e + p1_e + const_2_h) >> 2;

    MSA_PCKEV_B2(v8i16, p0_e, p0_e, q0_e, q0_e, t0, t1);
    p0 = (v16u8)(((v16i8)t0 & f) + (p0 & (~f)));
    q0 = (v16u8)(((v16i8)t1 & f) + (q0 & (~f)));
    // Store data to pPixCr
    MSA_ILVR_B(v16u8, q0, p0, p0);
    MSA_ST_H8(p0, 0, 1, 2, 3, 4, 5, 6, 7, pPixCr - 1, iStride);
}

void WelsNonZeroCount_msa(int8_t *pNonZeroCount) {
    v16u8 src0, src1;
    v16u8 zero = { 0 };
    v16u8 const_1 = (v16u8)__msa_fill_b(0x01);

    MSA_LD_V2(v16u8, pNonZeroCount, 16, src0, src1);
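    // (count == 0) compares to 0xFF; adding 1 wraps that back to 0, while
    // nonzero counts (compare result 0x00) become 1. Net effect: every count
    // is clamped to 0 or 1. Only 24 of the 32 bytes are stored back (16 + 8).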
    src0 = (v16u8)__msa_ceq_b((v16i8)zero, (v16i8)src0);
    src1 = (v16u8)__msa_ceq_b((v16i8)zero, (v16i8)src1);
    src0 += const_1;
    src1 += const_1;
    MSA_ST_V(v16u8, src0, pNonZeroCount);
    MSA_ST_D(src1, 0, pNonZeroCount + 16);
}