/*
 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"

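/* Unidirectional H.264 weighted prediction applied in place to a 4x2 block:
 * each sample is multiplied by src_weight, the offset (pre-shifted left by
 * log2_denom) is added with saturation, and the result is rounded, shifted
 * right by log2_denom and clipped to [0, 255]. The avc_wgt_WxH_msa functions
 * below repeat the same computation for larger block sizes. */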
static void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride,
                            int32_t log2_denom, int32_t src_weight,
                            int32_t offset_in)
27 {
28 uint32_t tp0, tp1, offset_val;
29 v16u8 zero = { 0 };
30 v16u8 src0 = { 0 };
31 v8i16 src0_r, tmp0, wgt, denom, offset;
32
33 offset_val = (unsigned) offset_in << log2_denom;
34
35 wgt = __msa_fill_h(src_weight);
36 offset = __msa_fill_h(offset_val);
37 denom = __msa_fill_h(log2_denom);
38
39 LW2(data, stride, tp0, tp1);
40 INSERT_W2_UB(tp0, tp1, src0);
41 src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42 tmp0 = wgt * src0_r;
43 tmp0 = __msa_adds_s_h(tmp0, offset);
44 tmp0 = __msa_maxi_s_h(tmp0, 0);
45 tmp0 = __msa_srlr_h(tmp0, denom);
46 tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47 src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
48 ST_W2(src0, 0, 1, data, stride);
49 }
50
static void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride,
                            int32_t log2_denom, int32_t src_weight,
                            int32_t offset_in)
54 {
55 uint32_t tp0, tp1, tp2, tp3, offset_val;
56 v16u8 src0 = { 0 };
57 v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
58
59 offset_val = (unsigned) offset_in << log2_denom;
60
61 wgt = __msa_fill_h(src_weight);
62 offset = __msa_fill_h(offset_val);
63 denom = __msa_fill_h(log2_denom);
64
65 LW4(data, stride, tp0, tp1, tp2, tp3);
66 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
67 UNPCK_UB_SH(src0, src0_r, src1_r);
68 MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
69 ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
70 MAXI_SH2_SH(tmp0, tmp1, 0);
71 tmp0 = __msa_srlr_h(tmp0, denom);
72 tmp1 = __msa_srlr_h(tmp1, denom);
73 SAT_UH2_SH(tmp0, tmp1, 7);
74 src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
75 ST_W4(src0, 0, 1, 2, 3, data, stride);
76 }
77
static void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride,
                            int32_t log2_denom, int32_t src_weight,
                            int32_t offset_in)
81 {
82 uint32_t tp0, tp1, tp2, tp3, offset_val;
83 v16u8 src0 = { 0 }, src1 = { 0 };
84 v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
85 v8i16 wgt, denom, offset;
86
87 offset_val = (unsigned) offset_in << log2_denom;
88
89 wgt = __msa_fill_h(src_weight);
90 offset = __msa_fill_h(offset_val);
91 denom = __msa_fill_h(log2_denom);
92
93 LW4(data, stride, tp0, tp1, tp2, tp3);
94 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
95 LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
96 INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
97 UNPCK_UB_SH(src0, src0_r, src1_r);
98 UNPCK_UB_SH(src1, src2_r, src3_r);
99 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
100 tmp3);
101 ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
102 tmp1, tmp2, tmp3);
103 MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
104 SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
105 SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
106 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
107 ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
108 }
109
static void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride,
                            int32_t log2_denom, int32_t src_weight,
                            int32_t offset_in)
113 {
114 uint32_t offset_val;
115 uint64_t tp0, tp1, tp2, tp3;
116 v16u8 src0 = { 0 }, src1 = { 0 };
117 v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
118 v8i16 wgt, denom, offset;
119
120 offset_val = (unsigned) offset_in << log2_denom;
121
122 wgt = __msa_fill_h(src_weight);
123 offset = __msa_fill_h(offset_val);
124 denom = __msa_fill_h(log2_denom);
125
126 LD4(data, stride, tp0, tp1, tp2, tp3);
127 INSERT_D2_UB(tp0, tp1, src0);
128 INSERT_D2_UB(tp2, tp3, src1);
129 UNPCK_UB_SH(src0, src0_r, src1_r);
130 UNPCK_UB_SH(src1, src2_r, src3_r);
131 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
132 tmp3);
133 ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
134 tmp1, tmp2, tmp3);
135 MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
136 SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
137 SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
138 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
139 ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
140 }
141
static void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom,
                            int32_t src_weight, int32_t offset_in)
144 {
145 uint32_t offset_val;
146 uint64_t tp0, tp1, tp2, tp3;
147 v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
148 v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
149 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
150 v8i16 wgt, denom, offset;
151
152 offset_val = (unsigned) offset_in << log2_denom;
153
154 wgt = __msa_fill_h(src_weight);
155 offset = __msa_fill_h(offset_val);
156 denom = __msa_fill_h(log2_denom);
157
158 LD4(data, stride, tp0, tp1, tp2, tp3);
159 INSERT_D2_UB(tp0, tp1, src0);
160 INSERT_D2_UB(tp2, tp3, src1);
161 LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
162 INSERT_D2_UB(tp0, tp1, src2);
163 INSERT_D2_UB(tp2, tp3, src3);
164 UNPCK_UB_SH(src0, src0_r, src1_r);
165 UNPCK_UB_SH(src1, src2_r, src3_r);
166 UNPCK_UB_SH(src2, src4_r, src5_r);
167 UNPCK_UB_SH(src3, src6_r, src7_r);
168 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
169 tmp3);
170 MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
171 tmp7);
172 ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
173 tmp1, tmp2, tmp3);
174 ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
175 tmp5, tmp6, tmp7);
176 MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
177 SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
178 SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
179 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
180 src2, src3);
181 ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
182 }
183
static void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride,
                             int32_t log2_denom, int32_t src_weight,
                             int32_t offset_in)
187 {
188 uint32_t offset_val, cnt;
189 uint64_t tp0, tp1, tp2, tp3;
190 v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
191 v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
192 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
193 v8i16 wgt, denom, offset;
194
195 offset_val = (unsigned) offset_in << log2_denom;
196
197 wgt = __msa_fill_h(src_weight);
198 offset = __msa_fill_h(offset_val);
199 denom = __msa_fill_h(log2_denom);
200
201 for (cnt = 2; cnt--;) {
202 LD4(data, stride, tp0, tp1, tp2, tp3);
203 INSERT_D2_UB(tp0, tp1, src0);
204 INSERT_D2_UB(tp2, tp3, src1);
205 LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
206 INSERT_D2_UB(tp0, tp1, src2);
207 INSERT_D2_UB(tp2, tp3, src3);
208 UNPCK_UB_SH(src0, src0_r, src1_r);
209 UNPCK_UB_SH(src1, src2_r, src3_r);
210 UNPCK_UB_SH(src2, src4_r, src5_r);
211 UNPCK_UB_SH(src3, src6_r, src7_r);
212 MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
213 tmp2, tmp3);
214 MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
215 tmp6, tmp7);
216 ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
217 tmp0, tmp1, tmp2, tmp3);
218 ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
219 tmp4, tmp5, tmp6, tmp7);
220 MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
221 SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
222 SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
223 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
224 src2, src3);
225 ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
226 data += 8 * stride;
227 }
228 }
229
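/* Bidirectional H.264 weighted prediction for a 4x2 block: src and dst
 * samples are biased to signed range (XOR with 128, compensated by adding
 * 128 * (src_weight + dst_weight) to the offset), combined as a dot product
 * with the interleaved weight pair, shifted right by log2_denom + 1 and
 * clipped to [0, 255] before being stored back to dst. The avc_biwgt_WxH_msa
 * variants below do the same for larger block sizes. */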
static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t dst_weight, int32_t offset_in)
233 {
234 uint32_t tp0, tp1;
235 v16i8 src_wgt, dst_wgt, wgt, vec0;
236 v16u8 src0 = { 0 }, dst0 = { 0 };
237 v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
238
239 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
240 offset_in += (128 * (src_weight + dst_weight));
241
242 src_wgt = __msa_fill_b(src_weight);
243 dst_wgt = __msa_fill_b(dst_weight);
244 offset = __msa_fill_h(offset_in);
245 denom = __msa_fill_h(log2_denom + 1);
246
247 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
248
249 LW2(src, stride, tp0, tp1);
250 INSERT_W2_UB(tp0, tp1, src0);
251 LW2(dst, stride, tp0, tp1);
252 INSERT_W2_UB(tp0, tp1, dst0);
253 XORI_B2_128_UB(src0, dst0);
254 vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
255 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
256 tmp0 >>= denom;
257 tmp0 = __msa_maxi_s_h(tmp0, 0);
258 tmp0 = __msa_min_s_h(max255, tmp0);
259 dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
260 ST_W2(dst0, 0, 1, dst, stride);
261 }
262
static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t dst_weight, int32_t offset_in)
266 {
267 uint32_t tp0, tp1, tp2, tp3;
268 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
269 v16u8 src0, dst0;
270 v8i16 tmp0, tmp1, denom, offset;
271
272 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
273 offset_in += (128 * (src_weight + dst_weight));
274
275 src_wgt = __msa_fill_b(src_weight);
276 dst_wgt = __msa_fill_b(dst_weight);
277 offset = __msa_fill_h(offset_in);
278 denom = __msa_fill_h(log2_denom + 1);
279
280 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
281
282 LW4(src, stride, tp0, tp1, tp2, tp3);
283 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
284 LW4(dst, stride, tp0, tp1, tp2, tp3);
285 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
286 XORI_B2_128_UB(src0, dst0);
287 ILVRL_B2_SB(dst0, src0, vec0, vec1);
288 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
289 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
290 tmp0 >>= denom;
291 tmp1 >>= denom;
292 CLIP_SH2_0_255(tmp0, tmp1);
293 dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
294 ST_W4(dst0, 0, 1, 2, 3, dst, stride);
295 }
296
static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t dst_weight, int32_t offset_in)
300 {
301 uint32_t tp0, tp1, tp2, tp3;
302 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
303 v16u8 src0, src1, dst0, dst1;
304 v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
305
306 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
307 offset_in += (128 * (src_weight + dst_weight));
308
309 src_wgt = __msa_fill_b(src_weight);
310 dst_wgt = __msa_fill_b(dst_weight);
311 offset = __msa_fill_h(offset_in);
312 denom = __msa_fill_h(log2_denom + 1);
313 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
314
315 LW4(src, stride, tp0, tp1, tp2, tp3);
316 src += 4 * stride;
317 INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
318 LW4(src, stride, tp0, tp1, tp2, tp3);
319 INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
320 LW4(dst, stride, tp0, tp1, tp2, tp3);
321 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
322 LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
323 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
324 XORI_B4_128_UB(src0, src1, dst0, dst1);
325 ILVRL_B2_SB(dst0, src0, vec0, vec1);
326 ILVRL_B2_SB(dst1, src1, vec2, vec3);
327 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
328 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
329 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
330 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
331 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
332 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
333 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
334 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
335 }
336
static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t dst_weight, int32_t offset_in)
340 {
341 uint64_t tp0, tp1, tp2, tp3;
342 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
343 v16u8 src0, src1, dst0, dst1;
344 v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
345
346 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
347 offset_in += (128 * (src_weight + dst_weight));
348
349 src_wgt = __msa_fill_b(src_weight);
350 dst_wgt = __msa_fill_b(dst_weight);
351 offset = __msa_fill_h(offset_in);
352 denom = __msa_fill_h(log2_denom + 1);
353
354 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
355
356 LD4(src, stride, tp0, tp1, tp2, tp3);
357 INSERT_D2_UB(tp0, tp1, src0);
358 INSERT_D2_UB(tp2, tp3, src1);
359 LD4(dst, stride, tp0, tp1, tp2, tp3);
360 INSERT_D2_UB(tp0, tp1, dst0);
361 INSERT_D2_UB(tp2, tp3, dst1);
362 XORI_B4_128_UB(src0, src1, dst0, dst1);
363 ILVRL_B2_SB(dst0, src0, vec0, vec1);
364 ILVRL_B2_SB(dst1, src1, vec2, vec3);
365 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
366 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
367 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
368 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
369 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
370 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
371 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
372 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
373 }
374
static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t dst_weight, int32_t offset_in)
378 {
379 uint64_t tp0, tp1, tp2, tp3;
380 v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
381 v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
382 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
383
384 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
385 offset_in += (128 * (src_weight + dst_weight));
386
387 src_wgt = __msa_fill_b(src_weight);
388 dst_wgt = __msa_fill_b(dst_weight);
389 offset = __msa_fill_h(offset_in);
390 denom = __msa_fill_h(log2_denom + 1);
391 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
392
393 LD4(src, stride, tp0, tp1, tp2, tp3);
394 INSERT_D2_UB(tp0, tp1, src0);
395 INSERT_D2_UB(tp2, tp3, src1);
396 LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
397 INSERT_D2_UB(tp0, tp1, src2);
398 INSERT_D2_UB(tp2, tp3, src3);
399 LD4(dst, stride, tp0, tp1, tp2, tp3);
400 INSERT_D2_UB(tp0, tp1, dst0);
401 INSERT_D2_UB(tp2, tp3, dst1);
402 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
403 INSERT_D2_UB(tp0, tp1, dst2);
404 INSERT_D2_UB(tp2, tp3, dst3);
405 XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
406 ILVRL_B2_SB(dst0, src0, vec0, vec1);
407 ILVRL_B2_SB(dst1, src1, vec2, vec3);
408 ILVRL_B2_SB(dst2, src2, vec4, vec5);
409 ILVRL_B2_SB(dst3, src3, vec6, vec7);
410 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
411 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
412 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
413 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
414 tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
415 tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
416 tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
417 tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
418 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
419 SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
420 CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
421 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
422 PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
423 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
424 }
425
static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t src_weight,
                               int32_t dst_weight, int32_t offset_in)
429 {
430 uint8_t cnt;
431 uint64_t tp0, tp1, tp2, tp3;
432 v16i8 src_wgt, dst_wgt, wgt;
433 v16u8 src0, src1, src2, src3;
434 v16u8 dst0, dst1, dst2, dst3;
435 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
436 v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
437 v8i16 denom, offset;
438
439 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
440 offset_in += (128 * (src_weight + dst_weight));
441
442 src_wgt = __msa_fill_b(src_weight);
443 dst_wgt = __msa_fill_b(dst_weight);
444 offset = __msa_fill_h(offset_in);
445 denom = __msa_fill_h(log2_denom + 1);
446 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
447
448 for (cnt = 2; cnt--;) {
449 LD4(src, stride, tp0, tp1, tp2, tp3);
450 src += 4 * stride;
451 INSERT_D2_UB(tp0, tp1, src0);
452 INSERT_D2_UB(tp2, tp3, src1);
453 LD4(src, stride, tp0, tp1, tp2, tp3);
454 src += 4 * stride;
455 INSERT_D2_UB(tp0, tp1, src2);
456 INSERT_D2_UB(tp2, tp3, src3);
457 LD4(dst, stride, tp0, tp1, tp2, tp3);
458 INSERT_D2_UB(tp0, tp1, dst0);
459 INSERT_D2_UB(tp2, tp3, dst1);
460 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
461 INSERT_D2_UB(tp0, tp1, dst2);
462 INSERT_D2_UB(tp2, tp3, dst3);
463 XORI_B4_128_UB(src0, src1, src2, src3);
464 XORI_B4_128_UB(dst0, dst1, dst2, dst3);
465 ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
466 vec0, vec2, vec4, vec6);
467 ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
468 vec1, vec3, vec5, vec7);
469
470 temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
471 temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
472 temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
473 temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
474 temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
475 temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
476 temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
477 temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
478
479 SRA_4V(temp0, temp1, temp2, temp3, denom);
480 SRA_4V(temp4, temp5, temp6, temp7, denom);
481 CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
482 PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
483 dst0, dst1, dst2, dst3);
484 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
485 dst += 8 * stride;
486 }
487 }
488
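/* Strong (bS = 4) luma deblocking of one side of the edge; with q0 passed as
 * the third argument this evaluates, per column:
 *   p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
 *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *   p2' = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3
 * (the symmetric q0'/q1'/q2' expressions result when called with q-side
 * inputs). */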
489 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, \
490 q3_or_p3_org_in, p1_or_q1_org_in, \
491 p2_or_q2_org_in, q1_or_p1_org_in, \
492 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \
493 { \
494 v8i16 threshold; \
495 v8i16 const3 = __msa_ldi_h(3); \
496 \
497 threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in); \
498 threshold += (p1_or_q1_org_in); \
499 \
500 (p0_or_q0_out) = threshold << 1; \
501 (p0_or_q0_out) += (p2_or_q2_org_in); \
502 (p0_or_q0_out) += (q1_or_p1_org_in); \
503 (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3); \
504 \
505 (p1_or_q1_out) = (p2_or_q2_org_in) + threshold; \
506 (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2); \
507 \
508 (p2_or_q2_out) = (p2_or_q2_org_in) * const3; \
509 (p2_or_q2_out) += (p3_or_q3_org_in); \
510 (p2_or_q2_out) += (p3_or_q3_org_in); \
511 (p2_or_q2_out) += threshold; \
512 (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3); \
513 }
514
515 /* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
516 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, \
517 p1_or_q1_org_in, p0_or_q0_out) \
518 { \
519 (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in); \
520 (p0_or_q0_out) += (p1_or_q1_org_in); \
521 (p0_or_q0_out) += (p1_or_q1_org_in); \
522 (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2); \
523 }
524
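/* Normal-filter update of p1 (or q1 by symmetry):
 *   p1' = p1 + clip3(-tc0, tc0, (p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1) */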
525 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, \
526 p1_or_q1_org_in, p2_or_q2_org_in, \
527 negate_tc_in, tc_in, p1_or_q1_out) \
528 { \
529 v8i16 clip3, temp; \
530 \
531 clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in, \
532 (v8u16) q0_or_p0_org_in); \
533 temp = p1_or_q1_org_in << 1; \
534 clip3 = clip3 - temp; \
535 clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3); \
536 CLIP_SH(clip3, negate_tc_in, tc_in); \
537 p1_or_q1_out = p1_or_q1_org_in + clip3; \
538 }
539
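/* Normal-filter update of the boundary samples:
 *   delta = clip3(-tc, tc, (4 * (q0 - p0) + (p1 - q1) + 4) >> 3)
 *   p0'   = clip_uint8(p0 + delta),  q0' = clip_uint8(q0 - delta) */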
540 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, \
541 p1_or_q1_org_in, q1_or_p1_org_in, \
542 negate_threshold_in, threshold_in, \
543 p0_or_q0_out, q0_or_p0_out) \
544 { \
545 v8i16 q0_sub_p0, p1_sub_q1, delta; \
546 \
547 q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
548 p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
549 q0_sub_p0 <<= 2; \
550 p1_sub_q1 += 4; \
551 delta = q0_sub_p0 + p1_sub_q1; \
552 delta >>= 3; \
553 \
554 CLIP_SH(delta, negate_threshold_in, threshold_in); \
555 \
556 p0_or_q0_out = p0_or_q0_org_in + delta; \
557 q0_or_p0_out = q0_or_p0_org_in - delta; \
558 \
559 CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out); \
560 }
561
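/* Filters one vertical chroma edge (4:2:2): four rows around the column at
 * 'src' are loaded and transposed, the alpha/beta conditions select which
 * columns get filtered, the tc-clipped delta is applied to p0/q0, and the
 * interleaved filtered p0/q0 bytes are returned in 'res'. */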
562 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
563 { \
564 uint32_t load0, load1, load2, load3; \
565 v16u8 src0 = { 0 }; \
566 v16u8 src1 = { 0 }; \
567 v16u8 src2 = { 0 }; \
568 v16u8 src3 = { 0 }; \
569 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
570 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
571 v8i16 tc, q0_sub_p0, p1_sub_q1, delta; \
572 v8i16 res0_r, res1_r; \
573 v16i8 zeros = { 0 }; \
574 v16u8 res0, res1; \
575 \
576 LW4((src - 2), stride, load0, load1, load2, load3); \
577 src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
578 src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
579 src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2); \
580 src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3); \
581 \
582 TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3); \
583 \
584 p0_asub_q0 = __msa_asub_u_b(src2, src1); \
585 p1_asub_p0 = __msa_asub_u_b(src1, src0); \
586 q1_asub_q0 = __msa_asub_u_b(src2, src3); \
587 \
588 tc = __msa_fill_h(tc_val); \
589 \
590 is_less_than_alpha = (p0_asub_q0 < alpha); \
591 is_less_than_beta = (p1_asub_p0 < beta); \
592 is_less_than = is_less_than_alpha & is_less_than_beta; \
593 is_less_than_beta = (q1_asub_q0 < beta); \
594 is_less_than = is_less_than_beta & is_less_than; \
595 \
596 ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
597 HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
598 \
599 q0_sub_p0 <<= 2; \
600 delta = q0_sub_p0 + p1_sub_q1; \
601 delta = __msa_srari_h(delta, 3); \
602 \
603 CLIP_SH(delta, -tc, tc); \
604 \
605 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
606 \
607 res0_r += delta; \
608 res1_r -= delta; \
609 \
610 CLIP_SH2_0_255(res0_r, res1_r); \
611 PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
612 \
613 res0 = __msa_bmnz_v(src1, res0, is_less_than); \
614 res1 = __msa_bmnz_v(src2, res1, is_less_than); \
615 \
616 res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
617 }
618
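/* Transposes two rows of four bytes so that each output vector holds one
 * two-byte column in its low half. */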
619 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3) \
620 { \
621 v16i8 zero_m = { 0 }; \
622 \
623 out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0); \
624 out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2); \
625 SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3); \
626 }
627
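/* Same filtering as AVC_LPF_H_CHROMA_422, but for an edge segment that is
 * only two rows tall. */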
628 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res) \
629 { \
630 uint32_t load0, load1; \
631 v16u8 src0 = { 0 }; \
632 v16u8 src1 = { 0 }; \
633 v16u8 src2 = { 0 }; \
634 v16u8 src3 = { 0 }; \
635 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; \
636 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta; \
637 v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r; \
638 v16i8 zeros = { 0 }; \
639 v16u8 res0, res1; \
640 \
641 load0 = LW(src - 2); \
642 load1 = LW(src - 2 + stride); \
643 \
644 src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0); \
645 src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1); \
646 \
647 TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3); \
648 \
649 p0_asub_q0 = __msa_asub_u_b(src2, src1); \
650 p1_asub_p0 = __msa_asub_u_b(src1, src0); \
651 q1_asub_q0 = __msa_asub_u_b(src2, src3); \
652 \
653 tc = __msa_fill_h(tc_val); \
654 \
655 is_less_than_alpha = (p0_asub_q0 < alpha); \
656 is_less_than_beta = (p1_asub_p0 < beta); \
657 is_less_than = is_less_than_alpha & is_less_than_beta; \
658 is_less_than_beta = (q1_asub_q0 < beta); \
659 is_less_than = is_less_than_beta & is_less_than; \
660 \
661 ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1); \
662 HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1); \
663 \
664 q0_sub_p0 <<= 2; \
665 delta = q0_sub_p0 + p1_sub_q1; \
666 delta = __msa_srari_h(delta, 3); \
667 CLIP_SH(delta, -tc, tc); \
668 \
669 ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r); \
670 \
671 res0_r += delta; \
672 res1_r -= delta; \
673 \
674 CLIP_SH2_0_255(res0_r, res1_r); \
675 PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1); \
676 \
677 res0 = __msa_bmnz_v(src1, res0, is_less_than); \
678 res1 = __msa_bmnz_v(src2, res1, is_less_than); \
679 \
680 res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0); \
681 }
682
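/* Intra (bS = 4) deblocking of a horizontal luma edge: rows p1..q1 are
 * loaded, the |p0 - q0| < alpha and |p1 - p0|, |q1 - q0| < beta masks are
 * built, and where additionally |p0 - q0| < (alpha >> 2) + 2 together with
 * |p2 - p0| (resp. |q2 - q0|) < beta, the strong three-sample filter is
 * applied; otherwise only p0/q0 receive the (2 * p1 + p0 + q1 + 2) >> 2
 * update. */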
static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   ptrdiff_t img_width)
687 {
688 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
689 v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
690 v16u8 p1_org, p0_org, q0_org, q1_org;
691
692 LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
693
694 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
695 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
696 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
697
698 is_less_than_alpha = (p0_asub_q0 < alpha_in);
699 is_less_than_beta = (p1_asub_p0 < beta_in);
700 is_less_than = is_less_than_beta & is_less_than_alpha;
701 is_less_than_beta = (q1_asub_q0 < beta_in);
702 is_less_than = is_less_than_beta & is_less_than;
703
704 if (!__msa_test_bz_v(is_less_than)) {
705 v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
706 v8i16 p0_r = { 0 };
707 v8i16 q0_r = { 0 };
708 v8i16 p0_l = { 0 };
709 v8i16 q0_l = { 0 };
710 v16i8 zero = { 0 };
711 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
712 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
713 v16u8 q2_org = LD_UB(data + (2 * img_width));
714 v16u8 p2_org = LD_UB(data - (3 * img_width));
715 v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
716
717 UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
718 UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
719 UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
720
721 tmp_flag = (p0_asub_q0 < tmp_flag);
722
723 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
724 is_less_than_beta = (p2_asub_p0 < beta_in);
725 is_less_than_beta = is_less_than_beta & tmp_flag;
726 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
727 is_less_than_beta = is_less_than_beta & is_less_than;
728 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
729
730 q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
731 q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
732
733 /* combine and store */
734 if (!__msa_test_bz_v(is_less_than_beta)) {
735 v8i16 p3_org_l, p3_org_r;
736 v16u8 p3_org = LD_UB(data - (img_width << 2));
737 v16u8 p2, p1;
738 v8i16 p2_r = { 0 };
739 v8i16 p2_l = { 0 };
740 v8i16 p1_r = { 0 };
741 v8i16 p1_l = { 0 };
742
743 ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
744 AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
745 p2_r, q1_org_r, p0_r, p1_r, p2_r);
746
747 ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
748 AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
749 p2_l, q1_org_l, p0_l, p1_l, p2_l);
750
751 PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
752
753 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
754 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
755 p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
756
757 ST_UB(p1_org, data - (2 * img_width));
758 ST_UB(p2_org, data - (3 * img_width));
759 }
760
761 AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
762 AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
763
764 /* combine */
765 p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
766 p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
767
768 ST_UB(p0_org, data - img_width);
769
770 /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
771 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
772 is_less_than_beta = (q2_asub_q0 < beta_in);
773 is_less_than_beta = is_less_than_beta & tmp_flag;
774 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
775 is_less_than_beta = is_less_than_beta & is_less_than;
776 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
777
778 /* combine and store */
779 if (!__msa_test_bz_v(is_less_than_beta)) {
780 v8i16 q3_org_r, q3_org_l;
781 v16u8 q3_org = LD_UB(data + (3 * img_width));
782 v16u8 q1, q2;
783 v8i16 q2_r = { 0 };
784 v8i16 q2_l = { 0 };
785 v8i16 q1_r = { 0 };
786 v8i16 q1_l = { 0 };
787
788 ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
789 AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
790 q2_r, p1_org_r, q0_r, q1_r, q2_r);
791
792 ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
793 AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
794 q2_l, p1_org_l, q0_l, q1_l, q2_l);
795
796 PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
797 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
798 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
799 q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
800
801 ST_UB(q1_org, data + img_width);
802 ST_UB(q2_org, data + 2 * img_width);
803 }
804
805 AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
806 AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
807
808 /* combine */
809 q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
810 q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
811
812 ST_UB(q0_org, data);
813 }
814 }
815
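/* Vertical-edge variant of the intra luma filter: 16 rows of 8 pixels around
 * the edge are transposed into p3..q3 vectors, filtered as in the horizontal
 * case, and the result is transposed back on store. */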
static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   ptrdiff_t img_width)
820 {
821 uint8_t *src = data - 4;
822 v16u8 alpha, beta, p0_asub_q0;
823 v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
824 v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
825 v16u8 p1_asub_p0, q1_asub_q0;
826
827
828 {
829 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
830 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
831
832 LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
833 LD_UB8(src + (8 * img_width), img_width,
834 row8, row9, row10, row11, row12, row13, row14, row15);
835
836 TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
837 row4, row5, row6, row7,
838 row8, row9, row10, row11,
839 row12, row13, row14, row15,
840 p3_org, p2_org, p1_org, p0_org,
841 q0_org, q1_org, q2_org, q3_org);
842 }
843
844 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
845 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
846 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
847
848 alpha = (v16u8) __msa_fill_b(alpha_in);
849 beta = (v16u8) __msa_fill_b(beta_in);
850
851 is_less_than_alpha = (p0_asub_q0 < alpha);
852 is_less_than_beta = (p1_asub_p0 < beta);
853 is_less_than = is_less_than_beta & is_less_than_alpha;
854 is_less_than_beta = (q1_asub_q0 < beta);
855 is_less_than = is_less_than_beta & is_less_than;
856
857 if (!__msa_test_bz_v(is_less_than)) {
858 v8i16 p0_r = { 0 };
859 v8i16 q0_r = { 0 };
860 v8i16 p0_l = { 0 };
861 v8i16 q0_l = { 0 };
862 v16i8 zero = { 0 };
863 v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
864 v16u8 negate_is_less_than_beta;
865 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
866 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
867
868 UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
869 UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
870 UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
871 UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
872
873 tmp_flag = alpha >> 2;
874 tmp_flag = tmp_flag + 2;
875 tmp_flag = (p0_asub_q0 < tmp_flag);
876
877 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
878 is_less_than_beta = (p2_asub_p0 < beta);
879 is_less_than_beta = tmp_flag & is_less_than_beta;
880 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
881 is_less_than_beta = is_less_than_beta & is_less_than;
882 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
883
884 if (!__msa_test_bz_v(is_less_than_beta)) {
885 v16u8 p2, p1;
886 v8i16 p3_org_r, p3_org_l;
887 v8i16 p2_l = { 0 };
888 v8i16 p2_r = { 0 };
889 v8i16 p1_l = { 0 };
890 v8i16 p1_r = { 0 };
891
892 ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
893 AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
894 p2_r, q1_org_r, p0_r, p1_r, p2_r);
895
896 ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
897 AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
898 p2_l, q1_org_l, p0_l, p1_l, p2_l);
899
900 PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
901 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
902 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
903 p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
904 }
905
906 AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
907 AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
908
909 p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
910 p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
911
912 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
913 is_less_than_beta = (q2_asub_q0 < beta);
914
915 is_less_than_beta = is_less_than_beta & tmp_flag;
916 negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
917
918 is_less_than_beta = is_less_than_beta & is_less_than;
919 negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
920
921 if (!__msa_test_bz_v(is_less_than_beta)) {
922 v16u8 q1, q2;
923 v8i16 q3_org_r, q3_org_l;
924 v8i16 q1_l = { 0 };
925 v8i16 q1_r = { 0 };
926 v8i16 q2_l = { 0 };
927 v8i16 q2_r = { 0 };
928
929 ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
930 AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
931 q2_r, p1_org_r, q0_r, q1_r, q2_r);
932
933 ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
934 AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
935 q2_l, p1_org_l, q0_l, q1_l, q2_l);
936
937 PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
938 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
939 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
940 q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
941 }
942
943 AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
944 AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
945
946 q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
947 q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
948
949 {
950 v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
951
952 ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
953 ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
954 ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
955
956 ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
957 ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
958
959 src = data - 3;
960 ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
961 ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
962 src += 4 * img_width;
963 ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
964 ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
965 src += 4 * img_width;
966
967 ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
968 ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
969 src += 4 * img_width;
970 ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
971 ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
972 }
973 }
974 }
975
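/* Intra vertical luma edge filter for MBAFF (8 rows): the 8x8 neighbourhood
 * left of 'src' is transposed, strong and weak filter results are computed
 * for both sides and blended according to the alpha/beta and
 * (alpha >> 2) + 2 conditions, then written back 4 + 2 bytes per row. */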
static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
                                                   ptrdiff_t stride,
                                                   int32_t alpha_in,
                                                   int32_t beta_in)
980 {
981 uint64_t load0, load1;
982 uint32_t out0, out2;
983 uint16_t out1, out3;
984 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
985 v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
986 v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
987 v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
988 v8i16 tmp0, tmp1, tmp2, tmp3;
989 v16u8 alpha, beta;
990 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
991 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
992 v16u8 is_less_than_beta1, is_less_than_beta2;
993 v16i8 src0 = { 0 };
994 v16i8 src1 = { 0 };
995 v16i8 src2 = { 0 };
996 v16i8 src3 = { 0 };
997 v16i8 src4 = { 0 };
998 v16i8 src5 = { 0 };
999 v16i8 src6 = { 0 };
1000 v16i8 src7 = { 0 };
1001 v16i8 zeros = { 0 };
1002
1003 load0 = LD(src - 4);
1004 load1 = LD(src + stride - 4);
1005 src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1006 src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1007
1008 load0 = LD(src + (2 * stride) - 4);
1009 load1 = LD(src + (3 * stride) - 4);
1010 src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1011 src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1012
1013 load0 = LD(src + (4 * stride) - 4);
1014 load1 = LD(src + (5 * stride) - 4);
1015 src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1016 src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1017
1018 load0 = LD(src + (6 * stride) - 4);
1019 load1 = LD(src + (7 * stride) - 4);
1020 src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1021 src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1022
1023 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
1024 src0, src1, src2, src3);
1025
1026 ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
1027 ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
1028
1029 ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1030 ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
1031 SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
1032 8, src0, src2, src4, src7);
1033
1034 p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1035 p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1036 q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1037
1038 alpha = (v16u8) __msa_fill_b(alpha_in);
1039 beta = (v16u8) __msa_fill_b(beta_in);
1040
1041 is_less_than_alpha = (p0_asub_q0 < alpha);
1042 is_less_than_beta = (p1_asub_p0 < beta);
1043 is_less_than = is_less_than_alpha & is_less_than_beta;
1044 is_less_than_beta = (q1_asub_q0 < beta);
1045 is_less_than = is_less_than & is_less_than_beta;
1046
1047 alpha >>= 2;
1048 alpha += 2;
1049
1050 is_less_than_alpha = (p0_asub_q0 < alpha);
1051
1052 p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1053 is_less_than_beta1 = (p2_asub_p0 < beta);
1054 q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1055 is_less_than_beta2 = (q2_asub_q0 < beta);
1056
1057 ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
1058 src0_r, src1_r, src2_r, src3_r);
1059 ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1060 src4_r, src5_r, src6_r, src7_r);
1061
1062 dst2_x_r = src1_r + src2_r + src3_r;
1063 dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1064 dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1065 dst1_r = src0_r + src1_r + src2_r + src3_r;
1066 dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1067
1068 dst0_r = (2 * src6_r) + (3 * src0_r);
1069 dst0_r += src1_r + src2_r + src3_r;
1070 dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1071 dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1072 dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1073
1074 PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1075 dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1076
1077 dst3_x_r = src2_r + src3_r + src4_r;
1078 dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1079 dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1080 dst4_r = src2_r + src3_r + src4_r + src5_r;
1081 dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1082
1083 dst5_r = (2 * src7_r) + (3 * src5_r);
1084 dst5_r += src4_r + src3_r + src2_r;
1085 dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1086 dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1087 dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1088
1089 PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1090 dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1091
1092 dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1093 dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1094 dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1095 dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1096
1097 PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1098
1099 dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1100 dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1101 dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1102 dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1103
1104 is_less_than = is_less_than_alpha & is_less_than;
1105 dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1106 is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1107 dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1108
1109 dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1110 dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1111 dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1112 is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1113 dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1114 dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1115 dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1116
1117 ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1118 dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1119 ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
1120 ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
1121
1122 ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1123 SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
1124 dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1125 dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1126 SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
1127
1128 out0 = __msa_copy_u_w((v4i32) dst0, 0);
1129 out1 = __msa_copy_u_h((v8i16) dst0, 2);
1130 out2 = __msa_copy_u_w((v4i32) dst1, 0);
1131 out3 = __msa_copy_u_h((v8i16) dst1, 2);
1132
1133 SW(out0, (src - 3));
1134 SH(out1, (src + 1));
1135 src += stride;
1136 SW(out2, (src - 3));
1137 SH(out3, (src + 1));
1138 src += stride;
1139
1140 out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1141 out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1142 out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1143 out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1144
1145 SW(out0, (src - 3));
1146 SH(out1, (src + 1));
1147 src += stride;
1148 SW(out2, (src - 3));
1149 SH(out3, (src + 1));
1150 src += stride;
1151
1152 out0 = __msa_copy_u_w((v4i32) dst4, 0);
1153 out1 = __msa_copy_u_h((v8i16) dst4, 2);
1154 out2 = __msa_copy_u_w((v4i32) dst5, 0);
1155 out3 = __msa_copy_u_h((v8i16) dst5, 2);
1156
1157 SW(out0, (src - 3));
1158 SH(out1, (src + 1));
1159 src += stride;
1160 SW(out2, (src - 3));
1161 SH(out3, (src + 1));
1162 src += stride;
1163
1164 out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1165 out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1166 out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1167 out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1168
1169 SW(out0, (src - 3));
1170 SH(out1, (src + 1));
1171 src += stride;
1172 SW(out2, (src - 3));
1173 SH(out3, (src + 1));
1174 }
1175
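/* Intra deblocking of a horizontal chroma edge (Cb or Cr): only p0 and q0
 * are modified, using the (2 * p1 + p0 + q1 + 2) >> 2 update wherever the
 * alpha/beta conditions hold. */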
static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       ptrdiff_t img_width)
1180 {
1181 v16u8 alpha, beta;
1182 v16u8 is_less_than;
1183 v8i16 p0_or_q0, q0_or_p0;
1184 v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1185 v16i8 zero = { 0 };
1186 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1187 v16u8 is_less_than_alpha, is_less_than_beta;
1188 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1189
1190 alpha = (v16u8) __msa_fill_b(alpha_in);
1191 beta = (v16u8) __msa_fill_b(beta_in);
1192
1193 LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1194 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1195
1196 p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1197 p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1198 q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1199
1200 is_less_than_alpha = (p0_asub_q0 < alpha);
1201 is_less_than_beta = (p1_asub_p0 < beta);
1202 is_less_than = is_less_than_beta & is_less_than_alpha;
1203 is_less_than_beta = (q1_asub_q0 < beta);
1204 is_less_than = is_less_than_beta & is_less_than;
1205
1206 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1207
1208 if (!__msa_test_bz_v(is_less_than)) {
1209 ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1210 zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1211 AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1212 AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1213 PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1214
1215 p0_or_q0_org =
1216 __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1217 q0_or_p0_org =
1218 __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1219
1220 ST_UB(q0_or_p0_org, data_cb_or_cr);
1221 ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1222 }
1223 }
1224
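/* Vertical-edge variant of the intra chroma filter: eight 4-pixel rows are
 * transposed, p0/q0 are filtered as above, and the interleaved result is
 * stored back two bytes per row. */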
static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
                                                       uint8_t alpha_in,
                                                       uint8_t beta_in,
                                                       ptrdiff_t img_width)
1229 {
1230 v8i16 tmp1;
1231 v16u8 alpha, beta, is_less_than;
1232 v8i16 p0_or_q0, q0_or_p0;
1233 v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1234 v16i8 zero = { 0 };
1235 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1236 v16u8 is_less_than_alpha, is_less_than_beta;
1237 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1238
1239 {
1240 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1241
1242 LD_UB8((data_cb_or_cr - 2), img_width,
1243 row0, row1, row2, row3, row4, row5, row6, row7);
1244
1245 TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1246 p1_or_q1_org, p0_or_q0_org,
1247 q0_or_p0_org, q1_or_p1_org);
1248 }
1249
1250 alpha = (v16u8) __msa_fill_b(alpha_in);
1251 beta = (v16u8) __msa_fill_b(beta_in);
1252
1253 p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1254 p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1255 q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1256
1257 is_less_than_alpha = (p0_asub_q0 < alpha);
1258 is_less_than_beta = (p1_asub_p0 < beta);
1259 is_less_than = is_less_than_beta & is_less_than_alpha;
1260 is_less_than_beta = (q1_asub_q0 < beta);
1261 is_less_than = is_less_than_beta & is_less_than;
1262 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1263
1264 if (!__msa_test_bz_v(is_less_than)) {
1265 ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1266 zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1267
1268 AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1269 AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1270
1271 /* convert 16 bit output into 8 bit output */
1272 PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1273
1274 p0_or_q0_org =
1275 __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1276 q0_or_p0_org =
1277 __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1278 tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1279
1280 data_cb_or_cr -= 1;
1281 ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
1282 data_cb_or_cr += 4 * img_width;
1283 ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
1284 }
1285 }
1286
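/* Inter (bS < 4) deblocking of a vertical luma edge: the per-4x4 boundary
 * strengths bs0..bs3 and clipping values tc0..tc3 are broadcast into vector
 * lanes, 16 rows are transposed, p1/q1 are conditionally filtered (which
 * also widens the tc clipping range), the tc-clipped delta is applied to
 * p0/q0, and the six middle columns are written back row by row. */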
static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
                                                   uint8_t bs0, uint8_t bs1,
                                                   uint8_t bs2, uint8_t bs3,
                                                   uint8_t tc0, uint8_t tc1,
                                                   uint8_t tc2, uint8_t tc3,
                                                   uint8_t alpha_in,
                                                   uint8_t beta_in,
                                                   ptrdiff_t img_width)
1295 {
1296 v16u8 tmp_vec, bs = { 0 };
1297
1298 tmp_vec = (v16u8) __msa_fill_b(bs0);
1299 bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1300 tmp_vec = (v16u8) __msa_fill_b(bs1);
1301 bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1302 tmp_vec = (v16u8) __msa_fill_b(bs2);
1303 bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1304 tmp_vec = (v16u8) __msa_fill_b(bs3);
1305 bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1306
1307 if (!__msa_test_bz_v(bs)) {
1308 uint8_t *src = data - 4;
1309 v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
1310 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
1311 v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
1312 v16u8 is_bs_greater_than0;
1313 v16u8 tc = { 0 };
1314 v16i8 zero = { 0 };
1315
1316 tmp_vec = (v16u8) __msa_fill_b(tc0);
1317 tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1318 tmp_vec = (v16u8) __msa_fill_b(tc1);
1319 tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1320 tmp_vec = (v16u8) __msa_fill_b(tc2);
1321 tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1322 tmp_vec = (v16u8) __msa_fill_b(tc3);
1323 tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1324
1325 is_bs_greater_than0 = (zero < bs);
1326
1327 {
1328 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1329 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1330
1331 LD_UB8(src, img_width,
1332 row0, row1, row2, row3, row4, row5, row6, row7);
1333 src += (8 * img_width);
1334 LD_UB8(src, img_width,
1335 row8, row9, row10, row11, row12, row13, row14, row15);
1336
1337 TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1338 row8, row9, row10, row11,
1339 row12, row13, row14, row15,
1340 p3_org, p2_org, p1_org, p0_org,
1341 q0_org, q1_org, q2_org, q3_org);
1342 }
1343
1344 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1345 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1346 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1347
1348 alpha = (v16u8) __msa_fill_b(alpha_in);
1349 beta = (v16u8) __msa_fill_b(beta_in);
1350
1351 is_less_than_alpha = (p0_asub_q0 < alpha);
1352 is_less_than_beta = (p1_asub_p0 < beta);
1353 is_less_than = is_less_than_beta & is_less_than_alpha;
1354 is_less_than_beta = (q1_asub_q0 < beta);
1355 is_less_than = is_less_than_beta & is_less_than;
1356 is_less_than = is_less_than & is_bs_greater_than0;
1357
1358 if (!__msa_test_bz_v(is_less_than)) {
1359 v16i8 negate_tc, sign_negate_tc;
1360 v16u8 p0, q0, p2_asub_p0, q2_asub_q0;
1361 v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
1362 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1363 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1364 v8i16 p0_r, q0_r, p0_l, q0_l;
1365
1366 negate_tc = zero - (v16i8) tc;
1367 sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1368
1369 ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1370
1371 UNPCK_UB_SH(tc, tc_r, tc_l);
1372 UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1373 UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1374 UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1375
1376 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1377 is_less_than_beta = (p2_asub_p0 < beta);
1378 is_less_than_beta = is_less_than_beta & is_less_than;
1379
1380 if (!__msa_test_bz_v(is_less_than_beta)) {
1381 v16u8 p1;
1382 v8i16 p1_r = { 0 };
1383 v8i16 p1_l = { 0 };
1384 v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1385 v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1386
1387 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1388 negate_tc_r, tc_r, p1_r);
1389 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1390 i16_negatetc_l, tc_l, p1_l);
1391
1392 p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1393 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1394
1395 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1396 tc = tc + is_less_than_beta;
1397 }
1398
1399 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1400 is_less_than_beta = (q2_asub_q0 < beta);
1401 is_less_than_beta = is_less_than_beta & is_less_than;
1402
1403 q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1404 q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1405
1406 if (!__msa_test_bz_v(is_less_than_beta)) {
1407 v16u8 q1;
1408 v8i16 q1_r = { 0 };
1409 v8i16 q1_l = { 0 };
1410 v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1411 v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1412
1413 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1414 negate_tc_r, tc_r, q1_r);
1415 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1416 i16_negatetc_l, tc_l, q1_l);
1417
1418 q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1419 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1420
1421 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1422 tc = tc + is_less_than_beta;
1423 }
1424
1425 {
1426 v8i16 threshold_r, negate_thresh_r;
1427 v8i16 threshold_l, negate_thresh_l;
1428 v16i8 negate_thresh, sign_negate_thresh;
1429
1430 negate_thresh = zero - (v16i8) tc;
1431 sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1432
1433 ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1434 threshold_r, negate_thresh_r);
1435
1436 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1437 negate_thresh_r, threshold_r, p0_r, q0_r);
1438
1439 threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
1440 negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1441 negate_thresh);
1442
1443 AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1444 negate_thresh_l, threshold_l, p0_l, q0_l);
1445 }
1446
1447 PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1448
1449 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1450 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1451
1452 {
1453 v16i8 tp0, tp1, tp2, tp3;
1454 v8i16 tmp2, tmp5;
1455 v4i32 tmp3, tmp4, tmp6, tmp7;
1456 uint32_t out0, out2;
1457 uint16_t out1, out3;
1458
1459 src = data - 3;
1460
1461 ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
1462 ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
1463 ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
1464
1465 ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
1466 ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);
1467
1468 out0 = __msa_copy_u_w(tmp3, 0);
1469 out1 = __msa_copy_u_h(tmp2, 0);
1470 out2 = __msa_copy_u_w(tmp3, 1);
1471 out3 = __msa_copy_u_h(tmp2, 1);
1472
1473 SW(out0, src);
1474 SH(out1, (src + 4));
1475 src += img_width;
1476 SW(out2, src);
1477 SH(out3, (src + 4));
1478
1479 out0 = __msa_copy_u_w(tmp3, 2);
1480 out1 = __msa_copy_u_h(tmp2, 2);
1481 out2 = __msa_copy_u_w(tmp3, 3);
1482 out3 = __msa_copy_u_h(tmp2, 3);
1483
1484 src += img_width;
1485 SW(out0, src);
1486 SH(out1, (src + 4));
1487 src += img_width;
1488 SW(out2, src);
1489 SH(out3, (src + 4));
1490
1491 out0 = __msa_copy_u_w(tmp4, 0);
1492 out1 = __msa_copy_u_h(tmp2, 4);
1493 out2 = __msa_copy_u_w(tmp4, 1);
1494 out3 = __msa_copy_u_h(tmp2, 5);
1495
1496 src += img_width;
1497 SW(out0, src);
1498 SH(out1, (src + 4));
1499 src += img_width;
1500 SW(out2, src);
1501 SH(out3, (src + 4));
1502
1503 out0 = __msa_copy_u_w(tmp4, 2);
1504 out1 = __msa_copy_u_h(tmp2, 6);
1505 out2 = __msa_copy_u_w(tmp4, 3);
1506 out3 = __msa_copy_u_h(tmp2, 7);
1507
1508 src += img_width;
1509 SW(out0, src);
1510 SH(out1, (src + 4));
1511 src += img_width;
1512 SW(out2, src);
1513 SH(out3, (src + 4));
1514
1515 out0 = __msa_copy_u_w(tmp6, 0);
1516 out1 = __msa_copy_u_h(tmp5, 0);
1517 out2 = __msa_copy_u_w(tmp6, 1);
1518 out3 = __msa_copy_u_h(tmp5, 1);
1519
1520 src += img_width;
1521 SW(out0, src);
1522 SH(out1, (src + 4));
1523 src += img_width;
1524 SW(out2, src);
1525 SH(out3, (src + 4));
1526
1527 out0 = __msa_copy_u_w(tmp6, 2);
1528 out1 = __msa_copy_u_h(tmp5, 2);
1529 out2 = __msa_copy_u_w(tmp6, 3);
1530 out3 = __msa_copy_u_h(tmp5, 3);
1531
1532 src += img_width;
1533 SW(out0, src);
1534 SH(out1, (src + 4));
1535 src += img_width;
1536 SW(out2, src);
1537 SH(out3, (src + 4));
1538
1539 out0 = __msa_copy_u_w(tmp7, 0);
1540 out1 = __msa_copy_u_h(tmp5, 4);
1541 out2 = __msa_copy_u_w(tmp7, 1);
1542 out3 = __msa_copy_u_h(tmp5, 5);
1543
1544 src += img_width;
1545 SW(out0, src);
1546 SH(out1, (src + 4));
1547 src += img_width;
1548 SW(out2, src);
1549 SH(out3, (src + 4));
1550
1551 out0 = __msa_copy_u_w(tmp7, 2);
1552 out1 = __msa_copy_u_h(tmp5, 6);
1553 out2 = __msa_copy_u_w(tmp7, 3);
1554 out3 = __msa_copy_u_h(tmp5, 7);
1555
1556 src += img_width;
1557 SW(out0, src);
1558 SH(out1, (src + 4));
1559 src += img_width;
1560 SW(out2, src);
1561 SH(out3, (src + 4));
1562 }
1563 }
1564 }
1565 }
1566
1567 static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
1568 uint8_t bs0, uint8_t bs1,
1569 uint8_t bs2, uint8_t bs3,
1570 uint8_t tc0, uint8_t tc1,
1571 uint8_t tc2, uint8_t tc3,
1572 uint8_t alpha_in,
1573 uint8_t beta_in,
1574 ptrdiff_t image_width)
1575 {
1576 v16u8 tmp_vec;
1577 v16u8 bs = { 0 };
1578
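    /* Replicate each boundary strength across a 4-byte lane so bs covers its
     * 4-pixel segment of the 16-pixel edge; an all-zero bs means there is
     * nothing to filter. */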
1579 tmp_vec = (v16u8) __msa_fill_b(bs0);
1580 bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1581 tmp_vec = (v16u8) __msa_fill_b(bs1);
1582 bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1583 tmp_vec = (v16u8) __msa_fill_b(bs2);
1584 bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1585 tmp_vec = (v16u8) __msa_fill_b(bs3);
1586 bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1587
1588 if (!__msa_test_bz_v(bs)) {
1589 v16u8 alpha, beta, is_less_than, is_less_than_beta;
1590 v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1591 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1592 v16u8 is_less_than_alpha, is_bs_greater_than0;
1593 v8i16 p0_r, q0_r, p0_l, q0_l;
1594 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1595 v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1596 v16i8 zero = { 0 };
1597 v16i8 tc = { 0 };
1598
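        /* Expand the per-segment tc0 clipping values the same way and splat
         * the alpha/beta thresholds across all lanes. */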
1599 tmp_vec = (v16u8) __msa_fill_b(tc0);
1600 tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1601 tmp_vec = (v16u8) __msa_fill_b(tc1);
1602 tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1603 tmp_vec = (v16u8) __msa_fill_b(tc2);
1604 tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1605 tmp_vec = (v16u8) __msa_fill_b(tc3);
1606 tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1607
1608 alpha = (v16u8) __msa_fill_b(alpha_in);
1609 beta = (v16u8) __msa_fill_b(beta_in);
1610
1611 LD_UB5(data - (3 * image_width), image_width,
1612 p2_org, p1_org, p0_org, q0_org, q1_org);
1613
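        /* Standard edge condition: |p0 - q0| < alpha, |p1 - p0| < beta and
         * |q1 - q0| < beta, restricted to columns whose bs is non-zero. */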
1614 is_bs_greater_than0 = ((v16u8) zero < bs);
1615 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1616 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1617 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1618
1619 is_less_than_alpha = (p0_asub_q0 < alpha);
1620 is_less_than_beta = (p1_asub_p0 < beta);
1621 is_less_than = is_less_than_beta & is_less_than_alpha;
1622 is_less_than_beta = (q1_asub_q0 < beta);
1623 is_less_than = is_less_than_beta & is_less_than;
1624 is_less_than = is_less_than & is_bs_greater_than0;
1625
1626 if (!__msa_test_bz_v(is_less_than)) {
1627 v16i8 sign_negate_tc, negate_tc;
1628 v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1629 v16u8 p2_asub_p0, q2_asub_q0;
1630
1631 q2_org = LD_UB(data + (2 * image_width));
1632 negate_tc = zero - tc;
1633 sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1634
1635 ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1636
1637 UNPCK_UB_SH(tc, tc_r, tc_l);
1638 UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1639 UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1640 UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1641
1642 p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1643 is_less_than_beta = (p2_asub_p0 < beta);
1644 is_less_than_beta = is_less_than_beta & is_less_than;
1645
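            /* Where |p2 - p0| < beta, p1 is filtered as well and tc is
             * incremented for those columns. */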
1646 if (!__msa_test_bz_v(is_less_than_beta)) {
1647 v16u8 p1;
1648 v8i16 p1_r = { 0 };
1649 v8i16 p1_l = { 0 };
1650 v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1651 v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1652
1653 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1654 negate_tc_r, tc_r, p1_r);
1655 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1656 i16_negatetc_l, tc_l, p1_l);
1657
1658 p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1659 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1660 ST_UB(p1_org, data - (2 * image_width));
1661
1662 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1663 tc = tc + (v16i8) is_less_than_beta;
1664 }
1665
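            /* Mirror of the p1 path: q1 is filtered where |q2 - q0| < beta. */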
1666 q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1667 is_less_than_beta = (q2_asub_q0 < beta);
1668 is_less_than_beta = is_less_than_beta & is_less_than;
1669
1670 q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1671 q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1672
1673 if (!__msa_test_bz_v(is_less_than_beta)) {
1674 v16u8 q1;
1675 v8i16 q1_r = { 0 };
1676 v8i16 q1_l = { 0 };
1677 v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1678 v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1679
1680 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1681 negate_tc_r, tc_r, q1_r);
1682 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1683 i16_negatetc_l, tc_l, q1_l);
1684
1685 q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1686 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1687 ST_UB(q1_org, data + image_width);
1688
1689 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1690 tc = tc + (v16i8) is_less_than_beta;
1691 }
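            /* p0/q0 are filtered with the (possibly incremented) tc threshold
             * and merged into the original rows under the is_less_than mask
             * before being stored. */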
1692 {
1693 v16i8 negate_thresh, sign_negate_thresh;
1694 v8i16 threshold_r, threshold_l;
1695 v8i16 negate_thresh_l, negate_thresh_r;
1696
1697 negate_thresh = zero - tc;
1698 sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1699
1700 ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1701 threshold_r, negate_thresh_r);
1702 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1703 negate_thresh_r, threshold_r, p0_r, q0_r);
1704
1705 threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1706 negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1707 negate_thresh);
1708 AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1709 negate_thresh_l, threshold_l, p0_l, q0_l);
1710 }
1711
1712 PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1713
1714 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1715 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1716
1717 ST_UB(p0_org, (data - image_width));
1718 ST_UB(q0_org, data);
1719 }
1720 }
1721 }
1722
1723 static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride,
1724 int32_t alpha_in, int32_t beta_in,
1725 int8_t *tc0)
1726 {
1727 uint8_t *data = in;
1728 uint32_t out0, out1, out2, out3;
1729 uint64_t load;
1730 uint32_t tc_val;
1731 v16u8 alpha, beta;
1732 v16i8 inp0 = { 0 };
1733 v16i8 inp1 = { 0 };
1734 v16i8 inp2 = { 0 };
1735 v16i8 inp3 = { 0 };
1736 v16i8 inp4 = { 0 };
1737 v16i8 inp5 = { 0 };
1738 v16i8 inp6 = { 0 };
1739 v16i8 inp7 = { 0 };
1740 v16i8 src0, src1, src2, src3;
1741 v8i16 src4, src5, src6, src7;
1742 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1743 v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1744 v16u8 is_less_than_beta1, is_less_than_beta2;
1745 v8i16 tc, tc_orig_r, tc_plus1;
1746 v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1747 v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1748 v8i16 src2_r, src3_r;
1749 v8i16 p2_r, p1_r, q2_r, q1_r;
1750 v16u8 p2, q2, p0, q0;
1751 v4i32 dst0, dst1;
1752 v16i8 zeros = { 0 };
1753
1754 alpha = (v16u8) __msa_fill_b(alpha_in);
1755 beta = (v16u8) __msa_fill_b(beta_in);
1756
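    /* Load 8 bytes starting three pixels left of the edge for each of the
     * eight rows, skipping row pairs whose tc0 entry is negative. */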
1757 if (tc0[0] < 0) {
1758 data += (2 * stride);
1759 } else {
1760 load = LD(data - 3);
1761 inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1762 load = LD(data - 3 + stride);
1763 inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1764 data += (2 * stride);
1765 }
1766
1767 if (tc0[1] < 0) {
1768 data += (2 * stride);
1769 } else {
1770 load = LD(data - 3);
1771 inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1772 load = LD(data - 3 + stride);
1773 inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1774 data += (2 * stride);
1775 }
1776
1777 if (tc0[2] < 0) {
1778 data += (2 * stride);
1779 } else {
1780 load = LD(data - 3);
1781 inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1782 load = LD(data - 3 + stride);
1783 inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1784 data += (2 * stride);
1785 }
1786
1787 if (tc0[3] < 0) {
1788 data += (2 * stride);
1789 } else {
1790 load = LD(data - 3);
1791 inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1792 load = LD(data - 3 + stride);
1793 inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1794 data += (2 * stride);
1795 }
1796
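    /* Transpose the rows so src0..src5 hold the p2, p1, p0, q0, q1 and q2
     * columns of all eight rows. */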
1797 ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1798 src0, src1, src2, src3);
1799
1800 ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1801 ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1802
1803 src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1804 src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1805 src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1806 src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1807 src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1808 src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1809
1810 p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1811 p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1812 q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1813 p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1814 q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1815
1816 is_less_than_alpha = (p0_asub_q0 < alpha);
1817 is_less_than_beta = (p1_asub_p0 < beta);
1818 is_less_than = is_less_than_alpha & is_less_than_beta;
1819 is_less_than_beta = (q1_asub_q0 < beta);
1820 is_less_than = is_less_than_beta & is_less_than;
1821
1822 is_less_than_beta1 = (p2_asub_p0 < beta);
1823 is_less_than_beta2 = (q2_asub_q0 < beta);
1824
1825 p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1826 p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1827 p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1828
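    /* Candidate p1'/q1' delta: ((p2 + ((p0 + q0 + 1) >> 1)) >> 1) - p1,
     * clipped to +/-tc0 further down and added back to p1 (and likewise on
     * the q side). */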
1829 ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1830 p2_r += p0_add_q0;
1831 p2_r >>= 1;
1832 p2_r -= p1_r;
1833 ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1834 q2_r += p0_add_q0;
1835 q2_r >>= 1;
1836 q2_r -= q1_r;
1837
1838 tc_val = LW(tc0);
1839 tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1840 tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1841 is_tc_orig1 = tc_orig;
1842 is_tc_orig2 = tc_orig;
1843 tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1844 tc = tc_orig_r;
1845
1846 CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1847 CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1848
1849 p2_r += p1_r;
1850 q2_r += q1_r;
1851
1852 PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1853
1854 is_tc_orig1 = (zeros < is_tc_orig1);
1855 is_tc_orig2 = is_tc_orig1;
1856 is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1857 is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1858 is_tc_orig1 = is_less_than & is_tc_orig1;
1859 is_tc_orig2 = is_less_than & is_tc_orig2;
1860
1861 p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1862 q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1863
1864 q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1865 q0_sub_p0 <<= 2;
1866 p1_sub_q1 = p1_r - q1_r;
1867 q0_sub_p0 += p1_sub_q1;
1868 q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1869
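    /* tc is incremented once for each side whose |p2 - p0| < beta or
     * |q2 - q0| < beta condition held, and then clips the delta applied to
     * p0/q0. */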
1870 tc_plus1 = tc + 1;
1871 is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1872 (v16i8) is_less_than_beta1);
1873 tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1874 tc_plus1 = tc + 1;
1875 is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1876 (v16i8) is_less_than_beta2);
1877 tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1878
1879 CLIP_SH(q0_sub_p0, -tc, tc);
1880
1881 ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
1882 src2_r += q0_sub_p0;
1883 src3_r -= q0_sub_p0;
1884
1885 CLIP_SH2_0_255(src2_r, src3_r);
1886
1887 PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
1888
1889 p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1890 q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1891
1892 ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
1893
1894 ILVRL_H2_SW(q2, p2, dst0, dst1);
1895
1896 data = in;
1897
1898 out0 = __msa_copy_u_w(dst0, 0);
1899 out1 = __msa_copy_u_w(dst0, 1);
1900 out2 = __msa_copy_u_w(dst0, 2);
1901 out3 = __msa_copy_u_w(dst0, 3);
1902
1903 if (tc0[0] < 0) {
1904 data += (2 * stride);
1905 } else {
1906 SW(out0, (data - 2));
1907 data += stride;
1908 SW(out1, (data - 2));
1909 data += stride;
1910 }
1911
1912 if (tc0[1] < 0) {
1913 data += (2 * stride);
1914 } else {
1915 SW(out2, (data - 2));
1916 data += stride;
1917 SW(out3, (data - 2));
1918 data += stride;
1919 }
1920
1921 out0 = __msa_copy_u_w(dst1, 0);
1922 out1 = __msa_copy_u_w(dst1, 1);
1923 out2 = __msa_copy_u_w(dst1, 2);
1924 out3 = __msa_copy_u_w(dst1, 3);
1925
1926 if (tc0[2] < 0) {
1927 data += (2 * stride);
1928 } else {
1929 SW(out0, (data - 2));
1930 data += stride;
1931 SW(out1, (data - 2));
1932 data += stride;
1933 }
1934
1935 if (tc0[3] >= 0) {
1936 SW(out2, (data - 2));
1937 data += stride;
1938 SW(out3, (data - 2));
1939 }
1940 }
1941
1942 static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
1943 uint8_t bs0, uint8_t bs1,
1944 uint8_t bs2, uint8_t bs3,
1945 uint8_t tc0, uint8_t tc1,
1946 uint8_t tc2, uint8_t tc3,
1947 uint8_t alpha_in,
1948 uint8_t beta_in,
1949 ptrdiff_t img_width)
1950 {
1951 v16u8 alpha, beta;
1952 v8i16 tmp_vec;
1953 v8i16 bs = { 0 };
1954 v8i16 tc = { 0 };
1955 v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1956 v16u8 is_less_than;
1957 v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1958 v8i16 p0_r, q0_r;
1959 v16u8 p1_org, p0_org, q0_org, q1_org;
1960 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1961 v16i8 negate_tc, sign_negate_tc;
1962 v8i16 tc_r, negate_tc_r;
1963 v16i8 zero = { 0 };
1964
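    /* Chroma: each boundary strength and tc0 value is replicated into one
     * halfword lane, i.e. per 2-pixel segment; only p0/q0 are ever modified. */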
1965 tmp_vec = (v8i16) __msa_fill_b(bs0);
1966 bs = __msa_insve_h(bs, 0, tmp_vec);
1967 tmp_vec = (v8i16) __msa_fill_b(bs1);
1968 bs = __msa_insve_h(bs, 1, tmp_vec);
1969 tmp_vec = (v8i16) __msa_fill_b(bs2);
1970 bs = __msa_insve_h(bs, 2, tmp_vec);
1971 tmp_vec = (v8i16) __msa_fill_b(bs3);
1972 bs = __msa_insve_h(bs, 3, tmp_vec);
1973
1974 if (!__msa_test_bz_v((v16u8) bs)) {
1975 tmp_vec = (v8i16) __msa_fill_b(tc0);
1976 tc = __msa_insve_h(tc, 0, tmp_vec);
1977 tmp_vec = (v8i16) __msa_fill_b(tc1);
1978 tc = __msa_insve_h(tc, 1, tmp_vec);
1979 tmp_vec = (v8i16) __msa_fill_b(tc2);
1980 tc = __msa_insve_h(tc, 2, tmp_vec);
1981 tmp_vec = (v8i16) __msa_fill_b(tc3);
1982 tc = __msa_insve_h(tc, 3, tmp_vec);
1983
1984 is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1985
1986 alpha = (v16u8) __msa_fill_b(alpha_in);
1987 beta = (v16u8) __msa_fill_b(beta_in);
1988
1989 LD_UB4(data - (img_width << 1), img_width,
1990 p1_org, p0_org, q0_org, q1_org);
1991
1992 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1993 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1994 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1995
1996 is_less_than_alpha = (p0_asub_q0 < alpha);
1997 is_less_than_beta = (p1_asub_p0 < beta);
1998 is_less_than = is_less_than_beta & is_less_than_alpha;
1999 is_less_than_beta = (q1_asub_q0 < beta);
2000 is_less_than = is_less_than_beta & is_less_than;
2001 is_less_than = is_less_than & is_bs_greater_than0;
2002
2003 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2004
2005 if (!__msa_test_bz_v(is_less_than)) {
2006 negate_tc = zero - (v16i8) tc;
2007 sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2008
2009 ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
2010
2011 ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2012 p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2013
2014 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2015 tc_r, p0_r, q0_r);
2016
2017 PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2018
2019 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2020 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2021
2022 ST_UB(q0_org, data);
2023 ST_UB(p0_org, (data - img_width));
2024 }
2025 }
2026 }
2027
2028 static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
2029 uint8_t bs0, uint8_t bs1,
2030 uint8_t bs2, uint8_t bs3,
2031 uint8_t tc0, uint8_t tc1,
2032 uint8_t tc2, uint8_t tc3,
2033 uint8_t alpha_in,
2034 uint8_t beta_in,
2035 ptrdiff_t img_width)
2036 {
2037 uint8_t *src;
2038 v16u8 alpha, beta;
2039 v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
2040 v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
2041 v16u8 p0, q0;
2042 v8i16 p0_r = { 0 };
2043 v8i16 q0_r = { 0 };
2044 v16u8 p1_org, p0_org, q0_org, q1_org;
2045 v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
2046 v16u8 is_bs_greater_than0;
2047 v8i16 tc_r, negate_tc_r;
2048 v16i8 negate_tc, sign_negate_tc;
2049 v16i8 zero = { 0 };
2050 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
2051 v8i16 tmp1, tmp_vec, bs = { 0 };
2052 v8i16 tc = { 0 };
2053
2054 tmp_vec = (v8i16) __msa_fill_b(bs0);
2055 bs = __msa_insve_h(bs, 0, tmp_vec);
2056 tmp_vec = (v8i16) __msa_fill_b(bs1);
2057 bs = __msa_insve_h(bs, 1, tmp_vec);
2058 tmp_vec = (v8i16) __msa_fill_b(bs2);
2059 bs = __msa_insve_h(bs, 2, tmp_vec);
2060 tmp_vec = (v8i16) __msa_fill_b(bs3);
2061 bs = __msa_insve_h(bs, 3, tmp_vec);
2062
2063 if (!__msa_test_bz_v((v16u8) bs)) {
2064 tmp_vec = (v8i16) __msa_fill_b(tc0);
2065 tc = __msa_insve_h(tc, 0, tmp_vec);
2066 tmp_vec = (v8i16) __msa_fill_b(tc1);
2067 tc = __msa_insve_h(tc, 1, tmp_vec);
2068 tmp_vec = (v8i16) __msa_fill_b(tc2);
2069 tc = __msa_insve_h(tc, 2, tmp_vec);
2070 tmp_vec = (v8i16) __msa_fill_b(tc3);
2071 tc = __msa_insve_h(tc, 3, tmp_vec);
2072
2073 is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
2074
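        /* Load eight rows starting two pixels left of the edge and transpose
         * them into p1/p0/q0/q1 column vectors. */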
2075 LD_UB8((data - 2), img_width,
2076 row0, row1, row2, row3, row4, row5, row6, row7);
2077
2078 TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
2079 row4, row5, row6, row7,
2080 p1_org, p0_org, q0_org, q1_org);
2081
2082 p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
2083 p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
2084 q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
2085
2086 alpha = (v16u8) __msa_fill_b(alpha_in);
2087 beta = (v16u8) __msa_fill_b(beta_in);
2088
2089 is_less_than_alpha = (p0_asub_q0 < alpha);
2090 is_less_than_beta = (p1_asub_p0 < beta);
2091 is_less_than = is_less_than_beta & is_less_than_alpha;
2092 is_less_than_beta = (q1_asub_q0 < beta);
2093 is_less_than = is_less_than_beta & is_less_than;
2094 is_less_than = is_bs_greater_than0 & is_less_than;
2095
2096 is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2097
2098 if (!__msa_test_bz_v(is_less_than)) {
2099 ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2100 p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2101
2102 negate_tc = zero - (v16i8) tc;
2103 sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2104
2105 ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
2106
2107 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2108 tc_r, p0_r, q0_r);
2109
2110 PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2111
2112 p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2113 q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2114 tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
2115 src = data - 1;
2116 ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
2117 src += 4 * img_width;
2118 ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
2119 }
2120 }
2121 }
2122
2123 static void avc_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t stride,
2124 int32_t alpha_in, int32_t beta_in,
2125 int8_t *tc0)
2126 {
2127 int32_t col, tc_val;
2128 v16u8 alpha, beta, res;
2129
2130 alpha = (v16u8) __msa_fill_b(alpha_in);
2131 beta = (v16u8) __msa_fill_b(beta_in);
2132
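    /* 4:2:2 chroma: the edge is processed in four groups of four rows,
     * skipping groups whose tc0 value disables filtering. */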
2133 for (col = 0; col < 4; col++) {
2134 tc_val = (tc0[col] - 1) + 1;
2135
2136 if (tc_val <= 0) {
2137 src += (4 * stride);
2138 continue;
2139 }
2140
2141 AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2142 ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
2143 src += (4 * stride);
2144 }
2145 }
2146
2147 static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2148 ptrdiff_t stride,
2149 int32_t alpha_in,
2150 int32_t beta_in,
2151 int8_t *tc0)
2152 {
2153 int32_t col, tc_val;
2154 int16_t out0, out1;
2155 v16u8 alpha, beta, res;
2156
2157 alpha = (v16u8) __msa_fill_b(alpha_in);
2158 beta = (v16u8) __msa_fill_b(beta_in);
2159
2160 for (col = 0; col < 4; col++) {
2161 tc_val = (tc0[col] - 1) + 1;
2162
2163 if (tc_val <= 0) {
2164 src += 4 * stride;
2165 continue;
2166 }
2167
2168 AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2169
2170 out0 = __msa_copy_s_h((v8i16) res, 0);
2171 out1 = __msa_copy_s_h((v8i16) res, 1);
2172
2173 SH(out0, (src - 1));
2174 src += stride;
2175 SH(out1, (src - 1));
2176 src += stride;
2177 }
2178 }
2179
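/* Public entry points: a negative tc value marks a segment that must not be
 * filtered, so its boundary-strength flag is cleared before calling the
 * shared kernels. */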
2180 void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2181 int alpha, int beta, int8_t *tc)
2182 {
2183 uint8_t bs0 = 1;
2184 uint8_t bs1 = 1;
2185 uint8_t bs2 = 1;
2186 uint8_t bs3 = 1;
2187
2188 if (tc[0] < 0)
2189 bs0 = 0;
2190 if (tc[1] < 0)
2191 bs1 = 0;
2192 if (tc[2] < 0)
2193 bs2 = 0;
2194 if (tc[3] < 0)
2195 bs3 = 0;
2196
2197 avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2198 tc[0], tc[1], tc[2], tc[3],
2199 alpha, beta, img_width);
2200 }
2201
2202 void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2203 int alpha, int beta, int8_t *tc)
2204 {
2206 uint8_t bs0 = 1;
2207 uint8_t bs1 = 1;
2208 uint8_t bs2 = 1;
2209 uint8_t bs3 = 1;
2210
2211 if (tc[0] < 0)
2212 bs0 = 0;
2213 if (tc[1] < 0)
2214 bs1 = 0;
2215 if (tc[2] < 0)
2216 bs2 = 0;
2217 if (tc[3] < 0)
2218 bs3 = 0;
2219
2220 avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2221 tc[0], tc[1], tc[2], tc[3],
2222 alpha, beta, img_width);
2223 }
2224
2225 void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2226 int alpha, int beta, int8_t *tc)
2227 {
2228 uint8_t bs0 = 1;
2229 uint8_t bs1 = 1;
2230 uint8_t bs2 = 1;
2231 uint8_t bs3 = 1;
2232
2233 if (tc[0] < 0)
2234 bs0 = 0;
2235 if (tc[1] < 0)
2236 bs1 = 0;
2237 if (tc[2] < 0)
2238 bs2 = 0;
2239 if (tc[3] < 0)
2240 bs3 = 0;
2241
2242 avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2243 tc[0], tc[1], tc[2], tc[3],
2244 alpha, beta, img_width);
2245 }
2246
2247 void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2248 int alpha, int beta, int8_t *tc)
2249 {
2250 uint8_t bs0 = 1;
2251 uint8_t bs1 = 1;
2252 uint8_t bs2 = 1;
2253 uint8_t bs3 = 1;
2254
2255 if (tc[0] < 0)
2256 bs0 = 0;
2257 if (tc[1] < 0)
2258 bs1 = 0;
2259 if (tc[2] < 0)
2260 bs2 = 0;
2261 if (tc[3] < 0)
2262 bs3 = 0;
2263
2264 avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2265 tc[0], tc[1], tc[2], tc[3],
2266 alpha, beta, img_width);
2267 }
2268
2269 void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2270 int alpha, int beta)
2271 {
2272 avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
2273 (uint8_t) beta,
2274 img_width);
2275 }
2276
2277 void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2278 int alpha, int beta)
2279 {
2280 avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
2281 (uint8_t) beta,
2282 img_width);
2283 }
2284
2285 void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2286 int alpha, int beta)
2287 {
2288 avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
2289 (uint8_t) beta,
2290 img_width);
2291 }
2292
2293 void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2294 int alpha, int beta)
2295 {
2296 avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
2297 (uint8_t) beta,
2298 img_width);
2299 }
2300
2301 void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
2302 ptrdiff_t ystride,
2303 int32_t alpha, int32_t beta,
2304 int8_t *tc0)
2305 {
2306 avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
2307 }
2308
2309 void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2310 ptrdiff_t ystride,
2311 int32_t alpha,
2312 int32_t beta,
2313 int8_t *tc0)
2314 {
2315 avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
2316 }
2317
2318 void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
2319 ptrdiff_t ystride,
2320 int32_t alpha,
2321 int32_t beta,
2322 int8_t *tc0)
2323 {
2324 avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
2325 }
2326
2327 void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
2328 ptrdiff_t ystride,
2329 int32_t alpha,
2330 int32_t beta)
2331 {
2332 avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
2333 }
2334
2335 void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
2336 int height, int log2_denom,
2337 int weight_src, int offset_in)
2338 {
2339 uint32_t offset_val;
2340 v16i8 zero = { 0 };
2341 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2342 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2343 v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
2344 v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
2345 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2346 v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2347 v8i16 wgt, denom, offset;
2348
2349 offset_val = (unsigned) offset_in << log2_denom;
2350
2351 wgt = __msa_fill_h(weight_src);
2352 offset = __msa_fill_h(offset_val);
2353 denom = __msa_fill_h(log2_denom);
2354
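    /* Weight each pixel, add the pre-scaled offset, shift right with rounding
     * by log2_denom and clamp the result to [0, 255], 16x8 pixels per pass. */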
2355 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2356 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
2357 src2_r, src3_r);
2358 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
2359 src2_l, src3_l);
2360 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
2361 src6_r, src7_r);
2362 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
2363 src6_l, src7_l);
2364 MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
2365 tmp3);
2366 MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
2367 tmp7);
2368 MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
2369 tmp11);
2370 MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2371 tmp14, tmp15);
2372 ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
2373 tmp1, tmp2, tmp3);
2374 ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
2375 tmp5, tmp6, tmp7);
2376 ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
2377 tmp9, tmp10, tmp11);
2378 ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2379 tmp12, tmp13, tmp14, tmp15);
2380 MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2381 MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2382 SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2383 SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2384 SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2385 SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2386 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2387 dst2, dst3);
2388 PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2389 dst5, dst6, dst7);
2390 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2391 src += 8 * stride;
2392
2393 if (16 == height) {
2394 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2395 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
2396 src1_r, src2_r, src3_r);
2397 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
2398 src1_l, src2_l, src3_l);
2399 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
2400 src5_r, src6_r, src7_r);
2401 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
2402 src5_l, src6_l, src7_l);
2403 MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
2404 tmp2, tmp3);
2405 MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
2406 tmp6, tmp7);
2407 MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
2408 tmp10, tmp11);
2409 MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2410 tmp14, tmp15);
2411 ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
2412 tmp0, tmp1, tmp2, tmp3);
2413 ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
2414 tmp4, tmp5, tmp6, tmp7);
2415 ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
2416 tmp8, tmp9, tmp10, tmp11);
2417 ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2418 tmp12, tmp13, tmp14, tmp15);
2419 MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2420 MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2421 SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2422 SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2423 SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2424 SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2425 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2426 dst2, dst3);
2427 PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2428 dst5, dst6, dst7);
2429 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2430 }
2431 }
2432
2433 void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
2434 int height, int log2_denom,
2435 int weight_src, int offset)
2436 {
2437 if (4 == height) {
2438 avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
2439 } else if (8 == height) {
2440 avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
2441 } else {
2442 avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
2443 }
2444 }
2445
2446 void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
2447 int height, int log2_denom,
2448 int weight_src, int offset)
2449 {
2450 if (2 == height) {
2451 avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
2452 } else if (4 == height) {
2453 avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
2454 } else {
2455 avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
2456 }
2457 }
2458
2459 void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
2460 ptrdiff_t stride, int height,
2461 int log2_denom, int weight_dst,
2462 int weight_src, int offset_in)
2463 {
2464 v16i8 src_wgt, dst_wgt, wgt;
2465 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2466 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2467 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2468 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2469 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2470 v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2471 v8i16 denom, offset;
2472
2473 offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
2474 offset_in += (128 * (weight_src + weight_dst));
2475
2476 src_wgt = __msa_fill_b(weight_src);
2477 dst_wgt = __msa_fill_b(weight_dst);
2478 offset = __msa_fill_h(offset_in);
2479 denom = __msa_fill_h(log2_denom + 1);
2480
2481 wgt = __msa_ilvev_b(dst_wgt, src_wgt);
2482
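    /* Pixels are biased into signed range (XOR 0x80); the 128 * (wsrc + wdst)
     * term folded into the offset compensates, so one dot-product-accumulate
     * per vector yields src * wsrc + dst * wdst + offset before the final
     * shift by log2_denom + 1 and clamp to [0, 255]. */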
2483 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2484 src += 8 * stride;
2485 LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2486 XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2487 XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2488 ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
2489 vec6);
2490 ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
2491 vec7);
2492 ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2493 vec12, vec14);
2494 ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2495 vec13, vec15);
2496 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2497 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2498 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2499 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2500 tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2501 tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2502 tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2503 tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2504 tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2505 tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2506 tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2507 tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2508 tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2509 tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2510 tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2511 tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2512 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2513 SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2514 SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2515 SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2516 CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2517 CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2518 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2519 dst2, dst3);
2520 PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2521 dst5, dst6, dst7);
2522 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2523 dst += 8 * stride;
2524
2525 if (16 == height) {
2526 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2527 LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2528 XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2529 XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2530 ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
2531 vec4, vec6);
2532 ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
2533 vec5, vec7);
2534 ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2535 vec12, vec14);
2536 ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2537 vec13, vec15);
2538 tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2539 tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2540 tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2541 tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2542 tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2543 tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2544 tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2545 tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2546 tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2547 tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2548 tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2549 tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2550 tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2551 tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2552 tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2553 tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2554 SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2555 SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2556 SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2557 SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2558 CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2559 CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2560 PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2561 dst2, dst3);
2562 PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2563 dst5, dst6, dst7);
2564 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2565 }
2566 }
2567
2568 void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
2569 ptrdiff_t stride, int height,
2570 int log2_denom, int weight_dst,
2571 int weight_src, int offset)
2572 {
2573 if (4 == height) {
2574 avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2575 offset);
2576 } else if (8 == height) {
2577 avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2578 offset);
2579 } else {
2580 avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2581 offset);
2582 }
2583 }
2584
2585 void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
2586 ptrdiff_t stride, int height,
2587 int log2_denom, int weight_dst,
2588 int weight_src, int offset)
2589 {
2590 if (2 == height) {
2591 avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2592 offset);
2593 } else if (4 == height) {
2594 avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2595 offset);
2596 } else {
2597 avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2598 offset);
2599 }
2600 }
2601