1 /*
2  * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264dsp_mips.h"
23 
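/*
 * The avc_wgt_WxH_msa() kernels below apply H.264 explicit weighted
 * prediction to a block in place.  Per pixel they compute, roughly
 * (scalar sketch only; avc_wgt_ref() is an illustrative name and is not
 * part of this file):
 *
 *     static void avc_wgt_ref(uint8_t *blk, ptrdiff_t stride, int w, int h,
 *                             int log2_denom, int weight, int offset)
 *     {
 *         int round = log2_denom ? 1 << (log2_denom - 1) : 0;
 *         for (int y = 0; y < h; y++, blk += stride)
 *             for (int x = 0; x < w; x++)
 *                 blk[x] = av_clip_uint8((blk[x] * weight + round +
 *                                         (offset << log2_denom)) >> log2_denom);
 *     }
 *
 * The vector versions pre-shift the offset so a single rounding shift
 * (__msa_srlr_h) can follow the multiply-accumulate.
 */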
24 static void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride,
25                             int32_t log2_denom, int32_t src_weight,
26                             int32_t offset_in)
27 {
28     uint32_t tp0, tp1, offset_val;
29     v16u8 zero = { 0 };
30     v16u8 src0 = { 0 };
31     v8i16 src0_r, tmp0, wgt, denom, offset;
32 
33     offset_val = (unsigned) offset_in << log2_denom;
34 
35     wgt = __msa_fill_h(src_weight);
36     offset = __msa_fill_h(offset_val);
37     denom = __msa_fill_h(log2_denom);
38 
39     LW2(data, stride, tp0, tp1);
40     INSERT_W2_UB(tp0, tp1, src0);
41     src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42     tmp0 = wgt * src0_r;
43     tmp0 = __msa_adds_s_h(tmp0, offset);
44     tmp0 = __msa_maxi_s_h(tmp0, 0);
45     tmp0 = __msa_srlr_h(tmp0, denom);
46     tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47     src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
48     ST_W2(src0, 0, 1, data, stride);
49 }
50 
51 static void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride,
52                             int32_t log2_denom, int32_t src_weight,
53                             int32_t offset_in)
54 {
55     uint32_t tp0, tp1, tp2, tp3, offset_val;
56     v16u8 src0 = { 0 };
57     v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
58 
59     offset_val = (unsigned) offset_in << log2_denom;
60 
61     wgt = __msa_fill_h(src_weight);
62     offset = __msa_fill_h(offset_val);
63     denom = __msa_fill_h(log2_denom);
64 
65     LW4(data, stride, tp0, tp1, tp2, tp3);
66     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
67     UNPCK_UB_SH(src0, src0_r, src1_r);
68     MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
69     ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
70     MAXI_SH2_SH(tmp0, tmp1, 0);
71     tmp0 = __msa_srlr_h(tmp0, denom);
72     tmp1 = __msa_srlr_h(tmp1, denom);
73     SAT_UH2_SH(tmp0, tmp1, 7);
74     src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
75     ST_W4(src0, 0, 1, 2, 3, data, stride);
76 }
77 
78 static void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride,
79                             int32_t log2_denom, int32_t src_weight,
80                             int32_t offset_in)
81 {
82     uint32_t tp0, tp1, tp2, tp3, offset_val;
83     v16u8 src0 = { 0 }, src1 = { 0 };
84     v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
85     v8i16 wgt, denom, offset;
86 
87     offset_val = (unsigned) offset_in << log2_denom;
88 
89     wgt = __msa_fill_h(src_weight);
90     offset = __msa_fill_h(offset_val);
91     denom = __msa_fill_h(log2_denom);
92 
93     LW4(data, stride, tp0, tp1, tp2, tp3);
94     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
95     LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
96     INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
97     UNPCK_UB_SH(src0, src0_r, src1_r);
98     UNPCK_UB_SH(src1, src2_r, src3_r);
99     MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
100          tmp3);
101     ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
102                 tmp1, tmp2, tmp3);
103     MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
104     SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
105     SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
106     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
107     ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
108 }
109 
110 static void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride,
111                             int32_t log2_denom, int32_t src_weight,
112                             int32_t offset_in)
113 {
114     uint32_t offset_val;
115     uint64_t tp0, tp1, tp2, tp3;
116     v16u8 src0 = { 0 }, src1 = { 0 };
117     v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
118     v8i16 wgt, denom, offset;
119 
120     offset_val = (unsigned) offset_in << log2_denom;
121 
122     wgt = __msa_fill_h(src_weight);
123     offset = __msa_fill_h(offset_val);
124     denom = __msa_fill_h(log2_denom);
125 
126     LD4(data, stride, tp0, tp1, tp2, tp3);
127     INSERT_D2_UB(tp0, tp1, src0);
128     INSERT_D2_UB(tp2, tp3, src1);
129     UNPCK_UB_SH(src0, src0_r, src1_r);
130     UNPCK_UB_SH(src1, src2_r, src3_r);
131     MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
132          tmp3);
133     ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
134                 tmp1, tmp2, tmp3);
135     MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
136     SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
137     SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
138     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
139     ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
140 }
141 
142 static void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom,
143                             int32_t src_weight, int32_t offset_in)
144 {
145     uint32_t offset_val;
146     uint64_t tp0, tp1, tp2, tp3;
147     v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
148     v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
149     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
150     v8i16 wgt, denom, offset;
151 
152     offset_val = (unsigned) offset_in << log2_denom;
153 
154     wgt = __msa_fill_h(src_weight);
155     offset = __msa_fill_h(offset_val);
156     denom = __msa_fill_h(log2_denom);
157 
158     LD4(data, stride, tp0, tp1, tp2, tp3);
159     INSERT_D2_UB(tp0, tp1, src0);
160     INSERT_D2_UB(tp2, tp3, src1);
161     LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
162     INSERT_D2_UB(tp0, tp1, src2);
163     INSERT_D2_UB(tp2, tp3, src3);
164     UNPCK_UB_SH(src0, src0_r, src1_r);
165     UNPCK_UB_SH(src1, src2_r, src3_r);
166     UNPCK_UB_SH(src2, src4_r, src5_r);
167     UNPCK_UB_SH(src3, src6_r, src7_r);
168     MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
169          tmp3);
170     MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
171          tmp7);
172     ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
173                 tmp1, tmp2, tmp3);
174     ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
175                 tmp5, tmp6, tmp7);
176     MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
177     SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
178     SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
179     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
180                 src2, src3);
181     ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
182 }
183 
184 static void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride,
185                              int32_t log2_denom, int32_t src_weight,
186                              int32_t offset_in)
187 {
188     uint32_t offset_val, cnt;
189     uint64_t tp0, tp1, tp2, tp3;
190     v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
191     v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
192     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
193     v8i16 wgt, denom, offset;
194 
195     offset_val = (unsigned) offset_in << log2_denom;
196 
197     wgt = __msa_fill_h(src_weight);
198     offset = __msa_fill_h(offset_val);
199     denom = __msa_fill_h(log2_denom);
200 
201     for (cnt = 2; cnt--;) {
202         LD4(data, stride, tp0, tp1, tp2, tp3);
203         INSERT_D2_UB(tp0, tp1, src0);
204         INSERT_D2_UB(tp2, tp3, src1);
205         LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
206         INSERT_D2_UB(tp0, tp1, src2);
207         INSERT_D2_UB(tp2, tp3, src3);
208         UNPCK_UB_SH(src0, src0_r, src1_r);
209         UNPCK_UB_SH(src1, src2_r, src3_r);
210         UNPCK_UB_SH(src2, src4_r, src5_r);
211         UNPCK_UB_SH(src3, src6_r, src7_r);
212         MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
213              tmp2, tmp3);
214         MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
215              tmp6, tmp7);
216         ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
217                     tmp0, tmp1, tmp2, tmp3);
218         ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
219                     tmp4, tmp5, tmp6, tmp7);
220         MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
221         SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
222         SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
223         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
224                     src2, src3);
225         ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
226         data += 8 * stride;
227     }
228 }
229 
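/*
 * The avc_biwgt_WxH_msa() kernels implement H.264 bidirectional (explicit)
 * weighted prediction, combining src and dst into dst.  Per pixel, roughly
 * (scalar sketch only; avc_biwgt_ref() is an illustrative name and is not
 * part of this file):
 *
 *     static void avc_biwgt_ref(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
 *                               int w, int h, int log2_denom,
 *                               int wgt_src, int wgt_dst, int offset)
 *     {
 *         int off = ((offset + 1) | 1) << log2_denom;
 *         for (int y = 0; y < h; y++, src += stride, dst += stride)
 *             for (int x = 0; x < w; x++)
 *                 dst[x] = av_clip_uint8((src[x] * wgt_src + dst[x] * wgt_dst +
 *                                         off) >> (log2_denom + 1));
 *     }
 *
 * The vector code first maps the pixels to a signed range with XORI_B*_128
 * (i.e. pixel - 128) so a signed dot product can be used, and folds the
 * compensating 128 * (wgt_src + wgt_dst) term into the offset.
 */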
230 static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
231                               int32_t log2_denom, int32_t src_weight,
232                               int32_t dst_weight, int32_t offset_in)
233 {
234     uint32_t tp0, tp1;
235     v16i8 src_wgt, dst_wgt, wgt, vec0;
236     v16u8 src0 = { 0 }, dst0 = { 0 };
237     v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
238 
239     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
240     offset_in += (128 * (src_weight + dst_weight));
241 
242     src_wgt = __msa_fill_b(src_weight);
243     dst_wgt = __msa_fill_b(dst_weight);
244     offset = __msa_fill_h(offset_in);
245     denom = __msa_fill_h(log2_denom + 1);
246 
247     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
248 
249     LW2(src, stride, tp0, tp1);
250     INSERT_W2_UB(tp0, tp1, src0);
251     LW2(dst, stride, tp0, tp1);
252     INSERT_W2_UB(tp0, tp1, dst0);
253     XORI_B2_128_UB(src0, dst0);
254     vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
255     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
256     tmp0 >>= denom;
257     tmp0 = __msa_maxi_s_h(tmp0, 0);
258     tmp0 = __msa_min_s_h(max255, tmp0);
259     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
260     ST_W2(dst0, 0, 1, dst, stride);
261 }
262 
263 static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
264                               int32_t log2_denom, int32_t src_weight,
265                               int32_t dst_weight, int32_t offset_in)
266 {
267     uint32_t tp0, tp1, tp2, tp3;
268     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
269     v16u8 src0, dst0;
270     v8i16 tmp0, tmp1, denom, offset;
271 
272     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
273     offset_in += (128 * (src_weight + dst_weight));
274 
275     src_wgt = __msa_fill_b(src_weight);
276     dst_wgt = __msa_fill_b(dst_weight);
277     offset = __msa_fill_h(offset_in);
278     denom = __msa_fill_h(log2_denom + 1);
279 
280     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
281 
282     LW4(src, stride, tp0, tp1, tp2, tp3);
283     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
284     LW4(dst, stride, tp0, tp1, tp2, tp3);
285     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
286     XORI_B2_128_UB(src0, dst0);
287     ILVRL_B2_SB(dst0, src0, vec0, vec1);
288     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
289     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
290     tmp0 >>= denom;
291     tmp1 >>= denom;
292     CLIP_SH2_0_255(tmp0, tmp1);
293     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
294     ST_W4(dst0, 0, 1, 2, 3, dst, stride);
295 }
296 
297 static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
298                               int32_t log2_denom, int32_t src_weight,
299                               int32_t dst_weight, int32_t offset_in)
300 {
301     uint32_t tp0, tp1, tp2, tp3;
302     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
303     v16u8 src0, src1, dst0, dst1;
304     v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
305 
306     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
307     offset_in += (128 * (src_weight + dst_weight));
308 
309     src_wgt = __msa_fill_b(src_weight);
310     dst_wgt = __msa_fill_b(dst_weight);
311     offset = __msa_fill_h(offset_in);
312     denom = __msa_fill_h(log2_denom + 1);
313     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
314 
315     LW4(src, stride, tp0, tp1, tp2, tp3);
316     src += 4 * stride;
317     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
318     LW4(src, stride, tp0, tp1, tp2, tp3);
319     INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
320     LW4(dst, stride, tp0, tp1, tp2, tp3);
321     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
322     LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
323     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
324     XORI_B4_128_UB(src0, src1, dst0, dst1);
325     ILVRL_B2_SB(dst0, src0, vec0, vec1);
326     ILVRL_B2_SB(dst1, src1, vec2, vec3);
327     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
328     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
329     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
330     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
331     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
332     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
333     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
334     ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
335 }
336 
337 static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
338                               int32_t log2_denom, int32_t src_weight,
339                               int32_t dst_weight, int32_t offset_in)
340 {
341     uint64_t tp0, tp1, tp2, tp3;
342     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
343     v16u8 src0, src1, dst0, dst1;
344     v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
345 
346     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
347     offset_in += (128 * (src_weight + dst_weight));
348 
349     src_wgt = __msa_fill_b(src_weight);
350     dst_wgt = __msa_fill_b(dst_weight);
351     offset = __msa_fill_h(offset_in);
352     denom = __msa_fill_h(log2_denom + 1);
353 
354     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
355 
356     LD4(src, stride, tp0, tp1, tp2, tp3);
357     INSERT_D2_UB(tp0, tp1, src0);
358     INSERT_D2_UB(tp2, tp3, src1);
359     LD4(dst, stride, tp0, tp1, tp2, tp3);
360     INSERT_D2_UB(tp0, tp1, dst0);
361     INSERT_D2_UB(tp2, tp3, dst1);
362     XORI_B4_128_UB(src0, src1, dst0, dst1);
363     ILVRL_B2_SB(dst0, src0, vec0, vec1);
364     ILVRL_B2_SB(dst1, src1, vec2, vec3);
365     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
366     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
367     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
368     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
369     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
370     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
371     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
372     ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
373 }
374 
375 static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
376                               int32_t log2_denom, int32_t src_weight,
377                               int32_t dst_weight, int32_t offset_in)
378 {
379     uint64_t tp0, tp1, tp2, tp3;
380     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
381     v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
382     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
383 
384     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
385     offset_in += (128 * (src_weight + dst_weight));
386 
387     src_wgt = __msa_fill_b(src_weight);
388     dst_wgt = __msa_fill_b(dst_weight);
389     offset = __msa_fill_h(offset_in);
390     denom = __msa_fill_h(log2_denom + 1);
391     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
392 
393     LD4(src, stride, tp0, tp1, tp2, tp3);
394     INSERT_D2_UB(tp0, tp1, src0);
395     INSERT_D2_UB(tp2, tp3, src1);
396     LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
397     INSERT_D2_UB(tp0, tp1, src2);
398     INSERT_D2_UB(tp2, tp3, src3);
399     LD4(dst, stride, tp0, tp1, tp2, tp3);
400     INSERT_D2_UB(tp0, tp1, dst0);
401     INSERT_D2_UB(tp2, tp3, dst1);
402     LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
403     INSERT_D2_UB(tp0, tp1, dst2);
404     INSERT_D2_UB(tp2, tp3, dst3);
405     XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
406     ILVRL_B2_SB(dst0, src0, vec0, vec1);
407     ILVRL_B2_SB(dst1, src1, vec2, vec3);
408     ILVRL_B2_SB(dst2, src2, vec4, vec5);
409     ILVRL_B2_SB(dst3, src3, vec6, vec7);
410     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
411     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
412     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
413     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
414     tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
415     tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
416     tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
417     tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
418     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
419     SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
420     CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
421     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
422     PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
423     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
424 }
425 
426 static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
427                                int32_t log2_denom, int32_t src_weight,
428                                int32_t dst_weight, int32_t offset_in)
429 {
430     uint8_t cnt;
431     uint64_t tp0, tp1, tp2, tp3;
432     v16i8 src_wgt, dst_wgt, wgt;
433     v16u8 src0, src1, src2, src3;
434     v16u8 dst0, dst1, dst2, dst3;
435     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
436     v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
437     v8i16 denom, offset;
438 
439     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
440     offset_in += (128 * (src_weight + dst_weight));
441 
442     src_wgt = __msa_fill_b(src_weight);
443     dst_wgt = __msa_fill_b(dst_weight);
444     offset = __msa_fill_h(offset_in);
445     denom = __msa_fill_h(log2_denom + 1);
446     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
447 
448     for (cnt = 2; cnt--;) {
449         LD4(src, stride, tp0, tp1, tp2, tp3);
450         src += 4 * stride;
451         INSERT_D2_UB(tp0, tp1, src0);
452         INSERT_D2_UB(tp2, tp3, src1);
453         LD4(src, stride, tp0, tp1, tp2, tp3);
454         src += 4 * stride;
455         INSERT_D2_UB(tp0, tp1, src2);
456         INSERT_D2_UB(tp2, tp3, src3);
457         LD4(dst, stride, tp0, tp1, tp2, tp3);
458         INSERT_D2_UB(tp0, tp1, dst0);
459         INSERT_D2_UB(tp2, tp3, dst1);
460         LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
461         INSERT_D2_UB(tp0, tp1, dst2);
462         INSERT_D2_UB(tp2, tp3, dst3);
463         XORI_B4_128_UB(src0, src1, src2, src3);
464         XORI_B4_128_UB(dst0, dst1, dst2, dst3);
465         ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
466                    vec0, vec2, vec4, vec6);
467         ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
468                    vec1, vec3, vec5, vec7);
469 
470         temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
471         temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
472         temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
473         temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
474         temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
475         temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
476         temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
477         temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
478 
479         SRA_4V(temp0, temp1, temp2, temp3, denom);
480         SRA_4V(temp4, temp5, temp6, temp7, denom);
481         CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
482         PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
483                     dst0, dst1, dst2, dst3);
484         ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
485         dst += 8 * stride;
486     }
487 }
488 
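/* Strong (bS = 4) luma filtering of one edge side; with p3..q1 as inputs it
 * computes (the same macro is reused for the q side with p and q swapped):
 *     p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
 *     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *     p2' = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3
 */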
489 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
490                                  q3_or_p3_org_in, p1_or_q1_org_in,          \
491                                  p2_or_q2_org_in, q1_or_p1_org_in,          \
492                                  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
493 {                                                                           \
494     v8i16 threshold;                                                        \
495     v8i16 const3 = __msa_ldi_h(3);                                          \
496                                                                             \
497     threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \
498     threshold += (p1_or_q1_org_in);                                         \
499                                                                             \
500     (p0_or_q0_out) = threshold << 1;                                        \
501     (p0_or_q0_out) += (p2_or_q2_org_in);                                    \
502     (p0_or_q0_out) += (q1_or_p1_org_in);                                    \
503     (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
504                                                                             \
505     (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \
506     (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
507                                                                             \
508     (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \
509     (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
510     (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
511     (p2_or_q2_out) += threshold;                                            \
512     (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
513 }
514 
515 /* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
516 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \
517                          p1_or_q1_org_in, p0_or_q0_out)      \
518 {                                                            \
519     (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \
520     (p0_or_q0_out) += (p1_or_q1_org_in);                     \
521     (p0_or_q0_out) += (p1_or_q1_org_in);                     \
522     (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \
523 }
524 
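/* p1' = p1 + clip3(-tc, tc, (p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1) */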
525 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,    \
526                          p1_or_q1_org_in, p2_or_q2_org_in,    \
527                          negate_tc_in, tc_in, p1_or_q1_out)   \
528 {                                                             \
529     v8i16 clip3, temp;                                        \
530                                                               \
531     clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,   \
532                                    (v8u16) q0_or_p0_org_in);  \
533     temp = p1_or_q1_org_in << 1;                              \
534     clip3 = clip3 - temp;                                     \
535     clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \
536     CLIP_SH(clip3, negate_tc_in, tc_in);                      \
537     p1_or_q1_out = p1_or_q1_org_in + clip3;                   \
538 }
539 
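/* delta = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3)
 * p0'   = clip_uint8(p0 + delta)
 * q0'   = clip_uint8(q0 - delta)
 */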
540 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
541                      p1_or_q1_org_in, q1_or_p1_org_in,          \
542                      negate_threshold_in, threshold_in,         \
543                      p0_or_q0_out, q0_or_p0_out)                \
544 {                                                               \
545     v8i16 q0_sub_p0, p1_sub_q1, delta;                          \
546                                                                 \
547     q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;              \
548     p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;              \
549     q0_sub_p0 <<= 2;                                            \
550     p1_sub_q1 += 4;                                             \
551     delta = q0_sub_p0 + p1_sub_q1;                              \
552     delta >>= 3;                                                \
553                                                                 \
554     CLIP_SH(delta, negate_threshold_in, threshold_in);          \
555                                                                 \
556     p0_or_q0_out = p0_or_q0_org_in + delta;                     \
557     q0_or_p0_out = q0_or_p0_org_in - delta;                     \
558                                                                 \
559     CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \
560 }
561 
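/* Filters one vertical chroma edge over four rows: loads the 2 + 2 pixels
 * around the edge, transposes them so src0..src3 hold the p1, p0, q0 and q1
 * columns, applies the normal-strength p0/q0 update where the alpha/beta
 * conditions hold, and returns the filtered p0/q0 pairs interleaved in
 * 'res'. */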
562 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \
563 {                                                                        \
564     uint32_t load0, load1, load2, load3;                                 \
565     v16u8 src0 = { 0 };                                                  \
566     v16u8 src1 = { 0 };                                                  \
567     v16u8 src2 = { 0 };                                                  \
568     v16u8 src3 = { 0 };                                                  \
569     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                            \
570     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;           \
571     v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                               \
572     v8i16 res0_r, res1_r;                                                \
573     v16i8 zeros = { 0 };                                                 \
574     v16u8 res0, res1;                                                    \
575                                                                          \
576     LW4((src - 2), stride, load0, load1, load2, load3);                  \
577     src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);               \
578     src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);               \
579     src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);               \
580     src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);               \
581                                                                          \
582     TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);  \
583                                                                          \
584     p0_asub_q0 = __msa_asub_u_b(src2, src1);                             \
585     p1_asub_p0 = __msa_asub_u_b(src1, src0);                             \
586     q1_asub_q0 = __msa_asub_u_b(src2, src3);                             \
587                                                                          \
588     tc = __msa_fill_h(tc_val);                                           \
589                                                                          \
590     is_less_than_alpha = (p0_asub_q0 < alpha);                           \
591     is_less_than_beta = (p1_asub_p0 < beta);                             \
592     is_less_than = is_less_than_alpha & is_less_than_beta;               \
593     is_less_than_beta = (q1_asub_q0 < beta);                             \
594     is_less_than = is_less_than_beta & is_less_than;                     \
595                                                                          \
596     ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);            \
597     HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);             \
598                                                                          \
599     q0_sub_p0 <<= 2;                                                     \
600     delta = q0_sub_p0 + p1_sub_q1;                                       \
601     delta = __msa_srari_h(delta, 3);                                     \
602                                                                          \
603     CLIP_SH(delta, -tc, tc);                                             \
604                                                                          \
605     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
606                                                                          \
607     res0_r += delta;                                                     \
608     res1_r -= delta;                                                     \
609                                                                          \
610     CLIP_SH2_0_255(res0_r, res1_r);                                      \
611     PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);             \
612                                                                          \
613     res0 = __msa_bmnz_v(src1, res0, is_less_than);                       \
614     res1 = __msa_bmnz_v(src2, res1, is_less_than);                       \
615                                                                          \
616     res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);              \
617 }
618 
619 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \
620 {                                                            \
621     v16i8 zero_m = { 0 };                                    \
622                                                              \
623     out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
624     out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);    \
625     SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3);   \
626 }
627 
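/* Two-row variant of AVC_LPF_H_CHROMA_422 above. */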
628 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
629 {                                                                          \
630     uint32_t load0, load1;                                                 \
631     v16u8 src0 = { 0 };                                                    \
632     v16u8 src1 = { 0 };                                                    \
633     v16u8 src2 = { 0 };                                                    \
634     v16u8 src3 = { 0 };                                                    \
635     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \
636     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \
637     v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \
638     v16i8 zeros = { 0 };                                                   \
639     v16u8 res0, res1;                                                      \
640                                                                            \
641     load0 = LW(src - 2);                                                   \
642     load1 = LW(src - 2 + stride);                                          \
643                                                                            \
644     src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \
645     src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \
646                                                                            \
647     TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \
648                                                                            \
649     p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \
650     p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \
651     q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \
652                                                                            \
653     tc = __msa_fill_h(tc_val);                                             \
654                                                                            \
655     is_less_than_alpha = (p0_asub_q0 < alpha);                             \
656     is_less_than_beta = (p1_asub_p0 < beta);                               \
657     is_less_than = is_less_than_alpha & is_less_than_beta;                 \
658     is_less_than_beta = (q1_asub_q0 < beta);                               \
659     is_less_than = is_less_than_beta & is_less_than;                       \
660                                                                            \
661     ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \
662     HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \
663                                                                            \
664     q0_sub_p0 <<= 2;                                                       \
665     delta = q0_sub_p0 + p1_sub_q1;                                         \
666     delta = __msa_srari_h(delta, 3);                                       \
667     CLIP_SH(delta, -tc, tc);                                               \
668                                                                            \
669     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
670                                                                            \
671     res0_r += delta;                                                       \
672     res1_r -= delta;                                                       \
673                                                                            \
674     CLIP_SH2_0_255(res0_r, res1_r);                                        \
675     PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \
676                                                                            \
677     res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \
678     res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \
679                                                                            \
680     res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \
681 }
682 
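/* Strong (intra, bS = 4) luma deblocking of a horizontal edge, 16 pixels
 * wide.  Where |p0 - q0| < alpha, |p1 - p0| < beta and |q1 - q0| < beta,
 * p0/q0 are filtered; the full p0..p2 (resp. q0..q2) update is additionally
 * gated on |p0 - q0| < (alpha >> 2) + 2 and |p2 - p0| < beta
 * (resp. |q2 - q0| < beta). */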
683 static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
684                                                    uint8_t alpha_in,
685                                                    uint8_t beta_in,
686                                                    ptrdiff_t img_width)
687 {
688     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
689     v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
690     v16u8 p1_org, p0_org, q0_org, q1_org;
691 
692     LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
693 
694     p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
695     p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
696     q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
697 
698     is_less_than_alpha = (p0_asub_q0 < alpha_in);
699     is_less_than_beta = (p1_asub_p0 < beta_in);
700     is_less_than = is_less_than_beta & is_less_than_alpha;
701     is_less_than_beta = (q1_asub_q0 < beta_in);
702     is_less_than = is_less_than_beta & is_less_than;
703 
704     if (!__msa_test_bz_v(is_less_than)) {
705         v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
706         v8i16 p0_r = { 0 };
707         v8i16 q0_r = { 0 };
708         v8i16 p0_l = { 0 };
709         v8i16 q0_l = { 0 };
710         v16i8 zero = { 0 };
711         v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
712         v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
713         v16u8 q2_org = LD_UB(data + (2 * img_width));
714         v16u8 p2_org = LD_UB(data - (3 * img_width));
715         v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
716 
717         UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
718         UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
719         UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
720 
721         tmp_flag = (p0_asub_q0 < tmp_flag);
722 
723         p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
724         is_less_than_beta = (p2_asub_p0 < beta_in);
725         is_less_than_beta = is_less_than_beta & tmp_flag;
726         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
727         is_less_than_beta = is_less_than_beta & is_less_than;
728         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
729 
730         q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
731         q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
732 
733         /* combine and store */
734         if (!__msa_test_bz_v(is_less_than_beta)) {
735             v8i16 p3_org_l, p3_org_r;
736             v16u8 p3_org = LD_UB(data - (img_width << 2));
737             v16u8 p2, p1;
738             v8i16 p2_r = { 0 };
739             v8i16 p2_l = { 0 };
740             v8i16 p1_r = { 0 };
741             v8i16 p1_l = { 0 };
742 
743             ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
744             AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
745                                      p2_r, q1_org_r, p0_r, p1_r, p2_r);
746 
747             ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
748             AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
749                                      p2_l, q1_org_l, p0_l, p1_l, p2_l);
750 
751             PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
752 
753             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
754             p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
755             p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
756 
757             ST_UB(p1_org, data - (2 * img_width));
758             ST_UB(p2_org, data - (3 * img_width));
759         }
760 
761         AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
762         AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
763 
764         /* combine */
765         p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
766         p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
767 
768         ST_UB(p0_org, data - img_width);
769 
770         /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
771         q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
772         is_less_than_beta = (q2_asub_q0 < beta_in);
773         is_less_than_beta = is_less_than_beta & tmp_flag;
774         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
775         is_less_than_beta = is_less_than_beta & is_less_than;
776         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
777 
778         /* combine and store */
779         if (!__msa_test_bz_v(is_less_than_beta)) {
780             v8i16 q3_org_r, q3_org_l;
781             v16u8 q3_org = LD_UB(data + (3 * img_width));
782             v16u8 q1, q2;
783             v8i16 q2_r = { 0 };
784             v8i16 q2_l = { 0 };
785             v8i16 q1_r = { 0 };
786             v8i16 q1_l = { 0 };
787 
788             ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
789             AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
790                                      q2_r, p1_org_r, q0_r, q1_r, q2_r);
791 
792             ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
793             AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
794                                      q2_l, p1_org_l, q0_l, q1_l, q2_l);
795 
796             PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
797             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
798             q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
799             q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
800 
801             ST_UB(q1_org, data + img_width);
802             ST_UB(q2_org, data + 2 * img_width);
803         }
804 
805         AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
806         AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
807 
808         /* combine */
809         q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
810         q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
811 
812         ST_UB(q0_org, data);
813     }
814 }
815 
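/* Vertical-edge counterpart of the function above: 16 rows around the edge
 * are loaded and transposed into p3..q3 vectors, filtered with the same
 * strong-filter logic, then transposed back and stored as 6 bytes (p2..q2)
 * per row. */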
816 static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
817                                                    uint8_t alpha_in,
818                                                    uint8_t beta_in,
819                                                    ptrdiff_t img_width)
820 {
821     uint8_t *src = data - 4;
822     v16u8 alpha, beta, p0_asub_q0;
823     v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
824     v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
825     v16u8 p1_asub_p0, q1_asub_q0;
826 
827 
828     {
829         v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
830         v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
831 
832         LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
833         LD_UB8(src + (8 * img_width), img_width,
834                row8, row9, row10, row11, row12, row13, row14, row15);
835 
836         TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
837                             row4, row5, row6, row7,
838                             row8, row9, row10, row11,
839                             row12, row13, row14, row15,
840                             p3_org, p2_org, p1_org, p0_org,
841                             q0_org, q1_org, q2_org, q3_org);
842     }
843 
844     p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
845     p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
846     q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
847 
848     alpha = (v16u8) __msa_fill_b(alpha_in);
849     beta = (v16u8) __msa_fill_b(beta_in);
850 
851     is_less_than_alpha = (p0_asub_q0 < alpha);
852     is_less_than_beta = (p1_asub_p0 < beta);
853     is_less_than = is_less_than_beta & is_less_than_alpha;
854     is_less_than_beta = (q1_asub_q0 < beta);
855     is_less_than = is_less_than_beta & is_less_than;
856 
857     if (!__msa_test_bz_v(is_less_than)) {
858         v8i16 p0_r = { 0 };
859         v8i16 q0_r = { 0 };
860         v8i16 p0_l = { 0 };
861         v8i16 q0_l = { 0 };
862         v16i8 zero = { 0 };
863         v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
864         v16u8 negate_is_less_than_beta;
865         v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
866         v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
867 
868         UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
869         UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
870         UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
871         UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
872 
873         tmp_flag = alpha >> 2;
874         tmp_flag = tmp_flag + 2;
875         tmp_flag = (p0_asub_q0 < tmp_flag);
876 
877         p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
878         is_less_than_beta = (p2_asub_p0 < beta);
879         is_less_than_beta = tmp_flag & is_less_than_beta;
880         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
881         is_less_than_beta = is_less_than_beta & is_less_than;
882         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
883 
884         if (!__msa_test_bz_v(is_less_than_beta)) {
885             v16u8 p2, p1;
886             v8i16 p3_org_r, p3_org_l;
887             v8i16 p2_l = { 0 };
888             v8i16 p2_r = { 0 };
889             v8i16 p1_l = { 0 };
890             v8i16 p1_r = { 0 };
891 
892             ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
893             AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
894                                      p2_r, q1_org_r, p0_r, p1_r, p2_r);
895 
896             ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
897             AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
898                                          p2_l, q1_org_l, p0_l, p1_l, p2_l);
899 
900             PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
901             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
902             p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
903             p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
904         }
905 
906         AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
907         AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
908 
909         p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
910         p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
911 
912         q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
913         is_less_than_beta = (q2_asub_q0 < beta);
914 
915         is_less_than_beta = is_less_than_beta & tmp_flag;
916         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
917 
918         is_less_than_beta = is_less_than_beta & is_less_than;
919         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
920 
921         if (!__msa_test_bz_v(is_less_than_beta)) {
922             v16u8 q1, q2;
923             v8i16 q3_org_r, q3_org_l;
924             v8i16 q1_l = { 0 };
925             v8i16 q1_r = { 0 };
926             v8i16 q2_l = { 0 };
927             v8i16 q2_r = { 0 };
928 
929             ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
930             AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
931                                      q2_r, p1_org_r, q0_r, q1_r, q2_r);
932 
933             ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
934             AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
935                                      q2_l, p1_org_l, q0_l, q1_l, q2_l);
936 
937             PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
938             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
939             q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
940             q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
941         }
942 
943         AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
944         AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
945 
946         q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
947         q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
948 
949     {
950         v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
951 
952         ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
953         ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
954         ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
955 
956         ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
957         ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
958 
959         src = data - 3;
960         ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
961         ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
962         src += 4 * img_width;
963         ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
964         ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
965         src += 4 * img_width;
966 
967         ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
968         ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
969         src += 4 * img_width;
970         ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
971         ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
972     }
973     }
974 }
975 
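/* MBAFF intra variant of the strong vertical-edge luma filter: it processes
 * 8 rows around the edge and stores 6 filtered bytes per row. */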
976 static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
977                                                    ptrdiff_t stride,
978                                                    int32_t alpha_in,
979                                                    int32_t beta_in)
980 {
981     uint64_t load0, load1;
982     uint32_t out0, out2;
983     uint16_t out1, out3;
984     v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
985     v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
986     v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
987     v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
988     v8i16 tmp0, tmp1, tmp2, tmp3;
989     v16u8 alpha, beta;
990     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
991     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
992     v16u8 is_less_than_beta1, is_less_than_beta2;
993     v16i8 src0 = { 0 };
994     v16i8 src1 = { 0 };
995     v16i8 src2 = { 0 };
996     v16i8 src3 = { 0 };
997     v16i8 src4 = { 0 };
998     v16i8 src5 = { 0 };
999     v16i8 src6 = { 0 };
1000     v16i8 src7 = { 0 };
1001     v16i8 zeros = { 0 };
1002 
1003     load0 = LD(src - 4);
1004     load1 = LD(src + stride - 4);
1005     src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1006     src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1007 
1008     load0 = LD(src + (2 * stride) - 4);
1009     load1 = LD(src + (3 * stride) - 4);
1010     src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1011     src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1012 
1013     load0 = LD(src + (4 * stride) - 4);
1014     load1 = LD(src + (5 * stride) - 4);
1015     src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1016     src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1017 
1018     load0 = LD(src + (6 * stride) - 4);
1019     load1 = LD(src + (7 * stride) - 4);
1020     src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1021     src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1022 
1023     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
1024                src0, src1, src2, src3);
1025 
1026     ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
1027     ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
1028 
1029     ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1030     ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
1031     SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
1032                8, src0, src2, src4, src7);
1033 
1034     p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1035     p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1036     q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1037 
1038     alpha = (v16u8) __msa_fill_b(alpha_in);
1039     beta = (v16u8) __msa_fill_b(beta_in);
1040 
1041     is_less_than_alpha = (p0_asub_q0 < alpha);
1042     is_less_than_beta = (p1_asub_p0 < beta);
1043     is_less_than = is_less_than_alpha & is_less_than_beta;
1044     is_less_than_beta = (q1_asub_q0 < beta);
1045     is_less_than = is_less_than & is_less_than_beta;
1046 
1047     alpha >>= 2;
1048     alpha += 2;
1049 
1050     is_less_than_alpha = (p0_asub_q0 < alpha);
1051 
1052     p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1053     is_less_than_beta1 = (p2_asub_p0 < beta);
1054     q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1055     is_less_than_beta2 = (q2_asub_q0 < beta);
1056 
1057     ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
1058                src0_r, src1_r, src2_r, src3_r);
1059     ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1060                src4_r, src5_r, src6_r, src7_r);
1061 
1062     dst2_x_r = src1_r + src2_r + src3_r;
1063     dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1064     dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1065     dst1_r = src0_r + src1_r + src2_r + src3_r;
1066     dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1067 
1068     dst0_r = (2 * src6_r) + (3 * src0_r);
1069     dst0_r += src1_r + src2_r + src3_r;
1070     dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1071     dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1072     dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1073 
1074     PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1075     dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1076 
1077     dst3_x_r = src2_r + src3_r + src4_r;
1078     dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1079     dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1080     dst4_r = src2_r + src3_r + src4_r + src5_r;
1081     dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1082 
1083     dst5_r = (2 * src7_r) + (3 * src5_r);
1084     dst5_r += src4_r + src3_r + src2_r;
1085     dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1086     dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1087     dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1088 
1089     PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1090     dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1091 
1092     dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1093     dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1094     dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1095     dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1096 
1097     PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1098 
1099     dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1100     dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1101     dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1102     dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1103 
1104     is_less_than = is_less_than_alpha & is_less_than;
1105     dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1106     is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1107     dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1108 
1109     dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1110     dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1111     dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1112     is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1113     dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1114     dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1115     dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1116 
1117     ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1118     dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1119     ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
1120     ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
1121 
1122     ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1123     SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
1124     dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1125     dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1126     SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
1127 
1128     out0 = __msa_copy_u_w((v4i32) dst0, 0);
1129     out1 = __msa_copy_u_h((v8i16) dst0, 2);
1130     out2 = __msa_copy_u_w((v4i32) dst1, 0);
1131     out3 = __msa_copy_u_h((v8i16) dst1, 2);
1132 
1133     SW(out0, (src - 3));
1134     SH(out1, (src + 1));
1135     src += stride;
1136     SW(out2, (src - 3));
1137     SH(out3, (src + 1));
1138     src += stride;
1139 
1140     out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1141     out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1142     out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1143     out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1144 
1145     SW(out0, (src - 3));
1146     SH(out1, (src + 1));
1147     src += stride;
1148     SW(out2, (src - 3));
1149     SH(out3, (src + 1));
1150     src += stride;
1151 
1152     out0 = __msa_copy_u_w((v4i32) dst4, 0);
1153     out1 = __msa_copy_u_h((v8i16) dst4, 2);
1154     out2 = __msa_copy_u_w((v4i32) dst5, 0);
1155     out3 = __msa_copy_u_h((v8i16) dst5, 2);
1156 
1157     SW(out0, (src - 3));
1158     SH(out1, (src + 1));
1159     src += stride;
1160     SW(out2, (src - 3));
1161     SH(out3, (src + 1));
1162     src += stride;
1163 
1164     out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1165     out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1166     out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1167     out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1168 
1169     SW(out0, (src - 3));
1170     SH(out1, (src + 1));
1171     src += stride;
1172     SW(out2, (src - 3));
1173     SH(out3, (src + 1));
1174 }
1175 
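/* Intra chroma deblocking across a horizontal edge: load the p1/p0/q0/q1
 * rows, build the |p0-q0| < alpha and |p1-p0|, |q1-q0| < beta masks, and
 * overwrite p0/q0 with the filtered values only where all three hold. */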
1176 static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
1177                                                        uint8_t alpha_in,
1178                                                        uint8_t beta_in,
1179                                                        ptrdiff_t img_width)
1180 {
1181     v16u8 alpha, beta;
1182     v16u8 is_less_than;
1183     v8i16 p0_or_q0, q0_or_p0;
1184     v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1185     v16i8 zero = { 0 };
1186     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1187     v16u8 is_less_than_alpha, is_less_than_beta;
1188     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1189 
1190     alpha = (v16u8) __msa_fill_b(alpha_in);
1191     beta = (v16u8) __msa_fill_b(beta_in);
1192 
1193     LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1194            p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1195 
1196     p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1197     p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1198     q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1199 
1200     is_less_than_alpha = (p0_asub_q0 < alpha);
1201     is_less_than_beta = (p1_asub_p0 < beta);
1202     is_less_than = is_less_than_beta & is_less_than_alpha;
1203     is_less_than_beta = (q1_asub_q0 < beta);
1204     is_less_than = is_less_than_beta & is_less_than;
1205 
1206     is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1207 
1208     if (!__msa_test_bz_v(is_less_than)) {
1209         ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1210                    zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1211         AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1212         AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1213         PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1214 
1215         p0_or_q0_org =
1216             __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1217         q0_or_p0_org =
1218             __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1219 
1220         ST_UB(q0_or_p0_org, data_cb_or_cr);
1221         ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1222     }
1223 }
1224 
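/* Intra chroma deblocking across a vertical edge: load eight rows around
 * the edge, transpose them into p1/p0/q0/q1 columns, filter p0/q0 under the
 * alpha/beta masks and store the interleaved pair back two bytes per row. */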
1225 static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
1226                                                        uint8_t alpha_in,
1227                                                        uint8_t beta_in,
1228                                                        ptrdiff_t img_width)
1229 {
1230     v8i16 tmp1;
1231     v16u8 alpha, beta, is_less_than;
1232     v8i16 p0_or_q0, q0_or_p0;
1233     v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1234     v16i8 zero = { 0 };
1235     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1236     v16u8 is_less_than_alpha, is_less_than_beta;
1237     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1238 
1239     {
1240         v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1241 
1242         LD_UB8((data_cb_or_cr - 2), img_width,
1243                row0, row1, row2, row3, row4, row5, row6, row7);
1244 
1245         TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1246                            p1_or_q1_org, p0_or_q0_org,
1247                            q0_or_p0_org, q1_or_p1_org);
1248     }
1249 
1250     alpha = (v16u8) __msa_fill_b(alpha_in);
1251     beta = (v16u8) __msa_fill_b(beta_in);
1252 
1253     p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1254     p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1255     q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1256 
1257     is_less_than_alpha = (p0_asub_q0 < alpha);
1258     is_less_than_beta = (p1_asub_p0 < beta);
1259     is_less_than = is_less_than_beta & is_less_than_alpha;
1260     is_less_than_beta = (q1_asub_q0 < beta);
1261     is_less_than = is_less_than_beta & is_less_than;
1262     is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1263 
1264     if (!__msa_test_bz_v(is_less_than)) {
1265         ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1266                    zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1267 
1268         AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1269         AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1270 
1271         /* convert 16-bit output into 8-bit output */
1272         PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1273 
1274         p0_or_q0_org =
1275             __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1276         q0_or_p0_org =
1277             __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1278         tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1279 
1280         data_cb_or_cr -= 1;
1281         ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
1282         data_cb_or_cr += 4 * img_width;
1283         ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
1284     }
1285 }
1286 
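/* Inter luma deblocking across a vertical edge.  The 16x8 neighbourhood is
 * transposed into p2..q2 vectors, tc is widened to 16 bits and bumped by one
 * for each of the |p2-p0| < beta and |q2-q0| < beta conditions, the clipped
 * delta is applied to p0/q0, the optional p1/q1 updates are masked in, and
 * the block is transposed back and stored around the edge. */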
1287 static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t* pPix, uint32_t iStride,
1288                                                    uint8_t iAlpha, uint8_t iBeta,
1289                                                    uint8_t* pTc)
1290 {
1291     v16u8 p0, p1, p2, q0, q1, q2;
1292     v16i8 iTc, negiTc, negTc, flags, f;
1293     v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r;
1294     v8i16 tc_l, tc_r, negTc_l, negTc_r;
1295     v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r;
1296     // Temporary variables
1297     v8i16 t0, t1, t2, t3;
1298     v16u8 alpha, beta;
1299     v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
1300     v16i8 const_1_b = __msa_ldi_b(1);
1301     v8i16 const_1_h = __msa_ldi_h(1);
1302     v8i16 const_4_h = __msa_ldi_h(4);
1303     v8i16 const_not_255_h = __msa_ldi_h(~255);
1304     v16i8 zero = { 0 };
1305     v16i8 tc = { pTc[0  >> 2], pTc[1  >> 2], pTc[2  >> 2], pTc[3  >> 2],
1306                  pTc[4  >> 2], pTc[5  >> 2], pTc[6  >> 2], pTc[7  >> 2],
1307                  pTc[8  >> 2], pTc[9  >> 2], pTc[10 >> 2], pTc[11 >> 2],
1308                  pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] };
1309     negTc = zero - tc;
1310     iTc = tc;
1311 
1312     // Load data from pPix
1313     LD_SH8(pPix - 3, iStride, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r);
1314     LD_SH8(pPix + 8 * iStride - 3, iStride, p0_l, p0_r, p1_l, p1_r,
1315            p2_l, p2_r, q0_l, q0_r);
1316     TRANSPOSE16x8_UB_UB(t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r,
1317                         p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r,
1318                         p2, p1, p0, q0, q1, q2, alpha, beta);
1319 
1320     alpha = (v16u8)__msa_fill_b(iAlpha);
1321     beta  = (v16u8)__msa_fill_b(iBeta);
1322 
1323     bDetaP0Q0 = __msa_asub_u_b(p0, q0);
1324     bDetaP1P0 = __msa_asub_u_b(p1, p0);
1325     bDetaQ1Q0 = __msa_asub_u_b(q1, q0);
1326     bDetaP2P0 = __msa_asub_u_b(p2, p0);
1327     bDetaQ2Q0 = __msa_asub_u_b(q2, q0);
1328     bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
1329     bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
1330     bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
1331     bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
1332     bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);
1333 
1334     // Zero-extend p0, p1, p2, q0, q1, q2 from 8 bits to 16 bits
1335     ILVRL_B2_SH(zero, p0, p0_r, p0_l);
1336     ILVRL_B2_SH(zero, p1, p1_r, p1_l);
1337     ILVRL_B2_SH(zero, p2, p2_r, p2_l);
1338     ILVRL_B2_SH(zero, q0, q0_r, q0_l);
1339     ILVRL_B2_SH(zero, q1, q1_r, q1_l);
1340     ILVRL_B2_SH(zero, q2, q2_r, q2_l);
1341     // Sign-extend tc, negTc from 8 bits to 16 bits
1342     flags = __msa_clt_s_b(tc, zero);
1343     ILVRL_B2(v8i16, flags, tc, tc_r, tc_l);
1344     flags = __msa_clt_s_b(negTc, zero);
1345     ILVRL_B2(v8i16, flags, negTc, negTc_r, negTc_l);
1346 
1347     f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;
1348     flags = f & (v16i8)bDetaP2P0;
1349     flags = __msa_ceq_b(flags, zero);
1350     iTc += ((~flags) & const_1_b);
1351     flags = f & (v16i8)bDetaQ2Q0;
1352     flags = __msa_ceq_b(flags, zero);
1353     iTc += ((~flags) & const_1_b);
1354     negiTc = zero - iTc;
1355     // Sign-extend iTc, negiTc from 8 bits to 16 bits
1356     flags = __msa_clt_s_b(iTc, zero);
1357     ILVRL_B2(v8i16, flags, iTc, iTc_r, iTc_l);
1358     flags = __msa_clt_s_b(negiTc, zero);
1359     ILVRL_B2(v8i16, flags, negiTc, negiTc_r, negiTc_l);
1360 
1361     // Calculate the left part
1362     // p1
1363     t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1;
1364     t0 = __msa_max_s_h(negTc_l, t0);
1365     t0 = __msa_min_s_h(tc_l, t0);
1366     t1 = p1_l + t0;
1367     // q1
1368     t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1;
1369     t0 = __msa_max_s_h(negTc_l, t0);
1370     t0 = __msa_min_s_h(tc_l, t0);
1371     t2 = q1_l + t0;
1372     // iDeta
1373     t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3;
1374     t0 = __msa_max_s_h(negiTc_l, t0);
1375     t0 = __msa_min_s_h(iTc_l, t0);
1376     p1_l = t1;
1377     q1_l = t2;
1378     // p0
1379     t1 = p0_l + t0;
1380     t2 = t1 & const_not_255_h;
1381     t3 = __msa_cle_s_h((v8i16)zero, t1);
1382     flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1383     p0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1384     // q0
1385     t1 = q0_l - t0;
1386     t2 = t1 & const_not_255_h;
1387     t3 = __msa_cle_s_h((v8i16)zero, t1);
1388     flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1389     q0_l = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1390 
1391     // Calculate the right part
1392     // p1
1393     t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1;
1394     t0 = __msa_max_s_h(negTc_r, t0);
1395     t0 = __msa_min_s_h(tc_r, t0);
1396     t1 = p1_r + t0;
1397     // q1
1398     t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1;
1399     t0 = __msa_max_s_h(negTc_r, t0);
1400     t0 = __msa_min_s_h(tc_r, t0);
1401     t2 = q1_r + t0;
1402     // iDeta
1403     t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3;
1404     t0 = __msa_max_s_h(negiTc_r, t0);
1405     t0 = __msa_min_s_h(iTc_r, t0);
1406     p1_r = t1;
1407     q1_r = t2;
1408     // p0
1409     t1 = p0_r + t0;
1410     t2 = t1 & const_not_255_h;
1411     t3 = __msa_cle_s_h((v8i16)zero, t1);
1412     flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1413     p0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1414     // q0
1415     t1 = q0_r - t0;
1416     t2 = t1 & const_not_255_h;
1417     t3 = __msa_cle_s_h((v8i16)zero, t1);
1418     flags = (v16i8)__msa_ceq_h(t2, (v8i16)zero);
1419     q0_r = (t1 & (v8i16)flags) + (t3 & (v8i16)(~flags));
1420 
1421     // Combine the filtered left and right halves
1422     PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r,
1423              t0, t1, t2, t3);
1424     flags = (v16i8)__msa_cle_s_b(zero, tc);
1425     flags &= f;
1426     p0 = (v16u8)(((v16i8)t1 & flags) + (p0 & (~flags)));
1427     q0 = (v16u8)(((v16i8)t2 & flags) + (q0 & (~flags)));
1428     // Using t1, t2 as temporary flags
1429     t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero))));
1430     p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1));
1431     t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero))));
1432     q1 = (v16u8)(t3 & t2) + (q1 & (v16u8)(~t2));
1433 
1434     ILVRL_B2_SH(p0, p1, t0, t1);
1435     ILVRL_B2_SH(q1, q0, t2, t3);
1436     ILVRL_H2_UB(t2, t0, p1, p0);
1437     ILVRL_H2_UB(t3, t1, q0, q1);
1438     // Store data to pPix
1439     ST_W8(p1, p0, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 2, iStride);
1440     ST_W8(q0, q1, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 2, iStride);
1441 }
1442 
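/* Inter luma deblocking across a horizontal edge: bs0..bs3 and tc0..tc3 are
 * replicated into per-pixel vectors; the whole edge is skipped when every bs
 * is zero or no pixel passes the alpha/beta tests.  p1 and q1 are updated
 * only where |p2-p0| resp. |q2-q0| < beta, which also raises tc by one. */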
1443 static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
1444                                                    uint8_t bs0, uint8_t bs1,
1445                                                    uint8_t bs2, uint8_t bs3,
1446                                                    uint8_t tc0, uint8_t tc1,
1447                                                    uint8_t tc2, uint8_t tc3,
1448                                                    uint8_t alpha_in,
1449                                                    uint8_t beta_in,
1450                                                    ptrdiff_t image_width)
1451 {
1452     v16u8 tmp_vec;
1453     v16u8 bs = { 0 };
1454 
1455     tmp_vec = (v16u8) __msa_fill_b(bs0);
1456     bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1457     tmp_vec = (v16u8) __msa_fill_b(bs1);
1458     bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1459     tmp_vec = (v16u8) __msa_fill_b(bs2);
1460     bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1461     tmp_vec = (v16u8) __msa_fill_b(bs3);
1462     bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1463 
1464     if (!__msa_test_bz_v(bs)) {
1465         v16u8 alpha, beta, is_less_than, is_less_than_beta;
1466         v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1467         v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1468         v16u8 is_less_than_alpha, is_bs_greater_than0;
1469         v8i16 p0_r, q0_r, p0_l, q0_l;
1470         v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1471         v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1472         v16i8 zero = { 0 };
1473         v16i8 tc = { 0 };
1474 
1475         tmp_vec = (v16u8) __msa_fill_b(tc0);
1476         tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1477         tmp_vec = (v16u8) __msa_fill_b(tc1);
1478         tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1479         tmp_vec = (v16u8) __msa_fill_b(tc2);
1480         tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1481         tmp_vec = (v16u8) __msa_fill_b(tc3);
1482         tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1483 
1484         alpha = (v16u8) __msa_fill_b(alpha_in);
1485         beta = (v16u8) __msa_fill_b(beta_in);
1486 
1487         LD_UB5(data - (3 * image_width), image_width,
1488                p2_org, p1_org, p0_org, q0_org, q1_org);
1489 
1490         is_bs_greater_than0 = ((v16u8) zero < bs);
1491         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1492         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1493         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1494 
1495         is_less_than_alpha = (p0_asub_q0 < alpha);
1496         is_less_than_beta = (p1_asub_p0 < beta);
1497         is_less_than = is_less_than_beta & is_less_than_alpha;
1498         is_less_than_beta = (q1_asub_q0 < beta);
1499         is_less_than = is_less_than_beta & is_less_than;
1500         is_less_than = is_less_than & is_bs_greater_than0;
1501 
1502         if (!__msa_test_bz_v(is_less_than)) {
1503             v16i8 sign_negate_tc, negate_tc;
1504             v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1505             v16u8 p2_asub_p0, q2_asub_q0;
1506 
1507             q2_org = LD_UB(data + (2 * image_width));
1508             negate_tc = zero - tc;
1509             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1510 
1511             ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1512 
1513             UNPCK_UB_SH(tc, tc_r, tc_l);
1514             UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1515             UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1516             UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1517 
1518             p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1519             is_less_than_beta = (p2_asub_p0 < beta);
1520             is_less_than_beta = is_less_than_beta & is_less_than;
1521 
1522             if (!__msa_test_bz_v(is_less_than_beta)) {
1523                 v16u8 p1;
1524                 v8i16 p1_r = { 0 };
1525                 v8i16 p1_l = { 0 };
1526                 v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1527                 v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1528 
1529                 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1530                                  negate_tc_r, tc_r, p1_r);
1531                 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1532                                  i16_negatetc_l, tc_l, p1_l);
1533 
1534                 p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1535                 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1536                 ST_UB(p1_org, data - (2 * image_width));
1537 
1538                 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1539                 tc = tc + (v16i8) is_less_than_beta;
1540             }
1541 
1542             q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1543             is_less_than_beta = (q2_asub_q0 < beta);
1544             is_less_than_beta = is_less_than_beta & is_less_than;
1545 
1546             q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1547             q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1548 
1549             if (!__msa_test_bz_v(is_less_than_beta)) {
1550                 v16u8 q1;
1551                 v8i16 q1_r = { 0 };
1552                 v8i16 q1_l = { 0 };
1553                 v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1554                 v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1555 
1556                 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1557                                  negate_tc_r, tc_r, q1_r);
1558                 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1559                                  i16_negatetc_l, tc_l, q1_l);
1560 
1561                 q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1562                 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1563                 ST_UB(q1_org, data + image_width);
1564 
1565                 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1566                 tc = tc + (v16i8) is_less_than_beta;
1567             }
1568             {
1569                 v16i8 negate_thresh, sign_negate_thresh;
1570                 v8i16 threshold_r, threshold_l;
1571                 v8i16 negate_thresh_l, negate_thresh_r;
1572 
1573                 negate_thresh = zero - tc;
1574                 sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1575 
1576                 ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1577                            threshold_r, negate_thresh_r);
1578                 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1579                              negate_thresh_r, threshold_r, p0_r, q0_r);
1580 
1581                 threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1582                 negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1583                                                        negate_thresh);
1584                 AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1585                              negate_thresh_l, threshold_l, p0_l, q0_l);
1586             }
1587 
1588             PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1589 
1590             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1591             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1592 
1593             ST_UB(p0_org, (data - image_width));
1594             ST_UB(q0_org, data);
1595         }
1596     }
1597 }
1598 
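/* MBAFF horizontal luma filter across a vertical edge, two rows per tc0
 * entry.  Row pairs with tc0[i] < 0 are neither loaded nor written back;
 * the remaining rows are transposed, p1/p0/q0/q1 are filtered with the
 * tc-clipped deltas, and each row is stored as a single 4-byte write. */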
1599 static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride,
1600                                              int32_t alpha_in, int32_t beta_in,
1601                                              int8_t *tc0)
1602 {
1603     uint8_t *data = in;
1604     uint32_t out0, out1, out2, out3;
1605     uint64_t load;
1606     uint32_t tc_val;
1607     v16u8 alpha, beta;
1608     v16i8 inp0 = { 0 };
1609     v16i8 inp1 = { 0 };
1610     v16i8 inp2 = { 0 };
1611     v16i8 inp3 = { 0 };
1612     v16i8 inp4 = { 0 };
1613     v16i8 inp5 = { 0 };
1614     v16i8 inp6 = { 0 };
1615     v16i8 inp7 = { 0 };
1616     v16i8 src0, src1, src2, src3;
1617     v8i16 src4, src5, src6, src7;
1618     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1619     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1620     v16u8 is_less_than_beta1, is_less_than_beta2;
1621     v8i16 tc, tc_orig_r, tc_plus1;
1622     v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1623     v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1624     v8i16 src2_r, src3_r;
1625     v8i16 p2_r, p1_r, q2_r, q1_r;
1626     v16u8 p2, q2, p0, q0;
1627     v4i32 dst0, dst1;
1628     v16i8 zeros = { 0 };
1629 
1630     alpha = (v16u8) __msa_fill_b(alpha_in);
1631     beta = (v16u8) __msa_fill_b(beta_in);
1632 
1633     if (tc0[0] < 0) {
1634         data += (2 * stride);
1635     } else {
1636         load = LD(data - 3);
1637         inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1638         load = LD(data - 3 + stride);
1639         inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1640         data += (2 * stride);
1641     }
1642 
1643     if (tc0[1] < 0) {
1644         data += (2 * stride);
1645     } else {
1646         load = LD(data - 3);
1647         inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1648         load = LD(data - 3 + stride);
1649         inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1650         data += (2 * stride);
1651     }
1652 
1653     if (tc0[2] < 0) {
1654         data += (2 * stride);
1655     } else {
1656         load = LD(data - 3);
1657         inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1658         load = LD(data - 3 + stride);
1659         inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1660         data += (2 * stride);
1661     }
1662 
1663     if (tc0[3] < 0) {
1664         data += (2 * stride);
1665     } else {
1666         load = LD(data - 3);
1667         inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1668         load = LD(data - 3 + stride);
1669         inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1670         data += (2 * stride);
1671     }
1672 
1673     ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1674                src0, src1, src2, src3);
1675 
1676     ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1677     ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1678 
1679     src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1680     src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1681     src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1682     src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1683     src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1684     src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1685 
1686     p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1687     p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1688     q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1689     p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1690     q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1691 
1692     is_less_than_alpha = (p0_asub_q0 < alpha);
1693     is_less_than_beta = (p1_asub_p0 < beta);
1694     is_less_than = is_less_than_alpha & is_less_than_beta;
1695     is_less_than_beta = (q1_asub_q0 < beta);
1696     is_less_than = is_less_than_beta & is_less_than;
1697 
1698     is_less_than_beta1 = (p2_asub_p0 < beta);
1699     is_less_than_beta2 = (q2_asub_q0 < beta);
1700 
1701     p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1702     p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1703     p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1704 
1705     ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1706     p2_r += p0_add_q0;
1707     p2_r >>= 1;
1708     p2_r -= p1_r;
1709     ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1710     q2_r += p0_add_q0;
1711     q2_r >>= 1;
1712     q2_r -= q1_r;
1713 
1714     tc_val = LW(tc0);
1715     tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1716     tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1717     is_tc_orig1 = tc_orig;
1718     is_tc_orig2 = tc_orig;
1719     tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1720     tc = tc_orig_r;
1721 
1722     CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1723     CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1724 
1725     p2_r += p1_r;
1726     q2_r += q1_r;
1727 
1728     PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1729 
1730     is_tc_orig1 = (zeros < is_tc_orig1);
1731     is_tc_orig2 = is_tc_orig1;
1732     is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1733     is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1734     is_tc_orig1 = is_less_than & is_tc_orig1;
1735     is_tc_orig2 = is_less_than & is_tc_orig2;
1736 
1737     p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1738     q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1739 
1740     q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1741     q0_sub_p0 <<= 2;
1742     p1_sub_q1 = p1_r - q1_r;
1743     q0_sub_p0 += p1_sub_q1;
1744     q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1745 
1746     tc_plus1 = tc + 1;
1747     is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1748                                               (v16i8) is_less_than_beta1);
1749     tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1750     tc_plus1 = tc + 1;
1751     is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1752                                               (v16i8) is_less_than_beta2);
1753     tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1754 
1755     CLIP_SH(q0_sub_p0, -tc, tc);
1756 
1757     ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
1758     src2_r += q0_sub_p0;
1759     src3_r -= q0_sub_p0;
1760 
1761     CLIP_SH2_0_255(src2_r, src3_r);
1762 
1763     PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
1764 
1765     p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1766     q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1767 
1768     ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
1769 
1770     ILVRL_H2_SW(q2, p2, dst0, dst1);
1771 
1772     data = in;
1773 
1774     out0 = __msa_copy_u_w(dst0, 0);
1775     out1 = __msa_copy_u_w(dst0, 1);
1776     out2 = __msa_copy_u_w(dst0, 2);
1777     out3 = __msa_copy_u_w(dst0, 3);
1778 
1779     if (tc0[0] < 0) {
1780         data += (2 * stride);
1781     } else {
1782         SW(out0, (data - 2));
1783         data += stride;
1784         SW(out1, (data - 2));
1785         data += stride;
1786     }
1787 
1788     if (tc0[1] < 0) {
1789         data += (2 * stride);
1790     } else {
1791         SW(out2, (data - 2));
1792         data += stride;
1793         SW(out3, (data - 2));
1794         data += stride;
1795     }
1796 
1797     out0 = __msa_copy_u_w(dst1, 0);
1798     out1 = __msa_copy_u_w(dst1, 1);
1799     out2 = __msa_copy_u_w(dst1, 2);
1800     out3 = __msa_copy_u_w(dst1, 3);
1801 
1802     if (tc0[2] < 0) {
1803         data += (2 * stride);
1804     } else {
1805         SW(out0, (data - 2));
1806         data += stride;
1807         SW(out1, (data - 2));
1808         data += stride;
1809     }
1810 
1811     if (tc0[3] >= 0) {
1812         SW(out2, (data - 2));
1813         data += stride;
1814         SW(out3, (data - 2));
1815     }
1816 }
1817 
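/* Inter chroma deblocking across a horizontal edge: bs and tc are expanded
 * to 2-pixel granularity, and p0/q0 are filtered with the tc-clipped delta
 * wherever bs > 0 and the alpha/beta tests pass. */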
1818 static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
1819                                                        uint8_t bs0, uint8_t bs1,
1820                                                        uint8_t bs2, uint8_t bs3,
1821                                                        uint8_t tc0, uint8_t tc1,
1822                                                        uint8_t tc2, uint8_t tc3,
1823                                                        uint8_t alpha_in,
1824                                                        uint8_t beta_in,
1825                                                        ptrdiff_t img_width)
1826 {
1827     v16u8 alpha, beta;
1828     v8i16 tmp_vec;
1829     v8i16 bs = { 0 };
1830     v8i16 tc = { 0 };
1831     v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1832     v16u8 is_less_than;
1833     v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1834     v8i16 p0_r, q0_r;
1835     v16u8 p1_org, p0_org, q0_org, q1_org;
1836     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1837     v16i8 negate_tc, sign_negate_tc;
1838     v8i16 tc_r, negate_tc_r;
1839     v16i8 zero = { 0 };
1840 
1841     tmp_vec = (v8i16) __msa_fill_b(bs0);
1842     bs = __msa_insve_h(bs, 0, tmp_vec);
1843     tmp_vec = (v8i16) __msa_fill_b(bs1);
1844     bs = __msa_insve_h(bs, 1, tmp_vec);
1845     tmp_vec = (v8i16) __msa_fill_b(bs2);
1846     bs = __msa_insve_h(bs, 2, tmp_vec);
1847     tmp_vec = (v8i16) __msa_fill_b(bs3);
1848     bs = __msa_insve_h(bs, 3, tmp_vec);
1849 
1850     if (!__msa_test_bz_v((v16u8) bs)) {
1851         tmp_vec = (v8i16) __msa_fill_b(tc0);
1852         tc = __msa_insve_h(tc, 0, tmp_vec);
1853         tmp_vec = (v8i16) __msa_fill_b(tc1);
1854         tc = __msa_insve_h(tc, 1, tmp_vec);
1855         tmp_vec = (v8i16) __msa_fill_b(tc2);
1856         tc = __msa_insve_h(tc, 2, tmp_vec);
1857         tmp_vec = (v8i16) __msa_fill_b(tc3);
1858         tc = __msa_insve_h(tc, 3, tmp_vec);
1859 
1860         is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1861 
1862         alpha = (v16u8) __msa_fill_b(alpha_in);
1863         beta = (v16u8) __msa_fill_b(beta_in);
1864 
1865         LD_UB4(data - (img_width << 1), img_width,
1866                p1_org, p0_org, q0_org, q1_org);
1867 
1868         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1869         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1870         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1871 
1872         is_less_than_alpha = (p0_asub_q0 < alpha);
1873         is_less_than_beta = (p1_asub_p0 < beta);
1874         is_less_than = is_less_than_beta & is_less_than_alpha;
1875         is_less_than_beta = (q1_asub_q0 < beta);
1876         is_less_than = is_less_than_beta & is_less_than;
1877         is_less_than = is_less_than & is_bs_greater_than0;
1878 
1879         is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1880 
1881         if (!__msa_test_bz_v(is_less_than)) {
1882             negate_tc = zero - (v16i8) tc;
1883             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1884 
1885             ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
1886 
1887             ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
1888                        p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1889 
1890             AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1891                          tc_r, p0_r, q0_r);
1892 
1893             PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
1894 
1895             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1896             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1897 
1898             ST_UB(q0_org, data);
1899             ST_UB(p0_org, (data - img_width));
1900         }
1901     }
1902 }
1903 
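/* Inter chroma deblocking across a vertical edge: eight rows around the
 * edge are loaded and transposed into p1/p0/q0/q1, p0/q0 are filtered with
 * the tc-clipped delta under the bs and alpha/beta masks, and the result is
 * written back two bytes per row. */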
1904 static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
1905                                                        uint8_t bs0, uint8_t bs1,
1906                                                        uint8_t bs2, uint8_t bs3,
1907                                                        uint8_t tc0, uint8_t tc1,
1908                                                        uint8_t tc2, uint8_t tc3,
1909                                                        uint8_t alpha_in,
1910                                                        uint8_t beta_in,
1911                                                        ptrdiff_t img_width)
1912 {
1913     uint8_t *src;
1914     v16u8 alpha, beta;
1915     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1916     v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
1917     v16u8 p0, q0;
1918     v8i16 p0_r = { 0 };
1919     v8i16 q0_r = { 0 };
1920     v16u8 p1_org, p0_org, q0_org, q1_org;
1921     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1922     v16u8 is_bs_greater_than0;
1923     v8i16 tc_r, negate_tc_r;
1924     v16i8 negate_tc, sign_negate_tc;
1925     v16i8 zero = { 0 };
1926     v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1927     v8i16 tmp1, tmp_vec, bs = { 0 };
1928     v8i16 tc = { 0 };
1929 
1930     tmp_vec = (v8i16) __msa_fill_b(bs0);
1931     bs = __msa_insve_h(bs, 0, tmp_vec);
1932     tmp_vec = (v8i16) __msa_fill_b(bs1);
1933     bs = __msa_insve_h(bs, 1, tmp_vec);
1934     tmp_vec = (v8i16) __msa_fill_b(bs2);
1935     bs = __msa_insve_h(bs, 2, tmp_vec);
1936     tmp_vec = (v8i16) __msa_fill_b(bs3);
1937     bs = __msa_insve_h(bs, 3, tmp_vec);
1938 
1939     if (!__msa_test_bz_v((v16u8) bs)) {
1940         tmp_vec = (v8i16) __msa_fill_b(tc0);
1941         tc = __msa_insve_h(tc, 0, tmp_vec);
1942         tmp_vec = (v8i16) __msa_fill_b(tc1);
1943         tc = __msa_insve_h(tc, 1, tmp_vec);
1944         tmp_vec = (v8i16) __msa_fill_b(tc2);
1945         tc = __msa_insve_h(tc, 2, tmp_vec);
1946         tmp_vec = (v8i16) __msa_fill_b(tc3);
1947         tc = __msa_insve_h(tc, 3, tmp_vec);
1948 
1949         is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1950 
1951         LD_UB8((data - 2), img_width,
1952                row0, row1, row2, row3, row4, row5, row6, row7);
1953 
1954         TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
1955                            row4, row5, row6, row7,
1956                            p1_org, p0_org, q0_org, q1_org);
1957 
1958         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1959         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1960         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1961 
1962         alpha = (v16u8) __msa_fill_b(alpha_in);
1963         beta = (v16u8) __msa_fill_b(beta_in);
1964 
1965         is_less_than_alpha = (p0_asub_q0 < alpha);
1966         is_less_than_beta = (p1_asub_p0 < beta);
1967         is_less_than = is_less_than_beta & is_less_than_alpha;
1968         is_less_than_beta = (q1_asub_q0 < beta);
1969         is_less_than = is_less_than_beta & is_less_than;
1970         is_less_than = is_bs_greater_than0 & is_less_than;
1971 
1972         is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1973 
1974         if (!__msa_test_bz_v(is_less_than)) {
1975             ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
1976                        p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1977 
1978             negate_tc = zero - (v16i8) tc;
1979             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1980 
1981             ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
1982 
1983             AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
1984                          tc_r, p0_r, q0_r);
1985 
1986             PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
1987 
1988             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1989             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1990             tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
1991             src = data - 1;
1992             ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
1993             src += 4 * img_width;
1994             ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
1995         }
1996     }
1997 }
1998 
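/* Horizontal loop filter for 4:2:2 chroma: the edge is processed in four
 * groups of rows, and a group is skipped when its tc0 entry yields a
 * non-positive clipping threshold. */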
1999 static void avc_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t stride,
2000                                             int32_t alpha_in, int32_t beta_in,
2001                                             int8_t *tc0)
2002 {
2003     int32_t col, tc_val;
2004     v16u8 alpha, beta, res;
2005 
2006     alpha = (v16u8) __msa_fill_b(alpha_in);
2007     beta = (v16u8) __msa_fill_b(beta_in);
2008 
2009     for (col = 0; col < 4; col++) {
2010         tc_val = (tc0[col] - 1) + 1;
2011 
2012         if (tc_val <= 0) {
2013             src += (4 * stride);
2014             continue;
2015         }
2016 
2017         AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2018         ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
2019         src += (4 * stride);
2020     }
2021 }
2022 
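/* MBAFF variant of the 4:2:2 chroma horizontal loop filter: only the two
 * bytes straddling the edge are rewritten for each filtered row. */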
2023 static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2024                                                   ptrdiff_t stride,
2025                                                   int32_t alpha_in,
2026                                                   int32_t beta_in,
2027                                                   int8_t *tc0)
2028 {
2029     int32_t col, tc_val;
2030     int16_t out0, out1;
2031     v16u8 alpha, beta, res;
2032 
2033     alpha = (v16u8) __msa_fill_b(alpha_in);
2034     beta = (v16u8) __msa_fill_b(beta_in);
2035 
2036     for (col = 0; col < 4; col++) {
2037         tc_val = (tc0[col] - 1) + 1;
2038 
2039         if (tc_val <= 0) {
2040             src += 4 * stride;
2041             continue;
2042         }
2043 
2044         AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2045 
2046         out0 = __msa_copy_s_h((v8i16) res, 0);
2047         out1 = __msa_copy_s_h((v8i16) res, 1);
2048 
2049         SH(out0, (src - 1));
2050         src += stride;
2051         SH(out1, (src - 1));
2052         src += stride;
2053     }
2054 }
2055 
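/* The commented-out block below appears to be the older bs/tc expansion
 * path; the rewritten vertical-edge filter above takes the tc array
 * directly, so this wrapper now forwards its arguments unchanged. */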
2056 void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2057                                   int alpha, int beta, int8_t *tc)
2058 {
2059 //    uint8_t bs0 = 1;
2060 //    uint8_t bs1 = 1;
2061 //    uint8_t bs2 = 1;
2062 //    uint8_t bs3 = 1;
2063 //
2064 //    if (tc[0] < 0)
2065 //        bs0 = 0;
2066 //    if (tc[1] < 0)
2067 //        bs1 = 0;
2068 //    if (tc[2] < 0)
2069 //        bs2 = 0;
2070 //    if (tc[3] < 0)
2071 //        bs3 = 0;
2072 //
2073 //    avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2074 //                                           tc[0], tc[1], tc[2], tc[3],
2075 //                                           alpha, beta, img_width);
2076     avc_loopfilter_luma_inter_edge_ver_msa(data, img_width, alpha, beta, tc);
2077 }
2078 
2079 void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2080                                   int alpha, int beta, int8_t *tc)
2081 {
2082 
2083     uint8_t bs0 = 1;
2084     uint8_t bs1 = 1;
2085     uint8_t bs2 = 1;
2086     uint8_t bs3 = 1;
2087 
2088     if (tc[0] < 0)
2089         bs0 = 0;
2090     if (tc[1] < 0)
2091         bs1 = 0;
2092     if (tc[2] < 0)
2093         bs2 = 0;
2094     if (tc[3] < 0)
2095         bs3 = 0;
2096 
2097     avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2098                                            tc[0], tc[1], tc[2], tc[3],
2099                                            alpha, beta, img_width);
2100 }
2101 
2102 void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2103                                     int alpha, int beta, int8_t *tc)
2104 {
2105     uint8_t bs0 = 1;
2106     uint8_t bs1 = 1;
2107     uint8_t bs2 = 1;
2108     uint8_t bs3 = 1;
2109 
2110     if (tc[0] < 0)
2111         bs0 = 0;
2112     if (tc[1] < 0)
2113         bs1 = 0;
2114     if (tc[2] < 0)
2115         bs2 = 0;
2116     if (tc[3] < 0)
2117         bs3 = 0;
2118 
2119     avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2120                                                tc[0], tc[1], tc[2], tc[3],
2121                                                alpha, beta, img_width);
2122 }
2123 
2124 void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2125                                     int alpha, int beta, int8_t *tc)
2126 {
2127     uint8_t bs0 = 1;
2128     uint8_t bs1 = 1;
2129     uint8_t bs2 = 1;
2130     uint8_t bs3 = 1;
2131 
2132     if (tc[0] < 0)
2133         bs0 = 0;
2134     if (tc[1] < 0)
2135         bs1 = 0;
2136     if (tc[2] < 0)
2137         bs2 = 0;
2138     if (tc[3] < 0)
2139         bs3 = 0;
2140 
2141     avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2142                                                tc[0], tc[1], tc[2], tc[3],
2143                                                alpha, beta, img_width);
2144 }
2145 
2146 void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2147                                   int alpha, int beta)
2148 {
2149     avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
2150                                            (uint8_t) beta,
2151                                            img_width);
2152 }
2153 
2154 void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2155                                   int alpha, int beta)
2156 {
2157     avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
2158                                            (uint8_t) beta,
2159                                            img_width);
2160 }
2161 
2162 void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2163                                     int alpha, int beta)
2164 {
2165     avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
2166                                                (uint8_t) beta,
2167                                                img_width);
2168 }
2169 
2170 void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2171                                     int alpha, int beta)
2172 {
2173     avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
2174                                                (uint8_t) beta,
2175                                                img_width);
2176 }
2177 
2178 void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
2179                                          ptrdiff_t ystride,
2180                                          int32_t alpha, int32_t beta,
2181                                          int8_t *tc0)
2182 {
2183     avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
2184 }
2185 
2186 void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2187                                                ptrdiff_t ystride,
2188                                                int32_t alpha,
2189                                                int32_t beta,
2190                                                int8_t *tc0)
2191 {
2192     avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
2193 }
2194 
2195 void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
2196                                           ptrdiff_t ystride,
2197                                           int32_t alpha,
2198                                           int32_t beta,
2199                                           int8_t *tc0)
2200 {
2201     avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
2202 }
2203 
2204 void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
2205                                                 ptrdiff_t ystride,
2206                                                 int32_t alpha,
2207                                                 int32_t beta)
2208 {
2209     avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
2210 }
2211 
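/* Explicit weighted prediction for a 16-pixel-wide block: each sample is
 * multiplied by the source weight, the offset (pre-shifted by log2_denom)
 * is added, and the sum is rounded, shifted down by log2_denom and clamped
 * to 8 bits, eight rows at a time. */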
2212 void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
2213                                    int height, int log2_denom,
2214                                    int weight_src, int offset_in)
2215 {
2216     uint32_t offset_val;
2217     v16i8 zero = { 0 };
2218     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2219     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2220     v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
2221     v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
2222     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2223     v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2224     v8i16 wgt, denom, offset;
2225 
2226     offset_val = (unsigned) offset_in << log2_denom;
2227 
2228     wgt = __msa_fill_h(weight_src);
2229     offset = __msa_fill_h(offset_val);
2230     denom = __msa_fill_h(log2_denom);
2231 
2232     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2233     ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
2234                src2_r, src3_r);
2235     ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
2236                src2_l, src3_l);
2237     ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
2238                src6_r, src7_r);
2239     ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
2240                src6_l, src7_l);
2241     MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
2242          tmp3);
2243     MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
2244          tmp7);
2245     MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
2246          tmp11);
2247     MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2248          tmp14, tmp15);
2249     ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
2250                 tmp1, tmp2, tmp3);
2251     ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
2252                 tmp5, tmp6, tmp7);
2253     ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
2254                 tmp9, tmp10, tmp11);
2255     ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2256                 tmp12, tmp13, tmp14, tmp15);
2257     MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2258     MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2259     SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2260     SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2261     SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2262     SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2263     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2264                 dst2, dst3);
2265     PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2266                 dst5, dst6, dst7);
2267     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2268     src += 8 * stride;
2269 
2270     if (16 == height) {
2271         LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2272         ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
2273                    src1_r, src2_r, src3_r);
2274         ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
2275                    src1_l, src2_l, src3_l);
2276         ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
2277                    src5_r, src6_r, src7_r);
2278         ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
2279                    src5_l, src6_l, src7_l);
2280         MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
2281              tmp2, tmp3);
2282         MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
2283              tmp6, tmp7);
2284         MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
2285              tmp10, tmp11);
2286         MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2287              tmp14, tmp15);
2288         ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
2289                     tmp0, tmp1, tmp2, tmp3);
2290         ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
2291                     tmp4, tmp5, tmp6, tmp7);
2292         ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
2293                     tmp8, tmp9, tmp10, tmp11);
2294         ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2295                     tmp12, tmp13, tmp14, tmp15);
2296         MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2297         MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2298         SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2299         SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2300         SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2301         SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2302         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2303                     dst2, dst3);
2304         PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2305                     dst5, dst6, dst7);
2306         ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2307     }
2308 }
2309 
2310 void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
2311                                   int height, int log2_denom,
2312                                   int weight_src, int offset)
2313 {
2314     if (4 == height) {
2315         avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
2316     } else if (8 == height) {
2317         avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
2318     } else {
2319         avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
2320     }
2321 }
2322 
2323 void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
2324                                   int height, int log2_denom,
2325                                   int weight_src, int offset)
2326 {
2327     if (2 == height) {
2328         avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
2329     } else if (4 == height) {
2330         avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
2331     } else {
2332         avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
2333     }
2334 }
2335 
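/* Bidirectional weighted prediction for a 16-pixel-wide block: src and dst
 * samples are re-biased by 128 (the XOR with 0x80), interleaved and fed to
 * a signed dot-product with the packed weight pair; the offset term folded
 * into the accumulator compensates for the bias before the final shift by
 * log2_denom + 1 and the clip to 8 bits. */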
2336 void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
2337                                      ptrdiff_t stride, int height,
2338                                      int log2_denom, int weight_dst,
2339                                      int weight_src, int offset_in)
2340 {
2341     v16i8 src_wgt, dst_wgt, wgt;
2342     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2343     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2344     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2345     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2346     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2347     v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2348     v8i16 denom, offset;
2349 
    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += (128 * (weight_src + weight_dst));

    src_wgt = __msa_fill_b(weight_src);
    dst_wgt = __msa_fill_b(weight_dst);
    offset = __msa_fill_h(offset_in);
    denom = __msa_fill_h(log2_denom + 1);

    wgt = __msa_ilvev_b(dst_wgt, src_wgt);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += 8 * stride;
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
               vec6);
    ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
               vec7);
    ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
               vec12, vec14);
    ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
               vec13, vec15);
    tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
    tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
    tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
    tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
    tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
    tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
    tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
    tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
    tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
    tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
    tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
    tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
    tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
    tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
    tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
    tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
    SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
    SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
    SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
    SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
    CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
    PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                dst2, dst3);
    PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                dst5, dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
    dst += 8 * stride;

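    /* The first eight rows were handled above; a second pass covers the
     * remaining rows of a 16x16 block. */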
    if (16 == height) {
        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
        XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
                   vec4, vec6);
        ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
                   vec5, vec7);
        ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
                   vec12, vec14);
        ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
                   vec13, vec15);
        tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
        tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
        tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
        tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
        tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
        tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
        tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
        tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
        tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
        tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
        tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
        tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
        tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
        tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
        tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
        tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
        SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
        SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
        SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
        SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
        CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
        CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
                    dst2, dst3);
        PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
                    dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
    }
}

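/*
 * Reference note (not part of the build): per pixel, the biweight path above
 * and the avc_biwgt_* kernels dispatched below are expected to reduce roughly
 * to the scalar form sketched here.  biweight_clip() is a hypothetical
 * clip-to-[0,255] helper and width/height stand for the block size implied by
 * the function name; both are illustrative only.
 *
 *     static inline uint8_t biweight_clip(int v)
 *     {
 *         return v < 0 ? 0 : (v > 255 ? 255 : v);
 *     }
 *
 *     int x, y;
 *     int bias = ((offset + 1) | 1) << log2_denom;
 *
 *     for (y = 0; y < height; y++, src += stride, dst += stride)
 *         for (x = 0; x < width; x++)
 *             dst[x] = biweight_clip((src[x] * weight_src +
 *                                     dst[x] * weight_dst +
 *                                     bias) >> (log2_denom + 1));
 */
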
void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    if (4 == height) {
        avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else if (8 == height) {
        avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else {
        avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                           offset);
    }
}

void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride, int height,
                                    int log2_denom, int weight_dst,
                                    int weight_src, int offset)
{
    if (2 == height) {
        avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else if (4 == height) {
        avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    } else {
        avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
                          offset);
    }
}