/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

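/* A note on the mask tables above (explanatory, not from the original
 * source): MSA's vshf.b gathers bytes from a pair of source vectors, so a
 * mask of { 0, 1, 1, 2, 2, 3, ... } produces overlapping byte pairs
 * (src[x], src[x + 1]). The signed-byte DOTP/DPADD operations then
 * multiply-accumulate each such pair against a pair of filter taps
 * replicated by SPLATI_H, which is why the 8-tap filters below are applied
 * two taps at a time. Mask indices >= 16 select from the second source
 * vector; the "4 width" masks use this to pack two 4-pixel rows into one
 * 16-byte shuffle result. */
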
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1)                             \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);                \
}

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 out0, out1, out2, out3);                                    \
}

#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
}

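/* For reference, each horizontal macro above is the vector form of the
 * scalar 8-tap (or 4-tap) convolution. An illustrative sketch, not part of
 * the build:
 *
 *     for (x = 0; x < width; x++) {
 *         int sum = 0;
 *         for (k = 0; k < 8; k++)
 *             sum += src[x + k - 3] * filter[k];
 *         dst[x] = av_clip_uint8((sum + 32) >> 6);
 *     }
 *
 * The "- 3" matches the "src -= 3" adjustment in the callers below, and the
 * "(sum + 32) >> 6" rounding corresponds to the SRARI_H*(..., 6) step
 * followed by saturation and byte packing. */
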
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

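/* The copy_widthN helpers above implement the unfiltered (full-pel) copy
 * paths: rows are loaded with the widest convenient vector or doubleword
 * loads and stored unchanged. The height specializations exist only to pick
 * an efficient unroll factor for each block size. */
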
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

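/* A note on the SRARI_H / SAT_SH / PCKEV_XORI128_UB tail used throughout
 * (explanatory, not from the original source): it converts the 16-bit
 * filter sums back to pixels by round-shifting by 6, saturating to the
 * signed byte range, packing to bytes, and XOR-ing with 128 to undo the
 * bias that XORI_B*_128_SB applied to the unsigned input so that the
 * signed-byte dot products could be used. */
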
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

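/* Only heights 4, 8 and 16 are dispatched here; presumably the HEVC code
 * that selects these functions never requests other heights for 4-wide
 * blocks, so no generic fallback loop is provided. */
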
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src = src - 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST_D2(out, 0, 1, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

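/* The vertical (vt) filters below work on column data instead: successive
 * rows are interleaved with ILVR/ILVL so that each byte pair again holds
 * two vertically adjacent pixels, letting the same two-taps-at-a-time
 * DOTP/DPADD scheme run along columns. Between loop iterations the oldest
 * interleaved rows are shifted out and the newest kept, so only four new
 * rows are loaded per pass. */
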
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;
        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

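/* The hv (2-D) filters below apply the separable 8-tap filter horizontally
 * first, keep those sums at 16-bit precision, then run the vertical 8-tap
 * filter over the intermediates at 32-bit precision. Both normalization
 * shifts are folded into the tail of the loop: SRA_4V(..., 6) followed by
 * SRARI_W4_SW(..., 6) before saturation and packing. */
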
static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}

hevc_hv_uni_8t_8multx2mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height,int32_t width)1469 static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
1470 int32_t src_stride,
1471 uint8_t *dst,
1472 int32_t dst_stride,
1473 const int8_t *filter_x,
1474 const int8_t *filter_y,
1475 int32_t height, int32_t width)
1476 {
1477 uint32_t loop_cnt, cnt;
1478 uint8_t *src_tmp;
1479 uint8_t *dst_tmp;
1480 v16u8 out;
1481 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1482 v8i16 filt0, filt1, filt2, filt3;
1483 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1484 v16i8 mask1, mask2, mask3;
1485 v8i16 filter_vec;
1486 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1487 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1488 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1489 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1490 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1491 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1492 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1493 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1494 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1495
1496 src -= ((3 * src_stride) + 3);
1497
1498 filter_vec = LD_SH(filter_x);
1499 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1500
1501 filter_vec = LD_SH(filter_y);
1502 UNPCK_R_SB_SH(filter_vec, filter_vec);
1503
1504 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1505
1506 mask1 = mask0 + 2;
1507 mask2 = mask0 + 4;
1508 mask3 = mask0 + 6;
1509
1510 for (cnt = width >> 3; cnt--;) {
1511 src_tmp = src;
1512 dst_tmp = dst;
1513
1514 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1515 src_tmp += (7 * src_stride);
1516 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1517
1518 /* row 0 row 1 row 2 row 3 */
1519 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1520 vec0, vec1, vec2, vec3);
1521 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1522 vec4, vec5, vec6, vec7);
1523 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1524 vec8, vec9, vec10, vec11);
1525 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1526 vec12, vec13, vec14, vec15);
1527 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1528 filt3);
1529 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1530 filt3);
1531 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1532 filt3);
1533 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1534 filt2, filt3);
1535
1536 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1537 vec0, vec1, vec2, vec3);
1538 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1539 vec4, vec5, vec6, vec7);
1540 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1541 vec8, vec9, vec10, vec11);
1542 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1543 filt3);
1544 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1545 filt3);
1546 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1547 filt3);
1548
1549 for (loop_cnt = height >> 1; loop_cnt--;) {
1550 LD_SB2(src_tmp, src_stride, src7, src8);
1551 XORI_B2_128_SB(src7, src8);
1552 src_tmp += 2 * src_stride;
1553
1554 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1555 dst10_r, dst32_r, dst54_r, dst21_r);
1556 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1557 dst10_l, dst32_l, dst54_l, dst21_l);
1558 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1559 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1560
1561 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1562 vec0, vec1, vec2, vec3);
1563 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1564 filt2, filt3);
1565
1566 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1567 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1568 filt_h0, filt_h1, filt_h2, filt_h3);
1569 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1570 filt_h0, filt_h1, filt_h2, filt_h3);
1571 dst0_r >>= 6;
1572 dst0_l >>= 6;
1573
1574 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1575 vec0, vec1, vec2, vec3);
1576 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1577 filt2, filt3);
1578
1579 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1580 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1581 filt_h0, filt_h1, filt_h2, filt_h3);
1582 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1583 filt_h0, filt_h1, filt_h2, filt_h3);
1584 dst1_r >>= 6;
1585 dst1_l >>= 6;
1586 SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1587 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1588
1589 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1590 out = PCKEV_XORI128_UB(dst0, dst1);
1591 ST_D2(out, 0, 1, dst_tmp, dst_stride);
1592 dst_tmp += (2 * dst_stride);
1593
1594 dst0 = dst2;
1595 dst1 = dst3;
1596 dst2 = dst4;
1597 dst3 = dst5;
1598 dst4 = dst6;
1599 dst5 = dst7;
1600 dst6 = dst8;
1601 }
1602
1603 src += 8;
1604 dst += 8;
1605 }
1606 }
1607
hevc_hv_uni_8t_8w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1608 static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
1609 int32_t src_stride,
1610 uint8_t *dst,
1611 int32_t dst_stride,
1612 const int8_t *filter_x,
1613 const int8_t *filter_y,
1614 int32_t height)
1615 {
1616 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1617 filter_x, filter_y, height, 8);
1618 }
1619
hevc_hv_uni_8t_12w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1620 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1621 int32_t src_stride,
1622 uint8_t *dst,
1623 int32_t dst_stride,
1624 const int8_t *filter_x,
1625 const int8_t *filter_y,
1626 int32_t height)
1627 {
1628 uint32_t loop_cnt;
1629 uint8_t *src_tmp, *dst_tmp;
1630 v16u8 out0, out1;
1631 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1632 v16i8 src11, src12, src13, src14;
1633 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1634 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1635 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1636 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1637 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1638 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1639 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1640 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1641 v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1642 v8i16 dst1413_r, dst87_l, filter_vec;
1643 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1644 v4i32 dst0_l, dst1_l;
1645
1646 src -= ((3 * src_stride) + 3);
1647
1648 filter_vec = LD_SH(filter_x);
1649 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1650
1651 filter_vec = LD_SH(filter_y);
1652 UNPCK_R_SB_SH(filter_vec, filter_vec);
1653
1654 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1655
1656 mask0 = LD_SB(ff_hevc_mask_arr);
1657 mask1 = mask0 + 2;
1658 mask2 = mask0 + 4;
1659 mask3 = mask0 + 6;
1660
1661 src_tmp = src;
1662 dst_tmp = dst;
1663
1664 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1665 src_tmp += (7 * src_stride);
1666 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1667
1668 /* row 0 row 1 row 2 row 3 */
1669 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1670 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1671 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1672 vec11);
1673 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1674 vec15);
1675 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1676 filt3);
1677 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1678 filt3);
1679 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1680 filt3);
1681 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1682 filt2, filt3);
1683
1684 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1685 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1686 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1687 vec11);
1688 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1689 filt3);
1690 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1691 filt3);
1692 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1693 filt3);
1694
1695 for (loop_cnt = 8; loop_cnt--;) {
1696 LD_SB2(src_tmp, src_stride, src7, src8);
1697 XORI_B2_128_SB(src7, src8);
1698 src_tmp += 2 * src_stride;
1699
1700 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1701 dst32_r, dst54_r, dst21_r);
1702 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1703 dst32_l, dst54_l, dst21_l);
1704 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1705 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1706
1707 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1708 vec3);
1709 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1710 filt3);
1711
1712 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1713 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1714 filt_h0, filt_h1, filt_h2, filt_h3);
1715 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1716 filt_h0, filt_h1, filt_h2, filt_h3);
1717 dst0_r >>= 6;
1718 dst0_l >>= 6;
1719
1720 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1721 vec3);
1722 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1723 filt3);
1724
1725 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1726 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1727 filt_h0, filt_h1, filt_h2, filt_h3);
1728 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1729 filt_h0, filt_h1, filt_h2, filt_h3);
1730 dst1_r >>= 6;
1731 dst1_l >>= 6;
1732 SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1733 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1734
1735 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1736 out0 = PCKEV_XORI128_UB(dst0, dst1);
1737 ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1738 dst_tmp += (2 * dst_stride);
1739
1740 dst0 = dst2;
1741 dst1 = dst3;
1742 dst2 = dst4;
1743 dst3 = dst5;
1744 dst4 = dst6;
1745 dst5 = dst7;
1746 dst6 = dst8;
1747 }
1748
1749 src += 8;
1750 dst += 8;
1751
1752 mask4 = LD_SB(ff_hevc_mask_arr + 16);
1753 mask5 = mask4 + 2;
1754 mask6 = mask4 + 4;
1755 mask7 = mask4 + 6;
1756
1757 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1758 src += (7 * src_stride);
1759 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1760
1761 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1762 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1763 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1764 vec11);
1765 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1766 vec15);
1767
1768 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1769 filt3);
1770 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1771 filt3);
1772 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1773 filt3);
1774 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1775 filt3);
1776
1777 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1778 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1779 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1780
1781 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1782
1783 for (loop_cnt = 2; loop_cnt--;) {
1784 LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1785 src14);
1786 src += (8 * src_stride);
1787 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1788
1789 VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1790 vec3);
1791 VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1792 vec7);
1793 VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1794 vec11);
1795 VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1796 vec14, vec15);
1797
1798 dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1799 filt3);
1800 dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1801 filt3);
1802 dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1803 filt2, filt3);
1804 dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1805 filt2, filt3);
1806
1807 dst76_r = __msa_ilvr_h(dst117, dst66);
1808 ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1809 ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1810 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1811 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1812 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1813
1814 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1815 filt_h1, filt_h2, filt_h3);
1816 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1817 filt_h1, filt_h2, filt_h3);
1818 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1819 filt_h1, filt_h2, filt_h3);
1820 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1821 filt_h1, filt_h2, filt_h3);
1822 dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1823 filt_h1, filt_h2, filt_h3);
1824 dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1825 filt_h1, filt_h2, filt_h3);
1826 dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1827 filt_h1, filt_h2, filt_h3);
1828 dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1829 filt_h0, filt_h1, filt_h2, filt_h3);
1830
1831 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1832 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1833 SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1834 SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1835 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1836 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1837 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1838 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1839 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1840 out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1841 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1842 dst += (8 * dst_stride);
1843
1844 dst10_r = dst98_r;
1845 dst32_r = dst1110_r;
1846 dst54_r = dst1312_r;
1847 dst21_r = dst109_r;
1848 dst43_r = dst1211_r;
1849 dst65_r = dst1413_r;
1850 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1851 }
1852 }
1853
hevc_hv_uni_8t_16w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1854 static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
1855 int32_t src_stride,
1856 uint8_t *dst,
1857 int32_t dst_stride,
1858 const int8_t *filter_x,
1859 const int8_t *filter_y,
1860 int32_t height)
1861 {
1862 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1863 filter_x, filter_y, height, 16);
1864 }
1865
hevc_hv_uni_8t_24w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1866 static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
1867 int32_t src_stride,
1868 uint8_t *dst,
1869 int32_t dst_stride,
1870 const int8_t *filter_x,
1871 const int8_t *filter_y,
1872 int32_t height)
1873 {
1874 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1875 filter_x, filter_y, height, 24);
1876 }
1877
hevc_hv_uni_8t_32w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1878 static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
1879 int32_t src_stride,
1880 uint8_t *dst,
1881 int32_t dst_stride,
1882 const int8_t *filter_x,
1883 const int8_t *filter_y,
1884 int32_t height)
1885 {
1886 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1887 filter_x, filter_y, height, 32);
1888 }
1889
hevc_hv_uni_8t_48w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1890 static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
1891 int32_t src_stride,
1892 uint8_t *dst,
1893 int32_t dst_stride,
1894 const int8_t *filter_x,
1895 const int8_t *filter_y,
1896 int32_t height)
1897 {
1898 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1899 filter_x, filter_y, height, 48);
1900 }
1901
hevc_hv_uni_8t_64w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1902 static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
1903 int32_t src_stride,
1904 uint8_t *dst,
1905 int32_t dst_stride,
1906 const int8_t *filter_x,
1907 const int8_t *filter_y,
1908 int32_t height)
1909 {
1910 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1911 filter_x, filter_y, height, 64);
1912 }
1913
common_hz_4t_4x2_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1914 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1915 uint8_t *dst, int32_t dst_stride,
1916 const int8_t *filter)
1917 {
1918 v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1919 v16u8 out;
1920 v8i16 filt, res0;
1921
1922 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1923 src -= 1;
1924
1925 /* rearranging filter */
1926 filt = LD_SH(filter);
1927 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1928
1929 mask1 = mask0 + 2;
1930
1931 LD_SB2(src, src_stride, src0, src1);
1932 XORI_B2_128_SB(src0, src1);
1933 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1934 res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
1935 res0 = __msa_srari_h(res0, 6);
1936 res0 = __msa_sat_s_h(res0, 7);
1937 out = PCKEV_XORI128_UB(res0, res0);
1938 ST_W2(out, 0, 1, dst, dst_stride);
1939 }
1940
common_hz_4t_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1941 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1942 uint8_t *dst, int32_t dst_stride,
1943 const int8_t *filter)
1944 {
1945 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1946 v8i16 filt, out0, out1;
1947 v16u8 out;
1948
1949 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1950 src -= 1;
1951
1952 /* rearranging filter */
1953 filt = LD_SH(filter);
1954 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1955
1956 mask1 = mask0 + 2;
1957
1958 LD_SB4(src, src_stride, src0, src1, src2, src3);
1959 XORI_B4_128_SB(src0, src1, src2, src3);
1960 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1961 filt0, filt1, out0, out1);
1962 SRARI_H2_SH(out0, out1, 6);
1963 SAT_SH2_SH(out0, out1, 7);
1964 out = PCKEV_XORI128_UB(out0, out1);
1965 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1966 }
1967
common_hz_4t_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1968 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1969 uint8_t *dst, int32_t dst_stride,
1970 const int8_t *filter)
1971 {
1972 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1973 v16u8 out;
1974 v8i16 filt, out0, out1, out2, out3;
1975
1976 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1977 src -= 1;
1978
1979 /* rearranging filter */
1980 filt = LD_SH(filter);
1981 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1982
1983 mask1 = mask0 + 2;
1984
1985 LD_SB4(src, src_stride, src0, src1, src2, src3);
1986 src += (4 * src_stride);
1987
1988 XORI_B4_128_SB(src0, src1, src2, src3);
1989 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1990 filt0, filt1, out0, out1);
1991 LD_SB4(src, src_stride, src0, src1, src2, src3);
1992 XORI_B4_128_SB(src0, src1, src2, src3);
1993 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1994 filt0, filt1, out2, out3);
1995 SRARI_H4_SH(out0, out1, out2, out3, 6);
1996 SAT_SH4_SH(out0, out1, out2, out3, 7);
1997 out = PCKEV_XORI128_UB(out0, out1);
1998 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1999 out = PCKEV_XORI128_UB(out2, out3);
2000 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2001 }
2002
common_hz_4t_4x16_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)2003 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2004 uint8_t *dst, int32_t dst_stride,
2005 const int8_t *filter)
2006 {
2007 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2008 v16i8 filt0, filt1, mask0, mask1;
2009 v16u8 out;
2010 v8i16 filt, out0, out1, out2, out3;
2011
2012 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2013 src -= 1;
2014
2015 /* rearranging filter */
2016 filt = LD_SH(filter);
2017 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2018
2019 mask1 = mask0 + 2;
2020
2021 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2022 src += (8 * src_stride);
2023 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2024 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2025 filt0, filt1, out0, out1);
2026 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2027 filt0, filt1, out2, out3);
2028 SRARI_H4_SH(out0, out1, out2, out3, 6);
2029 SAT_SH4_SH(out0, out1, out2, out3, 7);
2030 out = PCKEV_XORI128_UB(out0, out1);
2031 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2032 out = PCKEV_XORI128_UB(out2, out3);
2033 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2034 dst += (8 * dst_stride);
2035
2036 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2037 src += (8 * src_stride);
2038 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2039 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2040 filt0, filt1, out0, out1);
2041 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2042 filt0, filt1, out2, out3);
2043 SRARI_H4_SH(out0, out1, out2, out3, 6);
2044 SAT_SH4_SH(out0, out1, out2, out3, 7);
2045 out = PCKEV_XORI128_UB(out0, out1);
2046 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2047 out = PCKEV_XORI128_UB(out2, out3);
2048 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2049 }
2050
common_hz_4t_4w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2051 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
2052 uint8_t *dst, int32_t dst_stride,
2053 const int8_t *filter, int32_t height)
2054 {
2055 if (2 == height) {
2056 common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2057 } else if (4 == height) {
2058 common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2059 } else if (8 == height) {
2060 common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2061 } else if (16 == height) {
2062 common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
2063 }
2064 }
2065
common_hz_4t_6w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2066 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
2067 uint8_t *dst, int32_t dst_stride,
2068 const int8_t *filter, int32_t height)
2069 {
2070 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2071 v16u8 out4, out5;
2072 v8i16 filt, out0, out1, out2, out3;
2073
2074 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2075 src -= 1;
2076
2077 /* rearranging filter */
2078 filt = LD_SH(filter);
2079 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2080
2081 mask1 = mask0 + 2;
2082
2083 LD_SB4(src, src_stride, src0, src1, src2, src3);
2084 src += (4 * src_stride);
2085
2086 XORI_B4_128_SB(src0, src1, src2, src3);
2087 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2088 filt1, out0, out1, out2, out3);
2089 SRARI_H4_SH(out0, out1, out2, out3, 6);
2090 SAT_SH4_SH(out0, out1, out2, out3, 7);
2091 out4 = PCKEV_XORI128_UB(out0, out1);
2092 out5 = PCKEV_XORI128_UB(out2, out3);
2093 ST_W2(out4, 0, 2, dst, dst_stride);
2094 ST_H2(out4, 2, 6, dst + 4, dst_stride);
2095 ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2096 ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2097 dst += (4 * dst_stride);
2098
2099 LD_SB4(src, src_stride, src0, src1, src2, src3);
2100 src += (4 * src_stride);
2101
2102 XORI_B4_128_SB(src0, src1, src2, src3);
2103 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2104 filt1, out0, out1, out2, out3);
2105 SRARI_H4_SH(out0, out1, out2, out3, 6);
2106 SAT_SH4_SH(out0, out1, out2, out3, 7);
2107 out4 = PCKEV_XORI128_UB(out0, out1);
2108 out5 = PCKEV_XORI128_UB(out2, out3);
2109 ST_W2(out4, 0, 2, dst, dst_stride);
2110 ST_H2(out4, 2, 6, dst + 4, dst_stride);
2111 ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2112 ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2113 }
2114
common_hz_4t_8x2mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2115 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
2116 uint8_t *dst, int32_t dst_stride,
2117 const int8_t *filter, int32_t height)
2118 {
2119 uint32_t loop_cnt;
2120 v16i8 src0, src1, filt0, filt1, mask0, mask1;
2121 v16u8 out;
2122 v8i16 filt, vec0, vec1, vec2, vec3;
2123
2124 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2125 src -= 1;
2126
2127 filt = LD_SH(filter);
2128 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2129
2130 mask1 = mask0 + 2;
2131
2132 for (loop_cnt = (height >> 1); loop_cnt--;) {
2133 LD_SB2(src, src_stride, src0, src1);
2134 src += (2 * src_stride);
2135
2136 XORI_B2_128_SB(src0, src1);
2137 VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2138 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2139 VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2140 DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2141 SRARI_H2_SH(vec0, vec1, 6);
2142 SAT_SH2_SH(vec0, vec1, 7);
2143 out = PCKEV_XORI128_UB(vec0, vec1);
2144 ST_D2(out, 0, 1, dst, dst_stride);
2145 dst += (2 * dst_stride);
2146 }
2147 }
2148
common_hz_4t_8x4mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2149 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2150 uint8_t *dst, int32_t dst_stride,
2151 const int8_t *filter, int32_t height)
2152 {
2153 uint32_t loop_cnt;
2154 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2155 v16u8 tmp0, tmp1;
2156 v8i16 filt, out0, out1, out2, out3;
2157
2158 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2159 src -= 1;
2160
2161 /* rearranging filter */
2162 filt = LD_SH(filter);
2163 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2164
2165 mask1 = mask0 + 2;
2166
2167 for (loop_cnt = (height >> 2); loop_cnt--;) {
2168 LD_SB4(src, src_stride, src0, src1, src2, src3);
2169 src += (4 * src_stride);
2170
2171 XORI_B4_128_SB(src0, src1, src2, src3);
2172 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2173 filt1, out0, out1, out2, out3);
2174 SRARI_H4_SH(out0, out1, out2, out3, 6);
2175 SAT_SH4_SH(out0, out1, out2, out3, 7);
2176 tmp0 = PCKEV_XORI128_UB(out0, out1);
2177 tmp1 = PCKEV_XORI128_UB(out2, out3);
2178 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2179 dst += (4 * dst_stride);
2180 }
2181 }
2182
common_hz_4t_8w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2183 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
2184 uint8_t *dst, int32_t dst_stride,
2185 const int8_t *filter, int32_t height)
2186 {
2187 if ((2 == height) || (6 == height)) {
2188 common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2189 height);
2190 } else {
2191 common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2192 height);
2193 }
2194 }
2195
common_hz_4t_12w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2196 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2197 uint8_t *dst, int32_t dst_stride,
2198 const int8_t *filter, int32_t height)
2199 {
2200 uint32_t loop_cnt;
2201 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2202 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2203 v16i8 vec10, vec11;
2204 v16u8 tmp0, tmp1;
2205 v8i16 filt, out0, out1, out2, out3, out4, out5;
2206
2207 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2208 mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2209
2210 src -= 1;
2211
2212 /* rearranging filter */
2213 filt = LD_SH(filter);
2214 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2215
2216 mask1 = mask0 + 2;
2217 mask3 = mask2 + 2;
2218
2219 for (loop_cnt = 4; loop_cnt--;) {
2220 LD_SB4(src, src_stride, src0, src1, src2, src3);
2221 src += (4 * src_stride);
2222
2223 XORI_B4_128_SB(src0, src1, src2, src3);
2224 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2225 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2226 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2227 DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2228 SRARI_H2_SH(out0, out1, 6);
2229 SAT_SH2_SH(out0, out1, 7);
2230 tmp0 = PCKEV_XORI128_UB(out0, out1);
2231 ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2232
2233 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2234 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2235 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2236 out2, out3, out4, out5);
2237 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2238 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2239 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2240 out2, out3, out4, out5);
2241 SRARI_H4_SH(out2, out3, out4, out5, 6);
2242 SAT_SH4_SH(out2, out3, out4, out5, 7);
2243 tmp0 = PCKEV_XORI128_UB(out2, out3);
2244 tmp1 = PCKEV_XORI128_UB(out4, out5);
2245 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2246 dst += (4 * dst_stride);
2247 }
2248 }
2249
common_hz_4t_16w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2250 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2251 uint8_t *dst, int32_t dst_stride,
2252 const int8_t *filter, int32_t height)
2253 {
2254 uint32_t loop_cnt;
2255 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2256 v16i8 filt0, filt1, mask0, mask1;
2257 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2258 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2259 v16u8 out;
2260
2261 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2262 src -= 1;
2263
2264 /* rearranging filter */
2265 filt = LD_SH(filter);
2266 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2267
2268 mask1 = mask0 + 2;
2269
2270 for (loop_cnt = (height >> 2); loop_cnt--;) {
2271 LD_SB4(src, src_stride, src0, src2, src4, src6);
2272 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2273 src += (4 * src_stride);
2274
2275 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2276
2277 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2278 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2279 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2280 out0, out1, out2, out3);
2281 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2282 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2283 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2284 out0, out1, out2, out3);
2285 SRARI_H4_SH(out0, out1, out2, out3, 6);
2286 SAT_SH4_SH(out0, out1, out2, out3, 7);
2287 out = PCKEV_XORI128_UB(out0, out1);
2288 ST_UB(out, dst);
2289 dst += dst_stride;
2290 out = PCKEV_XORI128_UB(out2, out3);
2291 ST_UB(out, dst);
2292 dst += dst_stride;
2293
2294 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2295 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2296 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2297 out4, out5, out6, out7);
2298 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2299 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2300 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2301 out4, out5, out6, out7);
2302 SRARI_H4_SH(out4, out5, out6, out7, 6);
2303 SAT_SH4_SH(out4, out5, out6, out7, 7);
2304 out = PCKEV_XORI128_UB(out4, out5);
2305 ST_UB(out, dst);
2306 dst += dst_stride;
2307 out = PCKEV_XORI128_UB(out6, out7);
2308 ST_UB(out, dst);
2309 dst += dst_stride;
2310 }
2311 }
2312
common_hz_4t_24w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2313 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2314 uint8_t *dst, int32_t dst_stride,
2315 const int8_t *filter, int32_t height)
2316 {
2317 uint8_t *dst1 = dst + 16;
2318 uint32_t loop_cnt;
2319 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2320 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2321 v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2322 v8i16 filt, out0, out1, out2, out3;
2323 v16u8 tmp0, tmp1;
2324
2325 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2326 src -= 1;
2327
2328 /* rearranging filter */
2329 filt = LD_SH(filter);
2330 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2331
2332 mask1 = mask0 + 2;
2333 mask00 = mask0 + 8;
2334 mask11 = mask0 + 10;
2335
2336 for (loop_cnt = 8; loop_cnt--;) {
2337 LD_SB4(src, src_stride, src0, src2, src4, src6);
2338 LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2339 src += (4 * src_stride);
2340
2341 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2342 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2343 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2344 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2345 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2346 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2347 out0, out1, out2, out3);
2348 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2349 out0, out1, out2, out3);
2350 SRARI_H4_SH(out0, out1, out2, out3, 6);
2351 SAT_SH4_SH(out0, out1, out2, out3, 7);
2352 tmp0 = PCKEV_XORI128_UB(out0, out1);
2353 ST_UB(tmp0, dst);
2354 dst += dst_stride;
2355 tmp0 = PCKEV_XORI128_UB(out2, out3);
2356 ST_UB(tmp0, dst);
2357 dst += dst_stride;
2358
2359 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2360 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2361 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2362 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2363 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2364 out0, out1, out2, out3);
2365 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2366 out0, out1, out2, out3);
2367 SRARI_H4_SH(out0, out1, out2, out3, 6);
2368 SAT_SH4_SH(out0, out1, out2, out3, 7);
2369 tmp0 = PCKEV_XORI128_UB(out0, out1);
2370 ST_UB(tmp0, dst);
2371 dst += dst_stride;
2372 tmp0 = PCKEV_XORI128_UB(out2, out3);
2373 ST_UB(tmp0, dst);
2374 dst += dst_stride;
2375
2376 /* 8 width */
2377 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2378 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2379 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2380 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2381
2382 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2383 out0, out1, out2, out3);
2384 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2385 out0, out1, out2, out3);
2386
2387 SRARI_H4_SH(out0, out1, out2, out3, 6);
2388 SAT_SH4_SH(out0, out1, out2, out3, 7);
2389 tmp0 = PCKEV_XORI128_UB(out0, out1);
2390 tmp1 = PCKEV_XORI128_UB(out2, out3);
2391 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2392 dst1 += (4 * dst_stride);
2393 }
2394 }
2395
common_hz_4t_32w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2396 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2397 uint8_t *dst, int32_t dst_stride,
2398 const int8_t *filter, int32_t height)
2399 {
2400 uint32_t loop_cnt;
2401 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2402 v16i8 filt0, filt1, mask0, mask1;
2403 v16u8 out;
2404 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2405 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2406
2407 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2408 src -= 1;
2409
2410 /* rearranging filter */
2411 filt = LD_SH(filter);
2412 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2413
2414 mask1 = mask0 + 2;
2415
2416 for (loop_cnt = (height >> 1); loop_cnt--;) {
2417 src0 = LD_SB(src);
2418 src1 = LD_SB(src + 8);
2419 src2 = LD_SB(src + 16);
2420 src3 = LD_SB(src + 24);
2421 src += src_stride;
2422 src4 = LD_SB(src);
2423 src5 = LD_SB(src + 8);
2424 src6 = LD_SB(src + 16);
2425 src7 = LD_SB(src + 24);
2426 src += src_stride;
2427
2428 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2429
2430 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2431 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2432 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2433 out0, out1, out2, out3);
2434 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2435 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2436 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2437 out0, out1, out2, out3);
2438
2439 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2440 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2441 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2442 out4, out5, out6, out7);
2443 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2444 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2445 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2446 out4, out5, out6, out7);
2447 SRARI_H4_SH(out0, out1, out2, out3, 6);
2448 SRARI_H4_SH(out4, out5, out6, out7, 6);
2449 SAT_SH4_SH(out0, out1, out2, out3, 7);
2450 SAT_SH4_SH(out4, out5, out6, out7, 7);
2451 out = PCKEV_XORI128_UB(out0, out1);
2452 ST_UB(out, dst);
2453 out = PCKEV_XORI128_UB(out2, out3);
2454 ST_UB(out, dst + 16);
2455 dst += dst_stride;
2456 out = PCKEV_XORI128_UB(out4, out5);
2457 ST_UB(out, dst);
2458 out = PCKEV_XORI128_UB(out6, out7);
2459 ST_UB(out, dst + 16);
2460 dst += dst_stride;
2461 }
2462 }
2463
common_vt_4t_4x2_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)2464 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2465 uint8_t *dst, int32_t dst_stride,
2466 const int8_t *filter)
2467 {
2468 v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2469 v16i8 src2110, src4332, filt0, filt1;
2470 v16u8 out;
2471 v8i16 filt, out10;
2472
2473 src -= src_stride;
2474
2475 filt = LD_SH(filter);
2476 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2477
2478 LD_SB3(src, src_stride, src0, src1, src2);
2479 src += (3 * src_stride);
2480
2481 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2482 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2483 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2484 LD_SB2(src, src_stride, src3, src4);
2485 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2486 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2487 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2488 out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2489 out10 = __msa_srari_h(out10, 6);
2490 out10 = __msa_sat_s_h(out10, 7);
2491 out = PCKEV_XORI128_UB(out10, out10);
2492 ST_W2(out, 0, 1, dst, dst_stride);
2493 }
2494
common_vt_4t_4x4multiple_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2495 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2496 uint8_t *dst, int32_t dst_stride,
2497 const int8_t *filter, int32_t height)
2498 {
2499 uint32_t loop_cnt;
2500 v16i8 src0, src1, src2, src3, src4, src5;
2501 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2502 v16i8 src2110, src4332, filt0, filt1;
2503 v8i16 filt, out10, out32;
2504 v16u8 out;
2505
2506 src -= src_stride;
2507
2508 filt = LD_SH(filter);
2509 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2510
2511 LD_SB3(src, src_stride, src0, src1, src2);
2512 src += (3 * src_stride);
2513
2514 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2515
2516 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2517 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2518
2519 for (loop_cnt = (height >> 2); loop_cnt--;) {
2520 LD_SB3(src, src_stride, src3, src4, src5);
2521 src += (3 * src_stride);
2522 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2523 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2524 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2525 out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2526
2527 src2 = LD_SB(src);
2528 src += (src_stride);
2529 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2530 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2531 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2532 out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2533 SRARI_H2_SH(out10, out32, 6);
2534 SAT_SH2_SH(out10, out32, 7);
2535 out = PCKEV_XORI128_UB(out10, out32);
2536 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2537 dst += (4 * dst_stride);
2538 }
2539 }
2540
common_vt_4t_4w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2541 static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2542 uint8_t *dst, int32_t dst_stride,
2543 const int8_t *filter, int32_t height)
2544 {
2545 if (2 == height) {
2546 common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2547 } else {
2548 common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2549 height);
2550 }
2551 }
2552
common_vt_4t_6w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2553 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2554 uint8_t *dst, int32_t dst_stride,
2555 const int8_t *filter, int32_t height)
2556 {
2557 v16u8 out0, out1;
2558 v16i8 src0, src1, src2, src3, src4, src5, src6;
2559 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2560 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2561
2562 src -= src_stride;
2563
2564 filter_vec = LD_SH(filter);
2565 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2566
2567 LD_SB3(src, src_stride, src0, src1, src2);
2568 src += (3 * src_stride);
2569 XORI_B3_128_SB(src0, src1, src2);
2570 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2571
2572 LD_SB2(src, src_stride, src3, src4);
2573 src += (2 * src_stride);
2574 XORI_B2_128_SB(src3, src4);
2575 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2576
2577 dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2578 dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2579
2580 LD_SB2(src, src_stride, src5, src6);
2581 src += (2 * src_stride);
2582 XORI_B2_128_SB(src5, src6);
2583 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2584
2585 dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2586 dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2587
2588 SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2589 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2590 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2591 out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2592 ST_W2(out0, 0, 2, dst, dst_stride);
2593 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2594 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2595 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2596 dst += (4 * dst_stride);
2597
2598 LD_SB2(src, src_stride, src3, src4);
2599 src += (2 * src_stride);
2600 XORI_B2_128_SB(src3, src4);
2601 ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2602
2603 dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2604 dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2605
2606 LD_SB2(src, src_stride, src5, src6);
2607 src += (2 * src_stride);
2608 XORI_B2_128_SB(src5, src6);
2609 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2610
2611 dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2612 dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2613
2614 SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2615 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2616 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2617 out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2618 ST_W2(out0, 0, 2, dst, dst_stride);
2619 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2620 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2621 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2622 }
2623
common_vt_4t_8x2_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)2624 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2625 uint8_t *dst, int32_t dst_stride,
2626 const int8_t *filter)
2627 {
2628 v16i8 src0, src1, src2, src3, src4;
2629 v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2630 v16u8 out;
2631
2632 src -= src_stride;
2633
2634 /* rearranging filter_y */
2635 filt = LD_SH(filter);
2636 SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2637
2638 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2639 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2640 ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2641 tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2642 ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2643 tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2644 SRARI_H2_SH(tmp0, tmp1, 6);
2645 SAT_SH2_SH(tmp0, tmp1, 7);
2646 out = PCKEV_XORI128_UB(tmp0, tmp1);
2647 ST_D2(out, 0, 1, dst, dst_stride);
2648 }
2649
common_vt_4t_8x6_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)2650 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2651 uint8_t *dst, int32_t dst_stride,
2652 const int8_t *filter)
2653 {
2654 uint32_t loop_cnt;
2655 uint64_t out0, out1, out2;
2656 v16i8 src0, src1, src2, src3, src4, src5;
2657 v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2658 v8i16 filt, filt0, filt1;
2659
2660 src -= src_stride;
2661
2662 /* rearranging filter_y */
2663 filt = LD_SH(filter);
2664 SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2665
2666 LD_SB3(src, src_stride, src0, src1, src2);
2667 src += (3 * src_stride);
2668
2669 XORI_B3_128_SB(src0, src1, src2);
2670 ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2671
2672 for (loop_cnt = 2; loop_cnt--;) {
2673 LD_SB3(src, src_stride, src3, src4, src5);
2674 src += (3 * src_stride);
2675
2676 XORI_B3_128_SB(src3, src4, src5);
2677 ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2678 tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2679 tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2680 tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2681 SRARI_H2_SH(tmp0, tmp1, 6);
2682 tmp2 = __msa_srari_h(tmp2, 6);
2683 SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2684 PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2685 XORI_B2_128_SH(tmp0, tmp2);
2686
2687 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2688 out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2689 out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2690 SD(out0, dst);
2691 dst += dst_stride;
2692 SD(out1, dst);
2693 dst += dst_stride;
2694 SD(out2, dst);
2695 dst += dst_stride;
2696
2697 src2 = src5;
2698 vec0 = vec3;
2699 vec2 = vec4;
2700 }
2701 }
2702
common_vt_4t_8x4mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2703 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2704 uint8_t *dst, int32_t dst_stride,
2705 const int8_t *filter, int32_t height)
2706 {
2707 uint32_t loop_cnt;
2708 v16i8 src0, src1, src2, src7, src8, src9, src10;
2709 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2710 v16u8 tmp0, tmp1;
2711 v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2712
2713 src -= src_stride;
2714
2715 filt = LD_SH(filter);
2716 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2717
2718 LD_SB3(src, src_stride, src0, src1, src2);
2719 src += (3 * src_stride);
2720
2721 XORI_B3_128_SB(src0, src1, src2);
2722 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2723
2724 for (loop_cnt = (height >> 2); loop_cnt--;) {
2725 LD_SB4(src, src_stride, src7, src8, src9, src10);
2726 src += (4 * src_stride);
2727
2728 XORI_B4_128_SB(src7, src8, src9, src10);
2729 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2730 src72_r, src87_r, src98_r, src109_r);
2731 out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2732 out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2733 out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2734 out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2735 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2736 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2737 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2738 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2739 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2740 dst += (4 * dst_stride);
2741
2742 src10_r = src98_r;
2743 src21_r = src109_r;
2744 src2 = src10;
2745 }
2746 }
2747
common_vt_4t_8w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2748 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2749 uint8_t *dst, int32_t dst_stride,
2750 const int8_t *filter, int32_t height)
2751 {
2752 if (2 == height) {
2753 common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2754 } else if (6 == height) {
2755 common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2756 } else {
2757 common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2758 filter, height);
2759 }
2760 }
2761
common_vt_4t_12w_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)2762 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2763 uint8_t *dst, int32_t dst_stride,
2764 const int8_t *filter, int32_t height)
2765 {
2766 uint32_t loop_cnt;
2767 v16i8 src0, src1, src2, src3, src4, src5, src6;
2768 v16u8 out0, out1;
2769 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2770 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2771 v16i8 src2110, src4332, src6554;
2772 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2773 v8i16 filter_vec;
2774
2775 src -= (1 * src_stride);
2776
2777 filter_vec = LD_SH(filter);
2778 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2779
2780 LD_SB3(src, src_stride, src0, src1, src2);
2781 src += (3 * src_stride);
2782
2783 XORI_B3_128_SB(src0, src1, src2);
2784 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2785 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2786 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2787
2788 for (loop_cnt = 4; loop_cnt--;) {
2789 LD_SB4(src, src_stride, src3, src4, src5, src6);
2790 src += (4 * src_stride);
2791
2792 XORI_B4_128_SB(src3, src4, src5, src6);
2793 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2794 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2795 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2796 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2797 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2798 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2799
2800 dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2801 dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2802 dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2803 dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2804 dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2805 dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2806
2807 SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2808 SRARI_H2_SH(dst0_l, dst1_l, 6);
2809 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2810 SAT_SH2_SH(dst0_l, dst1_l, 7);
2811 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2812 out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2813 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2814 out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2815 ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
2816 dst += (4 * dst_stride);
2817
2818 src2 = src6;
2819 src10_r = src54_r;
2820 src21_r = src65_r;
2821 src2110 = src6554;
2822 }
2823 }
2824
static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_l, src43_l, src54_l, src65_l);
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
        out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
        out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
        src2 = src6;
    }
}

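/* Vertical 4-tap filter for 24-wide blocks, split into a 16-wide and an
 * 8-wide column; the fixed loop count of 8 covers 4 rows per iteration,
 * so the height argument is effectively fixed at 32 here. */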
static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, filt0, filt1;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
    v16u8 out;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 16 width */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* 8 width */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = 8; loop_cnt--;) {
        /* 16 width */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 8 width */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        /* 16 width */
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);

        /* 8 width */
        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);

        /* 16 + 8 width */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
        XORI_B2_128_SH(out2_r, out3_r);
        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
        SD(out0, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        SD(out1, dst + 16);
        dst += dst_stride;

        /* 16 width */
        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        /* 8 width */
        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        /* 16 width */
        out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);

        /* 8 width */
        out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);

        /* 16 + 8 width */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2_r, out2_r);
        ST_D1(out, 0, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out3_r, out3_r);
        ST_D1(out, 0, dst + 16);
        dst += dst_stride;
    }
}

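/* Vertical 4-tap filter for 32-wide blocks: two independent 16-wide
 * columns, two rows per iteration. */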
static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filt;
    v16i8 filt0, filt1;
    v16u8 out;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 16 width */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* next 16 width */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);

    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* 16 width */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 16 width */
        out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
        out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
        out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
        out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);

        /* 16 width */
        SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst + dst_stride);

        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        /* next 16 width */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        /* next 16 width */
        out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
        out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
        out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
        out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);

        /* next 16 width */
        SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
        SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
        out = PCKEV_XORI128_UB(out2_r, out2_l);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out3_r, out3_l);
        ST_UB(out, dst + 16 + dst_stride);

        dst += 2 * dst_stride;

        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}

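/* HEVC uni HV 4-tap x 4-tap interpolation for a 4x2 block: the
 * horizontal pass keeps 16-bit intermediates, the vertical pass
 * accumulates in 32 bits, then the result is rounded, saturated and
 * packed back to unsigned bytes. */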
static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, tmp;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v4i32 dst0, dst1;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);

    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst0 >>= 6;
    dst1 >>= 6;
    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp = __msa_srari_h(tmp, 6);
    tmp = __msa_sat_s_h(tmp, 7);
    out = PCKEV_XORI128_UB(tmp, tmp);
    ST_W2(out, 0, 1, dst, dst_stride);
}

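/* Same HV scheme for a 4x4 block, built from seven source rows. */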
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec, tmp0, tmp1;
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
    SRARI_H2_SH(tmp0, tmp1, 6);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

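/* 4-wide HV filter for any height that is a multiple of 8. */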
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

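/* Dispatch 4-wide HV cases by height (2, 4, or a multiple of 8). */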
static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (0 == (height % 8)) {
        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height);
    }
}

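/* 6-wide HV filter for a fixed 8-row block: the left 4 columns are
 * stored with ST_W8 and the remaining 2 columns with ST_H8. */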
static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
    v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);

    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
    SRARI_H2_SH(tmp4, tmp5, 6);
    SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    SAT_SH2_SH(tmp4, tmp5, 7);
    out0 = PCKEV_XORI128_UB(tmp0, tmp1);
    out1 = PCKEV_XORI128_UB(tmp2, tmp3);
    out2 = PCKEV_XORI128_UB(tmp4, tmp5);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}

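/* 8x2 HV filter: five source rows, two output rows. */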
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
    SRARI_H2_SH(out0_r, out1_r, 6);
    SAT_SH2_SH(out0_r, out1_r, 7);
    out = PCKEV_XORI128_UB(out0_r, out1_r);
    ST_D2(out, 0, 1, dst, dst_stride);
}

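/* HV filter for (width8mult * 8)-wide, 4-row blocks. */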
static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t width8mult)
{
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += 8;
    }
}

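/* 8x6 HV filter: nine source rows processed in one shot. */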
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
    SRARI_H2_SH(out4_r, out5_r, 6);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH2_SH(out4_r, out5_r, 7);
    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
    out2 = PCKEV_XORI128_UB(out4_r, out5_r);

    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}

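/* Generic HV kernel for widths that are a multiple of 8 and heights
 * that are a multiple of 4; also backs the 16/24/32-wide wrappers. */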
static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height,
                                           int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        out0_r, out1_r, out2_r, out3_r);

            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            out0 = PCKEV_XORI128_UB(out0_r, out1_r);
            out1 = PCKEV_XORI128_UB(out2_r, out3_r);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src += 8;
        dst += 8;
    }
}

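/* Dispatch 8-wide HV cases by height (2, 4, 6, or a multiple of 4). */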
static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (0 == (height % 4)) {
        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 1);
    }
}

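/* 12-wide HV filter, fixed 16-row height: an 8-wide column (4 rows per
 * iteration) followed by a 4-wide column (8 rows per iteration). */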
static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    src_tmp = src;
    dst_tmp = dst;

    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    src += 8;
    dst += 8;

    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

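/* 16-wide HV dispatch and the 24/32-wide wrappers, all built on the
 * 8-multiple kernels. */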
static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    if (4 == height) {
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
                                   filter_y, 2);
    } else {
        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 2);
    }
}

static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 3);
}

static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 4);
}

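/* Glue: emits the ff_hevc_put_hevc_uni_pel_pixels<W>_8_msa entry points
 * on top of the copy kernels; mx, my and width are unused for copies. */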
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}

UNI_MC_COPY(8);
UNI_MC_COPY(12);
UNI_MC_COPY(16);
UNI_MC_COPY(24);
UNI_MC_COPY(32);
UNI_MC_COPY(48);
UNI_MC_COPY(64);

#undef UNI_MC_COPY

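/* Glue for the horizontal/vertical entry points: FILT_DIR picks mx or
 * my as the fractional-phase index into the ff_hevc_*_filters table. */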
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                           \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,           \
                                                       ptrdiff_t dst_stride,   \
                                                       uint8_t *src,           \
                                                       ptrdiff_t src_stride,   \
                                                       int height,             \
                                                       intptr_t mx,            \
                                                       intptr_t my,            \
                                                       int width)              \
{                                                                              \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];              \
                                                                               \
    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
                                            filter, height);                   \
}

UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);

UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);

UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);

UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);

#undef UNI_MC

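/* Glue for the HV entry points: mx and my select the horizontal and
 * vertical filter phases independently. */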
#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
                                                                           \
    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
                                        filter_x, filter_y, height);       \
}

UNI_MC_HV(qpel, 4, 8);
UNI_MC_HV(qpel, 8, 8);
UNI_MC_HV(qpel, 12, 8);
UNI_MC_HV(qpel, 16, 8);
UNI_MC_HV(qpel, 24, 8);
UNI_MC_HV(qpel, 32, 8);
UNI_MC_HV(qpel, 48, 8);
UNI_MC_HV(qpel, 64, 8);

UNI_MC_HV(epel, 4, 4);
UNI_MC_HV(epel, 6, 4);
UNI_MC_HV(epel, 8, 4);
UNI_MC_HV(epel, 12, 4);
UNI_MC_HV(epel, 16, 4);
UNI_MC_HV(epel, 24, 4);
UNI_MC_HV(epel, 32, 4);

#undef UNI_MC_HV