/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

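/* VSHF_B shuffle masks that gather the overlapping byte pairs fed to the
 * 8-tap horizontal dot products: the first row serves blocks 8 pixels and
 * wider, the second row the 4-wide case (indices 16+ select bytes from the
 * second source vector). */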
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

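/* Bi-prediction rounding: add the filtered intermediate (vec) to the
 * co-located reference intermediate (in), round-shift by rnd_val and
 * clip to the 8-bit range [0, 255]. */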
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
{                                                                     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
}

#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                         \
                          vec0, vec1, vec2, vec3, rnd_val,            \
                          out0, out1, out2, out3)                     \
{                                                                     \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);     \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);     \
}

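/* Note: with the current CLIP_SH*_0_255 macros the _MAX_SATU variants
 * below expand to exactly the same add/round/clip sequence as the plain
 * HEVC_BI_RND_CLIP* macros above. */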
#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val,          \
                                   out0, out1)                             \
{                                                                          \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                         \
    SRARI_H2_SH(out0, out1, rnd_val);                                      \
    CLIP_SH2_0_255(out0, out1);                                            \
}

#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,   \
                                   vec3, rnd_val, out0, out1, out2, out3)  \
{                                                                          \
    HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \
    HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \
}

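/* Bi-predictive copy for 4-wide blocks: the 8-bit reference is
 * zero-extended and shifted left by 6 to match the 16-bit HEVC
 * intermediate precision, added to the second-list intermediates from
 * src1_ptr, then rounded by 7 and clipped. */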
static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v16i8 zero = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3;

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;
        dst0 += in0;
        dst0 = __msa_srari_h(dst0, 7);
        CLIP_SH_0_255(dst0);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(dst0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

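/* Bi-predictive copy for 6-wide blocks (height a multiple of 8); each
 * row is stored as a 4-byte word plus a 2-byte halfword. */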
static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
        ST_W2(out2, 0, 2, dst, dst_stride);
        ST_H2(out2, 2, 6, dst + 4, dst_stride);
        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}

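/* Bi-predictive copy for 8-wide blocks, with dedicated paths for
 * heights 2, 4 and 6 and a loop for multiples of 8. */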
static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
                                int32_t src_stride,
                                int16_t *src1_ptr,
                                int32_t src2_stride,
                                uint8_t *dst,
                                int32_t dst_stride,
                                int32_t height)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16i8 zero = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 8) {
        uint32_t loop_cnt;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src2);
            INSERT_D2_SB(tp2, tp3, src3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
                   in7);
            src1_ptr += (8 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
                                       dst7, 7, dst4, dst5, dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst,
                  dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

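/* Bi-predictive copy for 12-wide blocks: the left 8 and right 4 columns
 * are combined separately, 4 rows per iteration (fixed 16-row count). */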
static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
                   dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

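/* Bi-predictive copy for 16-wide blocks, 4 rows per iteration. */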
static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v16i8 zero = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
        ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
        ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
        ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
        SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
                                   dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
                                   dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
        PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
        PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

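/* Bi-predictive copy for 24-wide blocks: one 16-byte plus one 8-byte
 * store per row, 4 rows per iteration (fixed 32-row count). */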
static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

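/* Bi-predictive copy for 32-wide blocks, 2 rows per iteration. */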
static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

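/* Bi-predictive copy for 48-wide blocks, 2 rows per iteration. */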
static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SB3(src0_ptr, 16, src3, src4, src5);
        src0_ptr += src_stride;

        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;
        LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);

        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
        dst += dst_stride;
    }
}

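/* Bi-predictive copy for 64-wide blocks, 1 row per iteration. */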
static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);

        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
    }
}

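/* Horizontal 8-tap bi-predictive filter for 4-wide blocks. The source
 * bytes are flipped to the signed range with XORI 128 before the signed
 * dot products; since the HEVC 8-tap coefficients sum to 64, this biases
 * each result by -128 * 64 = -(128 << 6), which const_vec compensates. */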
static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
               src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

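/* Horizontal 8-tap bi-predictive filter for 8-wide blocks, 4 rows per
 * iteration. */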
static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

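/* Horizontal 8-tap bi-predictive filter for 12-wide blocks: the 8-wide
 * masks cover the left columns while mask4..mask7 gather the right 4
 * columns of both rows into one vector (fixed 16-row count). */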
static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    int32_t tmp0, tmp1;
    int64_t tmp2, tmp3;
    v16i8 src0, src1, src2, src3;
    v16i8 vec0, vec1, vec2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;

        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);

        in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(in2, dst2);
        dst2 = __msa_srari_h(dst2, 7);
        CLIP_SH_0_255(dst2);
        PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);

        tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
        tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
        tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
        tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
        SD(tmp2, dst);
        SW(tmp0, dst + 8);
        dst += dst_stride;
        SD(tmp3, dst);
        SW(tmp1, dst + 8);
        dst += dst_stride;
    }
}

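/* Horizontal 8-tap bi-predictive filter for 16-wide blocks, 2 rows per
 * iteration. */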
static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

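/* Horizontal 8-tap bi-predictive filter for 24-wide blocks, 1 row per
 * iteration; mask4..mask7 pick the bytes straddling the two 16-byte
 * source vectors. */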
static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    v16i8 src0, src1, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr = src0_ptr - 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
                     dst1, dst2, dst0);
        VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
                     dst2, dst0, dst1);
        VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
                     dst0, dst1, dst2);

        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(dst2, in2);
        dst2 = __msa_srari_h(dst2, 7);
        CLIP_SH_0_255(dst2);

        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
        ST_SB(tmp0, dst);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }
}

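/* Horizontal 8-tap bi-predictive filter for 32-wide blocks, 1 row per
 * iteration. */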
static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);
        dst += dst_stride;
    }
}

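/* Horizontal 8-tap bi-predictive filter for 48-wide blocks; note the
 * loop runs a fixed 64 iterations (rows) regardless of the height
 * argument. */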
static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16i8 tmp0, tmp1, tmp2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src3 = LD_SB(src0_ptr + 40);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB(tmp0, dst);
        ST_SB(tmp1, dst + 16);

        LD_SH2(src1_ptr + 32, 8, in4, in5);
        src1_ptr += src2_stride;

        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
                     dst5, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
                     dst5, dst4, dst5);

        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_SB(tmp2, dst + 32);
        dst += dst_stride;
    }
}

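/* Horizontal 8-tap bi-predictive filter for 64-wide blocks: each row is
 * handled as two 32-wide halves sharing the loaded mask set. */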
static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= 3;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SB2(src0_ptr + 32, 16, src3, src4);
        src5 = LD_SB(src0_ptr + 56);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);

        src0 = src3;
        src1 = src4;
        src2 = src5;

        LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);
        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst + 32, 16);
        src1_ptr += src2_stride;
        src0_ptr += src_stride;
        dst += dst_stride;
    }
}

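/* Vertical 8-tap bi-predictive filter for 4-wide blocks: seven history
 * rows are kept as interleaved pairs in a sliding window, and each batch
 * of output rows accumulates four DPADD_SB dot products (filt0..filt3)
 * over those pairs. */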
static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        dst10 = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        dst76 = const_vec;
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

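/* Vertical 8-tap bi-predictive filter for 8-wide blocks, 4 rows per
 * iteration with a sliding window of right-interleaved row pairs. */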
static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter,
                                 int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;

        src6 = src10;
    }
}

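/* Vertical 8-tap bi-predictive filter for 12-wide blocks:
 * right-interleaved pairs cover the left 8 columns and left-interleaved
 * pairs the remaining 4 columns. */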
static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter,
                                  int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        dst0_l = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}

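/* Vertical 8-tap bi-predictive filter for widths that are multiples of
 * 16: the block is processed in 16-column stripes, 2 rows per
 * iteration. */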
static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
                                           int32_t src_stride,
                                           int16_t *src1_ptr,
                                           int32_t src2_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter,
                                           int32_t height, int32_t width)
{
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt;
    uint32_t cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        dst_tmp = dst;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);
            XORI_B2_128_SB(src7, src8);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            dst0_r = const_vec;
            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            dst1_r = const_vec;
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            dst0_l = const_vec;
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            dst1_l = const_vec;
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);

            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
                              dst0_r, dst1_r, dst0_l, dst1_l);

            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            src10_r = src32_r;
            src32_r = src54_r;
            src54_r = src76_r;
            src21_r = src43_r;
            src43_r = src65_r;
            src65_r = src87_r;
            src10_l = src32_l;
            src32_l = src54_l;
            src54_l = src76_l;
            src21_l = src43_l;
            src43_l = src65_l;
            src65_l = src87_l;
            src6 = src8;
        }

        src0_ptr += 16;
        src1_ptr += 16;
        dst += 16;
    }
}

1552 static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
1553 int32_t src_stride,
1554 int16_t *src1_ptr,
1555 int32_t src2_stride,
1556 uint8_t *dst,
1557 int32_t dst_stride,
1558 const int8_t *filter,
1559 int32_t height)
1560 {
1561 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1562 dst, dst_stride, filter, height, 16);
1563 }
1564
1565 static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
1566 int32_t src_stride,
1567 int16_t *src1_ptr,
1568 int32_t src2_stride,
1569 uint8_t *dst,
1570 int32_t dst_stride,
1571 const int8_t *filter,
1572 int32_t height)
1573 {
1574 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1575 dst, dst_stride, filter, height, 16);
1576 hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
1577 dst + 16, dst_stride, filter, height);
1578 }
1579
1580 static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
1581 int32_t src_stride,
1582 int16_t *src1_ptr,
1583 int32_t src2_stride,
1584 uint8_t *dst,
1585 int32_t dst_stride,
1586 const int8_t *filter,
1587 int32_t height)
1588 {
1589 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1590 dst, dst_stride, filter, height, 32);
1591 }
1592
1593 static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
1594 int32_t src_stride,
1595 int16_t *src1_ptr,
1596 int32_t src2_stride,
1597 uint8_t *dst,
1598 int32_t dst_stride,
1599 const int8_t *filter,
1600 int32_t height)
1601 {
1602 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1603 dst, dst_stride, filter, height, 48);
1604 }
1605
1606 static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
1607 int32_t src_stride,
1608 int16_t *src1_ptr,
1609 int32_t src2_stride,
1610 uint8_t *dst,
1611 int32_t dst_stride,
1612 const int8_t *filter,
1613 int32_t height)
1614 {
1615 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1616 dst, dst_stride, filter, height, 64);
1617 }
1618
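/*
 * 4-wide 2D (horizontal then vertical) 8-tap bi-prediction.  The
 * second mask set in ff_hevc_mask_arr (offset 16) shuffles across a
 * register pair, so each horizontal HEVC_FILT_8TAP_SH call covers
 * pixels from two source rows at once.  The vertical taps then run on
 * the interleaved 16-bit intermediates, and the 32-bit results are
 * rescaled with >> 6 before the usual bi-prediction rounding
 * (add src1, +64, >> 7, clip to 0..255).
 */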
1619 static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
1620 int32_t src_stride,
1621 int16_t *src1_ptr,
1622 int32_t src2_stride,
1623 uint8_t *dst,
1624 int32_t dst_stride,
1625 const int8_t *filter_x,
1626 const int8_t *filter_y,
1627 int32_t height)
1628 {
1629 uint32_t loop_cnt;
1630 uint64_t tp0, tp1;
1631 v16u8 out;
1632 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1633 v8i16 in0 = { 0 }, in1 = { 0 };
1634 v8i16 filt0, filt1, filt2, filt3;
1635 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1636 v16i8 mask1, mask2, mask3;
1637 v8i16 filter_vec, const_vec;
1638 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1639 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1640 v8i16 out0, out1;
1641 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1642 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1643 v4i32 dst0, dst1, dst2, dst3;
1644 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1645
1646 src0_ptr -= ((3 * src_stride) + 3);
1647 filter_vec = LD_SH(filter_x);
1648 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1649
1650 filter_vec = LD_SH(filter_y);
1651 UNPCK_R_SB_SH(filter_vec, filter_vec);
1652
1653 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1654
1655 mask1 = mask0 + 2;
1656 mask2 = mask0 + 4;
1657 mask3 = mask0 + 6;
1658
1659 const_vec = __msa_ldi_h(128);
1660 const_vec <<= 6;
1661
1662 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1663 src0_ptr += (7 * src_stride);
1664 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1665
1666 /* row 0 row 1 row 2 row 3 */
1667 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1668 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1669 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1670 vec8, vec9, vec10, vec11);
1671 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1672 vec12, vec13, vec14, vec15);
1673
1674 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1675 filt3);
1676 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1677 filt3);
1678 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1679 filt3);
1680 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1681 filt3);
1682
1683 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
1684 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
1685 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
1686
1687 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1688
1689 for (loop_cnt = height >> 2; loop_cnt--;) {
1690 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1691 src0_ptr += (4 * src_stride);
1692 XORI_B4_128_SB(src7, src8, src9, src10);
1693
1694 LD2(src1_ptr, src2_stride, tp0, tp1);
1695 INSERT_D2_SH(tp0, tp1, in0);
1696 src1_ptr += (2 * src2_stride);
1697 LD2(src1_ptr, src2_stride, tp0, tp1);
1698 INSERT_D2_SH(tp0, tp1, in1);
1699 src1_ptr += (2 * src2_stride);
1700
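        /* horizontal 8-tap pass on the four freshly loaded rows; each
         * shuffle pairs two rows (7 with 9, 8 with 10) in one vector */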
1701 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1702 vec0, vec1, vec2, vec3);
1703 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1704 vec4, vec5, vec6, vec7);
1705 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1706 filt3);
1707 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1708 filt3);
1709
1710 dst76 = __msa_ilvr_h(dst97, dst66);
1711 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
1712 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1713 dst98 = __msa_ilvr_h(dst66, dst108);
1714
1715 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
1716 filt_h2, filt_h3);
1717 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
1718 filt_h2, filt_h3);
1719 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
1720 filt_h2, filt_h3);
1721 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
1722 filt_h2, filt_h3);
1723
1724 SRA_4V(dst0, dst1, dst2, dst3, 6);
1725 PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1);
1726 ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
1727 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
1728 SRARI_H2_SH(out0, out1, 7);
1729 CLIP_SH2_0_255(out0, out1);
1730 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1731 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1732 dst += (4 * dst_stride);
1733
1734 dst10 = dst54;
1735 dst32 = dst76;
1736 dst54 = dst98;
1737 dst21 = dst65;
1738 dst43 = dst87;
1739 dst65 = dst109;
1740 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1741 }
1742 }
1743
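/*
 * 2D 8-tap bi-prediction for widths that are multiples of 8.  For each
 * 8-column stripe the horizontal pass is run once on the seven priming
 * rows; the per-row loop then filters a single new row horizontally,
 * applies the vertical taps to the eight cached 16-bit rows, and
 * rotates the cache (dst0 <- dst1 <- ... <- dst7).
 */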
1744 static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
1745 int32_t src_stride,
1746 int16_t *src1_ptr,
1747 int32_t src2_stride,
1748 uint8_t *dst,
1749 int32_t dst_stride,
1750 const int8_t *filter_x,
1751 const int8_t *filter_y,
1752 int32_t height, int32_t width)
1753 {
1754 uint32_t loop_cnt;
1755 uint32_t cnt;
1756 uint8_t *src0_ptr_tmp;
1757 int16_t *src1_ptr_tmp;
1758 uint8_t *dst_tmp;
1759 v16u8 out;
1760 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1761 v8i16 in0, tmp;
1762 v8i16 filt0, filt1, filt2, filt3;
1763 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1764 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1765 v16i8 mask1, mask2, mask3;
1766 v8i16 filter_vec, const_vec;
1767 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1768 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1769 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1770 v4i32 dst0_r, dst0_l;
1771 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1772 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1773
1774 src0_ptr -= ((3 * src_stride) + 3);
1775 const_vec = __msa_ldi_h(128);
1776 const_vec <<= 6;
1777
1778 filter_vec = LD_SH(filter_x);
1779 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1780
1781 filter_vec = LD_SH(filter_y);
1782 UNPCK_R_SB_SH(filter_vec, filter_vec);
1783
1784 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1785
1786 mask1 = mask0 + 2;
1787 mask2 = mask0 + 4;
1788 mask3 = mask0 + 6;
1789
1790 for (cnt = width >> 3; cnt--;) {
1791 src0_ptr_tmp = src0_ptr;
1792 dst_tmp = dst;
1793 src1_ptr_tmp = src1_ptr;
1794
1795 LD_SB7(src0_ptr_tmp, src_stride,
1796 src0, src1, src2, src3, src4, src5, src6);
1797 src0_ptr_tmp += (7 * src_stride);
1798 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1799
1800 /* row 0 row 1 row 2 row 3 */
1801 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1802 vec0, vec1, vec2, vec3);
1803 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1804 vec4, vec5, vec6, vec7);
1805 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1806 vec8, vec9, vec10, vec11);
1807 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1808 vec12, vec13, vec14, vec15);
1809 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1810 filt3);
1811 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1812 filt3);
1813 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1814 filt3);
1815 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1816 filt2, filt3);
1817
1818 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1819 vec0, vec1, vec2, vec3);
1820 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1821 vec4, vec5, vec6, vec7);
1822 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1823 vec8, vec9, vec10, vec11);
1824 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1825 filt3);
1826 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1827 filt3);
1828 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1829 filt3);
1830
1831 for (loop_cnt = height; loop_cnt--;) {
1832 src7 = LD_SB(src0_ptr_tmp);
1833 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1834 src0_ptr_tmp += src_stride;
1835
1836 in0 = LD_SH(src1_ptr_tmp);
1837 src1_ptr_tmp += src2_stride;
1838
1839 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1840 vec0, vec1, vec2, vec3);
1841 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1842 filt2, filt3);
1843 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1844 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1845 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1846 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1847 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1848 filt_h0, filt_h1, filt_h2, filt_h3);
1849 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1850 filt_h0, filt_h1, filt_h2, filt_h3);
1851 dst0_r >>= 6;
1852 dst0_l >>= 6;
1853
1854 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1855 ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
1856 tmp = __msa_srari_h(tmp, 7);
1857 CLIP_SH_0_255(tmp);
1858 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
1859 ST_D1(out, 0, dst_tmp);
1860 dst_tmp += dst_stride;
1861
1862 dst0 = dst1;
1863 dst1 = dst2;
1864 dst2 = dst3;
1865 dst3 = dst4;
1866 dst4 = dst5;
1867 dst5 = dst6;
1868 dst6 = dst7;
1869 }
1870
1871 src0_ptr += 8;
1872 dst += 8;
1873 src1_ptr += 8;
1874 }
1875 }
1876
1877 static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
1878 int32_t src_stride,
1879 int16_t *src1_ptr,
1880 int32_t src2_stride,
1881 uint8_t *dst,
1882 int32_t dst_stride,
1883 const int8_t *filter_x,
1884 const int8_t *filter_y,
1885 int32_t height)
1886 {
1887 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1888 dst, dst_stride, filter_x, filter_y,
1889 height, 8);
1890 }
1891
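/*
 * 12-wide 2D 8-tap bi-prediction, split as an 8-wide column (one row
 * per iteration, like the 8mult path above) plus a 4-wide column that
 * uses the two-row shuffle masks.  The loop counts (16, and 4 passes
 * of 4 rows) assume height == 16, the only height paired with 12-wide
 * HEVC blocks.
 */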
1892 static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
1893 int32_t src_stride,
1894 int16_t *src1_ptr,
1895 int32_t src2_stride,
1896 uint8_t *dst,
1897 int32_t dst_stride,
1898 const int8_t *filter_x,
1899 const int8_t *filter_y,
1900 int32_t height)
1901 {
1902 uint32_t loop_cnt;
1903 uint8_t *src0_ptr_tmp, *dst_tmp;
1904 int16_t *src1_ptr_tmp;
1905 uint64_t tp0, tp1;
1906 v16u8 out;
1907 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1908 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1909 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1910 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1911 v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
1912 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1913 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1914 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1915 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1916 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1917 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1918 v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;
1919
1920 src0_ptr -= ((3 * src_stride) + 3);
1921
1922 const_vec = __msa_ldi_h(128);
1923 const_vec <<= 6;
1924
1925 filter_vec = LD_SH(filter_x);
1926 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1927
1928 filter_vec = LD_SH(filter_y);
1929 UNPCK_R_SB_SH(filter_vec, filter_vec);
1930
1931 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1932
1933 mask0 = LD_SB(ff_hevc_mask_arr);
1934 mask1 = mask0 + 2;
1935 mask2 = mask0 + 4;
1936 mask3 = mask0 + 6;
1937
1938 src0_ptr_tmp = src0_ptr;
1939 dst_tmp = dst;
1940 src1_ptr_tmp = src1_ptr;
1941
1942 LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
1943 src6);
1944 src0_ptr_tmp += (7 * src_stride);
1945 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1946
1947 /* row 0 row 1 row 2 row 3 */
1948 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1949 vec3);
1950 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1951 vec7);
1952 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1953 vec11);
1954 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1955 vec15);
1956 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1957 filt3);
1958 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1959 filt3);
1960 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1961 filt3);
1962 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1963 filt2, filt3);
1964 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1965 vec3);
1966 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1967 vec7);
1968 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1969 vec11);
1970 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1971 filt3);
1972 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1973 filt3);
1974 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1975 filt3);
1976
1977 for (loop_cnt = 16; loop_cnt--;) {
1978 src7 = LD_SB(src0_ptr_tmp);
1979 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1980 src0_ptr_tmp += src_stride;
1981
1982 in0 = LD_SH(src1_ptr_tmp);
1983 src1_ptr_tmp += src2_stride;
1984
1985 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1986 vec3);
1987 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1988 filt2, filt3);
1989 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1990 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1991 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1992 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1993 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1994 filt_h1, filt_h2, filt_h3);
1995 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
1996 filt_h1, filt_h2, filt_h3);
1997 dst0_r >>= 6;
1998 dst0_l >>= 6;
1999
2000 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2001 ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
2002 tmp = __msa_srari_h(tmp, 7);
2003 CLIP_SH_0_255(tmp);
2004 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
2005 ST_D1(out, 0, dst_tmp);
2006 dst_tmp += dst_stride;
2007
2008 dst0 = dst1;
2009 dst1 = dst2;
2010 dst2 = dst3;
2011 dst3 = dst4;
2012 dst4 = dst5;
2013 dst5 = dst6;
2014 dst6 = dst7;
2015 }
2016
2017 src0_ptr += 8;
2018 dst += 8;
2019 src1_ptr += 8;
2020
2021 mask4 = LD_SB(ff_hevc_mask_arr + 16);
2022 mask5 = mask4 + 2;
2023 mask6 = mask4 + 4;
2024 mask7 = mask4 + 6;
2025
2026 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2027 src0_ptr += (7 * src_stride);
2028 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2029
2030 /* row 0 row 1 row 2 row 3 */
2031 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2032 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2033 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
2034 vec8, vec9, vec10, vec11);
2035 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
2036 vec12, vec13, vec14, vec15);
2037 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2038 filt3);
2039 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2040 filt3);
2041 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2042 filt3);
2043 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2044 filt3);
2045
2046 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2047 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2048 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2049
2050 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2051
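    /* 4-wide right column: 4 iterations of 4 rows each, matching the
     * assumed height of 16 */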
2052 for (loop_cnt = 4; loop_cnt--;) {
2053 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2054 src0_ptr += (4 * src_stride);
2055 XORI_B4_128_SB(src7, src8, src9, src10);
2056
2057 LD2(src1_ptr, src2_stride, tp0, tp1);
2058 INSERT_D2_SH(tp0, tp1, in0);
2059 src1_ptr += (2 * src2_stride);
2060 LD2(src1_ptr, src2_stride, tp0, tp1);
2061 INSERT_D2_SH(tp0, tp1, in1);
2062 src1_ptr += (2 * src2_stride);
2063
2064 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2065 vec3);
2066 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2067 vec7);
2068 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2069 filt3);
2070 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2071 filt3);
2072
2073 dst76 = __msa_ilvr_h(dst97, dst66);
2074 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2075 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2076 dst98 = __msa_ilvr_h(dst66, dst108);
2077
2078 tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2079 filt_h2, filt_h3);
2080 tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2081 filt_h2, filt_h3);
2082 tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2083 filt_h2, filt_h3);
2084 tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2085 filt_h2, filt_h3);
2086 SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
2087 PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1);
2088 ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
2089 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
2090 SRARI_H2_SH(out0, out1, 7);
2091 CLIP_SH2_0_255(out0, out1);
2092 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
2093 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2094 dst += (4 * dst_stride);
2095
2096 dst10 = dst54;
2097 dst32 = dst76;
2098 dst54 = dst98;
2099 dst21 = dst65;
2100 dst43 = dst87;
2101 dst65 = dst109;
2102 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2103 }
2104 }
2105
2106 static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
2107 int32_t src_stride,
2108 int16_t *src1_ptr,
2109 int32_t src2_stride,
2110 uint8_t *dst,
2111 int32_t dst_stride,
2112 const int8_t *filter_x,
2113 const int8_t *filter_y,
2114 int32_t height)
2115 {
2116 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2117 dst, dst_stride, filter_x, filter_y,
2118 height, 16);
2119 }
2120
2121 static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
2122 int32_t src_stride,
2123 int16_t *src1_ptr,
2124 int32_t src2_stride,
2125 uint8_t *dst,
2126 int32_t dst_stride,
2127 const int8_t *filter_x,
2128 const int8_t *filter_y,
2129 int32_t height)
2130 {
2131 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2132 dst, dst_stride, filter_x, filter_y,
2133 height, 24);
2134 }
2135
2136 static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
2137 int32_t src_stride,
2138 int16_t *src1_ptr,
2139 int32_t src2_stride,
2140 uint8_t *dst,
2141 int32_t dst_stride,
2142 const int8_t *filter_x,
2143 const int8_t *filter_y,
2144 int32_t height)
2145 {
2146 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2147 dst, dst_stride, filter_x, filter_y,
2148 height, 32);
2149 }
2150
2151 static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
2152 int32_t src_stride,
2153 int16_t *src1_ptr,
2154 int32_t src2_stride,
2155 uint8_t *dst,
2156 int32_t dst_stride,
2157 const int8_t *filter_x,
2158 const int8_t *filter_y,
2159 int32_t height)
2160 {
2161 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2162 dst, dst_stride, filter_x, filter_y,
2163 height, 48);
2164 }
2165
2166 static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
2167 int32_t src_stride,
2168 int16_t *src1_ptr,
2169 int32_t src2_stride,
2170 uint8_t *dst,
2171 int32_t dst_stride,
2172 const int8_t *filter_x,
2173 const int8_t *filter_y,
2174 int32_t height)
2175 {
2176 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2177 dst, dst_stride, filter_x, filter_y,
2178 height, 64);
2179 }
2180
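/*
 * Horizontal 4-tap (epel) bi-prediction, 4x2 block.  The
 * cross-register shuffle mask packs both rows into one vector pair so
 * a single DPADD_SB2_SH chain evaluates the whole block.  Scalar
 * sketch per pixel (8-bit depth, coefficients summing to 64):
 *
 *     p0[x][y]  = sum(filter[k] * src0[x + k - 1][y]), k = 0..3
 *     out[x][y] = av_clip_uint8((p0[x][y] + src1[x][y] + 64) >> 7)
 */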
2181 static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
2182 int32_t src_stride,
2183 int16_t *src1_ptr,
2184 int32_t src2_stride,
2185 uint8_t *dst,
2186 int32_t dst_stride,
2187 const int8_t *filter,
2188 int32_t height)
2189 {
2190 v8i16 filt0, filt1;
2191 v16i8 src0, src1, dst0, vec0, vec1;
2192 v8i16 in0, in1;
2193 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2194 v16i8 mask1;
2195 v8i16 tmp0;
2196 v8i16 filter_vec, const_vec;
2197
2198 src0_ptr -= 1;
2199
2200 const_vec = __msa_ldi_h(128);
2201 const_vec <<= 6;
2202
2203 filter_vec = LD_SH(filter);
2204 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2205
2206 mask1 = mask0 + 2;
2207
2208 LD_SB2(src0_ptr, src_stride, src0, src1);
2209 LD_SH2(src1_ptr, src2_stride, in0, in1);
2210 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2211 XORI_B2_128_SB(src0, src1);
2212 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2213 tmp0 = const_vec;
2214 DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
2215
2216 tmp0 = __msa_adds_s_h(tmp0, in0);
2217 tmp0 = __msa_srari_h(tmp0, 7);
2218 CLIP_SH_0_255(tmp0);
2219 dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
2220
2221 ST_W2(dst0, 0, 1, dst, dst_stride);
2222 }
2223
2224 static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
2225 int32_t src_stride,
2226 int16_t *src1_ptr,
2227 int32_t src2_stride,
2228 uint8_t *dst,
2229 int32_t dst_stride,
2230 const int8_t *filter,
2231 int32_t height)
2232 {
2233 v8i16 filt0, filt1;
2234 v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
2235 v8i16 in0, in1, in2, in3;
2236 v16i8 vec2, vec3;
2237 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2238 v16i8 mask1;
2239 v8i16 tmp0, tmp1;
2240 v8i16 filter_vec, const_vec;
2241
2242 src0_ptr -= 1;
2243
2244 const_vec = __msa_ldi_h(128);
2245 const_vec <<= 6;
2246
2247 filter_vec = LD_SH(filter);
2248 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2249
2250 mask1 = mask0 + 2;
2251
2252 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2253 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2254
2255 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2256 XORI_B4_128_SB(src0, src1, src2, src3);
2257
2258 tmp0 = const_vec;
2259 tmp1 = const_vec;
2260 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2261 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2262 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
2263 tmp0, tmp1);
2264 HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
2265 dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2266
2267 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2268 }
2269
2270 static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
2271 int32_t src_stride,
2272 int16_t *src1_ptr,
2273 int32_t src2_stride,
2274 uint8_t *dst,
2275 int32_t dst_stride,
2276 const int8_t *filter,
2277 int32_t height)
2278 {
2279 uint32_t loop_cnt;
2280 v8i16 filt0, filt1;
2281 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2282 v16i8 dst0, dst1;
2283 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2284 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2285 v16i8 mask1, vec0, vec1, vec2, vec3;
2286 v8i16 tmp0, tmp1, tmp2, tmp3;
2287 v8i16 filter_vec, const_vec;
2288
2289 src0_ptr -= 1;
2290
2291 const_vec = __msa_ldi_h(128);
2292 const_vec <<= 6;
2293
2294 filter_vec = LD_SH(filter);
2295 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2296
2297 mask1 = mask0 + 2;
2298
2299 for (loop_cnt = (height >> 3); loop_cnt--;) {
2300 LD_SB8(src0_ptr, src_stride,
2301 src0, src1, src2, src3, src4, src5, src6, src7);
2302 src0_ptr += (8 * src_stride);
2303 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2304 src1_ptr += (4 * src2_stride);
2305 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2306 src1_ptr += (4 * src2_stride);
2307 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2308 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2309 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2310
2311 tmp0 = const_vec;
2312 tmp1 = const_vec;
2313 tmp2 = const_vec;
2314 tmp3 = const_vec;
2315 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2316 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
2317 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
2318 tmp1, tmp2, tmp3);
2319 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
2320 VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
2321 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
2322 tmp1, tmp2, tmp3);
2323
2324 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2325 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
2326
2327 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
2328 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2329 dst += (8 * dst_stride);
2330 }
2331 }
2332
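/* Dispatch on height; 4-wide chroma partitions only come in heights
 * 2, 4, 8 and 16, so no other case is needed here. */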
2333 static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
2334 int32_t src_stride,
2335 int16_t *src1_ptr,
2336 int32_t src2_stride,
2337 uint8_t *dst,
2338 int32_t dst_stride,
2339 const int8_t *filter,
2340 int32_t height)
2341 {
2342 if (2 == height) {
2343 hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2344 dst, dst_stride, filter, height);
2345 } else if (4 == height) {
2346 hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2347 dst, dst_stride, filter, height);
2348 } else if (8 == height || 16 == height) {
2349 hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2350 src1_ptr, src2_stride,
2351 dst, dst_stride, filter, height);
2352 }
2353 }
2354
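/*
 * 6-wide horizontal 4-tap bi-prediction: a full 8-wide row is filtered
 * and the 6 live bytes per row are stored as one word plus one
 * halfword (ST_W2 + ST_H2).
 */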
2355 static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
2356 int32_t src_stride,
2357 int16_t *src1_ptr,
2358 int32_t src2_stride,
2359 uint8_t *dst,
2360 int32_t dst_stride,
2361 const int8_t *filter,
2362 int32_t height)
2363 {
2364 uint32_t loop_cnt;
2365 v8i16 filt0, filt1;
2366 v16i8 src0, src1, src2, src3;
2367 v8i16 in0, in1, in2, in3;
2368 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2369 v16i8 mask1;
2370 v16i8 vec0, vec1, vec2, vec3;
2371 v8i16 dst0, dst1, dst2, dst3;
2372 v8i16 filter_vec, const_vec;
2373
2374 src0_ptr -= 1;
2375
2376 const_vec = __msa_ldi_h(128);
2377 const_vec <<= 6;
2378
2379 filter_vec = LD_SH(filter);
2380 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2381
2382 mask1 = mask0 + 2;
2383
2384 for (loop_cnt = (height >> 2); loop_cnt--;) {
2385 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2386 src0_ptr += (4 * src_stride);
2387 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2388 src1_ptr += (4 * src2_stride);
2389 XORI_B4_128_SB(src0, src1, src2, src3);
2390
2391 dst0 = const_vec;
2392 dst1 = const_vec;
2393 dst2 = const_vec;
2394 dst3 = const_vec;
2395 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2396 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2397 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2398 dst1, dst2, dst3);
2399 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2400 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2401 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2402 dst1, dst2, dst3);
2403
2404 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2405 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2406
2407 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2408 ST_W2(dst0, 0, 2, dst, dst_stride);
2409 ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2410 ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2411 ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2412 dst += (4 * dst_stride);
2413 }
2414 }
2415
2416 static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
2417 int32_t src_stride,
2418 int16_t *src1_ptr,
2419 int32_t src2_stride,
2420 uint8_t *dst,
2421 int32_t dst_stride,
2422 const int8_t *filter,
2423 int32_t height)
2424 {
2425 v8i16 filt0, filt1;
2426 v16i8 src0, src1;
2427 v8i16 in0, in1;
2428 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2429 v16i8 mask1, vec0, vec1, vec2, vec3;
2430 v8i16 dst0, dst1;
2431 v8i16 filter_vec, const_vec;
2432
2433 src0_ptr -= 1;
2434
2435 const_vec = __msa_ldi_h(128);
2436 const_vec <<= 6;
2437
2438 filter_vec = LD_SH(filter);
2439 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2440
2441 mask1 = mask0 + 2;
2442
2443 LD_SB2(src0_ptr, src_stride, src0, src1);
2444 LD_SH2(src1_ptr, src2_stride, in0, in1);
2445 XORI_B2_128_SB(src0, src1);
2446
2447 dst0 = const_vec;
2448 dst1 = const_vec;
2449 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2450 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2451 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
2452 dst0, dst1);
2453 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
2454
2455 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2456 ST_D2(dst0, 0, 1, dst, dst_stride);
2457 }
2458
2459 static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
2460 int32_t src_stride,
2461 int16_t *src1_ptr,
2462 int32_t src2_stride,
2463 uint8_t *dst,
2464 int32_t dst_stride,
2465 const int8_t *filter,
2466 int32_t height)
2467 {
2468 v8i16 filt0, filt1;
2469 v16i8 src0, src1, src2, src3, src4, src5;
2470 v8i16 in0, in1, in2, in3, in4, in5;
2471 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2472 v16i8 mask1;
2473 v16i8 vec0, vec1, vec2, vec3;
2474 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2475 v8i16 filter_vec, const_vec;
2476
2477 src0_ptr -= 1;
2478
2479 const_vec = __msa_ldi_h(128);
2480 const_vec <<= 6;
2481
2482 filter_vec = LD_SH(filter);
2483 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2484
2485 mask1 = mask0 + 2;
2486
2487 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
2488 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2489 src1_ptr += (4 * src2_stride);
2490 LD_SH2(src1_ptr, src2_stride, in4, in5);
2491 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
2492
2493 dst0 = const_vec;
2494 dst1 = const_vec;
2495 dst2 = const_vec;
2496 dst3 = const_vec;
2497 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2498 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2499 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
2500 dst2, dst3);
2501 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2502 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2503 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
2504 dst2, dst3);
2505 dst4 = const_vec;
2506 dst5 = const_vec;
2507
2508 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
2509 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
2510 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
2511 dst4, dst5);
2512
2513 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2514 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2515 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2516
2517 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2518 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2519 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2520 ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
2521 }
2522
2523 static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
2524 int32_t src_stride,
2525 int16_t *src1_ptr,
2526 int32_t src2_stride,
2527 uint8_t *dst,
2528 int32_t dst_stride,
2529 const int8_t *filter,
2530 int32_t height)
2531 {
2532 uint32_t loop_cnt;
2533 v8i16 filt0, filt1;
2534 v16i8 src0, src1, src2, src3;
2535 v8i16 in0, in1, in2, in3;
2536 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2537 v16i8 mask1;
2538 v16i8 vec0, vec1, vec2, vec3;
2539 v8i16 dst0, dst1, dst2, dst3;
2540 v8i16 filter_vec, const_vec;
2541
2542 src0_ptr -= 1;
2543
2544 const_vec = __msa_ldi_h(128);
2545 const_vec <<= 6;
2546
2547 filter_vec = LD_SH(filter);
2548 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2549
2550 mask1 = mask0 + 2;
2551
2552 for (loop_cnt = (height >> 2); loop_cnt--;) {
2553 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2554 src0_ptr += (4 * src_stride);
2555 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2556 src1_ptr += (4 * src2_stride);
2557 XORI_B4_128_SB(src0, src1, src2, src3);
2558
2559 dst0 = const_vec;
2560 dst1 = const_vec;
2561 dst2 = const_vec;
2562 dst3 = const_vec;
2563 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2564 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2565 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2566 dst1, dst2, dst3);
2567 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2568 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2569 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2570 dst1, dst2, dst3);
2571
2572 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2573 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2574
2575 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2576 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2577 dst += (4 * dst_stride);
2578 }
2579 }
2580
2581 static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
2582 int32_t src_stride,
2583 int16_t *src1_ptr,
2584 int32_t src2_stride,
2585 uint8_t *dst,
2586 int32_t dst_stride,
2587 const int8_t *filter,
2588 int32_t height)
2589 {
2590 if (2 == height) {
2591 hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2592 dst, dst_stride, filter, height);
2593 } else if (6 == height) {
2594 hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2595 dst, dst_stride, filter, height);
2596 } else if (0 == (height % 4)) {
2597 hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
2598 src1_ptr, src2_stride,
2599 dst, dst_stride, filter, height);
2600 }
2601 }
2602
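/*
 * 12-wide horizontal 4-tap bi-prediction.  Columns 0..7 use the plain
 * per-row masks; columns 8..11 of two consecutive rows are gathered by
 * mask2/mask3, whose upper half indexes the second source register, so
 * the narrow right column also fills complete vectors (dst4/dst5).
 */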
2603 static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
2604 int32_t src_stride,
2605 int16_t *src1_ptr,
2606 int32_t src2_stride,
2607 uint8_t *dst,
2608 int32_t dst_stride,
2609 const int8_t *filter,
2610 int32_t height)
2611 {
2612 uint32_t loop_cnt;
2613 v8i16 filt0, filt1;
2614 v16i8 src0, src1, src2, src3;
2615 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2616 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2617 v16i8 mask2 = {
2618 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2619 };
2620 v16i8 mask1, mask3;
2621 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2622 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2623 v8i16 filter_vec, const_vec;
2624
2625 src0_ptr -= 1;
2626
2627 const_vec = __msa_ldi_h(128);
2628 const_vec <<= 6;
2629
2630 filter_vec = LD_SH(filter);
2631 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2632
2633 mask1 = mask0 + 2;
2634 mask3 = mask2 + 2;
2635
2636 for (loop_cnt = (height >> 2); loop_cnt--;) {
2637 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2638 src0_ptr += (4 * src_stride);
2639 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2640 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
2641 src1_ptr += (4 * src2_stride);
2642
2643 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
2644 XORI_B4_128_SB(src0, src1, src2, src3);
2645
2646 dst0 = const_vec;
2647 dst1 = const_vec;
2648 dst2 = const_vec;
2649 dst3 = const_vec;
2650 dst4 = const_vec;
2651 dst5 = const_vec;
2652 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2653 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2654 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2655 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2656 dst1, dst2, dst3);
2657 DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
2658 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2659 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2660 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5);
2661 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2662 dst1, dst2, dst3);
2663 DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
2664
2665 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2666 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2667 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2668
2669 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2670 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2671 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2672 ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
2673 dst += (4 * dst_stride);
2674 }
2675 }
2676
2677 static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
2678 int32_t src_stride,
2679 int16_t *src1_ptr,
2680 int32_t src2_stride,
2681 uint8_t *dst,
2682 int32_t dst_stride,
2683 const int8_t *filter,
2684 int32_t height)
2685 {
2686 uint32_t loop_cnt;
2687 v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
2688 v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
2689 v8i16 filt0, filt1;
2690 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2691 v16i8 mask1;
2692 v8i16 filter_vec, const_vec;
2693
2694 src0_ptr -= 1;
2695
2696 const_vec = __msa_ldi_h(128);
2697 const_vec <<= 6;
2698
2699 filter_vec = LD_SH(filter);
2700 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2701
2702 mask1 = mask0 + 2;
2703
2704 for (loop_cnt = (height >> 1); loop_cnt--;) {
2705 LD_SB2(src0_ptr, src_stride, src0, src2);
2706 LD_SB2(src0_ptr + 8, src_stride, src1, src3);
2707 src0_ptr += (2 * src_stride);
2708 LD_SH2(src1_ptr, src2_stride, in0, in2);
2709 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
2710 src1_ptr += (2 * src2_stride);
2711
2712 XORI_B4_128_SB(src0, src1, src2, src3);
2713
2714 dst0 = const_vec;
2715 dst1 = const_vec;
2716 dst2 = const_vec;
2717 dst3 = const_vec;
2718
2719 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2720 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2721 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2722 dst1, dst2, dst3);
2723 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2724 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2725 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2726 dst1, dst2, dst3);
2727
2728 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2729 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2730
2731 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2732 ST_SH2(dst0, dst1, dst, dst_stride);
2733 dst += (2 * dst_stride);
2734 }
2735 }
2736
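/*
 * 24-wide horizontal 4-tap bi-prediction, split as 16 + 8 columns.
 * In the 16-wide part the 4-tap window of the last columns crosses the
 * 16-byte register boundary, hence the mask2/mask3 shuffles that read
 * across the src0/src1 (and src2/src3, ...) register pairs; the 8-wide
 * tail (columns 16..23) is filtered from the odd registers alone and
 * stored through dst_tmp.
 */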
2737 static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
2738 int32_t src_stride,
2739 int16_t *src1_ptr,
2740 int32_t src2_stride,
2741 uint8_t *dst,
2742 int32_t dst_stride,
2743 const int8_t *filter,
2744 int32_t height)
2745 {
2746 int16_t *src1_ptr_tmp;
2747 uint8_t *dst_tmp;
2748 uint32_t loop_cnt;
2749 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2750 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2751 v8i16 filt0, filt1;
2752 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2753 v16i8 mask1, mask2, mask3;
2754 v16i8 vec0, vec1, vec2, vec3;
2755 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2756 v8i16 filter_vec, const_vec;
2757
2758 src0_ptr -= 1;
2759
2760 const_vec = __msa_ldi_h(128);
2761 const_vec <<= 6;
2762
2763 filter_vec = LD_SH(filter);
2764 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2765
2766 mask1 = mask0 + 2;
2767 mask2 = mask0 + 8;
2768 mask3 = mask0 + 10;
2769
2770 dst_tmp = dst + 16;
2771 src1_ptr_tmp = src1_ptr + 16;
2772
2773 for (loop_cnt = (height >> 2); loop_cnt--;) {
2774 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2775 LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
2776 src0_ptr += (4 * src_stride);
2777 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2778 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2779 src1_ptr += (4 * src2_stride);
2780 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2781
2782 dst0 = const_vec;
2783 dst1 = const_vec;
2784 dst2 = const_vec;
2785 dst3 = const_vec;
2786 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2787 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
2788 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2789 dst1, dst2, dst3);
2790 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2791 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
2792 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2793 dst1, dst2, dst3);
2794
2795 dst4 = const_vec;
2796 dst5 = const_vec;
2797 dst6 = const_vec;
2798 dst7 = const_vec;
2799 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
2800 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
2801 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
2802 dst5, dst6, dst7);
2803 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
2804 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
2805 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
2806 dst5, dst6, dst7);
2807
2808 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2809 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2810 HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
2811 dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
2812
2813 PCKEV_B4_SH(dst1, dst0, dst3, dst2,
2814 dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2815 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2816 dst += (4 * dst_stride);
2817
2818 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
2819 src1_ptr_tmp += (4 * src2_stride);
2820
2821 dst0 = const_vec;
2822 dst1 = const_vec;
2823 dst2 = const_vec;
2824 dst3 = const_vec;
2825 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2826 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2827 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2828 dst1, dst2, dst3);
2829 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1);
2830 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
2831 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2832 dst1, dst2, dst3);
2833
2834 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2835 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2836
2837 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2838 ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
2839 dst_tmp += (4 * dst_stride);
2840 }
2841 }
2842
2843 static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
2844 int32_t src_stride,
2845 int16_t *src1_ptr,
2846 int32_t src2_stride,
2847 uint8_t *dst,
2848 int32_t dst_stride,
2849 const int8_t *filter,
2850 int32_t height)
2851 {
2852 uint32_t loop_cnt;
2853 v16i8 src0, src1, src2;
2854 v8i16 in0, in1, in2, in3;
2855 v8i16 filt0, filt1;
2856 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2857 v16i8 mask1, mask2, mask3;
2858 v8i16 dst0, dst1, dst2, dst3;
2859 v16i8 vec0, vec1, vec2, vec3;
2860 v8i16 filter_vec, const_vec;
2861
2862 src0_ptr -= 1;
2863
2864 const_vec = __msa_ldi_h(128);
2865 const_vec <<= 6;
2866
2867 filter_vec = LD_SH(filter);
2868 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2869
2870 mask1 = mask0 + 2;
2871 mask2 = mask0 + 8;
2872 mask3 = mask0 + 10;
2873
2874 for (loop_cnt = height; loop_cnt--;) {
2875 LD_SB2(src0_ptr, 16, src0, src1);
2876 src2 = LD_SB(src0_ptr + 24);
2877 src0_ptr += src_stride;
2878 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2879 src1_ptr += src2_stride;
2880 XORI_B3_128_SB(src0, src1, src2);
2881
2882 dst0 = const_vec;
2883 dst1 = const_vec;
2884 dst2 = const_vec;
2885 dst3 = const_vec;
2886 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2887 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
2888 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2889 dst1, dst2, dst3);
2890 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2891 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
2892 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2893 dst1, dst2, dst3);
2894
2895 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2896 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2897
2898 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2899 ST_SH2(dst0, dst1, dst, 16);
2900 dst += dst_stride;
2901 }
2902 }
2903
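/*
 * Vertical 4-tap (epel) bi-prediction, 4x2 block.  Three rows above
 * the block prime the sliding window; consecutive row interleaves are
 * paired into doubleword lanes (src2110, src4332) so one dot-product
 * chain filters both output rows.  Scalar sketch per pixel:
 *
 *     p0[x][y]  = sum(filter[k] * src0[x][y + k - 1]), k = 0..3
 *     out[x][y] = av_clip_uint8((p0[x][y] + src1[x][y] + 64) >> 7)
 */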
2904 static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
2905 int32_t src_stride,
2906 int16_t *src1_ptr,
2907 int32_t src2_stride,
2908 uint8_t *dst,
2909 int32_t dst_stride,
2910 const int8_t *filter,
2911 int32_t height)
2912 {
2913 v16i8 src0, src1, src2, src3, src4;
2914 v8i16 in0, in1;
2915 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2916 v8i16 dst10;
2917 v8i16 filt0, filt1;
2918 v8i16 filter_vec, const_vec;
2919
2920 src0_ptr -= src_stride;
2921
2922 const_vec = __msa_ldi_h(128);
2923 const_vec <<= 6;
2924
2925 filter_vec = LD_SH(filter);
2926 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2927
2928 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2929 src0_ptr += (3 * src_stride);
2930
2931 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2932 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2933 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2934
2935 LD_SB2(src0_ptr, src_stride, src3, src4);
2936 LD_SH2(src1_ptr, src2_stride, in0, in1);
2937 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2938 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2939 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2940 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2941
2942 dst10 = const_vec;
2943 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2944 dst10 = __msa_adds_s_h(dst10, in0);
2945 dst10 = __msa_srari_h(dst10, 7);
2946 CLIP_SH_0_255(dst10);
2947
2948 dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
2949 ST_W2(dst10, 0, 1, dst, dst_stride);
2950 }
2951
2952 static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
2953 int32_t src_stride,
2954 int16_t *src1_ptr,
2955 int32_t src2_stride,
2956 uint8_t *dst,
2957 int32_t dst_stride,
2958 const int8_t *filter,
2959 int32_t height)
2960 {
2961 v16i8 src0, src1, src2, src3, src4, src5, src6;
2962 v8i16 in0, in1, in2, in3;
2963 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2964 v16i8 src2110, src4332, src6554;
2965 v8i16 dst10, dst32;
2966 v8i16 filt0, filt1;
2967 v8i16 filter_vec, const_vec;
2968
2969 src0_ptr -= src_stride;
2970
2971 const_vec = __msa_ldi_h(128);
2972 const_vec <<= 6;
2973
2974 filter_vec = LD_SH(filter);
2975 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2976
2977 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2978 src0_ptr += (3 * src_stride);
2979 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2980 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2981 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2982
2983 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
2984 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2985 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2986 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2987 src32_r, src43_r, src54_r, src65_r);
2988 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
2989 XORI_B2_128_SB(src4332, src6554);
2990
2991 dst10 = const_vec;
2992 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2993 dst32 = const_vec;
2994 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2995 HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
2996
2997 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
2998 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
2999 }
3000
3001 static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
3002 int32_t src_stride,
3003 int16_t *src1_ptr,
3004 int32_t src2_stride,
3005 uint8_t *dst,
3006 int32_t dst_stride,
3007 const int8_t *filter,
3008 int32_t height)
3009 {
3010 int32_t loop_cnt;
3011 v16i8 src0, src1, src2, src3, src4, src5;
3012 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3013 v16i8 src6, src7, src8, src9;
3014 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3015 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3016 v16i8 src2110, src4332, src6554, src8776;
3017 v8i16 dst10, dst32, dst54, dst76;
3018 v8i16 filt0, filt1;
3019 v8i16 filter_vec, const_vec;
3020
3021 src0_ptr -= src_stride;
3022
3023 const_vec = __msa_ldi_h(128);
3024 const_vec <<= 6;
3025
3026 filter_vec = LD_SH(filter);
3027 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3028
3029 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3030 src0_ptr += (3 * src_stride);
3031 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3032 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3033 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3034
3035 for (loop_cnt = (height >> 3); loop_cnt--;) {
3036 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3037 src0_ptr += (6 * src_stride);
3038 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3039 src1_ptr += (8 * src2_stride);
3040 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3041 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3042 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3043 src32_r, src43_r, src54_r, src65_r);
3044 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3045 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3046 src4332, src6554, src8776);
3047 XORI_B3_128_SB(src4332, src6554, src8776);
3048
3049 dst10 = const_vec;
3050 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3051 dst32 = const_vec;
3052 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3053 dst54 = const_vec;
3054 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
3055
3056 LD_SB2(src0_ptr, src_stride, src9, src2);
3057 src0_ptr += (2 * src_stride);
3058 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3059 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3060 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3061 dst76 = const_vec;
3062 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3063
3064 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3065 dst10, dst32, dst54, dst76, 7,
3066 dst10, dst32, dst54, dst76);
3067
3068 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
3069 ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3070 dst += (8 * dst_stride);
3071 }
3072 }
3073
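/* Dispatch 4-wide vertical bi-prediction on block height. */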
3074 static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
3075 int32_t src_stride,
3076 int16_t *src1_ptr,
3077 int32_t src2_stride,
3078 uint8_t *dst,
3079 int32_t dst_stride,
3080 const int8_t *filter,
3081 int32_t height)
3082 {
3083 if (2 == height) {
3084 hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3085 dst, dst_stride, filter, height);
3086 } else if (4 == height) {
3087 hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3088 dst, dst_stride, filter, height);
3089 } else {
3090 hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
3091 src1_ptr, src2_stride,
3092 dst, dst_stride, filter, height);
3093 }
3094 }
3095
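/* Vertical 4-tap bi-prediction for 6-wide blocks. Eight rows are
 * filtered at 8-wide granularity and then stored as a 4-byte word plus
 * a 2-byte halfword per row, so only 6 pixels are written; the height
 * argument is effectively fixed at 8 (there is no height loop). */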
3096 static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
3097 int32_t src_stride,
3098 int16_t *src1_ptr,
3099 int32_t src2_stride,
3100 uint8_t *dst,
3101 int32_t dst_stride,
3102 const int8_t *filter,
3103 int32_t height)
3104 {
3105 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3106 v8i16 in0, in1, in2, in3;
3107 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3108 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3109 v8i16 filt0, filt1;
3110 v8i16 filter_vec, const_vec;
3111
3112 src0_ptr -= src_stride;
3113
3114 const_vec = __msa_ldi_h(128);
3115 const_vec <<= 6;
3116
3117 filter_vec = LD_SH(filter);
3118 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3119
3120 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3121 src0_ptr += (3 * src_stride);
3122 LD_SB2(src0_ptr, src_stride, src3, src4);
3123 src0_ptr += (2 * src_stride);
3124 LD_SB2(src0_ptr, src_stride, src5, src6);
3125 src0_ptr += (2 * src_stride);
3126 LD_SB2(src0_ptr, src_stride, src7, src8);
3127 src0_ptr += (2 * src_stride);
3128 LD_SB2(src0_ptr, src_stride, src9, src10);
3129 src0_ptr += (2 * src_stride);
3130
3131 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3132 src1_ptr += (4 * src2_stride);
3133
3134 XORI_B3_128_SB(src0, src1, src2);
3135 XORI_B2_128_SB(src3, src4);
3136 XORI_B2_128_SB(src5, src6);
3137 XORI_B2_128_SB(src7, src8);
3138 XORI_B2_128_SB(src9, src10);
3139
3140 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3141 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3142
3143 dst0_r = const_vec;
3144 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3145 dst1_r = const_vec;
3146 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3147
3148 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3149
3150 dst2_r = const_vec;
3151 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3152 dst3_r = const_vec;
3153 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3154
3155 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3156 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3157 dst0_r, dst1_r, dst2_r, dst3_r);
3158
3159 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3160 ST_W2(dst0_r, 0, 2, dst, dst_stride);
3161 ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3162 ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3163 ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3164 dst += (4 * dst_stride);
3165
3166 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3167 src1_ptr += (4 * src2_stride);
3168 ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);
3169
3170 dst0_r = const_vec;
3171 DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3172 dst1_r = const_vec;
3173 DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3174
3175 ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);
3176
3177 dst2_r = const_vec;
3178 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3179 dst3_r = const_vec;
3180 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3181
3182 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3183 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3184 dst0_r, dst1_r, dst2_r, dst3_r);
3185
3186 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3187 ST_W2(dst0_r, 0, 2, dst, dst_stride);
3188 ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3189 ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3190 ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3191 dst += (4 * dst_stride);
3192 }
3193
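/* Vertical 4-tap bi-prediction, 8x2 block. */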
3194 static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
3195 int32_t src_stride,
3196 int16_t *src1_ptr,
3197 int32_t src2_stride,
3198 uint8_t *dst,
3199 int32_t dst_stride,
3200 const int8_t *filter,
3201 int32_t height)
3202 {
3203 v16i8 src0, src1, src2, src3, src4;
3204 v8i16 in0, in1, dst0_r, dst1_r;
3205 v16i8 src10_r, src32_r, src21_r, src43_r;
3206 v8i16 filt0, filt1;
3207 v8i16 filter_vec, const_vec;
3208
3209 src0_ptr -= src_stride;
3210
3211 const_vec = __msa_ldi_h(128);
3212 const_vec <<= 6;
3213
3214 filter_vec = LD_SH(filter);
3215 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3216
3217 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3218 src0_ptr += (3 * src_stride);
3219 XORI_B3_128_SB(src0, src1, src2);
3220 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3221
3222 LD_SB2(src0_ptr, src_stride, src3, src4);
3223 LD_SH2(src1_ptr, src2_stride, in0, in1);
3224 XORI_B2_128_SB(src3, src4);
3225 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3226
3227 dst0_r = const_vec;
3228 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3229 dst1_r = const_vec;
3230 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3231
3232 HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
3233 dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3234
3235 ST_D2(dst0_r, 0, 1, dst, dst_stride);
3236 }
3237
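/* Vertical 4-tap bi-prediction, 8x6 block. */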
3238 static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
3239 int32_t src_stride,
3240 int16_t *src1_ptr,
3241 int32_t src2_stride,
3242 uint8_t *dst,
3243 int32_t dst_stride,
3244 const int8_t *filter,
3245 int32_t height)
3246 {
3247 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3248 v8i16 in0, in1, in2, in3, in4, in5;
3249 v16i8 src10_r, src32_r, src54_r, src76_r;
3250 v16i8 src21_r, src43_r, src65_r, src87_r;
3251 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3252 v8i16 filt0, filt1;
3253 v8i16 filter_vec, const_vec;
3254
3255 src0_ptr -= src_stride;
3256
3257 const_vec = __msa_ldi_h(128);
3258 const_vec <<= 6;
3259
3260 filter_vec = LD_SH(filter);
3261 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3262
3263 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3264 src0_ptr += (3 * src_stride);
3265 XORI_B3_128_SB(src0, src1, src2);
3266 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3267
3268 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3269 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3270 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3271 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3272 src32_r, src43_r, src54_r, src65_r);
3273 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3274
3275 dst0_r = const_vec;
3276 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3277 dst1_r = const_vec;
3278 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3279 dst2_r = const_vec;
3280 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3281 dst3_r = const_vec;
3282 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3283 dst4_r = const_vec;
3284 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
3285 dst5_r = const_vec;
3286 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
3287 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3288 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3289 dst0_r, dst1_r, dst2_r, dst3_r);
3290 HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);
3291
3292 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3293 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
3294 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3295 ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
3296 }
3297
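/* Vertical 4-tap bi-prediction for 8-wide blocks, height a multiple
 * of 4; four output rows are produced per loop iteration. */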
3298 static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
3299 int32_t src_stride,
3300 int16_t *src1_ptr,
3301 int32_t src2_stride,
3302 uint8_t *dst,
3303 int32_t dst_stride,
3304 const int8_t *filter,
3305 int32_t height)
3306 {
3307 int32_t loop_cnt;
3308 v16i8 src0, src1, src2, src3, src4, src5;
3309 v8i16 in0, in1, in2, in3;
3310 v16i8 src10_r, src32_r, src21_r, src43_r;
3311 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3312 v8i16 filt0, filt1;
3313 v8i16 filter_vec, const_vec;
3314
3315 src0_ptr -= src_stride;
3316
3317 const_vec = __msa_ldi_h(128);
3318 const_vec <<= 6;
3319
3320 filter_vec = LD_SH(filter);
3321 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3322
3323 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3324 src0_ptr += (3 * src_stride);
3325 XORI_B3_128_SB(src0, src1, src2);
3326 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3327
3328 for (loop_cnt = (height >> 2); loop_cnt--;) {
3329 LD_SB2(src0_ptr, src_stride, src3, src4);
3330 src0_ptr += (2 * src_stride);
3331 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3332 src1_ptr += (4 * src2_stride);
3333 XORI_B2_128_SB(src3, src4);
3334 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3335
3336 dst0_r = const_vec;
3337 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3338 dst1_r = const_vec;
3339 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3340
3341 LD_SB2(src0_ptr, src_stride, src5, src2);
3342 src0_ptr += (2 * src_stride);
3343 XORI_B2_128_SB(src5, src2);
3344 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3345
3346 dst2_r = const_vec;
3347 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3348 dst3_r = const_vec;
3349 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
3350 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3351 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3352 dst0_r, dst1_r, dst2_r, dst3_r);
3353
3354 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3355 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3356 dst += (4 * dst_stride);
3357 }
3358 }
3359
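/* Dispatch 8-wide vertical bi-prediction on block height. */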
3360 static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
3361 int32_t src_stride,
3362 int16_t *src1_ptr,
3363 int32_t src2_stride,
3364 uint8_t *dst,
3365 int32_t dst_stride,
3366 const int8_t *filter,
3367 int32_t height)
3368 {
3369 if (2 == height) {
3370 hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3371 dst, dst_stride, filter, height);
3372 } else if (6 == height) {
3373 hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3374 dst, dst_stride, filter, height);
3375 } else {
3376 hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
3377 src1_ptr, src2_stride,
3378 dst, dst_stride, filter, height);
3379 }
3380 }
3381
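/* Vertical 4-tap bi-prediction for 12-wide blocks: columns 0-7 come
 * from the right byte interleaves (ilvr), while columns 8-11 are
 * filtered from the left-interleave halves packed two rows per
 * vector (the *2110-style registers). */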
3382 static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
3383 int32_t src_stride,
3384 int16_t *src1_ptr,
3385 int32_t src2_stride,
3386 uint8_t *dst,
3387 int32_t dst_stride,
3388 const int8_t *filter,
3389 int32_t height)
3390 {
3391 int32_t loop_cnt;
3392 v16i8 src0, src1, src2, src3, src4, src5, src6;
3393 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3394 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3395 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3396 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3397 v16i8 src2110, src4332, src6554;
3398 v8i16 dst0_l, dst1_l, filt0, filt1;
3399 v8i16 filter_vec, const_vec;
3400
3401 src0_ptr -= src_stride;
3402
3403 const_vec = __msa_ldi_h(128);
3404 const_vec <<= 6;
3405
3406 filter_vec = LD_SH(filter);
3407 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3408
3409 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3410 src0_ptr += (3 * src_stride);
3411 XORI_B3_128_SB(src0, src1, src2);
3412 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3413 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3414 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3415
3416 for (loop_cnt = (height >> 2); loop_cnt--;) {
3417 LD_SB2(src0_ptr, src_stride, src3, src4);
3418 src0_ptr += (2 * src_stride);
3419 LD_SB2(src0_ptr, src_stride, src5, src6);
3420 src0_ptr += (2 * src_stride);
3421 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3422 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
3423 src1_ptr += (4 * src2_stride);
3424 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3425 XORI_B2_128_SB(src3, src4);
3426 XORI_B2_128_SB(src5, src6);
3427
3428 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3429 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3430 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3431 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3432 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
3433 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3434
3435 dst0_r = const_vec;
3436 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3437 dst1_r = const_vec;
3438 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3439 dst0_l = const_vec;
3440 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
3441 dst2_r = const_vec;
3442 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3443 dst3_r = const_vec;
3444 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3445 dst1_l = const_vec;
3446 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
3447 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3448 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3449 dst0_r, dst1_r, dst2_r, dst3_r);
3450 HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
3451
3452 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3453 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
3454 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3455 ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
3456 dst += (4 * dst_stride);
3457
3458 src2 = src6;
3459 src10_r = src54_r;
3460 src21_r = src65_r;
3461 src2110 = src6554;
3462 }
3463 }
3464
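/* Vertical 4-tap bi-prediction for 16-wide blocks; right/left byte
 * interleaves cover the low and high 8 columns, and four rows are
 * produced per loop iteration in two 2-row passes. */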
3465 static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
3466 int32_t src_stride,
3467 int16_t *src1_ptr,
3468 int32_t src2_stride,
3469 uint8_t *dst,
3470 int32_t dst_stride,
3471 const int8_t *filter,
3472 int32_t height)
3473 {
3474 int32_t loop_cnt;
3475 v16i8 src0, src1, src2, src3, src4, src5;
3476 v8i16 in0, in1, in2, in3;
3477 v16i8 src10_r, src32_r, src21_r, src43_r;
3478 v16i8 src10_l, src32_l, src21_l, src43_l;
3479 v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
3480 v8i16 filt0, filt1;
3481 v8i16 filter_vec, const_vec;
3482
3483 src0_ptr -= src_stride;
3484
3485 const_vec = __msa_ldi_h(128);
3486 const_vec <<= 6;
3487
3488 filter_vec = LD_SH(filter);
3489 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3490
3491 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3492 src0_ptr += (3 * src_stride);
3493 XORI_B3_128_SB(src0, src1, src2);
3494 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3495 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3496
3497 for (loop_cnt = (height >> 2); loop_cnt--;) {
3498 LD_SB2(src0_ptr, src_stride, src3, src4);
3499 src0_ptr += (2 * src_stride);
3500 LD_SH2(src1_ptr, src2_stride, in0, in1);
3501 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3502 src1_ptr += (2 * src2_stride);
3503 XORI_B2_128_SB(src3, src4);
3504 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3505 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3506
3507 dst0_r = const_vec;
3508 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3509 dst1_r = const_vec;
3510 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3511 dst0_l = const_vec;
3512 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3513 dst1_l = const_vec;
3514 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3515 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3516 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3517 dst0_r, dst1_r, dst0_l, dst1_l);
3518
3519 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3520 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3521 dst += (2 * dst_stride);
3522
3523 LD_SB2(src0_ptr, src_stride, src5, src2);
3524 src0_ptr += (2 * src_stride);
3525 LD_SH2(src1_ptr, src2_stride, in0, in1);
3526 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3527 src1_ptr += (2 * src2_stride);
3528 XORI_B2_128_SB(src5, src2);
3529 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3530 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3531
3532 dst0_r = const_vec;
3533 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3534 dst0_l = const_vec;
3535 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3536 dst1_r = const_vec;
3537 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3538 dst1_l = const_vec;
3539 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3540 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3541 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3542 dst0_r, dst1_r, dst0_l, dst1_l);
3543
3544 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3545 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3546 dst += (2 * dst_stride);
3547 }
3548 }
3549
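/* Vertical 4-tap bi-prediction for 24-wide blocks, handled as a
 * 16-wide section plus an 8-wide section per row pair. */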
3550 static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
3551 int32_t src_stride,
3552 int16_t *src1_ptr,
3553 int32_t src2_stride,
3554 uint8_t *dst,
3555 int32_t dst_stride,
3556 const int8_t *filter,
3557 int32_t height)
3558 {
3559 uint32_t loop_cnt;
3560 v16i8 src0, src1, src2, src3, src4, src5;
3561 v16i8 src6, src7, src8, src9, src10, src11;
3562 v8i16 in0, in1, in2, in3, in4, in5;
3563 v16i8 src10_r, src32_r, src76_r, src98_r;
3564 v16i8 src21_r, src43_r, src87_r, src109_r;
3565 v16i8 src10_l, src32_l, src21_l, src43_l;
3566 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3567 v8i16 dst0_l, dst1_l;
3568 v8i16 filt0, filt1;
3569 v8i16 filter_vec, const_vec;
3570
3571 src0_ptr -= src_stride;
3572
3573 const_vec = __msa_ldi_h(128);
3574 const_vec <<= 6;
3575
3576 filter_vec = LD_SH(filter);
3577 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3578
3579 /* 16width */
3580 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3581 XORI_B3_128_SB(src0, src1, src2);
3582 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3583 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3584 /* 8width */
3585 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3586 src0_ptr += (3 * src_stride);
3587 XORI_B3_128_SB(src6, src7, src8);
3588 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3589
3590 for (loop_cnt = (height >> 2); loop_cnt--;) {
3591 /* 16width */
3592 LD_SB2(src0_ptr, src_stride, src3, src4);
3593 LD_SH2(src1_ptr, src2_stride, in0, in1);
3594 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3595 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3596 src1_ptr += (2 * src2_stride);
3597 XORI_B2_128_SB(src3, src4);
3598 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3599 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3600 /* 8width */
3601 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3602 src0_ptr += (2 * src_stride);
3603 XORI_B2_128_SB(src9, src10);
3604 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3605 /* 16width */
3606 dst0_r = const_vec;
3607 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3608 dst0_l = const_vec;
3609 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3610 dst1_r = const_vec;
3611 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3612 dst1_l = const_vec;
3613 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3614 /* 8width */
3615 dst2_r = const_vec;
3616 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3617 dst3_r = const_vec;
3618 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3619 /* 16width */
3620 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3621 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3622 dst0_r, dst1_r, dst0_l, dst1_l);
3623
3624 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3625
3626 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3627 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3628 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3629 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3630 dst += (2 * dst_stride);
3631
3632 /* 16width */
3633 LD_SB2(src0_ptr, src_stride, src5, src2);
3634 LD_SH2(src1_ptr, src2_stride, in0, in1);
3635 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3636 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3637 src1_ptr += (2 * src2_stride);
3638 XORI_B2_128_SB(src5, src2);
3639 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3640 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3641 /* 8width */
3642 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
3643 src0_ptr += (2 * src_stride);
3644 XORI_B2_128_SB(src11, src8);
3645 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3646 /* 16width */
3647 dst0_r = const_vec;
3648 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3649 dst0_l = const_vec;
3650 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3651 dst1_r = const_vec;
3652 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3653 dst1_l = const_vec;
3654 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3655 /* 8width */
3656 dst2_r = const_vec;
3657 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3658 dst3_r = const_vec;
3659 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3660
3661 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3662 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3663 dst0_r, dst1_r, dst0_l, dst1_l);
3664 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3665
3666 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3667 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3668 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3669 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3670 dst += (2 * dst_stride);
3671 }
3672 }
3673
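/* Vertical 4-tap bi-prediction for 32-wide blocks, handled as two
 * independent 16-wide sections (the second written through dst_tmp). */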
3674 static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
3675 int32_t src_stride,
3676 int16_t *src1_ptr,
3677 int32_t src2_stride,
3678 uint8_t *dst,
3679 int32_t dst_stride,
3680 const int8_t *filter,
3681 int32_t height)
3682 {
3683 uint32_t loop_cnt;
3684 uint8_t *dst_tmp = dst + 16;
3685 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3686 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3687 v16i8 src10_r, src32_r, src76_r, src98_r;
3688 v16i8 src21_r, src43_r, src87_r, src109_r;
3689 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3690 v16i8 src10_l, src32_l, src76_l, src98_l;
3691 v16i8 src21_l, src43_l, src87_l, src109_l;
3692 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3693 v8i16 filt0, filt1;
3694 v8i16 filter_vec, const_vec;
3695
3696 src0_ptr -= src_stride;
3697
3698 const_vec = __msa_ldi_h(128);
3699 const_vec <<= 6;
3700
3701 filter_vec = LD_SH(filter);
3702 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3703
3704 /* 16width */
3705 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3706 XORI_B3_128_SB(src0, src1, src2);
3707 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3708 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3709
3710 /* next 16width */
3711 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3712 src0_ptr += (3 * src_stride);
3713 XORI_B3_128_SB(src6, src7, src8);
3714 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3715 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3716
3717 for (loop_cnt = (height >> 1); loop_cnt--;) {
3718 /* 16width */
3719 LD_SB2(src0_ptr, src_stride, src3, src4);
3720 LD_SH2(src1_ptr, src2_stride, in0, in1);
3721 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3722 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3723 LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
3724 src1_ptr += (2 * src2_stride);
3725 XORI_B2_128_SB(src3, src4);
3726 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3727 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3728 /* 16width */
3729 dst0_r = const_vec;
3730 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3731 dst0_l = const_vec;
3732 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3733 dst1_r = const_vec;
3734 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3735 dst1_l = const_vec;
3736 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3737 /* 16width */
3738 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3739 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3740 dst0_r, dst1_r, dst0_l, dst1_l);
3741
3742 src10_r = src32_r;
3743 src21_r = src43_r;
3744 src10_l = src32_l;
3745 src21_l = src43_l;
3746 src2 = src4;
3747
3748 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3749 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3750 dst += (2 * dst_stride);
3751
3752 /* next 16width */
3753 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3754 src0_ptr += (2 * src_stride);
3755 XORI_B2_128_SB(src9, src10);
3756 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3757 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3758 /* next 16width */
3759 dst2_r = const_vec;
3760 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3761 dst2_l = const_vec;
3762 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3763 dst3_r = const_vec;
3764 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3765 dst3_l = const_vec;
3766 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3767 /* next 16width */
3768 HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
3769 dst2_r, dst3_r, dst2_l, dst3_l, 7,
3770 dst2_r, dst3_r, dst2_l, dst3_l);
3771
3772 PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3773 ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
3774 dst_tmp += (2 * dst_stride);
3775
3776 src76_r = src98_r;
3777 src87_r = src109_r;
3778 src76_l = src98_l;
3779 src87_l = src109_l;
3780 src8 = src10;
3781 }
3782 }
3783
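/* Combined 4-tap horizontal + 4-tap vertical (hv) bi-prediction, 4x2
 * block. The horizontal pass keeps 16-bit precision; the vertical
 * pass accumulates in 32 bits and shifts right by 6 before the bi
 * round/clip. Note that in the hv paths const_vec (128 << 6) is added
 * to the src1_ptr input instead of seeding the accumulator. */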
3784 static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
3785 int32_t src_stride,
3786 int16_t *src1_ptr,
3787 int32_t src2_stride,
3788 uint8_t *dst,
3789 int32_t dst_stride,
3790 const int8_t *filter_x,
3791 const int8_t *filter_y)
3792 {
3793 uint64_t tp0, tp1;
3794 v16u8 out;
3795 v8i16 in0 = { 0 };
3796 v16i8 src0, src1, src2, src3, src4;
3797 v8i16 filt0, filt1;
3798 v8i16 filt_h0, filt_h1;
3799 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3800 v16i8 mask1;
3801 v8i16 filter_vec, const_vec;
3802 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3803 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;
3804 v4i32 dst0, dst1;
3805
3806 src0_ptr -= (src_stride + 1);
3807
3808 filter_vec = LD_SH(filter_x);
3809 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3810
3811 filter_vec = LD_SH(filter_y);
3812 UNPCK_R_SB_SH(filter_vec, filter_vec);
3813
3814 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3815
3816 mask1 = mask0 + 2;
3817
3818 const_vec = __msa_ldi_h(128);
3819 const_vec <<= 6;
3820
3821 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
3822 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3823
3824 LD2(src1_ptr, src2_stride, tp0, tp1);
3825 INSERT_D2_SH(tp0, tp1, in0);
3826 in0 = __msa_adds_s_h(in0, const_vec);
3827
3828 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3829 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3830 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3831
3832 dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3833 dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3834 dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3835
3836 ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3837 ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3838
3839 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3840 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3841 dst0 >>= 6;
3842 dst1 >>= 6;
3843 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3844 tmp = __msa_adds_s_h(tmp, in0);
3845 tmp = __msa_srari_h(tmp, 7);
3846 CLIP_SH_0_255(tmp);
3847 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
3848 ST_W2(out, 0, 1, dst, dst_stride);
3849 }
3850
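/* hv 4-tap bi-prediction, 4x4 block. */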
3851 static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
3852 int32_t src_stride,
3853 int16_t *src1_ptr,
3854 int32_t src2_stride,
3855 uint8_t *dst,
3856 int32_t dst_stride,
3857 const int8_t *filter_x,
3858 const int8_t *filter_y)
3859 {
3860 uint64_t tp0, tp1;
3861 v16u8 out;
3862 v16i8 src0, src1, src2, src3, src4, src5, src6;
3863 v8i16 filt0, filt1;
3864 v8i16 filt_h0, filt_h1;
3865 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3866 v16i8 mask1;
3867 v8i16 filter_vec, const_vec;
3868 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3869 v8i16 tmp0, tmp1;
3870 v8i16 in0 = { 0 }, in1 = { 0 };
3871 v8i16 dst30, dst41, dst52, dst63;
3872 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3873 v4i32 dst0, dst1, dst2, dst3;
3874
3875 src0_ptr -= (src_stride + 1);
3876
3877 filter_vec = LD_SH(filter_x);
3878 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3879
3880 filter_vec = LD_SH(filter_y);
3881 UNPCK_R_SB_SH(filter_vec, filter_vec);
3882
3883 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3884
3885 mask1 = mask0 + 2;
3886
3887 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
3888 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3889
3890 const_vec = __msa_ldi_h(128);
3891 const_vec <<= 6;
3892
3893 LD2(src1_ptr, src2_stride, tp0, tp1);
3894 src1_ptr += 2 * src2_stride;
3895 INSERT_D2_SH(tp0, tp1, in0);
3896 LD2(src1_ptr, src2_stride, tp0, tp1);
3897 INSERT_D2_SH(tp0, tp1, in1);
3898
3899 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
3900
3901 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3902 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3903 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3904 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3905
3906 dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3907 dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3908 dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3909 dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3910
3911 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3912 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3913 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3914 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3915 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3916 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3917 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3918 SRA_4V(dst0, dst1, dst2, dst3, 6);
3919 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3920 ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
3921 SRARI_H2_SH(tmp0, tmp1, 7);
3922 CLIP_SH2_0_255(tmp0, tmp1);
3923 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3924 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3925 }
3926
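/* hv 4-tap bi-prediction for 4-wide blocks, height a multiple of 8.
 * Two rows share each horizontal-pass vector (e.g. dst73 holds rows 3
 * and 7), and the last rows of vertical filter history are carried
 * across loop iterations. */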
3927 static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
3928 int32_t src_stride,
3929 int16_t *src1_ptr,
3930 int32_t src2_stride,
3931 uint8_t *dst,
3932 int32_t dst_stride,
3933 const int8_t *filter_x,
3934 const int8_t *filter_y,
3935 int32_t height)
3936 {
3937 uint32_t loop_cnt;
3938 uint64_t tp0, tp1;
3939 v16u8 out0, out1;
3940 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3941 v8i16 filt0, filt1;
3942 v8i16 filt_h0, filt_h1;
3943 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3944 v16i8 mask1;
3945 v8i16 filter_vec, const_vec;
3946 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3947 v8i16 tmp0, tmp1, tmp2, tmp3;
3948 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3949 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3950 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3951 v8i16 dst98_r, dst109_r;
3952 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
3953 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3954
3955 src0_ptr -= (src_stride + 1);
3956
3957 filter_vec = LD_SH(filter_x);
3958 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3959
3960 filter_vec = LD_SH(filter_y);
3961 UNPCK_R_SB_SH(filter_vec, filter_vec);
3962
3963 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3964
3965 mask1 = mask0 + 2;
3966
3967 const_vec = __msa_ldi_h(128);
3968 const_vec <<= 6;
3969
3970 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3971 src0_ptr += (3 * src_stride);
3972 XORI_B3_128_SB(src0, src1, src2);
3973
3974 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3975 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3976 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3977 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3978 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3979 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3980
3981
3982 for (loop_cnt = height >> 3; loop_cnt--;) {
3983 LD_SB8(src0_ptr, src_stride,
3984 src3, src4, src5, src6, src7, src8, src9, src10);
3985 src0_ptr += (8 * src_stride);
3986 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3987 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3988 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3989 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3990 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3991
3992 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3993 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3994 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3995 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3996
3997 dst32_r = __msa_ilvr_h(dst73, dst22);
3998 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3999 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4000 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4001 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4002 dst76_r = __msa_ilvr_h(dst22, dst106);
4003
4004 LD2(src1_ptr, src2_stride, tp0, tp1);
4005 src1_ptr += 2 * src2_stride;
4006 INSERT_D2_SH(tp0, tp1, in0);
4007 LD2(src1_ptr, src2_stride, tp0, tp1);
4008 src1_ptr += 2 * src2_stride;
4009 INSERT_D2_SH(tp0, tp1, in1);
4010
4011 LD2(src1_ptr, src2_stride, tp0, tp1);
4012 src1_ptr += 2 * src2_stride;
4013 INSERT_D2_SH(tp0, tp1, in2);
4014 LD2(src1_ptr, src2_stride, tp0, tp1);
4015 src1_ptr += 2 * src2_stride;
4016 INSERT_D2_SH(tp0, tp1, in3);
4017
4018 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4019 const_vec, in0, in1, in2, in3);
4020 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4021 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4022 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4023 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4024 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4025 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4026 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4027 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4028 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4029 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4030 PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
4031 dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
4032 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
4033 tmp2, tmp3);
4034 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4035 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4036 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4037 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4038 dst += (8 * dst_stride);
4039
4040 dst10_r = dst98_r;
4041 dst21_r = dst109_r;
4042 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4043 }
4044 }
4045
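/* Dispatch 4-wide hv bi-prediction on block height (2, 4, or a
 * multiple of 8). */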
4046 static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
4047 int32_t src_stride,
4048 int16_t *src1_ptr,
4049 int32_t src2_stride,
4050 uint8_t *dst,
4051 int32_t dst_stride,
4052 const int8_t *filter_x,
4053 const int8_t *filter_y,
4054 int32_t height)
4055 {
4056 if (2 == height) {
4057 hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4058 dst, dst_stride, filter_x, filter_y);
4059 } else if (4 == height) {
4060 hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4061 dst, dst_stride, filter_x, filter_y);
4062 } else if (0 == (height % 8)) {
4063 hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
4064 src1_ptr, src2_stride,
4065 dst, dst_stride,
4066 filter_x, filter_y, height);
4067 }
4068 }
4069
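/* hv 4-tap bi-prediction for 6-wide blocks. Eight rows are filtered;
 * columns 0-3 are stored as words, and the remaining two columns are
 * gathered as word loads from src1_ptr + 4 and written as halfword
 * stores at dst + 4. No height loop: eight rows are always produced. */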
4070 static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
4071 int32_t src_stride,
4072 int16_t *src1_ptr,
4073 int32_t src2_stride,
4074 uint8_t *dst,
4075 int32_t dst_stride,
4076 const int8_t *filter_x,
4077 const int8_t *filter_y,
4078 int32_t height)
4079 {
4080 uint32_t tpw0, tpw1, tpw2, tpw3;
4081 uint64_t tp0, tp1;
4082 v16u8 out0, out1, out2;
4083 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4084 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4085 v8i16 filt0, filt1;
4086 v8i16 filt_h0, filt_h1;
4087 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4088 v16i8 mask1;
4089 v8i16 filter_vec, const_vec;
4090 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4091 v8i16 dsth10, tmp4, tmp5;
4092 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4093 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4094 v8i16 tmp0, tmp1, tmp2, tmp3;
4095 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4096 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4097 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
4098 v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
4099 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4100 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4101 v8i16 in4 = { 0 }, in5 = { 0 };
4102
4103 src0_ptr -= (src_stride + 1);
4104
4105 filter_vec = LD_SH(filter_x);
4106 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4107
4108 filter_vec = LD_SH(filter_y);
4109 UNPCK_R_SB_SH(filter_vec, filter_vec);
4110
4111 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4112
4113 mask1 = mask0 + 2;
4114
4115 const_vec = __msa_ldi_h(128);
4116 const_vec <<= 6;
4117
4118 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4119 src0_ptr += (3 * src_stride);
4120 XORI_B3_128_SB(src0, src1, src2);
4121
4122 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4123 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4124 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4125
4126 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4127 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4128 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4129
4130 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4131 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4132
4133 LD_SB8(src0_ptr, src_stride,
4134 src3, src4, src5, src6, src7, src8, src9, src10);
4135 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4136
4137 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4138 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4139 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4140 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4141
4142 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4143 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4144 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4145 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4146
4147 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4148 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4149 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4150 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4151
4152 dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4153 dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4154 dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4155 dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4156
4157 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4158 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4159 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4160 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4161 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4162 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4163 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4164 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
4165 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4166 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4167 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4168
4169 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4170 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4171 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4172 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4173 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4174 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4175 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4176 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4177 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4178 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4179 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4180 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4181 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4182 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4183 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4184 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4185 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4186 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
4187
4188 LD2(src1_ptr, src2_stride, tp0, tp1);
4189 INSERT_D2_SH(tp0, tp1, in0);
4190 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4191 INSERT_D2_SH(tp0, tp1, in1);
4192
4193 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4194 INSERT_D2_SH(tp0, tp1, in2);
4195 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4196 INSERT_D2_SH(tp0, tp1, in3);
4197
4198 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4199 in0, in1, in2, in3);
4200 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
4201 tmp3);
4202 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4203 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4204 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4205 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4206
4207 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4208 src1_ptr += (4 * src2_stride);
4209 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
4210 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4211 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
4212 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
4213 ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
4214 SRARI_H2_SH(tmp4, tmp5, 7);
4215 CLIP_SH2_0_255(tmp4, tmp5);
4216 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4217 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
4218 }
4219
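/* hv 4-tap bi-prediction, 8x2 block. */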
4220 static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
4221 int32_t src_stride,
4222 int16_t *src1_ptr,
4223 int32_t src2_stride,
4224 uint8_t *dst,
4225 int32_t dst_stride,
4226 const int8_t *filter_x,
4227 const int8_t *filter_y)
4228 {
4229 v16u8 out;
4230 v16i8 src0, src1, src2, src3, src4;
4231 v8i16 filt0, filt1;
4232 v8i16 filt_h0, filt_h1;
4233 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4234 v16i8 mask1;
4235 v8i16 filter_vec, const_vec;
4236 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4237 v8i16 dst0, dst1, dst2, dst3, dst4;
4238 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4239 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4240 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4241 v8i16 tmp0, tmp1;
4242 v8i16 in0, in1;
4243
4244 src0_ptr -= (src_stride + 1);
4245
4246 filter_vec = LD_SH(filter_x);
4247 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4248
4249 filter_vec = LD_SH(filter_y);
4250 UNPCK_R_SB_SH(filter_vec, filter_vec);
4251
4252 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4253
4254 mask1 = mask0 + 2;
4255
4256 const_vec = __msa_ldi_h(128);
4257 const_vec <<= 6;
4258
4259 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4260 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4261
4262 LD_SH2(src1_ptr, src2_stride, in0, in1);
4263 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
4264
4265 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4266 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4267 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4268 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4269 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4270
4271 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4272 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4273 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4274 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4275 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
4276
4277 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4278 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4279 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4280 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4281 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4282 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4283 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4284 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4285 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4286 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4287 ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1);
4288 SRARI_H2_SH(tmp0, tmp1, 7);
4289 CLIP_SH2_0_255(tmp0, tmp1);
4290 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4291 ST_D2(out, 0, 1, dst, dst_stride);
4292 }
4293
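/* hv 4-tap bi-prediction for blocks of height 4 whose width is a
 * multiple of 8, processed in 8-column stripes (width8mult = width/8). */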
4294 static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
4295 int32_t src_stride,
4296 int16_t *src1_ptr,
4297 int32_t src2_stride,
4298 uint8_t *dst,
4299 int32_t dst_stride,
4300 const int8_t *filter_x,
4301 const int8_t *filter_y,
4302 int32_t width8mult)
4303 {
4304 uint32_t cnt;
4305 v16u8 out0, out1;
4306 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
4307 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4308 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
4309 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4310 v8i16 in0, in1, in2, in3;
4311 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4312 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4313 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4314
4315 src0_ptr -= (src_stride + 1);
4316
4317 filter_vec = LD_SH(filter_x);
4318 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4319
4320 filter_vec = LD_SH(filter_y);
4321 UNPCK_R_SB_SH(filter_vec, filter_vec);
4322
4323 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4324
4325 mask0 = LD_SB(ff_hevc_mask_arr);
4326 mask1 = mask0 + 2;
4327
4328 const_vec = __msa_ldi_h(128);
4329 const_vec <<= 6;
4330
4331 for (cnt = width8mult; cnt--;) {
4332 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4333 src0_ptr += 8;
4334 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4335
4336 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4337 src1_ptr += 8;
4338 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4339 const_vec, in0, in1, in2, in3);
4340
4341 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4342 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4343 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4344
4345 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4346 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4347 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4348
4349 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4350 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4351
4352 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4353 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4354 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4355 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4356
4357 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4358 dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4359 dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4360 dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4361
4362 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4363 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4364 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4365 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4366
4367 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4368 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4369 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4370 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4371 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4372 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4373 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4374 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4375
4376 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4377 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4378 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4379 dst3_r, tmp0, tmp1, tmp2, tmp3);
4380 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4381 tmp0, tmp1, tmp2, tmp3);
4382 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4383 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4384 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4385 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4386 dst += 8;
4387 }
4388 }
4389
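/* hv 4-tap bi-prediction, 8x6 block. */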
4390 static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
4391 int32_t src_stride,
4392 int16_t *src1_ptr,
4393 int32_t src2_stride,
4394 uint8_t *dst,
4395 int32_t dst_stride,
4396 const int8_t *filter_x,
4397 const int8_t *filter_y)
4398 {
4399 v16u8 out0, out1, out2;
4400 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4401 v8i16 in0, in1, in2, in3, in4, in5;
4402 v8i16 filt0, filt1;
4403 v8i16 filt_h0, filt_h1;
4404 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4405 v16i8 mask1;
4406 v8i16 filter_vec, const_vec;
4407 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4408 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
4409 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4410 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4411 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4412 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4413 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4414 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4415 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4416 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4417
4418 src0_ptr -= (src_stride + 1);
4419
4420 filter_vec = LD_SH(filter_x);
4421 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4422
4423 filter_vec = LD_SH(filter_y);
4424 UNPCK_R_SB_SH(filter_vec, filter_vec);
4425
4426 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4427
4428 mask1 = mask0 + 2;
4429
4430 const_vec = __msa_ldi_h(128);
4431 const_vec <<= 6;
4432
4433 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4434 src0_ptr += (5 * src_stride);
4435 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
4436
4437 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4438 XORI_B4_128_SB(src5, src6, src7, src8);
4439
4440 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4441 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4442 in0, in1, in2, in3);
4443 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
4444
4445 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4446 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4447 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4448 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4449 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4450 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
4451 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
4452 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
4453 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
4454
4455 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4456 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4457 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4458 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4459 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
4460 dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
4461 dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
4462 dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
4463 dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
4464
4465 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4466 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4467 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4468 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4469 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4470 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4471 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4472 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
4473
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

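    /* scale down by 6 bits, pack, add the bi-pred input, round, clip to
     * [0, 255] and store the six 8-pixel rows */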
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                tmp0, tmp1, tmp2, tmp3);
    ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    SRARI_H2_SH(tmp4, tmp5, 7);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    CLIP_SH2_0_255(tmp4, tmp5);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}

static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height,
                                          int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

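    /* process the block as independent 8-column stripes */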
    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
        src0_ptr_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

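        /* keep three horizontally filtered rows live across iterations;
         * each pass of the loop below emits four output rows */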
        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            src1_ptr_tmp += (4 * src2_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                        const_vec, in0, in1, in2, in3);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, tmp0, tmp1, tmp2, tmp3);
            ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                        tmp0, tmp1, tmp2, tmp3);
            SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

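            /* slide the vertical-filter history down by four rows */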
            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}

static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    if (2 == height) {
        hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else {
        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y, height, 8);
    }
}

static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

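    /* left 8 columns: same stripe flow as the 8-wide kernel above */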
    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
    src0_ptr_tmp += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
        src0_ptr_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);
        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    src0_ptr += 8;
    dst += 8;
    src1_ptr += 8;

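    /* right 4 columns: two-source shuffle masks pick pixels from a pair
     * of rows per vector */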
    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

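    /* eight output rows per iteration; the horizontal pass pairs rows
     * (3,7), (4,8), (5,9) and (6,10) in one vector each */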
    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (4 == height) {
        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y, 2);
    } else {
        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter_x,
                                      filter_y, height, 16);
    }
}

static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);
}

static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 32);
}

#define BI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
                                dst, dst_stride, height);                 \
}
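
/*
 * For reference, BI_MC_COPY(4) expands to the exported entry point below
 * (reformatted); mx, my and width are unused for plain bi-pred copies:
 *
 * void ff_hevc_put_hevc_bi_pel_pixels4_8_msa(uint8_t *dst,
 *                                            ptrdiff_t dst_stride,
 *                                            uint8_t *src,
 *                                            ptrdiff_t src_stride,
 *                                            int16_t *src_16bit,
 *                                            int height, intptr_t mx,
 *                                            intptr_t my, int width)
 * {
 *     hevc_bi_copy_4w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,
 *                         dst, dst_stride, height);
 * }
 */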

BI_MC_COPY(4);
BI_MC_COPY(6);
BI_MC_COPY(8);
BI_MC_COPY(12);
BI_MC_COPY(16);
BI_MC_COPY(24);
BI_MC_COPY(32);
BI_MC_COPY(48);
BI_MC_COPY(64);

#undef BI_MC_COPY

#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                         \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,         \
                                                      ptrdiff_t dst_stride, \
                                                      uint8_t *src,         \
                                                      ptrdiff_t src_stride, \
                                                      int16_t *src_16bit,   \
                                                      int height,           \
                                                      intptr_t mx,          \
                                                      intptr_t my,          \
                                                      int width)            \
{                                                                           \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];           \
                                                                            \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,    \
                                             MAX_PB_SIZE, dst, dst_stride,  \
                                             filter, height);               \
}
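
/*
 * For reference, BI_MC(qpel, h, 4, 8, hz, mx) expands to (reformatted):
 *
 * void ff_hevc_put_hevc_bi_qpel_h4_8_msa(uint8_t *dst, ptrdiff_t dst_stride,
 *                                        uint8_t *src, ptrdiff_t src_stride,
 *                                        int16_t *src_16bit, int height,
 *                                        intptr_t mx, intptr_t my, int width)
 * {
 *     const int8_t *filter = ff_hevc_qpel_filters[mx - 1];
 *
 *     hevc_hz_bi_8t_4w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,
 *                          dst, dst_stride, filter, height);
 * }
 */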

BI_MC(qpel, h, 4, 8, hz, mx);
BI_MC(qpel, h, 8, 8, hz, mx);
BI_MC(qpel, h, 12, 8, hz, mx);
BI_MC(qpel, h, 16, 8, hz, mx);
BI_MC(qpel, h, 24, 8, hz, mx);
BI_MC(qpel, h, 32, 8, hz, mx);
BI_MC(qpel, h, 48, 8, hz, mx);
BI_MC(qpel, h, 64, 8, hz, mx);

BI_MC(qpel, v, 4, 8, vt, my);
BI_MC(qpel, v, 8, 8, vt, my);
BI_MC(qpel, v, 12, 8, vt, my);
BI_MC(qpel, v, 16, 8, vt, my);
BI_MC(qpel, v, 24, 8, vt, my);
BI_MC(qpel, v, 32, 8, vt, my);
BI_MC(qpel, v, 48, 8, vt, my);
BI_MC(qpel, v, 64, 8, vt, my);

BI_MC(epel, h, 4, 4, hz, mx);
BI_MC(epel, h, 8, 4, hz, mx);
BI_MC(epel, h, 6, 4, hz, mx);
BI_MC(epel, h, 12, 4, hz, mx);
BI_MC(epel, h, 16, 4, hz, mx);
BI_MC(epel, h, 24, 4, hz, mx);
BI_MC(epel, h, 32, 4, hz, mx);

BI_MC(epel, v, 4, 4, vt, my);
BI_MC(epel, v, 8, 4, vt, my);
BI_MC(epel, v, 6, 4, vt, my);
BI_MC(epel, v, 12, 4, vt, my);
BI_MC(epel, v, 16, 4, vt, my);
BI_MC(epel, v, 24, 4, vt, my);
BI_MC(epel, v, 32, 4, vt, my);

#undef BI_MC

#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,        \
                                       MAX_PB_SIZE, dst, dst_stride,      \
                                       filter_x, filter_y, height);       \
}
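
/*
 * For reference, BI_MC_HV(epel, 4, 4) expands to (reformatted):
 *
 * void ff_hevc_put_hevc_bi_epel_hv4_8_msa(uint8_t *dst, ptrdiff_t dst_stride,
 *                                         uint8_t *src, ptrdiff_t src_stride,
 *                                         int16_t *src_16bit, int height,
 *                                         intptr_t mx, intptr_t my, int width)
 * {
 *     const int8_t *filter_x = ff_hevc_epel_filters[mx - 1];
 *     const int8_t *filter_y = ff_hevc_epel_filters[my - 1];
 *
 *     hevc_hv_bi_4t_4w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,
 *                          dst, dst_stride, filter_x, filter_y, height);
 * }
 */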

BI_MC_HV(qpel, 4, 8);
BI_MC_HV(qpel, 8, 8);
BI_MC_HV(qpel, 12, 8);
BI_MC_HV(qpel, 16, 8);
BI_MC_HV(qpel, 24, 8);
BI_MC_HV(qpel, 32, 8);
BI_MC_HV(qpel, 48, 8);
BI_MC_HV(qpel, 64, 8);

BI_MC_HV(epel, 4, 4);
BI_MC_HV(epel, 8, 4);
BI_MC_HV(epel, 6, 4);
BI_MC_HV(epel, 12, 4);
BI_MC_HV(epel, 16, 4);
BI_MC_HV(epel, 24, 4);
BI_MC_HV(epel, 32, 4);

#undef BI_MC_HV