/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

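/* Shuffle masks for VSHF_B: the first row gathers the overlapping byte pairs
 * of one source vector (widths of 8 and above); in the second row, indices
 * 16 and up select bytes from the second source vector, so two 4-wide rows
 * can share a single shuffle. */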
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

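/* Bi-weighted rounding and clipping of two v8i16 vectors: each sample of
 * in0/in1 is interleaved with its partner from vec0/vec1 so that a single
 * __msa_dpadd_s_w evaluates vec * weight0 + in * weight1 per lane on top of
 * the preloaded offset, followed by a rounding shift, repacking to 16 bits
 * and a clip to [0, 255]. */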
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \
                           out0, out1)                              \
{                                                                   \
    v4i32 out0_r, out1_r, out0_l, out1_l;                           \
                                                                    \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);               \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);               \
                                                                    \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);  \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);  \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);  \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);  \
                                                                    \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);        \
    CLIP_SH2_0_255(out0, out1);                                     \
}

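/* Four-vector variant, expanded as two HEVC_BIW_RND_CLIP2 invocations. */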
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,      \
                           wgt, rnd, offset, out0, out1, out2, out3)        \
{                                                                           \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \
}

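/* The _MAX_SATU variants below are currently identical to the plain macros
 * above; the distinct name appears to be kept from an earlier saturating
 * implementation. */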
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,  \
                                    offset, out0, out1)              \
{                                                                    \
    v4i32 out0_r, out1_r, out0_l, out1_l;                            \
                                                                     \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);   \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);   \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);   \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);   \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                 \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);         \
    CLIP_SH2_0_255(out0, out1);                                      \
}

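/* Four-vector form of HEVC_BIW_RND_CLIP2_MAX_SATU. */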
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,  \
                                    vec3, wgt, rnd, offset, out0, out1,    \
                                    out2, out3)                            \
{                                                                          \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,    \
                                out0, out1);                               \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,    \
                                out2, out3);                               \
}

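/* Bi-weighted copy (no filtering), 4 pixels wide: the 8-bit pels from
 * src0_ptr are widened into the 14-bit intermediate domain (<< 6) and
 * blended with the 16-bit samples from src1_ptr.  Heights of 2, 4 and
 * multiples of 8 take separate paths. */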
static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    int32_t offset, weight;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, weight_vec;
    v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;

        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        CLIP_SH_0_255(dst0);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                                    offset_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

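/* Bi-weighted copy, 6 pixels wide: four rows per iteration, each row stored
 * as one word plus one half-word. */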
static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
                                    in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec,
                                    dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}

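/* Bi-weighted copy, 8 pixels wide, with dedicated paths for heights 2 and 6
 * and a generic loop for multiples of 4. */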
static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint64_t tp0, tp1, tp2, tp3;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1);

        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 4) {
        uint32_t loop_cnt;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += (4 * src_stride);
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

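/* Bi-weighted copy, 12 pixels wide.  The row loop is fixed at 16 rows
 * (loop_cnt = 16 >> 2) independently of the height argument, which appears
 * to be the only height this width is called with. */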
static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (16 >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);

        dst4 <<= 6;
        dst5 <<= 6;
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

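/* Bi-weighted copy, 16 pixels wide: four rows (eight vectors) per
 * iteration. */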
static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
                                    tmp4, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
                                    tmp6, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

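/* Bi-weighted copy, 24 pixels wide: processes a fixed 8 * 4 = 32 rows
 * regardless of the height argument. */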
static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
                                    weight_vec, rnd_vec, offset_vec, dst4, dst5,
                                    dst6, dst7);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
                                    in11, weight_vec, rnd_vec, offset_vec,
                                    dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

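/* Bi-weighted copy, 32 pixels wide: two rows of two vectors per
 * iteration. */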
static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, tmp0, tmp4);
        ILVRL_B2_SH(zero, src1, tmp1, tmp5);
        ILVRL_B2_SH(zero, src2, tmp2, tmp6);
        ILVRL_B2_SH(zero, src3, tmp3, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

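/* Bi-weighted copy, 48 pixels wide: one row of three vectors per iteration,
 * fixed at 64 rows. */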
static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}

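/* Bi-weighted copy, 64 pixels wide: one row of four vectors per
 * iteration. */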
static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
    }
}

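/* Horizontal 8-tap bi-weighted filter, 4 pixels wide.  Source bytes are
 * XORed with 128 so signed-byte dot products can be used; since the 8-tap
 * filters have a DC gain of 64, each filtered sample comes out short by
 * 128 * 64, which is compensated by folding 128 * weight1 * 64 (the
 * "constant") into the offset.  The two-vector masks at
 * ff_hevc_mask_arr[16] let two rows share each shuffle. */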
static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

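/* Horizontal 8-tap bi-weighted filter, 8 pixels wide; the XOR-128 bias is
 * folded into the offset as in the 4-width version. */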
static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

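/* Horizontal 8-tap bi-weighted filter, 12 pixels wide, fixed at 16 rows:
 * each pass handles four rows as an 8-wide column followed by a 4-wide
 * column (two rows per shuffle). */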
static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;

    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset = (offset0 + offset1) << rnd_val;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
                           out3);
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                           offset_vec, out0, out1);
        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

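/* Horizontal 8-tap bi-weighted filter, 16 pixels wide: two 8-pixel shuffles
 * per row, two rows per iteration. */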
static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

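/* Horizontal 8-tap bi-weighted filter, 24 pixels wide.  The loop is
 * software-pipelined: the next row is loaded before the current one is
 * stored, so the body runs 31 times with an epilogue for the last row,
 * fixing the height at 32. */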
static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    int32_t offset, weight, constant;
    v16i8 src0, src1;
    v8i16 in0, in1, in2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v4i32 dst2_r, dst2_l;
    v8i16 filter_vec, out0, out1, out2;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr = src0_ptr - 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    LD_SB2(src0_ptr, 16, src0, src1);
    src0_ptr += src_stride;
    LD_SH2(src1_ptr, 8, in0, in1);
    in2 = LD_SH(src1_ptr + 16);
    src1_ptr += src2_stride;
    XORI_B2_128_SB(src0, src1);

    for (loop_cnt = 31; loop_cnt--;) {
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
        CLIP_SH_0_255(out2);

        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);
        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
        ST_SH(out0, dst);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
                       out0, out1);
    ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
    out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
    CLIP_SH_0_255(out2);
    PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
    ST_SH(out0, dst);
    SD(dst_val0, dst + 16);
    dst += dst_stride;
}

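/* Horizontal 8-tap bi-weighted filter, 32 pixels wide: one row per
 * iteration from three overlapping 16-byte loads. */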
static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);
        dst += dst_stride;
    }
}

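/* Horizontal 8-tap bi-weighted filter, 48 pixels wide: one row per
 * iteration, fixed at 64 rows. */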
static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);
        LD_SB2(src0_ptr + 32, 8, src3, src4);
        src0_ptr += src_stride;
        XORI_B2_128_SB(src3, src4);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);

        LD_SH2(src1_ptr + 32, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_SH(out0, dst + 32);
        dst += dst_stride;
    }
}

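/* Horizontal 8-tap bi-weighted filter, 64 pixels wide: each row is handled
 * as two 32-pixel halves by the inner cnt loop. */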
static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint8_t *src0_ptr_tmp;
    uint8_t *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        for (cnt = 2; cnt--;) {
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);
            src0_ptr_tmp += 32;
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
            src1_ptr_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                               in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
            ST_SH2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src0_ptr += src_stride;
        src1_ptr += src2_stride;
        dst += dst_stride;
    }
}

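/* Vertical 8-tap bi-weighted filter, 4 pixels wide.  Here the XOR-128 bias
 * compensation is computed in vector form: offset_vec += (128 << 6) *
 * weight1.  Row pairs are packed per doubleword so every dot product covers
 * two output rows, and eight rows are produced per iteration. */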
static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
                    filt0, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
                     filt1, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
                     filt2, filt2, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
                     filt3, filt3, dst10, dst32, dst54, dst76);

        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

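/* Vertical 8-tap bi-weighted filter, 8 pixels wide: a sliding window of
 * interleaved row pairs yields four output rows per iteration. */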
hevc_vt_biwgt_8t_8w_msa(uint8_t * src0_ptr,int32_t src_stride,int16_t * src1_ptr,int32_t src2_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height,int32_t weight0,int32_t weight1,int32_t offset0,int32_t offset1,int32_t rnd_val)1499 static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
1500 int32_t src_stride,
1501 int16_t *src1_ptr,
1502 int32_t src2_stride,
1503 uint8_t *dst,
1504 int32_t dst_stride,
1505 const int8_t *filter,
1506 int32_t height,
1507 int32_t weight0,
1508 int32_t weight1,
1509 int32_t offset0,
1510 int32_t offset1,
1511 int32_t rnd_val)
1512 {
1513 uint32_t loop_cnt;
1514 int32_t offset, weight;
1515 v16i8 src0, src1, src2, src3, src4, src5;
1516 v16i8 src6, src7, src8, src9, src10;
1517 v8i16 in0, in1, in2, in3;
1518 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1519 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1520 v8i16 tmp0, tmp1, tmp2, tmp3;
1521 v8i16 filt0, filt1, filt2, filt3;
1522 v8i16 filter_vec, out0, out1, out2, out3;
1523 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1524
1525 src0_ptr -= (3 * src_stride);
1526 offset = (offset0 + offset1) << rnd_val;
1527 weight0 = weight0 & 0x0000FFFF;
1528 weight = weight0 | (weight1 << 16);
1529
1530 const_vec = __msa_ldi_w(128);
1531 const_vec <<= 6;
1532 offset_vec = __msa_fill_w(offset);
1533 weight_vec = __msa_fill_w(weight);
1534 rnd_vec = __msa_fill_w(rnd_val + 1);
1535 weight1_vec = __msa_fill_w(weight1);
1536 offset_vec += const_vec * weight1_vec;
1537
1538 filter_vec = LD_SH(filter);
1539 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1540
1541 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1542 src0_ptr += (7 * src_stride);
1543 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1544
1545 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1546 src10_r, src32_r, src54_r, src21_r);
1547 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1548
1549 for (loop_cnt = (height >> 2); loop_cnt--;) {
1550 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1551 src0_ptr += (4 * src_stride);
1552 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1553 src1_ptr += (4 * src2_stride);
1554
1555 XORI_B4_128_SB(src7, src8, src9, src10);
1556 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1557 src76_r, src87_r, src98_r, src109_r);
1558
1559 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1560 filt0, tmp0, tmp1, tmp2, tmp3);
1561 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1562 filt1, tmp0, tmp1, tmp2, tmp3);
1563 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1564 filt2, tmp0, tmp1, tmp2, tmp3);
1565 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1566 filt3, tmp0, tmp1, tmp2, tmp3);
1567
1568 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1569 in0, in1, in2, in3,
1570 weight_vec, rnd_vec, offset_vec,
1571 out0, out1, out2, out3);
1572
1573 PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1574 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1575 dst += (4 * dst_stride);
1576
1577 src10_r = src54_r;
1578 src32_r = src76_r;
1579 src54_r = src98_r;
1580 src21_r = src65_r;
1581 src43_r = src87_r;
1582 src65_r = src109_r;
1583 src6 = src10;
1584 }
1585 }
1586
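/* 12-wide vertical variant: columns 0-7 follow the right-interleaved 8-wide
 * path, while the left halves of the rows are packed two per register
 * (src2110, src4332, ...) so the remaining 4 columns can be filtered in the
 * same pass.  The row loop is fixed at 8 iterations of 2 rows (16 rows). */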
1587 static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
1588 int32_t src_stride,
1589 int16_t *src1_ptr,
1590 int32_t src2_stride,
1591 uint8_t *dst,
1592 int32_t dst_stride,
1593 const int8_t *filter,
1594 int32_t height,
1595 int32_t weight0,
1596 int32_t weight1,
1597 int32_t offset0,
1598 int32_t offset1,
1599 int32_t rnd_val)
1600 {
1601 uint32_t loop_cnt;
1602 int32_t offset, weight;
1603 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1604 v8i16 in0, in1, in2, in3;
1605 v16i8 src10_r, src32_r, src54_r, src76_r;
1606 v16i8 src21_r, src43_r, src65_r, src87_r;
1607 v8i16 tmp0, tmp1, tmp2;
1608 v16i8 src10_l, src32_l, src54_l, src76_l;
1609 v16i8 src21_l, src43_l, src65_l, src87_l;
1610 v16i8 src2110, src4332, src6554, src8776;
1611 v8i16 filt0, filt1, filt2, filt3;
1612 v8i16 out0, out1, out2, filter_vec;
1613 v4i32 dst2_r, dst2_l;
1614 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1615
1616 src0_ptr -= (3 * src_stride);
1617 offset = (offset0 + offset1) << rnd_val;
1618 weight0 = weight0 & 0x0000FFFF;
1619 weight = weight0 | (weight1 << 16);
1620
1621 const_vec = __msa_ldi_w(128);
1622 const_vec <<= 6;
1623 offset_vec = __msa_fill_w(offset);
1624 weight_vec = __msa_fill_w(weight);
1625 rnd_vec = __msa_fill_w(rnd_val + 1);
1626 weight1_vec = __msa_fill_w(weight1);
1627 offset_vec += const_vec * weight1_vec;
1628
1629 filter_vec = LD_SH(filter);
1630 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1631
1632 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1633 src0_ptr += (7 * src_stride);
1634 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1635
1636 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1637 src10_r, src32_r, src54_r, src21_r);
1638 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1639 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1640 src10_l, src32_l, src54_l, src21_l);
1641 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1642 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1643 src2110, src4332, src6554);
1644
1645 for (loop_cnt = 8; loop_cnt--;) {
1646 LD_SB2(src0_ptr, src_stride, src7, src8);
1647 src0_ptr += (2 * src_stride);
1648 LD_SH2(src1_ptr, src2_stride, in0, in1);
1649 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1650 src1_ptr += (2 * src2_stride);
1651 in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1652 XORI_B2_128_SB(src7, src8);
1653
1654 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1655 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1656 src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1657
1658 DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1659 tmp0, tmp1, tmp2);
1660 DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1661 tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1662 DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1663 tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1664 DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1665 tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1666
1667 HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
1668 weight_vec, rnd_vec, offset_vec,
1669 out0, out1);
1670
1671 ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
1672 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1673 (v8i16) weight_vec);
1674 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1675 (v8i16) weight_vec);
1676 SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1677 out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1678 CLIP_SH_0_255(out2);
1679 PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1680 ST_D2(out0, 0, 1, dst, dst_stride);
1681 ST_W2(out2, 0, 1, dst + 8, dst_stride);
1682 dst += (2 * dst_stride);
1683
1684 src10_r = src32_r;
1685 src32_r = src54_r;
1686 src54_r = src76_r;
1687 src21_r = src43_r;
1688 src43_r = src65_r;
1689 src65_r = src87_r;
1690 src2110 = src4332;
1691 src4332 = src6554;
1692 src6554 = src8776;
1693 src6 = src8;
1694 }
1695 }
1696
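/* Generic vertical kernel for widths that are multiples of 16: the outer
 * loop walks 16-column stripes and the inner loop emits two full-width rows
 * per pass from the right/left interleaved halves. */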
1697 static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
1698 int32_t src_stride,
1699 int16_t *src1_ptr,
1700 int32_t src2_stride,
1701 uint8_t *dst,
1702 int32_t dst_stride,
1703 const int8_t *filter,
1704 int32_t height,
1705 int32_t weight0,
1706 int32_t weight1,
1707 int32_t offset0,
1708 int32_t offset1,
1709 int32_t rnd_val,
1710 int32_t width)
1711 {
1712 uint8_t *src0_ptr_tmp;
1713 int16_t *src1_ptr_tmp;
1714 uint8_t *dst_tmp;
1715 uint32_t loop_cnt, cnt;
1716 int32_t offset, weight;
1717 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1718 v8i16 in0, in1, in2, in3;
1719 v16i8 src10_r, src32_r, src54_r, src76_r;
1720 v16i8 src21_r, src43_r, src65_r, src87_r;
1721 v16i8 src10_l, src32_l, src54_l, src76_l;
1722 v16i8 src21_l, src43_l, src65_l, src87_l;
1723 v8i16 tmp0, tmp1, tmp2, tmp3;
1724 v8i16 filt0, filt1, filt2, filt3;
1725 v8i16 filter_vec;
1726 v8i16 out0, out1, out2, out3;
1727 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1728
1729 src0_ptr -= (3 * src_stride);
1730
1731 offset = (offset0 + offset1) << rnd_val;
1732 weight0 = weight0 & 0x0000FFFF;
1733 weight = weight0 | (weight1 << 16);
1734
1735 const_vec = __msa_ldi_w(128);
1736 const_vec <<= 6;
1737 offset_vec = __msa_fill_w(offset);
1738 weight_vec = __msa_fill_w(weight);
1739 rnd_vec = __msa_fill_w(rnd_val + 1);
1740 weight1_vec = __msa_fill_w(weight1);
1741 offset_vec += const_vec * weight1_vec;
1742
1743 filter_vec = LD_SH(filter);
1744 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1745
1746 for (cnt = (width >> 4); cnt--;) {
1747 src0_ptr_tmp = src0_ptr;
1748 src1_ptr_tmp = src1_ptr;
1749 dst_tmp = dst;
1750
1751 LD_SB7(src0_ptr_tmp, src_stride,
1752 src0, src1, src2, src3, src4, src5, src6);
1753 src0_ptr_tmp += (7 * src_stride);
1754
1755 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1756 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1757 src10_r, src32_r, src54_r, src21_r);
1758 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1759 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1760 src10_l, src32_l, src54_l, src21_l);
1761 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1762
1763 for (loop_cnt = (height >> 1); loop_cnt--;) {
1764 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1765 src0_ptr_tmp += (2 * src_stride);
1766 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1767 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1768 src1_ptr_tmp += (2 * src2_stride);
1769
1770 XORI_B2_128_SB(src7, src8);
1771 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1772 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1773
1774 DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1775 filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1776 DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1777 filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1778 DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1779 filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1780 DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1781 filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1782
1783 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1784 in0, in1, in2, in3,
1785 weight_vec, rnd_vec, offset_vec,
1786 out0, out1, out2, out3);
1787
1788 PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
1789 ST_SH2(out0, out1, dst_tmp, dst_stride);
1790 dst_tmp += (2 * dst_stride);
1791
1792 src10_r = src32_r;
1793 src32_r = src54_r;
1794 src54_r = src76_r;
1795 src21_r = src43_r;
1796 src43_r = src65_r;
1797 src65_r = src87_r;
1798 src10_l = src32_l;
1799 src32_l = src54_l;
1800 src54_l = src76_l;
1801 src21_l = src43_l;
1802 src43_l = src65_l;
1803 src65_l = src87_l;
1804 src6 = src8;
1805 }
1806
1807 src0_ptr += 16;
1808 src1_ptr += 16;
1809 dst += 16;
1810 }
1811 }
1812
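/* Width-specific wrappers over the stripe kernel: 16/32/48/64 map directly
 * onto 16-column stripes, and 24 is split into a 16-wide stripe plus an
 * 8-wide tail. */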
1813 static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
1814 int32_t src_stride,
1815 int16_t *src1_ptr,
1816 int32_t src2_stride,
1817 uint8_t *dst,
1818 int32_t dst_stride,
1819 const int8_t *filter,
1820 int32_t height,
1821 int32_t weight0,
1822 int32_t weight1,
1823 int32_t offset0,
1824 int32_t offset1,
1825 int32_t rnd_val)
1826 {
1827 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1828 src1_ptr, src2_stride,
1829 dst, dst_stride, filter, height,
1830 weight0, weight1, offset0, offset1,
1831 rnd_val, 16);
1832 }
1833
1834 static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
1835 int32_t src_stride,
1836 int16_t *src1_ptr,
1837 int32_t src2_stride,
1838 uint8_t *dst,
1839 int32_t dst_stride,
1840 const int8_t *filter,
1841 int32_t height,
1842 int32_t weight0,
1843 int32_t weight1,
1844 int32_t offset0,
1845 int32_t offset1,
1846 int32_t rnd_val)
1847 {
1848 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1849 src1_ptr, src2_stride,
1850 dst, dst_stride, filter, height,
1851 weight0, weight1, offset0, offset1,
1852 rnd_val, 16);
1853 hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
1854 src1_ptr + 16, src2_stride,
1855 dst + 16, dst_stride, filter, height,
1856 weight0, weight1, offset0, offset1, rnd_val);
1857 }
1858
1859 static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
1860 int32_t src_stride,
1861 int16_t *src1_ptr,
1862 int32_t src2_stride,
1863 uint8_t *dst,
1864 int32_t dst_stride,
1865 const int8_t *filter,
1866 int32_t height,
1867 int32_t weight0,
1868 int32_t weight1,
1869 int32_t offset0,
1870 int32_t offset1,
1871 int32_t rnd_val)
1872 {
1873 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1874 src1_ptr, src2_stride,
1875 dst, dst_stride, filter, height,
1876 weight0, weight1, offset0, offset1,
1877 rnd_val, 32);
1878 }
1879
1880 static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1881 int32_t src_stride,
1882 int16_t *src1_ptr,
1883 int32_t src2_stride,
1884 uint8_t *dst,
1885 int32_t dst_stride,
1886 const int8_t *filter,
1887 int32_t height,
1888 int32_t weight0,
1889 int32_t weight1,
1890 int32_t offset0,
1891 int32_t offset1,
1892 int32_t rnd_val)
1893 {
1894 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1895 src1_ptr, src2_stride,
1896 dst, dst_stride, filter, height,
1897 weight0, weight1, offset0, offset1,
1898 rnd_val, 48);
1899 }
1900
1901 static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1902 int32_t src_stride,
1903 int16_t *src1_ptr,
1904 int32_t src2_stride,
1905 uint8_t *dst,
1906 int32_t dst_stride,
1907 const int8_t *filter,
1908 int32_t height,
1909 int32_t weight0,
1910 int32_t weight1,
1911 int32_t offset0,
1912 int32_t offset1,
1913 int32_t rnd_val)
1914 {
1915 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1916 src1_ptr, src2_stride,
1917 dst, dst_stride, filter, height,
1918 weight0, weight1, offset0, offset1,
1919 rnd_val, 64);
1920 }
1921
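/* 2-D (horizontal then vertical) 8-tap filter with biweighting, 4-wide
 * blocks.  Two source rows share one register in the horizontal stage
 * (shuffle masks from ff_hevc_mask_arr + 16); the vertical stage then runs
 * on interleaved 16-bit intermediates with 32-bit accumulation. */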
1922 static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1923 int32_t src_stride,
1924 int16_t *src1_ptr,
1925 int32_t src2_stride,
1926 uint8_t *dst,
1927 int32_t dst_stride,
1928 const int8_t *filter_x,
1929 const int8_t *filter_y,
1930 int32_t height,
1931 int32_t weight0,
1932 int32_t weight1,
1933 int32_t offset0,
1934 int32_t offset1,
1935 int32_t rnd_val)
1936 {
1937 uint32_t loop_cnt;
1938 uint64_t tp0, tp1;
1939 int32_t offset, weight;
1940 v16u8 out;
1941 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1942 v8i16 in0 = { 0 }, in1 = { 0 };
1943 v8i16 filt0, filt1, filt2, filt3;
1944 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1945 v16i8 mask1, mask2, mask3;
1946 v8i16 filter_vec, weight_vec;
1947 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1948 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1949 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1950 v8i16 tmp0, tmp1, tmp2, tmp3;
1951 v8i16 dst10, dst32, dst54, dst76;
1952 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
1953 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
1954 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1955
1956 src0_ptr -= ((3 * src_stride) + 3);
1957
1958 filter_vec = LD_SH(filter_x);
1959 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1960
1961 filter_vec = LD_SH(filter_y);
1962 UNPCK_R_SB_SH(filter_vec, filter_vec);
1963
1964 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1965
1966 mask1 = mask0 + 2;
1967 mask2 = mask0 + 4;
1968 mask3 = mask0 + 6;
1969
1970 offset = (offset0 + offset1) << rnd_val;
1971 weight0 = weight0 & 0x0000FFFF;
1972 weight = weight0 | (weight1 << 16);
1973
1974 const_vec = __msa_fill_w((128 * weight1));
1975 const_vec <<= 6;
1976 offset_vec = __msa_fill_w(offset);
1977 rnd_vec = __msa_fill_w(rnd_val + 1);
1978 offset_vec += const_vec;
1979 weight_vec = (v8i16) __msa_fill_w(weight);
1980
1981 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1982 src0_ptr += (7 * src_stride);
1983
1984 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1985
1986 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1987 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1988 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1989 vec8, vec9, vec10, vec11);
1990 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1991 vec12, vec13, vec14, vec15);
1992
1993 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1994 filt3);
1995 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1996 filt3);
1997 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1998 filt3);
1999 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2000 filt3);
2001
2002 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2003 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2004 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2005
2006 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2007
2008 for (loop_cnt = height >> 2; loop_cnt--;) {
2009 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2010 src0_ptr += (4 * src_stride);
2011 XORI_B4_128_SB(src7, src8, src9, src10);
2012
2013 LD2(src1_ptr, src2_stride, tp0, tp1);
2014 INSERT_D2_SH(tp0, tp1, in0);
2015 src1_ptr += (2 * src2_stride);
2016 LD2(src1_ptr, src2_stride, tp0, tp1);
2017 INSERT_D2_SH(tp0, tp1, in1);
2018 src1_ptr += (2 * src2_stride);
2019
2020 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2021 vec0, vec1, vec2, vec3);
2022 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2023 vec4, vec5, vec6, vec7);
2024 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2025 filt3);
2026 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2027 filt3);
2028
2029 dst76 = __msa_ilvr_h(dst97, dst66);
2030 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2031 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2032 dst98 = __msa_ilvr_h(dst66, dst108);
2033
2034 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2035 filt_h2, filt_h3);
2036 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2037 filt_h2, filt_h3);
2038 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2039 filt_h2, filt_h3);
2040 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2041 filt_h2, filt_h3);
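        /* Drop the 6 fractional bits of the vertical stage so the result
         * matches the precision of the int16 co-prediction before the
         * weighted combine. */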
2042 SRA_4V(dst0, dst1, dst2, dst3, 6);
2043 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2044 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2045 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2046 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2047 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2048 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2049 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2050 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2051 CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2052 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2053 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2054 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2055 dst += (4 * dst_stride);
2056
2057 dst10 = dst54;
2058 dst32 = dst76;
2059 dst54 = dst98;
2060 dst21 = dst65;
2061 dst43 = dst87;
2062 dst65 = dst109;
2063 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2064 }
2065 }
2066
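/* 2-D 8-tap biweight kernel for widths that are multiples of 8: width8mult
 * stripes of 8 columns, two rows per inner iteration.  The seven
 * horizontally filtered priming rows live in dst0..dst6 and are rotated as
 * the vertical window slides. */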
2067 static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
2068 int32_t src_stride,
2069 int16_t *src1_ptr,
2070 int32_t src2_stride,
2071 uint8_t *dst,
2072 int32_t dst_stride,
2073 const int8_t *filter_x,
2074 const int8_t *filter_y,
2075 int32_t height,
2076 int32_t weight0,
2077 int32_t weight1,
2078 int32_t offset0,
2079 int32_t offset1,
2080 int32_t rnd_val,
2081 int32_t width8mult)
2082 {
2083 uint32_t loop_cnt, cnt;
2084 int32_t offset, weight;
2085 uint8_t *src0_ptr_tmp;
2086 int16_t *src1_ptr_tmp;
2087 uint8_t *dst_tmp;
2088 v16u8 out;
2089 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2090 v8i16 in0, in1;
2091 v8i16 filt0, filt1, filt2, filt3;
2092 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2093 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2094 v16i8 mask1, mask2, mask3;
2095 v8i16 filter_vec, weight_vec;
2096 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2097 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2098 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2099 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2100 v8i16 tmp0, tmp1, tmp2, tmp3;
2101 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2102 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2103 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2104 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2105 v4i32 offset_vec, rnd_vec, const_vec;
2106
2107 src0_ptr -= ((3 * src_stride) + 3);
2108
2109 offset = (offset0 + offset1) << rnd_val;
2110 weight0 = weight0 & 0x0000FFFF;
2111 weight = weight0 | (weight1 << 16);
2112
2113 const_vec = __msa_fill_w((128 * weight1));
2114 const_vec <<= 6;
2115 offset_vec = __msa_fill_w(offset);
2116 rnd_vec = __msa_fill_w(rnd_val + 1);
2117 offset_vec += const_vec;
2118 weight_vec = (v8i16) __msa_fill_w(weight);
2119
2120 filter_vec = LD_SH(filter_x);
2121 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2122
2123 filter_vec = LD_SH(filter_y);
2124 UNPCK_R_SB_SH(filter_vec, filter_vec);
2125
2126 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2127
2128 mask1 = mask0 + 2;
2129 mask2 = mask0 + 4;
2130 mask3 = mask0 + 6;
2131
2132 for (cnt = width8mult; cnt--;) {
2133 src0_ptr_tmp = src0_ptr;
2134 src1_ptr_tmp = src1_ptr;
2135 dst_tmp = dst;
2136
2137 LD_SB7(src0_ptr_tmp, src_stride,
2138 src0, src1, src2, src3, src4, src5, src6);
2139 src0_ptr_tmp += (7 * src_stride);
2140
2141 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2142
2143 /* row 0 row 1 row 2 row 3 */
2144 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2145 vec0, vec1, vec2, vec3);
2146 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2147 vec4, vec5, vec6, vec7);
2148 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2149 vec8, vec9, vec10, vec11);
2150 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2151 vec12, vec13, vec14, vec15);
2152
2153 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2154 filt3);
2155 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2156 filt3);
2157 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2158 filt3);
2159 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2160 filt2, filt3);
2161
2162 /* row 4 row 5 row 6 */
2163 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2164 vec0, vec1, vec2, vec3);
2165 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2166 vec4, vec5, vec6, vec7);
2167 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2168 vec8, vec9, vec10, vec11);
2169
2170 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2171 filt3);
2172 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2173 filt3);
2174 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2175 filt3);
2176
2177 for (loop_cnt = height >> 1; loop_cnt--;) {
2178 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2179 XORI_B2_128_SB(src7, src8);
2180 src0_ptr_tmp += 2 * src_stride;
2181
2182 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2183 src1_ptr_tmp += (2 * src2_stride);
2184
2185 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2186 dst32_r, dst54_r, dst21_r);
2187 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2188 dst32_l, dst54_l, dst21_l);
2189 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2190 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2191
2192 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2193 vec0, vec1, vec2, vec3);
2194 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2195 filt2, filt3);
2196
2197 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2198 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2199 filt_h0, filt_h1, filt_h2, filt_h3);
2200 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2201 filt_h0, filt_h1, filt_h2, filt_h3);
2202
2203 dst0_r >>= 6;
2204 dst0_l >>= 6;
2205
2206 /* row 8 */
2207 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2208 vec0, vec1, vec2, vec3);
2209 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2210 filt2, filt3);
2211
2212 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2213 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2214 filt_h0, filt_h1, filt_h2, filt_h3);
2215 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2216 filt_h0, filt_h1, filt_h2, filt_h3);
2217
2218 dst1_r >>= 6;
2219 dst1_l >>= 6;
2220
2221 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2222 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2223 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2224 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2225 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2226 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2227 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2228 SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2229 CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r);
2230 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2231 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2232 ST_D2(out, 0, 1, dst_tmp, dst_stride);
2233 dst_tmp += (2 * dst_stride);
2234
2235 dst0 = dst2;
2236 dst1 = dst3;
2237 dst2 = dst4;
2238 dst3 = dst5;
2239 dst4 = dst6;
2240 dst5 = dst7;
2241 dst6 = dst8;
2242 }
2243
2244 src0_ptr += 8;
2245 src1_ptr += 8;
2246 dst += 8;
2247 }
2248 }
2249
2250 static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
2251 int32_t src_stride,
2252 int16_t *src1_ptr,
2253 int32_t src2_stride,
2254 uint8_t *dst,
2255 int32_t dst_stride,
2256 const int8_t *filter_x,
2257 const int8_t *filter_y,
2258 int32_t height,
2259 int32_t weight0,
2260 int32_t weight1,
2261 int32_t offset0,
2262 int32_t offset1,
2263 int32_t rnd_val)
2264 {
2265 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2266 src1_ptr, src2_stride,
2267 dst, dst_stride, filter_x, filter_y,
2268 height, weight0, weight1, offset0,
2269 offset1, rnd_val, 1);
2270 }
2271
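/* 12-wide 2-D variant: the first 8 columns run one pass of the 8-column
 * flow, then the remaining 4 columns reuse the paired-row masks
 * (ff_hevc_mask_arr + 16) in the same way as the 4-wide kernel. */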
2272 static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
2273 int32_t src_stride,
2274 int16_t *src1_ptr,
2275 int32_t src2_stride,
2276 uint8_t *dst,
2277 int32_t dst_stride,
2278 const int8_t *filter_x,
2279 const int8_t *filter_y,
2280 int32_t height,
2281 int32_t weight0,
2282 int32_t weight1,
2283 int32_t offset0,
2284 int32_t offset1,
2285 int32_t rnd_val)
2286 {
2287 uint32_t loop_cnt;
2288 uint8_t *src0_ptr_tmp, *dst_tmp;
2289 int16_t *src1_ptr_tmp;
2290 int32_t offset, weight;
2291 uint64_t tp0, tp1;
2292 v16u8 out;
2293 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2294 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2295 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2296 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2297 v8i16 in0 = { 0 }, in1 = { 0 };
2298 v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2299 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2300 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2301 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2302 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2303 v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2304 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2305 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2306
2307 src0_ptr -= ((3 * src_stride) + 3);
2308
2309 offset = (offset0 + offset1) << rnd_val;
2310 weight0 = weight0 & 0x0000FFFF;
2311 weight = weight0 | (weight1 << 16);
2312
2313 const_vec = __msa_fill_w((128 * weight1));
2314 const_vec <<= 6;
2315 offset_vec = __msa_fill_w(offset);
2316 rnd_vec = __msa_fill_w(rnd_val + 1);
2317 offset_vec += const_vec;
2318 weight_vec = (v8i16) __msa_fill_w(weight);
2319
2320 filter_vec = LD_SH(filter_x);
2321 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2322
2323 filter_vec = LD_SH(filter_y);
2324 UNPCK_R_SB_SH(filter_vec, filter_vec);
2325
2326 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2327
2328 mask0 = LD_SB(ff_hevc_mask_arr);
2329 mask1 = mask0 + 2;
2330 mask2 = mask0 + 4;
2331 mask3 = mask0 + 6;
2332
2333 src0_ptr_tmp = src0_ptr;
2334 src1_ptr_tmp = src1_ptr;
2335 dst_tmp = dst;
2336
2337 LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2338 src0_ptr_tmp += (7 * src_stride);
2339 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2340
2341 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2342 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2343 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2344 vec11);
2345 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2346 vec15);
2347 dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2348 filt3);
2349 dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2350 filt3);
2351 dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2352 filt3);
2353 dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2354 filt2, filt3);
2355 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2356 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2357 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2358 vec11);
2359 dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2360 filt3);
2361 dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2362 filt3);
2363 dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2364 filt3);
2365
2366 for (loop_cnt = 8; loop_cnt--;) {
2367 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2368 src0_ptr_tmp += (2 * src_stride);
2369 XORI_B2_128_SB(src7, src8);
2370
2371 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2372 src1_ptr_tmp += (2 * src2_stride);
2373
2374 ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2375 dst10_r, dst32_r, dst54_r, dst21_r);
2376 ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2377 dst10_l, dst32_l, dst54_l, dst21_l);
2378 ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2379 ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2380
2381 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2382 vec3);
2383 dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2384 filt3);
2385
2386 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
2387 dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2388 filt_h1, filt_h2, filt_h3);
2389 dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2390 filt_h1, filt_h2, filt_h3);
2391 dst0 >>= 6;
2392 dst1 >>= 6;
2393
2394 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2395 vec3);
2396 dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2397 filt3);
2398
2399 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
2400 dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2401 filt_h1, filt_h2, filt_h3);
2402 dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2403 filt_h1, filt_h2, filt_h3);
2404 dst2 >>= 6;
2405 dst3 >>= 6;
2406
2407 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2408 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2409 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2410 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2411 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2412 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2413 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2414 SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
2415 CLIP_SW4_0_255(dst1, dst0, dst3, dst2);
2416 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2417 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2418 ST_D2(out, 0, 1, dst_tmp, dst_stride);
2419 dst_tmp += (2 * dst_stride);
2420
2421 dsth0 = dsth2;
2422 dsth1 = dsth3;
2423 dsth2 = dsth4;
2424 dsth3 = dsth5;
2425 dsth4 = dsth6;
2426 dsth5 = dsth7;
2427 dsth6 = dsth8;
2428 }
2429
2430 src0_ptr += 8;
2431 src1_ptr += 8;
2432 dst += 8;
2433
2434 mask4 = LD_SB(ff_hevc_mask_arr + 16);
2435 mask5 = mask4 + 2;
2436 mask6 = mask4 + 4;
2437 mask7 = mask4 + 6;
2438
2439 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2440 src0_ptr += (7 * src_stride);
2441 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2442
2443 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2444 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2445 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2446 vec11);
2447 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2448 vec15);
2449 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2450 filt3);
2451 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2452 filt3);
2453 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2454 filt3);
2455 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2456 filt3);
2457 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2458 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2459 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2460
2461 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2462
2463 for (loop_cnt = 4; loop_cnt--;) {
2464 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2465 src0_ptr += (4 * src_stride);
2466 XORI_B4_128_SB(src7, src8, src9, src10);
2467
2468 LD2(src1_ptr, src2_stride, tp0, tp1);
2469 INSERT_D2_SH(tp0, tp1, in0);
2470 src1_ptr += (2 * src2_stride);
2471 LD2(src1_ptr, src2_stride, tp0, tp1);
2472 INSERT_D2_SH(tp0, tp1, in1);
2473 src1_ptr += (2 * src2_stride);
2474
2475 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2476 vec3);
2477 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2478 vec7);
2479 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2480 filt3);
2481 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2482 filt3);
2483
2484 dst76 = __msa_ilvr_h(dst97, dst66);
2485 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2486 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2487 dst98 = __msa_ilvr_h(dst66, dst108);
2488
2489 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2490 filt_h2, filt_h3);
2491 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2492 filt_h2, filt_h3);
2493 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2494 filt_h2, filt_h3);
2495 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2496 filt_h2, filt_h3);
2497 SRA_4V(dst0, dst1, dst2, dst3, 6);
2498 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2499 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2500 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2501 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2502 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2503 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2504 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2505 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2506 CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2507 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2508 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2509 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2510 dst += (4 * dst_stride);
2511
2512 dst10 = dst54;
2513 dst32 = dst76;
2514 dst54 = dst98;
2515 dst21 = dst65;
2516 dst43 = dst87;
2517 dst65 = dst109;
2518 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2519 }
2520 }
2521
2522 static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
2523 int32_t src_stride,
2524 int16_t *src1_ptr,
2525 int32_t src2_stride,
2526 uint8_t *dst,
2527 int32_t dst_stride,
2528 const int8_t *filter_x,
2529 const int8_t *filter_y,
2530 int32_t height,
2531 int32_t weight0,
2532 int32_t weight1,
2533 int32_t offset0,
2534 int32_t offset1,
2535 int32_t rnd_val)
2536 {
2537 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2538 src1_ptr, src2_stride,
2539 dst, dst_stride, filter_x, filter_y,
2540 height, weight0, weight1, offset0,
2541 offset1, rnd_val, 2);
2542 }
2543
2544 static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
2545 int32_t src_stride,
2546 int16_t *src1_ptr,
2547 int32_t src2_stride,
2548 uint8_t *dst,
2549 int32_t dst_stride,
2550 const int8_t *filter_x,
2551 const int8_t *filter_y,
2552 int32_t height,
2553 int32_t weight0,
2554 int32_t weight1,
2555 int32_t offset0,
2556 int32_t offset1,
2557 int32_t rnd_val)
2558 {
2559 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2560 src1_ptr, src2_stride,
2561 dst, dst_stride, filter_x, filter_y,
2562 height, weight0, weight1, offset0,
2563 offset1, rnd_val, 3);
2564 }
2565
2566 static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
2567 int32_t src_stride,
2568 int16_t *src1_ptr,
2569 int32_t src2_stride,
2570 uint8_t *dst,
2571 int32_t dst_stride,
2572 const int8_t *filter_x,
2573 const int8_t *filter_y,
2574 int32_t height,
2575 int32_t weight0,
2576 int32_t weight1,
2577 int32_t offset0,
2578 int32_t offset1,
2579 int32_t rnd_val)
2580 {
2581 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2582 src1_ptr, src2_stride,
2583 dst, dst_stride, filter_x, filter_y,
2584 height, weight0, weight1, offset0,
2585 offset1, rnd_val, 4);
2586 }
2587
2588 static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
2589 int32_t src_stride,
2590 int16_t *src1_ptr,
2591 int32_t src2_stride,
2592 uint8_t *dst,
2593 int32_t dst_stride,
2594 const int8_t *filter_x,
2595 const int8_t *filter_y,
2596 int32_t height,
2597 int32_t weight0,
2598 int32_t weight1,
2599 int32_t offset0,
2600 int32_t offset1,
2601 int32_t rnd_val)
2602 {
2603 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2604 src1_ptr, src2_stride,
2605 dst, dst_stride, filter_x, filter_y,
2606 height, weight0, weight1, offset0,
2607 offset1, rnd_val, 6);
2608 }
2609
2610 static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
2611 int32_t src_stride,
2612 int16_t *src1_ptr,
2613 int32_t src2_stride,
2614 uint8_t *dst,
2615 int32_t dst_stride,
2616 const int8_t *filter_x,
2617 const int8_t *filter_y,
2618 int32_t height,
2619 int32_t weight0,
2620 int32_t weight1,
2621 int32_t offset0,
2622 int32_t offset1,
2623 int32_t rnd_val)
2624 {
2625 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2626 src1_ptr, src2_stride,
2627 dst, dst_stride, filter_x, filter_y,
2628 height, weight0, weight1, offset0,
2629 offset1, rnd_val, 8);
2630 }
2631
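/* Horizontal 4-tap biweight kernels start here.  4x2: both rows are
 * filtered from a single register pair using the dual-row shuffle masks. */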
2632 static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
2633 int32_t src_stride,
2634 int16_t *src1_ptr,
2635 int32_t src2_stride,
2636 uint8_t *dst,
2637 int32_t dst_stride,
2638 const int8_t *filter,
2639 int32_t weight0,
2640 int32_t weight1,
2641 int32_t offset0,
2642 int32_t offset1,
2643 int32_t rnd_val)
2644 {
2645 int32_t offset, weight, constant;
2646 v8i16 filt0, filt1;
2647 v16i8 src0, src1;
2648 v8i16 in0, in1;
2649 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2650 v16i8 mask1, vec0, vec1;
2651 v8i16 dst0;
2652 v4i32 dst0_r, dst0_l;
2653 v8i16 out0, filter_vec;
2654 v4i32 weight_vec, offset_vec, rnd_vec;
2655
2656 src0_ptr -= 1;
2657
2658 filter_vec = LD_SH(filter);
2659 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2660
2661 mask1 = mask0 + 2;
2662
2663 offset = (offset0 + offset1) << rnd_val;
2664 weight0 = weight0 & 0x0000FFFF;
2665 weight = weight0 | (weight1 << 16);
2666 constant = 128 * weight1;
2667 constant <<= 6;
2668 offset += constant;
2669
2670 offset_vec = __msa_fill_w(offset);
2671 weight_vec = __msa_fill_w(weight);
2672 rnd_vec = __msa_fill_w(rnd_val + 1);
2673
2674 LD_SB2(src0_ptr, src_stride, src0, src1);
2675 LD_SH2(src1_ptr, src2_stride, in0, in1);
2676 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2677 XORI_B2_128_SB(src0, src1);
2678
2679 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2680 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2681
2682 ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
2683 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2684 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2685 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2686 out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2687 CLIP_SH_0_255(out0);
2688 out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2689 ST_W2(out0, 0, 1, dst, dst_stride);
2690 }
2691
2692 static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
2693 int32_t src_stride,
2694 int16_t *src1_ptr,
2695 int32_t src2_stride,
2696 uint8_t *dst,
2697 int32_t dst_stride,
2698 const int8_t *filter,
2699 int32_t weight0,
2700 int32_t weight1,
2701 int32_t offset0,
2702 int32_t offset1,
2703 int32_t rnd_val)
2704 {
2705 int32_t offset, weight, constant;
2706 v8i16 filt0, filt1;
2707 v16i8 src0, src1, src2, src3;
2708 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2709 v16i8 mask1;
2710 v8i16 dst0, dst1;
2711 v16i8 vec0, vec1;
2712 v8i16 in0, in1, in2, in3;
2713 v8i16 filter_vec;
2714 v4i32 weight_vec, offset_vec, rnd_vec;
2715
2716 src0_ptr -= 1;
2717
2718 /* rearranging filter */
2719 filter_vec = LD_SH(filter);
2720 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2721
2722 mask1 = mask0 + 2;
2723
2724 offset = (offset0 + offset1) << rnd_val;
2725 weight0 = weight0 & 0x0000FFFF;
2726 weight = weight0 | (weight1 << 16);
2727 constant = 128 * weight1;
2728 constant <<= 6;
2729 offset += constant;
2730
2731 offset_vec = __msa_fill_w(offset);
2732 weight_vec = __msa_fill_w(weight);
2733 rnd_vec = __msa_fill_w(rnd_val + 1);
2734
2735 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2736 XORI_B4_128_SB(src0, src1, src2, src3);
2737 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2738 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2739
2740 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2741 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2742 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2743 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2744 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2745 weight_vec, rnd_vec, offset_vec,
2746 dst0, dst1);
2747
2748 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2749 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2750 }
2751
2752 static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
2753 int32_t src_stride,
2754 int16_t *src1_ptr,
2755 int32_t src2_stride,
2756 uint8_t *dst,
2757 int32_t dst_stride,
2758 const int8_t *filter,
2759 int32_t height,
2760 int32_t weight0,
2761 int32_t weight1,
2762 int32_t offset0,
2763 int32_t offset1,
2764 int32_t rnd_val)
2765 {
2766 uint32_t loop_cnt;
2767 int32_t weight, offset, constant;
2768 v8i16 filt0, filt1;
2769 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2770 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2771 v16i8 mask1;
2772 v16i8 vec0, vec1;
2773 v8i16 dst0, dst1, dst2, dst3;
2774 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2775 v8i16 filter_vec;
2776 v4i32 weight_vec, offset_vec, rnd_vec;
2777
2778 src0_ptr -= 1;
2779
2780 filter_vec = LD_SH(filter);
2781 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2782
2783 offset = (offset0 + offset1) << rnd_val;
2784 weight0 = weight0 & 0x0000FFFF;
2785 weight = weight0 | (weight1 << 16);
2786 constant = 128 * weight1;
2787 constant <<= 6;
2788 offset += constant;
2789
2790 offset_vec = __msa_fill_w(offset);
2791 weight_vec = __msa_fill_w(weight);
2792 rnd_vec = __msa_fill_w(rnd_val + 1);
2793
2794 mask1 = mask0 + 2;
2795
2796 for (loop_cnt = (height >> 3); loop_cnt--;) {
2797 LD_SB8(src0_ptr, src_stride,
2798 src0, src1, src2, src3, src4, src5, src6, src7);
2799 src0_ptr += (8 * src_stride);
2800 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2801 src1_ptr += (4 * src2_stride);
2802 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2803 src1_ptr += (4 * src2_stride);
2804 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2805 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2806 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2807
2808 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2809 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2810 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2811 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2812 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2813 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2814 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2815 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2816 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2817 in0, in1, in2, in3,
2818 weight_vec, rnd_vec, offset_vec,
2819 dst0, dst1, dst2, dst3);
2820
2821 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2822 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2823 dst += (8 * dst_stride);
2824 }
2825 }
2826
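/* 4-wide dispatcher: heights 2 and 4 have dedicated kernels; any other
 * height is expected to be a multiple of 8. */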
2827 static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
2828 int32_t src_stride,
2829 int16_t *src1_ptr,
2830 int32_t src2_stride,
2831 uint8_t *dst,
2832 int32_t dst_stride,
2833 const int8_t *filter,
2834 int32_t height,
2835 int32_t weight0,
2836 int32_t weight1,
2837 int32_t offset0,
2838 int32_t offset1,
2839 int32_t rnd_val)
2840 {
2841 if (2 == height) {
2842 hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2843 dst, dst_stride, filter,
2844 weight0, weight1, offset0, offset1, rnd_val);
2845 } else if (4 == height) {
2846 hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2847 dst, dst_stride, filter,
2848 weight0, weight1, offset0, offset1, rnd_val);
2849 } else if (0 == (height % 8)) {
2850 hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
2851 src1_ptr, src2_stride,
2852 dst, dst_stride, filter, height,
2853 weight0, weight1, offset0, offset1,
2854 rnd_val);
2855 }
2856 }
2857
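/* 6-wide: each row is filtered as 8 pixels, then stored as a 4-byte word
 * plus a 2-byte halfword per row. */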
2858 static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
2859 int32_t src_stride,
2860 int16_t *src1_ptr,
2861 int32_t src2_stride,
2862 uint8_t *dst,
2863 int32_t dst_stride,
2864 const int8_t *filter,
2865 int32_t height,
2866 int32_t weight0,
2867 int32_t weight1,
2868 int32_t offset0,
2869 int32_t offset1,
2870 int32_t rnd_val)
2871 {
2872 uint32_t loop_cnt;
2873 int32_t offset, weight, constant;
2874 v8i16 filt0, filt1;
2875 v16i8 src0, src1, src2, src3;
2876 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2877 v16i8 mask1;
2878 v16i8 vec0, vec1;
2879 v8i16 in0, in1, in2, in3;
2880 v8i16 dst0, dst1, dst2, dst3;
2881 v8i16 filter_vec;
2882 v4i32 weight_vec, offset_vec, rnd_vec;
2883
2884 src0_ptr -= 1;
2885
2886 filter_vec = LD_SH(filter);
2887 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2888
2889 offset = (offset0 + offset1) << rnd_val;
2890 weight0 = weight0 & 0x0000FFFF;
2891 weight = weight0 | (weight1 << 16);
2892 constant = 128 * weight1;
2893 constant <<= 6;
2894 offset += constant;
2895
2896 offset_vec = __msa_fill_w(offset);
2897 weight_vec = __msa_fill_w(weight);
2898 rnd_vec = __msa_fill_w(rnd_val + 1);
2899
2900 mask1 = mask0 + 2;
2901
2902 for (loop_cnt = 2; loop_cnt--;) {
2903 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2904 src0_ptr += (4 * src_stride);
2905 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2906 src1_ptr += (4 * src2_stride);
2907 XORI_B4_128_SB(src0, src1, src2, src3);
2908
2909 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2910 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2911 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2912 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2913 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2914 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2915 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2916 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2917
2918 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2919 in0, in1, in2, in3,
2920 weight_vec, rnd_vec, offset_vec,
2921 dst0, dst1, dst2, dst3);
2922
2923 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2924 ST_W2(dst0, 0, 2, dst, dst_stride);
2925 ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2926 ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2927 ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2928 dst += (4 * dst_stride);
2929 }
2930 }
2931
2932 static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
2933 int32_t src_stride,
2934 int16_t *src1_ptr,
2935 int32_t src2_stride,
2936 uint8_t *dst,
2937 int32_t dst_stride,
2938 const int8_t *filter,
2939 int32_t weight0,
2940 int32_t weight1,
2941 int32_t offset0,
2942 int32_t offset1,
2943 int32_t rnd_val)
2944 {
2945 int32_t offset, weight, constant;
2946 v8i16 filt0, filt1;
2947 v16i8 src0, src1;
2948 v8i16 in0, in1;
2949 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2950 v16i8 mask1, vec0, vec1;
2951 v8i16 dst0, dst1;
2952 v8i16 filter_vec;
2953 v4i32 weight_vec, offset_vec, rnd_vec;
2954
2955 src0_ptr -= 1;
2956
2957 filter_vec = LD_SH(filter);
2958 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2959
2960 offset = (offset0 + offset1) << rnd_val;
2961 weight0 = weight0 & 0x0000FFFF;
2962 weight = weight0 | (weight1 << 16);
2963 constant = 128 * weight1;
2964 constant <<= 6;
2965 offset += constant;
2966
2967 offset_vec = __msa_fill_w(offset);
2968 weight_vec = __msa_fill_w(weight);
2969 rnd_vec = __msa_fill_w(rnd_val + 1);
2970
2971 mask1 = mask0 + 2;
2972
2973 LD_SB2(src0_ptr, src_stride, src0, src1);
2974 LD_SH2(src1_ptr, src2_stride, in0, in1);
2975 XORI_B2_128_SB(src0, src1);
2976 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2977 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2978 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2979 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2980 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2981 weight_vec, rnd_vec, offset_vec,
2982 dst0, dst1);
2983
2984 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2985 ST_D2(dst0, 0, 1, dst, dst_stride);
2986 }
2987
2988 static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
2989 int32_t src_stride,
2990 int16_t *src1_ptr,
2991 int32_t src2_stride,
2992 uint8_t *dst,
2993 int32_t dst_stride,
2994 const int8_t *filter,
2995 int32_t weight0,
2996 int32_t weight1,
2997 int32_t offset0,
2998 int32_t offset1,
2999 int32_t rnd_val)
3000 {
3001 int32_t weight, offset, constant;
3002 v8i16 filt0, filt1;
3003 v16i8 src0, src1, src2, src3, src4, src5;
3004 v8i16 in0, in1, in2, in3, in4, in5;
3005 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3006 v16i8 mask1;
3007 v16i8 vec0, vec1;
3008 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3009 v8i16 filter_vec;
3010 v4i32 weight_vec, offset_vec, rnd_vec;
3011
3012 src0_ptr -= 1;
3013
3014 filter_vec = LD_SH(filter);
3015 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3016
3017 offset = (offset0 + offset1) << rnd_val;
3018 weight0 = weight0 & 0x0000FFFF;
3019 weight = weight0 | (weight1 << 16);
3020 constant = 128 * weight1;
3021 constant <<= 6;
3022 offset += constant;
3023
3024 offset_vec = __msa_fill_w(offset);
3025 weight_vec = __msa_fill_w(weight);
3026 rnd_vec = __msa_fill_w(rnd_val + 1);
3027
3028 mask1 = mask0 + 2;
3029
3030 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
3031
3032 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3033 src1_ptr += (4 * src2_stride);
3034 LD_SH2(src1_ptr, src2_stride, in4, in5);
3035 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3036 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3037 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3038 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3039 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3040 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3041 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3042 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3043 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3044 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3045 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3046 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3047 dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3048 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3049 in0, in1, in2, in3,
3050 weight_vec, rnd_vec, offset_vec,
3051 dst0, dst1, dst2, dst3);
3052 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3053 weight_vec, rnd_vec, offset_vec,
3054 dst4, dst5);
3055
3056 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3057 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3058 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3059 ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
3060 }
3061
3062 static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3063 int32_t src_stride,
3064 int16_t *src1_ptr,
3065 int32_t src2_stride,
3066 uint8_t *dst,
3067 int32_t dst_stride,
3068 const int8_t *filter,
3069 int32_t height,
3070 int32_t weight0,
3071 int32_t weight1,
3072 int32_t offset0,
3073 int32_t offset1,
3074 int32_t rnd_val)
3075 {
3076 uint32_t loop_cnt;
3077 int32_t offset, weight, constant;
3078 v8i16 filt0, filt1;
3079 v16i8 src0, src1, src2, src3;
3080 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3081 v16i8 mask1;
3082 v16i8 vec0, vec1;
3083 v8i16 in0, in1, in2, in3;
3084 v8i16 dst0, dst1, dst2, dst3;
3085 v8i16 filter_vec;
3086 v4i32 weight_vec, offset_vec, rnd_vec;
3087
3088 src0_ptr -= 1;
3089
3090 filter_vec = LD_SH(filter);
3091 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3092
3093 offset = (offset0 + offset1) << rnd_val;
3094 weight0 = weight0 & 0x0000FFFF;
3095 weight = weight0 | (weight1 << 16);
3096 constant = 128 * weight1;
3097 constant <<= 6;
3098 offset += constant;
3099
3100 offset_vec = __msa_fill_w(offset);
3101 weight_vec = __msa_fill_w(weight);
3102 rnd_vec = __msa_fill_w(rnd_val + 1);
3103
3104 mask1 = mask0 + 2;
3105
3106 for (loop_cnt = (height >> 2); loop_cnt--;) {
3107 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3108 src0_ptr += (4 * src_stride);
3109 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3110 src1_ptr += (4 * src2_stride);
3111 XORI_B4_128_SB(src0, src1, src2, src3);
3112
3113 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3114 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3115 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3116 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3117 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3118 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3119 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3120 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3121 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3122 in0, in1, in2, in3,
3123 weight_vec, rnd_vec, offset_vec,
3124 dst0, dst1, dst2, dst3);
3125
3126 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3127 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3128 dst += (4 * dst_stride);
3129 }
3130 }
3131
3132 static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
3133 int32_t src_stride,
3134 int16_t *src1_ptr,
3135 int32_t src2_stride,
3136 uint8_t *dst,
3137 int32_t dst_stride,
3138 const int8_t *filter,
3139 int32_t height,
3140 int32_t weight0,
3141 int32_t weight1,
3142 int32_t offset0,
3143 int32_t offset1,
3144 int32_t rnd_val)
3145 {
3146 if (2 == height) {
3147 hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3148 dst, dst_stride, filter,
3149 weight0, weight1, offset0, offset1, rnd_val);
3150 } else if (6 == height) {
3151 hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3152 dst, dst_stride, filter,
3153 weight0, weight1, offset0, offset1, rnd_val);
3154 } else if (0 == (height % 4)) {
3155 hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
3156 src1_ptr, src2_stride,
3157 dst, dst_stride, filter, height,
3158 weight0, weight1, offset0, offset1,
3159 rnd_val);
3160 }
3161 }
3162
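/*
 * 12-wide rows are handled as an 8-wide column plus a 4-wide column.
 * VSHF indices of 16 and above pick bytes from the second source
 * register, so mask2/mask3 gather the 4-wide tails of two neighbouring
 * rows into one vector; ILVR_D2_SH repacks the matching int16
 * references the same way before the shared weighting step.
 */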
3163 static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
3164 int32_t src_stride,
3165 int16_t *src1_ptr,
3166 int32_t src2_stride,
3167 uint8_t *dst,
3168 int32_t dst_stride,
3169 const int8_t *filter,
3170 int32_t height,
3171 int32_t weight0,
3172 int32_t weight1,
3173 int32_t offset0,
3174 int32_t offset1,
3175 int32_t rnd_val)
3176 {
3177 uint32_t loop_cnt;
3178 int32_t offset, weight, constant;
3179 v8i16 filt0, filt1;
3180 v16i8 src0, src1, src2, src3;
3181 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3182 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3183 v16i8 mask2 = {
3184 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3185 };
3186 v16i8 mask1, mask3;
3187 v16i8 vec0, vec1;
3188 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3189 v8i16 filter_vec;
3190 v4i32 weight_vec, offset_vec, rnd_vec;
3191
3192 src0_ptr -= 1;
3193
3194 filter_vec = LD_SH(filter);
3195 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3196
3197 offset = (offset0 + offset1) << rnd_val;
3198 weight0 = weight0 & 0x0000FFFF;
3199 weight = weight0 | (weight1 << 16);
3200 constant = 128 * weight1;
3201 constant <<= 6;
3202 offset += constant;
3203
3204 offset_vec = __msa_fill_w(offset);
3205 weight_vec = __msa_fill_w(weight);
3206 rnd_vec = __msa_fill_w(rnd_val + 1);
3207
3208 mask1 = mask0 + 2;
3209 mask3 = mask2 + 2;
3210
3211 for (loop_cnt = 4; loop_cnt--;) {
3212 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3213 src0_ptr += (4 * src_stride);
3214 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3215 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3216 src1_ptr += (4 * src2_stride);
3217 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3218 XORI_B4_128_SB(src0, src1, src2, src3);
3219
3220 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3221 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3222 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3223 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3224 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3225 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3226 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3227 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3228 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3229 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3230 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3231 dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3232
3233 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3234 in0, in1, in2, in3,
3235 weight_vec, rnd_vec, offset_vec,
3236 dst0, dst1, dst2, dst3);
3237 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3238 weight_vec, rnd_vec, offset_vec,
3239 dst4, dst5);
3240
3241 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3242 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3243 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3244 ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
3245 dst += (4 * dst_stride);
3246 }
3247 }
3248
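/*
 * 16-wide rows: four rows per iteration, each split into an 8-pixel
 * left half (src0, src2, ...) and right half (src1, src3, ...), so the
 * weighting macro runs twice and ST_SH2 writes two complete rows at a
 * time.
 */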
3249 static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
3250 int32_t src_stride,
3251 int16_t *src1_ptr,
3252 int32_t src2_stride,
3253 uint8_t *dst,
3254 int32_t dst_stride,
3255 const int8_t *filter,
3256 int32_t height,
3257 int32_t weight0,
3258 int32_t weight1,
3259 int32_t offset0,
3260 int32_t offset1,
3261 int32_t rnd_val)
3262 {
3263 uint32_t loop_cnt;
3264 int32_t offset, weight, constant;
3265 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3266 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3267 v8i16 filt0, filt1;
3268 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3269 v16i8 mask1;
3270 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3271 v16i8 vec0, vec1;
3272 v8i16 filter_vec;
3273 v4i32 weight_vec, offset_vec, rnd_vec;
3274
3275 src0_ptr -= 1;
3276
3277 filter_vec = LD_SH(filter);
3278 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3279
3280 offset = (offset0 + offset1) << rnd_val;
3281 weight0 = weight0 & 0x0000FFFF;
3282 weight = weight0 | (weight1 << 16);
3283 constant = 128 * weight1;
3284 constant <<= 6;
3285 offset += constant;
3286
3287 offset_vec = __msa_fill_w(offset);
3288 weight_vec = __msa_fill_w(weight);
3289 rnd_vec = __msa_fill_w(rnd_val + 1);
3290
3291 mask1 = mask0 + 2;
3292
3293 for (loop_cnt = (height >> 2); loop_cnt--;) {
3294 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3295 LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
3296 src0_ptr += (4 * src_stride);
3297 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3298 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3299 src1_ptr += (4 * src2_stride);
3300 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3301
3302 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3303 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3304 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3305 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3306 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3307 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3308 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3309 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3310 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3311 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3312 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3313 dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3314 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3315 dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3316 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3317 dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3318 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3319 in0, in1, in2, in3,
3320 weight_vec, rnd_vec, offset_vec,
3321 dst0, dst1, dst2, dst3);
3322
3323 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3324 ST_SH2(dst0, dst1, dst, dst_stride);
3325 dst += (2 * dst_stride);
3326
3327 HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
3328 in4, in5, in6, in7,
3329 weight_vec, rnd_vec, offset_vec,
3330 dst0, dst1, dst2, dst3);
3331
3332 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3333 ST_SH2(dst0, dst1, dst, dst_stride);
3334 dst += (2 * dst_stride);
3335 }
3336 }
3337
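/*
 * 24-wide rows: a 16-wide part and an 8-wide part.  mask2/mask3
 * (mask0 + 8 / + 10) straddle the boundary between the two source
 * registers via the two-operand VSHF form, so pixels 8..15 can be
 * filtered without an extra unaligned load.
 */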
3338 static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
3339 int32_t src_stride,
3340 int16_t *src1_ptr,
3341 int32_t src2_stride,
3342 uint8_t *dst,
3343 int32_t dst_stride,
3344 const int8_t *filter,
3345 int32_t height,
3346 int32_t weight0,
3347 int32_t weight1,
3348 int32_t offset0,
3349 int32_t offset1,
3350 int32_t rnd_val)
3351 {
3352 uint32_t loop_cnt;
3353 int32_t offset, weight, constant;
3354 v16i8 src0, src1, src2, src3;
3355 v8i16 filt0, filt1;
3356 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3357 v16i8 mask1, mask2, mask3;
3358 v16i8 vec0, vec1;
3359 v8i16 dst0, dst1, dst2, dst3;
3360 v8i16 in0, in1, in2, in3, in4, in5;
3361 v8i16 filter_vec;
3362 v4i32 weight_vec, offset_vec, rnd_vec;
3363
3364 src0_ptr -= 1;
3365
3366 filter_vec = LD_SH(filter);
3367 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3368
3369 offset = (offset0 + offset1) << rnd_val;
3370 weight0 = weight0 & 0x0000FFFF;
3371 weight = weight0 | (weight1 << 16);
3372 constant = 128 * weight1;
3373 constant <<= 6;
3374 offset += constant;
3375
3376 offset_vec = __msa_fill_w(offset);
3377 weight_vec = __msa_fill_w(weight);
3378 rnd_vec = __msa_fill_w(rnd_val + 1);
3379
3380 mask1 = mask0 + 2;
3381 mask2 = mask0 + 8;
3382 mask3 = mask0 + 10;
3383
3384 for (loop_cnt = 16; loop_cnt--;) {
3385 LD_SB2(src0_ptr, src_stride, src0, src2);
3386 LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3387 src0_ptr += (2 * src_stride);
3388 LD_SH2(src1_ptr, src2_stride, in0, in2);
3389 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3390 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3391 src1_ptr += (2 * src2_stride);
3392 XORI_B4_128_SB(src0, src1, src2, src3);
3393
3394 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3395 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3396 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3397 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3398 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3399 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3400 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3401 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3402 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3403 in0, in1, in2, in3,
3404 weight_vec, rnd_vec, offset_vec,
3405 dst0, dst1, dst2, dst3);
3406
3407 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3408 ST_SH2(dst0, dst1, dst, dst_stride);
3409
3410 /* 8 width */
3411 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3412 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3413 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3414 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3415 HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
3416 weight_vec, rnd_vec, offset_vec,
3417 dst0, dst1);
3418
3419 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3420 ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
3421 dst += (2 * dst_stride);
3422 }
3423 }
3424
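/*
 * 32-wide rows, one row per iteration.  Pixels 24..31 need source
 * bytes up to offset 34 of the shifted row, hence the extra unaligned
 * load of src2 at +24; the 8..15 band is covered by the two-register
 * shuffle through mask2/mask3 as in the 24-wide case.
 */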
3425 static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
3426 int32_t src_stride,
3427 int16_t *src1_ptr,
3428 int32_t src2_stride,
3429 uint8_t *dst,
3430 int32_t dst_stride,
3431 const int8_t *filter,
3432 int32_t height,
3433 int32_t weight0,
3434 int32_t weight1,
3435 int32_t offset0,
3436 int32_t offset1,
3437 int32_t rnd_val)
3438 {
3439 uint32_t loop_cnt;
3440 int32_t offset, weight, constant;
3441 v16i8 src0, src1, src2;
3442 v8i16 filt0, filt1;
3443 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3444 v16i8 mask1, mask2, mask3;
3445 v8i16 dst0, dst1, dst2, dst3;
3446 v16i8 vec0, vec1;
3447 v8i16 in0, in1, in2, in3;
3448 v8i16 filter_vec;
3449 v4i32 weight_vec, offset_vec, rnd_vec;
3450
3451 src0_ptr -= 1;
3452
3453 filter_vec = LD_SH(filter);
3454 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3455
3456 offset = (offset0 + offset1) << rnd_val;
3457 weight0 = weight0 & 0x0000FFFF;
3458 weight = weight0 | (weight1 << 16);
3459 constant = 128 * weight1;
3460 constant <<= 6;
3461 offset += constant;
3462
3463 offset_vec = __msa_fill_w(offset);
3464 weight_vec = __msa_fill_w(weight);
3465 rnd_vec = __msa_fill_w(rnd_val + 1);
3466
3467 mask1 = mask0 + 2;
3468 mask2 = mask0 + 8;
3469 mask3 = mask0 + 10;
3470
3471 for (loop_cnt = height; loop_cnt--;) {
3472 LD_SB2(src0_ptr, 16, src0, src1);
3473 src2 = LD_SB(src0_ptr + 24);
3474 src0_ptr += src_stride;
3475 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3476 src1_ptr += src2_stride;
3477 XORI_B3_128_SB(src0, src1, src2);
3478
3479 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3480 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3481 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3482 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3483 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3484 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3485 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3486 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3487 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3488 in0, in1, in2, in3,
3489 weight_vec, rnd_vec, offset_vec,
3490 dst0, dst1, dst2, dst3);
3491
3492 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3493 ST_SH2(dst0, dst1, dst, 16);
3494 dst += dst_stride;
3495 }
3496 }
3497
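/*
 * Vertical kernels: consecutive rows are interleaved byte-wise
 * (ILVR_B*), so each halfword lane again holds a (row n, row n+1) pair
 * and the same two-dot-product 4-tap helper walks down a column.
 * Scalar view of one output pixel (a sketch, s = src_stride):
 *
 *     sum = c0 * p[-s] + c1 * p[0] + c2 * p[s] + c3 * p[2 * s];
 *
 * For 4-wide blocks two interleaved row pairs are further packed into
 * one register with ilvr_d, halving the filtering work.
 */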
3498 static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
3499 int32_t src_stride,
3500 int16_t *src1_ptr,
3501 int32_t src2_stride,
3502 uint8_t *dst,
3503 int32_t dst_stride,
3504 const int8_t *filter,
3505 int32_t weight0,
3506 int32_t weight1,
3507 int32_t offset0,
3508 int32_t offset1,
3509 int32_t rnd_val)
3510 {
3511 int32_t weight, offset, constant;
3512 v16i8 src0, src1, src2, src3, src4;
3513 v8i16 in0, in1, dst10;
3514 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3515 v4i32 dst10_r, dst10_l;
3516 v8i16 filt0, filt1;
3517 v8i16 filter_vec, out;
3518 v4i32 weight_vec, offset_vec, rnd_vec;
3519
3520 src0_ptr -= src_stride;
3521
3522 offset = (offset0 + offset1) << rnd_val;
3523 weight0 = weight0 & 0x0000FFFF;
3524 weight = weight0 | (weight1 << 16);
3525 constant = 128 * weight1;
3526 constant <<= 6;
3527 offset += constant;
3528
3529 offset_vec = __msa_fill_w(offset);
3530 weight_vec = __msa_fill_w(weight);
3531 rnd_vec = __msa_fill_w(rnd_val + 1);
3532
3533 filter_vec = LD_SH(filter);
3534 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3535
3536 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3537 src0_ptr += (3 * src_stride);
3538 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3539 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3540 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3541 LD_SB2(src0_ptr, src_stride, src3, src4);
3542 src0_ptr += (2 * src_stride);
3543 LD_SH2(src1_ptr, src2_stride, in0, in1);
3544 src1_ptr += (2 * src2_stride);
3545
3546 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3547 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3548 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3549 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3550
3551 dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3552
3553 ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
3554 dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3555 dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3556 SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
3557 out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3558 CLIP_SH_0_255(out);
3559 out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
3560 ST_W2(out, 0, 1, dst, dst_stride);
3561 }
3562
3563 static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
3564 int32_t src_stride,
3565 int16_t *src1_ptr,
3566 int32_t src2_stride,
3567 uint8_t *dst,
3568 int32_t dst_stride,
3569 const int8_t *filter,
3570 int32_t weight0,
3571 int32_t weight1,
3572 int32_t offset0,
3573 int32_t offset1,
3574 int32_t rnd_val)
3575 {
3576 int32_t weight, offset, constant;
3577 v16i8 src0, src1, src2, src3, src4, src5, src6;
3578 v8i16 in0, in1, in2, in3;
3579 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3580 v16i8 src2110, src4332, src6554;
3581 v8i16 dst10, dst32;
3582 v8i16 filt0, filt1;
3583 v8i16 filter_vec;
3584 v4i32 weight_vec, offset_vec, rnd_vec;
3585
3586 src0_ptr -= src_stride;
3587
3588 offset = (offset0 + offset1) << rnd_val;
3589 weight0 = weight0 & 0x0000FFFF;
3590 weight = weight0 | (weight1 << 16);
3591 constant = 128 * weight1;
3592 constant <<= 6;
3593 offset += constant;
3594
3595 offset_vec = __msa_fill_w(offset);
3596 weight_vec = __msa_fill_w(weight);
3597 rnd_vec = __msa_fill_w(rnd_val + 1);
3598
3599 filter_vec = LD_SH(filter);
3600 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3601
3602 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3603 src0_ptr += (3 * src_stride);
3604 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3605 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3606 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3607
3608 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3609 src0_ptr += (4 * src_stride);
3610 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3611 src1_ptr += (4 * src2_stride);
3612 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3613 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3614 src32_r, src43_r, src54_r, src65_r);
3615 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3616 XORI_B2_128_SB(src4332, src6554);
3617
3618 dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3619 dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3620
3621 HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
3622 weight_vec, rnd_vec, offset_vec,
3623 dst10, dst32);
3624
3625 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3626 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3627 dst += (4 * dst_stride);
3628 }
3629
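/*
 * The 8-row loop below keeps a software pipeline: src2110 carries the
 * last two interleaved rows of the previous iteration, each pass loads
 * six new rows plus two look-ahead rows, and the refreshed src2110
 * seeds the next iteration, so every source row is read from memory
 * only once.
 */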
3630 static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
3631 int32_t src_stride,
3632 int16_t *src1_ptr,
3633 int32_t src2_stride,
3634 uint8_t *dst,
3635 int32_t dst_stride,
3636 const int8_t *filter,
3637 int32_t height,
3638 int32_t weight0,
3639 int32_t weight1,
3640 int32_t offset0,
3641 int32_t offset1,
3642 int32_t rnd_val)
3643 {
3644 uint32_t loop_cnt;
3645 int32_t weight, offset, constant;
3646 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3647 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3648 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3649 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3650 v16i8 src2110, src4332, src6554, src8776;
3651 v8i16 dst10, dst32, dst54, dst76;
3652 v8i16 filt0, filt1;
3653 v8i16 filter_vec;
3654 v4i32 weight_vec, offset_vec, rnd_vec;
3655
3656 src0_ptr -= src_stride;
3657
3658 offset = (offset0 + offset1) << rnd_val;
3659 weight0 = weight0 & 0x0000FFFF;
3660 weight = weight0 | (weight1 << 16);
3661 constant = 128 * weight1;
3662 constant <<= 6;
3663 offset += constant;
3664
3665 offset_vec = __msa_fill_w(offset);
3666 weight_vec = __msa_fill_w(weight);
3667 rnd_vec = __msa_fill_w(rnd_val + 1);
3668
3669 filter_vec = LD_SH(filter);
3670 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3671
3672 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3673 src0_ptr += (3 * src_stride);
3674 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3675 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3676 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3677
3678 for (loop_cnt = (height >> 3); loop_cnt--;) {
3679 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3680 src0_ptr += (6 * src_stride);
3681 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3682 src1_ptr += (8 * src2_stride);
3683
3684 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3685 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3686
3687 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3688 src32_r, src43_r, src54_r, src65_r);
3689 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3690 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3691 src4332, src6554, src8776);
3692 XORI_B3_128_SB(src4332, src6554, src8776);
3693
3694 dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3695 dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3696 dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3697
3698 LD_SB2(src0_ptr, src_stride, src9, src2);
3699 src0_ptr += (2 * src_stride);
3700 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3701 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3702 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3703
3704 dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
3705 HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3706 in0, in1, in2, in3,
3707 weight_vec, rnd_vec, offset_vec,
3708 dst10, dst32, dst54, dst76);
3709
3710 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3711 ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3712 dst += (8 * dst_stride);
3713 }
3714 }
3715
3716 static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
3717 int32_t src_stride,
3718 int16_t *src1_ptr,
3719 int32_t src2_stride,
3720 uint8_t *dst,
3721 int32_t dst_stride,
3722 const int8_t *filter,
3723 int32_t height,
3724 int32_t weight0,
3725 int32_t weight1,
3726 int32_t offset0,
3727 int32_t offset1,
3728 int32_t rnd_val)
3729 {
3730 if (2 == height) {
3731 hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3732 dst, dst_stride, filter,
3733 weight0, weight1, offset0, offset1, rnd_val);
3734 } else if (4 == height) {
3735 hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3736 dst, dst_stride, filter,
3737 weight0, weight1, offset0, offset1, rnd_val);
3738 } else if (0 == (height % 8)) {
3739 hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
3740 src1_ptr, src2_stride,
3741 dst, dst_stride, filter, height,
3742 weight0, weight1, offset0, offset1,
3743 rnd_val);
3744 }
3745 }
3746
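/*
 * 6-wide output: each row is stored as a 4-byte word plus a 2-byte
 * halfword.  After PCKEV_B2_SH one vector holds two rows of 8 bytes,
 * so ST_W2 picks words 0 and 2 (bytes 0..3 of each row) and ST_H2
 * picks halfwords 2 and 6 (bytes 4..5 of each row).
 */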
3747 static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
3748 int32_t src_stride,
3749 int16_t *src1_ptr,
3750 int32_t src2_stride,
3751 uint8_t *dst,
3752 int32_t dst_stride,
3753 const int8_t *filter,
3754 int32_t height,
3755 int32_t weight0,
3756 int32_t weight1,
3757 int32_t offset0,
3758 int32_t offset1,
3759 int32_t rnd_val)
3760 {
3761 uint32_t loop_cnt;
3762 int32_t offset, weight, constant;
3763 v16i8 src0, src1, src2, src3, src4;
3764 v8i16 in0, in1, in2, in3;
3765 v16i8 src10_r, src32_r, src21_r, src43_r;
3766 v8i16 tmp0, tmp1, tmp2, tmp3;
3767 v8i16 filt0, filt1;
3768 v8i16 filter_vec;
3769 v4i32 weight_vec, offset_vec, rnd_vec;
3770
3771 src0_ptr -= src_stride;
3772
3773 offset = (offset0 + offset1) << rnd_val;
3774 weight0 = weight0 & 0x0000FFFF;
3775 weight = weight0 | (weight1 << 16);
3776 constant = 128 * weight1;
3777 constant <<= 6;
3778 offset += constant;
3779
3780 offset_vec = __msa_fill_w(offset);
3781 weight_vec = __msa_fill_w(weight);
3782 rnd_vec = __msa_fill_w(rnd_val + 1);
3783
3784 filter_vec = LD_SH(filter);
3785 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3786
3787 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3788 src0_ptr += (3 * src_stride);
3789 XORI_B3_128_SB(src0, src1, src2);
3790 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3791
3792 for (loop_cnt = (height >> 2); loop_cnt--;) {
3793 LD_SB2(src0_ptr, src_stride, src3, src4);
3794 src0_ptr += (2 * src_stride);
3795 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3796 src1_ptr += (4 * src2_stride);
3797 XORI_B2_128_SB(src3, src4);
3798 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3799
3800 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3801 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3802
3803 LD_SB2(src0_ptr, src_stride, src1, src2);
3804 src0_ptr += (2 * src_stride);
3805 XORI_B2_128_SB(src1, src2);
3806 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3807
3808 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3809 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3810 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3811 in0, in1, in2, in3,
3812 weight_vec, rnd_vec, offset_vec,
3813 tmp0, tmp1, tmp2, tmp3);
3814
3815 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3816 ST_W2(tmp0, 0, 2, dst, dst_stride);
3817 ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3818 ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3819 ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3820 dst += (4 * dst_stride);
3821 }
3822 }
3823
3824 static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
3825 int32_t src_stride,
3826 int16_t *src1_ptr,
3827 int32_t src2_stride,
3828 uint8_t *dst,
3829 int32_t dst_stride,
3830 const int8_t *filter,
3831 int32_t weight0,
3832 int32_t weight1,
3833 int32_t offset0,
3834 int32_t offset1,
3835 int32_t rnd_val)
3836 {
3837 int32_t offset, weight, constant;
3838 v16i8 src0, src1, src2, src3, src4;
3839 v8i16 in0, in1, tmp0, tmp1;
3840 v16i8 src10_r, src32_r, src21_r, src43_r;
3841 v8i16 filt0, filt1;
3842 v8i16 filter_vec;
3843 v4i32 weight_vec, offset_vec, rnd_vec;
3844
3845 src0_ptr -= src_stride;
3846
3847 offset = (offset0 + offset1) << rnd_val;
3848 weight0 = weight0 & 0x0000FFFF;
3849 weight = weight0 | (weight1 << 16);
3850 constant = 128 * weight1;
3851 constant <<= 6;
3852 offset += constant;
3853
3854 offset_vec = __msa_fill_w(offset);
3855 weight_vec = __msa_fill_w(weight);
3856 rnd_vec = __msa_fill_w(rnd_val + 1);
3857
3858 filter_vec = LD_SH(filter);
3859 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3860
3861 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3862 src0_ptr += (3 * src_stride);
3863 XORI_B3_128_SB(src0, src1, src2);
3864 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3865
3866 LD_SB2(src0_ptr, src_stride, src3, src4);
3867 LD_SH2(src1_ptr, src2_stride, in0, in1);
3868 XORI_B2_128_SB(src3, src4);
3869 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3870
3871 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3872 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3873 HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
3874 weight_vec, rnd_vec, offset_vec,
3875 tmp0, tmp1);
3876
3877 tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3878 ST_D2(tmp0, 0, 1, dst, dst_stride);
3879 }
3880
3881 static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
3882 int32_t src_stride,
3883 int16_t *src1_ptr,
3884 int32_t src2_stride,
3885 uint8_t *dst,
3886 int32_t dst_stride,
3887 const int8_t *filter,
3888 int32_t weight0,
3889 int32_t weight1,
3890 int32_t offset0,
3891 int32_t offset1,
3892 int32_t rnd_val)
3893 {
3894 int32_t offset, weight, constant;
3895 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3896 v8i16 in0, in1, in2, in3, in4, in5;
3897 v16i8 src10_r, src32_r, src54_r, src76_r;
3898 v16i8 src21_r, src43_r, src65_r, src87_r;
3899 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3900 v8i16 filt0, filt1;
3901 v8i16 filter_vec;
3902 v4i32 weight_vec, offset_vec, rnd_vec;
3903
3904 src0_ptr -= src_stride;
3905
3906 offset = (offset0 + offset1) << rnd_val;
3907 weight0 = weight0 & 0x0000FFFF;
3908 weight = weight0 | (weight1 << 16);
3909 constant = 128 * weight1;
3910 constant <<= 6;
3911 offset += constant;
3912
3913 offset_vec = __msa_fill_w(offset);
3914 weight_vec = __msa_fill_w(weight);
3915 rnd_vec = __msa_fill_w(rnd_val + 1);
3916
3917 filter_vec = LD_SH(filter);
3918 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3919
3920 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3921 src0_ptr += (3 * src_stride);
3922 XORI_B3_128_SB(src0, src1, src2);
3923 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3924
3925 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3926 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3927 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3928 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3929 src32_r, src43_r, src54_r, src65_r);
3930 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3931
3932 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3933 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3934 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3935 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3936 tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3937 tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3938 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3939 in0, in1, in2, in3,
3940 weight_vec, rnd_vec, offset_vec,
3941 tmp0, tmp1, tmp2, tmp3);
3942 HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
3943 weight_vec, rnd_vec, offset_vec,
3944 tmp4, tmp5);
3945
3946 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3947 tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
3948 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
3949 ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
3950 }
3951
3952 static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3953 int32_t src_stride,
3954 int16_t *src1_ptr,
3955 int32_t src2_stride,
3956 uint8_t *dst,
3957 int32_t dst_stride,
3958 const int8_t *filter,
3959 int32_t height,
3960 int32_t weight0,
3961 int32_t weight1,
3962 int32_t offset0,
3963 int32_t offset1,
3964 int32_t rnd_val)
3965 {
3966 uint32_t loop_cnt;
3967 int32_t offset, weight, constant;
3968 v16i8 src0, src1, src2, src3, src4;
3969 v8i16 in0, in1, in2, in3;
3970 v16i8 src10_r, src32_r, src21_r, src43_r;
3971 v8i16 tmp0, tmp1, tmp2, tmp3;
3972 v8i16 filt0, filt1;
3973 v8i16 filter_vec;
3974 v4i32 weight_vec, offset_vec, rnd_vec;
3975
3976 src0_ptr -= src_stride;
3977
3978 offset = (offset0 + offset1) << rnd_val;
3979 weight0 = weight0 & 0x0000FFFF;
3980 weight = weight0 | (weight1 << 16);
3981 constant = 128 * weight1;
3982 constant <<= 6;
3983 offset += constant;
3984
3985 offset_vec = __msa_fill_w(offset);
3986 weight_vec = __msa_fill_w(weight);
3987 rnd_vec = __msa_fill_w(rnd_val + 1);
3988
3989 filter_vec = LD_SH(filter);
3990 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3991
3992 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3993 src0_ptr += (3 * src_stride);
3994 XORI_B3_128_SB(src0, src1, src2);
3995 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3996
3997 for (loop_cnt = (height >> 2); loop_cnt--;) {
3998 LD_SB2(src0_ptr, src_stride, src3, src4);
3999 src0_ptr += (2 * src_stride);
4000 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4001 src1_ptr += (4 * src2_stride);
4002 XORI_B2_128_SB(src3, src4);
4003 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4004
4005 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4006 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4007
4008 LD_SB2(src0_ptr, src_stride, src1, src2);
4009 src0_ptr += (2 * src_stride);
4010 XORI_B2_128_SB(src1, src2);
4011 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
4012
4013 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4014 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4015 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4016 in0, in1, in2, in3,
4017 weight_vec, rnd_vec, offset_vec,
4018 tmp0, tmp1, tmp2, tmp3);
4019
4020 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4021 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4022 dst += (4 * dst_stride);
4023 }
4024 }
4025
4026 static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
4027 int32_t src_stride,
4028 int16_t *src1_ptr,
4029 int32_t src2_stride,
4030 uint8_t *dst,
4031 int32_t dst_stride,
4032 const int8_t *filter,
4033 int32_t height,
4034 int32_t weight0,
4035 int32_t weight1,
4036 int32_t offset0,
4037 int32_t offset1,
4038 int32_t rnd_val)
4039 {
4040 if (2 == height) {
4041 hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4042 dst, dst_stride, filter,
4043 weight0, weight1, offset0, offset1, rnd_val);
4044 } else if (6 == height) {
4045 hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4046 dst, dst_stride, filter,
4047 weight0, weight1, offset0, offset1, rnd_val);
4048 } else {
4049 hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
4050 src1_ptr, src2_stride,
4051 dst, dst_stride, filter, height,
4052 weight0, weight1, offset0, offset1,
4053 rnd_val);
4054 }
4055 }
4056
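/*
 * 12-wide: ILVL_B* interleaves the upper register halves (columns
 * 8..15) of two row pairs, and ilvr_d packs both pairs into one
 * register (src2110, src4332), so the trailing 4-wide column is
 * filtered with a single extra HEVC_FILT_4TAP_SH call alongside the
 * two 8-wide ones.
 */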
4057 static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
4058 int32_t src_stride,
4059 int16_t *src1_ptr,
4060 int32_t src2_stride,
4061 uint8_t *dst,
4062 int32_t dst_stride,
4063 const int8_t *filter,
4064 int32_t height,
4065 int32_t weight0,
4066 int32_t weight1,
4067 int32_t offset0,
4068 int32_t offset1,
4069 int32_t rnd_val)
4070 {
4071 uint32_t loop_cnt;
4072 int32_t offset, weight, constant;
4073 v16i8 src0, src1, src2, src3, src4, src5;
4074 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4075 v16i8 src10_r, src32_r, src21_r, src43_r;
4076 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4077 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4078 v16i8 src2110, src4332;
4079 v8i16 filt0, filt1;
4080 v8i16 filter_vec;
4081 v4i32 weight_vec, offset_vec, rnd_vec;
4082
4083 src0_ptr -= (1 * src_stride);
4084
4085 offset = (offset0 + offset1) << rnd_val;
4086 weight0 = weight0 & 0x0000FFFF;
4087 weight = weight0 | (weight1 << 16);
4088 constant = 128 * weight1;
4089 constant <<= 6;
4090 offset += constant;
4091
4092 offset_vec = __msa_fill_w(offset);
4093 weight_vec = __msa_fill_w(weight);
4094 rnd_vec = __msa_fill_w(rnd_val + 1);
4095
4096 filter_vec = LD_SH(filter);
4097 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4098
4099 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4100 src0_ptr += (3 * src_stride);
4101 XORI_B3_128_SB(src0, src1, src2);
4102 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4103 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4104 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
4105
4106 for (loop_cnt = (height >> 2); loop_cnt--;) {
4107 LD_SB2(src0_ptr, src_stride, src3, src4);
4108 src0_ptr += (2 * src_stride);
4109 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4110 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4111 src1_ptr += (4 * src2_stride);
4112 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
4113 XORI_B2_128_SB(src3, src4);
4114
4115 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4116 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4117 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
4118
4119 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4120 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4121 tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
4122
4123 LD_SB2(src0_ptr, src_stride, src5, src2);
4124 src0_ptr += (2 * src_stride);
4125 XORI_B2_128_SB(src5, src2);
4126 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4127 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
4128 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4129
4130 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4131 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4132 tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
4133 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4134 in0, in1, in2, in3,
4135 weight_vec, rnd_vec, offset_vec,
4136 tmp0, tmp1, tmp2, tmp3);
4137 HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
4138 weight_vec, rnd_vec, offset_vec,
4139 tmp4, tmp5);
4140
4141 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4142 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4143 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4144 ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
4145 dst += (4 * dst_stride);
4146 }
4147 }
4148
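/*
 * 16-wide: right and left byte interleaves (_r/_l) cover pixel columns
 * 0..7 and 8..15.  The loop body handles four rows as two halves; the
 * second half reuses the src10/src21 interleave variables, so only two
 * new rows are loaded per pair and the filter history never touches
 * memory.
 */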
4149 static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
4150 int32_t src_stride,
4151 int16_t *src1_ptr,
4152 int32_t src2_stride,
4153 uint8_t *dst,
4154 int32_t dst_stride,
4155 const int8_t *filter,
4156 int32_t height,
4157 int32_t weight0,
4158 int32_t weight1,
4159 int32_t offset0,
4160 int32_t offset1,
4161 int32_t rnd_val)
4162 {
4163 uint32_t loop_cnt;
4164 int32_t offset, weight, constant;
4165 v16i8 src0, src1, src2, src3, src4, src5;
4166 v8i16 in0, in1, in2, in3;
4167 v16i8 src10_r, src32_r, src21_r, src43_r;
4168 v16i8 src10_l, src32_l, src21_l, src43_l;
4169 v8i16 tmp0, tmp1, tmp2, tmp3;
4170 v8i16 filt0, filt1;
4171 v8i16 filter_vec;
4172 v4i32 weight_vec, offset_vec, rnd_vec;
4173
4174 src0_ptr -= src_stride;
4175
4176 offset = (offset0 + offset1) << rnd_val;
4177 weight0 = weight0 & 0x0000FFFF;
4178 weight = weight0 | (weight1 << 16);
4179 constant = 128 * weight1;
4180 constant <<= 6;
4181 offset += constant;
4182
4183 offset_vec = __msa_fill_w(offset);
4184 weight_vec = __msa_fill_w(weight);
4185 rnd_vec = __msa_fill_w(rnd_val + 1);
4186
4187 filter_vec = LD_SH(filter);
4188 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4189
4190 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4191 src0_ptr += (3 * src_stride);
4192 XORI_B3_128_SB(src0, src1, src2);
4193 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4194 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4195
4196 for (loop_cnt = (height >> 2); loop_cnt--;) {
4197 LD_SB2(src0_ptr, src_stride, src3, src4);
4198 src0_ptr += (2 * src_stride);
4199 LD_SH2(src1_ptr, src2_stride, in0, in1);
4200 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4201 src1_ptr += (2 * src2_stride);
4202 XORI_B2_128_SB(src3, src4);
4203 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4204 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4205
4206 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4207 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4208 tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4209 tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4210
4211 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4212 in0, in1, in2, in3,
4213 weight_vec, rnd_vec, offset_vec,
4214 tmp0, tmp1, tmp2, tmp3);
4215 PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4216 ST_SH2(tmp0, tmp1, dst, dst_stride);
4217 dst += (2 * dst_stride);
4218 LD_SB2(src0_ptr, src_stride, src5, src2);
4219 src0_ptr += (2 * src_stride);
4220
4221 LD_SH2(src1_ptr, src2_stride, in0, in1);
4222 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4223 src1_ptr += (2 * src2_stride);
4224 XORI_B2_128_SB(src5, src2);
4225 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4226 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4227
4228 tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4229 tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4230 tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4231 tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4232 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4233 in0, in1, in2, in3,
4234 weight_vec, rnd_vec, offset_vec,
4235 tmp0, tmp1, tmp2, tmp3);
4236
4237 PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4238 ST_SH2(tmp0, tmp1, dst, dst_stride);
4239 dst += (2 * dst_stride);
4240 }
4241 }
4242
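/*
 * 24-wide: a full 16-wide column (both _r and _l interleaves) plus an
 * 8-wide column at +16 that only needs the right interleaves
 * (src76_r, src87_r, ...).  Both columns share one pass over the rows,
 * with in4/in5 carrying the extra references from src1_ptr + 16.
 */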
4243 static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
4244 int32_t src_stride,
4245 int16_t *src1_ptr,
4246 int32_t src2_stride,
4247 uint8_t *dst,
4248 int32_t dst_stride,
4249 const int8_t *filter,
4250 int32_t height,
4251 int32_t weight0,
4252 int32_t weight1,
4253 int32_t offset0,
4254 int32_t offset1,
4255 int32_t rnd_val)
4256 {
4257 uint32_t loop_cnt;
4258 int32_t offset, weight, constant;
4259 v16i8 src0, src1, src2, src3, src4, src5;
4260 v16i8 src6, src7, src8, src9, src10, src11;
4261 v8i16 in0, in1, in2, in3, in4, in5;
4262 v16i8 src10_r, src32_r, src76_r, src98_r;
4263 v16i8 src10_l, src32_l, src21_l, src43_l;
4264 v16i8 src21_r, src43_r, src87_r, src109_r;
4265 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4266 v8i16 filt0, filt1;
4267 v8i16 filter_vec;
4268 v4i32 weight_vec, offset_vec, rnd_vec;
4269
4270 src0_ptr -= src_stride;
4271
4272 offset = (offset0 + offset1) << rnd_val;
4273 weight0 = weight0 & 0x0000FFFF;
4274 weight = weight0 | (weight1 << 16);
4275 constant = 128 * weight1;
4276 constant <<= 6;
4277 offset += constant;
4278
4279 offset_vec = __msa_fill_w(offset);
4280 weight_vec = __msa_fill_w(weight);
4281 rnd_vec = __msa_fill_w(rnd_val + 1);
4282
4283 filter_vec = LD_SH(filter);
4284 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4285
4286 /* 16width */
4287 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4288 XORI_B3_128_SB(src0, src1, src2);
4289 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4290 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4291 /* 8width */
4292 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4293 src0_ptr += (3 * src_stride);
4294 XORI_B3_128_SB(src6, src7, src8);
4295 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4296
4297 for (loop_cnt = (height >> 2); loop_cnt--;) {
4298 /* 16width */
4299 LD_SB2(src0_ptr, src_stride, src3, src4);
4300 LD_SH2(src1_ptr, src2_stride, in0, in1);
4301 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4302 XORI_B2_128_SB(src3, src4);
4303 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4304 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4305
4306 /* 8width */
4307 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4308 src0_ptr += (2 * src_stride);
4309 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4310 src1_ptr += (2 * src2_stride);
4311 XORI_B2_128_SB(src9, src10);
4312 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4313 /* 16width */
4314 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4315 tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4316 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4317 tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4318 /* 8width */
4319 tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4320 tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4321 /* 16width */
4322 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4323 in0, in1, in2, in3,
4324 weight_vec, rnd_vec, offset_vec,
4325 tmp0, tmp1, tmp4, tmp5);
4326 /* 8width */
4327 HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4328 weight_vec, rnd_vec, offset_vec,
4329 tmp2, tmp3);
4330 /* 16width */
4331 PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4332 /* 8width */
4333 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4334 ST_SH2(tmp0, tmp1, dst, dst_stride);
4335 ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4336 dst += (2 * dst_stride);
4337
4338 /* 16width */
4339 LD_SB2(src0_ptr, src_stride, src5, src2);
4340 LD_SH2(src1_ptr, src2_stride, in0, in1);
4341 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4342 XORI_B2_128_SB(src5, src2);
4343 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4344 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4345 /* 8width */
4346 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4347 src0_ptr += (2 * src_stride);
4348 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4349 src1_ptr += (2 * src2_stride);
4350 XORI_B2_128_SB(src11, src8);
4351 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4352 /* 16width */
4353 tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4354 tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4355 tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4356 tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4357 /* 8width */
4358 tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
4359 tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
4360 /* 16width */
4361 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4362 in0, in1, in2, in3,
4363 weight_vec, rnd_vec, offset_vec,
4364 tmp0, tmp1, tmp4, tmp5);
4365 /* 8width */
4366 HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4367 weight_vec, rnd_vec, offset_vec,
4368 tmp2, tmp3);
4369 /* 16width */
4370 PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4371
4372 /* 8width */
4373 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4374 ST_SH2(tmp0, tmp1, dst, dst_stride);
4375 ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4376 dst += (2 * dst_stride);
4377 }
4378 }
4379
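/*
 * 32-wide: two independent 16-wide columns, with dst_tmp tracking the
 * right half.  At the end of each two-row step the interleave history
 * is rotated (src10_r = src32_r, ..., src2 = src4) instead of being
 * reloaded; in scalar terms (a sketch of the rotation):
 *
 *     row[-2] = row[0];  row[-1] = row[1];   // keep the last two rows
 *     load(row[0]); load(row[1]);            // for the next iteration
 */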
4380 static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
4381 int32_t src_stride,
4382 int16_t *src1_ptr,
4383 int32_t src2_stride,
4384 uint8_t *dst,
4385 int32_t dst_stride,
4386 const int8_t *filter,
4387 int32_t height,
4388 int32_t weight0,
4389 int32_t weight1,
4390 int32_t offset0,
4391 int32_t offset1,
4392 int32_t rnd_val)
4393 {
4394 uint32_t loop_cnt;
4395 uint8_t *dst_tmp = dst + 16;
4396 int32_t offset, weight, constant;
4397 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4398 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4399 v16i8 src10_r, src32_r, src76_r, src98_r;
4400 v16i8 src21_r, src43_r, src87_r, src109_r;
4401 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4402 v16i8 src10_l, src32_l, src76_l, src98_l;
4403 v16i8 src21_l, src43_l, src87_l, src109_l;
4404 v8i16 filt0, filt1;
4405 v8i16 filter_vec;
4406 v4i32 weight_vec, offset_vec, rnd_vec;
4407
4408 src0_ptr -= src_stride;
4409
4410 offset = (offset0 + offset1) << rnd_val;
4411 weight0 = weight0 & 0x0000FFFF;
4412 weight = weight0 | (weight1 << 16);
4413 constant = 128 * weight1;
4414 constant <<= 6;
4415 offset += constant;
4416
4417 offset_vec = __msa_fill_w(offset);
4418 weight_vec = __msa_fill_w(weight);
4419 rnd_vec = __msa_fill_w(rnd_val + 1);
4420
4421 filter_vec = LD_SH(filter);
4422 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4423
4424 /* 16width */
4425 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4426 XORI_B3_128_SB(src0, src1, src2);
4427 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4428 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4429 /* next 16width */
4430 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4431 src0_ptr += (3 * src_stride);
4432 XORI_B3_128_SB(src6, src7, src8);
4433 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4434 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4435
4436 for (loop_cnt = (height >> 1); loop_cnt--;) {
4437 /* 16width */
4438 LD_SB2(src0_ptr, src_stride, src3, src4);
4439 LD_SH2(src1_ptr, src2_stride, in0, in1);
4440 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4441 XORI_B2_128_SB(src3, src4);
4442 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4443 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4444
4445 /* 16width */
4446 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4447 tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4448 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4449 tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4450 /* 16width */
4451 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4452 in0, in1, in2, in3,
4453 weight_vec, rnd_vec, offset_vec,
4454 tmp0, tmp1, tmp4, tmp5);
4455 /* 16width */
4456 PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4457 ST_SH2(tmp0, tmp1, dst, dst_stride);
4458 dst += (2 * dst_stride);
4459
4460 src10_r = src32_r;
4461 src21_r = src43_r;
4462 src10_l = src32_l;
4463 src21_l = src43_l;
4464 src2 = src4;
4465
4466 /* next 16width */
4467 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4468 src0_ptr += (2 * src_stride);
4469 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4470 LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4471 src1_ptr += (2 * src2_stride);
4472 XORI_B2_128_SB(src9, src10);
4473 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4474 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4475 /* next 16width */
4476 tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4477 tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4478 tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4479 tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
4480 /* next 16width */
4481 HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
4482 in4, in5, in6, in7,
4483 weight_vec, rnd_vec, offset_vec,
4484 tmp2, tmp3, tmp6, tmp7);
4485
4486 /* next 16width */
4487 PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
4488 ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4489 dst_tmp += (2 * dst_stride);
4490
4491 src76_r = src98_r;
4492 src87_r = src109_r;
4493 src76_l = src98_l;
4494 src87_l = src109_l;
4495 src8 = src10;
4496 }
4497 }
4498
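/*
 * The hv kernels are separable: the horizontal 4-tap runs first at
 * 16-bit precision, its outputs are re-interleaved across rows, and a
 * vertical 4-tap (filt_h0/filt_h1, 32-bit accumulation) completes the
 * 2-D filter before the shared biweight stage.  Per-pixel sketch, with
 * h[] the horizontal-pass output and s its row pitch:
 *
 *     int32_t v = ch0 * h[-s] + ch1 * h[0] + ch2 * h[s] + ch3 * h[2*s];
 *     v >>= 6;                      // back to the 14-bit intermediate
 *     // ... then biweight as in biweight_px() above ...
 *
 * Here the 128 * weight1 compensation is built as const_vec and added
 * to offset_vec instead of being folded into the scalar offset.
 */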
4499 static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
4500 int32_t src_stride,
4501 int16_t *src1_ptr,
4502 int32_t src2_stride,
4503 uint8_t *dst,
4504 int32_t dst_stride,
4505 const int8_t *filter_x,
4506 const int8_t *filter_y,
4507 int32_t weight0,
4508 int32_t weight1,
4509 int32_t offset0,
4510 int32_t offset1,
4511 int32_t rnd_val)
4512 {
4513 uint64_t tp0, tp1;
4514 int32_t offset, weight;
4515 v8i16 in0 = { 0 };
4516 v16u8 out;
4517 v16i8 src0, src1, src2, src3, src4;
4518 v8i16 filt0, filt1;
4519 v8i16 filt_h0, filt_h1;
4520 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4521 v16i8 mask1;
4522 v8i16 filter_vec, tmp, weight_vec;
4523 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4524 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4525 v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
4526
4527 src0_ptr -= (src_stride + 1);
4528
4529 filter_vec = LD_SH(filter_x);
4530 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4531
4532 filter_vec = LD_SH(filter_y);
4533 UNPCK_R_SB_SH(filter_vec, filter_vec);
4534
4535 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4536
4537 mask1 = mask0 + 2;
4538
4539 offset = (offset0 + offset1) << rnd_val;
4540 weight0 = weight0 & 0x0000FFFF;
4541 weight = weight0 | (weight1 << 16);
4542
4543 const_vec = __msa_fill_w((128 * weight1));
4544 const_vec <<= 6;
4545 offset_vec = __msa_fill_w(offset);
4546 weight_vec = (v8i16) __msa_fill_w(weight);
4547 rnd_vec = __msa_fill_w(rnd_val + 1);
4548 offset_vec += const_vec;
4549
4550 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4551 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4552
4553 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4554 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4555 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4556
4557 dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4558 dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4559 dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4560
4561 ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4562 ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4563
4564 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4565 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4566 dst0 >>= 6;
4567 dst1 >>= 6;
4568 dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4569
4570 LD2(src1_ptr, src2_stride, tp0, tp1);
4571 INSERT_D2_SH(tp0, tp1, in0);
4572
4573 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4574 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4575 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4576 SRAR_W2_SW(dst0, dst1, rnd_vec);
4577 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4578 CLIP_SH_0_255(tmp);
4579 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4580 ST_W2(out, 0, 1, dst, dst_stride);
4581 }
4582
4583 static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
4584 int32_t src_stride,
4585 int16_t *src1_ptr,
4586 int32_t src2_stride,
4587 uint8_t *dst,
4588 int32_t dst_stride,
4589 const int8_t *filter_x,
4590 const int8_t *filter_y,
4591 int32_t weight0,
4592 int32_t weight1,
4593 int32_t offset0,
4594 int32_t offset1,
4595 int32_t rnd_val)
4596 {
4597 uint64_t tp0, tp1;
4598 int32_t offset, weight;
4599 v16u8 out;
4600 v8i16 in0 = { 0 }, in1 = { 0 };
4601 v16i8 src0, src1, src2, src3, src4, src5, src6;
4602 v8i16 filt0, filt1;
4603 v8i16 filt_h0, filt_h1;
4604 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4605 v16i8 mask1;
4606 v8i16 filter_vec, weight_vec;
4607 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4608 v8i16 tmp0, tmp1, tmp2, tmp3;
4609 v8i16 dst30, dst41, dst52, dst63;
4610 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4611 v4i32 offset_vec, rnd_vec, const_vec;
4612 v4i32 dst0, dst1, dst2, dst3;
4613
4614 src0_ptr -= (src_stride + 1);
4615
4616 filter_vec = LD_SH(filter_x);
4617 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4618
4619 filter_vec = LD_SH(filter_y);
4620 UNPCK_R_SB_SH(filter_vec, filter_vec);
4621
4622 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4623
4624 mask1 = mask0 + 2;
4625
4626 offset = (offset0 + offset1) << rnd_val;
4627 weight0 = weight0 & 0x0000FFFF;
4628 weight = weight0 | (weight1 << 16);
4629
4630 const_vec = __msa_fill_w((128 * weight1));
4631 const_vec <<= 6;
4632 offset_vec = __msa_fill_w(offset);
4633 weight_vec = (v8i16) __msa_fill_w(weight);
4634 rnd_vec = __msa_fill_w(rnd_val + 1);
4635 offset_vec += const_vec;
4636
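    /* Seven source rows yield four output rows. */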
4637 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4638 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4639
4640 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4641 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4642 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4643 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4644
4645 dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4646 dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4647 dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4648 dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4649
4650 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4651 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4652 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4653 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4654 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4655 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4656 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4657 SRA_4V(dst0, dst1, dst2, dst3, 6);
4658 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
4659
4660 LD2(src1_ptr, src2_stride, tp0, tp1);
4661 INSERT_D2_SH(tp0, tp1, in0);
4662 src1_ptr += (2 * src2_stride);
4663 LD2(src1_ptr, src2_stride, tp0, tp1);
4664 INSERT_D2_SH(tp0, tp1, in1);
4665
4666 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
4667 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
4668
4669 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4670 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4671 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4672 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4673 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4674 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4675 CLIP_SH2_0_255(tmp0, tmp1);
4676 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4677 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4678 }
4679
4680 static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
4681 int32_t src_stride,
4682 int16_t *src1_ptr,
4683 int32_t src2_stride,
4684 uint8_t *dst,
4685 int32_t dst_stride,
4686 const int8_t *filter_x,
4687 const int8_t *filter_y,
4688 int32_t height,
4689 int32_t weight0,
4690 int32_t weight1,
4691 int32_t offset0,
4692 int32_t offset1,
4693 int32_t rnd_val)
4694 {
4695 uint32_t loop_cnt;
4696 uint64_t tp0, tp1;
4697 int32_t offset, weight;
4698 v16u8 out0, out1;
4699 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4700 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4701 v8i16 filt0, filt1;
4702 v8i16 filt_h0, filt_h1;
4703 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4704 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4705 v16i8 mask1;
4706 v8i16 filter_vec, weight_vec;
4707 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4708 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4709 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4710 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4711 v8i16 dst98_r, dst109_r;
4712 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4713 v4i32 offset_vec, rnd_vec, const_vec;
4714
4715 src0_ptr -= (src_stride + 1);
4716
4717 filter_vec = LD_SH(filter_x);
4718 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4719
4720 filter_vec = LD_SH(filter_y);
4721 UNPCK_R_SB_SH(filter_vec, filter_vec);
4722
4723 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4724
4725 mask1 = mask0 + 2;
4726
4727 offset = (offset0 + offset1) << rnd_val;
4728 weight0 = weight0 & 0x0000FFFF;
4729 weight = weight0 | (weight1 << 16);
4730
4731 const_vec = __msa_fill_w((128 * weight1));
4732 const_vec <<= 6;
4733 offset_vec = __msa_fill_w(offset);
4734 weight_vec = (v8i16) __msa_fill_w(weight);
4735 rnd_vec = __msa_fill_w(rnd_val + 1);
4736 offset_vec += const_vec;
4737
4738 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4739 src0_ptr += (3 * src_stride);
4740 XORI_B3_128_SB(src0, src1, src2);
4741
4742 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4743 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4744 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4745 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4746 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4747 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4748
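    /* Eight rows per iteration.  The masks at ff_hevc_mask_arr + 16
     * pair row i with row i + 4, so each shuffle feeds two 4-wide
     * rows into one horizontal filter call. */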
4749 for (loop_cnt = height >> 3; loop_cnt--;) {
4750 LD_SB8(src0_ptr, src_stride,
4751 src3, src4, src5, src6, src7, src8, src9, src10);
4752 src0_ptr += (8 * src_stride);
4753 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4754 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4755 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4756 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4757 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4758
4759 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4760 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4761 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4762 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4763
4764 dst32_r = __msa_ilvr_h(dst73, dst22);
4765 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4766 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4767 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4768 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4769 dst76_r = __msa_ilvr_h(dst22, dst106);
4770
4771 LD2(src1_ptr, src2_stride, tp0, tp1);
4772 src1_ptr += 2 * src2_stride;
4773 INSERT_D2_SH(tp0, tp1, in0);
4774 LD2(src1_ptr, src2_stride, tp0, tp1);
4775 src1_ptr += 2 * src2_stride;
4776 INSERT_D2_SH(tp0, tp1, in1);
4777
4778 LD2(src1_ptr, src2_stride, tp0, tp1);
4779 src1_ptr += 2 * src2_stride;
4780 INSERT_D2_SH(tp0, tp1, in2);
4781 LD2(src1_ptr, src2_stride, tp0, tp1);
4782 src1_ptr += 2 * src2_stride;
4783 INSERT_D2_SH(tp0, tp1, in3);
4784
4785 dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4786 dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4787 dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4788 dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4789 dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4790 dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4791 dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4792 dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4793 SRA_4V(dst0, dst1, dst2, dst3, 6);
4794 SRA_4V(dst4, dst5, dst6, dst7, 6);
4795 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
4796 dst2, dst3);
4797 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4798 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4799 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4800 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4801 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4802 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4803 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4804 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4805 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4806 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4807 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4808 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4809 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4810 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4811 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4812 tmp2, tmp3);
4813 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4814 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4815 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4816 dst += (8 * dst_stride);
4817
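        /* Carry the last two interleaved row pairs into the next
         * iteration. */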
4818 dst10_r = dst98_r;
4819 dst21_r = dst109_r;
4820 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4821 }
4822 }
4823
4824 static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
4825 int32_t src_stride,
4826 int16_t *src1_ptr,
4827 int32_t src2_stride,
4828 uint8_t *dst,
4829 int32_t dst_stride,
4830 const int8_t *filter_x,
4831 const int8_t *filter_y,
4832 int32_t height,
4833 int32_t weight0,
4834 int32_t weight1,
4835 int32_t offset0,
4836 int32_t offset1,
4837 int32_t rnd_val)
4838 {
4839 if (2 == height) {
4840 hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4841 dst, dst_stride, filter_x, filter_y,
4842 weight0, weight1, offset0, offset1, rnd_val);
4843 } else if (4 == height) {
4844 hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4845 dst, dst_stride, filter_x, filter_y,
4846 weight0, weight1, offset0, offset1, rnd_val);
4847 } else if (0 == (height % 8)) {
4848 hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
4849 src1_ptr, src2_stride,
4850 dst, dst_stride, filter_x, filter_y,
4851 height, weight0, weight1,
4852 offset0, offset1, rnd_val);
4853 }
4854 }
4855
4856 static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
4857 int32_t src_stride,
4858 int16_t *src1_ptr,
4859 int32_t src2_stride,
4860 uint8_t *dst,
4861 int32_t dst_stride,
4862 const int8_t *filter_x,
4863 const int8_t *filter_y,
4864 int32_t height,
4865 int32_t weight0,
4866 int32_t weight1,
4867 int32_t offset0,
4868 int32_t offset1,
4869 int32_t rnd_val)
4870 {
4871 uint32_t tpw0, tpw1, tpw2, tpw3;
4872 uint64_t tp0, tp1;
4873 int32_t offset, weight;
4874 v16u8 out0, out1, out2;
4875 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4876 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4877 v8i16 in4 = { 0 }, in5 = { 0 };
4878 v8i16 filt0, filt1;
4879 v8i16 filt_h0, filt_h1, filter_vec;
4880 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4881 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4882 v16i8 mask1;
4883 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4884 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4885 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4886 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4887 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4888 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4889 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4890 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4891 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4892 v4i32 offset_vec, rnd_vec, const_vec;
4893
4894 src0_ptr -= (src_stride + 1);
4895
4896 filter_vec = LD_SH(filter_x);
4897 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4898
4899 filter_vec = LD_SH(filter_y);
4900 UNPCK_R_SB_SH(filter_vec, filter_vec);
4901
4902 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4903
4904 mask1 = mask0 + 2;
4905
4906 offset = (offset0 + offset1) << rnd_val;
4907 weight0 = weight0 & 0x0000FFFF;
4908 weight = weight0 | (weight1 << 16);
4909
4910 const_vec = __msa_fill_w((128 * weight1));
4911 const_vec <<= 6;
4912 offset_vec = __msa_fill_w(offset);
4913 weight_vec = (v8i16) __msa_fill_w(weight);
4914 rnd_vec = __msa_fill_w(rnd_val + 1);
4915 offset_vec += const_vec;
4916
4917 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4918 src0_ptr += (3 * src_stride);
4919 XORI_B3_128_SB(src0, src1, src2);
4920
4921 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4922 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4923 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4924 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4925 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4926 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4927
4928 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4929 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4930
4931 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
4932 src10);
4933 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4934
4935 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4936 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4937 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4938 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4939
4940 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4941 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4942 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4943 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4944
4945 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4946 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4947 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4948 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4949
4950 dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4951 dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4952 dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4953 dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4954
4955 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4956 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4957 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4958 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4959 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4960 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4961 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4962 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
4963 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4964 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4965 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4966
4967 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4968 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4969 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4970 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4971 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4972 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4973 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4974 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4975 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4976 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4977 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4978 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4979 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4980 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4981 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4982 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
4983 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
4984
4985 LD2(src1_ptr, src2_stride, tp0, tp1);
4986 INSERT_D2_SH(tp0, tp1, in0);
4987 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4988 INSERT_D2_SH(tp0, tp1, in1);
4989
4990 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4991 INSERT_D2_SH(tp0, tp1, in2);
4992 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4993 INSERT_D2_SH(tp0, tp1, in3);
4994
4995 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4996 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4997 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4998 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4999 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5000 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5001 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5002 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5003 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5004 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5005 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5006 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5007 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5008 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5009 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5010 tmp2, tmp3);
5011 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5012 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5013 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5014
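    /* Columns 4 and 5 of each row come from the dst*_l halves packed
     * above; they are weighted below and stored as one halfword per
     * row at dst + 4. */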
5015 PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5016
5017 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5018 src1_ptr += (4 * src2_stride);
5019 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
5020 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5021 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
5022
5023 ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
5024 ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
5025
5026 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5027 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5028 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5029 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5030 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5031 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5032
5033 CLIP_SH2_0_255(tmp4, tmp5);
5034 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5035 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
5036 }
5037
5038 static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
5039 int32_t src_stride,
5040 int16_t *src1_ptr,
5041 int32_t src2_stride,
5042 uint8_t *dst,
5043 int32_t dst_stride,
5044 const int8_t *filter_x,
5045 const int8_t *filter_y,
5046 int32_t weight0,
5047 int32_t weight1,
5048 int32_t offset0,
5049 int32_t offset1,
5050 int32_t rnd_val)
5051 {
5052 int32_t weight, offset;
5053 v16u8 out;
5054 v16i8 src0, src1, src2, src3, src4;
5055 v8i16 filt0, filt1;
5056 v8i16 filt_h0, filt_h1;
5057 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5058 v16i8 mask1;
5059 v8i16 filter_vec, weight_vec;
5060 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5061 v8i16 dst0, dst1, dst2, dst3, dst4;
5062 v8i16 in0, in1;
5063 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5064 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5065 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5066 v8i16 tmp0, tmp1, tmp2, tmp3;
5067 v4i32 offset_vec, rnd_vec, const_vec;
5068
5069 src0_ptr -= (src_stride + 1);
5070
5071 filter_vec = LD_SH(filter_x);
5072 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5073
5074 filter_vec = LD_SH(filter_y);
5075 UNPCK_R_SB_SH(filter_vec, filter_vec);
5076
5077 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5078
5079 mask1 = mask0 + 2;
5080
5081 offset = (offset0 + offset1) << rnd_val;
5082 weight0 = weight0 & 0x0000FFFF;
5083 weight = weight0 | (weight1 << 16);
5084
5085 const_vec = __msa_fill_w((128 * weight1));
5086 const_vec <<= 6;
5087 offset_vec = __msa_fill_w(offset);
5088 weight_vec = (v8i16) __msa_fill_w(weight);
5089 rnd_vec = __msa_fill_w(rnd_val + 1);
5090 offset_vec += const_vec;
5091
5092 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5093 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5094
5095 LD_SH2(src1_ptr, src2_stride, in0, in1);
5096
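    /* Full 8-wide rows: each row is shuffled against itself, unlike
     * the 4-wide paths, which pair two rows per register. */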
5097 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5098 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5099 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5100 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5101 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5102
5103 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5104 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5105 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5106 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5107 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5108
5109 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
5110 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
5111 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
5112 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
5113 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5114 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5115 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5116 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5117 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5118 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
5119
5120 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
5121 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
5122
5123 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5124 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5125 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5126 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5127 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5128 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5129 CLIP_SH2_0_255(tmp0, tmp1);
5130 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5131 ST_D2(out, 0, 1, dst, dst_stride);
5132 }
5133
5134 static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
5135 int32_t src_stride,
5136 int16_t *src1_ptr,
5137 int32_t src2_stride,
5138 uint8_t *dst,
5139 int32_t dst_stride,
5140 const int8_t *filter_x,
5141 const int8_t *filter_y,
5142 int32_t weight0,
5143 int32_t weight1,
5144 int32_t offset0,
5145 int32_t offset1,
5146 int32_t rnd_val,
5147 int32_t width8mult)
5148 {
5149 int32_t weight, offset;
5150 uint32_t cnt;
5151 v16u8 out0, out1;
5152 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
5153 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5154 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5155 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5156 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5157 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5158 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5159 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5160 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5161 v4i32 offset_vec, rnd_vec, const_vec;
5162
5163 src0_ptr -= (src_stride + 1);
5164
5165 filter_vec = LD_SH(filter_x);
5166 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5167
5168 filter_vec = LD_SH(filter_y);
5169 UNPCK_R_SB_SH(filter_vec, filter_vec);
5170
5171 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5172
5173 mask0 = LD_SB(ff_hevc_mask_arr);
5174 mask1 = mask0 + 2;
5175
5176 offset = (offset0 + offset1) << rnd_val;
5177 weight0 = weight0 & 0x0000FFFF;
5178 weight = weight0 | (weight1 << 16);
5179
5180 const_vec = __msa_fill_w((128 * weight1));
5181 const_vec <<= 6;
5182 offset_vec = __msa_fill_w(offset);
5183 rnd_vec = __msa_fill_w(rnd_val + 1);
5184 offset_vec += const_vec;
5185 weight_vec = (v8i16) __msa_fill_w(weight);
5186
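    /* Process the block in independent 8-column strips. */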
5187 for (cnt = width8mult; cnt--;) {
5188 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
5189 src0_ptr += 8;
5190 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
5191
5192 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
5193 src1_ptr += 8;
5194
5195 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5196 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5197 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5198
5199 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5200 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5201 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5202
5203 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5204 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5205
5206 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5207 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5208 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5209 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5210
5211 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5212 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5213 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5214 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5215
5216 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5217 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5218 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5219 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5220
5221 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5222 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5223 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5224 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5225 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5226 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5227 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5228 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5229
5230 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5231 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5232 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5233 dst3_r, dst0, dst1, dst2, dst3);
5234
5235 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5236 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5237 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5238 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5239 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5240 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5241 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5242 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5243 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5244 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5245 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5246 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5247 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5248 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5249 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5250 tmp0, tmp1, tmp2, tmp3);
5251 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5252 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5253 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5254 dst += 8;
5255 }
5256 }
5257
5258 static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
5259 int32_t src_stride,
5260 int16_t *src1_ptr,
5261 int32_t src2_stride,
5262 uint8_t *dst,
5263 int32_t dst_stride,
5264 const int8_t *filter_x,
5265 const int8_t *filter_y,
5266 int32_t weight0,
5267 int32_t weight1,
5268 int32_t offset0,
5269 int32_t offset1,
5270 int32_t rnd_val)
5271 {
5272 uint32_t offset, weight;
5273 v16u8 out0, out1, out2;
5274 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5275 v8i16 filt0, filt1;
5276 v8i16 filt_h0, filt_h1;
5277 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5278 v16i8 mask1;
5279 v8i16 filter_vec, weight_vec;
5280 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5281 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5282 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5283 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5284 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5285 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5286 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5287 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5288 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5289 v8i16 in0, in1, in2, in3, in4, in5;
5290 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5291 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5292 v4i32 offset_vec, rnd_vec, const_vec;
5293
5294 src0_ptr -= (src_stride + 1);
5295
5296 filter_vec = LD_SH(filter_x);
5297 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5298
5299 filter_vec = LD_SH(filter_y);
5300 UNPCK_R_SB_SH(filter_vec, filter_vec);
5301
5302 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5303
5304 mask1 = mask0 + 2;
5305
5306 offset = (offset0 + offset1) << rnd_val;
5307 weight0 = weight0 & 0x0000FFFF;
5308 weight = weight0 | (weight1 << 16);
5309
5310 const_vec = __msa_fill_w((128 * weight1));
5311 const_vec <<= 6;
5312 offset_vec = __msa_fill_w(offset);
5313 weight_vec = (v8i16) __msa_fill_w(weight);
5314 rnd_vec = __msa_fill_w(rnd_val + 1);
5315 offset_vec += const_vec;
5316
5317 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5318 src0_ptr += (5 * src_stride);
5319 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
5320
5321 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5322 XORI_B4_128_SB(src5, src6, src7, src8);
5323
5324 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5325
5326 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5327 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5328 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5329 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5330 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5331 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5332 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5333 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5334 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
5335
5336 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5337 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5338 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5339 dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5340 dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5341 dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
5342 dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
5343 dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
5344 dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
5345
5346 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5347 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5348 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5349 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5350 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5351 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5352 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
5353 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
5354
5355 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5356 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5357 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5358 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5359 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5360 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5361 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5362 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5363 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5364 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
5365 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5366 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
5367
5368 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5369 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5370 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5371 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5372 dst0, dst1, dst2, dst3);
5373
5374 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5375 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5376 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5377 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5378 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5379 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5380 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5381 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5382 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5383 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5384 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5385 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5386 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5387 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5388 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5389 tmp0, tmp1, tmp2, tmp3);
5390 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5391 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5392
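    /* Rows 4 and 5 are weighted and stored separately below. */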
5393 PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5394 ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
5395 ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
5396 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5397 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5398 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5399 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5400 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5401 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5402 CLIP_SH2_0_255(tmp4, tmp5);
5403 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5404 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5405 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
5406 }
5407
5408 static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
5409 int32_t src_stride,
5410 int16_t *src1_ptr,
5411 int32_t src2_stride,
5412 uint8_t *dst,
5413 int32_t dst_stride,
5414 const int8_t *filter_x,
5415 const int8_t *filter_y,
5416 int32_t height,
5417 int32_t weight0,
5418 int32_t weight1,
5419 int32_t offset0,
5420 int32_t offset1,
5421 int32_t rnd_val,
5422 int32_t width)
5423 {
5424 uint32_t loop_cnt;
5425 uint32_t cnt;
5426 int32_t offset, weight;
5427 uint8_t *src0_ptr_tmp;
5428 int16_t *src1_ptr_tmp;
5429 uint8_t *dst_tmp;
5430 v16u8 out0, out1;
5431 v16i8 src0, src1, src2, src3, src4, src5, src6;
5432 v8i16 in0, in1, in2, in3;
5433 v8i16 filt0, filt1;
5434 v8i16 filt_h0, filt_h1;
5435 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5436 v16i8 mask1;
5437 v8i16 filter_vec;
5438 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5439 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5440 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5441 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5442 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5443 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5444 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5445 v4i32 offset_vec, rnd_vec, const_vec;
5446
5447 src0_ptr -= (src_stride + 1);
5448
5449 filter_vec = LD_SH(filter_x);
5450 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5451
5452 filter_vec = LD_SH(filter_y);
5453 UNPCK_R_SB_SH(filter_vec, filter_vec);
5454
5455 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5456
5457 mask1 = mask0 + 2;
5458
5459 offset = (offset0 + offset1) << rnd_val;
5460 weight0 = weight0 & 0x0000FFFF;
5461 weight = weight0 | (weight1 << 16);
5462
5463 const_vec = __msa_fill_w((128 * weight1));
5464 const_vec <<= 6;
5465 offset_vec = __msa_fill_w(offset);
5466 weight_vec = (v8i16) __msa_fill_w(weight);
5467 rnd_vec = __msa_fill_w(rnd_val + 1);
5468 offset_vec += const_vec;
5469
5470 for (cnt = width >> 3; cnt--;) {
5471 src0_ptr_tmp = src0_ptr;
5472 src1_ptr_tmp = src1_ptr;
5473 dst_tmp = dst;
5474
5475 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5476 src0_ptr_tmp += (3 * src_stride);
5477 XORI_B3_128_SB(src0, src1, src2);
5478
5479 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5480 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5481 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5482 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5483 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5484 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5485
5486 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5487 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5488
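        /* Four output rows per iteration; the bottom of the loop
         * recycles the last two interleaved row pairs. */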
5489 for (loop_cnt = height >> 2; loop_cnt--;) {
5490 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5491 src0_ptr_tmp += (4 * src_stride);
5492 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5493 src1_ptr_tmp += (4 * src2_stride);
5494 XORI_B4_128_SB(src3, src4, src5, src6);
5495
5496 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5497 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5498 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5499 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5500
5501 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5502 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5503 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5504 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5505
5506 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5507 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5508 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5509 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5510
5511 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5512 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5513 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5514 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5515 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5516 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5517 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5518 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5519
5520 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5521 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5522 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5523 dst3_r, dst0, dst1, dst2, dst3);
5524 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5525 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5526 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5527 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5528 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5529 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5530 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5531 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5532 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5533 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5534 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5535 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5536 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5537 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5538 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5539 tmp0, tmp1, tmp2, tmp3);
5540 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5541 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5542 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5543 dst_tmp += (4 * dst_stride);
5544
5545 dst10_r = dst54_r;
5546 dst10_l = dst54_l;
5547 dst21_r = dst65_r;
5548 dst21_l = dst65_l;
5549 dsth2 = dsth6;
5550 }
5551
5552 src0_ptr += 8;
5553 dst += 8;
5554 src1_ptr += 8;
5555 }
5556 }
5557
5558 static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
5559 int32_t src_stride,
5560 int16_t *src1_ptr,
5561 int32_t src2_stride,
5562 uint8_t *dst,
5563 int32_t dst_stride,
5564 const int8_t *filter_x,
5565 const int8_t *filter_y,
5566 int32_t height,
5567 int32_t weight0,
5568 int32_t weight1,
5569 int32_t offset0,
5570 int32_t offset1,
5571 int32_t rnd_val)
5572 {
5573 if (2 == height) {
5574 hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5575 dst, dst_stride, filter_x, filter_y,
5576 weight0, weight1, offset0, offset1, rnd_val);
5577 } else if (4 == height) {
5578 hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
5579 src2_stride, dst, dst_stride, filter_x,
5580 filter_y, weight0, weight1, offset0,
5581 offset1, rnd_val, 1);
5582 } else if (6 == height) {
5583 hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5584 dst, dst_stride, filter_x, filter_y,
5585 weight0, weight1, offset0, offset1, rnd_val);
5586 } else if (0 == (height % 4)) {
5587 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5588 src1_ptr, src2_stride,
5589 dst, dst_stride, filter_x, filter_y,
5590 height, weight0,
5591 weight1, offset0, offset1, rnd_val, 8);
5592 }
5593 }
5594
5595 static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
5596 int32_t src_stride,
5597 int16_t *src1_ptr,
5598 int32_t src2_stride,
5599 uint8_t *dst,
5600 int32_t dst_stride,
5601 const int8_t *filter_x,
5602 const int8_t *filter_y,
5603 int32_t height,
5604 int32_t weight0,
5605 int32_t weight1,
5606 int32_t offset0,
5607 int32_t offset1,
5608 int32_t rnd_val)
5609 {
5610 uint32_t loop_cnt;
5611 uint64_t tp0, tp1;
5612 int32_t offset, weight;
5613 uint8_t *src0_ptr_tmp, *dst_tmp;
5614 int16_t *src1_ptr_tmp;
5615 v16u8 out0, out1;
5616 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5617 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5618 v16i8 mask0, mask1, mask2, mask3;
5619 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5620 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5621 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
5622 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5623 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5624 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
5625 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5626 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5627 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5628 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5629 v4i32 offset_vec, rnd_vec, const_vec;
5630
5631 src0_ptr -= (src_stride + 1);
5632
5633 filter_vec = LD_SH(filter_x);
5634 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5635
5636 filter_vec = LD_SH(filter_y);
5637 UNPCK_R_SB_SH(filter_vec, filter_vec);
5638
5639 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5640
5641 mask0 = LD_SB(ff_hevc_mask_arr);
5642 mask1 = mask0 + 2;
5643
5644 offset = (offset0 + offset1) << rnd_val;
5645 weight0 = weight0 & 0x0000FFFF;
5646 weight = weight0 | (weight1 << 16);
5647
5648 const_vec = __msa_fill_w((128 * weight1));
5649 const_vec <<= 6;
5650 offset_vec = __msa_fill_w(offset);
5651 rnd_vec = __msa_fill_w(rnd_val + 1);
5652 offset_vec += const_vec;
5653 weight_vec = (v8i16) __msa_fill_w(weight);
5654
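    /* A 12-wide block is split into an 8-wide strip (fixed loop of
     * four 4-row passes below) and a 4-wide strip (fixed loop of two
     * 8-row passes further down), i.e. 16 rows each. */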
5655 src0_ptr_tmp = src0_ptr;
5656 dst_tmp = dst;
5657 src1_ptr_tmp = src1_ptr;
5658
5659 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5660 src0_ptr_tmp += (3 * src_stride);
5661
5662 XORI_B3_128_SB(src0, src1, src2);
5663
5664 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5665 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5666 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5667
5668 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5669 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5670 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5671
5672 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5673 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5674
5675 for (loop_cnt = 4; loop_cnt--;) {
5676 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5677 src0_ptr_tmp += (4 * src_stride);
5678 XORI_B4_128_SB(src3, src4, src5, src6);
5679
5680 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5681 src1_ptr_tmp += (4 * src2_stride);
5682
5683 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5684 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5685 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5686 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5687
5688 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5689 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5690 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5691 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5692
5693 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5694 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5695 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5696 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5697
5698 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5699 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5700 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5701 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5702 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5703 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5704 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5705 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5706
5707 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5708 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5709 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5710 dst3_r, dst0, dst1, dst2, dst3);
5711 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5712 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5713 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5714 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5715 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5716 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5717 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5718 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5719 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5720 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5721 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5722 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5723 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5724 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5725 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5726 tmp0, tmp1, tmp2, tmp3);
5727 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5728 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5729 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5730 dst_tmp += (4 * dst_stride);
5731
5732 dst10_r = dst54_r;
5733 dst10_l = dst54_l;
5734 dst21_r = dst65_r;
5735 dst21_l = dst65_l;
5736 dsth2 = dsth6;
5737 }
5738
5739 src0_ptr += 8;
5740 dst += 8;
5741 src1_ptr += 8;
5742
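    /* Remaining 4 columns: switch to the paired-row masks at
     * ff_hevc_mask_arr + 16. */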
5743 mask2 = LD_SB(ff_hevc_mask_arr + 16);
5744 mask3 = mask2 + 2;
5745
5746 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5747 src0_ptr += (3 * src_stride);
5748 XORI_B3_128_SB(src0, src1, src2);
5749 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5750 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5751
5752 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5753 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5754
5755 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
5756 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5757
5758 for (loop_cnt = 2; loop_cnt--;) {
5759 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5760 src10);
5761 src0_ptr += (8 * src_stride);
5762 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5763 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5764 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5765 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5766 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5767
5768 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5769 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5770 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5771 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5772
5773 dst32_r = __msa_ilvr_h(dst73, dst22);
5774 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
5775 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
5776 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
5777 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5778 dst76_r = __msa_ilvr_h(dst22, dst106);
5779
5780 LD2(src1_ptr, src2_stride, tp0, tp1);
5781 src1_ptr += 2 * src2_stride;
5782 INSERT_D2_SH(tp0, tp1, in0);
5783 LD2(src1_ptr, src2_stride, tp0, tp1);
5784 src1_ptr += 2 * src2_stride;
5785 INSERT_D2_SH(tp0, tp1, in1);
5786
5787 LD2(src1_ptr, src2_stride, tp0, tp1);
5788 src1_ptr += 2 * src2_stride;
5789 INSERT_D2_SH(tp0, tp1, in2);
5790 LD2(src1_ptr, src2_stride, tp0, tp1);
5791 src1_ptr += 2 * src2_stride;
5792 INSERT_D2_SH(tp0, tp1, in3);
5793
5794 dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5795 dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5796 dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5797 dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5798 dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5799 dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5800 dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5801 dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5802
5803 SRA_4V(dst0, dst1, dst2, dst3, 6);
5804 SRA_4V(dst4, dst5, dst6, dst7, 6);
5805 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5806 dst0, dst1, dst2, dst3);
5807 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5808 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5809 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5810 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5811 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5812 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5813 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5814 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5815 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5816 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5817 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5818 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5819 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5820 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5821 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5822 tmp0, tmp1, tmp2, tmp3);
5823 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5824 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5825 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5826 dst += (8 * dst_stride);
5827
5828 dst10_r = dst98_r;
5829 dst21_r = dst109_r;
5830 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5831 }
5832 }
5833
5834 static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
5835 int32_t src_stride,
5836 int16_t *src1_ptr,
5837 int32_t src2_stride,
5838 uint8_t *dst,
5839 int32_t dst_stride,
5840 const int8_t *filter_x,
5841 const int8_t *filter_y,
5842 int32_t height,
5843 int32_t weight0,
5844 int32_t weight1,
5845 int32_t offset0,
5846 int32_t offset1,
5847 int32_t rnd_val)
5848 {
5849 if (4 == height) {
5850 hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
5851 src2_stride, dst, dst_stride, filter_x,
5852 filter_y, weight0, weight1, offset0,
5853 offset1, rnd_val, 2);
5854 } else {
5855 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
5856 src2_stride, dst, dst_stride,
5857 filter_x, filter_y, height, weight0,
5858 weight1, offset0, offset1, rnd_val, 16);
5859 }
5860 }
5861
5862 static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
5863 int32_t src_stride,
5864 int16_t *src1_ptr,
5865 int32_t src2_stride,
5866 uint8_t *dst,
5867 int32_t dst_stride,
5868 const int8_t *filter_x,
5869 const int8_t *filter_y,
5870 int32_t height,
5871 int32_t weight0,
5872 int32_t weight1,
5873 int32_t offset0,
5874 int32_t offset1,
5875 int32_t rnd_val)
5876 {
5877 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5878 src1_ptr, src2_stride,
5879 dst, dst_stride,
5880 filter_x, filter_y, height, weight0,
5881 weight1, offset0, offset1, rnd_val, 24);
5882 }
5883
5884 static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
5885 int32_t src_stride,
5886 int16_t *src1_ptr,
5887 int32_t src2_stride,
5888 uint8_t *dst,
5889 int32_t dst_stride,
5890 const int8_t *filter_x,
5891 const int8_t *filter_y,
5892 int32_t height,
5893 int32_t weight0,
5894 int32_t weight1,
5895 int32_t offset0,
5896 int32_t offset1,
5897 int32_t rnd_val)
5898 {
5899 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5900 src1_ptr, src2_stride,
5901 dst, dst_stride,
5902 filter_x, filter_y, height, weight0,
5903 weight1, offset0, offset1, rnd_val, 32);
5904 }
5905
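/* Public entry points.  For 8-bit content every macro below reduces
 * the weighted-prediction shift to log2Wd = denom + 6: BI_W_MC_COPY
 * spells it as denom + (14 + 1 - 8) - 1, the filter macros as
 * denom + 14 - 8. */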
5906 #define BI_W_MC_COPY(WIDTH) \
5907 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5908 ptrdiff_t dst_stride, \
5909 uint8_t *src, \
5910 ptrdiff_t src_stride, \
5911 int16_t *src_16bit, \
5912 int height, \
5913 int denom, \
5914 int weight0, \
5915 int weight1, \
5916 int offset0, \
5917 int offset1, \
5918 intptr_t mx, \
5919 intptr_t my, \
5920 int width) \
5921 { \
5922 int shift = 14 + 1 - 8; \
5923 int log2Wd = denom + shift - 1; \
5924 \
5925 hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
5926 dst, dst_stride, height, \
5927 weight0, weight1, offset0, \
5928 offset1, log2Wd); \
5929 }
5930
5931 BI_W_MC_COPY(4);
5932 BI_W_MC_COPY(6);
5933 BI_W_MC_COPY(8);
5934 BI_W_MC_COPY(12);
5935 BI_W_MC_COPY(16);
5936 BI_W_MC_COPY(24);
5937 BI_W_MC_COPY(32);
5938 BI_W_MC_COPY(48);
5939 BI_W_MC_COPY(64);
5940
5941 #undef BI_W_MC_COPY
5942
5943 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5944 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5945 ptrdiff_t \
5946 dst_stride, \
5947 uint8_t *src, \
5948 ptrdiff_t \
5949 src_stride, \
5950 int16_t *src_16bit, \
5951 int height, \
5952 int denom, \
5953 int weight0, \
5954 int weight1, \
5955 int offset0, \
5956 int offset1, \
5957 intptr_t mx, \
5958 intptr_t my, \
5959 int width) \
5960 { \
5961 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5962 int log2Wd = denom + 14 - 8; \
5963 \
5964 hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
5965 MAX_PB_SIZE, dst, dst_stride, \
5966 filter, height, weight0, \
5967 weight1, offset0, offset1, \
5968 log2Wd); \
5969 }
5970
5971 BI_W_MC(qpel, h, 4, 8, hz, mx);
5972 BI_W_MC(qpel, h, 8, 8, hz, mx);
5973 BI_W_MC(qpel, h, 12, 8, hz, mx);
5974 BI_W_MC(qpel, h, 16, 8, hz, mx);
5975 BI_W_MC(qpel, h, 24, 8, hz, mx);
5976 BI_W_MC(qpel, h, 32, 8, hz, mx);
5977 BI_W_MC(qpel, h, 48, 8, hz, mx);
5978 BI_W_MC(qpel, h, 64, 8, hz, mx);
5979
5980 BI_W_MC(qpel, v, 4, 8, vt, my);
5981 BI_W_MC(qpel, v, 8, 8, vt, my);
5982 BI_W_MC(qpel, v, 12, 8, vt, my);
5983 BI_W_MC(qpel, v, 16, 8, vt, my);
5984 BI_W_MC(qpel, v, 24, 8, vt, my);
5985 BI_W_MC(qpel, v, 32, 8, vt, my);
5986 BI_W_MC(qpel, v, 48, 8, vt, my);
5987 BI_W_MC(qpel, v, 64, 8, vt, my);
5988
5989 BI_W_MC(epel, h, 4, 4, hz, mx);
5990 BI_W_MC(epel, h, 8, 4, hz, mx);
5991 BI_W_MC(epel, h, 6, 4, hz, mx);
5992 BI_W_MC(epel, h, 12, 4, hz, mx);
5993 BI_W_MC(epel, h, 16, 4, hz, mx);
5994 BI_W_MC(epel, h, 24, 4, hz, mx);
5995 BI_W_MC(epel, h, 32, 4, hz, mx);
5996
5997 BI_W_MC(epel, v, 4, 4, vt, my);
5998 BI_W_MC(epel, v, 8, 4, vt, my);
5999 BI_W_MC(epel, v, 6, 4, vt, my);
6000 BI_W_MC(epel, v, 12, 4, vt, my);
6001 BI_W_MC(epel, v, 16, 4, vt, my);
6002 BI_W_MC(epel, v, 24, 4, vt, my);
6003 BI_W_MC(epel, v, 32, 4, vt, my);
6004
6005 #undef BI_W_MC
6006
6007 #define BI_W_MC_HV(PEL, WIDTH, TAP) \
6008 void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
6009 ptrdiff_t dst_stride, \
6010 uint8_t *src, \
6011 ptrdiff_t src_stride, \
6012 int16_t *src_16bit, \
6013 int height, \
6014 int denom, \
6015 int weight0, \
6016 int weight1, \
6017 int offset0, \
6018 int offset1, \
6019 intptr_t mx, \
6020 intptr_t my, \
6021 int width) \
6022 { \
6023 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
6024 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
6025 int log2Wd = denom + 14 - 8; \
6026 \
6027 hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
6028 MAX_PB_SIZE, dst, dst_stride, \
6029 filter_x, filter_y, height, \
6030 weight0, weight1, offset0, \
6031 offset1, log2Wd); \
6032 }
6033
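/* For instance, BI_W_MC_HV(epel, 4, 4) emits
 * ff_hevc_put_hevc_bi_w_epel_hv4_8_msa(), which picks the 4-tap x/y
 * filters from ff_hevc_epel_filters[] and forwards to
 * hevc_hv_biwgt_4t_4w_msa(). */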
6034 BI_W_MC_HV(qpel, 4, 8);
6035 BI_W_MC_HV(qpel, 8, 8);
6036 BI_W_MC_HV(qpel, 12, 8);
6037 BI_W_MC_HV(qpel, 16, 8);
6038 BI_W_MC_HV(qpel, 24, 8);
6039 BI_W_MC_HV(qpel, 32, 8);
6040 BI_W_MC_HV(qpel, 48, 8);
6041 BI_W_MC_HV(qpel, 64, 8);
6042
6043 BI_W_MC_HV(epel, 4, 4);
6044 BI_W_MC_HV(epel, 8, 4);
6045 BI_W_MC_HV(epel, 6, 4);
6046 BI_W_MC_HV(epel, 12, 4);
6047 BI_W_MC_HV(epel, 16, 4);
6048 BI_W_MC_HV(epel, 24, 4);
6049 BI_W_MC_HV(epel, 32, 4);
6050
6051 #undef BI_W_MC_HV
6052