/*
 * Copyright (c) 2018 gxw <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <string.h>   /* memset() */

#include "vp3dsp_mips.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "libavutil/intreadwrite.h"
#include "libavcodec/rnd_avg.h"

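/* 8x8 inverse DCT used by the VP3/Theora decoder: two 1-D passes with a
 * transpose in between, all kept in MSA vector registers.
 * type == 1: "put"  - pixels are written directly and biased by +128
 *                     (see the 16 * 128 adjustment in the second pass).
 * type == 2: "add"  - the reconstructed values are added to the pixels
 *                     already present in dst.
 * The call sites are ff_vp3_idct_put_msa() and ff_vp3_idct_add_msa() below. */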
static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
{
    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
    v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
          r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
    v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
    v4i32 Ed, Gd, Add, Bdd, Fd, Hd;
    v16u8 sign_l;
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 f0, f1, f2, f3, f4, f5, f6, f7;
    v4i32 sign_t;
    v16i8 zero = {0};
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
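    /* 1-D IDCT constants: round(cos(n * pi / 16) * 65536) for n = 1..7;
     * 46341 is C4 = sqrt(2) / 2 in the same 16.16 format.  8 is the rounding
     * term applied before the final >> 4; 2048 (= 16 * 128) and 128 are the
     * output biases used by the "put" path. */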
    v4i32 cnst64277w = {64277, 64277, 64277, 64277};
    v4i32 cnst60547w = {60547, 60547, 60547, 60547};
    v4i32 cnst54491w = {54491, 54491, 54491, 54491};
    v4i32 cnst46341w = {46341, 46341, 46341, 46341};
    v4i32 cnst36410w = {36410, 36410, 36410, 36410};
    v4i32 cnst25080w = {25080, 25080, 25080, 25080};
    v4i32 cnst12785w = {12785, 12785, 12785, 12785};
    v4i32 cnst8w = {8, 8, 8, 8};
    v4i32 cnst2048w = {2048, 2048, 2048, 2048};
    v4i32 cnst128w = {128, 128, 128, 128};

    /* Extended input data */
    LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
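    /* __msa_clti_s_h(r, 0) yields an all-ones mask for negative elements;
     * interleaving it with each loaded vector sign-extends the 16-bit
     * coefficients to 32 bits: elements 0-3 into *_r, elements 4-7 into *_l. */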
    sign = __msa_clti_s_h(r0, 0);
    r0_r = (v4i32) __msa_ilvr_h(sign, r0);
    r0_l = (v4i32) __msa_ilvl_h(sign, r0);
    sign = __msa_clti_s_h(r1, 0);
    r1_r = (v4i32) __msa_ilvr_h(sign, r1);
    r1_l = (v4i32) __msa_ilvl_h(sign, r1);
    sign = __msa_clti_s_h(r2, 0);
    r2_r = (v4i32) __msa_ilvr_h(sign, r2);
    r2_l = (v4i32) __msa_ilvl_h(sign, r2);
    sign = __msa_clti_s_h(r3, 0);
    r3_r = (v4i32) __msa_ilvr_h(sign, r3);
    r3_l = (v4i32) __msa_ilvl_h(sign, r3);
    sign = __msa_clti_s_h(r4, 0);
    r4_r = (v4i32) __msa_ilvr_h(sign, r4);
    r4_l = (v4i32) __msa_ilvl_h(sign, r4);
    sign = __msa_clti_s_h(r5, 0);
    r5_r = (v4i32) __msa_ilvr_h(sign, r5);
    r5_l = (v4i32) __msa_ilvl_h(sign, r5);
    sign = __msa_clti_s_h(r6, 0);
    r6_r = (v4i32) __msa_ilvr_h(sign, r6);
    r6_l = (v4i32) __msa_ilvl_h(sign, r6);
    sign = __msa_clti_s_h(r7, 0);
    r7_r = (v4i32) __msa_ilvr_h(sign, r7);
    r7_l = (v4i32) __msa_ilvl_h(sign, r7);

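    /* First 1-D pass of the 8-point IDCT, applied to all lanes of the eight
     * loaded vectors at once; "Right" covers elements 0-3 (*_r), "Left"
     * elements 4-7 (*_l). */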
    /* Right part */
    A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16);
    D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r0_r + r4_r) * cnst46341w) >> 16;
    F = ((r0_r - r4_r) * cnst46341w) >> 16;
    G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    r0_r = Gd + Cd;
    r7_r = Gd - Cd;
    r1_r = Add + Hd;
    r2_r = Add - Hd;
    r3_r = Ed + Dd;
    r4_r = Ed - Dd;
    r5_r = Fd + Bdd;
    r6_r = Fd - Bdd;

    /* Left part */
    A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r0_l + r4_l) * cnst46341w) >> 16;
    F = ((r0_l - r4_l) * cnst46341w) >> 16;
    G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    r0_l = Gd + Cd;
    r7_l = Gd - Cd;
    r1_l = Add + Hd;
    r2_l = Add - Hd;
    r3_l = Ed + Dd;
    r4_l = Ed - Dd;
    r5_l = Fd + Bdd;
    r6_l = Fd - Bdd;

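    /* Second 1-D pass: the 4x4 sub-blocks are transposed so the same butterfly
     * can run along the other dimension.  The rounding term (+8), the put-mode
     * bias and the clamp to 0..255 are folded into this pass. */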
    /* Row 0 to 3 */
    TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r,
                       r0_r, r1_r, r2_r, r3_r);
    TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l,
                       r0_l, r1_l, r2_l, r3_l);
    A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16);
    D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r0_r + r0_l) * cnst46341w) >> 16;
    E += cnst8w;
    F = ((r0_r - r0_l) * cnst46341w) >> 16;
    F += cnst8w;
    if (type == 1) { // HACK: 16 * 128 bias for the "put" path (+128 after >> 4)
        E += cnst2048w;
        F += cnst2048w;
    }
    G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    A = (Gd + Cd) >> 4;
    B = (Gd - Cd) >> 4;
    C = (Add + Hd) >> 4;
    D = (Add - Hd) >> 4;
    E = (Ed + Dd) >> 4;
    F = (Ed - Dd) >> 4;
    G = (Fd + Bdd) >> 4;
    H = (Fd - Bdd) >> 4;
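    /* "add" variant: widen the current dst pixels to 32 bits and add them to
     * the reconstructed values before clamping. */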
    if (type != 1) {
        LD_SB8(dst, stride, d0, d1, d2, d3, d4, d5, d6, d7);
        ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
                   f0, f1, f2, f3);
        ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
                   f4, f5, f6, f7);
        ILVR_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
                   c0, c1, c2, c3);
        ILVR_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
                   c4, c5, c6, c7);
        A += c0;
        B += c7;
        C += c1;
        D += c2;
        E += c3;
        F += c4;
        G += c5;
        H += c6;
    }
    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
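    /* DC-only shortcut: sign_t is all-ones in lanes where the seven other
     * coefficients feeding this pass are zero.  For those lanes the cheaper
     * (dc * C4 + (8 << 16)) >> 20 result (plus 128 in the put case) is
     * selected instead of the full butterfly output. */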
    sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r1_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r2_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r3_l);
    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
    Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
    if (type == 1) {
        Bdd = Add + cnst128w;
        CLIP_SW_0_255(Bdd);
        Ad = Bdd;
        Bd = Bdd;
        Cd = Bdd;
        Dd = Bdd;
        Ed = Bdd;
        Fd = Bdd;
        Gd = Bdd;
        Hd = Bdd;
    } else {
        Ad = Add + c0;
        Bd = Add + c1;
        Cd = Add + c2;
        Dd = Add + c3;
        Ed = Add + c4;
        Fd = Add + c5;
        Gd = Add + c6;
        Hd = Add + c7;
        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
    }
    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
    r0_r = Ad + A;
    r1_r = Bd + C;
    r2_r = Cd + D;
    r3_r = Dd + E;
    r0_l = Ed + F;
    r1_l = Fd + G;
    r2_l = Gd + H;
    r3_l = Hd + B;

    /* Row 4 to 7 */
    TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
                       r4_r, r5_r, r6_r, r7_r);
    TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l,
                       r4_l, r5_l, r6_l, r7_l);
    A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;
    Cd = A + C;
    Dd = B + D;
    E = ((r4_r + r4_l) * cnst46341w) >> 16;
    E += cnst8w;
    F = ((r4_r - r4_l) * cnst46341w) >> 16;
    F += cnst8w;
    if (type == 1) { // HACK: same 16 * 128 "put" bias as above
        E += cnst2048w;
        F += cnst2048w;
    }
    G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
    Ed = E - G;
    Gd = E + G;
    Add = F + Ad;
    Bdd = Bd - H;
    Fd = F - Ad;
    Hd = Bd + H;
    A = (Gd + Cd) >> 4;
    B = (Gd - Cd) >> 4;
    C = (Add + Hd) >> 4;
    D = (Add - Hd) >> 4;
    E = (Ed + Dd) >> 4;
    F = (Ed - Dd) >> 4;
    G = (Fd + Bdd) >> 4;
    H = (Fd - Bdd) >> 4;
    if (type != 1) {
        ILVL_H4_SW(zero, f0, zero, f1, zero, f2, zero, f3,
                   c0, c1, c2, c3);
        ILVL_H4_SW(zero, f4, zero, f5, zero, f6, zero, f7,
                   c4, c5, c6, c7);
        A += c0;
        B += c7;
        C += c1;
        D += c2;
        E += c3;
        F += c4;
        G += c5;
        H += c6;
    }
    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
    sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r5_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r6_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r7_l);
    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
    Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
    if (type == 1) {
        Bdd = Add + cnst128w;
        CLIP_SW_0_255(Bdd);
        Ad = Bdd;
        Bd = Bdd;
        Cd = Bdd;
        Dd = Bdd;
        Ed = Bdd;
        Fd = Bdd;
        Gd = Bdd;
        Hd = Bdd;
    } else {
        Ad = Add + c0;
        Bd = Add + c1;
        Cd = Add + c2;
        Dd = Add + c3;
        Ed = Add + c4;
        Fd = Add + c5;
        Gd = Add + c6;
        Hd = Add + c7;
        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
    }
    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
    r4_r = Ad + A;
    r5_r = Bd + C;
    r6_r = Cd + D;
    r7_r = Dd + E;
    r4_l = Ed + F;
    r5_l = Fd + G;
    r6_l = Gd + H;
    r7_l = Hd + B;
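    /* Pack the clamped 32-bit results down to bytes: each d* vector collects
     * the eight values that are stored to one line of dst below. */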
    VSHF_B2_SB(r0_r, r4_r, r1_r, r5_r, mask, mask, d0, d1);
    VSHF_B2_SB(r2_r, r6_r, r3_r, r7_r, mask, mask, d2, d3);
    VSHF_B2_SB(r0_l, r4_l, r1_l, r5_l, mask, mask, d4, d5);
    VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);

    /* Final sequence of operations overwrites the original dst */
    ST_D1(d0, 0, dst);
    ST_D1(d1, 0, dst + stride);
    ST_D1(d2, 0, dst + 2 * stride);
    ST_D1(d3, 0, dst + 3 * stride);
    ST_D1(d4, 0, dst + 4 * stride);
    ST_D1(d5, 0, dst + 5 * stride);
    ST_D1(d6, 0, dst + 6 * stride);
    ST_D1(d7, 0, dst + 7 * stride);
}

void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 1);
    memset(block, 0, sizeof(*block) * 64);
}

void ff_vp3_idct_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct_msa(dest, line_size, block, 2);
    memset(block, 0, sizeof(*block) * 64);
}

void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    int i = (block[0] + 15) >> 5;
    v4i32 dc = {i, i, i, i};
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 e0, e1, e2, e3, e4, e5, e6, e7;
    v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    v16i8 zero = {0};

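    /* Widen the 8x8 destination block to 32 bits, add the rounded DC term to
     * every pixel, clamp to 0..255 and pack the bytes back. */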
    LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
    ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
               c0, c1, c2, c3);
    ILVR_B4_SW(zero, d4, zero, d5, zero, d6, zero, d7,
               c4, c5, c6, c7);
    /* Right part */
    ILVR_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
               e0, e1, e2, e3);
    ILVR_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
               e4, e5, e6, e7);
    e0 += dc;
    e1 += dc;
    e2 += dc;
    e3 += dc;
    e4 += dc;
    e5 += dc;
    e6 += dc;
    e7 += dc;
    CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7);

    /* Left part */
    ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
               r0, r1, r2, r3);
    ILVL_H4_SW(zero, c4, zero, c5, zero, c6, zero, c7,
               r4, r5, r6, r7);
    r0 += dc;
    r1 += dc;
    r2 += dc;
    r3 += dc;
    r4 += dc;
    r5 += dc;
    r6 += dc;
    r7 += dc;
    CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7);
    VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
    VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
    VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
    VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);

    /* Final sequence of operations overwrites the original dest */
    ST_D1(d0, 0, dest);
    ST_D1(d1, 0, dest + line_size);
    ST_D1(d2, 0, dest + 2 * line_size);
    ST_D1(d3, 0, dest + 3 * line_size);
    ST_D1(d4, 0, dest + 4 * line_size);
    ST_D1(d5, 0, dest + 5 * line_size);
    ST_D1(d6, 0, dest + 6 * line_size);
    ST_D1(d7, 0, dest + 7 * line_size);

    block[0] = 0;
}

void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
    int nstride = -stride;
    v4i32 e0, e1, f0, f1, g0, g1;
    v16i8 zero = {0};
    v16i8 d0, d1, d2, d3;
    v8i16 c0, c1, c2, c3;
    v8i16 r0;
    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    int16_t temp_16[8];
    int temp_32[8];

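    /* For each of the eight columns the filter value is
     * bounding_values[((c0 - c3) + 3 * (c2 - c1) + 4) >> 3], where c0..c3 are
     * the lines at first_pixel - 2 * stride .. first_pixel + stride; it is
     * added to c1 and subtracted from c2. */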
    LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3);
    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
               c0, c1, c2, c3);
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
    r0 += cnst4h;
    r0 = r0 >> 3;
    /* Get filter_value from bounding_values one by one */
    ST_SH(r0, temp_16);
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
    f0 += e0;
    f1 += e1;
    g0 -= e0;
    g1 -= e1;
    CLIP_SW4_0_255(f0, f1, g0, g1);
    VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);

    /* Final move to first_pixel */
    ST_D1(d1, 0, first_pixel + nstride);
    ST_D1(d2, 0, first_pixel);
}

void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v8i16 c0, c1, c2, c3, c4, c5, c6, c7;
    v8i16 r0;
    v4i32 e0, e1, f0, f1, g0, g1;
    v16i8 zero = {0};
    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0};
    int16_t temp_16[8];
    int temp_32[8];

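    /* Same filter as above, applied along the rows: after the 8x8 transpose,
     * c0..c3 hold the pixel columns first_pixel[-2], [-1], [0] and [1] for all
     * eight lines. */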
    LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
    ILVR_B4_SH(zero, d0, zero, d1, zero, d2, zero, d3,
               c0, c1, c2, c3);
    ILVR_B4_SH(zero, d4, zero, d5, zero, d6, zero, d7,
               c4, c5, c6, c7);
    TRANSPOSE8x8_SH_SH(c0, c1, c2, c3, c4, c5, c6, c7,
                       c0, c1, c2, c3, c4, c5, c6, c7);
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
    r0 += cnst4h;
    r0 = r0 >> 3;

    /* Get filter_value from bounding_values one by one */
    ST_SH(r0, temp_16);
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
    ILVR_H2_SW(zero, c1, zero, c2, f0, g0);
    ILVL_H2_SW(zero, c1, zero, c2, f1, g1);
    f0 += e0;
    f1 += e1;
    g0 -= e0;
    g1 -= e1;
    CLIP_SW4_0_255(f0, f1, g0, g1);
    VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
    /* Final move to first_pixel */
    ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
    ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
}

void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
                                 const uint8_t *src2, ptrdiff_t stride, int h)
{
    if (h == 8) {
        v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
        v16i8 c0, c1, c2, c3;
        v4i32 a0, a1, a2, a3, b0, b1, b2, b3;
        v4i32 e0, e1, e2;
        v4i32 f0, f1, f2;
        v4u32 t0, t1, t2, t3;
        v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
        int32_t value = 0xfefefefe;
        v4i32 fmask = {value, value, value, value};

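        /* No-rounding byte-wise average, same formula as no_rnd_avg32() in
         * the scalar fallback below: (a & b) + (((a ^ b) & 0xfefefefe) >> 1). */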
        LD_SB8(src1, stride, d0, d1, d2, d3, d4, d5, d6, d7);
        VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
        VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
        a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
        a2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
        a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
        a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);

        LD_SB8(src2, stride, d0, d1, d2, d3, d4, d5, d6, d7);
        VSHF_B2_SB(d0, d1, d2, d3, mask, mask, c0, c1);
        VSHF_B2_SB(d4, d5, d6, d7, mask, mask, c2, c3);
        b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
        b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
        b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
        b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);

        e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0);
        e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask);
        t0 = ((v4u32)e0) >> 1;
        e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0);
        t0 = t0 + (v4u32)e2;

        e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1);
        e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask);
        t1 = ((v4u32)e1) >> 1;
        e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1);
        t1 = t1 + (v4u32)e2;

        f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2);
        f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask);
        t2 = ((v4u32)f0) >> 1;
        f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2);
        t2 = t2 + (v4u32)f2;

        f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3);
        f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask);
        t3 = ((v4u32)f1) >> 1;
        f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
        t3 = t3 + (v4u32)f2;

        ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
        ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
    } else {
        int i;

        for (i = 0; i < h; i++) {
            uint32_t a, b;

            a = AV_RN32(&src1[i * stride]);
            b = AV_RN32(&src2[i * stride]);
            AV_WN32A(&dst[i * stride], no_rnd_avg32(a, b));
            a = AV_RN32(&src1[i * stride + 4]);
            b = AV_RN32(&src2[i * stride + 4]);
            AV_WN32A(&dst[i * stride + 4], no_rnd_avg32(a, b));
        }
    }
}