/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp8dsp.h"
#include "vp8dsp_loongarch.h"
#include "libavutil/loongarch/loongson_intrinsics.h"

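/* VP8 4-tap loop filter applied to the two pixels on each side of an
 * inner edge.  Pixels are biased into the signed range (xor 0x80), the
 * filter value (clamp(p1 - q1) & hev) + 3 * (q0 - p0) is computed on
 * widened 16-bit halves and clipped back to 8 bits, then p0/q0 are
 * adjusted by (filt + 3) >> 3 and (filt + 4) >> 3, and p1/q1 by the
 * rounded half of that value where hev is not set. */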
#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out,  \
                           mask_in, hev_in)                             \
{                                                                       \
    __m128i p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;               \
    __m128i filt, filt1, filt2, cnst4b, cnst3b;                         \
    __m128i q0_sub_p0_l, q0_sub_p0_h, filt_h, filt_l, cnst3h;           \
                                                                        \
    p1_m = __lsx_vxori_b(p1_in_out, 0x80);                              \
    p0_m = __lsx_vxori_b(p0_in_out, 0x80);                              \
    q0_m = __lsx_vxori_b(q0_in_out, 0x80);                              \
    q1_m = __lsx_vxori_b(q1_in_out, 0x80);                              \
    filt = __lsx_vssub_b(p1_m, q1_m);                                   \
    filt = filt & hev_in;                                               \
                                                                        \
    q0_sub_p0 = __lsx_vsub_b(q0_m, p0_m);                               \
    filt_sign = __lsx_vslti_b(filt, 0);                                 \
                                                                        \
    cnst3h = __lsx_vreplgr2vr_h(3);                                     \
    q0_sub_p0_l = __lsx_vilvl_b(q0_sub_p0, q0_sub_p0);                  \
    q0_sub_p0_l = __lsx_vdp2_h_b(q0_sub_p0_l, cnst3h);                  \
    filt_l = __lsx_vilvl_b(filt_sign, filt);                            \
    filt_l = __lsx_vadd_h(filt_l, q0_sub_p0_l);                         \
    filt_l = __lsx_vsat_h(filt_l, 7);                                   \
                                                                        \
    q0_sub_p0_h = __lsx_vilvh_b(q0_sub_p0, q0_sub_p0);                  \
    q0_sub_p0_h = __lsx_vdp2_h_b(q0_sub_p0_h, cnst3h);                  \
    filt_h = __lsx_vilvh_b(filt_sign, filt);                            \
    filt_h = __lsx_vadd_h(filt_h, q0_sub_p0_h);                         \
    filt_h = __lsx_vsat_h(filt_h, 7);                                   \
                                                                        \
    filt = __lsx_vpickev_b(filt_h, filt_l);                             \
    filt = filt & mask_in;                                              \
    cnst4b = __lsx_vreplgr2vr_b(4);                                     \
    filt1 = __lsx_vsadd_b(filt, cnst4b);                                \
    filt1 = __lsx_vsrai_b(filt1, 3);                                    \
                                                                        \
    cnst3b = __lsx_vreplgr2vr_b(3);                                     \
    filt2 = __lsx_vsadd_b(filt, cnst3b);                                \
    filt2 = __lsx_vsrai_b(filt2, 3);                                    \
                                                                        \
    q0_m = __lsx_vssub_b(q0_m, filt1);                                  \
    q0_in_out = __lsx_vxori_b(q0_m, 0x80);                              \
    p0_m = __lsx_vsadd_b(p0_m, filt2);                                  \
    p0_in_out = __lsx_vxori_b(p0_m, 0x80);                              \
                                                                        \
    filt = __lsx_vsrari_b(filt1, 1);                                    \
    hev_in = __lsx_vxori_b(hev_in, 0xff);                               \
    filt = filt & hev_in;                                               \
                                                                        \
    q1_m = __lsx_vssub_b(q1_m, filt);                                   \
    q1_in_out = __lsx_vxori_b(q1_m, 0x80);                              \
    p1_m = __lsx_vsadd_b(p1_m, filt);                                   \
    p1_in_out = __lsx_vxori_b(p1_m, 0x80);                              \
}

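/* VP8 macroblock-edge filter: the common filter value is built from
 * clamp(p1 - q1) and 3 * (q0 - p0) and gated by the mask.  Where hev is
 * set, p0/q0 get the short-tap update ((filt + 3) >> 3, (filt + 4) >> 3);
 * elsewhere the three pixels on each side of the edge are corrected with
 * the (27 * w + 63) >> 7, (18 * w + 63) >> 7 and (9 * w + 63) >> 7 taps. */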
#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev)             \
{                                                                   \
    __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;                     \
    __m128i filt, q0_sub_p0, cnst4b, cnst3b;                        \
    __m128i u, filt1, filt2, filt_sign, q0_sub_p0_sign;             \
    __m128i q0_sub_p0_l, q0_sub_p0_h, filt_l, u_l, u_h, filt_h;     \
    __m128i cnst3h, cnst27h, cnst18h, cnst63h;                      \
                                                                    \
    cnst3h = __lsx_vreplgr2vr_h(3);                                 \
                                                                    \
    p2_m = __lsx_vxori_b(p2, 0x80);                                 \
    p1_m = __lsx_vxori_b(p1, 0x80);                                 \
    p0_m = __lsx_vxori_b(p0, 0x80);                                 \
    q0_m = __lsx_vxori_b(q0, 0x80);                                 \
    q1_m = __lsx_vxori_b(q1, 0x80);                                 \
    q2_m = __lsx_vxori_b(q2, 0x80);                                 \
                                                                    \
    filt = __lsx_vssub_b(p1_m, q1_m);                               \
    q0_sub_p0 = __lsx_vsub_b(q0_m, p0_m);                           \
    q0_sub_p0_sign = __lsx_vslti_b(q0_sub_p0, 0);                   \
    filt_sign = __lsx_vslti_b(filt, 0);                             \
                                                                    \
    /* right part */                                                \
    q0_sub_p0_l = __lsx_vilvl_b(q0_sub_p0_sign, q0_sub_p0);         \
    q0_sub_p0_l = __lsx_vmul_h(q0_sub_p0_l, cnst3h);                \
    filt_l = __lsx_vilvl_b(filt_sign, filt);                        \
    filt_l = __lsx_vadd_h(filt_l, q0_sub_p0_l);                     \
    filt_l = __lsx_vsat_h(filt_l, 7);                               \
                                                                    \
    /* left part */                                                 \
    q0_sub_p0_h = __lsx_vilvh_b(q0_sub_p0_sign, q0_sub_p0);         \
    q0_sub_p0_h = __lsx_vmul_h(q0_sub_p0_h, cnst3h);                \
    filt_h = __lsx_vilvh_b(filt_sign, filt);                        \
    filt_h = __lsx_vadd_h(filt_h, q0_sub_p0_h);                     \
    filt_h = __lsx_vsat_h(filt_h, 7);                               \
                                                                    \
    /* combine left and right part */                               \
    filt = __lsx_vpickev_b(filt_h, filt_l);                         \
    filt = filt & mask;                                             \
    filt2 = filt & hev;                                             \
    /* filt_val &= ~hev */                                          \
    hev = __lsx_vxori_b(hev, 0xff);                                 \
    filt = filt & hev;                                              \
    cnst4b = __lsx_vreplgr2vr_b(4);                                 \
    filt1 = __lsx_vsadd_b(filt2, cnst4b);                           \
    filt1 = __lsx_vsrai_b(filt1, 3);                                \
    cnst3b = __lsx_vreplgr2vr_b(3);                                 \
    filt2 = __lsx_vsadd_b(filt2, cnst3b);                           \
    filt2 = __lsx_vsrai_b(filt2, 3);                                \
    q0_m = __lsx_vssub_b(q0_m, filt1);                              \
    p0_m = __lsx_vsadd_b(p0_m, filt2);                              \
                                                                    \
    filt_sign = __lsx_vslti_b(filt, 0);                             \
    filt_l = __lsx_vilvl_b(filt_sign, filt);                        \
    filt_h = __lsx_vilvh_b(filt_sign, filt);                        \
                                                                    \
    cnst27h = __lsx_vreplgr2vr_h(27);                               \
    cnst63h = __lsx_vreplgr2vr_h(63);                               \
                                                                    \
    /* right part */                                                \
    u_l = __lsx_vmul_h(filt_l, cnst27h);                            \
    u_l = __lsx_vadd_h(u_l, cnst63h);                               \
    u_l = __lsx_vsrai_h(u_l, 7);                                    \
    u_l = __lsx_vsat_h(u_l, 7);                                     \
    /* left part */                                                 \
    u_h = __lsx_vmul_h(filt_h, cnst27h);                            \
    u_h = __lsx_vadd_h(u_h, cnst63h);                               \
    u_h = __lsx_vsrai_h(u_h, 7);                                    \
    u_h = __lsx_vsat_h(u_h, 7);                                     \
    /* combine left and right part */                               \
    u = __lsx_vpickev_b(u_h, u_l);                                  \
    q0_m = __lsx_vssub_b(q0_m, u);                                  \
    q0 = __lsx_vxori_b(q0_m, 0x80);                                 \
    p0_m = __lsx_vsadd_b(p0_m, u);                                  \
    p0 = __lsx_vxori_b(p0_m, 0x80);                                 \
    cnst18h = __lsx_vreplgr2vr_h(18);                               \
    u_l = __lsx_vmul_h(filt_l, cnst18h);                            \
    u_l = __lsx_vadd_h(u_l, cnst63h);                               \
    u_l = __lsx_vsrai_h(u_l, 7);                                    \
    u_l = __lsx_vsat_h(u_l, 7);                                     \
                                                                    \
    /* left part */                                                 \
    u_h = __lsx_vmul_h(filt_h, cnst18h);                            \
    u_h = __lsx_vadd_h(u_h, cnst63h);                               \
    u_h = __lsx_vsrai_h(u_h, 7);                                    \
    u_h = __lsx_vsat_h(u_h, 7);                                     \
    /* combine left and right part */                               \
    u = __lsx_vpickev_b(u_h, u_l);                                  \
    q1_m = __lsx_vssub_b(q1_m, u);                                  \
    q1 = __lsx_vxori_b(q1_m, 0x80);                                 \
    p1_m = __lsx_vsadd_b(p1_m, u);                                  \
    p1 = __lsx_vxori_b(p1_m, 0x80);                                 \
    u_l = __lsx_vslli_h(filt_l, 3);                                 \
    u_l = __lsx_vadd_h(u_l, filt_l);                                \
    u_l = __lsx_vadd_h(u_l, cnst63h);                               \
    u_l = __lsx_vsrai_h(u_l, 7);                                    \
    u_l = __lsx_vsat_h(u_l, 7);                                     \
                                                                    \
    /* left part */                                                 \
    u_h = __lsx_vslli_h(filt_h, 3);                                 \
    u_h = __lsx_vadd_h(u_h, filt_h);                                \
    u_h = __lsx_vadd_h(u_h, cnst63h);                               \
    u_h = __lsx_vsrai_h(u_h, 7);                                    \
    u_h = __lsx_vsat_h(u_h, 7);                                     \
    /* combine left and right part */                               \
    u = __lsx_vpickev_b(u_h, u_l);                                  \
    q2_m = __lsx_vssub_b(q2_m, u);                                  \
    q2 = __lsx_vxori_b(q2_m, 0x80);                                 \
    p2_m = __lsx_vsadd_b(p2_m, u);                                  \
    p2 = __lsx_vxori_b(p2_m, 0x80);                                 \
}

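/* Computes the filter mask and high-edge-variance (hev) flags for one
 * edge: hev is set where |p1 - p0| or |q1 - q0| exceeds thresh, and mask
 * is set (0xff) where 2 * |p0 - q0| + |p1 - q1| / 2 is within b_limit
 * and all neighbouring pixel differences are within limit. */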
#define LPF_MASK_HEV(p3_src, p2_src, p1_src, p0_src,                \
                     q0_src, q1_src, q2_src, q3_src,                \
                     limit_src, b_limit_src, thresh_src,            \
                     hev_dst, mask_dst, flat_dst)                   \
{                                                                   \
    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
                                                                    \
    /* absolute subtraction of pixel values */                      \
    p3_asub_p2_m = __lsx_vabsd_bu(p3_src, p2_src);                  \
    p2_asub_p1_m = __lsx_vabsd_bu(p2_src, p1_src);                  \
    p1_asub_p0_m = __lsx_vabsd_bu(p1_src, p0_src);                  \
    q1_asub_q0_m = __lsx_vabsd_bu(q1_src, q0_src);                  \
    q2_asub_q1_m = __lsx_vabsd_bu(q2_src, q1_src);                  \
    q3_asub_q2_m = __lsx_vabsd_bu(q3_src, q2_src);                  \
    p0_asub_q0_m = __lsx_vabsd_bu(p0_src, q0_src);                  \
    p1_asub_q1_m = __lsx_vabsd_bu(p1_src, q1_src);                  \
                                                                    \
    /* calculation of hev */                                        \
    flat_dst = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);           \
    hev_dst = __lsx_vslt_bu(thresh_src, flat_dst);                  \
    /* calculation of mask */                                       \
    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);      \
    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                  \
    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);      \
    mask_dst = __lsx_vslt_bu(b_limit_src, p0_asub_q0_m);            \
    mask_dst = __lsx_vmax_bu(flat_dst, mask_dst);                   \
    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);       \
    mask_dst = __lsx_vmax_bu(p3_asub_p2_m, mask_dst);               \
    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);       \
    mask_dst = __lsx_vmax_bu(q2_asub_q1_m, mask_dst);               \
    mask_dst = __lsx_vslt_bu(limit_src, mask_dst);                  \
    mask_dst = __lsx_vxori_b(mask_dst, 0xff);                       \
}

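/* Stores 6 bytes of one filtered row: a 32-bit element of in0 at pdst
 * followed by a 16-bit element of in1 at pdst + stride (callers pass
 * stride = 4, so the two stores are contiguous). */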
#define VP8_ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride)      \
{                                                                   \
    __lsx_vstelm_w(in0, pdst, 0, in0_idx);                          \
    __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx);                 \
}

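/* Stores four 32-bit elements of 'in' to four consecutive rows; pdst is
 * left advanced by 4 * stride. */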
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)     \
{                                                           \
    __lsx_vstelm_w(in, pdst, 0, idx0);                      \
    pdst += stride;                                         \
    __lsx_vstelm_w(in, pdst, 0, idx1);                      \
    pdst += stride;                                         \
    __lsx_vstelm_w(in, pdst, 0, idx2);                      \
    pdst += stride;                                         \
    __lsx_vstelm_w(in, pdst, 0, idx3);                      \
    pdst += stride;                                         \
}

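/* Macroblock loop filter across a horizontal edge, 16 pixels wide: the
 * four rows above and below dst are loaded, mask/hev are computed and
 * the macroblock filter is applied, then the six modified rows are
 * stored back. */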
void ff_vp8_v_loop_filter16_lsx(uint8_t *dst, ptrdiff_t stride, int b_limit_in,
                                int limit_in, int thresh_in)
{
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i mask, hev, flat, thresh, limit, b_limit;

    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;

    b_limit = __lsx_vreplgr2vr_b(b_limit_in);
    limit = __lsx_vreplgr2vr_b(limit_in);
    thresh = __lsx_vreplgr2vr_b(thresh_in);

    /* load vector elements */
    DUP4_ARG2(__lsx_vld, dst - stride4, 0, dst - stride3, 0, dst - stride2, 0,
              dst - stride, 0, p3, p2, p1, p0);
    DUP4_ARG2(__lsx_vld, dst, 0, dst + stride, 0, dst + stride2, 0,
              dst + stride3, 0, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    /* store vector elements */
    __lsx_vst(p2, dst - stride3, 0);
    __lsx_vst(p1, dst - stride2, 0);
    __lsx_vst(p0, dst - stride,  0);
    __lsx_vst(q0, dst,           0);

    __lsx_vst(q1, dst + stride,  0);
    __lsx_vst(q2, dst + stride2, 0);
}

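/* The same horizontal-edge macroblock filter for the two 8-pixel wide
 * chroma planes: the U rows are packed into the low half and the V rows
 * into the high half of each vector so both planes are filtered in one
 * pass. */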
void ff_vp8_v_loop_filter8uv_lsx(uint8_t *dst_u, uint8_t *dst_v,
                                 ptrdiff_t stride, int b_limit_in,
                                 int limit_in, int thresh_in)
{
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i mask, hev, flat, thresh, limit, b_limit;
    __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
    __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;

    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;

    b_limit = __lsx_vreplgr2vr_b(b_limit_in);
    limit = __lsx_vreplgr2vr_b(limit_in);
    thresh = __lsx_vreplgr2vr_b(thresh_in);

    DUP4_ARG2(__lsx_vld, dst_u - stride4, 0, dst_u - stride3, 0, dst_u - stride2, 0,
              dst_u - stride, 0, p3_u, p2_u, p1_u, p0_u);
    DUP4_ARG2(__lsx_vld, dst_u, 0, dst_u + stride, 0, dst_u + stride2, 0,
              dst_u + stride3, 0, q0_u, q1_u, q2_u, q3_u);

    DUP4_ARG2(__lsx_vld, dst_v - stride4, 0, dst_v - stride3, 0, dst_v - stride2, 0,
              dst_v - stride, 0, p3_v, p2_v, p1_v, p0_v);
    DUP4_ARG2(__lsx_vld, dst_v, 0, dst_v + stride, 0, dst_v + stride2, 0,
              dst_v + stride3, 0, q0_v, q1_v, q2_v, q3_v);

    /* the right 8 elements of p3 are u pixels and the left 8 elements are v pixels */
    DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
    DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    __lsx_vstelm_d(p2, dst_u - stride3, 0, 0);
    __lsx_vstelm_d(p1, dst_u - stride2, 0, 0);
    __lsx_vstelm_d(p0, dst_u - stride , 0, 0);
    __lsx_vstelm_d(q0, dst_u,           0, 0);

    __lsx_vstelm_d(q1, dst_u + stride,  0, 0);
    __lsx_vstelm_d(q2, dst_u + stride2, 0, 0);

    __lsx_vstelm_d(p2, dst_v - stride3, 0, 1);
    __lsx_vstelm_d(p1, dst_v - stride2, 0, 1);
    __lsx_vstelm_d(p0, dst_v - stride , 0, 1);
    __lsx_vstelm_d(q0, dst_v,           0, 1);

    __lsx_vstelm_d(q1, dst_v + stride,  0, 1);
    __lsx_vstelm_d(q2, dst_v + stride2, 0, 1);
}

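/* Macroblock loop filter across a vertical edge, 16 pixels high: 16 rows
 * around the edge are loaded and transposed so the same filter code can
 * be reused, then the six modified columns are interleaved back and
 * stored 6 bytes per row starting at dst - 3. */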
void ff_vp8_h_loop_filter16_lsx(uint8_t *dst, ptrdiff_t stride, int b_limit_in,
                                int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i mask, hev, flat, thresh, limit, b_limit;
    __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
    __m128i row9, row10, row11, row12, row13, row14, row15;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;

    b_limit = __lsx_vreplgr2vr_b(b_limit_in);
    limit = __lsx_vreplgr2vr_b(limit_in);
    thresh = __lsx_vreplgr2vr_b(thresh_in);

    temp_src = dst - 4;
    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
              temp_src + stride3, 0, row0, row1, row2, row3);
    temp_src += stride4;
    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
              temp_src + stride3, 0, row4, row5, row6, row7);

    temp_src += stride4;
    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
              temp_src + stride3, 0, row8, row9, row10, row11);
    temp_src += stride4;
    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
              temp_src + stride3, 0, row12, row13, row14, row15);
    LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    tmp0 = __lsx_vilvl_b(p1, p2);
    tmp1 = __lsx_vilvl_b(q0, p0);

    tmp3 = __lsx_vilvl_h(tmp1, tmp0);
    tmp4 = __lsx_vilvh_h(tmp1, tmp0);

    tmp0 = __lsx_vilvh_b(p1, p2);
    tmp1 = __lsx_vilvh_b(q0, p0);

    tmp6 = __lsx_vilvl_h(tmp1, tmp0);
    tmp7 = __lsx_vilvh_h(tmp1, tmp0);

    tmp2 = __lsx_vilvl_b(q2, q1);
    tmp5 = __lsx_vilvh_b(q2, q1);

    temp_src = dst - 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, temp_src, 4);
    temp_src += stride;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, temp_src, 4);
}

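/* Vertical-edge macroblock filter for the chroma planes: 8 rows each of
 * U and V are loaded and transposed together into p3..q3, filtered, and
 * the six modified columns are written back 6 bytes per row. */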
void ff_vp8_h_loop_filter8uv_lsx(uint8_t *dst_u, uint8_t *dst_v,
                                 ptrdiff_t stride, int b_limit_in,
                                 int limit_in, int thresh_in)
{
    uint8_t *temp_src;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i mask, hev, flat, thresh, limit, b_limit;
    __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
    __m128i row9, row10, row11, row12, row13, row14, row15;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;

    b_limit = __lsx_vreplgr2vr_b(b_limit_in);
    limit = __lsx_vreplgr2vr_b(limit_in);
    thresh = __lsx_vreplgr2vr_b(thresh_in);

    temp_src = dst_u - 4;
    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
              temp_src + stride3, 0, row0, row1, row2, row3);
    temp_src += stride4;
    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
              temp_src + stride3, 0, row4, row5, row6, row7);

    temp_src = dst_v - 4;
    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
              temp_src + stride3, 0, row8, row9, row10, row11);
    temp_src += stride4;
    DUP4_ARG2(__lsx_vld, temp_src, 0, temp_src + stride, 0, temp_src + stride2, 0,
              temp_src + stride3, 0, row12, row13, row14, row15);

    LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
                        row8, row9, row10, row11, row12, row13, row14, row15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);

    tmp0 = __lsx_vilvl_b(p1, p2);
    tmp1 = __lsx_vilvl_b(q0, p0);

    tmp3 = __lsx_vilvl_h(tmp1, tmp0);
    tmp4 = __lsx_vilvh_h(tmp1, tmp0);

    tmp0 = __lsx_vilvh_b(p1, p2);
    tmp1 = __lsx_vilvh_b(q0, p0);

    tmp6 = __lsx_vilvl_h(tmp1, tmp0);
    tmp7 = __lsx_vilvh_h(tmp1, tmp0);

    tmp2 = __lsx_vilvl_b(q2, q1);
    tmp5 = __lsx_vilvh_b(q2, q1);

    dst_u -= 3;
    VP8_ST6x1_UB(tmp3, 0, tmp2, 0, dst_u, 4);
    dst_u += stride;
    VP8_ST6x1_UB(tmp3, 1, tmp2, 1, dst_u, 4);
    dst_u += stride;
    VP8_ST6x1_UB(tmp3, 2, tmp2, 2, dst_u, 4);
    dst_u += stride;
    VP8_ST6x1_UB(tmp3, 3, tmp2, 3, dst_u, 4);
    dst_u += stride;
    VP8_ST6x1_UB(tmp4, 0, tmp2, 4, dst_u, 4);
    dst_u += stride;
    VP8_ST6x1_UB(tmp4, 1, tmp2, 5, dst_u, 4);
    dst_u += stride;
    VP8_ST6x1_UB(tmp4, 2, tmp2, 6, dst_u, 4);
    dst_u += stride;
    VP8_ST6x1_UB(tmp4, 3, tmp2, 7, dst_u, 4);

    dst_v -= 3;
    VP8_ST6x1_UB(tmp6, 0, tmp5, 0, dst_v, 4);
    dst_v += stride;
    VP8_ST6x1_UB(tmp6, 1, tmp5, 1, dst_v, 4);
    dst_v += stride;
    VP8_ST6x1_UB(tmp6, 2, tmp5, 2, dst_v, 4);
    dst_v += stride;
    VP8_ST6x1_UB(tmp6, 3, tmp5, 3, dst_v, 4);
    dst_v += stride;
    VP8_ST6x1_UB(tmp7, 0, tmp5, 4, dst_v, 4);
    dst_v += stride;
    VP8_ST6x1_UB(tmp7, 1, tmp5, 5, dst_v, 4);
    dst_v += stride;
    VP8_ST6x1_UB(tmp7, 2, tmp5, 6, dst_v, 4);
    dst_v += stride;
    VP8_ST6x1_UB(tmp7, 3, tmp5, 7, dst_v, 4);
}

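/* Inner-edge filter across a horizontal edge: only p1, p0, q0 and q1 are
 * modified, using the 4-tap filter above.  e, i and h are the edge
 * limit, interior limit and hev threshold. */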
void ff_vp8_v_loop_filter16_inner_lsx(uint8_t *src, ptrdiff_t stride,
                                      int32_t e, int32_t i, int32_t h)
{
    __m128i mask, hev, flat;
    __m128i thresh, b_limit, limit;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;

    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;

    /* load vector elements */
    src -= stride4;
    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, p3, p2, p1, p0);
    src += stride4;
    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, q0, q1, q2, q3);
    thresh = __lsx_vreplgr2vr_b(h);
    b_limit = __lsx_vreplgr2vr_b(e);
    limit = __lsx_vreplgr2vr_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    __lsx_vst(p1, src - stride2, 0);
    __lsx_vst(p0, src - stride,  0);
    __lsx_vst(q0, src,           0);
    __lsx_vst(q1, src + stride,  0);
}

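/* Inner-edge filter across a vertical edge: 16 rows are loaded and
 * transposed, the 4-tap filter updates p1..q1, and the four modified
 * columns (p1, p0, q0, q1) are transposed back and stored as 4-byte
 * rows. */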
void ff_vp8_h_loop_filter16_inner_lsx(uint8_t *src, ptrdiff_t stride,
                                      int32_t e, int32_t i, int32_t h)
{
    __m128i mask, hev, flat;
    __m128i thresh, b_limit, limit;
    __m128i p3, p2, p1, p0, q3, q2, q1, q0;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;

    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    ptrdiff_t stride4 = stride2 << 1;

    src -= 4;
    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, tmp0, tmp1, tmp2, tmp3);
    src += stride4;
    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, tmp4, tmp5, tmp6, tmp7);
    src += stride4;
    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, tmp8, tmp9, tmp10, tmp11);
    src += stride4;
    DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, tmp12, tmp13, tmp14, tmp15);
    src -= 3 * stride4;

    LSX_TRANSPOSE16x8_B(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                        tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15,
                        p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = __lsx_vreplgr2vr_b(h);
    b_limit = __lsx_vreplgr2vr_b(e);
    limit = __lsx_vreplgr2vr_b(i);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);

    DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
    tmp2 = __lsx_vilvl_h(tmp1, tmp0);
    tmp3 = __lsx_vilvh_h(tmp1, tmp0);

    src += 2;
    ST_W4(tmp2, 0, 1, 2, 3, src, stride);
    ST_W4(tmp3, 0, 1, 2, 3, src, stride);

    DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
    tmp2 = __lsx_vilvl_h(tmp1, tmp0);
    tmp3 = __lsx_vilvh_h(tmp1, tmp0);

    ST_W4(tmp2, 0, 1, 2, 3, src, stride);
    ST_W4(tmp3, 0, 1, 2, 3, src, stride);
    src -= 4 * stride4;
}