/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vc1dsp_loongarch.h"
#include "libavutil/avassert.h"
#include "libavutil/loongarch/loongson_intrinsics.h"

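/* 8x8 inverse AC transform. The const_N vectors below pack the VC-1
 * 8-point transform coefficients (+/-12, 16, 15, 9, 6, 4) as int16 lane
 * pairs for the xvdp2/xvdp2add dot-product intrinsics; e.g. 0x000c000c
 * is the halfword pair {12, 12} and 0xfff4000c is {-12, 12} (mapping
 * read off the packed hex values). As in the reference
 * ff_vc1_inv_trans_8x8_c, the first pass rounds with +4 and >>3, the
 * second with +64 and >>7, plus an extra +1 on the lower four rows. */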
void ff_vc1_inv_trans_8x8_lasx(int16_t block[64])
{
    int32_t con_4    = 4;
    int32_t con_64   = 64;
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4, t5, t6, t7, t8;
    __m256i const_1  = {0x000c000c000c000c, 0x000c000c000c000c,
                        0x000c000c000c000c, 0x000c000c000c000c};
    __m256i const_2  = {0xfff4000cfff4000c, 0xfff4000cfff4000c,
                        0xfff4000cfff4000c, 0xfff4000cfff4000c};
    __m256i const_3  = {0x0006001000060010, 0x0006001000060010,
                        0x0006001000060010, 0x0006001000060010};
    __m256i const_4  = {0xfff00006fff00006, 0xfff00006fff00006,
                        0xfff00006fff00006, 0xfff00006fff00006};
    __m256i const_5  = {0x000f0010000f0010, 0x000f0010000f0010,
                        0x000f0010000f0010, 0x000f0010000f0010};
    __m256i const_6  = {0x0004000900040009, 0x0004000900040009,
                        0x0004000900040009, 0x0004000900040009};
    __m256i const_7  = {0xfffc000ffffc000f, 0xfffc000ffffc000f,
                        0xfffc000ffffc000f, 0xfffc000ffffc000f};
    __m256i const_8  = {0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0,
                        0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0};
    __m256i const_9  = {0xfff00009fff00009, 0xfff00009fff00009,
                        0xfff00009fff00009, 0xfff00009fff00009};
    __m256i const_10 = {0x000f0004000f0004, 0x000f0004000f0004,
                        0x000f0004000f0004, 0x000f0004000f0004};
    __m256i const_11 = {0xfff70004fff70004, 0xfff70004fff70004,
                        0xfff70004fff70004, 0xfff70004fff70004};
    __m256i const_12 = {0xfff0000ffff0000f, 0xfff0000ffff0000f,
                        0xfff0000ffff0000f, 0xfff0000ffff0000f};

    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    /* first pass */
    DUP2_ARG2(__lasx_xvilvl_h, in2, in0, in3, in1, temp0, temp1);
    t2 = __lasx_xvreplgr2vr_w(con_4);
    DUP2_ARG3(__lasx_xvdp2add_w_h, t2, temp0, const_1, t2, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    t5 = __lasx_xvadd_w(t1, t3);
    t6 = __lasx_xvadd_w(t2, t4);
    t7 = __lasx_xvsub_w(t2, t4);
    t8 = __lasx_xvsub_w(t1, t3);

    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, in3, in2, temp0, temp1);
    temp2 = __lasx_xvdp2_w_h(const_5, temp0);
    t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
    temp2 = __lasx_xvdp2_w_h(const_7, temp0);
    t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
    temp2 = __lasx_xvdp2_w_h(const_9, temp0);
    t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
    temp2 = __lasx_xvdp2_w_h(const_11, temp0);
    t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);

    DUP4_ARG2(__lasx_xvadd_w, t1, t5, t6, t2, t7, t3, t8, t4,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsub_w, t8, t4, t7, t3, t6, t2, t5, t1,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in2, 3, in3, 3,
              in0, in1, in2, in3);

    /* second pass */
    DUP4_ARG2(__lasx_xvpackev_h, temp1, temp0, temp3, temp2, in1, in0,
              in3, in2, temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvilvl_w, temp1, temp0, temp3, temp2, t1, t3);
    DUP2_ARG2(__lasx_xvilvh_w, temp1, temp0, temp3, temp2, t2, t4);
    DUP4_ARG3(__lasx_xvpermi_q, t3, t1, 0x20, t3, t1, 0x31, t4, t2, 0x20,
              t4, t2, 0x31, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in3, in2, temp0, temp1);
    t3    = __lasx_xvreplgr2vr_w(con_64);
    DUP2_ARG3(__lasx_xvdp2add_w_h, t3, temp0, const_1, t3, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    t5    = __lasx_xvadd_w(t1, t3);
    t6    = __lasx_xvadd_w(t2, t4);
    t7    = __lasx_xvsub_w(t2, t4);
    t8    = __lasx_xvsub_w(t1, t3);

    DUP2_ARG2(__lasx_xvilvh_h, in2, in0, in3, in1, temp0, temp1);
    temp2 = __lasx_xvdp2_w_h(const_5, temp0);
    t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
    temp2 = __lasx_xvdp2_w_h(const_7, temp0);
    t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
    temp2 = __lasx_xvdp2_w_h(const_9, temp0);
    t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
    temp2 = __lasx_xvdp2_w_h(const_11, temp0);
    t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);

    DUP4_ARG2(__lasx_xvadd_w, t5, t1, t6, t2, t7, t3, t8, t4,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsub_w, t8, t4, t7, t3, t6, t2, t5, t1,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvaddi_wu, in0, 1, in1, 1, in2, 1, in3, 1,
              in0, in1, in2, in3);
    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 7, temp3, temp2, 7,
              in1, in0, 7, in3, in2, 7, t1, t2, t3, t4);
    DUP4_ARG2(__lasx_xvpermi_d, t1, 0xD8, t2, 0xD8, t3, 0xD8, t4, 0xD8,
              in0, in1, in2, in3);
    __lasx_xvst(in0, block, 0);
    __lasx_xvst(in1, block, 32);
    __lasx_xvst(in2, block, 64);
    __lasx_xvst(in3, block, 96);
}

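/* DC-only 8x8 inverse transform: block[0] is scaled as in
 * ff_vc1_inv_trans_8x8_dc_c (dc = (3 * dc + 1) >> 1, then
 * dc = (3 * dc + 16) >> 5) and added to all 64 destination pixels;
 * xvssrarni_bu_h clamps the sums back to 8 bits. */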
void ff_vc1_inv_trans_8x8_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3, in4, in5, in6, in7;
    __m256i const_dc, temp0, temp1, temp2, temp3;
    __m256i reg0, reg1, reg2, reg3;

    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    const_dc = __lasx_xvreplgr2vr_h(dc);
    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvldrepl_d, dst, 0, dst + stride, 0, dst + stride2,
              0, dst + stride3, 0, in4, in5, in6, in7);

    DUP4_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, in5, in4, in7, in6,
              temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);

    DUP4_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, temp2,
              const_dc, temp3, const_dc, reg0, reg1, reg2, reg3);
    DUP2_ARG3(__lasx_xvssrarni_bu_h, reg1, reg0, 0, reg3, reg2, 0,
              temp0, temp1);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
    __lasx_xvstelm_d(temp1, dst, 0, 0);
    __lasx_xvstelm_d(temp1, dst + stride, 0, 2);
    __lasx_xvstelm_d(temp1, dst + stride2, 0, 1);
    __lasx_xvstelm_d(temp1, dst + stride3, 0, 3);
}

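/* 8x4 inverse transform: an 8-point pass over the rows (+4, >>3)
 * followed by a 4-point pass over the columns using the VC-1
 * coefficients {17, 22, 10} (+64, >>7), then add to dest and clip.
 * const_8..const_11 pack +/-{17, 22, 10} as int16 pairs, the same
 * coefficients as ff_vc1_inv_trans_8x4_c. */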
void ff_vc1_inv_trans_8x4_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    __m256i shift    = {0x0000000400000000, 0x0000000500000001,
                        0x0000000600000002, 0x0000000700000003};
    __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};
    __m256i const_1  = {0x00060010000C000C, 0x00060010000C000C,
                        0x00060010000C000C, 0x00060010000C000C};
    __m256i const_2  = {0xFFF00006FFF4000C, 0xFFF00006FFF4000C,
                        0xFFF00006FFF4000C, 0xFFF00006FFF4000C};
    __m256i const_3  = {0x0004000F00090010, 0x0004000F00090010,
                        0x0004000F00090010, 0x0004000F00090010};
    __m256i const_4  = {0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F,
                        0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F};
    __m256i const_5  = {0x000FFFF000040009, 0x000FFFF000040009,
                        0x000FFFF000040009, 0x000FFFF000040009};
    __m256i const_6  = {0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004,
                        0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004};
    __m256i const_7  = {0x0000000000000004, 0x0000000000000004,
                        0x0000000000000004, 0x0000000000000004};
    __m256i const_8  = {0x0011001100110011, 0x0011001100110011,
                        0x0011001100110011, 0x0011001100110011};
    __m256i const_9  = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
                        0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
    __m256i const_10 = {0x000A0016000A0016, 0x000A0016000A0016,
                        0x000A0016000A0016, 0x000A0016000A0016};
    __m256i const_11 = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
                        0x0016FFF60016FFF6, 0x0016FFF60016FFF6};
    __m256i in0, in1;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;

    DUP2_ARG2(__lasx_xvld, block, 0, block, 32, in0, in1);
    /* first pass */
    temp0 = __lasx_xvpermi_d(in0, 0xB1);
    temp1 = __lasx_xvpermi_d(in1, 0xB1);
    DUP2_ARG2(__lasx_xvilvl_h, temp0, in0, temp1, in1, temp0, temp1);
    temp2 = __lasx_xvpickev_w(temp1, temp0);
    temp3 = __lasx_xvpickod_w(temp1, temp0);

    DUP2_ARG2(__lasx_xvdp2_w_h, temp2, const_1, temp2, const_2, temp0, temp1);
    t1    = __lasx_xvadd_w(temp0, const_7);
    t2    = __lasx_xvadd_w(temp1, const_7);
    temp0 = __lasx_xvpickev_w(t2, t1);
    temp1 = __lasx_xvpickod_w(t2, t1);
    t3    = __lasx_xvadd_w(temp0, temp1);
    t4    = __lasx_xvsub_w(temp0, temp1);
    t4    = __lasx_xvpermi_d(t4, 0xB1);

    DUP4_ARG2(__lasx_xvdp4_d_h, temp3, const_3, temp3, const_4, temp3,
              const_5, temp3, const_6, t1, t2, temp0, temp1);
    temp2 = __lasx_xvpickev_w(t2, t1);
    temp3 = __lasx_xvpickev_w(temp1, temp0);

    t1    = __lasx_xvadd_w(temp2, t3);
    t2    = __lasx_xvadd_w(temp3, t4);
    temp0 = __lasx_xvsub_w(t4, temp3);
    temp1 = __lasx_xvsub_w(t3, temp2);
    /* second pass */
    DUP2_ARG3(__lasx_xvsrani_h_w, t2, t1, 3, temp1, temp0, 3, temp2, temp3);
    temp3 = __lasx_xvshuf4i_h(temp3, 0x4E);
    temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
    temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
    DUP2_ARG3(__lasx_xvdp2add_w_h, const_64, temp0, const_8, const_64, temp0,
              const_9, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_10, temp1, const_11, t3, t4);
    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvsub_w(t2, t4);
    temp2 = __lasx_xvadd_w(t2, t4);
    temp3 = __lasx_xvsub_w(t1, t3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
              t1, t2, t3, t4);

    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2, 0,
              dest + stride3, 0, temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_vext2xv_wu_bu, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvadd_w, temp0, t1, temp1, t2, temp2, t3, temp3, t4,
              t1, t2, t3, t4);
    DUP4_ARG1(__lasx_xvclip255_w, t1, t2, t3, t4, t1, t2, t3, t4);
    DUP2_ARG2(__lasx_xvpickev_h, t2, t1, t4, t3, temp0, temp1);
    temp2 = __lasx_xvpickev_b(temp1, temp0);
    temp0 = __lasx_xvperm_w(temp2, shift);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
}

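/* DC-only 8x4 inverse transform, dc scaling as in
 * ff_vc1_inv_trans_8x4_dc_c: (3 * dc + 1) >> 1, then (17 * dc + 64) >> 7. */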
void ff_vc1_inv_trans_8x4_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    __m256i in0, in1, in2, in3;
    __m256i const_dc, temp0, temp1, reg0, reg1;

    dc = (3  * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, temp0, temp1);
    DUP2_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp0, temp1);
    DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
    temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
}

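/* DC-only 4x8 inverse transform, dc scaling as in
 * ff_vc1_inv_trans_4x8_dc_c: (17 * dc + 4) >> 3, then (12 * dc + 64) >> 7. */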
void ff_vc1_inv_trans_4x8_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3, in4, in5, in6, in7;
    __m256i const_dc, temp0, temp1, temp2, temp3, reg0, reg1;

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvldrepl_w, dst, 0, dst + stride, 0, dst + stride2,
              0, dst + stride3, 0, in4, in5, in6, in7);

    DUP4_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, in5, in4, in7, in6,
              temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvilvl_d, temp1, temp0, temp3, temp2, reg0, reg1);
    DUP2_ARG1(__lasx_vext2xv_hu_bu, reg0, reg1, temp0, temp1);
    DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
    temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dest + stride, 0, 1);
    __lasx_xvstelm_w(temp0, dest + stride2, 0, 4);
    __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
    __lasx_xvstelm_w(temp0, dst, 0, 2);
    __lasx_xvstelm_w(temp0, dst + stride, 0, 3);
    __lasx_xvstelm_w(temp0, dst + stride2, 0, 6);
    __lasx_xvstelm_w(temp0, dst + stride3, 0, 7);
}

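/* 4x8 inverse transform: a 4-point pass over the rows with {17, 22, 10}
 * (+4, >>3), then an 8-point pass over the columns (+64, >>7). The
 * upper and lower 128-bit halves of const_7..const_12 carry different
 * coefficient pairs, so one dot product covers two butterfly terms. */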
void ff_vc1_inv_trans_4x8_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;

    __m256i const_1  = {0x0011001100110011, 0x0011001100110011,
                        0x0011001100110011, 0x0011001100110011};
    __m256i const_2  = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
                        0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
    __m256i const_3  = {0x000A0016000A0016, 0x000A0016000A0016,
                        0x000A0016000A0016, 0x000A0016000A0016};
    __m256i const_4  = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
                        0x0016FFF60016FFF6, 0x0016FFF60016FFF6};
    __m256i const_5  = {0x0000000400000004, 0x0000000400000004,
                        0x0000000400000004, 0x0000000400000004};
    __m256i const_6  = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};
    __m256i const_7  = {0x000C000C000C000C, 0x000C000C000C000C,
                        0xFFF4000CFFF4000C, 0xFFF4000CFFF4000C};
    __m256i const_8  = {0x0006001000060010, 0x0006001000060010,
                        0xFFF00006FFF00006, 0xFFF00006FFF00006};
    __m256i const_9  = {0x0009001000090010, 0x0009001000090010,
                        0x0004000F0004000F, 0x0004000F0004000F};
    __m256i const_10 = {0xFFF0000FFFF0000F, 0xFFF0000FFFF0000F,
                        0xFFF7FFFCFFF7FFFC, 0xFFF7FFFCFFF7FFFC};
    __m256i const_11 = {0x0004000900040009, 0x0004000900040009,
                        0x000FFFF0000FFFF0, 0x000FFFF0000FFFF0};
    __m256i const_12 = {0x000F0004000F0004, 0x000F0004000F0004,
                        0xFFF0FFF7FFF0FFF7, 0xFFF0FFF7FFF0FFF7};
    __m256i shift    = {0x0000000400000000, 0x0000000600000002,
                        0x0000000500000001, 0x0000000700000003};

    /* first pass */
    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
              in0, in1, in2, in3);
    in0   = __lasx_xvilvl_d(in1, in0);
    in1   = __lasx_xvilvl_d(in3, in2);
    temp0 = __lasx_xvpickev_h(in1, in0);
    temp1 = __lasx_xvpickod_h(in1, in0);
    temp0 = __lasx_xvperm_w(temp0, shift);
    temp1 = __lasx_xvperm_w(temp1, shift);

    DUP2_ARG3(__lasx_xvdp2add_w_h, const_5, temp0, const_1, const_5, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvsub_w(t2, t4);
    temp2 = __lasx_xvadd_w(t2, t4);
    temp3 = __lasx_xvsub_w(t1, t3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
              temp0, temp1, temp2, temp3);

    /* second pass */
    t1    = __lasx_xvpickev_w(temp1, temp0);
    t2    = __lasx_xvpickev_w(temp3, temp2);
    t1    = __lasx_xvpickev_h(t2, t1);
    t3    = __lasx_xvpickod_w(temp1, temp0);
    t4    = __lasx_xvpickod_w(temp3, temp2);
    temp1 = __lasx_xvpickev_h(t4, t3);
    temp2 = __lasx_xvpermi_q(t1, t1, 0x00);
    temp3 = __lasx_xvpermi_q(t1, t1, 0x11);
    t1 = __lasx_xvdp2add_w_h(const_6, temp2, const_7);
    t2 = __lasx_xvdp2_w_h(temp3, const_8);
    t3    = __lasx_xvadd_w(t1, t2);
    t4    = __lasx_xvsub_w(t1, t2);
    t4    = __lasx_xvpermi_d(t4, 0x4E);

    DUP4_ARG2(__lasx_xvdp2_w_h, temp1, const_9, temp1, const_10, temp1,
              const_11, temp1, const_12, t1, t2, temp2, temp3);

    temp0 = __lasx_xvpermi_q(t2, t1, 0x20);
    temp1 = __lasx_xvpermi_q(t2, t1, 0x31);
    t1    = __lasx_xvadd_w(temp0, temp1);
    temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
    temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
    t2    = __lasx_xvadd_w(temp1, temp0);
    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvadd_w(t2, t4);
    temp2 = __lasx_xvsub_w(t4, t2);
    temp3 = __lasx_xvsub_w(t3, t1);
    temp2 = __lasx_xvaddi_wu(temp2, 1);
    temp3 = __lasx_xvaddi_wu(temp3, 1);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
              temp0, temp1, temp2, temp3);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest + stride, 0, dest + stride2, 0,
              dest + stride3, 0, const_1, const_2, const_3, const_4);
    DUP4_ARG2(__lasx_xvldrepl_w, dst, 0, dst + stride, 0, dst + stride2, 0,
              dst + stride3, 0, const_5, const_6, const_7, const_8);

    DUP4_ARG2(__lasx_xvilvl_w, const_2, const_1, const_4, const_3, const_5,
              const_6, const_7, const_8, const_1, const_2, const_3, const_4);
    DUP4_ARG1(__lasx_vext2xv_wu_bu, const_1, const_2, const_3, const_4,
              const_1, const_2, const_3, const_4);
    DUP4_ARG2(__lasx_xvadd_w, temp0, const_1, temp1, const_2, temp2, const_3,
              temp3, const_4, temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvpickev_h, temp1, temp0, temp3, temp2, temp0, temp1);
    temp0   = __lasx_xvpickev_b(temp1, temp0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dest + stride, 0, 4);
    __lasx_xvstelm_w(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
    __lasx_xvstelm_w(temp0, dst, 0, 6);
    __lasx_xvstelm_w(temp0, dst + stride, 0, 2);
    __lasx_xvstelm_w(temp0, dst + stride2, 0, 7);
    __lasx_xvstelm_w(temp0, dst + stride3, 0, 3);
}

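/* DC-only 4x4 inverse transform, dc scaling as in
 * ff_vc1_inv_trans_4x4_dc_c: (17 * dc + 4) >> 3, then (17 * dc + 64) >> 7. */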
void ff_vc1_inv_trans_4x4_dc_lasx(uint8_t *dest, ptrdiff_t stride,
                                  int16_t *block)
{
    int dc = block[0];
    uint8_t *dst1 = dest + stride;
    uint8_t *dst2 = dst1 + stride;
    uint8_t *dst3 = dst2 + stride;
    __m256i in0, in1, in2, in3, temp0, temp1, const_dc;
    __m256i zero  = {0};

    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
              in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, temp0, temp1);
    in0   = __lasx_xvpermi_q(temp1, temp0, 0x20);
    temp0 = __lasx_xvilvl_b(zero, in0);
    in0   = __lasx_xvadd_h(temp0, const_dc);
    temp0 = __lasx_xvssrarni_bu_h(in0, in0, 0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dst1, 0, 1);
    __lasx_xvstelm_w(temp0, dst2, 0, 4);
    __lasx_xvstelm_w(temp0, dst3, 0, 5);
}

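/* 4x4 inverse transform: both passes use the 4-point coefficients
 * {17, 22, 10}, with +4/>>3 after the row pass and +64/>>7 after the
 * column pass, then add to dest and clip to 8 bits. */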
void ff_vc1_inv_trans_4x4_lasx(uint8_t *dest, ptrdiff_t stride, int16_t *block)
{
    uint8_t *dst1 = dest + stride;
    uint8_t *dst2 = dst1 + stride;
    uint8_t *dst3 = dst2 + stride;
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2;

    __m256i const_1  = {0x0011001100110011, 0xFFEF0011FFEF0011,
                        0x0011001100110011, 0xFFEF0011FFEF0011};
    __m256i const_2  = {0x000A0016000A0016, 0x0016FFF60016FFF6,
                        0x000A0016000A0016, 0x0016FFF60016FFF6};
    __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};

    DUP2_ARG2(__lasx_xvld, block, 0, block, 32, in0, in1);
    /* first pass */
    temp0 = __lasx_xvilvl_d(in1, in0);
    temp1 = __lasx_xvpickev_h(temp0, temp0);
    temp2 = __lasx_xvpickod_h(temp0, temp0);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_1, temp2, const_2, t1, t2);
    t1    = __lasx_xvaddi_wu(t1, 4);
    in0   = __lasx_xvadd_w(t1, t2);
    in1   = __lasx_xvsub_w(t1, t2);
    DUP2_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in0, in1);
    /* second pass */
    temp0   = __lasx_xvpickev_h(in1, in0);
    temp1   = __lasx_xvpermi_q(temp0, temp0, 0x00);
    temp2   = __lasx_xvpermi_q(temp0, temp0, 0x11);
    const_1 = __lasx_xvpermi_d(const_1, 0xD8);
    const_2 = __lasx_xvpermi_d(const_2, 0xD8);
    t1 = __lasx_xvdp2add_w_h(const_64, temp1, const_1);
    t2 = __lasx_xvdp2_w_h(temp2, const_2);
    in0     = __lasx_xvadd_w(t1, t2);
    in1     = __lasx_xvsub_w(t1, t2);
    DUP2_ARG2(__lasx_xvsrai_w, in0, 7, in1, 7, in0, in1);
    temp0   = __lasx_xvshuf4i_w(in0, 0x9C);
    temp1   = __lasx_xvshuf4i_w(in1, 0x9C);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
              in0, in1, in2, in3);
    temp2   = __lasx_xvilvl_w(in2, in0);
    temp2   = __lasx_vext2xv_wu_bu(temp2);
    temp3   = __lasx_xvilvl_w(in1, in3);
    temp3   = __lasx_vext2xv_wu_bu(temp3);
    temp0   = __lasx_xvadd_w(temp0, temp2);
    temp1   = __lasx_xvadd_w(temp1, temp3);
    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);
    temp1   = __lasx_xvpickev_h(temp1, temp0);
    temp0   = __lasx_xvpickev_b(temp1, temp1);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dst1, 0, 5);
    __lasx_xvstelm_w(temp0, dst2, 0, 4);
    __lasx_xvstelm_w(temp0, dst3, 0, 1);
}

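/* Combined horizontal + vertical quarter-pel interpolation for one 8x8
 * block. para_value holds the magnitudes of the VC-1 4-tap bicubic
 * filters (-4, 53, 18, -3), (-1, 9, 9, -1) and (-3, 18, 53, -4); the
 * negative outer taps are applied with xvdp2sub. As in the scalar
 * vc1_mspel_mc() in vc1dsp.c, the vertical pass runs first at 16-bit
 * precision with rounding r = (1 << (shift - 1)) + rnd - 1, then the
 * horizontal pass rounds with 64 - rnd and shifts right by 7. */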
static void put_vc1_mspel_mc_h_v_lasx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int hmode, int vmode,
                                      int rnd)
{
    __m256i in0, in1, in2, in3;
    __m256i t0, t1, t2, t3, t4, t5, t6, t7;
    __m256i temp0, temp1, const_para1_2, const_para0_3;
    __m256i const_r, const_sh;
    __m256i sh = {0x0000000400000000, 0x0000000500000001,
                  0x0000000600000002, 0x0000000700000003};
    static const uint8_t para_value[][4] = {{4, 3, 53, 18},
                                            {1, 1, 9, 9},
                                            {3, 4, 18, 53}};
    static const int shift_value[] = {0, 5, 1, 5};
    int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;
    int r     = (1 << (shift - 1)) + rnd - 1;
    const uint8_t *para_v = para_value[vmode - 1];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride4 = stride << 2;
    ptrdiff_t stride3 = stride2 + stride;

    const_r  = __lasx_xvreplgr2vr_h(r);
    const_sh = __lasx_xvreplgr2vr_h(shift);
    src -= stride + 1;
    const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
    const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
    DUP4_ARG2(__lasx_xvld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
    t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
    src  += stride4;
    in0   = __lasx_xvld(src, 0);
    in0   = __lasx_xvpermi_d(in0, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
    t1 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t1 = __lasx_xvdp2sub_h_bu(t1, temp1, const_para0_3);
    src  += stride;
    in1   = __lasx_xvld(src, 0);
    in1   = __lasx_xvpermi_d(in1, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
    t2 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t2 = __lasx_xvdp2sub_h_bu(t2, temp1, const_para0_3);
    src  += stride;
    in2   = __lasx_xvld(src, 0);
    in2   = __lasx_xvpermi_d(in2, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
    t3 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t3 = __lasx_xvdp2sub_h_bu(t3, temp1, const_para0_3);
    src  += stride;
    in3   = __lasx_xvld(src, 0);
    in3   = __lasx_xvpermi_d(in3, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
    t4 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t4 = __lasx_xvdp2sub_h_bu(t4, temp1, const_para0_3);
    src  += stride;
    in0   = __lasx_xvld(src, 0);
    in0   = __lasx_xvpermi_d(in0, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
    t5 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t5 = __lasx_xvdp2sub_h_bu(t5, temp1, const_para0_3);
    src  += stride;
    in1   = __lasx_xvld(src, 0);
    in1   = __lasx_xvpermi_d(in1, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
    t6 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t6 = __lasx_xvdp2sub_h_bu(t6, temp1, const_para0_3);
    src  += stride;
    in2   = __lasx_xvld(src, 0);
    in2   = __lasx_xvpermi_d(in2, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
    t7 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t7 = __lasx_xvdp2sub_h_bu(t7, temp1, const_para0_3);
    DUP4_ARG2(__lasx_xvadd_h, t0, const_r, t1, const_r, t2, const_r, t3,
              const_r, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvadd_h, t4, const_r, t5, const_r, t6, const_r, t7,
              const_r, t4, t5, t6, t7);
    DUP4_ARG2(__lasx_xvsra_h, t0, const_sh, t1, const_sh, t2, const_sh,
              t3, const_sh, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvsra_h, t4, const_sh, t5, const_sh, t6, const_sh,
              t7, const_sh, t4, t5, t6, t7);
    LASX_TRANSPOSE8x8_H(t0, t1, t2, t3, t4, t5, t6, t7, t0,
                        t1, t2, t3, t4, t5, t6, t7);
    para_v  = para_value[hmode - 1];
    const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
    const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
    const_para0_3 = __lasx_vext2xv_h_b(const_para0_3);
    const_para1_2 = __lasx_vext2xv_h_b(const_para1_2);
    r       = 64 - rnd;
    const_r = __lasx_xvreplgr2vr_w(r);
    DUP4_ARG2(__lasx_xvpermi_d, t0, 0x72, t1, 0x72, t2, 0x72, t0, 0xD8,
              in0, in1, in2, t0);
    DUP4_ARG2(__lasx_xvpermi_d, t1, 0xD8, t2, 0xD8, t3, 0xD8, t4, 0xD8,
              t1, t2, t3, t4);
    DUP2_ARG2(__lasx_xvpermi_d, t5, 0xD8, t6, 0xD8, t5, t6);
    t7      = __lasx_xvpermi_d(t7, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_h, t2, t1, t3, t0, temp0, temp1);
    t0 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t0 = __lasx_xvdp2sub_w_h(t0, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t3, t2, t4, t1, temp0, temp1);
    t1 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t1 = __lasx_xvdp2sub_w_h(t1, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t4, t3, t5, t2, temp0, temp1);
    t2 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t2 = __lasx_xvdp2sub_w_h(t2, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t5, t4, t6, t3, temp0, temp1);
    t3 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t3 = __lasx_xvdp2sub_w_h(t3, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t6, t5, t7, t4, temp0, temp1);
    t4 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t4 = __lasx_xvdp2sub_w_h(t4, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, t7, t6, in0, t5, temp0, temp1);
    t5 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t5 = __lasx_xvdp2sub_w_h(t5, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, in0, t7, in1, t6, temp0, temp1);
    t6 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t6 = __lasx_xvdp2sub_w_h(t6, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in2, t7, temp0, temp1);
    t7 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t7 = __lasx_xvdp2sub_w_h(t7, temp1, const_para0_3);
    DUP4_ARG2(__lasx_xvadd_w, t0, const_r, t1, const_r, t2, const_r,
              t3, const_r, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvadd_w, t4, const_r, t5, const_r, t6, const_r,
              t7, const_r, t4, t5, t6, t7);
    DUP4_ARG2(__lasx_xvsrai_w, t0, 7, t1, 7, t2, 7, t3, 7, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvsrai_w, t4, 7, t5, 7, t6, 7, t7, 7, t4, t5, t6, t7);
    LASX_TRANSPOSE8x8_W(t0, t1, t2, t3, t4, t5, t6, t7,
                        t0, t1, t2, t3, t4, t5, t6, t7);
    DUP4_ARG1(__lasx_xvclip255_w, t0, t1, t2, t3, t0, t1, t2, t3);
    DUP4_ARG1(__lasx_xvclip255_w, t4, t5, t6, t7, t4, t5, t6, t7);
    DUP4_ARG2(__lasx_xvpickev_h, t1, t0, t3, t2, t5, t4, t7, t6,
              t0, t1, t2, t3);
    DUP2_ARG2(__lasx_xvpickev_b, t1, t0, t3, t2, t0, t1);
    t0 = __lasx_xvperm_w(t0, sh);
    t1 = __lasx_xvperm_w(t1, sh);
    __lasx_xvstelm_d(t0, dst, 0, 0);
    __lasx_xvstelm_d(t0, dst + stride, 0, 1);
    __lasx_xvstelm_d(t0, dst + stride2, 0, 2);
    __lasx_xvstelm_d(t0, dst + stride3, 0, 3);
    dst += stride4;
    __lasx_xvstelm_d(t1, dst, 0, 0);
    __lasx_xvstelm_d(t1, dst + stride, 0, 1);
    __lasx_xvstelm_d(t1, dst + stride2, 0, 2);
    __lasx_xvstelm_d(t1, dst + stride3, 0, 3);
}

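/* Instantiate the 8x8 put functions and their 16x16 wrappers; the
 * 16x16 case is tiled as four 8x8 calls. */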
#define PUT_VC1_MSPEL_MC_LASX(hmode, vmode)                                   \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _lasx(uint8_t *dst,             \
                                                const uint8_t *src,           \
                                                ptrdiff_t stride, int rnd)    \
{                                                                             \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
}                                                                             \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_lasx(uint8_t *dst,          \
                                                   const uint8_t *src,        \
                                                   ptrdiff_t stride, int rnd) \
{                                                                             \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
    put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd);   \
    dst += 8 * stride, src += 8 * stride;                                     \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
    put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd);   \
}

PUT_VC1_MSPEL_MC_LASX(1, 1);
PUT_VC1_MSPEL_MC_LASX(1, 2);
PUT_VC1_MSPEL_MC_LASX(1, 3);

PUT_VC1_MSPEL_MC_LASX(2, 1);
PUT_VC1_MSPEL_MC_LASX(2, 2);
PUT_VC1_MSPEL_MC_LASX(2, 3);

PUT_VC1_MSPEL_MC_LASX(3, 1);
PUT_VC1_MSPEL_MC_LASX(3, 2);
PUT_VC1_MSPEL_MC_LASX(3, 3);

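/* No-rounding 8-wide chroma MC: bilinear interpolation with weights
 * A = (8 - x) * (8 - y), B = x * (8 - y), C = (8 - x) * y and
 * D = x * y, rounded as (sum + 28) >> 6 like the scalar
 * no_rnd chroma reference. */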
void ff_put_no_rnd_vc1_chroma_mc8_lasx(uint8_t *dst /* align 8 */,
                                       uint8_t *src /* align 1 */,
                                       ptrdiff_t stride, int h, int x, int y)
{
    const int intA = (8 - x) * (8 - y);
    const int intB =     (x) * (8 - y);
    const int intC = (8 - x) *     (y);
    const int intD =     (x) *     (y);
    __m256i src00, src01, src10, src11;
    __m256i A, B, C, D;
    int i;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    A = __lasx_xvreplgr2vr_h(intA);
    B = __lasx_xvreplgr2vr_h(intB);
    C = __lasx_xvreplgr2vr_h(intC);
    D = __lasx_xvreplgr2vr_h(intD);
    for (i = 0; i < h; i++) {
        DUP2_ARG2(__lasx_xvld, src, 0, src, 1, src00, src01);
        src += stride;
        DUP2_ARG2(__lasx_xvld, src, 0, src, 1, src10, src11);

        DUP4_ARG1(__lasx_vext2xv_hu_bu, src00, src01, src10, src11,
                  src00, src01, src10, src11);
        DUP4_ARG2(__lasx_xvmul_h, src00, A, src01, B, src10, C, src11, D,
                  src00, src01, src10, src11);
        src00 = __lasx_xvadd_h(src00, src01);
        src10 = __lasx_xvadd_h(src10, src11);
        src00 = __lasx_xvadd_h(src00, src10);
        src00 = __lasx_xvaddi_hu(src00, 28);
        src00 = __lasx_xvsrli_h(src00, 6);
        src00 = __lasx_xvpickev_b(src00, src00);
        __lasx_xvstelm_d(src00, dst, 0, 0);
        dst += stride;
    }
}

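/* Vertical-only quarter-pel interpolation, 16 pixels wide. Each
 * para_value entry packs one 4-tap bicubic filter's magnitudes as
 * bytes (e.g. 0x0304/0x1235 is taps {4, 3} and {53, 18}); the
 * negative outer taps are applied with xvdp2sub_h_bu. add_value
 * reproduces the scalar reference rounding for the vertical-only
 * case, 32 - (1 - rnd) = 31 + rnd (7 + rnd for the half-pel filter),
 * with shifts of 6, 4 and 6. */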
static void put_vc1_mspel_mc_v_lasx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int vmode, int rnd)
{
    __m256i in0, in1, in2, in3, temp0, temp1, t0;
    __m256i const_para0_3, const_para1_2, const_r, const_sh;
    static const uint16_t para_value[][2] = {{0x0304, 0x1235},
                                             {0x0101, 0x0909},
                                             {0x0403, 0x3512}};
    const uint16_t *para_v = para_value[vmode - 1];
    static const int shift_value[] = {0, 6, 4, 6};
    int add_value[3];
    ptrdiff_t stride_2x = stride << 1;
    int i;

    add_value[2] = add_value[0] = 31 + rnd, add_value[1] = 7 + rnd;
    const_r  = __lasx_xvreplgr2vr_h(add_value[vmode - 1]);
    const_sh = __lasx_xvreplgr2vr_h(shift_value[vmode]);
    const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
    const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));

    DUP2_ARG2(__lasx_xvld, src - stride, 0, src, 0, in0, in1);
    in2 = __lasx_xvld(src + stride, 0);
    in0   = __lasx_xvpermi_d(in0, 0xD8);
    in1   = __lasx_xvpermi_d(in1, 0xD8);
    in2   = __lasx_xvpermi_d(in2, 0xD8);
    for (i = 0; i < 16; i++) {
        in3 = __lasx_xvld(src + stride_2x, 0);
        in3 = __lasx_xvpermi_d(in3, 0xD8);
        DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
        t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
        t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
        t0 = __lasx_xvadd_h(t0, const_r);
        t0 = __lasx_xvsra_h(t0, const_sh);
        t0 = __lasx_xvclip255_h(t0);
        t0 = __lasx_xvpickev_b(t0, t0);
        __lasx_xvstelm_d(t0, dst, 0, 0);
        __lasx_xvstelm_d(t0, dst, 8, 2);
        dst += stride;
        src += stride;
        in0 = in1;
        in1 = in2;
        in2 = in3;
    }
}

#define PUT_VC1_MSPEL_MC_V_LASX(vmode)                                    \
void ff_put_vc1_mspel_mc0 ## vmode ## _16_lasx(uint8_t *dst,              \
                                               const uint8_t *src,        \
                                               ptrdiff_t stride, int rnd) \
{                                                                         \
    put_vc1_mspel_mc_v_lasx(dst, src, stride, vmode, rnd);                \
}

PUT_VC1_MSPEL_MC_V_LASX(1);
PUT_VC1_MSPEL_MC_V_LASX(2);
PUT_VC1_MSPEL_MC_V_LASX(3);

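/* One horizontal filter row: dot products with the packed taps, round,
 * shift, clip to 8 bits and repack the 16 result bytes. */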
#define ROW_LASX(in0, in1, in2, in3, out0)                                \
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, tmp0_m, tmp1_m);       \
    out0 = __lasx_xvdp2_h_bu(tmp0_m, const_para1_2);                      \
    out0 = __lasx_xvdp2sub_h_bu(out0, tmp1_m, const_para0_3);             \
    out0 = __lasx_xvadd_h(out0, const_r);                                 \
    out0 = __lasx_xvsra_h(out0, const_sh);                                \
    out0 = __lasx_xvclip255_h(out0);                                      \
    out0 = __lasx_xvpickev_b(out0, out0);                                 \
    out0 = __lasx_xvpermi_d(out0, 0xd8);

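/* Horizontal-only quarter-pel interpolation, 16x16. The block is
 * loaded, transposed with the interleave intrinsics so that columns
 * become rows, filtered row-wise with ROW_LASX, then transposed back
 * before the sixteen 16-byte rows are stored. add_value reproduces the
 * scalar reference rounding 32 - rnd (8 - rnd for the half-pel
 * filter). */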
static void put_vc1_mspel_mc_h_lasx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int hmode, int rnd)
{
    __m256i in0, in1, in2, in3, in4, in5, in6, in7,
            in8, in9, in10, in11, in12, in13, in14, in15;
    __m256i out0, out1, out2, out3, out4, out5, out6, out7, out8, out9,
            out10, out11, out12, out13, out14, out15, out16, out17, out18;
    __m256i const_para0_3, const_para1_2, const_r, const_sh;
    __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m;
    __m256i tmp4_m, tmp5_m, tmp6_m, tmp7_m;
    __m256i t0, t1, t2, t3, t4, t5, t6, t7;
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride4 = stride << 2;
    ptrdiff_t stride3 = stride2 + stride;
    static const uint16_t para_value[][2] = {{0x0304, 0x1235},
                                             {0x0101, 0x0909},
                                             {0x0403, 0x3512}};
    const uint16_t *para_v = para_value[hmode - 1];
    static const int shift_value[] = {0, 6, 4, 6};
    int add_value[3];
    uint8_t *_src = (uint8_t *)src - 1;

    add_value[2] = add_value[0] = 32 - rnd, add_value[1] = 8 - rnd;

    const_r  = __lasx_xvreplgr2vr_h(add_value[hmode - 1]);
    const_sh = __lasx_xvreplgr2vr_h(shift_value[hmode]);
    const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
    const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));

    in0 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in1, in2);
    in3 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in4 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in5, in6);
    in7 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in8 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in9, in10);
    in11 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in12 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in13, in14);
    in15 = __lasx_xvldx(_src, stride3);
    DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out0, out2, out4, out6);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out1, out3, out5, out7);

    DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out8, out10, out12, out14);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out9, out11, out13, out15);
    DUP2_ARG3(__lasx_xvpermi_q, out0, out0, 0x31, out1, out1, 0x31, out16, out17);
    out18 = __lasx_xvpermi_q(out2, out2, 0x31);

    DUP4_ARG2(__lasx_xvpermi_d, out0, 0xD8, out1, 0xD8, out2, 0xD8, out3, 0xD8,
              out0, out1, out2, out3);
    DUP4_ARG2(__lasx_xvpermi_d, out4, 0xD8, out5, 0xD8, out6, 0xD8, out7, 0xD8,
              out4, out5, out6, out7);
    DUP4_ARG2(__lasx_xvpermi_d, out8, 0xD8, out9, 0xD8, out10, 0xD8, out11,
              0xD8, out8, out9, out10, out11);
    DUP4_ARG2(__lasx_xvpermi_d, out12, 0xD8, out13, 0xD8, out14, 0xD8, out15,
              0xD8, out12, out13, out14, out15);
    out16 = __lasx_xvpermi_d(out16, 0xD8);
    out17 = __lasx_xvpermi_d(out17, 0xD8);
    out18 = __lasx_xvpermi_d(out18, 0xD8);

    ROW_LASX(out0,  out1,  out2,  out3,  in0);
    ROW_LASX(out1,  out2,  out3,  out4,  in1);
    ROW_LASX(out2,  out3,  out4,  out5,  in2);
    ROW_LASX(out3,  out4,  out5,  out6,  in3);
    ROW_LASX(out4,  out5,  out6,  out7,  in4);
    ROW_LASX(out5,  out6,  out7,  out8,  in5);
    ROW_LASX(out6,  out7,  out8,  out9,  in6);
    ROW_LASX(out7,  out8,  out9,  out10, in7);
    ROW_LASX(out8,  out9,  out10, out11, in8);
    ROW_LASX(out9,  out10, out11, out12, in9);
    ROW_LASX(out10, out11, out12, out13, in10);
    ROW_LASX(out11, out12, out13, out14, in11);
    ROW_LASX(out12, out13, out14, out15, in12);
    ROW_LASX(out13, out14, out15, out16, in13);
    ROW_LASX(out14, out15, out16, out17, in14);
    ROW_LASX(out15, out16, out17, out18, in15);

    DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out0, out2, out4, out6);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out1, out3, out5, out7);

    DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out8, out10, out12, out14);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out9, out11, out13, out15);
    __lasx_xvstelm_d(out0, dst, 0, 0);
    __lasx_xvstelm_d(out0, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out1, dst, 0, 0);
    __lasx_xvstelm_d(out1, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out2, dst, 0, 0);
    __lasx_xvstelm_d(out2, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out3, dst, 0, 0);
    __lasx_xvstelm_d(out3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out4, dst, 0, 0);
    __lasx_xvstelm_d(out4, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out5, dst, 0, 0);
    __lasx_xvstelm_d(out5, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out6, dst, 0, 0);
    __lasx_xvstelm_d(out6, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out7, dst, 0, 0);
    __lasx_xvstelm_d(out7, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out8, dst, 0, 0);
    __lasx_xvstelm_d(out8, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out9, dst, 0, 0);
    __lasx_xvstelm_d(out9, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out10, dst, 0, 0);
    __lasx_xvstelm_d(out10, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out11, dst, 0, 0);
    __lasx_xvstelm_d(out11, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out12, dst, 0, 0);
    __lasx_xvstelm_d(out12, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out13, dst, 0, 0);
    __lasx_xvstelm_d(out13, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out14, dst, 0, 0);
    __lasx_xvstelm_d(out14, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out15, dst, 0, 0);
    __lasx_xvstelm_d(out15, dst, 8, 1);
}

#define PUT_VC1_MSPEL_MC_H_LASX(hmode)                                    \
void ff_put_vc1_mspel_mc ## hmode ## 0_16_lasx(uint8_t *dst,              \
                                               const uint8_t *src,        \
                                               ptrdiff_t stride, int rnd) \
{                                                                         \
    put_vc1_mspel_mc_h_lasx(dst, src, stride, hmode, rnd);                \
}

PUT_VC1_MSPEL_MC_H_LASX(1);
PUT_VC1_MSPEL_MC_H_LASX(2);
PUT_VC1_MSPEL_MC_H_LASX(3);