/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
                                coef *const coeff, const int eob
                                HIGHBD_DECL_SUFFIX)
*/
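
// A scalar sketch of the 1-D Walsh-Hadamard butterfly that the function below
// vectorizes, first across rows and then across columns (read off the LSX ops
// below; dav1d's C code in src/itx_1d.c is the authoritative reference):
//
//   t0 = in0 + in1;
//   t2 = in2 - in3;
//   t4 = (t0 - t2) >> 1;
//   t3 = t4 - in3;                  // becomes out1
//   t1 = t4 - in1;                  // becomes out2
//   out0 = t0 - t3;  out3 = t2 + t1;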
function inv_txfm_add_wht_wht_4x4_8bpc_lsx
    vld           vr0,       a2,      0
    vld           vr2,       a2,      16

    vreplgr2vr.h  vr20,      zero

    vsrai.h       vr0,       vr0,     2
    vsrai.h       vr2,       vr2,     2

    vst           vr20,      a2,      0

    vpickod.d     vr1,       vr0,     vr0
    vpickod.d     vr3,       vr2,     vr2

    vadd.h        vr4,       vr0,     vr1
    vsub.h        vr5,       vr2,     vr3
    vsub.h        vr6,       vr4,     vr5
    vsrai.h       vr6,       vr6,     1
    vsub.h        vr0,       vr6,     vr3
    vsub.h        vr2,       vr6,     vr1
    vsub.h        vr1,       vr4,     vr0
    vadd.h        vr3,       vr5,     vr2

    vst           vr20,      a2,      16

    vilvl.h       vr4,       vr0,     vr1
    vilvl.h       vr5,       vr3,     vr2
    vilvl.w       vr0,       vr5,     vr4
    vilvh.w       vr2,       vr5,     vr4
    vilvh.d       vr1,       vr0,     vr0
    vilvh.d       vr3,       vr2,     vr2

    vadd.h        vr4,       vr0,     vr1
    vsub.h        vr5,       vr2,     vr3
    vsub.h        vr6,       vr4,     vr5
    vsrai.h       vr6,       vr6,     1
    vsub.h        vr0,       vr6,     vr3
    vsub.h        vr2,       vr6,     vr1
    vsub.h        vr1,       vr4,     vr0
    vadd.h        vr3,       vr5,     vr2

    vld           vr4,       a0,      0
    vldx          vr5,       a0,      a1
    alsl.d        t0,        a1,      a0,    1
    vld           vr6,       t0,      0
    vldx          vr7,       t0,      a1

    vsllwil.hu.bu vr4,       vr4,     0
    vsllwil.hu.bu vr5,       vr5,     0
    vsllwil.hu.bu vr6,       vr6,     0
    vsllwil.hu.bu vr7,       vr7,     0
    vilvl.d       vr1,       vr0,     vr1
    vilvl.d       vr2,       vr3,     vr2
    vilvl.d       vr4,       vr5,     vr4
    vilvl.d       vr6,       vr7,     vr6
    vadd.h        vr1,       vr1,     vr4
    vadd.h        vr2,       vr2,     vr6
    vssrani.bu.h  vr2,       vr1,     0

    vstelm.w      vr2,       a0,      0,     0
    add.d         a0,        a0,      a1
    vstelm.w      vr2,       a0,      0,     1
    add.d         a0,        a0,      a1
    vstelm.w      vr2,       a0,      0,     2
    add.d         a0,        a0,      a1
    vstelm.w      vr2,       a0,      0,     3
endfunc

const idct_coeffs, align=4
    // idct4
    .word          2896, 2896*8, 1567, 3784
    // idct8
    .word          799, 4017, 3406, 2276
    // idct16
    .word          401, 4076, 3166, 2598
    .word          1931, 3612, 3920, 1189
    // idct32
    .word          201, 4091, 3035, 2751
    .word          1751, 3703, 3857, 1380
    .word          995, 3973, 3513, 2106
    .word          2440, 3290, 4052, 601
endconst
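
// For reference (not consumed by the assembler): each word above is
// round(4096 * cos(k * pi / 128)) for the angle its butterfly needs, e.g.
// 2896 ~ 4096/sqrt(2), 3784/1567 ~ 4096*cos(pi/8) and 4096*cos(3*pi/8),
// 4017/799 ~ 4096*cos(pi/16) and 4096*cos(7*pi/16).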

.macro vld_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    vld           \in0,     \src,     \start
    vld           \in1,     \src,     \start+(\stride*1)
    vld           \in2,     \src,     \start+(\stride*2)
    vld           \in3,     \src,     \start+(\stride*3)
    vld           \in4,     \src,     \start+(\stride*4)
    vld           \in5,     \src,     \start+(\stride*5)
    vld           \in6,     \src,     \start+(\stride*6)
    vld           \in7,     \src,     \start+(\stride*7)
.endm

.macro vst_x8 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7
    vst           \in0,     \src,     \start
    vst           \in1,     \src,     \start+(\stride*1)
    vst           \in2,     \src,     \start+(\stride*2)
    vst           \in3,     \src,     \start+(\stride*3)
    vst           \in4,     \src,     \start+(\stride*4)
    vst           \in5,     \src,     \start+(\stride*5)
    vst           \in6,     \src,     \start+(\stride*6)
    vst           \in7,     \src,     \start+(\stride*7)
.endm

.macro vld_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
               in8, in9, in10, in11, in12, in13, in14, in15

    vld_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7

    vld           \in8,     \src,     \start+(\stride*8)
    vld           \in9,     \src,     \start+(\stride*9)
    vld           \in10,    \src,     \start+(\stride*10)
    vld           \in11,    \src,     \start+(\stride*11)
    vld           \in12,    \src,     \start+(\stride*12)
    vld           \in13,    \src,     \start+(\stride*13)
    vld           \in14,    \src,     \start+(\stride*14)
    vld           \in15,    \src,     \start+(\stride*15)
.endm

.macro vst_x16 src, start, stride, in0, in1, in2, in3, in4, in5, in6, in7, \
               in8, in9, in10, in11, in12, in13, in14, in15

    vst_x8 \src, \start, \stride, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7

    vst           \in8,     \src,     \start+(\stride*8)
    vst           \in9,     \src,     \start+(\stride*9)
    vst           \in10,    \src,     \start+(\stride*10)
    vst           \in11,    \src,     \start+(\stride*11)
    vst           \in12,    \src,     \start+(\stride*12)
    vst           \in13,    \src,     \start+(\stride*13)
    vst           \in14,    \src,     \start+(\stride*14)
    vst           \in15,    \src,     \start+(\stride*15)
.endm

.macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
    vilvl.w       vr10,     \in1,     \in0  // 0 1  2  3  4  5  6  7 x ...
    vilvl.w       vr12,     \in3,     \in2  // 8 9 10 11 12 13 14 15 x ...
    vsllwil.hu.bu vr10,     vr10,     0
    vsllwil.hu.bu vr12,     vr12,     0
    vadd.h        vr10,     \in4,     vr10
    vadd.h        vr12,     \in5,     vr12
    vssrani.bu.h  vr12,     vr10,     0
    vstelm.w      vr12,     a0,       0,    0
    add.d         t8,       a0,       a1
    vstelm.w      vr12,     t8,       0,    1
    vstelm.w      vr12,     t2,       0,    2
    add.d         t8,       t2,       a1
    vstelm.w      vr12,     t8,       0,    3
.endm

.macro VLD_DST_ADD_W4 in0, in1
    vld           vr0,      a0,       0
    vldx          vr1,      a0,       a1
    vld           vr2,      t2,       0
    vldx          vr3,      t2,       a1

    DST_ADD_W4    vr0, vr1, vr2, vr3, \in0, \in1
.endm
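
// Note: DST_ADD_W4/VLD_DST_ADD_W4 read t2 as the third destination row;
// every caller sets it up first with "alsl.d t2, a1, a0, 1"
// (t2 = dst + 2 * stride).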

.macro dct_4x4_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1
    vexth.w.h     vr4,      \in0            // in1
    vexth.w.h     vr5,      \in1            // in3
    vmul.w        vr6,      vr4,      \in4
    vmul.w        vr7,      vr4,      \in5
    vmadd.w       vr6,      vr5,      \in5  // t3
    vmsub.w       vr7,      vr5,      \in4  // t2
    vsllwil.w.h   vr4,      \in2,     0     // in0
    vsllwil.w.h   vr5,      \in3,     0     // in2
    vmul.w        vr9,      vr4,      \in6
    vmul.w        vr10,     vr4,      \in7
    vmadd.w       vr9,      vr5,      \in7  // t0
    vmsub.w       vr10,     vr5,      \in6  // t1
    vssrarni.h.w  vr10,     vr9,      12    // t0 t1
    vssrarni.h.w  vr7,      vr6,      12    // t3 t2
    vsadd.h       \out0,    vr10,     vr7   // 0 4  8 12 1 5  9 13  c[0] c[1]
    vssub.h       \out1,    vr10,     vr7   // 3 7 11 15 2 6 10 14  c[3] c[2]
.endm
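
// Scalar view of the dct4 butterfly the macro above vectorizes (a sketch,
// with \in4..\in7 = 3784, 1567, 2896, 2896 as all callers pass them):
//   t3 = (in1*3784 + in3*1567 + 2048) >> 12
//   t2 = (in1*1567 - in3*3784 + 2048) >> 12
//   t0 = ((in0 + in2)*2896   + 2048) >> 12
//   t1 = ((in0 - in2)*2896   + 2048) >> 12
//   \out0 = {t0 + t3, t1 + t2} = {c[0], c[1]}
//   \out1 = {t0 - t3, t1 - t2} = {c[3], c[2]}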

.macro inv_dct_dct_4x4_lsx
    la.local      t0,       idct_coeffs

    vld           vr0,      a2,       0    // 0 1  2  3  4  5  6  7
    vld           vr1,      a2,       16   // 8 9 10 11 12 13 14 15

    vldrepl.w     vr2,      t0,       8    // 1567
    vldrepl.w     vr3,      t0,       12   // 3784
    vldrepl.w     vr8,      t0,       0    // 2896

    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12

    vreplgr2vr.h  vr15,     zero
    vshuf4i.d     vr12,     vr12,     0x01 // 2 6 10 14 3 7 11 15
    vst           vr15,     a2,       0
    vst           vr15,     a2,       16

    vilvl.h       vr4,      vr12,     vr11 // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr12,     vr11 // 1 3 5 7 9 11 13 15
    vilvl.h       vr0,      vr5,      vr4  // 0 1  2  3  4  5  6  7
    vilvh.h       vr1,      vr5,      vr4  // 8 9 10 11 12 13 14 15

    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14
    vsrari.h      vr13,     vr13,     4
    vsrari.h      vr14,     vr14,     4
    vshuf4i.d     vr14,     vr14,     0x01

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro identity_4x4_lsx in0, in1, in2, in3, out0
    vsllwil.w.h   vr2,      \in0,    0
    vexth.w.h     vr3,      \in1
    vmul.w        vr4,      vr2,     \in2
    vmul.w        vr5,      vr3,     \in2
    vssrarni.h.w  vr5,      vr4,     12
    vsadd.h       \out0,    vr5,     \in3
.endm
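
// identity4 scales by sqrt(2): out = sat16(in + ((in * 1697 + 2048) >> 12)),
// since 1 + 1697/4096 ~ sqrt(2). \in0/\in1 supply the low/high halves and
// \in3 the addend; callers pass the same register for all three.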

.macro inv_identity_identity_4x4_lsx
    vld           vr0,      a2,       0    // 0 1  2  3  4  5  6  7
    vld           vr1,      a2,       16   // 8 9 10 11 12 13 14 15

    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr1
    vreplgr2vr.h  vr15,     zero
    vst           vr15,     a2,      0
    vst           vr15,     a2,      16
    identity_4x4_lsx vr0, vr0, vr20, vr0, vr6
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr7

    vsrari.h      vr6,      vr6,     4
    vsrari.h      vr7,      vr7,     4
    vilvh.d       vr8,      vr6,     vr6
    vilvh.d       vr9,      vr7,     vr7
    vilvl.h       vr4,      vr8,     vr6
    vilvl.h       vr5,      vr9,     vr7
    vilvl.w       vr6,      vr5,     vr4
    vilvh.w       vr7,      vr5,     vr4

    alsl.d        t2,       a1,      a0,   1
    VLD_DST_ADD_W4 vr6, vr7
.endm

const iadst4_coeffs, align=4
    .word          1321, 3803, 2482, 3344
endconst
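
// These are the AV1 adst4 "sinpi" constants at 12-bit precision, stored as
// sinpi[1], sinpi[4], sinpi[2], sinpi[3].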

.macro adst4x4_1d_lsx in0, in1, in2, in3, out0, out1, out2, out3
    vsub.w        vr6,      \in0,   \in2  // in0-in2
    vmul.w        vr7,      \in0,   vr20  // in0*1321
    vmadd.w       vr7,      \in2,   vr21  // in0*1321+in2*3803
    vmadd.w       vr7,      \in3,   vr22  // in0*1321+in2*3803+in3*2482
    vmul.w        vr8,      \in1,   vr23  // in1*3344
    vadd.w        vr6,      vr6,    \in3  // in0-in2+in3
    vmul.w        vr9,      \in0,   vr22  // in0*2482
    vmsub.w       vr9,      \in2,   vr20  // in2*1321
    vmsub.w       vr9,      \in3,   vr21  // in0*2482-in2*1321-in3*3803
    vadd.w        vr5,      vr7,    vr9
    vmul.w        \out2,    vr6,    vr23  // out[2] 8  9  10 11
    vadd.w        \out0,    vr7,    vr8   // out[0] 0  1  2  3
    vadd.w        \out1,    vr9,    vr8   // out[1] 4  5  6  7
    vsub.w        \out3,    vr5,    vr8   // out[3] 12 13 14 15
.endm
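
// Per lane, the macro above computes (outputs stay in Q12; the callers do
// the rounding shift afterwards):
//   out0 = in0*1321 + in2*3803 + in3*2482 + in1*3344
//   out1 = in0*2482 - in2*1321 - in3*3803 + in1*3344
//   out2 = (in0 - in2 + in3) * 3344
//   out3 = in0*3803 + in2*2482 - in3*1321 - in1*3344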

.macro inv_adst_dct_4x4_lsx
    vld           vr0,      a2,     0
    vld           vr1,      a2,     16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr0,    0     // in0
    vexth.w.h     vr3,      vr0           // in1
    vsllwil.w.h   vr4,      vr1,    0     // in2
    vexth.w.h     vr5,      vr1           // in3
    vldrepl.w     vr20,     t0,     0     // 1321
    vldrepl.w     vr21,     t0,     4     // 3803
    vldrepl.w     vr22,     t0,     8     // 2482
    vldrepl.w     vr23,     t0,     12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
    vssrarni.h.w  vr13,     vr11,    12
    vssrarni.h.w  vr14,     vr12,    12

    vreplgr2vr.h  vr15,     zero
    la.local      t0,       idct_coeffs
    vst           vr15,     a2,      0
    vst           vr15,     a2,      16
    vldrepl.w     vr20,     t0,      8    // 1567
    vldrepl.w     vr21,     t0,      12   // 3784
    vldrepl.w     vr22,     t0,      0    // 2896

    dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14

    vshuf4i.d     vr14,     vr14,    0x01
    vsrari.h      vr13,     vr13,    4
    vsrari.h      vr14,     vr14,    4

    alsl.d        t2,       a1,      a0,   1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_adst_adst_4x4_lsx
    vld           vr0,      a2,     0
    vld           vr1,      a2,     16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr0,    0     // in0
    vexth.w.h     vr3,      vr0           // in1
    vsllwil.w.h   vr4,      vr1,    0     // in2
    vexth.w.h     vr5,      vr1           // in3
    vldrepl.w     vr20,     t0,     0     // 1321
    vldrepl.w     vr21,     t0,     4     // 3803
    vldrepl.w     vr22,     t0,     8     // 2482
    vldrepl.w     vr23,     t0,     12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7

    vsrari.w      vr11,     vr11,    12
    vsrari.w      vr13,     vr13,    12
    vsrari.w      vr12,     vr12,    12
    vsrari.w      vr14,     vr14,    12

    vreplgr2vr.h  vr15,     zero
    vst           vr15,     a2,      0
    vst           vr15,     a2,      16

    adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr13, vr12, vr14

    vssrarni.h.w  vr13,     vr11,    12
    vssrarni.h.w  vr14,     vr12,    12
    vsrari.h      vr13,     vr13,    4
    vsrari.h      vr14,     vr14,    4

    alsl.d        t2,       a1,      a0,   1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_dct_adst_4x4_lsx
    la.local      t0,       idct_coeffs

    vld           vr0,      a2,       0    // 0 1  2  3  4  5  6  7
    vld           vr1,      a2,       16   // 8 9 10 11 12 13 14 15

    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    dct_4x4_core_lsx  vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12

    vreplgr2vr.h  vr15,     zero
    vst           vr15,     a2,       0
    vst           vr15,     a2,       16

    vshuf4i.d     vr12,     vr12,     0x01 // 2 6 10 14 3 7 11 15

    vilvl.h       vr4,      vr12,     vr11 // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr12,     vr11 // 1 3 5 7 9 11 13 15
    vilvl.h       vr11,     vr5,      vr4  // 0 1  2  3  4  5  6  7
    vilvh.h       vr12,     vr5,      vr4  // 8 9 10 11 12 13 14 15

    vsllwil.w.h   vr2,      vr11,     0     // in0
    vexth.w.h     vr3,      vr11            // in1
    vsllwil.w.h   vr4,      vr12,     0     // in2
    vexth.w.h     vr5,      vr12            // in3

    la.local      t0,       iadst4_coeffs

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr13, vr12, vr14

    vssrarni.h.w  vr13,     vr11,     12
    vssrarni.h.w  vr14,     vr12,     12
    vsrari.h      vr13,     vr13,     4
    vsrari.h      vr14,     vr14,     4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_dct_flipadst_4x4_lsx
    la.local      t0,       idct_coeffs

    vld           vr0,      a2,       0    // 0 1  2  3  4  5  6  7
    vld           vr1,      a2,       16   // 8 9 10 11 12 13 14 15

    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    dct_4x4_core_lsx  vr0, vr1, vr0, vr1, vr21, vr20, vr22, vr22, vr11, vr12

    vreplgr2vr.h  vr15,     zero
    vst           vr15,     a2,       0
    vst           vr15,     a2,       16

    vshuf4i.d     vr12,     vr12,     0x01 // 2 6 10 14 3 7 11 15

    vilvl.h       vr4,      vr12,     vr11 // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr12,     vr11 // 1 3 5 7 9 11 13 15
    vilvl.h       vr11,     vr5,      vr4  // 0 1  2  3  4  5  6  7
    vilvh.h       vr12,     vr5,      vr4  // 8 9 10 11 12 13 14 15
    vsllwil.w.h   vr2,      vr11,     0    // in0
    vexth.w.h     vr3,      vr11           // in1
    vsllwil.w.h   vr4,      vr12,     0    // in2
    vexth.w.h     vr5,      vr12           // in3

    la.local      t0,       iadst4_coeffs

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr11, vr12, vr13, vr14

    vssrarni.h.w  vr11,     vr12,     12    // 0 1  2  3  4  5  6  7
    vssrarni.h.w  vr13,     vr14,     12    // 8 9 10 11 12 13 14 15
    vsrari.h      vr11,     vr11,     4
    vsrari.h      vr13,     vr13,     4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr13, vr11
.endm

.macro inv_flipadst_adst_4x4_lsx
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr0,      0     // in0
    vexth.w.h     vr3,      vr0             // in1
    vsllwil.w.h   vr4,      vr1,      0     // in2
    vexth.w.h     vr5,      vr1             // in3
    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vsrari.w      vr0,      vr0,      12
    vsrari.w      vr1,      vr1,      12
    vsrari.w      vr2,      vr2,      12
    vsrari.w      vr3,      vr3,      12

    vilvl.w       vr4,      vr0,      vr1
    vilvh.w       vr5,      vr0,      vr1
    vilvl.w       vr6,      vr2,      vr3
    vilvh.w       vr7,      vr2,      vr3
    vilvl.d       vr11,     vr4,      vr6
    vilvh.d       vr12,     vr4,      vr6
    vilvl.d       vr13,     vr5,      vr7
    vilvh.d       vr14,     vr5,      vr7

    vreplgr2vr.h  vr15,     zero
    vst           vr15,     a2,       0
    vst           vr15,     a2,       16

    adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr13, vr12, vr14

    vssrarni.h.w  vr13,     vr11,     12
    vssrarni.h.w  vr14,     vr12,     12
    vsrari.h      vr13,     vr13,     4
    vsrari.h      vr14,     vr14,     4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_adst_flipadst_4x4_lsx
    vld           vr0,      a2,      0
    vld           vr1,      a2,      16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr0,      0     // in0
    vexth.w.h     vr3,      vr0             // in1
    vsllwil.w.h   vr4,      vr1,      0     // in2
    vexth.w.h     vr5,      vr1             // in3
    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3
    LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7
    vsrari.w      vr11,     vr11,     12
    vsrari.w      vr12,     vr12,     12
    vsrari.w      vr13,     vr13,     12
    vsrari.w      vr14,     vr14,     12

    vreplgr2vr.h  vr15,     zero
    vst           vr15,     a2,       0
    vst           vr15,     a2,       16

    adst4x4_1d_lsx vr11, vr13, vr12, vr14, vr11, vr12, vr13, vr14

    vssrarni.h.w  vr11,     vr12,     12
    vssrarni.h.w  vr13,     vr14,     12
    vsrari.h      vr11,     vr11,     4
    vsrari.h      vr13,     vr13,     4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr13, vr11
.endm

.macro inv_flipadst_dct_4x4_lsx
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr0,      0     // in0
    vexth.w.h     vr3,      vr0             // in1
    vsllwil.w.h   vr4,      vr1,      0     // in2
    vexth.w.h     vr5,      vr1             // in3
    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vilvl.w       vr4,      vr0,      vr1
    vilvh.w       vr5,      vr0,      vr1
    vilvl.w       vr6,      vr2,      vr3
    vilvh.w       vr7,      vr2,      vr3

    vilvl.d       vr11,     vr4,      vr6
    vilvh.d       vr12,     vr4,      vr6
    vilvl.d       vr13,     vr5,      vr7
    vilvh.d       vr14,     vr5,      vr7

    vssrarni.h.w  vr12,     vr11,     12
    vssrarni.h.w  vr14,     vr13,     12

    vreplgr2vr.h  vr15,     zero
    la.local      t0,       idct_coeffs
    vst           vr15,     a2,       0
    vst           vr15,     a2,       16
    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    dct_4x4_core_lsx vr12, vr14, vr12, vr14, vr21, vr20, vr22, vr22, vr13, vr14

    vshuf4i.d     vr14,     vr14,     0x01
    vsrari.h      vr13,     vr13,     4
    vsrari.h      vr14,     vr14,     4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_flipadst_flipadst_4x4_lsx
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr0,      0     // in0
    vexth.w.h     vr3,      vr0             // in1
    vsllwil.w.h   vr4,      vr1,      0     // in2
    vexth.w.h     vr5,      vr1             // in3
    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vilvl.w       vr4,      vr0,      vr1
    vilvh.w       vr5,      vr0,      vr1
    vilvl.w       vr6,      vr2,      vr3
    vilvh.w       vr7,      vr2,      vr3
    vilvl.d       vr11,     vr4,      vr6
    vilvh.d       vr12,     vr4,      vr6
    vilvl.d       vr13,     vr5,      vr7
    vilvh.d       vr14,     vr5,      vr7

    vsrari.w      vr11,     vr11,     12
    vsrari.w      vr12,     vr12,     12
    vsrari.w      vr13,     vr13,     12
    vsrari.w      vr14,     vr14,     12

    vreplgr2vr.h  vr15,     zero
    vst           vr15,     a2,       0
    vst           vr15,     a2,       16

    adst4x4_1d_lsx vr11, vr12, vr13, vr14, vr11, vr12, vr13, vr14

    vssrarni.h.w  vr11,     vr12,     12
    vssrarni.h.w  vr13,     vr14,     12
    vsrari.h      vr11,     vr11,     4
    vsrari.h      vr13,     vr13,     4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr13, vr11
.endm

.macro inv_dct_identity_4x4_lsx
    la.local      t0,       idct_coeffs

    vld           vr0,      a2,       0
    vld           vr1,      a2,       16

    vldrepl.w     vr2,      t0,       8    // 1567
    vldrepl.w     vr3,      t0,       12   // 3784
    vldrepl.w     vr8,      t0,       0    // 2896

    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr11, vr12
    vshuf4i.d     vr12,     vr12,     0x01 // 2 6 10 14 3 7 11 15

    vreplgr2vr.h  vr15,     zero
    li.w          t0,       1697

    vilvl.h       vr4,      vr12,     vr11 // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr12,     vr11 // 1 3 5 7 9 11 13 15
    vilvl.h       vr10,     vr5,      vr4  // 0 1  2  3  4  5  6  7
    vilvh.h       vr12,     vr5,      vr4  // 8 9 10 11 12 13 14 15

    vst           vr15,     a2,       0
    vst           vr15,     a2,       16
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr10, vr10, vr20, vr10, vr6
    identity_4x4_lsx vr12, vr12, vr20, vr12, vr7
    vsrari.h      vr11,      vr6,     4
    vsrari.h      vr13,      vr7,     4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr11, vr13
.endm

.macro inv_identity_dct_4x4_lsx
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16

    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr1

    vreplgr2vr.h  vr15,     zero

    vilvl.h       vr4,      vr1,      vr0  // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr1,      vr0  // 1 3 5 7 9 11 13 15
    vilvl.h       vr13,     vr5,      vr4  // 0 1  2  3  4  5  6  7
    vilvh.h       vr14,     vr5,      vr4  // 8 9 10 11 12 13 14 15

    vst           vr15,     a2,       0
    vst           vr15,     a2,       16

    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    dct_4x4_core_lsx vr13, vr14, vr13, vr14, vr21, vr20, vr22, vr22, vr13, vr14

    vshuf4i.d     vr14,     vr14,     0x01
    vsrari.h      vr13,     vr13,     4
    vsrari.h      vr14,     vr14,     4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr13, vr14
.endm

.macro inv_flipadst_identity_4x4_lsx
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr0,      0     // in0
    vexth.w.h     vr3,      vr0             // in1
    vsllwil.w.h   vr4,      vr1,      0     // in2
    vexth.w.h     vr5,      vr1             // in3
    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr10, vr11, vr12, vr13

    vssrarni.h.w  vr12,     vr13,     12
    vssrarni.h.w  vr10,     vr11,     12

    vilvl.h       vr4,      vr10,     vr12  // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr10,     vr12  // 1 3 5 7 9 11 13 15
    vilvl.h       vr11,     vr5,      vr4   // 0 1  2  3  4  5  6  7
    vilvh.h       vr13,     vr5,      vr4   // 8 9 10 11 12 13 14 15

    vreplgr2vr.h  vr15,     zero
    li.w          t0,       1697

    vst           vr15,     a2,       0
    vst           vr15,     a2,       16
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr11, vr11, vr20, vr11, vr6
    identity_4x4_lsx vr13, vr13, vr20, vr13, vr7
    vsrari.h      vr11,     vr6,     4
    vsrari.h      vr13,     vr7,     4

    alsl.d        t2,       a1,      a0,   1
    VLD_DST_ADD_W4 vr11, vr13
.endm

.macro inv_identity_flipadst_4x4_lsx
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16

    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr1

    vilvl.h       vr4,      vr1,      vr0
    vilvh.h       vr5,      vr1,      vr0
    vilvl.h       vr11,     vr5,      vr4
    vilvh.h       vr13,     vr5,      vr4

    vreplgr2vr.h  vr15,     zero
    vst           vr15,     a2,       0
    vst           vr15,     a2,       16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr11,     0   // in0
    vexth.w.h     vr3,      vr11          // in1
    vsllwil.w.h   vr4,      vr13,     0   // in2
    vexth.w.h     vr5,      vr13          // in3
    vldrepl.w     vr20,     t0,       0   // 1321
    vldrepl.w     vr21,     t0,       4   // 3803
    vldrepl.w     vr22,     t0,       8   // 2482
    vldrepl.w     vr23,     t0,       12  // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vssrarni.h.w  vr0,      vr1,      12  // 8 9 10 11 12 13 14 15
    vssrarni.h.w  vr2,      vr3,      12  // 0 1  2  3  4  5  6  7
    vsrari.h      vr11,     vr0,      4
    vsrari.h      vr13,     vr2,      4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr13, vr11
.endm

.macro inv_identity_adst_4x4_lsx
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16

    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr0, vr0, vr20, vr0, vr0
    identity_4x4_lsx vr1, vr1, vr20, vr1, vr1

    vilvl.h       vr4,      vr1,      vr0
    vilvh.h       vr5,      vr1,      vr0
    vilvl.h       vr11,     vr5,      vr4
    vilvh.h       vr13,     vr5,      vr4

    vreplgr2vr.h  vr15,     zero
    vst           vr15,     a2,       0
    vst           vr15,     a2,       16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr11,     0     // in0
    vexth.w.h     vr3,      vr11            // in1
    vsllwil.w.h   vr4,      vr13,     0     // in2
    vexth.w.h     vr5,      vr13            // in3
    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    vssrarni.h.w  vr1,      vr0,      12
    vssrarni.h.w  vr3,      vr2,      12
    vsrari.h      vr11,     vr1,      4
    vsrari.h      vr13,     vr3,      4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr11, vr13
.endm

.macro inv_adst_identity_4x4_lsx
    vld           vr0,      a2,       0
    vld           vr1,      a2,       16

    la.local      t0,       iadst4_coeffs
    vsllwil.w.h   vr2,      vr0,      0     // in0
    vexth.w.h     vr3,      vr0             // in1
    vsllwil.w.h   vr4,      vr1,      0     // in2
    vexth.w.h     vr5,      vr1             // in3
    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    adst4x4_1d_lsx vr2, vr3, vr4, vr5, vr0, vr1, vr2, vr3

    LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr11, vr13, vr12, vr14, vr6, vr7

    vssrarni.h.w  vr13,     vr11,     12
    vssrarni.h.w  vr14,     vr12,     12

    vreplgr2vr.h  vr15,     zero
    li.w          t0,       1697

    vst           vr15,     a2,       0
    vst           vr15,     a2,       16
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr13, vr13, vr20, vr13, vr6
    identity_4x4_lsx vr14, vr14, vr20, vr14, vr7
    vsrari.h      vr11,     vr6,      4
    vsrari.h      vr13,     vr7,      4

    alsl.d        t2,       a1,       a0,   1
    VLD_DST_ADD_W4 vr11, vr13
.endm

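// fun4x4 stamps out one inv_txfm_add_<type1>_<type2>_4x4 function per pair.
// The dct_dct variant first handles eob == 0 (a3 == 0, DC-only): the DC
// coefficient is scaled by 181/256 (~1/sqrt(2)) once per pass with rounding,
// and the result is added to all sixteen destination pixels.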
.macro fun4x4 type1, type2
function inv_txfm_add_\type1\()_\type2\()_4x4_8bpc_lsx
.ifc \type1\()_\type2, dct_dct
    bnez          a3,       .LLL

    vldi          vr0,      0x8b5            // 181
    ld.h          t2,       a2,       0      // dc
    st.h          zero,     a2,       0
    vreplgr2vr.w  vr1,      t2
    vldi          vr3,      0x880            // 128
    vmul.w        vr2,      vr0,      vr1
    vld           vr10,     a0,       0
    vsrari.w      vr2,      vr2,      8
    vldx          vr11,     a0,       a1
    vmadd.w       vr3,      vr2,      vr0
    alsl.d        t2,       a1,       a0,    1
    vssrarni.h.w  vr3,      vr3,      12
    vld           vr12,     t2,       0
    vldx          vr13,     t2,       a1

    DST_ADD_W4    vr10, vr11, vr12, vr13, vr3, vr3

    b             .IDST_\type1\()_\type2\()_4X4_END
.LLL:
.endif

    inv_\type1\()_\type2\()_4x4_lsx
.IDST_\type1\()_\type2\()_4X4_END:
endfunc
.endm

fun4x4 dct, dct
fun4x4 identity, identity
fun4x4 adst, dct
fun4x4 dct, adst
fun4x4 adst, adst
fun4x4 dct, flipadst
fun4x4 flipadst, adst
fun4x4 adst, flipadst
fun4x4 flipadst, dct
fun4x4 flipadst, flipadst
fun4x4 dct, identity
fun4x4 identity, dct
fun4x4 flipadst, identity
fun4x4 identity, flipadst
fun4x4 identity, adst
fun4x4 adst, identity

function inv_txfm_add_dct_dct_4x8_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_4x8

    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr5,      0x880            // 128
    vmul.w        vr2,      vr0,      vr1
    st.h          zero,     a2,       0
    vsrari.w      vr2,      vr2,      8
    vld           vr10,     a0,       0
    vmul.w        vr2,      vr2,      vr0
    vldx          vr11,     a0,       a1
    vsrari.w      vr2,      vr2,      8
    alsl.d        t2,       a1,       a0,    1
    vmadd.w       vr5,      vr2,      vr0
    vld           vr12,     t2,       0
    vssrarni.h.w  vr5,      vr5,      12
    vldx          vr13,     t2,       a1

    DST_ADD_W4 vr10, vr11, vr12, vr13, vr5, vr5

    alsl.d        a0,       a1,       a0,   2
    alsl.d        t2,       a1,       t2,   2

    VLD_DST_ADD_W4 vr5, vr5
    b             .DCT_DCT_4x8_END

.NO_HAS_DCONLY_4x8:
    // sh=8 sw=4
    la.local      t0,       idct_coeffs

    vld           vr0,      a2,       0    //  0  1  2  3  4  5  6  7  in0
    vld           vr1,      a2,       16   //  8  9 10 11 12 13 14 15  in1
    vld           vr20,     a2,       32   // 16 17 18 19 20 21 22 23  in2
    vld           vr21,     a2,       48   // 24 25 26 27 28 29 30 31  in3

    vldrepl.w     vr2,      t0,       8    // 1567
    vldrepl.w     vr3,      t0,       12   // 3784
    vldrepl.w     vr8,      t0,       0    // 2896

.macro DCT4_4Wx8H_1D_LSX
    // in1 in3
    vsllwil.w.h   vr4,      vr1,      0    // in1
    vsllwil.w.h   vr5,      vr21,     0    // in3
    vmul.w        vr4,      vr4,      vr8
    vmul.w        vr5,      vr5,      vr8
    vsrari.w      vr4,      vr4,      12
    vsrari.w      vr5,      vr5,      12
    vmul.w        vr6,      vr4,      vr3
    vmul.w        vr7,      vr4,      vr2
    vmadd.w       vr6,      vr5,      vr2  // t3 0 1 2 3
    vmsub.w       vr7,      vr5,      vr3  // t2 0 1 2 3
    vexth.w.h     vr4,      vr1            // in1
    vexth.w.h     vr5,      vr21           // in3
    vmul.w        vr4,      vr4,      vr8
    vmul.w        vr5,      vr5,      vr8
    vsrari.w      vr4,      vr4,      12
    vsrari.w      vr5,      vr5,      12
    vmul.w        vr9,      vr4,      vr3
    vmul.w        vr10,     vr4,      vr2
    vmadd.w       vr9,      vr5,      vr2  // t3 4 5 6 7
    vmsub.w       vr10,     vr5,      vr3  // t2 4 5 6 7

    // in0 in2
    vsllwil.w.h   vr4,      vr0,      0    // in0
    vsllwil.w.h   vr5,      vr20,     0    // in2
    vmul.w        vr4,      vr4,      vr8
    vmul.w        vr5,      vr5,      vr8
    vsrari.w      vr4,      vr4,      12
    vsrari.w      vr5,      vr5,      12
    vmul.w        vr11,     vr4,      vr8
    vmul.w        vr12,     vr4,      vr8
    vmadd.w       vr11,     vr5,      vr8  // t0 0 1 2 3
    vmsub.w       vr12,     vr5,      vr8  // t1 0 1 2 3
    vexth.w.h     vr4,      vr0            // in0
    vexth.w.h     vr5,      vr20           // in2
    vmul.w        vr4,      vr4,      vr8
    vmul.w        vr5,      vr5,      vr8
    vsrari.w      vr4,      vr4,      12
    vsrari.w      vr5,      vr5,      12
    vmul.w        vr13,     vr4,      vr8
    vmul.w        vr14,     vr4,      vr8
    vmadd.w       vr13,     vr5,      vr8  // t0 4 5 6 7
    vmsub.w       vr14,     vr5,      vr8  // t1 4 5 6 7
    vssrarni.h.w  vr9,      vr6,      12   // t3
    vssrarni.h.w  vr10,     vr7,      12   // t2
    vssrarni.h.w  vr14,     vr12,     12   // t1
    vssrarni.h.w  vr13,     vr11,     12   // t0
    vsadd.h       vr4,      vr13,     vr9  // c[0] 0 4  8 12 16 20 24 28
    vsadd.h       vr5,      vr14,     vr10 // c[1] 1 5  9 13 17 21 25 29
    vssub.h       vr20,     vr14,     vr10 // c[2] 2 6 10 14 18 22 26 30
    vssub.h       vr21,     vr13,     vr9  // c[3] 3 7 11 15 19 23 27 31
.endm
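
// 4x8 is a rectangular transform, so every coefficient is pre-scaled by
// 2896/4096 (~1/sqrt(2)) before the first pass; the vmul-by-vr8 + vsrari 12
// pairs at the top of the macro above are that rect2 step.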

    DCT4_4Wx8H_1D_LSX

    vreplgr2vr.h  vr22,     zero
    vst           vr22,     a2,       0
    vst           vr22,     a2,       16
    vst           vr22,     a2,       32
    vst           vr22,     a2,       48

    vilvl.h       vr0,      vr5,      vr4   // 0 1 4 5  8  9 12 13
    vilvl.h       vr1,      vr21,     vr20  // 2 3 6 7 10 11 14 15
    vilvh.h       vr6,      vr5,      vr4   // 16 17 20 21 24 25 28 29
    vilvh.h       vr7,      vr21,     vr20  // 18 19 22 23 26 27 30 31
    vilvl.w       vr9,      vr1,      vr0   //  0  1  2  3  4  5  6  7  in0
    vilvh.w       vr10,     vr1,      vr0   //  8  9 10 11 12 13 14 15  in1
    vilvl.w       vr11,     vr7,      vr6   // 16 17 18 19 20 21 22 23  in2
    vilvh.w       vr12,     vr7,      vr6   // 24 25 26 27 28 29 30 31  in3

    vilvl.d       vr0,      vr10,     vr9
    vilvl.d       vr1,      vr12,     vr11
    vilvh.d       vr20,     vr9,      vr11  // in5 in1
    vilvh.d       vr21,     vr12,     vr10  // in3 in7

.macro DCT8_4Wx8H_1D_LSX
    dct_4x4_core_lsx vr0, vr1, vr0, vr1, vr3, vr2, vr8, vr8, vr13, vr14

    vldrepl.w     vr17,     t0,       16    // 799
    vldrepl.w     vr18,     t0,       20    // 4017
    vldrepl.w     vr11,     t0,       24    // 3406
    vldrepl.w     vr12,     t0,       28    // 2276

    vexth.w.h     vr4,      vr20
    vexth.w.h     vr5,      vr21
    vmul.w        vr6,      vr4,      vr18  // in1 * 4017
    vmul.w        vr7,      vr4,      vr17  // in1 * 799
    vmadd.w       vr6,      vr5,      vr17  // in7 * 799
    vmsub.w       vr7,      vr5,      vr18  // in7 * 4017
    vsllwil.w.h   vr4,      vr20,     0
    vsllwil.w.h   vr5,      vr21,     0
    vmul.w        vr9,      vr4,      vr12
    vmul.w        vr10,     vr4,      vr11
    vmadd.w       vr9,      vr5,      vr11
    vmsub.w       vr10,     vr5,      vr12
    vssrarni.h.w  vr10,     vr9,      12    // t6a t5a
    vssrarni.h.w  vr7,      vr6,      12    // t7a t4a
    vsadd.h       vr15,     vr7,      vr10  // t7  t4
    vssub.h       vr16,     vr7,      vr10  // t6a t5a

    vexth.w.h     vr4,      vr16            // t5a
    vsllwil.w.h   vr5,      vr16,     0     // t6a
    vldi          vr2,      0x8b5           // 181
    vsub.w        vr6,      vr5,      vr4
    vadd.w        vr7,      vr5,      vr4
    vmul.w        vr6,      vr6,      vr2
    vmul.w        vr7,      vr7,      vr2
    vssrarni.h.w  vr7,      vr6,      8     // t5 t6
    vaddi.hu      vr18,     vr7,      0
    vshuf4i.d     vr7,      vr15,     0x06  // t7 t6
    vshuf4i.d     vr15,     vr18,     0x09  // t4 t5

    // vr17 -> vr7 vr18 -> vr15
    vsadd.h       vr4,      vr13,     vr7
    vsadd.h       vr5,      vr14,     vr15
    vssub.h       vr6,      vr14,     vr15
    vssub.h       vr7,      vr13,     vr7
.endm
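
// t5/t6 above use x*181 + 128 >> 8 for the 1/sqrt(2) rotation; since
// 181*16 == 2896, this is bit-exact with the x*2896 + 2048 >> 12 form used
// elsewhere in this file.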

    DCT8_4Wx8H_1D_LSX

    vshuf4i.d     vr5,      vr5,      0x01
    vshuf4i.d     vr7,      vr7,      0x01

    vsrari.h      vr4,      vr4,      4
    vsrari.h      vr5,      vr5,      4
    vsrari.h      vr6,      vr6,      4
    vsrari.h      vr7,      vr7,      4

    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W4 vr4, vr5

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       t2,    2

    VLD_DST_ADD_W4 vr6, vr7
.DCT_DCT_4x8_END:
endfunc

.macro rect2_w4_lsx in0, in1, in2, out0, out1
    vsllwil.w.h   vr22,     \in0,     0
    vexth.w.h     vr23,     \in1
    vmul.w        vr22,     vr22,     \in2
    vmul.w        vr23,     vr23,     \in2
    vsrari.w      \out0,    vr22,     12
    vsrari.w      \out1,    vr23,     12
.endm
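
// rect2_w4_lsx widens the low half of \in0 and the high half of \in1 (callers
// pass the same source register twice) and applies the rect2 scale \in2
// (2896 ~ 1/sqrt(2) in Q12) with rounding, leaving 32-bit results.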

.macro dct_8x4_core_lsx1 out0, out1, out2, out3
    // dct4 stride=1<<1
    vmul.w        vr0,      vr6,      vr21
    vmul.w        vr1,      vr6,      vr20
    vmadd.w       vr0,      vr10,     vr20  // t3
    vmsub.w       vr1,      vr10,     vr21  // t2
    vmul.w        vr2,      vr18,     vr22
    vmul.w        vr3,      vr18,     vr22
    vmadd.w       vr2,      vr8,      vr22  // t0
    vmsub.w       vr3,      vr8,      vr22  // t1
    vssrarni.h.w  vr1,      vr0,      12    // t3 t2
    vssrarni.h.w  vr3,      vr2,      12    // t0 t1
    vsadd.h       vr8,      vr3,      vr1   // t0 t1
    vssub.h       vr10,     vr3,      vr1   // t3 t2

    vldrepl.w     vr20,     t0,       16    // 799
    vldrepl.w     vr21,     t0,       20    // 4017
    vldrepl.w     vr22,     t0,       24    // 3406
    vldrepl.w     vr23,     t0,       28    // 2276

    vmul.w        vr0,      vr19,     vr21  // in1 * 4017
    vmul.w        vr1,      vr19,     vr20  // in1 * 799
    vmadd.w       vr0,      vr11,     vr20  // in7 * 799   // t7a
    vmsub.w       vr1,      vr11,     vr21  // in7 * 4017  // t4a
    vmul.w        vr2,      vr9,      vr23  // in5 * 2276
    vmul.w        vr3,      vr9,      vr22  // in5 * 3406
    vmadd.w       vr2,      vr7,      vr22  // in3 * 3406  // t6a
    vmsub.w       vr3,      vr7,      vr23  // in3 * 2276  // t5a
    vssrarni.h.w  vr0,      vr1,      12    // t4a t7a
    vssrarni.h.w  vr2,      vr3,      12    // t5a t6a
    vsadd.h       vr9,      vr0,      vr2   // t4  t7
    vssub.h       vr11,     vr0,      vr2   // t5a t6a

    vldrepl.w     vr22,     t0,       0     // 2896
    vexth.w.h     vr18,     vr11            // t6a
    vsllwil.w.h   vr19,     vr11,     0     // t5a
    vmul.w        vr6,      vr18,     vr22
    vmul.w        vr7,      vr18,     vr22
    vmadd.w       vr6,      vr19,     vr22  // t6
    vmsub.w       vr7,      vr19,     vr22  // t5
    vssrarni.h.w  vr6,      vr7,      12    // t5 t6

    vilvh.d       vr11,     vr6,      vr9   // t7 t6
    vilvl.d       vr9,      vr6,      vr9   // t4 t5

    vsadd.h       \out0,    vr8,      vr11  // c[0] c[1]
    vsadd.h       \out1,    vr10,     vr9   // c[3] c[2]
    vssub.h       \out2,    vr10,     vr9   // c[4] c[5]
    vssub.h       \out3,    vr8,      vr11  // c[7] c[6]
.endm

.macro dct_8x4_core_lsx2 in0, in1, in2, in3, in4, in5, in6, in7, \
                         out0, out1, out2, out3
    vexth.w.h     vr4,      \in0            // in1
    vexth.w.h     vr5,      \in1            // in3
    vmul.w        vr6,      vr4,      \in4
    vmul.w        vr7,      vr4,      \in5
    vmadd.w       vr6,      vr5,      \in5  // t3
    vmsub.w       vr7,      vr5,      \in4  // t2
    vexth.w.h     vr4,      \in2            // in1
    vexth.w.h     vr5,      \in3            // in3
    vmul.w        vr8,      vr4,      \in4
    vmul.w        vr9,      vr4,      \in5
    vmadd.w       vr8,      vr5,      \in5  // t3
    vmsub.w       vr9,      vr5,      \in4  // t2
    vssrarni.h.w  vr8,      vr6,      12    // t3
    vssrarni.h.w  vr9,      vr7,      12    // t2

    vsllwil.w.h   vr4,      \in0,     0
    vsllwil.w.h   vr5,      \in1,     0
    vmul.w        vr11,     vr4,      \in6
    vmul.w        vr12,     vr4,      \in7
    vmadd.w       vr11,     vr5,      \in7  // t0
    vmsub.w       vr12,     vr5,      \in6  // t1
    vsllwil.w.h   vr4,      \in2,     0
    vsllwil.w.h   vr5,      \in3,     0
    vmul.w        vr13,     vr4,      \in6
    vmul.w        vr14,     vr4,      \in7
    vmadd.w       vr13,     vr5,      \in7  // t0
    vmsub.w       vr14,     vr5,      \in6  // t1
    vssrarni.h.w  vr13,     vr11,     12    // t0
    vssrarni.h.w  vr14,     vr12,     12    // t1

    vsadd.h       \out0,    vr13,     vr8
    vsadd.h       \out1,    vr14,     vr9
    vssub.h       \out2,    vr14,     vr9
    vssub.h       \out3,    vr13,     vr8
.endm

.macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7
    vsllwil.hu.bu vr10,     \in0,     0
    vsllwil.hu.bu vr11,     \in1,     0
    vsllwil.hu.bu vr12,     \in2,     0
    vsllwil.hu.bu vr13,     \in3,     0
    vadd.h        vr10,     \in4,     vr10
    vadd.h        vr11,     \in5,     vr11
    vadd.h        vr12,     \in6,     vr12
    vadd.h        vr13,     \in7,     vr13
    vssrani.bu.h  vr11,     vr10,     0
    vssrani.bu.h  vr13,     vr12,     0
    vstelm.d      vr11,     a0,       0,    0
    add.d         t8,       a0,       a1
    vstelm.d      vr11,     t8,       0,    1
    vstelm.d      vr13,     t2,       0,    0
    add.d         t8,       t2,       a1
    vstelm.d      vr13,     t8,       0,    1
.endm

.macro VLD_DST_ADD_W8 in0, in1, in2, in3
    vld           vr0,      a0,       0
    vldx          vr1,      a0,       a1
    vld           vr2,      t2,       0
    vldx          vr3,      t2,       a1

    DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3
.endm

function inv_txfm_add_dct_dct_8x4_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_8x4

    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr5,      0x880            // 128
    vmul.w        vr2,      vr0,      vr1
    st.h          zero,     a2,       0
    vsrari.w      vr2,      vr2,      8
    vld           vr10,     a0,       0
    vmul.w        vr2,      vr2,      vr0
    vldx          vr11,     a0,       a1
    vsrari.w      vr2,      vr2,      8
    alsl.d        t2,       a1,       a0,    1
    vmadd.w       vr5,      vr2,      vr0
    vld           vr12,     t2,       0
    vssrarni.h.w  vr5,      vr5,      12
    vldx          vr13,     t2,       a1

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5

    b             .DCT_DCT_8X4_END

.NO_HAS_DCONLY_8x4:
    la.local      t0,       idct_coeffs

    vld           vr0,      a2,       0
    vld           vr1,      a2,       16
    vld           vr2,      a2,       32
    vld           vr3,      a2,       48

    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    dct_8x4_core_lsx1 vr0, vr1, vr2, vr3

    vshuf4i.d     vr1,      vr1,      0x01
    vshuf4i.d     vr3,      vr3,      0x01

    vilvl.h       vr4,      vr1,      vr0   // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr1,      vr0   // 1 3 5 7 9 11 13 15
    vilvl.h       vr0,      vr5,      vr4   // 0 1  2  3  4  5  6  7 in0
    vilvh.h       vr1,      vr5,      vr4   // 8 9 10 11 12 13 14 15 in1
    vilvl.h       vr4,      vr3,      vr2   // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr3,      vr2   // 1 3 5 7 9 11 13 15
    vilvl.h       vr2,      vr5,      vr4   // 16 - 23  in2
    vilvh.h       vr3,      vr5,      vr4   // 24 - 31  in3

    la.local      t0,       idct_coeffs

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784

    dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
                      vr22, vr15, vr16, vr17, vr18

    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4
    vsrari.h      vr18,     vr18,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18

.DCT_DCT_8X4_END:
endfunc

.macro identity8_lsx in0, in1, in2, in3, in4, in5, in6, in7, \
                     out0, out1, out2, out3
    vssrarni.h.w  \in1,     \in0,     0
    vssrarni.h.w  \in3,     \in2,     0
    vssrarni.h.w  \in5,     \in4,     0
    vssrarni.h.w  \in7,     \in6,     0
    vsadd.h       \out0,    \in1,     \in1
    vsadd.h       \out1,    \in3,     \in3
    vsadd.h       \out2,    \in5,     \in5
    vsadd.h       \out3,    \in7,     \in7
.endm
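
// identity8 is a plain doubling: the vssrarni pairs narrow the 32-bit
// rect2-scaled inputs back to 16 bits, then vsadd gives out = sat16(2 * in).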

function inv_txfm_add_identity_identity_8x4_8bpc_lsx
    la.local      t0,       idct_coeffs

    vld           vr0,      a2,       0    //  0  1  2  3  4  5  6  7  in0
    vld           vr1,      a2,       16   //  8  9 10 11 12 13 14 15  in1
    vld           vr2,      a2,       32   // 16 17 18 19 20 21 22 23  in2
    vld           vr3,      a2,       48   // 24 25 26 27 28 29 30 31  in3

    vldrepl.w     vr20,     t0,       0    // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
                  vr19, vr7, vr9, vr11

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0
    identity_4x4_lsx vr19, vr19, vr20, vr19, vr19
    identity_4x4_lsx vr7, vr7, vr20, vr7, vr7
    identity_4x4_lsx vr9, vr9, vr20, vr9, vr9
    identity_4x4_lsx vr11, vr11, vr20, vr11, vr11

    vsrari.h      vr15,     vr19,     4
    vsrari.h      vr16,     vr7,      4
    vsrari.h      vr17,     vr9,      4
    vsrari.h      vr18,     vr11,     4

    vilvl.h       vr4,      vr16,     vr15
    vilvh.h       vr5,      vr16,     vr15
    vilvl.h       vr11,     vr5,      vr4
    vilvh.h       vr12,     vr5,      vr4
    vilvl.h       vr4,      vr18,     vr17
    vilvh.h       vr5,      vr18,     vr17
    vilvl.h       vr13,     vr5,      vr4
    vilvh.h       vr14,     vr5,      vr4
    vilvl.d       vr15,     vr13,     vr11
    vilvh.d       vr16,     vr13,     vr11
    vilvl.d       vr17,     vr14,     vr12
    vilvh.d       vr18,     vr14,     vr12

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
endfunc

const iadst8_coeffs, align=4
    .word          4076, 401, 3612, 1931
    .word          2598, 3166, 1189, 3920
    // idct_coeffs
    .word          2896, 0, 1567, 3784, 0, 0, 0, 0
endconst
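
// For reference: the iadst8 words are cos/sin pairs at odd multiples of
// pi/32 in Q12: 4076/401 ~ cos/sin(pi/32), 3612/1931 ~ cos/sin(5*pi/32),
// 2598/3166 ~ cos/sin(9*pi/32), 1189/3920 ~ cos/sin(13*pi/32).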

.macro vmadd_vmsub_vssrarni_hw_12 in0, in1, in2, in3, in4, in5, in6, in7, \
                                  in8, in9, in10, in11, out0, out1, out2, out3
    vmul.w        \out0,    \in0,     \in4
    vmul.w        \out1,    \in0,     \in5
    vmadd.w       \out0,    \in1,     \in6   // t0a
    vmsub.w       \out1,    \in1,     \in7   // t1a
    vmul.w        \out2,    \in2,     \in8
    vmul.w        \out3,    \in2,     \in9
    vmadd.w       \out2,    \in3,     \in10  // t2a
    vmsub.w       \out3,    \in3,     \in11  // t3a
    vssrarni.h.w  \out1,    \out0,    12     // t0a t1a
    vssrarni.h.w  \out3,    \out2,    12     // t2a t3a
.endm
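
// One invocation performs two rotations: t0a/t1a are packed into \out1 and
// t2a/t3a into \out3, each rounded from Q12 and saturated to 16 bits;
// \out0 and \out2 serve only as 32-bit scratch.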
1396
.macro adst8x4_1d_lsx
    la.local      t0,       iadst8_coeffs

    vldrepl.w     vr20,     t0,       0     // 4076
    vldrepl.w     vr21,     t0,       4     // 401
    vldrepl.w     vr22,     t0,       8     // 3612
    vldrepl.w     vr23,     t0,       12    // 1931

    // vr13 t0a t1a    vr15 t2a t3a
    vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \
                               vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15
    vldrepl.w     vr20,     t0,       16    // 2598
    vldrepl.w     vr21,     t0,       20    // 3166
    vldrepl.w     vr22,     t0,       24    // 1189
    vldrepl.w     vr23,     t0,       28    // 3920

    // vr18 t4a t5a     vr6 t6a t7a
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \
                               vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6

    vsadd.h       vr12,     vr13,     vr18  // t0 t1
    vsadd.h       vr14,     vr15,     vr6   // t2 t3
    vssub.h       vr16,     vr13,     vr18  // t4 t5
    vssub.h       vr18,     vr15,     vr6   // t6 t7

    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    vsllwil.w.h   vr7,      vr16,     0     // t4
    vexth.w.h     vr8,      vr16            // t5
    vsllwil.w.h   vr10,     vr18,     0     // t6
    vexth.w.h     vr11,     vr18            // t7

    // vr13 out0 out7   vr17 out1 out6
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \
                               vr20, vr21, vr21, vr20, vr13, vr15, vr17, vr19
    vshuf4i.d     vr19,     vr19,     0x01

    vsadd.h       vr13,     vr12,     vr14  // out0 out7
    vssub.h       vr16,     vr12,     vr14  // t2 t3
    vsadd.h       vr17,     vr15,     vr19  // out1 out6
    vssub.h       vr18,     vr15,     vr19  // t6 t7

    vexth.w.h     vr20,     vr13            // out7
    vsllwil.w.h   vr21,     vr17,     0     // out1
    vneg.w        vr20,     vr20
    vneg.w        vr21,     vr21
    vssrarni.h.w  vr21,     vr20,     0     // out7 out1
    vilvl.d       vr13,     vr21,     vr13  // out0 out7
    vilvh.d       vr17,     vr17,     vr21  // out1 out6

    vsllwil.w.h   vr7,      vr16,     0     // t2
    vexth.w.h     vr8,      vr16            // t3
    vsllwil.w.h   vr10,     vr18,     0     // t6
    vexth.w.h     vr11,     vr18            // t7

    // vr15 out[3] out[4]    vr18 out[2] out[5]
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \
                               vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18

    vexth.w.h     vr20,     vr18            // out5
    vsllwil.w.h   vr21,     vr15,     0     // out3
    vneg.w        vr20,     vr20
    vneg.w        vr21,     vr21
    vssrarni.h.w  vr21,     vr20,     0     // out5 out3
    vilvl.d       vr18,     vr21,     vr18  // out2 out5
    vilvh.d       vr15,     vr15,     vr21  // out3 out4
.endm

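// The compound 8x4 transforms below share one shape: load the 8x4
// coefficient block, pre-scale by 2896/4096 (i.e. 1/sqrt(2), the rect2
// scaling for rectangular blocks), run the first 1-D transform,
// transpose, zero the coefficient buffer, run the second 1-D transform,
// then round by 4 bits and accumulate into dst via VLD_DST_ADD_W8.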
function inv_txfm_add_adst_dct_8x4_8bpc_lsx
    vld           vr0,      a2,       0     //  0  1  2  3  4  5  6  7  in0
    vld           vr1,      a2,       16    //  8  9 10 11 12 13 14 15  in1
    vld           vr2,      a2,       32    // 16 17 18 19 20 21 22 23  in2
    vld           vr3,      a2,       48    // 24 25 26 27 28 29 30 31  in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    adst8x4_1d_lsx

    vilvl.h       vr4,      vr17,     vr13
    vilvl.h       vr5,      vr15,     vr18
    vilvl.w       vr0,      vr5,      vr4
    vilvh.w       vr1,      vr5,      vr4
    vilvh.h       vr4,      vr18,     vr15
    vilvh.h       vr5,      vr13,     vr17
    vilvl.w       vr2,      vr5,      vr4
    vilvh.w       vr3,      vr5,      vr4

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
                      vr22, vr15, vr16, vr17, vr18

    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4
    vsrari.h      vr18,     vr18,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
endfunc

function inv_txfm_add_dct_adst_8x4_8bpc_lsx
    vld           vr0,      a2,       0     //  0  1  2  3  4  5  6  7  in0
    vld           vr1,      a2,       16    //  8  9 10 11 12 13 14 15  in1
    vld           vr2,      a2,       32    // 16 17 18 19 20 21 22 23  in2
    vld           vr3,      a2,       48    // 24 25 26 27 28 29 30 31  in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    dct_8x4_core_lsx1 vr0, vr1, vr2, vr3

    vshuf4i.d     vr1,      vr1,      0x01
    vshuf4i.d     vr3,      vr3,      0x01

    vilvl.h       vr4,      vr1,      vr0
    vilvh.h       vr5,      vr1,      vr0
    vilvl.h       vr0,      vr5,      vr4
    vilvh.h       vr1,      vr5,      vr4
    vilvl.h       vr4,      vr3,      vr2
    vilvh.h       vr5,      vr3,      vr2
    vilvl.h       vr2,      vr5,      vr4
    vilvh.h       vr3,      vr5,      vr4

    la.local      t0,       iadst4_coeffs

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    vsllwil.w.h   vr10,     vr0,      0
    vexth.w.h     vr11,     vr0
    vsllwil.w.h   vr12,     vr1,      0
    vexth.w.h     vr13,     vr1

    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h   vr14,     vr2,      0
    vexth.w.h     vr15,     vr2
    vsllwil.w.h   vr16,     vr3,      0
    vexth.w.h     vr17,     vr3

    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w  vr14,     vr10,     12
    vssrarni.h.w  vr15,     vr11,     12
    vssrarni.h.w  vr16,     vr12,     12
    vssrarni.h.w  vr17,     vr13,     12

    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

function inv_txfm_add_adst_adst_8x4_8bpc_lsx
    vld           vr0,      a2,       0     //  0  1  2  3  4  5  6  7  in0
    vld           vr1,      a2,       16    //  8  9 10 11 12 13 14 15  in1
    vld           vr2,      a2,       32    // 16 17 18 19 20 21 22 23  in2
    vld           vr3,      a2,       48    // 24 25 26 27 28 29 30 31  in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    adst8x4_1d_lsx

    vilvl.h       vr4,      vr17,     vr13
    vilvl.h       vr5,      vr15,     vr18
    vilvl.w       vr0,      vr5,      vr4
    vilvh.w       vr1,      vr5,      vr4
    vilvh.h       vr4,      vr18,     vr15
    vilvh.h       vr5,      vr13,     vr17
    vilvl.w       vr2,      vr5,      vr4
    vilvh.w       vr3,      vr5,      vr4

    la.local      t0,       iadst4_coeffs

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    vsllwil.w.h   vr10,     vr0,      0
    vexth.w.h     vr11,     vr0
    vsllwil.w.h   vr12,     vr1,      0
    vexth.w.h     vr13,     vr1

    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h   vr14,     vr2,      0
    vexth.w.h     vr15,     vr2
    vsllwil.w.h   vr16,     vr3,      0
    vexth.w.h     vr17,     vr3

    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w  vr14,     vr10,     12
    vssrarni.h.w  vr15,     vr11,     12
    vssrarni.h.w  vr16,     vr12,     12
    vssrarni.h.w  vr17,     vr13,     12

    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

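// flipadst is adst with the outputs emitted in reverse order. The
// variants below reuse adst8x4_1d_lsx and realize the flip either with
// the vshuf4i.h 0x2d/0x78 reorderings after the first pass or by
// handing VLD_DST_ADD_W8 its four row vectors in reverse.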
function inv_txfm_add_flipadst_adst_8x4_8bpc_lsx
    vld           vr0,      a2,       0    //  0  1  2  3  4  5  6  7  in0
    vld           vr1,      a2,       16   //  8  9 10 11 12 13 14 15  in1
    vld           vr2,      a2,       32   // 16 17 18 19 20 21 22 23  in2
    vld           vr3,      a2,       48   // 24 25 26 27 28 29 30 31  in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0    // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11

    adst8x4_1d_lsx

    vilvl.h       vr20,     vr15,     vr13
    vilvl.h       vr21,     vr18,     vr17
    vilvl.w       vr0,      vr21,     vr20
    vilvh.w       vr1,      vr21,     vr20
    vilvh.h       vr20,     vr15,     vr13
    vilvh.h       vr21,     vr18,     vr17
    vilvl.w       vr2,      vr21,     vr20
    vilvh.w       vr3,      vr21,     vr20
    vshuf4i.h     vr0,      vr0,      0x2d
    vshuf4i.h     vr1,      vr1,      0x2d
    vshuf4i.h     vr2,      vr2,      0x78
    vshuf4i.h     vr3,      vr3,      0x78

    la.local      t0,       iadst4_coeffs

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    vsllwil.w.h   vr10,     vr2,      0
    vexth.w.h     vr11,     vr2
    vsllwil.w.h   vr12,     vr3,      0
    vexth.w.h     vr13,     vr3

    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h   vr14,     vr0,      0
    vexth.w.h     vr15,     vr0
    vsllwil.w.h   vr16,     vr1,      0
    vexth.w.h     vr17,     vr1

    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w  vr14,     vr10,     12
    vssrarni.h.w  vr15,     vr11,     12
    vssrarni.h.w  vr16,     vr12,     12
    vssrarni.h.w  vr17,     vr13,     12

    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

function inv_txfm_add_adst_flipadst_8x4_8bpc_lsx
    vld           vr0,      a2,       0      // in0
    vld           vr1,      a2,       16     // in1
    vld           vr2,      a2,       32     // in2
    vld           vr3,      a2,       48     // in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0      // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19  // 0  8 16 24 1  9 17 25 in0 in1
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7    // 2 10 18 26 3 11 19 27 in2 in3
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9    // 4 12 20 28 5 13 21 29 in4 in5
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11  // 6 14 22 30 7 15 23 31 in6 in7

    adst8x4_1d_lsx

    vilvl.h       vr4,      vr17,     vr13
    vilvl.h       vr5,      vr15,     vr18
    vilvl.w       vr0,      vr5,      vr4
    vilvh.w       vr1,      vr5,      vr4
    vilvh.h       vr4,      vr18,     vr15
    vilvh.h       vr5,      vr13,     vr17
    vilvl.w       vr2,      vr5,      vr4
    vilvh.w       vr3,      vr5,      vr4

    la.local      t0,       iadst4_coeffs

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    vsllwil.w.h   vr10,     vr0,      0
    vexth.w.h     vr11,     vr0
    vsllwil.w.h   vr12,     vr1,      0
    vexth.w.h     vr13,     vr1

    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h   vr14,     vr2,      0
    vexth.w.h     vr15,     vr2
    vsllwil.w.h   vr16,     vr3,      0
    vexth.w.h     vr17,     vr3

    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w  vr14,     vr10,     12
    vssrarni.h.w  vr15,     vr11,     12
    vssrarni.h.w  vr16,     vr12,     12
    vssrarni.h.w  vr17,     vr13,     12

    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
endfunc

function inv_txfm_add_flipadst_dct_8x4_8bpc_lsx
    vld           vr0,      a2,       0      // in0
    vld           vr1,      a2,       16     // in1
    vld           vr2,      a2,       32     // in2
    vld           vr3,      a2,       48     // in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0      // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19  // 0  8 16 24 1  9 17 25 in0 in1
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7    // 2 10 18 26 3 11 19 27 in2 in3
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9    // 4 12 20 28 5 13 21 29 in4 in5
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11  // 6 14 22 30 7 15 23 31 in6 in7

    adst8x4_1d_lsx

    vilvl.h       vr20,     vr15,     vr13
    vilvl.h       vr21,     vr18,     vr17
    vilvl.w       vr0,      vr21,     vr20
    vilvh.w       vr1,      vr21,     vr20
    vilvh.h       vr20,     vr15,     vr13
    vilvh.h       vr21,     vr18,     vr17
    vilvl.w       vr2,      vr21,     vr20
    vilvh.w       vr3,      vr21,     vr20
    vshuf4i.h     vr0,      vr0,      0x2d
    vshuf4i.h     vr1,      vr1,      0x2d
    vshuf4i.h     vr2,      vr2,      0x78
    vshuf4i.h     vr3,      vr3,      0x78

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    dct_8x4_core_lsx2 vr2, vr3, vr0, vr1, vr21, vr20, vr22, \
                      vr22, vr15, vr16, vr17, vr18

    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4
    vsrari.h      vr18,     vr18,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
endfunc

function inv_txfm_add_dct_flipadst_8x4_8bpc_lsx
    la.local      t0,       idct_coeffs

    vld           vr0,      a2,       0     // in0
    vld           vr1,      a2,       16    // in1
    vld           vr2,      a2,       32    // in2
    vld           vr3,      a2,       48    // in3

    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19  // in0 0 - 7
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7    // in1 8 - 15
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9    // in2 16 - 23
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11  // in3 24 - 31

    vldrepl.w     vr20,     t0,       8      // 1567
    vldrepl.w     vr21,     t0,       12     // 3784
    vldrepl.w     vr22,     t0,       0      // 2896

    dct_8x4_core_lsx1 vr0, vr1, vr2, vr3

    vshuf4i.d     vr1,      vr1,      0x01
    vshuf4i.d     vr3,      vr3,      0x01

    vilvl.h       vr4,      vr1,      vr0
    vilvh.h       vr5,      vr1,      vr0
    vilvl.h       vr0,      vr5,      vr4
    vilvh.h       vr1,      vr5,      vr4
    vilvl.h       vr4,      vr3,      vr2
    vilvh.h       vr5,      vr3,      vr2
    vilvl.h       vr2,      vr5,      vr4
    vilvh.h       vr3,      vr5,      vr4

    la.local      t0,       iadst4_coeffs

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    vsllwil.w.h   vr10,     vr0,      0     // in0
    vexth.w.h     vr11,     vr0             // in1
    vsllwil.w.h   vr12,     vr1,      0     // in2
    vexth.w.h     vr13,     vr1             // in3
    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h   vr14,     vr2,      0
    vexth.w.h     vr15,     vr2
    vsllwil.w.h   vr16,     vr3,      0
    vexth.w.h     vr17,     vr3
    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w  vr14,     vr10,     12
    vssrarni.h.w  vr15,     vr11,     12
    vssrarni.h.w  vr16,     vr12,     12
    vssrarni.h.w  vr17,     vr13,     12
    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
endfunc

function inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx
    vld           vr0,      a2,       0      // in0
    vld           vr1,      a2,       16     // in1
    vld           vr2,      a2,       32     // in2
    vld           vr3,      a2,       48     // in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0      // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19  // 0  8 16 24 1  9 17 25 in0 in1
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7    // 2 10 18 26 3 11 19 27 in2 in3
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9    // 4 12 20 28 5 13 21 29 in4 in5
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11  // 6 14 22 30 7 15 23 31 in6 in7

    adst8x4_1d_lsx

    vilvl.h       vr20,     vr15,     vr13
    vilvl.h       vr21,     vr18,     vr17
    vilvl.w       vr0,      vr21,     vr20
    vilvh.w       vr1,      vr21,     vr20
    vilvh.h       vr20,     vr15,     vr13
    vilvh.h       vr21,     vr18,     vr17
    vilvl.w       vr2,      vr21,     vr20
    vilvh.w       vr3,      vr21,     vr20
    vshuf4i.h     vr0,      vr0,      0x2d
    vshuf4i.h     vr1,      vr1,      0x2d
    vshuf4i.h     vr2,      vr2,      0x78
    vshuf4i.h     vr3,      vr3,      0x78

    la.local      t0,       iadst4_coeffs

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    vsllwil.w.h   vr10,     vr2,      0     // in0
    vexth.w.h     vr11,     vr2             // in1
    vsllwil.w.h   vr12,     vr3,      0     // in2
    vexth.w.h     vr13,     vr3             // in3
    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h   vr14,     vr0,      0
    vexth.w.h     vr15,     vr0
    vsllwil.w.h   vr16,     vr1,      0
    vexth.w.h     vr17,     vr1
    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w  vr14,     vr10,     12
    vssrarni.h.w  vr15,     vr11,     12
    vssrarni.h.w  vr16,     vr12,     12
    vssrarni.h.w  vr17,     vr13,     12

    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
endfunc

function inv_txfm_add_dct_identity_8x4_8bpc_lsx
    vld           vr0,      a2,       0     // in0
    vld           vr1,      a2,       16    // in1
    vld           vr2,      a2,       32    // in2
    vld           vr3,      a2,       48    // in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7   // in1 8 - 15
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9   // in2 16 - 23
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31

    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    dct_8x4_core_lsx1 vr0, vr1, vr2, vr3

    vshuf4i.d     vr1,      vr1,      0x01
    vshuf4i.d     vr3,      vr3,      0x01

    vilvl.h       vr4,      vr1,      vr0
    vilvh.h       vr5,      vr1,      vr0
    vilvl.h       vr0,      vr5,      vr4
    vilvh.h       vr1,      vr5,      vr4
    vilvl.h       vr4,      vr3,      vr2
    vilvh.h       vr5,      vr3,      vr2
    vilvl.h       vr2,      vr5,      vr4
    vilvh.h       vr3,      vr5,      vr4
    vilvl.d       vr14,     vr2,      vr0
    vilvh.d       vr15,     vr2,      vr0
    vilvl.d       vr16,     vr3,      vr1
    vilvh.d       vr17,     vr3,      vr1

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr14, vr14, vr20, vr14, vr14
    identity_4x4_lsx vr15, vr15, vr20, vr15, vr15
    identity_4x4_lsx vr16, vr16, vr20, vr16, vr16
    identity_4x4_lsx vr17, vr17, vr20, vr17, vr17

    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

function inv_txfm_add_identity_dct_8x4_8bpc_lsx
    vld           vr0,      a2,       0     // in0
    vld           vr1,      a2,       16    // in1
    vld           vr2,      a2,       32    // in2
    vld           vr3,      a2,       48    // in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7   // in1 8 - 15
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9   // in2 16 - 23
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31

    identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
                  vr19, vr7, vr9, vr11

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vilvl.h       vr4,      vr7,      vr19
    vilvh.h       vr5,      vr7,      vr19
    vilvl.h       vr0,      vr5,      vr4
    vilvh.h       vr1,      vr5,      vr4
    vilvl.h       vr4,      vr11,     vr9
    vilvh.h       vr5,      vr11,     vr9
    vilvl.h       vr2,      vr5,      vr4
    vilvh.h       vr3,      vr5,      vr4

    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    dct_8x4_core_lsx2 vr0, vr1, vr2, vr3, vr21, vr20, vr22, \
                      vr22, vr15, vr16, vr17, vr18

    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4
    vsrari.h      vr18,     vr18,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
endfunc

function inv_txfm_add_flipadst_identity_8x4_8bpc_lsx
    vld           vr0,      a2,       0      // in0
    vld           vr1,      a2,       16     // in1
    vld           vr2,      a2,       32     // in2
    vld           vr3,      a2,       48     // in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0      // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19  // 0  8 16 24 1  9 17 25 in0 in1
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7    // 2 10 18 26 3 11 19 27 in2 in3
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9    // 4 12 20 28 5 13 21 29 in4 in5
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11  // 6 14 22 30 7 15 23 31 in6 in7

    adst8x4_1d_lsx

    vilvl.h       vr20,     vr15,     vr13
    vilvl.h       vr21,     vr18,     vr17
    vilvl.w       vr0,      vr21,     vr20
    vilvh.w       vr1,      vr21,     vr20
    vilvh.h       vr20,     vr15,     vr13
    vilvh.h       vr21,     vr18,     vr17
    vilvl.w       vr2,      vr21,     vr20
    vilvh.w       vr3,      vr21,     vr20
    vshuf4i.h     vr0,      vr0,      0x2d
    vshuf4i.h     vr1,      vr1,      0x2d
    vshuf4i.h     vr2,      vr2,      0x78
    vshuf4i.h     vr3,      vr3,      0x78
    vilvl.d       vr14,     vr0,      vr2    // in0
    vilvh.d       vr15,     vr0,      vr2    // in1
    vilvl.d       vr16,     vr1,      vr3    // in2
    vilvh.d       vr17,     vr1,      vr3    // in3

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr14, vr14, vr20, vr14, vr14
    identity_4x4_lsx vr15, vr15, vr20, vr15, vr15
    identity_4x4_lsx vr16, vr16, vr20, vr16, vr16
    identity_4x4_lsx vr17, vr17, vr20, vr17, vr17

    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

function inv_txfm_add_identity_flipadst_8x4_8bpc_lsx
    vld           vr0,      a2,       0     // in0
    vld           vr1,      a2,       16    // in1
    vld           vr2,      a2,       32    // in2
    vld           vr3,      a2,       48    // in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7   // in1 8 - 15
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9   // in2 16 - 23
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31

    identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
                  vr19, vr7, vr9, vr11

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vilvl.h       vr4,      vr7,      vr19
    vilvh.h       vr5,      vr7,      vr19
    vilvl.h       vr0,      vr5,      vr4
    vilvh.h       vr1,      vr5,      vr4
    vilvl.h       vr4,      vr11,     vr9
    vilvh.h       vr5,      vr11,     vr9
    vilvl.h       vr2,      vr5,      vr4
    vilvh.h       vr3,      vr5,      vr4

    la.local      t0,       iadst4_coeffs

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    vsllwil.w.h   vr10,     vr0,      0     // in0
    vexth.w.h     vr11,     vr0             // in1
    vsllwil.w.h   vr12,     vr1,      0     // in2
    vexth.w.h     vr13,     vr1             // in3
    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h   vr14,     vr2,      0
    vexth.w.h     vr15,     vr2
    vsllwil.w.h   vr16,     vr3,      0
    vexth.w.h     vr17,     vr3
    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w  vr14,     vr10,     12
    vssrarni.h.w  vr15,     vr11,     12
    vssrarni.h.w  vr16,     vr12,     12
    vssrarni.h.w  vr17,     vr13,     12

    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr17, vr16, vr15, vr14
endfunc

function inv_txfm_add_adst_identity_8x4_8bpc_lsx
    vld           vr0,      a2,       0     // in0
    vld           vr1,      a2,       16    // in1
    vld           vr2,      a2,       32    // in2
    vld           vr3,      a2,       48    // in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // 0  8 16 24 1  9 17 25 in0 in1
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7   // 2 10 18 26 3 11 19 27 in2 in3
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9   // 4 12 20 28 5 13 21 29 in4 in5
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // 6 14 22 30 7 15 23 31 in6 in7

    adst8x4_1d_lsx

    vilvl.h       vr4,      vr17,     vr13
    vilvl.h       vr5,      vr15,     vr18
    vilvl.w       vr14,     vr5,      vr4   // in0 in1
    vilvh.w       vr16,     vr5,      vr4   // in2 in3
    vilvh.h       vr4,      vr18,     vr15
    vilvh.h       vr5,      vr13,     vr17
    vilvl.w       vr17,     vr5,      vr4
    vilvh.w       vr18,     vr5,      vr4
    vilvl.d       vr10,     vr17,     vr14  // in0
    vilvh.d       vr11,     vr17,     vr14  // in1
    vilvl.d       vr12,     vr18,     vr16  // in2
    vilvh.d       vr13,     vr18,     vr16  // in3

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0

    identity_4x4_lsx vr10, vr10, vr20, vr10, vr15
    identity_4x4_lsx vr11, vr11, vr20, vr11, vr16
    identity_4x4_lsx vr12, vr12, vr20, vr12, vr17
    identity_4x4_lsx vr13, vr13, vr20, vr13, vr18

    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4
    vsrari.h      vr18,     vr18,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr15, vr16, vr17, vr18
endfunc

function inv_txfm_add_identity_adst_8x4_8bpc_lsx
    vld           vr0,      a2,       0     // in0
    vld           vr1,      a2,       16    // in1
    vld           vr2,      a2,       32    // in2
    vld           vr3,      a2,       48    // in3

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       0     // 2896

    rect2_w4_lsx vr0, vr0, vr20, vr18, vr19 // in0 0 - 7
    rect2_w4_lsx vr1, vr1, vr20, vr6, vr7   // in1 8 - 15
    rect2_w4_lsx vr2, vr2, vr20, vr8, vr9   // in2 16 - 23
    rect2_w4_lsx vr3, vr3, vr20, vr10, vr11 // in3 24 - 31

    identity8_lsx vr18, vr19, vr6, vr7, vr8, vr9, vr10, vr11, \
                  vr0, vr1, vr2, vr3

    vilvl.h       vr4,      vr1,      vr0   // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr1,      vr0   // 1 3 5 7 9 11 13 15
    vilvl.h       vr0,      vr5,      vr4   // 0 1  2  3  4  5  6  7
    vilvh.h       vr1,      vr5,      vr4   // 8 9 10 11 12 13 14 15
    vilvl.h       vr4,      vr3,      vr2   // 0 2 4 6 8 10 12 14
    vilvh.h       vr5,      vr3,      vr2   // 1 3 5 7 9 11 13 15
    vilvl.h       vr2,      vr5,      vr4   // 0 1  2  3  4  5  6  7
    vilvh.h       vr3,      vr5,      vr4   // 8 9 10 11 12 13 14 15

    vreplgr2vr.h  vr23,     zero
    vst           vr23,     a2,       0
    vst           vr23,     a2,       16
    vst           vr23,     a2,       32
    vst           vr23,     a2,       48

    la.local      t0,       iadst4_coeffs

    vldrepl.w     vr20,     t0,       0     // 1321
    vldrepl.w     vr21,     t0,       4     // 3803
    vldrepl.w     vr22,     t0,       8     // 2482
    vldrepl.w     vr23,     t0,       12    // 3344

    vsllwil.w.h   vr10,     vr0,      0
    vexth.w.h     vr11,     vr0
    vsllwil.w.h   vr12,     vr1,      0
    vexth.w.h     vr13,     vr1

    adst4x4_1d_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13

    vsllwil.w.h   vr14,     vr2,      0
    vexth.w.h     vr15,     vr2
    vsllwil.w.h   vr16,     vr3,      0
    vexth.w.h     vr17,     vr3

    adst4x4_1d_lsx vr14, vr15, vr16, vr17, vr14, vr15, vr16, vr17

    vssrarni.h.w  vr14,     vr10,     12
    vssrarni.h.w  vr15,     vr11,     12
    vssrarni.h.w  vr16,     vr12,     12
    vssrarni.h.w  vr17,     vr13,     12

    vsrari.h      vr14,     vr14,     4
    vsrari.h      vr15,     vr15,     4
    vsrari.h      vr16,     vr16,     4
    vsrari.h      vr17,     vr17,     4

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
endfunc

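// For the 8x8 identity transform each pass is a plain doubling
// (out = 2 * in): both halves of each row are widened, shifted left by
// 1, and narrowed back with the pass rounding, 1 bit after the first
// pass and 4 bits after the second (the vssrarni immediates below),
// the latter folding in the final downshift.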
function inv_txfm_add_identity_identity_8x8_8bpc_lsx

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15

    // identity8
    vsllwil.w.h   vr6,      vr0,      1
    vsllwil.w.h   vr7,      vr1,      1
    vsllwil.w.h   vr8,      vr2,      1
    vsllwil.w.h   vr9,      vr3,      1
    vsllwil.w.h   vr10,     vr4,      1
    vsllwil.w.h   vr11,     vr5,      1
    vsllwil.w.h   vr12,     vr14,     1
    vsllwil.w.h   vr13,     vr15,     1

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
    vexth.w.h     \i,       \i
.endr

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15
    vslli.w       \i,       \i,       1
.endr

    vssrarni.h.w  vr0,      vr6,      1     // in0
    vssrarni.h.w  vr1,      vr7,      1     // in1
    vssrarni.h.w  vr2,      vr8,      1     // in2
    vssrarni.h.w  vr3,      vr9,      1     // in3
    vssrarni.h.w  vr4,      vr10,     1     // in4
    vssrarni.h.w  vr5,      vr11,     1     // in5
    vssrarni.h.w  vr14,     vr12,     1     // in6
    vssrarni.h.w  vr15,     vr13,     1     // in7

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr14, vr15, \
                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13

    vsllwil.w.h   vr6,      vr16,     1
    vsllwil.w.h   vr7,      vr17,     1
    vsllwil.w.h   vr8,      vr18,     1
    vsllwil.w.h   vr9,      vr19,     1
    vsllwil.w.h   vr10,     vr20,     1
    vsllwil.w.h   vr11,     vr21,     1
    vsllwil.w.h   vr12,     vr22,     1
    vsllwil.w.h   vr13,     vr23,     1

.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vexth.w.h     \i,       \i
.endr

.irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    vslli.w       \i,       \i,       1
.endr

    vssrarni.h.w  vr16,     vr6,      4     // in0
    vssrarni.h.w  vr17,     vr7,      4     // in1
    vssrarni.h.w  vr18,     vr8,      4     // in2
    vssrarni.h.w  vr19,     vr9,      4     // in3
    vssrarni.h.w  vr20,     vr10,     4     // in4
    vssrarni.h.w  vr21,     vr11,     4     // in5
    vssrarni.h.w  vr22,     vr12,     4     // in6
    vssrarni.h.w  vr23,     vr13,     4     // in7

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

endfunc

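// 8x8 variant of the ADST pass: same dataflow as adst8x4_1d_lsx, but
// the four packed result pairs land in caller-chosen registers,
// \out0 = out0|out7, \out1 = out1|out6, \out2 = out2|out5,
// \out3 = out3|out4, so the low and high halves of the block can be
// processed back to back.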
.macro adst8x8_1d_lsx out0, out1, out2, out3
    la.local      t0,       iadst8_coeffs

    vldrepl.w     vr20,     t0,       0     // 4076
    vldrepl.w     vr21,     t0,       4     // 401
    vldrepl.w     vr22,     t0,       8     // 3612
    vldrepl.w     vr23,     t0,       12    // 1931

    // vr13 t0a t1a    vr15 t2a t3a
    vmadd_vmsub_vssrarni_hw_12 vr11, vr18, vr9, vr6, vr20, vr21, vr21, vr20, \
                               vr22, vr23, vr23, vr22, vr12, vr13, vr14, vr15
    vldrepl.w     vr20,     t0,       16    // 2598
    vldrepl.w     vr21,     t0,       20    // 3166
    vldrepl.w     vr22,     t0,       24    // 1189
    vldrepl.w     vr23,     t0,       28    // 3920

    // vr18 t4a t5a     vr6 t6a t7a
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr19, vr10, vr20, vr21, vr21, vr20, \
                               vr22, vr23, vr23, vr22, vr11, vr18, vr9, vr6

    vsadd.h       vr12,     vr13,     vr18  // t0 t1
    vsadd.h       vr14,     vr15,     vr6   // t2 t3
    vssub.h       vr9,      vr13,     vr18  // t4 t5
    vssub.h       vr18,     vr15,     vr6   // t6 t7

    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    vsllwil.w.h   vr7,      vr9,      0     // t4
    vexth.w.h     vr8,      vr9             // t5
    vsllwil.w.h   vr10,     vr18,     0     // t6
    vexth.w.h     vr11,     vr18            // t7

    // vr13 out0 out7   vr17 out1 out6
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr11, vr10, vr21, vr20, vr20, vr21, \
                               vr20, vr21, vr21, vr20, vr13, vr15, vr18, vr19
    vshuf4i.d     vr19,     vr19,     0x01

    vsadd.h       vr13,     vr12,     vr14  // out0 out7
    vssub.h       vr6,      vr12,     vr14  // t2 t3
    vsadd.h       vr7,      vr15,     vr19  // out1 out6
    vssub.h       vr18,     vr15,     vr19  // t6 t7

    vexth.w.h     vr20,     vr13            // out7
    vsllwil.w.h   vr21,     vr7,      0     // out1
    vneg.w        vr20,     vr20
    vneg.w        vr21,     vr21
    vssrarni.h.w  vr21,     vr20,     0     // out7 out1
    vilvl.d       \out0,    vr21,     vr13  // out0 out7
    vilvh.d       \out1,    vr7,      vr21  // out1 out6

    vsllwil.w.h   vr7,      vr6,      0     // t2
    vexth.w.h     vr8,      vr6             // t3
    vsllwil.w.h   vr10,     vr18,     0     // t6
    vexth.w.h     vr11,     vr18            // t7

    // vr15 out[3] out[4]    vr18 out[2] out[5]
    vmadd_vmsub_vssrarni_hw_12 vr7, vr8, vr10, vr11, vr22, vr22, vr22, vr22, \
                               vr22, vr22, vr22, vr22, vr14, vr15, vr19, vr18

    vexth.w.h     vr20,     vr18            // out5
    vsllwil.w.h   vr21,     vr15,     0     // out3
    vneg.w        vr20,     vr20
    vneg.w        vr21,     vr21
    vssrarni.h.w  vr21,     vr20,     0     // out5 out3
    vilvl.d       \out2,    vr21,     vr18  // out2 out5
    vilvh.d       \out3,    vr15,     vr21  // out3 out4
.endm

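// The 8x8 compound transforms below spill f24-f27 (plus f28/f29 where
// needed) because they keep live data in vr24 and up; those vector
// registers alias the FP registers, whose low 64 bits are callee-saved
// under the LoongArch64 calling convention these fst.d/fld.d pairs assume.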
function inv_txfm_add_adst_dct_8x8_8bpc_lsx
    addi.d        sp,       sp,       -32
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr0,      0
    vsllwil.w.h   vr19,     vr1,      0
    vsllwil.w.h   vr6,      vr2,      0
    vsllwil.w.h   vr7,      vr3,      0
    vsllwil.w.h   vr8,      vr4,      0
    vsllwil.w.h   vr9,      vr5,      0
    vsllwil.w.h   vr10,     vr16,     0
    vsllwil.w.h   vr11,     vr17,     0
    adst8x8_1d_lsx vr24, vr25, vr26, vr27

    vexth.w.h     vr18,     vr0
    vexth.w.h     vr19,     vr1
    vexth.w.h     vr6,      vr2
    vexth.w.h     vr7,      vr3
    vexth.w.h     vr8,      vr4
    vexth.w.h     vr9,      vr5
    vexth.w.h     vr10,     vr16
    vexth.w.h     vr11,     vr17
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h      \i,       \i,       1
.endr

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
                       vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17

    vshuf4i.h     vr14,     vr14,     0x1b
    vshuf4i.h     vr15,     vr15,     0x1b
    vshuf4i.h     vr24,     vr24,     0x1b
    vshuf4i.h     vr25,     vr25,     0x1b

    vsllwil.w.h   vr18,     vr4,      0
    vsllwil.w.h   vr19,     vr5,      0
    vsllwil.w.h   vr6,      vr12,     0
    vsllwil.w.h   vr7,      vr13,     0
    vexth.w.h     vr8,      vr4
    vexth.w.h     vr9,      vr5
    vexth.w.h     vr10,     vr12
    vexth.w.h     vr11,     vr13

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    dct_8x4_core_lsx1 vr4, vr5, vr12, vr13

    vshuf4i.d     vr5,      vr5,      0x01
    vshuf4i.d     vr13,     vr13,     0x01

    vsllwil.w.h   vr18,     vr14,     0
    vsllwil.w.h   vr19,     vr15,     0
    vsllwil.w.h   vr6,      vr24,     0
    vsllwil.w.h   vr7,      vr25,     0
    vexth.w.h     vr8,      vr14
    vexth.w.h     vr9,      vr15
    vexth.w.h     vr10,     vr24
    vexth.w.h     vr11,     vr25

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896

    dct_8x4_core_lsx1 vr14, vr15, vr24, vr25

    vshuf4i.d     vr15,     vr15,     0x01
    vshuf4i.d     vr25,     vr25,     0x01

    vilvl.d       vr20,     vr14,     vr4
    vilvh.d       vr21,     vr14,     vr4
    vilvl.d       vr22,     vr15,     vr5
    vilvh.d       vr23,     vr15,     vr5
    vilvl.d       vr16,     vr24,     vr12
    vilvh.d       vr17,     vr24,     vr12
    vilvl.d       vr18,     vr25,     vr13
    vilvh.d       vr19,     vr25,     vr13

.irp i, vr20, vr21, vr22, vr23, vr16, vr17, vr18, vr19
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    addi.d        sp,       sp,       32
endfunc

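// dct_adst runs the row IDCT twice, once on the low and once on the
// high 16-bit halves of the eight input vectors (dct_8x4_core_lsx1
// covers four 32-bit columns per call), rounds the merged halves by
// 1 bit, and then feeds them through two adst8x8_1d_lsx column passes.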
2617function inv_txfm_add_dct_adst_8x8_8bpc_lsx
2618    addi.d        sp,       sp,       -48
2619    fst.d         f24,      sp,       0
2620    fst.d         f25,      sp,       8
2621    fst.d         f26,      sp,       16
2622    fst.d         f27,      sp,       24
2623    fst.d         f28,      sp,       32
2624    fst.d         f29,      sp,       40
2625
2626    vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25
2627
2628    la.local      t0,       idct_coeffs
2629    vldrepl.w     vr20,     t0,       8    // 1567
2630    vldrepl.w     vr21,     t0,       12   // 3784
2631    vldrepl.w     vr22,     t0,       0    // 2896
2632
2633    vsllwil.w.h   vr18,     vr4,      0
2634    vsllwil.w.h   vr19,     vr5,      0
2635    vsllwil.w.h   vr6,      vr12,     0
2636    vsllwil.w.h   vr7,      vr13,     0
2637    vsllwil.w.h   vr8,      vr14,     0
2638    vsllwil.w.h   vr9,      vr15,     0
2639    vsllwil.w.h   vr10,     vr24,     0
2640    vsllwil.w.h   vr11,     vr25,     0
2641
2642    dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
2643
2644    vshuf4i.d     vr27,     vr27,     0x01
2645    vshuf4i.d     vr29,     vr29,     0x01
2646
2647    vilvl.h       vr8,      vr27,     vr26  // 0 2 4 6 8 10 12 14
2648    vilvh.h       vr9,      vr27,     vr26  // 1 3 5 7 9 11 13 15
2649    vilvl.h       vr26,     vr9,      vr8   // 0 - 7 in0
2650    vilvh.h       vr27,     vr9,      vr8   // 8 - 15 in1
2651    vilvl.h       vr8,      vr29,     vr28  // 0 2 4 6 8 10 12 14
2652    vilvh.h       vr9,      vr29,     vr28  // 1 3 5 7 9 11 13 15
2653    vilvl.h       vr28,     vr9,      vr8   // 16 - 23  in2
2654    vilvh.h       vr29,     vr9,      vr8   // 24 - 31  in3
2655
2656    vsrari.h      vr26,     vr26,     1     // in0low in1low
2657    vsrari.h      vr27,     vr27,     1     // in2low in3low
2658    vsrari.h      vr28,     vr28,     1     // in0high in1high
2659    vsrari.h      vr29,     vr29,     1     // in2high in3high
2660
2661    vexth.w.h     vr18,     vr4
2662    vexth.w.h     vr19,     vr5
2663    vexth.w.h     vr6,      vr12
2664    vexth.w.h     vr7,      vr13
2665    vexth.w.h     vr8,      vr14
2666    vexth.w.h     vr9,      vr15
2667    vexth.w.h     vr10,     vr24
2668    vexth.w.h     vr11,     vr25
2669
2670    la.local      t0,       idct_coeffs
2671    vldrepl.w     vr20,     t0,       8     // 1567
2672    vldrepl.w     vr21,     t0,       12    // 3784
2673    vldrepl.w     vr22,     t0,       0     // 2896
2674
2675    dct_8x4_core_lsx1 vr12, vr13, vr14, vr15
2676
2677    vshuf4i.d     vr13,     vr13,     0x01
2678    vshuf4i.d     vr15,     vr15,     0x01
2679
2680    vilvl.h       vr8,      vr13,     vr12  // 0 2 4 6 8 10 12 14
2681    vilvh.h       vr9,      vr13,     vr12  // 1 3 5 7 9 11 13 15
2682    vilvl.h       vr12,     vr9,      vr8   // 0 - 7 in0
2683    vilvh.h       vr13,     vr9,      vr8   // 8 - 15 in1
2684    vilvl.h       vr8,      vr15,     vr14  // 0 2 4 6 8 10 12 14
2685    vilvh.h       vr9,      vr15,     vr14  // 1 3 5 7 9 11 13 15
2686    vilvl.h       vr14,     vr9,      vr8   // 16 - 23  in2
2687    vilvh.h       vr15,     vr9,      vr8   // 24 - 31  in3
2688
2689    vsrari.h      vr0,      vr12,     1     // in4low in5low
2690    vsrari.h      vr1,      vr13,     1     // in6low in7low
2691    vsrari.h      vr2,      vr14,     1     // in4high in5high
2692    vsrari.h      vr3,      vr15,     1     // in6high in7high
2693
2694    vreplgr2vr.h  vr23,     zero
2695.irp i, 0, 16, 32, 48, 64, 80, 96, 112
2696    vst           vr23,     a2,       \i
2697.endr
2698
2699    vsllwil.w.h   vr18,     vr26,     0     // in0
2700    vexth.w.h     vr19,     vr26            // in1
2701    vsllwil.w.h   vr6,      vr27,     0     // in2
2702    vexth.w.h     vr7,      vr27            // in3
2703    vsllwil.w.h   vr8,      vr0,      0     // in3
2704    vexth.w.h     vr9,      vr0             // in4
2705    vsllwil.w.h   vr10,     vr1,      0     // in5
2706    vexth.w.h     vr11,     vr1             // in6
2707    adst8x8_1d_lsx vr26, vr27, vr0, vr1
2708
2709    vsllwil.w.h   vr18,     vr28,     0     // in0
2710    vexth.w.h     vr19,     vr28            // in1
2711    vsllwil.w.h   vr6,      vr29,     0     // in2
2712    vexth.w.h     vr7,      vr29            // in3
2713    vsllwil.w.h   vr8,      vr2,      0     // in4
2714    vexth.w.h     vr9,      vr2             // in5
2715    vsllwil.w.h   vr10,     vr3,      0     // in6
2716    vexth.w.h     vr11,     vr3             // in7
2717    adst8x8_1d_lsx vr28, vr29, vr16, vr17
2718
2719    vilvl.d       vr4,      vr28,     vr26  // 0 ... 7
2720    vilvl.d       vr5,      vr29,     vr27  // 8 ... 15
2721    vilvl.d       vr6,      vr16,     vr0   // 16 ... 23
2722    vilvl.d       vr7,      vr17,     vr1   // 24 ... 31
2723    vilvh.d       vr14,     vr17,     vr1   // 32 ... 39
2724    vilvh.d       vr15,     vr16,     vr0   // 40 ... 47
2725    vilvh.d       vr16,     vr29,     vr27  // 48 ... 55
2726    vilvh.d       vr17,     vr28,     vr26  // 56 ... 63
2727
2728.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
2729    vsrari.h      \i,       \i,       4
2730.endr
2731
2732    alsl.d        t2,       a1,       a0,     1
2733
2734    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
2735
2736    alsl.d        a0,       a1,       a0,     2
2737    alsl.d        t2,       a1,       a0,     1
2738
2739    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
2740
2741    fld.d         f24,      sp,       0
2742    fld.d         f25,      sp,       8
2743    fld.d         f26,      sp,       16
2744    fld.d         f27,      sp,       24
2745    fld.d         f28,      sp,       32
2746    fld.d         f29,      sp,       40
2747    addi.d        sp,       sp,       48
2748endfunc
2749
2750function inv_txfm_add_adst_adst_8x8_8bpc_lsx
2751    addi.d        sp,       sp,       -32
2752    fst.d         f24,      sp,       0
2753    fst.d         f25,      sp,       8
2754    fst.d         f26,      sp,       16
2755    fst.d         f27,      sp,       24
2756
2757    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17
2758
2759    vsllwil.w.h   vr18,     vr0,      0
2760    vsllwil.w.h   vr19,     vr1,      0
2761    vsllwil.w.h   vr6,      vr2,      0
2762    vsllwil.w.h   vr7,      vr3,      0
2763    vsllwil.w.h   vr8,      vr4,      0
2764    vsllwil.w.h   vr9,      vr5,      0
2765    vsllwil.w.h   vr10,     vr16,     0
2766    vsllwil.w.h   vr11,     vr17,     0
2767    adst8x8_1d_lsx vr24, vr25, vr26, vr27
2768
2769    vexth.w.h     vr18,     vr0            // in0
2770    vexth.w.h     vr19,     vr1            // in1
2771    vexth.w.h     vr6,      vr2            // in2
2772    vexth.w.h     vr7,      vr3            // in3
2773    vexth.w.h     vr8,      vr4            // in3
2774    vexth.w.h     vr9,      vr5            // in4
2775    vexth.w.h     vr10,     vr16           // in5
2776    vexth.w.h     vr11,     vr17           // in6
2777    adst8x8_1d_lsx vr0, vr1, vr2, vr3
2778
2779    vreplgr2vr.h  vr23,     zero
2780.irp i, 0, 16, 32, 48, 64, 80, 96, 112
2781    vst           vr23,     a2,       \i
2782.endr
2783
2784.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
2785    vsrari.h        \i,       \i,     1
2786.endr
2787
2788    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
2789                       vr14, vr15, vr12, vr13, vr4, vr5, vr24, vr25, \
2790                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17
2791
2792    vshuf4i.h     vr4,      vr4,      0x1b
2793    vshuf4i.h     vr5,      vr5,      0x1b
2794    vshuf4i.h     vr24,     vr24,     0x1b
2795    vshuf4i.h     vr25,     vr25,     0x1b
2796
2797    vsllwil.w.h   vr18,     vr14,     0
2798    vsllwil.w.h   vr19,     vr15,     0
2799    vsllwil.w.h   vr6,      vr12,     0
2800    vsllwil.w.h   vr7,      vr13,     0
2801    vexth.w.h     vr8,      vr14            // in3
2802    vexth.w.h     vr9,      vr15            // in4
2803    vexth.w.h     vr10,     vr12            // in5
2804    vexth.w.h     vr11,     vr13            // in6
2805
2806    adst8x8_1d_lsx vr26, vr27, vr0, vr1
2807
2808    vsllwil.w.h   vr18,     vr4,     0
2809    vsllwil.w.h   vr19,     vr5,     0
2810    vsllwil.w.h   vr6,      vr24,    0
2811    vsllwil.w.h   vr7,      vr25,    0
2812    vexth.w.h     vr8,      vr4             // in3
2813    vexth.w.h     vr9,      vr5             // in4
2814    vexth.w.h     vr10,     vr24            // in5
2815    vexth.w.h     vr11,     vr25            // in6

    adst8x8_1d_lsx vr24, vr25, vr16, vr17

    vilvl.d       vr4,      vr24,     vr26  // 0 ... 7
    vilvl.d       vr5,      vr25,     vr27  // 8 ... 15
    vilvl.d       vr6,      vr16,     vr0   // 16 ... 23
    vilvl.d       vr7,      vr17,     vr1   // 24 ... 31
    vilvh.d       vr14,     vr17,     vr1   // 32 ... 39
    vilvh.d       vr15,     vr16,     vr0   // 40 ... 47
    vilvh.d       vr16,     vr25,     vr27  // 48 ... 55
    vilvh.d       vr17,     vr24,     vr26  // 56 ... 63

.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    addi.d        sp,       sp,       32
endfunc

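/*
 * flipadst is adst with the outputs taken in reverse order; in the
 * variants below that reversal is done with vshuf4i.h and immediate 0x1b
 * (reversing each group of four halfwords) together with swapped
 * vilvl.d/vilvh.d pairings when the halves are recombined.
 */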
function inv_txfm_add_flipadst_adst_8x8_8bpc_lsx
    addi.d        sp,       sp,       -32
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr0,      0
    vsllwil.w.h   vr19,     vr1,      0
    vsllwil.w.h   vr6,      vr2,      0
    vsllwil.w.h   vr7,      vr3,      0
    vsllwil.w.h   vr8,      vr4,      0
    vsllwil.w.h   vr9,      vr5,      0
    vsllwil.w.h   vr10,     vr16,     0
    vsllwil.w.h   vr11,     vr17,     0
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h       vr20,     vr12,     vr13
    vilvl.h       vr21,     vr14,     vr15
    vilvl.w       vr24,     vr20,     vr21
    vilvh.w       vr25,     vr20,     vr21
    vilvh.h       vr20,     vr12,     vr13
    vilvh.h       vr21,     vr14,     vr15
    vilvl.w       vr26,     vr20,     vr21
    vilvh.w       vr27,     vr20,     vr21
    vshuf4i.h     vr26,     vr26,     0x1b
    vshuf4i.h     vr27,     vr27,     0x1b

    vexth.w.h     vr18,     vr0
    vexth.w.h     vr19,     vr1
    vexth.w.h     vr6,      vr2
    vexth.w.h     vr7,      vr3
    vexth.w.h     vr8,      vr4
    vexth.w.h     vr9,      vr5
    vexth.w.h     vr10,     vr16
    vexth.w.h     vr11,     vr17
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h       vr20,     vr12,     vr13
    vilvl.h       vr21,     vr14,     vr15
    vilvl.w       vr0,      vr20,     vr21
    vilvh.w       vr1,      vr20,     vr21
    vilvh.h       vr20,     vr12,     vr13
    vilvh.h       vr21,     vr14,     vr15
    vilvl.w       vr2,      vr20,     vr21
    vilvh.w       vr3,      vr20,     vr21
    vshuf4i.h     vr2,      vr2,      0x1b
    vshuf4i.h     vr3,      vr3,      0x1b

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h      \i,       \i,       1
.endr

    vsllwil.w.h   vr18,     vr26,     0    // in0
    vexth.w.h     vr19,     vr26           // in1
    vsllwil.w.h   vr6,      vr27,     0    // in2
    vexth.w.h     vr7,      vr27           // in3
    vsllwil.w.h   vr8,      vr2,      0    // in4
    vexth.w.h     vr9,      vr2            // in5
    vsllwil.w.h   vr10,     vr3,      0    // in6
    vexth.w.h     vr11,     vr3            // in7
    adst8x8_1d_lsx vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr24,     0    // in0
    vexth.w.h     vr19,     vr24           // in1
    vsllwil.w.h   vr6,      vr25,     0    // in2
    vexth.w.h     vr7,      vr25           // in3
    vsllwil.w.h   vr8,      vr0,      0    // in4
    vexth.w.h     vr9,      vr0            // in5
    vsllwil.w.h   vr10,     vr1,      0    // in6
    vexth.w.h     vr11,     vr1            // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvl.d       vr20,     vr0,     vr4   // 0 ... 7
    vilvl.d       vr21,     vr1,     vr5   // 8 ... 15
    vilvl.d       vr22,     vr2,     vr16  // 16 ... 23
    vilvl.d       vr23,     vr3,     vr17  // 24 ... 31
    vilvh.d       vr14,     vr3,     vr17  // 32 ... 39
    vilvh.d       vr15,     vr2,     vr16  // 40 ... 47
    vilvh.d       vr16,     vr1,     vr5   // 48 ... 55
    vilvh.d       vr17,     vr0,     vr4   // 56 ... 63

.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr16, vr17
    vsrari.h      \i,       \i,      4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17
    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    addi.d        sp,       sp,       32
endfunc

function inv_txfm_add_adst_flipadst_8x8_8bpc_lsx
    addi.d        sp,       sp,       -32
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr0,      0
    vsllwil.w.h   vr19,     vr1,      0
    vsllwil.w.h   vr6,      vr2,      0
    vsllwil.w.h   vr7,      vr3,      0
    vsllwil.w.h   vr8,      vr4,      0
    vsllwil.w.h   vr9,      vr5,      0
    vsllwil.w.h   vr10,     vr16,     0
    vsllwil.w.h   vr11,     vr17,     0
    adst8x8_1d_lsx vr24, vr25, vr26, vr27

    vexth.w.h     vr18,     vr0
    vexth.w.h     vr19,     vr1
    vexth.w.h     vr6,      vr2
    vexth.w.h     vr7,      vr3
    vexth.w.h     vr8,      vr4
    vexth.w.h     vr9,      vr5
    vexth.w.h     vr10,     vr16
    vexth.w.h     vr11,     vr17
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h      \i,       \i,       1
.endr

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
                       vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17

    vshuf4i.h     vr0,      vr0,      0x1b
    vshuf4i.h     vr1,      vr1,      0x1b
    vshuf4i.h     vr2,      vr2,      0x1b
    vshuf4i.h     vr3,      vr3,      0x1b

    vsllwil.w.h   vr18,     vr0,      0    // in0
    vsllwil.w.h   vr19,     vr1,      0    // in1
    vsllwil.w.h   vr6,      vr2,      0    // in2
    vsllwil.w.h   vr7,      vr3,      0    // in3
    vexth.w.h     vr8,      vr0            // in4
    vexth.w.h     vr9,      vr1            // in5
    vexth.w.h     vr10,     vr2            // in6
    vexth.w.h     vr11,     vr3            // in7
    adst8x8_1d_lsx vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr24,     0    // in0
    vsllwil.w.h   vr19,     vr25,     0    // in1
    vsllwil.w.h   vr6,      vr26,     0    // in2
    vsllwil.w.h   vr7,      vr27,     0    // in3
    vexth.w.h     vr8,      vr24           // in4
    vexth.w.h     vr9,      vr25           // in5
    vexth.w.h     vr10,     vr26           // in6
    vexth.w.h     vr11,     vr27           // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvh.d       vr20,     vr4,      vr0
    vilvh.d       vr21,     vr5,      vr1
    vilvh.d       vr22,     vr16,     vr2
    vilvh.d       vr23,     vr17,     vr3
    vilvl.d       vr14,     vr17,     vr3
    vilvl.d       vr15,     vr16,     vr2
    vilvl.d       vr18,     vr5,      vr1
    vilvl.d       vr19,     vr4,      vr0

.irp i, vr20, vr21, vr22, vr23, vr14, vr15, vr18, vr19
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr18, vr19

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    addi.d        sp,       sp,       32
endfunc

function inv_txfm_add_flipadst_dct_8x8_8bpc_lsx
    addi.d        sp,       sp,       -32
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr0,      0
    vsllwil.w.h   vr19,     vr1,      0
    vsllwil.w.h   vr6,      vr2,      0
    vsllwil.w.h   vr7,      vr3,      0
    vsllwil.w.h   vr8,      vr4,      0
    vsllwil.w.h   vr9,      vr5,      0
    vsllwil.w.h   vr10,     vr16,     0
    vsllwil.w.h   vr11,     vr17,     0
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h       vr20,     vr12,     vr13
    vilvl.h       vr21,     vr14,     vr15
    vilvl.w       vr24,     vr20,     vr21
    vilvh.w       vr25,     vr20,     vr21
    vilvh.h       vr20,     vr12,     vr13
    vilvh.h       vr21,     vr14,     vr15
    vilvl.w       vr26,     vr20,     vr21
    vilvh.w       vr27,     vr20,     vr21

    vexth.w.h     vr18,     vr0
    vexth.w.h     vr19,     vr1
    vexth.w.h     vr6,      vr2
    vexth.w.h     vr7,      vr3
    vexth.w.h     vr8,      vr4
    vexth.w.h     vr9,      vr5
    vexth.w.h     vr10,     vr16
    vexth.w.h     vr11,     vr17
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h       vr20,     vr12,     vr13
    vilvl.h       vr21,     vr14,     vr15
    vilvl.w       vr0,      vr20,     vr21
    vilvh.w       vr1,      vr20,     vr21
    vilvh.h       vr20,     vr12,     vr13
    vilvh.h       vr21,     vr14,     vr15
    vilvl.w       vr2,      vr20,     vr21
    vilvh.w       vr3,      vr20,     vr21

    vreplgr2vr.h  vr23,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

    vsrari.h      vr24,     vr24,     1
    vsrari.h      vr25,     vr25,     1
    vsrari.h      vr26,     vr26,     1
    vsrari.h      vr27,     vr27,     1
    vsrari.h      vr14,     vr0,      1
    vsrari.h      vr15,     vr1,      1
    vsrari.h      vr16,     vr2,      1
    vsrari.h      vr17,     vr3,      1

    vsllwil.w.h   vr18,     vr26,     0
    vexth.w.h     vr19,     vr26
    vsllwil.w.h   vr6,      vr27,     0
    vexth.w.h     vr7,      vr27
    vsllwil.w.h   vr8,      vr16,     0
    vexth.w.h     vr9,      vr16
    vsllwil.w.h   vr10,     vr17,     0
    vexth.w.h     vr11,     vr17

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    dct_8x4_core_lsx1 vr26, vr27, vr16, vr17

    vshuf4i.h     vr26,     vr26,     0x1b
    vshuf4i.h     vr27,     vr27,     0x1b
    vshuf4i.h     vr16,     vr16,     0x1b
    vshuf4i.h     vr17,     vr17,     0x1b

    vsllwil.w.h   vr18,     vr24,     0
    vexth.w.h     vr19,     vr24
    vsllwil.w.h   vr6,      vr25,     0
    vexth.w.h     vr7,      vr25
    vsllwil.w.h   vr8,      vr14,     0
    vexth.w.h     vr9,      vr14
    vsllwil.w.h   vr10,     vr15,     0
    vexth.w.h     vr11,     vr15

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    dct_8x4_core_lsx1 vr24, vr25, vr14, vr15

    vilvl.d       vr4,      vr24,     vr26
    vilvh.d       vr5,      vr24,     vr26
    vilvh.d       vr6,      vr25,     vr27
    vilvl.d       vr7,      vr25,     vr27
    vilvl.d       vr24,     vr14,     vr16
    vilvh.d       vr25,     vr14,     vr16
    vilvh.d       vr26,     vr15,     vr17
    vilvl.d       vr27,     vr15,     vr17

.irp i, vr4, vr5, vr6, vr7, vr24, vr25, vr26, vr27
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr24, vr25, vr26, vr27

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    addi.d        sp,       sp,       32
endfunc

function inv_txfm_add_dct_flipadst_8x8_8bpc_lsx
    addi.d        sp,       sp,       -48
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40

    vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    vsllwil.w.h   vr18,     vr4,      0
    vsllwil.w.h   vr19,     vr5,      0
    vsllwil.w.h   vr6,      vr12,     0
    vsllwil.w.h   vr7,      vr13,     0
    vsllwil.w.h   vr8,      vr14,     0
    vsllwil.w.h   vr9,      vr15,     0
    vsllwil.w.h   vr10,     vr24,     0
    vsllwil.w.h   vr11,     vr25,     0
    dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
    vshuf4i.d     vr27,     vr27,     0x01
    vshuf4i.d     vr29,     vr29,     0x01

    vilvl.h       vr8,      vr27,     vr26
    vilvh.h       vr9,      vr27,     vr26
    vilvl.h       vr26,     vr9,      vr8
    vilvh.h       vr27,     vr9,      vr8
    vilvl.h       vr8,      vr29,     vr28
    vilvh.h       vr9,      vr29,     vr28
    vilvl.h       vr28,     vr9,      vr8
    vilvh.h       vr29,     vr9,      vr8

    vsrari.h      vr26,     vr26,     1     // in0low in1low
    vsrari.h      vr27,     vr27,     1     // in2low in3low
    vsrari.h      vr28,     vr28,     1     // in0high in1high
    vsrari.h      vr29,     vr29,     1     // in2high in3high

    vexth.w.h     vr18,     vr4
    vexth.w.h     vr19,     vr5
    vexth.w.h     vr6,      vr12
    vexth.w.h     vr7,      vr13
    vexth.w.h     vr8,      vr14
    vexth.w.h     vr9,      vr15
    vexth.w.h     vr10,     vr24
    vexth.w.h     vr11,     vr25
    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896
    dct_8x4_core_lsx1 vr12, vr13, vr14, vr15
    vshuf4i.d     vr13,     vr13,     0x01
    vshuf4i.d     vr15,     vr15,     0x01

    vilvl.h       vr8,      vr13,     vr12
    vilvh.h       vr9,      vr13,     vr12
    vilvl.h       vr12,     vr9,      vr8
    vilvh.h       vr13,     vr9,      vr8
    vilvl.h       vr8,      vr15,     vr14
    vilvh.h       vr9,      vr15,     vr14
    vilvl.h       vr14,     vr9,      vr8
    vilvh.h       vr15,     vr9,      vr8

    vsrari.h      vr0,      vr12,     1
    vsrari.h      vr1,      vr13,     1
    vsrari.h      vr2,      vr14,     1
    vsrari.h      vr3,      vr15,     1

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

    vsllwil.w.h   vr18,     vr28,     0    // in0
    vexth.w.h     vr19,     vr28           // in1
    vsllwil.w.h   vr6,      vr29,     0    // in2
    vexth.w.h     vr7,      vr29           // in3
    vsllwil.w.h   vr8,      vr2,      0    // in4
    vexth.w.h     vr9,      vr2            // in5
    vsllwil.w.h   vr10,     vr3,      0    // in6
    vexth.w.h     vr11,     vr3            // in7
    adst8x8_1d_lsx vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr26,     0    // in0
    vexth.w.h     vr19,     vr26           // in1
    vsllwil.w.h   vr6,      vr27,     0    // in2
    vexth.w.h     vr7,      vr27           // in3
    vsllwil.w.h   vr8,      vr0,      0    // in4
    vexth.w.h     vr9,      vr0            // in5
    vsllwil.w.h   vr10,     vr1,      0    // in6
    vexth.w.h     vr11,     vr1            // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvh.d       vr26,     vr4,      vr0
    vilvh.d       vr27,     vr5,      vr1
    vilvh.d       vr28,     vr16,     vr2
    vilvh.d       vr29,     vr17,     vr3
    vilvl.d       vr20,     vr17,     vr3
    vilvl.d       vr21,     vr16,     vr2
    vilvl.d       vr22,     vr5,      vr1
    vilvl.d       vr23,     vr4,      vr0

.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr26, vr27, vr28, vr29

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    addi.d        sp,       sp,       48
endfunc

function inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx
    addi.d        sp,       sp,       -32
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr0,      0
    vsllwil.w.h   vr19,     vr1,      0
    vsllwil.w.h   vr6,      vr2,      0
    vsllwil.w.h   vr7,      vr3,      0
    vsllwil.w.h   vr8,      vr4,      0
    vsllwil.w.h   vr9,      vr5,      0
    vsllwil.w.h   vr10,     vr16,     0
    vsllwil.w.h   vr11,     vr17,     0
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h       vr20,     vr12,     vr13
    vilvl.h       vr21,     vr14,     vr15
    vilvl.w       vr24,     vr20,     vr21
    vilvh.w       vr25,     vr20,     vr21
    vilvh.h       vr20,     vr12,     vr13
    vilvh.h       vr21,     vr14,     vr15
    vilvl.w       vr26,     vr20,     vr21
    vilvh.w       vr27,     vr20,     vr21
    vshuf4i.h     vr26,     vr26,     0x1b
    vshuf4i.h     vr27,     vr27,     0x1b

    vexth.w.h     vr18,     vr0
    vexth.w.h     vr19,     vr1
    vexth.w.h     vr6,      vr2
    vexth.w.h     vr7,      vr3
    vexth.w.h     vr8,      vr4
    vexth.w.h     vr9,      vr5
    vexth.w.h     vr10,     vr16
    vexth.w.h     vr11,     vr17
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h       vr20,     vr12,     vr13
    vilvl.h       vr21,     vr14,     vr15
    vilvl.w       vr0,      vr20,     vr21
    vilvh.w       vr1,      vr20,     vr21
    vilvh.h       vr20,     vr12,     vr13
    vilvh.h       vr21,     vr14,     vr15
    vilvl.w       vr2,      vr20,     vr21
    vilvh.w       vr3,      vr20,     vr21
    vshuf4i.h     vr2,      vr2,      0x1b
    vshuf4i.h     vr3,      vr3,      0x1b

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h      \i,       \i,       1
.endr

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

    vsllwil.w.h   vr18,     vr26,     0    // in0
    vexth.w.h     vr19,     vr26           // in1
    vsllwil.w.h   vr6,      vr27,     0    // in2
    vexth.w.h     vr7,      vr27           // in3
    vsllwil.w.h   vr8,      vr2,      0    // in4
    vexth.w.h     vr9,      vr2            // in5
    vsllwil.w.h   vr10,     vr3,      0    // in6
    vexth.w.h     vr11,     vr3            // in7
    adst8x8_1d_lsx vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr24,     0    // in0
    vexth.w.h     vr19,     vr24           // in1
    vsllwil.w.h   vr6,      vr25,     0    // in2
    vexth.w.h     vr7,      vr25           // in3
    vsllwil.w.h   vr8,      vr0,      0    // in4
    vexth.w.h     vr9,      vr0            // in5
    vsllwil.w.h   vr10,     vr1,      0    // in6
    vexth.w.h     vr11,     vr1            // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvh.d       vr24,     vr0,      vr4
    vilvh.d       vr25,     vr1,      vr5
    vilvh.d       vr26,     vr2,      vr16
    vilvh.d       vr27,     vr3,      vr17
    vilvl.d       vr20,     vr3,      vr17
    vilvl.d       vr21,     vr2,      vr16
    vilvl.d       vr22,     vr1,      vr5
    vilvl.d       vr23,     vr0,      vr4

.irp i, vr24, vr25, vr26, vr27, vr20, vr21, vr22, vr23
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr24, vr25, vr26, vr27

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    addi.d        sp,       sp,       32
endfunc

function inv_txfm_add_dct_identity_8x8_8bpc_lsx
    addi.d        sp,       sp,       -48
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40

    vld_x8 a2, 0, 16, vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    vsllwil.w.h   vr18,     vr4,      0
    vsllwil.w.h   vr19,     vr5,      0
    vsllwil.w.h   vr6,      vr12,     0
    vsllwil.w.h   vr7,      vr13,     0
    vsllwil.w.h   vr8,      vr14,     0
    vsllwil.w.h   vr9,      vr15,     0
    vsllwil.w.h   vr10,     vr24,     0
    vsllwil.w.h   vr11,     vr25,     0
    dct_8x4_core_lsx1 vr26, vr27, vr28, vr29
    vshuf4i.d     vr27,     vr27,     0x01
    vshuf4i.d     vr29,     vr29,     0x01

    vilvl.h       vr8,      vr27,     vr26
    vilvh.h       vr9,      vr27,     vr26
    vilvl.h       vr26,     vr9,      vr8
    vilvh.h       vr27,     vr9,      vr8
    vilvl.h       vr8,      vr29,     vr28
    vilvh.h       vr9,      vr29,     vr28
    vilvl.h       vr28,     vr9,      vr8
    vilvh.h       vr29,     vr9,      vr8

    vsrari.h      vr26,     vr26,     1     // in0low in1low
    vsrari.h      vr27,     vr27,     1     // in2low in3low
    vsrari.h      vr28,     vr28,     1     // in0high in1high
    vsrari.h      vr29,     vr29,     1     // in2high in3high

    vexth.w.h     vr18,     vr4
    vexth.w.h     vr19,     vr5
    vexth.w.h     vr6,      vr12
    vexth.w.h     vr7,      vr13
    vexth.w.h     vr8,      vr14
    vexth.w.h     vr9,      vr15
    vexth.w.h     vr10,     vr24
    vexth.w.h     vr11,     vr25

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    dct_8x4_core_lsx1 vr12, vr13, vr14, vr15

    vshuf4i.d     vr13,     vr13,     0x01
    vshuf4i.d     vr15,     vr15,     0x01

    vilvl.h       vr8,      vr13,     vr12
    vilvh.h       vr9,      vr13,     vr12
    vilvl.h       vr12,     vr9,      vr8
    vilvh.h       vr13,     vr9,      vr8
    vilvl.h       vr8,      vr15,     vr14
    vilvh.h       vr9,      vr15,     vr14
    vilvl.h       vr14,     vr9,      vr8
    vilvh.h       vr15,     vr9,      vr8

    vsrari.h      vr20,     vr12,     1
    vsrari.h      vr21,     vr13,     1
    vsrari.h      vr22,     vr14,     1
    vsrari.h      vr23,     vr15,     1

    vreplgr2vr.h  vr19,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr19,     a2,       \i
.endr
    // identity8
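    // (out = 2 * in: vsllwil.w.h widens the low halves with a 1-bit left
    // shift, vexth.w.h + vslli.w do the same for the high halves, and
    // vssrarni.h.w rounds, saturates and narrows back to 16 bits while
    // applying the final shift)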
    vsllwil.w.h   vr10,     vr26,     1
    vsllwil.w.h   vr11,     vr27,     1
    vsllwil.w.h   vr16,     vr28,     1
    vsllwil.w.h   vr17,     vr29,     1
    vsllwil.w.h   vr6,      vr20,     1
    vsllwil.w.h   vr7,      vr21,     1
    vsllwil.w.h   vr18,     vr22,     1
    vsllwil.w.h   vr19,     vr23,     1

.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
    vexth.w.h     \i,       \i
.endr

.irp i, vr26, vr27, vr28, vr29, vr20, vr21, vr22, vr23
    vslli.w       \i,       \i,       1
.endr

    vssrarni.h.w  vr16,     vr10,     4   // in0
    vssrarni.h.w  vr28,     vr26,     4   // in1
    vssrarni.h.w  vr17,     vr11,     4   // in2
    vssrarni.h.w  vr29,     vr27,     4   // in3
    vssrarni.h.w  vr18,     vr6,      4   // in4
    vssrarni.h.w  vr22,     vr20,     4   // in5
    vssrarni.h.w  vr19,     vr7,      4   // in6
    vssrarni.h.w  vr23,     vr21,     4   // in7

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr16, vr28, vr17, vr29

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr18, vr22, vr19, vr23

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    addi.d        sp,       sp,       48
endfunc

function inv_txfm_add_identity_dct_8x8_8bpc_lsx
    addi.d        sp,       sp,       -48
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25

    // identity8
    vsllwil.w.h   vr6,      vr0,      1
    vsllwil.w.h   vr7,      vr1,      1
    vsllwil.w.h   vr8,      vr2,      1
    vsllwil.w.h   vr9,      vr3,      1
    vsllwil.w.h   vr10,     vr4,      1
    vsllwil.w.h   vr11,     vr5,      1
    vsllwil.w.h   vr12,     vr24,     1
    vsllwil.w.h   vr13,     vr25,     1

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vexth.w.h     \i,       \i
.endr

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vslli.w       \i,       \i,       1
.endr
    vssrarni.h.w  vr0,      vr6,      1   // in0
    vssrarni.h.w  vr1,      vr7,      1   // in1
    vssrarni.h.w  vr2,      vr8,      1   // in2
    vssrarni.h.w  vr3,      vr9,      1   // in3
    vssrarni.h.w  vr4,      vr10,     1   // in4
    vssrarni.h.w  vr5,      vr11,     1   // in5
    vssrarni.h.w  vr24,     vr12,     1   // in6
    vssrarni.h.w  vr25,     vr13,     1   // in7

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr4, vr5, vr12, vr13, vr14, vr15, vr24, vr25, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8    // 1567
    vldrepl.w     vr21,     t0,       12   // 3784
    vldrepl.w     vr22,     t0,       0    // 2896

    // dct4 in0 in2 in4 in6
    vsllwil.w.h   vr18,     vr4,      0
    vsllwil.w.h   vr19,     vr5,      0
    vsllwil.w.h   vr6,      vr12,     0
    vsllwil.w.h   vr7,      vr13,     0
    vsllwil.w.h   vr8,      vr14,     0
    vsllwil.w.h   vr9,      vr15,     0
    vsllwil.w.h   vr10,     vr24,     0
    vsllwil.w.h   vr11,     vr25,     0
    dct_8x4_core_lsx1 vr16, vr17, vr26, vr27

    vexth.w.h     vr18,     vr4
    vexth.w.h     vr19,     vr5
    vexth.w.h     vr6,      vr12
    vexth.w.h     vr7,      vr13
    vexth.w.h     vr8,      vr14
    vexth.w.h     vr9,      vr15
    vexth.w.h     vr10,     vr24
    vexth.w.h     vr11,     vr25

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       8     // 1567
    vldrepl.w     vr21,     t0,       12    // 3784
    vldrepl.w     vr22,     t0,       0     // 2896
    dct_8x4_core_lsx1 vr4, vr5, vr24, vr25

    vilvl.d       vr8,      vr4,      vr16
    vilvh.d       vr9,      vr4,      vr16
    vilvh.d       vr6,      vr5,      vr17
    vilvl.d       vr7,      vr5,      vr17
    vilvl.d       vr16,     vr24,     vr26
    vilvh.d       vr17,     vr24,     vr26
    vilvh.d       vr18,     vr25,     vr27
    vilvl.d       vr19,     vr25,     vr27

.irp i, vr8, vr9, vr6, vr7, vr16, vr17, vr18, vr19
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr8, vr9, vr6, vr7

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    addi.d        sp,       sp,       48
endfunc

function inv_txfm_add_flipadst_identity_8x8_8bpc_lsx
    addi.d        sp,       sp,       -32
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr0,      0
    vsllwil.w.h   vr19,     vr1,      0
    vsllwil.w.h   vr6,      vr2,      0
    vsllwil.w.h   vr7,      vr3,      0
    vsllwil.w.h   vr8,      vr4,      0
    vsllwil.w.h   vr9,      vr5,      0
    vsllwil.w.h   vr10,     vr16,     0
    vsllwil.w.h   vr11,     vr17,     0
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h       vr20,     vr12,     vr13
    vilvl.h       vr21,     vr14,     vr15
    vilvl.w       vr24,     vr20,     vr21
    vilvh.w       vr25,     vr20,     vr21
    vilvh.h       vr20,     vr12,     vr13
    vilvh.h       vr21,     vr14,     vr15
    vilvl.w       vr26,     vr20,     vr21
    vilvh.w       vr27,     vr20,     vr21
    vshuf4i.h     vr26,     vr26,     0x1b
    vshuf4i.h     vr27,     vr27,     0x1b

    vexth.w.h     vr18,     vr0            // in0
    vexth.w.h     vr19,     vr1            // in1
    vexth.w.h     vr6,      vr2            // in2
    vexth.w.h     vr7,      vr3            // in3
    vexth.w.h     vr8,      vr4            // in4
    vexth.w.h     vr9,      vr5            // in5
    vexth.w.h     vr10,     vr16           // in6
    vexth.w.h     vr11,     vr17           // in7
    adst8x8_1d_lsx vr12, vr13, vr14, vr15

    vilvl.h       vr20,     vr12,     vr13
    vilvl.h       vr21,     vr14,     vr15
    vilvl.w       vr16,     vr20,     vr21
    vilvh.w       vr17,     vr20,     vr21
    vilvh.h       vr20,     vr12,     vr13
    vilvh.h       vr21,     vr14,     vr15
    vilvl.w       vr18,     vr20,     vr21
    vilvh.w       vr19,     vr20,     vr21
    vshuf4i.h     vr18,     vr18,     0x1b
    vshuf4i.h     vr19,     vr19,     0x1b

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
    vsrari.h      \i,       \i,       1
.endr

    // identity8
    vsllwil.w.h   vr20,     vr24,     1
    vsllwil.w.h   vr21,     vr25,     1
    vsllwil.w.h   vr12,     vr26,     1
    vsllwil.w.h   vr13,     vr27,     1
    vsllwil.w.h   vr22,     vr16,     1
    vsllwil.w.h   vr23,     vr17,     1
    vsllwil.w.h   vr14,     vr18,     1
    vsllwil.w.h   vr15,     vr19,     1

.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
    vexth.w.h     \i,       \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr16, vr17, vr18, vr19
    vslli.w       \i,       \i,       1
.endr

    vssrarni.h.w  vr20,     vr12,     4   // in0
    vssrarni.h.w  vr24,     vr26,     4   // in1
    vssrarni.h.w  vr21,     vr13,     4   // in2
    vssrarni.h.w  vr25,     vr27,     4   // in3
    vssrarni.h.w  vr22,     vr14,     4   // in4
    vssrarni.h.w  vr16,     vr18,     4   // in5
    vssrarni.h.w  vr23,     vr15,     4   // in6
    vssrarni.h.w  vr17,     vr19,     4   // in7

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr20, vr24, vr21, vr25

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr22, vr16, vr23, vr17

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    addi.d        sp,       sp,       32
endfunc

function inv_txfm_add_identity_flipadst_8x8_8bpc_lsx
    addi.d        sp,       sp,       -48
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25

    // identity8
    vsllwil.w.h   vr6,      vr0,      1
    vsllwil.w.h   vr7,      vr1,      1
    vsllwil.w.h   vr8,      vr2,      1
    vsllwil.w.h   vr9,      vr3,      1
    vsllwil.w.h   vr10,     vr4,      1
    vsllwil.w.h   vr11,     vr5,      1
    vsllwil.w.h   vr12,     vr24,     1
    vsllwil.w.h   vr13,     vr25,     1

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vexth.w.h     \i,       \i
.endr

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vslli.w       \i,       \i,       1
.endr

    vssrarni.h.w  vr0,      vr6,      1   // in0
    vssrarni.h.w  vr1,      vr7,      1   // in1
    vssrarni.h.w  vr2,      vr8,      1   // in2
    vssrarni.h.w  vr3,      vr9,      1   // in3
    vssrarni.h.w  vr4,      vr10,     1   // in4
    vssrarni.h.w  vr5,      vr11,     1   // in5
    vssrarni.h.w  vr24,     vr12,     1   // in6
    vssrarni.h.w  vr25,     vr13,     1   // in7

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

    vsllwil.w.h   vr18,     vr0,      0    // in0
    vsllwil.w.h   vr19,     vr1,      0    // in1
    vsllwil.w.h   vr6,      vr2,      0    // in2
    vsllwil.w.h   vr7,      vr3,      0    // in3
    vsllwil.w.h   vr8,      vr4,      0    // in4
    vsllwil.w.h   vr9,      vr5,      0    // in5
    vsllwil.w.h   vr10,     vr24,     0    // in6
    vsllwil.w.h   vr11,     vr25,     0    // in7
    adst8x8_1d_lsx vr26, vr27, vr28, vr29

    vexth.w.h     vr18,     vr0            // in0
    vexth.w.h     vr19,     vr1            // in1
    vexth.w.h     vr6,      vr2            // in2
    vexth.w.h     vr7,      vr3            // in3
    vexth.w.h     vr8,      vr4            // in4
    vexth.w.h     vr9,      vr5            // in5
    vexth.w.h     vr10,     vr24           // in6
    vexth.w.h     vr11,     vr25           // in7
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvh.d       vr4,      vr0,      vr26
    vilvh.d       vr5,      vr1,      vr27
    vilvh.d       vr6,      vr2,      vr28
    vilvh.d       vr7,      vr3,      vr29
    vilvl.d       vr14,     vr3,      vr29
    vilvl.d       vr15,     vr2,      vr28
    vilvl.d       vr16,     vr1,      vr27
    vilvl.d       vr17,     vr0,      vr26

.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    addi.d        sp,       sp,       48
endfunc

function inv_txfm_add_adst_identity_8x8_8bpc_lsx
    addi.d        sp,       sp,       -32
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr16, vr17

    vsllwil.w.h   vr18,     vr0,      0
    vsllwil.w.h   vr19,     vr1,      0
    vsllwil.w.h   vr6,      vr2,      0
    vsllwil.w.h   vr7,      vr3,      0
    vsllwil.w.h   vr8,      vr4,      0
    vsllwil.w.h   vr9,      vr5,      0
    vsllwil.w.h   vr10,     vr16,     0
    vsllwil.w.h   vr11,     vr17,     0
    adst8x8_1d_lsx vr24, vr25, vr26, vr27

    vexth.w.h     vr18,     vr0
    vexth.w.h     vr19,     vr1
    vexth.w.h     vr6,      vr2
    vexth.w.h     vr7,      vr3
    vexth.w.h     vr8,      vr4
    vexth.w.h     vr9,      vr5
    vexth.w.h     vr10,     vr16
    vexth.w.h     vr11,     vr17
    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

.irp i, vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3
    vsrari.h      \i,       \i,       1
.endr

    LSX_TRANSPOSE8x8_H vr24, vr25, vr26, vr27, vr0, vr1, vr2, vr3, \
                       vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr16, vr17

    vshuf4i.h     vr26,     vr26,     0x1b
    vshuf4i.h     vr27,     vr27,     0x1b
    vshuf4i.h     vr22,     vr22,     0x1b
    vshuf4i.h     vr23,     vr23,     0x1b

    // identity8
    vsllwil.w.h   vr16,     vr24,     1
    vsllwil.w.h   vr17,     vr25,     1
    vsllwil.w.h   vr10,     vr20,     1
    vsllwil.w.h   vr11,     vr21,     1
    vsllwil.w.h   vr18,     vr26,     1
    vsllwil.w.h   vr19,     vr27,     1
    vsllwil.w.h   vr14,     vr22,     1
    vsllwil.w.h   vr15,     vr23,     1

.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23
    vexth.w.h     \i,       \i
.endr

.irp i, vr24, vr25, vr20, vr21, vr26, vr27, vr22, vr23
    vslli.w       \i,       \i,       1
.endr

    vssrarni.h.w  vr18,     vr16,     4    // in0
    vssrarni.h.w  vr19,     vr17,     4    // in1
    vssrarni.h.w  vr14,     vr10,     4    // in2
    vssrarni.h.w  vr15,     vr11,     4    // in3
    vssrarni.h.w  vr26,     vr24,     4    // in4
    vssrarni.h.w  vr27,     vr25,     4    // in5
    vssrarni.h.w  vr22,     vr20,     4    // in6
    vssrarni.h.w  vr23,     vr21,     4    // in7

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr18, vr19, vr14, vr15

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr26, vr27, vr22, vr23

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    addi.d        sp,       sp,       32
endfunc

function inv_txfm_add_identity_adst_8x8_8bpc_lsx
    addi.d        sp,       sp,       -48
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25

    // identity8
    vsllwil.w.h   vr6,      vr0,      1
    vsllwil.w.h   vr7,      vr1,      1
    vsllwil.w.h   vr8,      vr2,      1
    vsllwil.w.h   vr9,      vr3,      1
    vsllwil.w.h   vr10,     vr4,      1
    vsllwil.w.h   vr11,     vr5,      1
    vsllwil.w.h   vr12,     vr24,     1
    vsllwil.w.h   vr13,     vr25,     1

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vexth.w.h     \i,       \i
.endr

.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25
    vslli.w       \i,       \i,       1
.endr

    vssrarni.h.w  vr0,      vr6,      1   // in0
    vssrarni.h.w  vr1,      vr7,      1   // in1
    vssrarni.h.w  vr2,      vr8,      1   // in2
    vssrarni.h.w  vr3,      vr9,      1   // in3
    vssrarni.h.w  vr4,      vr10,     1   // in4
    vssrarni.h.w  vr5,      vr11,     1   // in5
    vssrarni.h.w  vr24,     vr12,     1   // in6
    vssrarni.h.w  vr25,     vr13,     1   // in7

    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr24, vr25, \
                       vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13

    vreplgr2vr.h  vr23,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

    vsllwil.w.h   vr18,     vr0,      0
    vsllwil.w.h   vr19,     vr1,      0
    vsllwil.w.h   vr6,      vr2,      0
    vsllwil.w.h   vr7,      vr3,      0
    vsllwil.w.h   vr8,      vr4,      0
    vsllwil.w.h   vr9,      vr5,      0
    vsllwil.w.h   vr10,     vr24,     0
    vsllwil.w.h   vr11,     vr25,     0
    adst8x8_1d_lsx vr26, vr27, vr28, vr29

    vexth.w.h     vr18,     vr0
    vexth.w.h     vr19,     vr1
    vexth.w.h     vr6,      vr2
    vexth.w.h     vr7,      vr3
    vexth.w.h     vr8,      vr4
    vexth.w.h     vr9,      vr5
    vexth.w.h     vr10,     vr24
    vexth.w.h     vr11,     vr25

    adst8x8_1d_lsx vr0, vr1, vr2, vr3

    vilvl.d       vr4,      vr0,      vr26  // 0 ... 7
    vilvl.d       vr5,      vr1,      vr27  // 8 ... 15
    vilvl.d       vr6,      vr2,      vr28  // 16 ... 23
    vilvl.d       vr7,      vr3,      vr29  // 24 ... 31
    vilvh.d       vr14,     vr3,      vr29  // 32 ... 39
    vilvh.d       vr15,     vr2,      vr28  // 40 ... 47
    vilvh.d       vr16,     vr1,      vr27  // 48 ... 55
    vilvh.d       vr17,     vr0,      vr26  // 56 ... 63

.irp i, vr4, vr5, vr6, vr7, vr14, vr15, vr16, vr17
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr14, vr15, vr16, vr17

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    addi.d        sp,       sp,       48
endfunc

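/*
 * Widening multiply helpers: both macros split their 8 x i16 inputs into
 * low/high 4 x i32 halves (vsllwil.w.h / vexth.w.h, clobbering vr22 and
 * vr23) and accumulate in 32-bit precision. A rough per-lane C sketch
 * (the .lo/.hi field names are illustrative only):
 *
 *     // vmul_vmadd_w: out = in0 * in2 + in1 * in3
 *     // vmul_vmsub_w: out = in0 * in2 - in1 * in3
 *     for (int i = 0; i < 4; i++) {
 *         out0[i] = (int32_t)in0.lo[i] * in2[i] + (int32_t)in1.lo[i] * in3[i];
 *         out1[i] = (int32_t)in0.hi[i] * in2[i] + (int32_t)in1.hi[i] * in3[i];
 *     }
 *
 * Callers then use vssrarni.h.w with a 12-bit rounding shift to narrow,
 * matching the Q12 fixed-point constants stored in idct_coeffs.
 */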
.macro vmul_vmadd_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h   vr22,     \in0,     0
    vexth.w.h     vr23,     \in0
    vmul.w        \out0,    vr22,     \in2
    vmul.w        \out1,    vr23,     \in2
    vsllwil.w.h   vr22,     \in1,     0
    vexth.w.h     vr23,     \in1
    vmadd.w       \out0,    vr22,     \in3
    vmadd.w       \out1,    vr23,     \in3
.endm

.macro vmul_vmsub_w in0, in1, in2, in3, out0, out1
    vsllwil.w.h   vr22,     \in0,     0
    vexth.w.h     vr23,     \in0
    vmul.w        \out0,    vr22,     \in2
    vmul.w        \out1,    vr23,     \in2
    vsllwil.w.h   vr22,     \in1,     0
    vexth.w.h     vr23,     \in1
    vmsub.w       \out0,    vr22,     \in3
    vmsub.w       \out1,    vr23,     \in3
.endm

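/*
 * rect2_lsx applies the extra 1/sqrt(2) scaling that rectangular
 * transforms need: both halves of \in0 are widened, multiplied by the
 * replicated constant in \in1 (2896, i.e. sqrt(2)/2 in Q12), and
 * vssrarni.h.w rounds the products back down by 12 bits with saturation.
 */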
.macro rect2_lsx in0, in1, out0
    vsllwil.w.h   vr22,     \in0,     0     // low half of \in0
    vexth.w.h     \in0,     \in0            // high half of \in0
    vmul.w        vr22,     vr22,     \in1
    vmul.w        \out0,    \in0,     \in1
    vssrarni.h.w  \out0,    vr22,     12
.endm

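/*
 * One 1-D 8-point inverse DCT over eight coefficient vectors. The even
 * inputs (\in0, \in2, \in4, \in6) pass through the 4-point even butterfly
 * built on 2896 (1/sqrt(2)) and the 1567/3784 rotation; the odd inputs
 * are rotated by 799/4017 and 3406/2276 into t4a..t7a and refined, and
 * the final butterflies produce c[0]..c[7] in \out0..\out7. All constants
 * are Q12 values from idct_coeffs; passing rect2_lsx as \rect2 enables
 * the 1/sqrt(2) pre-scaling used by rectangular blocks.
 */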
.macro dct_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                        out1, out2, out3, out4, out5, out6, out7, rect2

    la.local      t0,       idct_coeffs

.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      t0,       0        // 2896
.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    rect2_lsx \i, vr23, \i
.endr
.endif
    vldrepl.w     vr20,      t0,       8        // 1567
    vldrepl.w     vr21,      t0,       12       // 3784

    vmul_vmadd_w  \in2, \in6, vr21, vr20, vr8, vr9
    vssrarni.h.w  vr9,       vr8,      12       // t3
    vmul_vmsub_w  \in2, \in6, vr20, vr21, vr8, vr10
    vssrarni.h.w  vr10,      vr8,      12       // t2

    vldrepl.w     vr20,      t0,       0        // 2896
    vmul_vmadd_w  \in0, \in4, vr20, vr20, vr8, \in2
    vssrarni.h.w  \in2,      vr8,      12       // t0
    vmul_vmsub_w  \in0, \in4, vr20, vr20, vr8, \in6
    vssrarni.h.w  \in6,      vr8,      12       // t1

    vsadd.h       vr8,       \in2,     vr9      // c[0]
    vssub.h       vr9,       \in2,     vr9      // c[3]
    vsadd.h       \in0,      \in6,     vr10     // c[1]
    vssub.h       vr10,      \in6,     vr10     // c[2]

    vldrepl.w     vr20,     t0,        16       // 799
    vldrepl.w     vr21,     t0,        20       // 4017
    vmul_vmadd_w  \in1, \in7, vr21, vr20, \in2, \in4
    vssrarni.h.w  \in4,     \in2,      12       // t7a
    vmul_vmsub_w  \in1, \in7, vr20, vr21, \in2, \in6
    vssrarni.h.w  \in6,     \in2,      12       // t4a

    vldrepl.w     vr20,     t0,        24       // 3406
    vldrepl.w     vr21,     t0,        28       // 2276
    vmul_vmadd_w  \in5, \in3, vr21, vr20, \in2, \in1
    vssrarni.h.w  \in1,     \in2,      12       // t6a
    vmul_vmsub_w  \in5, \in3, vr20, vr21, \in2, \in7
    vssrarni.h.w  \in7,     \in2,      12       // t5a

    vsadd.h       \in3,     \in6,      \in7     // t4
    vssub.h       \in6,     \in6,      \in7     // t5a
    vsadd.h       \in5,     \in4,      \in1     // t7
    vssub.h       \in4,     \in4,      \in1     // t6a

    vldrepl.w     vr20,     t0,        0        // 2896
    vmul_vmadd_w  \in4, \in6, vr20, vr20, \in2, \in1
    vssrarni.h.w  \in1,     \in2,      12       // t6
    vmul_vmsub_w  \in4, \in6, vr20, vr20, \in2, \in7
    vssrarni.h.w  \in7,     \in2,      12       // t5

    vsadd.h       \out0,    vr8,       \in5     // c[0]
    vssub.h       \out7,    vr8,       \in5     // c[7]
    vsadd.h       \out1,    \in0,      \in1     // c[1]
    vssub.h       \out6,    \in0,      \in1     // c[6]
    vsadd.h       \out2,    vr10,      \in7     // c[2]
    vssub.h       \out5,    vr10,      \in7     // c[5]
    vsadd.h       \out3,    vr9,       \in3     // c[3]
    vssub.h       \out4,    vr9,       \in3     // c[4]
.endm

function inv_txfm_add_dct_dct_8x8_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_8x8

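    // DC-only path (eob in a3 is zero): the 2-D transform of a DC-only
    // block is one constant, so each 1-D pass reduces to dc * 181 with a
    // rounded 8-bit shift (181/256 ~= 1/sqrt(2)). The first pass and the
    // 1-bit intermediate shift are done explicitly; the second pass is
    // folded with the final 4-bit shift into a single vssrarni.h.w by 12
    // (the +128 rounding bias is preloaded into vr5), and the splatted
    // result is added to all 64 destination pixels.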
    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr5,      0x880            // 128
    vmul.w        vr2,      vr0,      vr1    // dc * 181
    st.h          zero,     a2,       0
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    vld           vr10,     a0,       0      // 0 1 2 3 4 5 6 7
    vsrari.w      vr2,      vr2,      1      // (dc + rnd) >> shift
    vldx          vr11,     a0,       a1     // 8 9 10 11 12 13 14 15
    alsl.d        t2,       a1,       a0,    1
    vmadd.w       vr5,      vr2,      vr0
    vld           vr12,     t2,       0      // 16 17 18 19 20 21 22 23
    vssrarni.h.w  vr5,      vr5,      12
    vldx          vr13,     t2,       a1     // 24 25 26 27 28 29 30 31

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5

    b             .DCT_DCT_8X8_END

.NO_HAS_DCONLY_8x8:

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local      t0,       idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h      \i,       \i,       1
.endr

    vreplgr2vr.h  vr23,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112
    vst           vr23,     a2,       \i
.endr

    dct_8x8_core_lsx vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                     vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23, no_rect2

.irp i, vr4, vr5, vr6, vr7, vr20, vr21, vr22, vr23
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23

.DCT_DCT_8X8_END:

endfunc

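/*
 * One 1-D 16-point inverse DCT. The even-indexed inputs reuse
 * dct_8x8_core_lsx for the even half; the odd-indexed inputs are rotated
 * with the Q12 pairs 401/4076, 3166/2598, 1931/3612 and 3920/1189 into
 * t8a..t15a, refined with 1567/3784 and 2896, and the closing butterflies
 * leave c[0]..c[15] in vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16,
 * vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24 respectively.
 */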
.macro dct_8x16_core_lsx
    dct_8x8_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       32        // 401
    vldrepl.w     vr21,     t0,       36        // 4076
    vmul_vmadd_w vr1, vr30, vr21, vr20, vr0, vr10
    vssrarni.h.w  vr10,     vr0,      12        // t15a
    vmul_vmsub_w vr1, vr30, vr20, vr21, vr0, vr29
    vssrarni.h.w  vr29,     vr0,      12        // t8a

    vldrepl.w     vr20,     t0,       40        // 3166 -> 1583
    vldrepl.w     vr21,     t0,       44        // 2598 -> 1299
    vmul_vmadd_w vr24, vr7, vr21, vr20, vr0, vr30
    vssrarni.h.w  vr30,     vr0,      12        // t14a
    vmul_vmsub_w vr24, vr7, vr20, vr21, vr0, vr31
    vssrarni.h.w  vr31,     vr0,      12        // t9a

    vldrepl.w     vr20,     t0,       48        // 1931
    vldrepl.w     vr21,     t0,       52        // 3612
    vmul_vmadd_w vr5, vr26, vr21, vr20, vr0, vr24
    vssrarni.h.w  vr24,     vr0,      12        // t13a
    vmul_vmsub_w vr5, vr26, vr20, vr21, vr0, vr25
    vssrarni.h.w  vr25,     vr0,      12        // t10a

    vldrepl.w     vr20,     t0,       56        // 3920
    vldrepl.w     vr21,     t0,       60        // 1189
    vmul_vmadd_w vr28, vr3, vr21, vr20, vr0, vr26
    vssrarni.h.w  vr26,     vr0,      12        // t12a
    vmul_vmsub_w vr28, vr3, vr20, vr21, vr0, vr27
    vssrarni.h.w  vr27,     vr0,      12        // t11a

    // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
    vsadd.h       vr28,     vr29,      vr31     // t8
    vssub.h       vr19,     vr29,      vr31     // t9
    vssub.h       vr29,     vr27,      vr25     // t10
    vsadd.h       vr9,      vr27,      vr25     // t11
    vsadd.h       vr31,     vr26,      vr24     // t12
    vssub.h       vr25,     vr26,      vr24     // t13
    vssub.h       vr27,     vr10,      vr30     // t14
    vsadd.h       vr24,     vr10,      vr30     // t15

    vldrepl.w     vr20,     t0,       8         // 1567
    vldrepl.w     vr21,     t0,       12        // 3784
    vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
    vssrarni.h.w  vr26,     vr0,       12       // t14a
    vmul_vmsub_w vr27, vr19, vr20, vr21, vr0, vr30
    vssrarni.h.w  vr30,     vr0,       12       // t9a

    vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
    vneg.w        vr0,      vr0
    vneg.w        vr19,     vr19
    vssrarni.h.w  vr19,     vr0,       12       // t10a
    vmul_vmsub_w vr25, vr29, vr20, vr21, vr0, vr27
    vssrarni.h.w  vr27,     vr0,       12       // t13a

    vsadd.h       vr25,     vr28,     vr9       // t8a
    vssub.h       vr29,     vr28,     vr9       // t11a
    vssub.h       vr28,     vr24,     vr31      // t12a
    vsadd.h       vr10,     vr24,     vr31      // t15a
    vsadd.h       vr9,      vr30,     vr19      // t9
    vssub.h       vr31,     vr30,     vr19      // t10
    vssub.h       vr30,     vr26,     vr27      // t13
    vsadd.h       vr24,     vr26,     vr27      // t14

    vldrepl.w     vr20,     t0,       0         // 2896
    vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
    vssrarni.h.w  vr26,     vr0,      12        // t13a
    vmul_vmsub_w vr30, vr31, vr20, vr20, vr0, vr27
    vssrarni.h.w  vr27,     vr0,      12        // t10a

    vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
    vssrarni.h.w  vr31,     vr0,      12        // t12
    vmul_vmsub_w vr28, vr29, vr20, vr20, vr0, vr30
    vssrarni.h.w  vr30,     vr0,      12        // t11

    // vr11 vr12 ... vr18
    vsadd.h       vr28,     vr14,     vr31      // c[3]
    vssub.h       vr29,     vr14,     vr31      // c[12]
    vsadd.h       vr20,     vr15,     vr30      // c[4]
    vssub.h       vr21,     vr15,     vr30      // c[11]
    vsadd.h       vr14,     vr16,     vr27      // c[5]
    vssub.h       vr23,     vr16,     vr27      // c[10]
    vsadd.h       vr15,     vr17,     vr9       // c[6]
    vssub.h       vr30,     vr17,     vr9       // c[9]
    vsadd.h       vr16,     vr18,     vr25      // c[7]
    vssub.h       vr27,     vr18,     vr25      // c[8]
    vsadd.h       vr17,     vr13,     vr26      // c[2]
    vssub.h       vr26,     vr13,     vr26      // c[13]
    vsadd.h       vr18,     vr12,     vr24      // c[1]
    vssub.h       vr25,     vr12,     vr24      // c[14]
    vsadd.h       vr22,     vr11,     vr10      // c[0]
    vssub.h       vr24,     vr11,     vr10      // c[15]
.endm

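// dc-only fast path: when eob (a3) is zero only coeff[0] is non-zero, so
// both 1-D passes collapse to a DC gain. 181/256 ~= 1/sqrt(2); dc is scaled
// by 181 with rounding shifts three times in all (the rect2 8x16
// normalization plus the two passes) and then broadcast-added to the whole
// 8x16 destination block.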
function inv_txfm_add_dct_dct_8x16_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_8x16

    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr5,      0x880            // 128
    vmul.w        vr2,      vr0,      vr1    // dc * 181
    st.h          zero,     a2,       0
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    vld           vr10,     a0,       0      // 0 1 2 3 4 5 6 7
    vmul.w        vr2,      vr0,      vr2
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    vsrari.w      vr2,      vr2,      1      // (dc + rnd) >> shift
    vldx          vr11,     a0,       a1     // 8 9 10 11 12 13 14 15
    alsl.d        t2,       a1,       a0,    1
    vmadd.w       vr5,      vr2,      vr0
    vld           vr12,     t2,       0      // 16 17 18 19 20 21 22 23
    vssrarni.h.w  vr5,      vr5,      12
    vldx          vr13,     t2,       a1     // 24 25 26 27 28 29 30 31

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5

    b             .DCT_DCT_8X16_END

.NO_HAS_DCONLY_8x16:
    addi.d        sp,       sp,       -64
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40
    fst.d         f30,      sp,       48
    fst.d         f31,      sp,       56

    vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local      t0,       idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx

    vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    vsrari.h      \i,       \i,       1
.endr

    vreplgr2vr.h  vr23,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst           vr23,     a2,       \i
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31

    LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31

    dct_8x16_core_lsx

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h     \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr22, vr18, vr17, vr28

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr20, vr14, vr15, vr16

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr27, vr30, vr23, vr21

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr29, vr26, vr25, vr24

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    fld.d         f30,      sp,       48
    fld.d         f31,      sp,       56
    addi.d        sp,       sp,       64
.DCT_DCT_8X16_END:
endfunc

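// identity_8x8_core_lsx: the 8-point identity transform scales each
// coefficient by 2. The doubling (vsllwil.w.h/vslli.w by 1, done in 32 bits
// to avoid overflow) is folded together with the inter-pass rounding shift
// by the saturating narrow vssrarni.h.w ..., 1.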
.macro identity_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, rect2

    la.local      t0,       idct_coeffs

.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      t0,       0       // 2896
.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    rect2_lsx \i, vr23, \i
.endr
.endif
    vsllwil.w.h   vr8,      \in0,     1
    vsllwil.w.h   vr9,      \in1,     1
    vsllwil.w.h   vr10,     \in2,     1
    vsllwil.w.h   vr11,     \in3,     1
    vsllwil.w.h   vr12,     \in4,     1
    vsllwil.w.h   vr13,     \in5,     1
    vsllwil.w.h   vr14,     \in6,     1
    vsllwil.w.h   vr15,     \in7,     1

.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    vexth.w.h     \i,       \i
.endr

.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    vslli.w       \i,       \i,       1
.endr

    vssrarni.h.w  \in0,     vr8,      1
    vssrarni.h.w  \in1,     vr9,      1
    vssrarni.h.w  \in2,     vr10,     1
    vssrarni.h.w  \in3,     vr11,     1
    vssrarni.h.w  \in4,     vr12,     1
    vssrarni.h.w  \in5,     vr13,     1
    vssrarni.h.w  \in6,     vr14,     1
    vssrarni.h.w  \in7,     vr15,     1
.endm

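// identity_8x16_core_lsx: the 16-point identity transform scales by
// 2*sqrt(2), computed as 2*x + ((x * 1697 + 1024) >> 11) since
// 2 + 1697/2048 ~= 2.8286 ~= 2*sqrt(2). vr20 must hold 1697 (the caller
// below broadcasts it with vreplgr2vr.w).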
.macro identity_8x16_core_lsx in0, out0
    vsadd.h       vr10,     \in0,     \in0
    vsllwil.w.h   vr8,      \in0,     0
    vexth.w.h     \out0,    \in0
    vmul.w        vr8,      vr8,      vr20
    vmul.w        \out0,    \out0,    vr20
    vssrarni.h.w  \out0,    vr8,      11
    vsadd.h       \out0,    \out0,    vr10
.endm

function inv_txfm_add_identity_identity_8x16_8bpc_lsx
    addi.d        sp,       sp,       -64
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40
    fst.d         f30,      sp,       48
    fst.d         f31,      sp,       56

    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    identity_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, rect2_lsx

    vld_x8 a2, 128, 16, vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27

    identity_8x8_core_lsx vr16, vr17, vr18, vr19, vr24, vr25, vr26, vr27, rect2_lsx

    vreplgr2vr.h  vr23,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst           vr23,     a2,       \i
.endr

    LSX_TRANSPOSE8x8_H vr0, vr2, vr4, vr6, vr16, vr18, vr24, vr26, \
                       vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21

    LSX_TRANSPOSE8x8_H vr1, vr3, vr5, vr7, vr17, vr19, vr25, vr27, \
                       vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr20, vr21

    li.w          t0,       1697
    vreplgr2vr.w  vr20,     t0

.irp i, vr14, vr15, vr22, vr23, vr16, vr18, vr24, vr26, \
        vr28, vr29, vr30, vr31, vr17, vr19, vr25, vr27
    identity_8x16_core_lsx \i, \i
    vsrari.h      \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr14, vr15, vr22, vr23

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr16, vr18, vr24, vr26

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr28, vr29, vr30, vr31

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr17, vr19, vr25, vr27

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    fld.d         f30,      sp,       48
    fld.d         f31,      sp,       56
    addi.d        sp,       sp,       64
endfunc

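// adst_8x8_core_lsx: 8-point ADST. The iadst8_coeffs pairs are 4096-scaled
// cos/sin((4k+1)*pi/32) values (e.g. 4076/401 for k = 0), and the
// vneg.w + vssrarni.h.w ..., 0 sequences apply the alternating sign that
// the inverse ADST requires on out[1], out[3], out[5] and out[7].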
.macro adst_8x8_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                         out2, out3, out4, out5, out6, out7, rect2

    la.local      t0,       iadst8_coeffs

.ifc \rect2, rect2_lsx
    vldrepl.w     vr23,      t0,       32       // 2896
.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    rect2_lsx \i, vr23, \i
.endr
.endif

    vldrepl.w     vr20,     t0,       0         // 4076
    vldrepl.w     vr21,     t0,       4         // 401

    vmul_vmadd_w vr7, vr0, vr20, vr21, vr8, vr9
    vssrarni.h.w  vr9,      vr8,      12        // t0a low
    vmul_vmsub_w vr7, vr0, vr21, vr20, vr8, vr10
    vssrarni.h.w  vr10,     vr8,      12        // t1a low

    vldrepl.w     vr20,     t0,       8         // 3612
    vldrepl.w     vr21,     t0,       12        // 1931
    vmul_vmadd_w vr5, vr2, vr20, vr21, vr8, vr0
    vssrarni.h.w  vr0,      vr8,      12        // t2a low
    vmul_vmsub_w vr5, vr2, vr21, vr20, vr8, vr7
    vssrarni.h.w  vr7,      vr8,      12        // t3a low

    vldrepl.w     vr20,     t0,       16        // 2598 -> 1299
    vldrepl.w     vr21,     t0,       20        // 3166 -> 1583
    vmul_vmadd_w vr3, vr4, vr20, vr21, vr8, vr2
    vssrarni.h.w  vr2,      vr8,      12        // t4a low
    vmul_vmsub_w vr3, vr4, vr21, vr20, vr8, vr5
    vssrarni.h.w  vr5,      vr8,      12        // t5a low

    vldrepl.w     vr20,     t0,       24        // 1189
    vldrepl.w     vr21,     t0,       28        // 3920
    vmul_vmadd_w vr1, vr6, vr20, vr21, vr8, vr3
    vssrarni.h.w  vr3,      vr8,      12        // t6a low
    vmul_vmsub_w vr1, vr6, vr21, vr20, vr8, vr4
    vssrarni.h.w  vr4,      vr8,      12        // t7a low

    vsadd.h       vr1,      vr9,      vr2       // t0
    vssub.h       vr6,      vr9,      vr2       // t4
    vsadd.h       vr8,      vr10,     vr5       // t1
    vssub.h       vr2,      vr10,     vr5       // t5
    vsadd.h       vr9,      vr0,      vr3       // t2
    vssub.h       vr5,      vr0,      vr3       // t6
    vsadd.h       vr10,     vr7,      vr4       // t3
    vssub.h       vr0,      vr7,      vr4       // t7

    vldrepl.w     vr20,     t0,       40        // 1567
    vldrepl.w     vr21,     t0,       44        // 3784
    vmul_vmadd_w vr6, vr2, vr21, vr20, vr3, vr4
    vssrarni.h.w  vr4,      vr3,      12        // t4a low
    vmul_vmsub_w vr6, vr2, vr20, vr21, vr3, vr7
    vssrarni.h.w  vr7,      vr3,      12        // t5a low

    vmul_vmadd_w vr0, vr5, vr20, vr21, vr3, vr2
    vssrarni.h.w  vr2,      vr3,      12        // t7a low
    vmul_vmsub_w vr0, vr5, vr21, vr20, vr3, vr6
    vssrarni.h.w  vr6,      vr3,      12        // t6a low

    vsadd.h       \out0,    vr1,      vr9       // out[0]
    vssub.h       vr5,      vr1,      vr9       // t2
    vsadd.h       vr3,      vr8,      vr10      // out[7]
    vssub.h       vr1,      vr8,      vr10      // t3
    vexth.w.h     vr9,      vr3
    vsllwil.w.h   vr21,     vr3,      0
    vneg.w        \out7,    vr9
    vneg.w        vr21,     vr21
    vssrarni.h.w  \out7,    vr21,     0         // out[7]

    vsadd.h       vr8,      vr4,      vr6       // out[1]
    vssub.h       vr10,     vr4,      vr6       // t6
    vexth.w.h     vr20,     vr8
    vsllwil.w.h   vr21,     vr8,      0
    vneg.w        \out1,    vr20
    vneg.w        vr21,     vr21
    vssrarni.h.w  \out1,    vr21,     0         // out[1]
    vsadd.h       \out6,    vr7,      vr2       // out[6]
    vssub.h       vr4,      vr7,      vr2       // t7

    vldrepl.w     vr20,     t0,       32        // 2896
    vmul_vmadd_w vr5, vr1, vr20, vr20, vr9, vr6
    vssrarni.h.w  vr6,      vr9,      12        // out[3]
    vmul_vmsub_w vr5, vr1, vr20, vr20, vr9, \out4
    vssrarni.h.w  \out4,    vr9,      12        // out[4]

    vmul_vmadd_w vr10, vr4, vr20, vr20, vr9, \out2
    vssrarni.h.w  \out2,    vr9,      12        // out[2]
    vmul_vmsub_w vr10, vr4, vr20, vr20, vr9, vr5
    vssrarni.h.w  vr5,      vr9,      12        // out[5]

    vexth.w.h     vr20,     vr6
    vsllwil.w.h   vr21,     vr6,      0
    vneg.w        \out3,    vr20
    vneg.w        vr21,     vr21
    vssrarni.h.w  \out3,    vr21,     0         // out[3]

    vexth.w.h     vr20,     vr5
    vsllwil.w.h   vr21,     vr5,      0
    vneg.w        \out5,    vr20
    vneg.w        vr21,     vr21
    vssrarni.h.w  \out5,    vr21,     0         // out[5]
.endm

function inv_txfm_add_adst_dct_8x16_8bpc_lsx
    addi.d        sp,       sp,       -64
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40
    fst.d         f30,      sp,       48
    fst.d         f31,      sp,       56

    vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                      vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx

    vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    adst_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                      vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    vsrari.h     \i,       \i,       1
.endr

    vreplgr2vr.h  vr23,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst           vr23,     a2,       \i
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31

    LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31

    // the lines that follow were a verbatim expansion of the shared
    // 16-point core; invoke the macro instead
    dct_8x16_core_lsx

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h     \i,       \i,       4
.endr

    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr22, vr18, vr17, vr28

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr20, vr14, vr15, vr16

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr27, vr30, vr23, vr21

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1

    VLD_DST_ADD_W8 vr29, vr26, vr25, vr24

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    fld.d         f30,      sp,       48
    fld.d         f31,      sp,       56
    addi.d        sp,       sp,       64
endfunc

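// 4096-scaled cos/sin((4k+1)*pi/64) pairs for the 16-point ADST,
// e.g. 4091 ~= 4096*cos(pi/64) and 201 ~= 4096*sin(pi/64).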
const iadst16_coeffs, align=4
    .word         4091, 201, 3973, 995
    .word         3703, 1751, 3290, 2440
    .word         2751, 3035, 2106, 3513
    .word         1380, 3857, 601, 4052
endconst

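// adst16_core_lsx: one 8-column pass of the 16-point ADST over vr0..vr15.
// The three optional arguments make the pass reusable: \transpose8x8
// re-tiles the two 8x8 halves, \shift applies the inter-pass rounding
// shift, and \vst spills the 16 output rows to the scratch buffer at t1.
// The output register mapping is listed at the end of the macro.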
.macro adst16_core_lsx transpose8x8, shift, vst
    la.local      t0,       iadst16_coeffs
    vldrepl.w     vr20,     t0,        0        // 4091
    vldrepl.w     vr21,     t0,        4        // 201

    vmul_vmadd_w vr15, vr0, vr20, vr21, vr16, vr18
    vmul_vmsub_w vr15, vr0, vr21, vr20, vr17, vr19
    vssrarni.h.w  vr18,     vr16,      12       // t0
    vssrarni.h.w  vr19,     vr17,      12       // t1

    vldrepl.w     vr20,     t0,        8        // 3973
    vldrepl.w     vr21,     t0,        12       // 995
    vmul_vmadd_w vr13, vr2, vr20, vr21, vr16, vr0
    vmul_vmsub_w vr13, vr2, vr21, vr20, vr17, vr15
    vssrarni.h.w  vr0,      vr16,      12       // t2
    vssrarni.h.w  vr15,     vr17,      12       // t3

    vldrepl.w     vr20,     t0,        16       // 3703
    vldrepl.w     vr21,     t0,        20       // 1751
    vmul_vmadd_w vr11, vr4, vr20, vr21, vr16, vr2
    vmul_vmsub_w vr11, vr4, vr21, vr20, vr17, vr13
    vssrarni.h.w  vr2,      vr16,      12       // t4
    vssrarni.h.w  vr13,     vr17,      12       // t5

    vldrepl.w     vr20,     t0,        24       // 3290 -> 1645
    vldrepl.w     vr21,     t0,        28       // 2440 -> 1220
    vmul_vmadd_w vr9, vr6, vr20, vr21, vr16, vr4
    vmul_vmsub_w vr9, vr6, vr21, vr20, vr17, vr11
    vssrarni.h.w  vr4,      vr16,      12       // t6
    vssrarni.h.w  vr11,     vr17,      12       // t7

    vldrepl.w     vr20,     t0,        32       // 2751
    vldrepl.w     vr21,     t0,        36       // 3035
    vmul_vmadd_w vr7, vr8, vr20, vr21, vr16, vr6
    vmul_vmsub_w vr7, vr8, vr21, vr20, vr17, vr9
    vssrarni.h.w  vr6,      vr16,      12       // t8
    vssrarni.h.w  vr9,      vr17,      12       // t9

    vldrepl.w     vr20,     t0,        40       // 2106
    vldrepl.w     vr21,     t0,        44       // 3513
    vmul_vmadd_w vr5, vr10, vr20, vr21, vr16, vr7
    vmul_vmsub_w vr5, vr10, vr21, vr20, vr17, vr8
    vssrarni.h.w  vr7,      vr16,      12       // t10
    vssrarni.h.w  vr8,      vr17,      12       // t11

    vldrepl.w     vr20,     t0,        48       // 1380
    vldrepl.w     vr21,     t0,        52       // 3857
    vmul_vmadd_w vr3, vr12, vr20, vr21, vr16, vr5
    vmul_vmsub_w vr3, vr12, vr21, vr20, vr17, vr10
    vssrarni.h.w  vr5,      vr16,      12       // t12
    vssrarni.h.w  vr10,     vr17,      12       // t13

    vldrepl.w     vr20,     t0,        56       // 601
    vldrepl.w     vr21,     t0,        60       // 4052
    vmul_vmadd_w vr1, vr14, vr20, vr21, vr16, vr3
    vmul_vmsub_w vr1, vr14, vr21, vr20, vr17, vr12
    vssrarni.h.w  vr3,      vr16,      12       // t14
    vssrarni.h.w  vr12,     vr17,      12       // t15

    vsadd.h       vr1,      vr18,      vr6      // t0a
    vssub.h       vr14,     vr18,      vr6      // t8a
    vsadd.h       vr16,     vr19,      vr9      // t1a
    vssub.h       vr17,     vr19,      vr9      // t9a
    vsadd.h       vr6,      vr0,       vr7      // t2a
    vssub.h       vr18,     vr0,       vr7      // t10a
    vsadd.h       vr9,      vr15,      vr8      // t3a
    vssub.h       vr19,     vr15,      vr8      // t11a
    vsadd.h       vr0,      vr2,       vr5      // t4a
    vssub.h       vr7,      vr2,       vr5      // t12a
    vsadd.h       vr8,      vr13,      vr10     // t5a
    vssub.h       vr15,     vr13,      vr10     // t13a
    vsadd.h       vr2,      vr4,       vr3      // t6a
    vssub.h       vr5,      vr4,       vr3      // t14a
    vsadd.h       vr10,     vr11,      vr12     // t7a
    vssub.h       vr13,     vr11,      vr12     // t15a

    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,        16       // 799
    vldrepl.w     vr21,     t0,        20       // 4017
    vmul_vmadd_w vr14, vr17, vr21, vr20, vr3, vr11
    vmul_vmsub_w vr14, vr17, vr20, vr21, vr4, vr12
    vssrarni.h.w  vr11,     vr3,       12       // t8
    vssrarni.h.w  vr12,     vr4,       12       // t9

    vmul_vmadd_w vr15, vr7, vr20, vr21, vr3, vr14
    vmul_vmsub_w vr15, vr7, vr21, vr20, vr4, vr17
    vssrarni.h.w  vr14,     vr3,       12       // t13
    vssrarni.h.w  vr17,     vr4,       12       // t12

    vldrepl.w     vr20,     t0,        24       // 3406
    vldrepl.w     vr21,     t0,        28       // 2276
    vmul_vmadd_w vr18, vr19, vr21, vr20, vr3, vr7
    vmul_vmsub_w vr18, vr19, vr20, vr21, vr4, vr15
    vssrarni.h.w  vr7,      vr3,       12       // t10
    vssrarni.h.w  vr15,     vr4,       12       // t11

    vmul_vmadd_w vr13, vr5, vr20, vr21, vr3, vr18
    vmul_vmsub_w vr13, vr5, vr21, vr20, vr4, vr19
    vssrarni.h.w  vr18,     vr3,       12       // t15
    vssrarni.h.w  vr19,     vr4,       12       // t14

    vsadd.h       vr5,      vr1,       vr0      // t0
    vssub.h       vr13,     vr1,       vr0      // t4
    vsadd.h       vr3,      vr16,      vr8      // t1
    vssub.h       vr4,      vr16,      vr8      // t5
    vsadd.h       vr0,      vr6,       vr2      // t2
    vssub.h       vr1,      vr6,       vr2      // t6
    vsadd.h       vr8,      vr9,       vr10     // t3
    vssub.h       vr16,     vr9,       vr10     // t7
    vsadd.h       vr2,      vr11,      vr17     // t8a
    vssub.h       vr6,      vr11,      vr17     // t12a
    vsadd.h       vr9,      vr12,      vr14     // t9a
    vssub.h       vr10,     vr12,      vr14     // t13a
    vsadd.h       vr11,     vr7,       vr19     // t10a
    vssub.h       vr17,     vr7,       vr19     // t14a
    vsadd.h       vr12,     vr15,      vr18     // t11a
    vssub.h       vr14,     vr15,      vr18     // t15a

    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,        8        // 1567
    vldrepl.w     vr21,     t0,        12       // 3784
    vmul_vmadd_w vr13, vr4, vr21, vr20, vr7, vr18
    vmul_vmsub_w vr13, vr4, vr20, vr21, vr15, vr19
    vssrarni.h.w  vr18,     vr7,       12       // t4a
    vssrarni.h.w  vr19,     vr15,      12       // t5a

    vmul_vmadd_w vr16, vr1, vr20, vr21, vr7, vr4
    vmul_vmsub_w vr16, vr1, vr21, vr20, vr15, vr13
    vssrarni.h.w  vr4,      vr7,       12       // t7a
    vssrarni.h.w  vr13,     vr15,      12       // t6a

    vmul_vmadd_w vr6, vr10, vr21, vr20, vr7, vr1
    vmul_vmsub_w vr6, vr10, vr20, vr21, vr15, vr16
    vssrarni.h.w  vr1,      vr7,       12       // t12
    vssrarni.h.w  vr16,     vr15,      12       // t13

    vmul_vmadd_w vr14, vr17, vr20, vr21, vr7, vr6
    vmul_vmsub_w vr14, vr17, vr21, vr20, vr15, vr10
    vssrarni.h.w  vr6,      vr7,       12       // t15
    vssrarni.h.w  vr10,     vr15,      12       // t14

    vsadd.h       vr14,     vr5,       vr0      // out[0]
    vssub.h       vr17,     vr5,       vr0      // t2a
    vssub.h       vr7,      vr3,       vr8      // t3a
    vsadd.h       vr15,     vr3,       vr8      // out[15]
    vsllwil.w.h   vr22,     vr15,      0
    vexth.w.h     vr15,     vr15
    vneg.w        vr22,     vr22
    vneg.w        vr15,     vr15
    vssrarni.h.w  vr15,     vr22,      0        // out[15]

    vsadd.h       vr3,      vr19,      vr4      // out[12]
    vssub.h       vr8,      vr19,      vr4      // t7
    vssub.h       vr0,      vr18,      vr13     // t6
    vsadd.h       vr5,      vr18,      vr13     // out[3]
    vsllwil.w.h   vr22,     vr5,       0
    vexth.w.h     vr5,      vr5
    vneg.w        vr22,     vr22
    vneg.w        vr5,      vr5
    vssrarni.h.w  vr5,      vr22,      0        // out[3]

    vsadd.h       vr13,     vr9,       vr12     // out[14]
    vssub.h       vr19,     vr9,       vr12     // t11
    vssub.h       vr4,      vr2,       vr11     // t10
    vsadd.h       vr18,     vr2,       vr11     // out[1]
    vsllwil.w.h   vr22,     vr18,      0
    vexth.w.h     vr18,     vr18
    vneg.w        vr22,     vr22
    vneg.w        vr18,     vr18
    vssrarni.h.w  vr18,     vr22,      0        // out[1]

    vsadd.h       vr2,      vr1,       vr10     // out[2]
    vssub.h       vr11,     vr1,       vr10     // t14a
    vssub.h       vr12,     vr16,      vr6      // t15a
    vsadd.h       vr9,      vr16,      vr6      // out[13]
    vsllwil.w.h   vr22,     vr9,       0
    vexth.w.h     vr9,      vr9
    vneg.w        vr22,     vr22
    vneg.w        vr9,      vr9
    vssrarni.h.w  vr9,      vr22,      0        // out[13]

    vldrepl.w     vr20,     t0,        0        // 2896
    vmul_vmadd_w vr17, vr7, vr20, vr20, vr6, vr10
    vmul_vmsub_w vr17, vr7, vr20, vr20, vr16, vr1
    vssrarni.h.w  vr10,     vr6,       12       // out[7]

    vsllwil.w.h   vr7,      vr10,      0
    vexth.w.h     vr10,     vr10
    vneg.w        vr7,      vr7
    vneg.w        vr10,     vr10
    vssrarni.h.w  vr10,     vr7,       0
    vssrarni.h.w  vr1,      vr16,      12       // out[8]

    vmul_vmsub_w vr0, vr8, vr20, vr20, vr16, vr17
    vmul_vmadd_w vr0, vr8, vr20, vr20, vr6, vr7
    vssrarni.h.w  vr17,     vr16,      12       // out[11]

    vsllwil.w.h   vr0,      vr17,      0
    vexth.w.h     vr17,     vr17
    vneg.w        vr0,      vr0
    vneg.w        vr17,     vr17
    vssrarni.h.w  vr17,     vr0,       0
    vssrarni.h.w  vr7,      vr6,       12       // out[4]

    vmul_vmsub_w vr4, vr19, vr20, vr20, vr16, vr0
    vmul_vmadd_w vr4, vr19, vr20, vr20, vr6, vr8
    vssrarni.h.w  vr0,      vr16,      12       // out[9]

    vsllwil.w.h   vr4,      vr0,       0
    vexth.w.h     vr0,      vr0
    vneg.w        vr4,      vr4
    vneg.w        vr0,      vr0
    vssrarni.h.w  vr0,      vr4,       0
    vssrarni.h.w  vr8,      vr6,       12       // out[6]

    vmul_vmadd_w vr11, vr12, vr20, vr20, vr6, vr4
    vmul_vmsub_w vr11, vr12, vr20, vr20, vr16, vr19
    vssrarni.h.w  vr4,      vr6,       12       // out[5]

    vsllwil.w.h   vr24,     vr4,       0
    vexth.w.h     vr4,      vr4
    vneg.w        vr24,     vr24
    vneg.w        vr4,      vr4
    vssrarni.h.w  vr4,      vr24,      0
    vssrarni.h.w  vr19,     vr16,      12       // out[10]

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
                       vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
                       vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
                       vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
                       vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
    vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
    vsrari.h      \i,       \i,       \shift
.endr
.endif

.ifnb \vst
    vst_x16 t1, 0, 16, vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
            vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15
.endif
// out0 out1 out2 out3 out4 out5 out6 out7
// vr14 vr18 vr2  vr5  vr7  vr4  vr8  vr10
// out8 out9 out10 out11 out12 out13 out14 out15
// vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15
.endm // adst16_core_lsx

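// adst16_core_finish_lsx: add-back for one 8x16 half. t2/t3 point at the
// destination rows to read and t4/t5 at the rows to write; the residuals
// are rounded right by 4, added to the zero-extended pixels and
// saturating-packed before the 8-byte vstelm.d stores.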
.macro adst16_core_finish_lsx in0, in1, in2, in3, in4, in5, in6, in7
    fld.d         f20,      t2,       0
    fldx.d        f21,      t2,       a1
    fld.d         f22,      t3,       0
    fldx.d        f23,      t3,       a1

    alsl.d        t2,       a1,       t2,     2
    alsl.d        t3,       a1,       t3,     2

    fld.d         f24,      t2,       0
    fldx.d        f25,      t2,       a1
    fld.d         f26,      t3,       0
    fldx.d        f27,      t3,       a1

.irp i, vr20, vr21, vr22, vr23, vr24, vr25, vr26, vr27
    vsllwil.hu.bu \i,       \i,       0
.endr

.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    vsrari.h      \i,       \i,       4
.endr

    vadd.h        vr20,     vr20,     \in0
    vadd.h        vr21,     vr21,     \in1
    vadd.h        vr22,     vr22,     \in2
    vadd.h        vr23,     vr23,     \in3
    vadd.h        vr24,     vr24,     \in4
    vadd.h        vr25,     vr25,     \in5
    vadd.h        vr26,     vr26,     \in6
    vadd.h        vr27,     vr27,     \in7

    vssrani.bu.h  vr21,     vr20,     0
    vssrani.bu.h  vr23,     vr22,     0
    vssrani.bu.h  vr25,     vr24,     0
    vssrani.bu.h  vr27,     vr26,     0

    vstelm.d      vr21,     t4,       0,     0
    vstelm.d      vr21,     t5,       0,     1

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1
    vstelm.d      vr23,     t4,       0,     0
    vstelm.d      vr23,     t5,       0,     1

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1
    vstelm.d      vr25,     t4,       0,     0
    vstelm.d      vr25,     t5,       0,     1

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1
    vstelm.d      vr27,     t4,       0,     0
    vstelm.d      vr27,     t5,       0,     1

.endm // adst16_core_finish_lsx

function inv_txfm_add_dct_adst_8x16_8bpc_lsx
    addi.d        sp,       sp,       -64
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40
    fst.d         f30,      sp,       48
    fst.d         f31,      sp,       56

    vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local      t0,       idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, rect2_lsx

    vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, rect2_lsx

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
        vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    vsrari.h      \i,       \i,       1
.endr

    vreplgr2vr.h  vr23,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst           vr23,     a2,       \i
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                       vr8, vr9, vr10, vr20, vr21, vr22, vr23, vr31

    LSX_TRANSPOSE8x8_H vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30, \
                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
                       vr16, vr17, vr18, vr20, vr21, vr22, vr23, vr31

    adst16_core_lsx , ,

    addi.d        t2,       a0,       0
    alsl.d        t3,       a1,       a0,     1
    addi.d        t4,       a0,       0
    add.d         t5,       a1,       a0

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d        t2,       a1,       t2,    2
    alsl.d        t3,       a1,       t3,    2

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    fld.d         f30,      sp,       48
    fld.d         f31,      sp,       56
    addi.d        sp,       sp,       64
endfunc

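// malloc_space/free_space: reserve \number bytes of transform scratch below
// a 64-byte spill slot for f24-f31, which are callee-saved in the
// LoongArch64 psABI and clobbered by the wider transforms.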
.macro malloc_space number
    li.w          t0,       \number
    sub.d         sp,       sp,       t0
    addi.d        sp,       sp,       -64
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40
    fst.d         f30,      sp,       48
    fst.d         f31,      sp,       56
.endm

.macro free_space number
    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    fld.d         f30,      sp,       48
    fld.d         f31,      sp,       56
    li.w          t0,       \number
    add.d         sp,       sp,       t0
    addi.d        sp,       sp,       64
.endm

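// DST_ADD_W16: add residuals to four 16-pixel destination rows (in0..in3).
// Each row is widened to two 16-bit halves (vsllwil.hu.bu for the low eight
// pixels, vexth.hu.bu for the high eight), summed with the matching residual
// vectors in4..in11 and saturating-packed back to bytes before the
// full-width stores at a0 and t2.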
.macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11
    vsllwil.hu.bu vr10,     \in0,     0
    vexth.hu.bu   vr0,      \in0
    vsllwil.hu.bu vr11,     \in1,     0
    vexth.hu.bu   vr1,      \in1
    vsllwil.hu.bu vr12,     \in2,     0
    vexth.hu.bu   vr2,      \in2
    vsllwil.hu.bu vr13,     \in3,     0
    vexth.hu.bu   vr3,      \in3
    vadd.h        vr10,     vr10,     \in4
    vadd.h        vr0,      vr0,      \in5
    vadd.h        vr11,     vr11,     \in6
    vadd.h        vr1,      vr1,      \in7
    vadd.h        vr12,     vr12,     \in8
    vadd.h        vr2,      vr2,      \in9
    vadd.h        vr13,     vr13,     \in10
    vadd.h        vr3,      vr3,      \in11
    vssrani.bu.h  vr0,      vr10,     0
    vssrani.bu.h  vr1,      vr11,     0
    vssrani.bu.h  vr2,      vr12,     0
    vssrani.bu.h  vr3,      vr13,     0
    vst           vr0,      a0,       0
    vstx          vr1,      a0,       a1
    vst           vr2,      t2,       0
    vstx          vr3,      t2,       a1
.endm

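// VLD_DST_ADD_W16: load-then-add wrapper around DST_ADD_W16. The optional
// \shift argument folds in the final rounding shift (the full transforms
// pass 4; the dc-only paths leave it blank since dc is pre-scaled).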
.macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, shift

.ifnb \shift
.irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
    vsrari.h      \i,       \i,       \shift
.endr
.endif

    vld           vr0,      a0,       0
    vldx          vr1,      a0,       a1
    vld           vr2,      t2,       0
    vldx          vr3,      t2,       a1
    DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \
                \in4, \in5, \in6, \in7
.endm

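// 16x8 is the other rect2 shape: the first pass pre-scales all 16 input
// vectors by 2896/4096 ~= 1/sqrt(2) (rect2_lsx) before the shared
// dct_8x16_core_lsx, and the second pass runs two plain 8-point DCTs on the
// transposed halves.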
function inv_txfm_add_dct_dct_16x8_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_16x8

    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr5,      0x880            // 128
    vmul.w        vr2,      vr0,      vr1    // dc * 181
    st.h          zero,     a2,       0
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    alsl.d        t2,       a1,       a0,    1
    vmul.w        vr2,      vr2,      vr0
    vldx          vr1,      a0,       a1
    vsrari.w      vr2,      vr2,      8
    vldx          vr3,      t2,       a1
    vsrari.w      vr2,      vr2,      1      // (dc + rnd) >> shift
    vmadd.w       vr5,      vr2,      vr0
    vld           vr0,      a0,       0
    vssrarni.h.w  vr5,      vr5,      12
    vld           vr2,      t2,       0

    DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5

    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,

    b             .DCT_DCT_16x8_END

.NO_HAS_DCONLY_16x8:
    malloc_space 512

    vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    la.local      t0,       idct_coeffs

    vldrepl.w     vr23,     t0,       0   // 2896
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    rect2_lsx \i, vr23, \i
.endr

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
                       vr13, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \
                       vr13, vr31, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
        vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24
    vsrari.h       \i,       \i,       1
.endr

    vst_x16 sp, 64, 16, vr13, vr18, vr17, vr28, vr11, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr12, vr29, vr26, vr25, vr24

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst           vr23,     a2,       \i
.endr

    dct_8x8_core_lsx vr0, vr18, vr17, vr28, vr11, vr14, vr15, vr16,  \
                     vr4, vr5, vr6, vr16, vr7, vr18, vr19, vr31, no_rect2

    dct_8x8_core_lsx vr27, vr30, vr1, vr12, vr29, vr26, vr25, vr24, \
                     vr14, vr15, vr17, vr20, vr21, vr22, vr23, vr28, no_rect2

    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W16 vr4, vr14, vr5, vr15, vr6, vr17, vr16, vr20, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W16 vr7, vr21, vr18, vr22, vr19, vr23, vr31, vr28, 4

    free_space 512

.DCT_DCT_16x8_END:

endfunc

function inv_txfm_add_adst_dct_16x8_8bpc_lsx
    addi.d        sp,       sp,       -64
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    fst.d         f29,      sp,       40
    fst.d         f30,      sp,       48
    fst.d         f31,      sp,       56

    addi.d        t1,       sp,       64
    addi.d        t2,       a2,       0

    vld_x16 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    la.local      t0,       idct_coeffs

    vldrepl.w     vr23,     t0,       0         // 2896
.irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
     vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    rect2_lsx     \i,       vr23,     \i
.endr

    adst16_core_lsx , 1,

    // out0 out1 out2 out3 out4 out5 out6 out7
    // vr14 vr18 vr2  vr5  vr7  vr4  vr8  vr10
    // out8 out9 out10 out11 out12 out13 out14 out15
    // vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15

    LSX_TRANSPOSE8x8_H vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10, \
                       vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \
                       vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23

    LSX_TRANSPOSE8x8_H vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15, \
                       vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \
                       vr6, vr11, vr12, vr16, vr20, vr21, vr22, vr23

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240
    vst           vr23,     a2,       \i
.endr

    dct_8x8_core_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr24, vr25, \
                     vr27, vr28, vr29, vr25, vr30, vr31, vr6, vr16, no_rect2

    dct_8x8_core_lsx vr1, vr0, vr19, vr17, vr3, vr26, vr13, vr15, \
                     vr5, vr7, vr18, vr20, vr21, vr22, vr23, vr24, no_rect2

    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W16 vr27, vr5, vr28, vr7, vr29, vr18, vr25, vr20, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    VLD_DST_ADD_W16 vr30, vr21, vr31, vr22, vr6, vr23, vr16, vr24, 4

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    fld.d         f29,      sp,       40
    fld.d         f30,      sp,       48
    fld.d         f31,      sp,       56
    addi.d        sp,       sp,       64
endfunc

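// 16x16: square, so no rect2 scaling. Two column passes (left and right
// coefficient halves) run dct_8x16_core_lsx, shift the intermediates right
// by 2 and park them in the 512-byte stack scratch; the two row passes then
// reload 8x16 tiles from sp and add back with the final >> 4.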
5435function inv_txfm_add_dct_dct_16x16_8bpc_lsx
5436    bnez          a3,       .NO_HAS_DCONLY_16x16
5437
5438    ld.h          t2,       a2,       0      // dc
5439    vldi          vr0,      0x8b5            // 181
5440    vreplgr2vr.w  vr1,      t2
5441    vldi          vr5,      0x880            // 128
5442    vmul.w        vr2,      vr0,      vr1    // dc * 181
5443    st.h          zero,     a2,       0
5444    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
5445    alsl.d        t2,       a1,       a0,    1
5446    vsrari.w      vr2,      vr2,      2      // (dc + rnd) >> shift
5447    vldx          vr1,      a0,       a1
5448    vmadd.w       vr5,      vr2,      vr0
5449    vldx          vr3,      t2,       a1
5450    vssrarni.h.w  vr5,      vr5,      12
5451    vld           vr0,      a0,       0
5452    vld           vr2,      t2,       0
5453
5454    DST_ADD_W16 vr0, vr1, vr2, vr3, vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5
5455
5456    alsl.d        a0,       a1,       a0,     2
5457    alsl.d        t2,       a1,       a0,     1
5458
5459    VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
5460
5461    alsl.d        a0,       a1,       a0,     2
5462    alsl.d        t2,       a1,       a0,     1
5463
5464    VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
5465
5466    alsl.d        a0,       a1,       a0,     2
5467    alsl.d        t2,       a1,       a0,     1
5468
5469    VLD_DST_ADD_W16 vr5, vr5, vr5, vr5, vr5, vr5, vr5, vr5,
5470
5471    b             .DCT_DCT_16x16_END
5472
5473.NO_HAS_DCONLY_16x16:
5474
5475    malloc_space 512
5476
5477    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
5478            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
5479
5480    dct_8x16_core_lsx
5481
5482    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
5483                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
5484                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
5485
5486    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
5487                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
5488                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
5489
5490.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
5491        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
5492    vsrari.h       \i,       \i,       2
5493.endr
5494
5495    vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
5496            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
5497
5498    vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
5499            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
5500
5501    dct_8x16_core_lsx
5502
5503    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
5504                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
5505                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
5506
5507    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
5508                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
5509                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
5510
5511.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
5512        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
5513    vsrari.h      \i,       \i,       2
5514.endr
5515
5516    vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
5517            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
5518
5519    vreplgr2vr.h  vr31,     zero
5520
5521.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
5522        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
5523        464, 480, 496
    vst           vr31,     a2,       \i
.endr

    vld_x8 sp, 64, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 sp, 320, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x8 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
    vst_x8 sp, 320, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x8 sp, 192, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 sp, 448, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       64
    vld           vr5,      sp,       80
    vld           vr6,      sp,       96
    vld           vr7,      sp,       112
    VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       128
    vld           vr5,      sp,       144
    vld           vr6,      sp,       160
    vld           vr7,      sp,       176
    VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       320
    vld           vr5,      sp,       336
    vld           vr6,      sp,       352
    vld           vr7,      sp,       368
    VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       384
    vld           vr5,      sp,       400
    vld           vr6,      sp,       416
    vld           vr7,      sp,       432
    VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4

    free_space 512

.DCT_DCT_16x16_END:
endfunc

function inv_txfm_add_adst_adst_16x16_8bpc_lsx

    malloc_space 256+256

    addi.d        t1,       sp,        64
    addi.d        t2,       a2,        0

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    addi.d        t2,       a2,        16
    addi.d        t1,       t1,        256

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    vreplgr2vr.h  vr23,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst           vr23,     a2,       \i
.endr

    addi.d        t2,       sp,       64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    // out0 out1 out2 out3 out4 out5 out6 out7
    // vr14 vr18 vr2  vr5  vr7  vr4  vr8  vr10
    // out8 out9 out10 out11 out12 out13 out14 out15
    // vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15

    addi.d        t2,       a0,       0
    alsl.d        t3,       a1,       a0,     1
    addi.d        t4,       a0,       0
    add.d         t5,       a1,       a0
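    // t2..t5 seed the destination row pointers consumed by
    // adst16_core_finish_lsx (presumably dst, dst + 2*stride, dst,
    // dst + stride); they advance by 4*stride / 2*stride before the
    // second call below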

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d        t2,       a1,       t2,    2
    alsl.d        t3,       a1,       t3,    2

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    addi.d        t2,       sp,       64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    addi.d        a0,       a0,       8

    addi.d        t2,       a0,       0
    alsl.d        t3,       a1,       a0,    1
    addi.d        t4,       a0,       0
    add.d         t5,       a1,       a0

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d        t2,       a1,       t2,    2
    alsl.d        t3,       a1,       t3,    2

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    free_space 256+256
endfunc

function inv_txfm_add_adst_dct_16x16_8bpc_lsx
    malloc_space 256+256

    addi.d        t1,       sp,        64
    addi.d        t2,       a2,        0

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    addi.d        t2,       a2,        16
    addi.d        t1,       t1,        256

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst           vr23,     a2,       \i
.endr

    addi.d        t2,       sp,       64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
    vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    addi.d        t2,       sp,       64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       64
    vld           vr5,      sp,       80
    vld           vr6,      sp,       96
    vld           vr7,      sp,       112
    VLD_DST_ADD_W16 vr4, vr22, vr5, vr18, vr6, vr17, vr7, vr28, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       128
    vld           vr5,      sp,       144
    vld           vr6,      sp,       160
    vld           vr7,      sp,       176
    VLD_DST_ADD_W16 vr4, vr20, vr5, vr14, vr6, vr15, vr7, vr16, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       320
    vld           vr5,      sp,       336
    vld           vr6,      sp,       352
    vld           vr7,      sp,       368
    VLD_DST_ADD_W16 vr4, vr27, vr5, vr30, vr6, vr23, vr7, vr21, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       384
    vld           vr5,      sp,       400
    vld           vr6,      sp,       416
    vld           vr7,      sp,       432
    VLD_DST_ADD_W16 vr4, vr29, vr5, vr26, vr6, vr25, vr7, vr24, 4

    free_space 256+256
endfunc

function inv_txfm_add_dct_adst_16x16_8bpc_lsx
    malloc_space 256+256

    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
           vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h       \i,       \i,       2
.endr

    vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
           vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h      \i,       \i,       2
.endr

    vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vreplgr2vr.h  vr31,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst           vr31,     a2,       \i
.endr

    addi.d        t2,       sp,       64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    // out0 out1 out2 out3 out4 out5 out6 out7
    // vr14 vr18 vr2  vr5  vr7  vr4  vr8  vr10
    // out8 out9 out10 out11 out12 out13 out14 out15
    // vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15

    addi.d        t2,       a0,       0
    alsl.d        t3,       a1,       a0,     1
    addi.d        t4,       a0,       0
    add.d         t5,       a1,       a0

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d        t2,       a1,       t2,    2
    alsl.d        t3,       a1,       t3,    2

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    addi.d        t2,       sp,       64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    addi.d        a0,       a0,       8

    addi.d        t2,       a0,       0
    alsl.d        t3,       a1,       a0,    1
    addi.d        t4,       a0,       0
    add.d         t5,       a1,       a0

    adst16_core_finish_lsx vr14, vr18, vr2, vr5, vr7, vr4, vr8, vr10

    alsl.d        t2,       a1,       t2,    2
    alsl.d        t3,       a1,       t3,    2

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1

    adst16_core_finish_lsx vr1, vr0, vr19, vr17, vr3, vr9, vr13, vr15

    free_space 256+256
endfunc

const shufb
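    // vshuf.b control pattern that reverses the eight 16-bit lanes of a
    // 128-bit vector; used to flip the output order in the flipadst paths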
    .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
endconst

function inv_txfm_add_flipadst_dct_16x16_8bpc_lsx
    malloc_space 256+256

    addi.d        t1,       sp,        64
    addi.d        t2,       a2,        0

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    addi.d        t2,       a2,        16
    addi.d        t1,       t1,        256

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx transpose8x8, 2, vst_x16

    vreplgr2vr.h  vr23,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst           vr23,     a2,       \i
.endr

    addi.d        t2,       sp,       64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    la.local      t0,       shufb
    vld           vr0,      t0,       0

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
     vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vshuf.b       \i,       \i,       \i,    vr0
.endr

    vst_x8 t2, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
    vst_x8 t2, 256, 16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    addi.d        t2,       sp,       64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    la.local      t0,       shufb
    vld           vr0,      t0,       0

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
     vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vshuf.b       \i,       \i,       \i,    vr0
.endr

    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       64
    vld           vr5,      sp,       80
    vld           vr6,      sp,       96
    vld           vr7,      sp,       112
    VLD_DST_ADD_W16 vr22, vr4, vr18, vr5, vr17, vr6, vr28, vr7, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       128
    vld           vr5,      sp,       144
    vld           vr6,      sp,       160
    vld           vr7,      sp,       176
    VLD_DST_ADD_W16 vr20, vr4, vr14, vr5, vr15, vr6, vr16, vr7, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       320
    vld           vr5,      sp,       336
    vld           vr6,      sp,       352
    vld           vr7,      sp,       368
    VLD_DST_ADD_W16 vr27, vr4, vr30, vr5, vr23, vr6, vr21, vr7, 4

    alsl.d        a0,       a1,       a0,    2
    alsl.d        t2,       a1,       a0,    1
    vld           vr4,      sp,       384
    vld           vr5,      sp,       400
    vld           vr6,      sp,       416
    vld           vr7,      sp,       432
    VLD_DST_ADD_W16 vr29, vr4, vr26, vr5, vr25, vr6, vr24, vr7, 4

    free_space 256+256
endfunc

function inv_txfm_add_dct_flipadst_16x16_8bpc_lsx
    malloc_space 256+256

    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
           vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h       \i,       \i,       2
.endr

    vst_x16 sp, 64, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
           vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    LSX_TRANSPOSE8x8_H vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    LSX_TRANSPOSE8x8_H vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, \
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

.irp i, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
        vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
    vsrari.h      \i,       \i,       2
.endr

    vst_x16 sp, 320, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vreplgr2vr.h  vr31,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst           vr31,     a2,       \i
.endr

    addi.d        t2,       sp,       64

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    // out0 out1 out2 out3 out4 out5 out6 out7
    // vr14 vr18 vr2  vr5  vr7  vr4  vr8  vr10
    // out8 out9 out10 out11 out12 out13 out14 out15
    // vr1  vr0  vr19  vr17  vr3   vr9   vr13  vr15

    la.local      t0,       shufb
    vld           vr31,     t0,       0

    addi.d        t2,       a0,       0
    alsl.d        t3,       a1,       a0,     1
    addi.d        t4,       a0,       0
    add.d         t5,       a1,       a0

    adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1

    alsl.d        t2,       a1,       t2,    2
    alsl.d        t3,       a1,       t3,    2

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1

    adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14

    addi.d        t2,       sp,       64+128

    vld_x8 t2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    vld_x8 t2, 256, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15

    adst16_core_lsx , ,

    addi.d        a0,       a0,       8

    la.local      t0,       shufb
    vld           vr31,     t0,       0

    addi.d        t2,       a0,       0
    alsl.d        t3,       a1,       a0,    1
    addi.d        t4,       a0,       0
    add.d         t5,       a1,       a0

    adst16_core_finish_lsx vr15, vr13, vr9, vr3, vr17, vr19, vr0, vr1

    alsl.d        t2,       a1,       t2,    2
    alsl.d        t3,       a1,       t3,    2

    alsl.d        t4,       a1,       t4,    1
    alsl.d        t5,       a1,       t5,    1

    adst16_core_finish_lsx vr10, vr8, vr4, vr7, vr5, vr2, vr18, vr14

    free_space 256+256

endfunc

function inv_txfm_add_dct_dct_8x32_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_8x32

    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr5,      0x880            // 128
    vmul.w        vr2,      vr0,      vr1    // dc * 181
    st.h          zero,     a2,       0
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    vld           vr10,     a0,       0      // 0 1 2 3 4 5 6 7
    vsrari.w      vr2,      vr2,      2      // (dc + rnd) >> shift
    vldx          vr11,     a0,       a1     // 8 9 10 11 12 13 14 15
    alsl.d        t2,       a1,       a0,    1
    vmadd.w       vr5,      vr2,      vr0
    vld           vr12,     t2,       0      // 16 17 18 19 20 21 22 23
    vssrarni.h.w  vr5,      vr5,      12
    vldx          vr13,     t2,       a1     // 24 25 26 27 28 29 30 31

    DST_ADD_W8 vr10, vr11, vr12, vr13, vr5, vr5, vr5, vr5

.rept 7
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       a0,     1

    VLD_DST_ADD_W8 vr5, vr5, vr5, vr5
.endr

    b             .DCT_DCT_8X32_END
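
/*
 * Rough C sketch of the dc-only fast path above (illustrative only, not
 * dav1d's C reference; iclip_u8 stands for the usual clamp-to-[0,255]
 * helper). When eob == 0 only coeff[0] is non-zero, so the whole 8x32
 * inverse transform collapses to adding one rounded dc value to every
 * pixel of the block:
 *
 *     int dc = coeff[0];
 *     coeff[0] = 0;
 *     dc = (dc * 181 + 128) >> 8;          // 1/sqrt(2) rescale, rounded
 *     dc = (dc + 2) >> 2;                  // row-pass rounding shift (shift = 2)
 *     dc = (dc * 181 + 128 + 2048) >> 12;  // rescale + final rounding shift
 *     for (int y = 0; y < 32; y++)
 *         for (int x = 0; x < 8; x++)
 *             dst[y * stride + x] = iclip_u8(dst[y * stride + x] + dc);
 */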

.NO_HAS_DCONLY_8x32:
    malloc_space 512

    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local      t0,       idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h      \i,       \i,       2
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    vst_x8 sp, 64, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vld_x8 a2, 16, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h      \i,       \i,       2
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    vst_x8 sp, 192, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vld_x8 a2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local      t0,       idct_coeffs

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h      \i,       \i,       2
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    vst_x8 sp, 320, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vld_x8 a2, 48, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x8_core_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
                     vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, no_rect2

.irp i, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
    vsrari.h      \i,       \i,       2
.endr

    LSX_TRANSPOSE8x8_H vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, \
                       vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    vst_x8 sp, 448, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vreplgr2vr.h  vr31,     zero

.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496
    vst           vr31,     a2,       \i
.endr

    addi.d       t2,   sp, 64
    addi.d       t3,   sp, 64

    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x16 t3, 0, 32, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    // in1  in3  in5  in7  in9 in11 in13 in15
    // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    // in17  in19  in21  in23  in25  in27  in29  in31

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       64           // 201
    vldrepl.w     vr21,     t0,       68           // 4091

    vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
    vssrarni.h.w  vr9,      vr8,      12           // t31a
    vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
    vssrarni.h.w  vr10,     vr11,     12           // t16a
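    // Each vmul_vmadd_w/vmul_vmsub_w + vssrarni.h.w pair above is one
    // 12-bit fixed-point butterfly rotation. In scalar terms (assuming
    // vmul_vmadd_w d = a*c0 + b*c1 and vmul_vmsub_w d = a*c0 - b*c1,
    // per the macro naming):
    //   t31a = (in1 * 4091 + in31 *  201 + 2048) >> 12
    //   t16a = (in1 *  201 - in31 * 4091 + 2048) >> 12
    // The groups below repeat this with the remaining coefficient pairs.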

    vldrepl.w     vr20,     t0,       72           // 3035
    vldrepl.w     vr21,     t0,       76           // 2751
    vmul_vmadd_w vr19, vr7, vr21, vr20, vr11, vr0
    vssrarni.h.w  vr0,      vr11,     12           // t30a
    vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
    vssrarni.h.w  vr30,     vr11,     12           // t17a

    vldrepl.w     vr20,     t0,       80           // 1751
    vldrepl.w     vr21,     t0,       84           // 3703
    vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
    vssrarni.h.w  vr7,      vr8,      12           // t29a
    vmul_vmsub_w vr4, vr26, vr20, vr21, vr8, vr19
    vssrarni.h.w  vr19,     vr8,      12           // t18a

    vldrepl.w     vr20,     t0,       88           // 3857
    vldrepl.w     vr21,     t0,       92           // 1380
    vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
    vssrarni.h.w  vr4,      vr8,      12           // t28a
    vmul_vmsub_w vr27, vr3, vr20, vr21, vr8, vr26
    vssrarni.h.w  vr26,     vr8,      12           // t19a

    vldrepl.w     vr20,     t0,       96           // 995
    vldrepl.w     vr21,     t0,       100          // 3973
    vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3
    vssrarni.h.w  vr3,      vr8,      12           // t27a
    vmul_vmsub_w vr2, vr28, vr20, vr21, vr8, vr27
    vssrarni.h.w  vr27,     vr8,      12           // t20a

    vldrepl.w     vr20,     t0,       104          // 3513
    vldrepl.w     vr21,     t0,       108          // 2106
    vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2
    vssrarni.h.w  vr2,      vr8,      12           // t26a
    vmul_vmsub_w vr25, vr5, vr20, vr21, vr8, vr28
    vssrarni.h.w  vr28,     vr8,      12           // t21a

    vldrepl.w     vr20,     t0,       112          // 2440 -> 1220
    vldrepl.w     vr21,     t0,       116          // 3290 -> 1645
    vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5
    vssrarni.h.w  vr5,      vr8,      12           // t25a
    vmul_vmsub_w vr6, vr24, vr20, vr21, vr8, vr25
    vssrarni.h.w  vr25,     vr8,      12           // t22a

    vldrepl.w     vr20,     t0,       120          // 4052
    vldrepl.w     vr21,     t0,       124          // 601
    vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6
    vssrarni.h.w  vr6,      vr8,      12           // t24a
    vmul_vmsub_w vr29, vr1, vr20, vr21, vr8, vr24
    vssrarni.h.w  vr24,     vr8,      12           // t23a

    vsadd.h       vr1,      vr10,     vr30         // t16
    vssub.h       vr29,     vr10,     vr30         // t17
    vssub.h       vr8,      vr26,     vr19         // t18
    vsadd.h       vr31,     vr26,     vr19         // t19
    vsadd.h       vr10,     vr27,     vr28         // t20
    vssub.h       vr30,     vr27,     vr28         // t21
    vssub.h       vr19,     vr24,     vr25         // t22
    vsadd.h       vr26,     vr24,     vr25         // t23
    vsadd.h       vr27,     vr6,      vr5          // t24
    vssub.h       vr28,     vr6,      vr5          // t25
    vssub.h       vr24,     vr3,      vr2          // t26
    vsadd.h       vr25,     vr3,      vr2          // t27
    vsadd.h       vr5,      vr4,      vr7          // t28
    vssub.h       vr6,      vr4,      vr7          // t29
    vssub.h       vr2,      vr9,      vr0          // t30
    vsadd.h       vr3,      vr9,      vr0          // t31

    vldrepl.w     vr20,     t0,       16           // 799
    vldrepl.w     vr21,     t0,       20           // 4017
    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
    vssrarni.h.w  vr7,      vr4,      12           // t30a
    vmul_vmsub_w vr2, vr29, vr20, vr21, vr4, vr0
    vssrarni.h.w  vr0,      vr4,      12           // t17a
    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
    vneg.w        vr4,      vr4
    vneg.w        vr9,      vr9
    vssrarni.h.w  vr9,      vr4,      12           // t18a
    vmul_vmsub_w vr6, vr8, vr20, vr21, vr4, vr2
    vssrarni.h.w  vr2,      vr4,      12           // t29a

    vldrepl.w     vr20,     t0,       24           // 3406 -> 1703
    vldrepl.w     vr21,     t0,       28           // 2276 -> 1138
    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
    vssrarni.h.w  vr29,     vr4,      12           // t26a
    vmul_vmsub_w vr24, vr30, vr20, vr21, vr4, vr6
    vssrarni.h.w  vr6,      vr4,      12           // t21a

    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
    vneg.w        vr4,      vr4
    vneg.w        vr8,      vr8
    vssrarni.h.w  vr8,      vr4,      12           // t22a
    vmul_vmsub_w vr28, vr19, vr20, vr21, vr4, vr24
    vssrarni.h.w  vr24,     vr4,      12           // t25a

    vsadd.h       vr4,      vr1,      vr31         // t16a
    vssub.h       vr30,     vr1,      vr31         // t19a
    vsadd.h       vr19,     vr0,      vr9          // t17
    vssub.h       vr28,     vr0,      vr9          // t18
    vssub.h       vr1,      vr26,     vr10         // t20a
    vsadd.h       vr31,     vr26,     vr10         // t23a
    vssub.h       vr0,      vr8,      vr6          // t21
    vsadd.h       vr9,      vr8,      vr6          // t22
    vsadd.h       vr10,     vr27,     vr25         // t24a
    vssub.h       vr26,     vr27,     vr25         // t27a
    vsadd.h       vr6,      vr24,     vr29         // t25
    vssub.h       vr8,      vr24,     vr29         // t26
    vssub.h       vr25,     vr3,      vr5          // t28a
    vsadd.h       vr27,     vr3,      vr5          // t31a
    vssub.h       vr24,     vr7,      vr2          // t29
    vsadd.h       vr29,     vr7,      vr2          // t30

    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
    vssrarni.h.w  vr5,      vr3,      12           // t29a
    vmul_vmsub_w vr24, vr28, vr20, vr21, vr3, vr2
    vssrarni.h.w  vr2,      vr3,      12           // t18a

    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
    vssrarni.h.w  vr7,      vr3,      12           // t28
    vmul_vmsub_w vr25, vr30, vr20, vr21, vr3, vr24
    vssrarni.h.w  vr24,     vr3,      12           // t19

    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
    vneg.w        vr3,      vr3
    vneg.w        vr28,     vr28
    vssrarni.h.w  vr28,     vr3,      12           // t20
    vmul_vmsub_w vr26, vr1, vr20, vr21, vr3, vr25
    vssrarni.h.w  vr25,     vr3,      12           // t27

    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
    vneg.w        vr3,      vr3
    vneg.w        vr30,     vr30
    vssrarni.h.w  vr30,     vr3,      12           // t21a
    vmul_vmsub_w vr8, vr0, vr20, vr21, vr3, vr1
    vssrarni.h.w  vr1,      vr3,      12           // t26a

    vsadd.h       vr3,      vr4,      vr31         // t16
    vssub.h       vr26,     vr4,      vr31         // t23
    vsadd.h       vr0,      vr19,     vr9          // t17a
    vssub.h       vr8,      vr19,     vr9          // t22a
    vsadd.h       vr4,      vr2,      vr30         // t18
    vssub.h       vr31,     vr2,      vr30         // t21
    vsadd.h       vr9,      vr24,     vr28         // t19a
    vssub.h       vr19,     vr24,     vr28         // t20a
    vssub.h       vr2,      vr27,     vr10         // t24
    vsadd.h       vr30,     vr27,     vr10         // t31
    vssub.h       vr24,     vr29,     vr6          // t25a
    vsadd.h       vr28,     vr29,     vr6          // t30a
    vssub.h       vr10,     vr5,      vr1          // t26
    vsadd.h       vr27,     vr5,      vr1          // t29
    vssub.h       vr6,      vr7,      vr25         // t27a
    vsadd.h       vr29,     vr7,      vr25         // t28a

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
    vssrarni.h.w  vr5,      vr1,      12           // t20
    vmul_vmadd_w vr6, vr19, vr20, vr20, vr1, vr7
    vssrarni.h.w  vr7,      vr1,      12           // t27

    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
    vssrarni.h.w  vr25,     vr1,      12           // t21a
    vmul_vmadd_w vr10, vr31, vr20, vr20, vr1, vr6
    vssrarni.h.w  vr6,      vr1,      12           // t26a

    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
    vssrarni.h.w  vr19,     vr1,      12           // t22
    vmul_vmadd_w vr24, vr8, vr20, vr20, vr1, vr10
    vssrarni.h.w  vr10,     vr1,      12           // t25

    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
    vssrarni.h.w  vr31,     vr1,      12           // t23a
    vmul_vmadd_w vr2, vr26, vr20, vr20, vr1, vr8
    vssrarni.h.w  vr8,      vr1,      12           // t24a

    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3

    vld_x8 t3, 0, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr30         // c[0]
    vssub.h       vr2,      vr11,     vr30         // c[31]
    vsadd.h       vr24,     vr12,     vr28         // c[1]
    vssub.h       vr26,     vr12,     vr28         // c[30]
    vsadd.h       vr11,     vr13,     vr27         // c[2]
    vssub.h       vr30,     vr13,     vr27         // c[29]
    vsadd.h       vr12,     vr14,     vr29         // c[3]
    vssub.h       vr28,     vr14,     vr29         // c[28]
    vsadd.h       vr13,     vr15,     vr7          // c[4]
    vssub.h       vr27,     vr15,     vr7          // c[27]
    vsadd.h       vr14,     vr16,     vr6          // c[5]
    vssub.h       vr29,     vr16,     vr6          // c[26]
    vsadd.h       vr7,      vr17,     vr10         // c[6]
    vssub.h       vr15,     vr17,     vr10         // c[25]
    vsadd.h       vr6,      vr18,     vr8          // c[7]
    vssub.h       vr16,     vr18,     vr8          // c[24]

.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
        vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
    vsrari.h      \i,       \i,       4
.endr

    vst_x8 t2, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

    vst_x8 t2, 128, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2

    vld_x8 t3, 256, 32, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr31         // c[8]
    vssub.h       vr2,      vr11,     vr31         // c[23]
    vsadd.h       vr24,     vr12,     vr19         // c[9]
    vssub.h       vr26,     vr12,     vr19         // c[22]
    vsadd.h       vr11,     vr13,     vr25         // c[10]
    vssub.h       vr30,     vr13,     vr25         // c[21]
    vsadd.h       vr12,     vr14,     vr5          // c[11]
    vssub.h       vr28,     vr14,     vr5          // c[20]
    vsadd.h       vr13,     vr15,     vr9          // c[12]
    vssub.h       vr27,     vr15,     vr9          // c[19]
    vsadd.h       vr14,     vr16,     vr4          // c[13]
    vssub.h       vr29,     vr16,     vr4          // c[18]
    vsadd.h       vr7,      vr17,     vr0          // c[14]
    vssub.h       vr15,     vr17,     vr0          // c[17]
    vsadd.h       vr6,      vr18,     vr3          // c[15]
    vssub.h       vr16,     vr18,     vr3          // c[16]

.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
        vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
    vsrari.h      \i,       \i,       4
.endr

    vst_x8 t2, 256, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

    vst_x8 t2, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2

    alsl.d        t2,       a1,       a0,     1
    addi.d        t3,       sp,       64

    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    addi.d        t3,       sp,       64+64
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       t2,     2
    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    addi.d        t3,       sp,       64+256
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       t2,     2
    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    addi.d        t3,       t3,       64
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       t2,     2
    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    addi.d        t3,       sp,       64+384
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       t2,     2
    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    addi.d        t3,       t3,       64
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       t2,     2
    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    addi.d        t3,       sp,       64+128
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       t2,     2
    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    addi.d        t3,       t3,       64
    alsl.d        a0,       a1,       a0,     2
    alsl.d        t2,       a1,       t2,     2
    vld           vr4,      t3,       0
    vld           vr5,      t3,       16
    vld           vr6,      t3,       32
    vld           vr7,      t3,       48
    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7

    free_space 512
.DCT_DCT_8X32_END:
endfunc

.macro dct_8x32_core_lsx in1, in2, vst_start0, vst_start1, vst_start2, \
                         vst_start3, transpose8x8, shift
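    // in1:           base address for the vst_x8 output stores
    // in2:           base address of the 16 even-part rows saved by dct_8x16_core_lsx
    // vst_start0..3: byte offsets of the four 8-row output groups
    // transpose8x8:  if non-blank, transpose each 8x8 block before storing (row pass)
    // shift:         if non-blank, rounding right-shift applied before storing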

    // vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    // in1  in3  in5  in7  in9 in11 in13 in15
    // vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
    // in17  in19  in21  in23  in25  in27  in29  in31

    la.local      t0,       idct_coeffs
    vldrepl.w     vr20,     t0,       64           // 201
    vldrepl.w     vr21,     t0,       68           // 4091

    vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
    vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
    vssrarni.h.w  vr9,      vr8,      12           // t31a
    vssrarni.h.w  vr10,     vr11,     12           // t16a

    vldrepl.w     vr20,     t0,       72           // 3035
    vldrepl.w     vr21,     t0,       76           // 2751
    vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0
    vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
    vssrarni.h.w  vr0,      vr8,      12           // t30a
    vssrarni.h.w  vr30,     vr11,     12           // t17a

    vldrepl.w     vr20,     t0,       80           // 1751
    vldrepl.w     vr21,     t0,       84           // 3703
    vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
    vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19
    vssrarni.h.w  vr7,      vr8,      12           // t29a
    vssrarni.h.w  vr19,     vr11,     12           // t18a

    vldrepl.w     vr20,     t0,       88           // 3857
    vldrepl.w     vr21,     t0,       92           // 1380
    vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
    vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26
    vssrarni.h.w  vr4,      vr8,      12           // t28a
    vssrarni.h.w  vr26,     vr11,     12           // t19a

    vldrepl.w     vr20,     t0,       96           // 995
    vldrepl.w     vr21,     t0,       100          // 3973
    vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3
    vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27
    vssrarni.h.w  vr3,      vr8,      12           // t27a
    vssrarni.h.w  vr27,     vr11,     12           // t20a

    vldrepl.w     vr20,     t0,       104          // 3513
    vldrepl.w     vr21,     t0,       108          // 2106
    vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2
    vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28
    vssrarni.h.w  vr2,      vr8,      12           // t26a
    vssrarni.h.w  vr28,     vr11,     12           // t21a

    vldrepl.w     vr20,     t0,       112          // 2440 -> 1220
    vldrepl.w     vr21,     t0,       116          // 3290 -> 1645
    vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5
    vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25
    vssrarni.h.w  vr5,      vr8,      12           // t25a
    vssrarni.h.w  vr25,     vr11,     12           // t22a

    vldrepl.w     vr20,     t0,       120          // 4052
    vldrepl.w     vr21,     t0,       124          // 601
    vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6
    vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr6,      vr8,      12           // t24a
    vssrarni.h.w  vr24,     vr11,     12           // t23a

    vsadd.h       vr1,      vr10,     vr30         // t16
    vssub.h       vr29,     vr10,     vr30         // t17
    vssub.h       vr8,      vr26,     vr19         // t18
    vsadd.h       vr31,     vr26,     vr19         // t19
    vsadd.h       vr10,     vr27,     vr28         // t20
    vssub.h       vr30,     vr27,     vr28         // t21
    vssub.h       vr19,     vr24,     vr25         // t22
    vsadd.h       vr26,     vr24,     vr25         // t23
    vsadd.h       vr27,     vr6,      vr5          // t24
    vssub.h       vr28,     vr6,      vr5          // t25
    vssub.h       vr24,     vr3,      vr2          // t26
    vsadd.h       vr25,     vr3,      vr2          // t27
    vsadd.h       vr5,      vr4,      vr7          // t28
    vssub.h       vr6,      vr4,      vr7          // t29
    vssub.h       vr2,      vr9,      vr0          // t30
    vsadd.h       vr3,      vr9,      vr0          // t31

    vldrepl.w     vr20,     t0,       16           // 799
    vldrepl.w     vr21,     t0,       20           // 4017
    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
    vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
    vssrarni.h.w  vr7,      vr4,      12           // t30a
    vssrarni.h.w  vr0,      vr11,     12           // t17a
    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
    vneg.w        vr4,      vr4
    vneg.w        vr9,      vr9
    vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
    vssrarni.h.w  vr9,      vr4,      12           // t18a
    vssrarni.h.w  vr2,      vr11,     12           // t29a

    vldrepl.w     vr20,     t0,       24           // 3406 -> 1703
    vldrepl.w     vr21,     t0,       28           // 2276 -> 1138
    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
    vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
    vssrarni.h.w  vr29,     vr4,      12           // t26a
    vssrarni.h.w  vr6,      vr11,     12           // t21a

    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
    vneg.w        vr4,      vr4
    vneg.w        vr8,      vr8
    vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr8,      vr4,      12           // t22a
    vssrarni.h.w  vr24,     vr11,     12           // t25a

    vsadd.h       vr4,      vr1,      vr31         // t16a
    vssub.h       vr30,     vr1,      vr31         // t19a
    vsadd.h       vr19,     vr0,      vr9          // t17
    vssub.h       vr28,     vr0,      vr9          // t18
    vssub.h       vr1,      vr26,     vr10         // t20a
    vsadd.h       vr31,     vr26,     vr10         // t23a
    vssub.h       vr0,      vr8,      vr6          // t21
    vsadd.h       vr9,      vr8,      vr6          // t22
    vsadd.h       vr10,     vr27,     vr25         // t24a
    vssub.h       vr26,     vr27,     vr25         // t27a
    vsadd.h       vr6,      vr24,     vr29         // t25
    vssub.h       vr8,      vr24,     vr29         // t26
    vssub.h       vr25,     vr3,      vr5          // t28a
    vsadd.h       vr27,     vr3,      vr5          // t31a
    vssub.h       vr24,     vr7,      vr2          // t29
    vsadd.h       vr29,     vr7,      vr2          // t30

    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
    vssrarni.h.w  vr5,      vr3,      12           // t29a
    vssrarni.h.w  vr2,      vr11,     12           // t18a

    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr7,      vr3,      12           // t28
    vssrarni.h.w  vr24,     vr11,     12           // t19

    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
    vneg.w        vr3,      vr3
    vneg.w        vr28,     vr28
    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
    vssrarni.h.w  vr28,     vr3,      12           // t20
    vssrarni.h.w  vr25,     vr11,     12           // t27

    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
    vneg.w        vr3,      vr3
    vneg.w        vr30,     vr30
    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
    vssrarni.h.w  vr30,     vr3,      12           // t21a
    vssrarni.h.w  vr1,      vr11,     12           // t26a

    vsadd.h       vr3,      vr4,      vr31         // t16
    vssub.h       vr26,     vr4,      vr31         // t23
    vsadd.h       vr0,      vr19,     vr9          // t17a
    vssub.h       vr8,      vr19,     vr9          // t22a
    vsadd.h       vr4,      vr2,      vr30         // t18
    vssub.h       vr31,     vr2,      vr30         // t21
    vsadd.h       vr9,      vr24,     vr28         // t19a
    vssub.h       vr19,     vr24,     vr28         // t20a
    vssub.h       vr2,      vr27,     vr10         // t24
    vsadd.h       vr30,     vr27,     vr10         // t31
    vssub.h       vr24,     vr29,     vr6          // t25a
    vsadd.h       vr28,     vr29,     vr6          // t30a
    vssub.h       vr10,     vr5,      vr1          // t26
    vsadd.h       vr27,     vr5,      vr1          // t29
    vssub.h       vr6,      vr7,      vr25         // t27a
    vsadd.h       vr29,     vr7,      vr25         // t28a

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
    vssrarni.h.w  vr5,      vr1,      12           // t20
    vssrarni.h.w  vr7,      vr11,     12           // t27

    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
    vssrarni.h.w  vr25,     vr1,      12           // t21a
    vssrarni.h.w  vr6,      vr11,     12           // t26a

    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
    vssrarni.h.w  vr19,     vr1,      12           // t22
    vssrarni.h.w  vr10,     vr11,     12           // t25

    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
    vssrarni.h.w  vr31,     vr1,      12           // t23a
    vssrarni.h.w  vr8,      vr11,     12           // t24a

    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3

    vld_x8 \in2, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr30         // c[0]
    vssub.h       vr2,      vr11,     vr30         // c[31]
    vsadd.h       vr24,     vr12,     vr28         // c[1]
    vssub.h       vr26,     vr12,     vr28         // c[30]
    vsadd.h       vr11,     vr13,     vr27         // c[2]
    vssub.h       vr30,     vr13,     vr27         // c[29]
    vsadd.h       vr12,     vr14,     vr29         // c[3]
    vssub.h       vr28,     vr14,     vr29         // c[28]
    vsadd.h       vr13,     vr15,     vr7          // c[4]
    vssub.h       vr27,     vr15,     vr7          // c[27]
    vsadd.h       vr14,     vr16,     vr6          // c[5]
    vssub.h       vr29,     vr16,     vr6          // c[26]
    vsadd.h       vr7,      vr17,     vr10         // c[6]
    vssub.h       vr15,     vr17,     vr10         // c[25]
    vsadd.h       vr6,      vr18,     vr8          // c[7]
    vssub.h       vr16,     vr18,     vr8          // c[24]

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
    vsrari.h      \i,       \i,       \shift
.endr
.endif

    vst_x8 \in1, \vst_start0, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
                       vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
    vsrari.h      \i,       \i,       \shift
.endr
.endif

    vst_x8 \in1, \vst_start3, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2

    vld_x8 \in2, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr31         // c[8]
    vssub.h       vr2,      vr11,     vr31         // c[23]
    vsadd.h       vr24,     vr12,     vr19         // c[9]
    vssub.h       vr26,     vr12,     vr19         // c[22]
    vsadd.h       vr11,     vr13,     vr25         // c[10]
    vssub.h       vr30,     vr13,     vr25         // c[21]
    vsadd.h       vr12,     vr14,     vr5          // c[11]
    vssub.h       vr28,     vr14,     vr5          // c[20]
    vsadd.h       vr13,     vr15,     vr9          // c[12]
    vssub.h       vr27,     vr15,     vr9          // c[19]
    vsadd.h       vr14,     vr16,     vr4          // c[13]
    vssub.h       vr29,     vr16,     vr4          // c[18]
    vsadd.h       vr7,      vr17,     vr0          // c[14]
    vssub.h       vr15,     vr17,     vr0          // c[17]
    vsadd.h       vr6,      vr18,     vr3          // c[15]
    vssub.h       vr16,     vr18,     vr3          // c[16]

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
    vsrari.h      \i,       \i,       \shift
.endr
.endif

    vst_x8 \in1, \vst_start1, 64, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
                       vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
.endif

.ifnb \shift
.irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
    vsrari.h      \i,       \i,       \shift
.endr
.endif

    vst_x8 \in1, \vst_start2, 64, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
.endm

function inv_txfm_add_dct_dct_32x32_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_32x32

    ld.h          t2,       a2,       0      // dc
    vldi          vr0,      0x8b5            // 181
    vreplgr2vr.w  vr1,      t2
    vldi          vr20,     0x880            // 128
    vmul.w        vr2,      vr0,      vr1    // dc * 181
    st.h          zero,     a2,       0
    add.d         t0,       a0,       a1
    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    vld           vr3,      t0,       16
    vsrari.w      vr2,      vr2,      2      // (dc + rnd) >> shift
    vld           vr1,      a0,       16
    vmadd.w       vr20,     vr2,      vr0
    vld           vr2,      t0,       0
    vssrarni.h.w  vr20,     vr20,     12
    vld           vr0,      a0,       0

    vsllwil.hu.bu vr4,      vr0,      0
    vsllwil.hu.bu vr5,      vr1,      0
    vsllwil.hu.bu vr6,      vr2,      0
    vsllwil.hu.bu vr7,      vr3,      0
    vexth.hu.bu   vr0,      vr0
    vexth.hu.bu   vr1,      vr1
    vexth.hu.bu   vr2,      vr2
    vexth.hu.bu   vr3,      vr3
    vadd.h        vr8,      vr4,      vr20
    vadd.h        vr9,      vr0,      vr20
    vadd.h        vr10,     vr5,      vr20
    vadd.h        vr11,     vr1,      vr20
    vadd.h        vr12,     vr6,      vr20
    vadd.h        vr13,     vr2,      vr20
    vadd.h        vr14,     vr7,      vr20
    vadd.h        vr15,     vr3,      vr20
    vssrani.bu.h  vr9,      vr8,      0
    vssrani.bu.h  vr11,     vr10,     0
    vssrani.bu.h  vr13,     vr12,     0
    vssrani.bu.h  vr15,     vr14,     0
    vst           vr9,      a0,       0
    vst           vr11,     a0,       16
    vst           vr13,     t0,       0
    vst           vr15,     t0,       16

.rept 15
    alsl.d        a0,       a1,       a0,     1
    add.d         t0,       a0,       a1

    vld           vr0,      a0,       0
    vld           vr1,      a0,       16
    vld           vr2,      t0,       0
    vld           vr3,      t0,       16
    vsllwil.hu.bu vr4,      vr0,      0
    vsllwil.hu.bu vr5,      vr1,      0
    vsllwil.hu.bu vr6,      vr2,      0
    vsllwil.hu.bu vr7,      vr3,      0
    vexth.hu.bu   vr0,      vr0
    vexth.hu.bu   vr1,      vr1
    vexth.hu.bu   vr2,      vr2
    vexth.hu.bu   vr3,      vr3
    vadd.h        vr8,      vr4,      vr20
    vadd.h        vr9,      vr0,      vr20
    vadd.h        vr10,     vr5,      vr20
    vadd.h        vr11,     vr1,      vr20
    vadd.h        vr12,     vr6,      vr20
    vadd.h        vr13,     vr2,      vr20
    vadd.h        vr14,     vr7,      vr20
    vadd.h        vr15,     vr3,      vr20
    vssrani.bu.h  vr9,      vr8,      0
    vssrani.bu.h  vr11,     vr10,     0
    vssrani.bu.h  vr13,     vr12,     0
    vssrani.bu.h  vr15,     vr14,     0
    vst           vr9,      a0,       0
    vst           vr11,     a0,       16
    vst           vr13,     t0,       0
    vst           vr15,     t0,       16
.endr

    b             .DCT_DCT_32X32_END
.NO_HAS_DCONLY_32x32:

    malloc_space 2560                              // 32*32*2+512

    addi.d        t1,       sp,       64
    addi.d        t2,       a2,       0
    addi.d        t3,       sp,       1024
    addi.d        t3,       t3,       1024
    addi.d        t3,       t3,       64

    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2

.rept 3
    addi.d        t2,       t2,       16
    addi.d        t1,       t1,       512

    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x32_core_lsx t1, t3, 0, 16, 32, 48, transpose8x8, 2
.endr

    vreplgr2vr.h     vr31,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
        240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, \
        464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, \
        688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, \
        912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, \
        1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, \
        1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, \
        1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, \
        1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, \
        1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, \
        1984, 2000, 2016, 2032
6906    vst           vr31,     a2,       \i
6907.endr
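
    // The .irp above zeroes all 32x32 input coefficients (128 stores of
    // 16 bytes), matching the convention that the coeff buffer is cleared
    // once it has been consumed.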

    addi.d        t2,       sp,       64
    addi.d        t1,       sp,       64

    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4

.rept 3
    addi.d        t2,       t2,       16
    addi.d        t1,       t1,       16

    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x16_core_lsx

    vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30

    dct_8x32_core_lsx t1, t3, 0, 512, 1024, 1536, , 4
.endr
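
    // Second (row) pass: the same 8x16 + 8x32 cores, but run in place over
    // the stack intermediate (t1 and t2 both walk the buffer), with no
    // transpose and a final rounding shift of 4 instead of 2.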

    addi.d        t2,       sp,       64

.rept 16
    add.d         t0,       a0,       a1
    vld           vr0,      a0,       0
    vld           vr1,      a0,       16
    vld           vr2,      t0,       0
    vld           vr3,      t0,       16
    vsllwil.hu.bu vr4,      vr0,      0
    vsllwil.hu.bu vr5,      vr1,      0
    vsllwil.hu.bu vr6,      vr2,      0
    vsllwil.hu.bu vr7,      vr3,      0
    vexth.hu.bu   vr0,      vr0
    vexth.hu.bu   vr1,      vr1
    vexth.hu.bu   vr2,      vr2
    vexth.hu.bu   vr3,      vr3
    vld_x8 t2, 0, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
    vadd.h        vr8,      vr4,      vr8
    vadd.h        vr9,      vr0,      vr9
    vadd.h        vr10,     vr5,      vr10
    vadd.h        vr11,     vr1,      vr11
    vadd.h        vr12,     vr6,      vr12
    vadd.h        vr13,     vr2,      vr13
    vadd.h        vr14,     vr7,      vr14
    vadd.h        vr15,     vr3,      vr15
    vssrani.bu.h  vr9,      vr8,      0
    vssrani.bu.h  vr11,     vr10,     0
    vssrani.bu.h  vr13,     vr12,     0
    vssrani.bu.h  vr15,     vr14,     0
    vst           vr9,      a0,       0
    vst           vr11,     a0,       16
    vst           vr13,     t0,       0
    vst           vr15,     t0,       16

    alsl.d        a0,       a1,       a0,     1
    addi.d        t2,       t2,       128
.endr
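
    // Unlike the dconly tail, this add loop pulls per-pixel residuals from
    // the stack intermediate (vld_x8 from t2) rather than a DC splat, then
    // performs the same widen/add/saturating-narrow store.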

    free_space 2560                                // 32*32*2+512

.DCT_DCT_32X32_END:
endfunc

.macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, \
                             out0, out1, out2, out3, out4, out5, out6, out7

    // in0 in1 in2 in3
    // dct4 in0 in2
    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vsllwil.w.h   vr22,     \in2,     0
    vexth.w.h     vr23,     \in2
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr10,     vr23,     vr20
    vmul.w        \in2,     vr22,     vr21
    vmul.w        vr9,      vr23,     vr21
    vssrarni.h.w  vr10,     vr8,      12           // t2
    vssrarni.h.w  vr9,      \in2,     12           // t3

    vldrepl.w     vr20,     t0,       0            // 2896
    vsllwil.w.h   vr22,     \in0,     0
    vexth.w.h     vr23,     \in0
    vmul.w        vr8,      vr22,     vr20
    vmul.w        \in2,     vr23,     vr20
    vssrarni.h.w  \in2,     vr8,      12

    vsadd.h       vr8,      \in2,     vr9          // c[0]
    vssub.h       vr9,      \in2,     vr9          // c[3]
    vsadd.h       \in0,     \in2,     vr10         // c[1]
    vssub.h       vr10,     \in2,     vr10         // c[2]

    // inv_dct8_1d_internal_c tx64
    // in1 in3
    vldrepl.w     vr20,     t0,       16           // 799
    vldrepl.w     vr21,     t0,       20           // 4017

    vsllwil.w.h   vr22,     \in1,     0
    vexth.w.h     vr23,     \in1
    vmul.w        \in2,     vr22,     vr21
    vmul.w        \in4,     vr23,     vr21
    vmul.w        \in1,     vr22,     vr20
    vmul.w        \in6,     vr23,     vr20
    vssrarni.h.w  \in4,     \in2,     12           // t7a
    vssrarni.h.w  \in6,     \in1,     12           // t4a

    vldrepl.w     vr20,     t0,       24           // 3406
    vldrepl.w     vr21,     t0,       28           // 2276

    vsllwil.w.h   vr22,     \in3,     0
    vexth.w.h     vr23,     \in3
    vneg.w        vr21,     vr21
    vmul.w        \in2,     vr22,     vr20
    vmul.w        \in1,     vr23,     vr20
    vmul.w        \in3,     vr22,     vr21
    vmul.w        \in7,     vr23,     vr21
    vssrarni.h.w  \in1,     \in2,     12           // t6a
    vssrarni.h.w  \in7,     \in3,     12           // t5a

    vsadd.h       \in3,     \in6,     \in7         // t4
    vssub.h       \in6,     \in6,     \in7         // t5a
    vsadd.h       \in5,     \in4,     \in1         // t7
    vssub.h       \in4,     \in4,     \in1         // t6a

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmadd_w  \in4, \in6, vr20, vr20, vr21, \in1
    vmul_vmsub_w  \in4, \in6, vr20, vr20, \in2, \in7
    vssrarni.h.w  \in1,     vr21,     12           // t6
    vssrarni.h.w  \in7,     \in2,     12           // t5

    vsadd.h       \out0,    vr8,      \in5         // c[0]
    vssub.h       \out7,    vr8,      \in5         // c[7]
    vsadd.h       \out1,    \in0,     \in1         // c[1]
    vssub.h       \out6,    \in0,     \in1         // c[6]
    vsadd.h       \out2,    vr10,     \in7         // c[2]
    vssub.h       \out5,    vr10,     \in7         // c[5]
    vsadd.h       \out3,    vr9,      \in3         // c[3]
    vssub.h       \out4,    vr9,      \in3         // c[4]
.endm
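
// In the tx64 variants the upper-half inputs are known to be zero, so each
// butterfly rotation (a*c0 +/- b*c1) collapses to plain products of the
// surviving input; every vmul pair followed by vssrarni.h.w computes a
// saturating round2(x * c, 12) on the widened halves.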

.macro dct_8x16_tx64_core_lsx
    dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \
                          vr12, vr13, vr14, vr15, vr16, vr17, vr18

    // in1 in3 in5 in7 in9  in11 in13 in15
    // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30
    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,       32           // 401
    vldrepl.w     vr21,     t0,       36           // 4076
    vsllwil.w.h   vr22,     vr1,      0
    vexth.w.h     vr23,     vr1
    vmul.w        vr0,      vr22,     vr21
    vmul.w        vr10,     vr23,     vr21
    vmul.w        vr1,      vr22,     vr20
    vmul.w        vr29,     vr23,     vr20
    vssrarni.h.w  vr10,     vr0,      12           // t15a
    vssrarni.h.w  vr29,     vr1,      12           // t8a

    vldrepl.w     vr20,     t0,       40           // 3166 -> 1583
    vldrepl.w     vr21,     t0,       44           // 2598 -> 1299
    vsllwil.w.h   vr22,     vr7,      0
    vexth.w.h     vr23,     vr7
    vneg.w        vr21,     vr21
    vmul.w        vr0,      vr22,     vr20
    vmul.w        vr30,     vr23,     vr20
    vmul.w        vr7,      vr22,     vr21
    vmul.w        vr31,     vr23,     vr21
    vssrarni.h.w  vr30,     vr0,      12           // t14a
    vssrarni.h.w  vr31,     vr7,      12           // t9a

    vldrepl.w     vr20,     t0,       48           // 1931
    vldrepl.w     vr21,     t0,       52           // 3612
    vsllwil.w.h   vr22,     vr5,      0
    vexth.w.h     vr23,     vr5
    vmul.w        vr0,      vr22,     vr21
    vmul.w        vr24,     vr23,     vr21
    vmul.w        vr5,      vr22,     vr20
    vmul.w        vr25,     vr23,     vr20
    vssrarni.h.w  vr24,     vr0,      12           // t13a
    vssrarni.h.w  vr25,     vr5,      12           // t10a

    vldrepl.w     vr20,     t0,       56           // 3920
    vldrepl.w     vr21,     t0,       60           // 1189
    vsllwil.w.h   vr22,     vr3,      0
    vexth.w.h     vr23,     vr3
    vneg.w        vr21,     vr21
    vmul.w        vr0,      vr22,     vr20
    vmul.w        vr26,     vr23,     vr20
    vmul.w        vr3,      vr22,     vr21
    vmul.w        vr27,     vr23,     vr21
    vssrarni.h.w  vr26,     vr0,      12           // t12a
    vssrarni.h.w  vr27,     vr3,      12           // t11a

    // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
    vsadd.h       vr28,     vr29,      vr31        // t8
    vssub.h       vr19,     vr29,      vr31        // t9
    vssub.h       vr29,     vr27,      vr25        // t10
    vsadd.h       vr9,      vr27,      vr25        // t11
    vsadd.h       vr31,     vr26,      vr24        // t12
    vssub.h       vr25,     vr26,      vr24        // t13
    vssub.h       vr27,     vr10,      vr30        // t14
    vsadd.h       vr24,     vr10,      vr30        // t15

    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
    vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30
    vssrarni.h.w  vr26,     vr0,       12          // t14a
    vssrarni.h.w  vr30,     vr1,       12          // t9a

    vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
    vneg.w        vr0,      vr0
    vneg.w        vr19,     vr19
    vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27
    vssrarni.h.w  vr19,     vr0,       12          // t10a
    vssrarni.h.w  vr27,     vr1,       12          // t13a

    vsadd.h       vr25,     vr28,     vr9          // t8a
    vssub.h       vr29,     vr28,     vr9          // t11a
    vssub.h       vr28,     vr24,     vr31         // t12a
    vsadd.h       vr10,     vr24,     vr31         // t15a
    vsadd.h       vr9,      vr30,     vr19         // t9
    vssub.h       vr31,     vr30,     vr19         // t10
    vssub.h       vr30,     vr26,     vr27         // t13
    vsadd.h       vr24,     vr26,     vr27         // t14

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
    vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27
    vssrarni.h.w  vr26,     vr0,      12           // t13a
    vssrarni.h.w  vr27,     vr1,      12           // t10a

    vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
    vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30
    vssrarni.h.w  vr31,     vr0,      12           // t12
    vssrarni.h.w  vr30,     vr1,      12           // t11

    // vr11 vr12 ... vr18
    vsadd.h       vr28,     vr14,     vr31         // c[3]
    vssub.h       vr29,     vr14,     vr31         // c[12]
    vsadd.h       vr20,     vr15,     vr30         // c[4]
    vssub.h       vr21,     vr15,     vr30         // c[11]
    vsadd.h       vr14,     vr16,     vr27         // c[5]
    vssub.h       vr23,     vr16,     vr27         // c[10]
    vsadd.h       vr15,     vr17,     vr9          // c[6]
    vssub.h       vr30,     vr17,     vr9          // c[9]
    vsadd.h       vr16,     vr18,     vr25         // c[7]
    vssub.h       vr27,     vr18,     vr25         // c[8]
    vsadd.h       vr17,     vr13,     vr26         // c[2]
    vssub.h       vr26,     vr13,     vr26         // c[13]
    vsadd.h       vr18,     vr12,     vr24         // c[1]
    vssub.h       vr25,     vr12,     vr24         // c[14]
    vsadd.h       vr22,     vr11,     vr10         // c[0]
    vssub.h       vr24,     vr11,     vr10         // c[15]
.endm // dct_8x16_tx64_core_lsx
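
// Outputs c[0]..c[15] are left in vr22, vr18, vr17, vr28, vr20, vr14, vr15,
// vr16, vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24, i.e. the exact
// register order the callers pass to vst_x16.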

.macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1
    vsllwil.w.h   vr22,      \in0,     0
    vexth.w.h     vr23,      \in0
    vmul.w        \tmp0,     vr22,     \in1
    vmul.w        \out0,     vr23,     \in1
    vmul.w        \tmp1,     vr22,     \in2
    vmul.w        \out1,     vr23,     \in2
    vssrarni.h.w  \out0,     \tmp0,    12
    vssrarni.h.w  \out1,     \tmp1,    12
.endm
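
// vmul_vssrarni_hw: widen \in0 to 32 bits, multiply by the two scalar
// splats \in1/\in2, and narrow back with rounding, i.e.
// \out0 = sat16(round2(\in0 * \in1, 12)) and
// \out1 = sat16(round2(\in0 * \in2, 12)).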

const idct64_coeffs, align=4
    .word         101, 4095, 2967, -2824
    .word         1660, 3745, 3822, -1474
    .word         4076, 401, 4017, 799

    .word         4036, -700, 2359, 3349
    .word         3461, -2191, 897, 3996
    .word         -3166, -2598, -799, -4017

    .word         501, 4065, 3229, -2520
    .word         2019, 3564, 3948, -1092
    .word         3612, 1931, 2276, 3406

    .word         4085, -301, 2675, 3102
    .word         3659, -1842, 1285, 3889
    .word         -3920, -1189, -3406, -2276
endconst
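
// The table is grouped as four 12-word (48-byte) blocks, one per
// dct64_step1_lsx invocation; the callers advance t0 by 48 between calls.
// The first eight words of each block feed the initial single-input
// rotations, the last four the later vmul_vmadd/vmul_vmsub rotations.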

// in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
// in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
// in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
// in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a

.macro dct64_step1_lsx

    vldrepl.w     vr20,     t0,       0            // 101
    vldrepl.w     vr21,     t0,       4            // 4095
    vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9    // vr8 t32a vr9 t63a

    vldrepl.w     vr20,     t0,       8            // 2967
    vldrepl.w     vr21,     t0,       12           // -2824
    vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11  // vr10 t62a vr11 t33a

    vldrepl.w     vr20,     t0,       16           // 1660
    vldrepl.w     vr21,     t0,       20           // 3745
    vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13  // vr12 t34a vr13 t61a

    vldrepl.w     vr20,     t0,       24           // 3822
    vldrepl.w     vr21,     t0,       28           // -1474
    vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15  // vr14 t60a vr15 t35a

    vsadd.h       vr0,      vr8,      vr11         // t32
    vssub.h       vr1,      vr8,      vr11         // t33
    vssub.h       vr2,      vr15,     vr12         // t34
    vsadd.h       vr3,      vr15,     vr12         // t35
    vsadd.h       vr4,      vr14,     vr13         // t60
    vssub.h       vr5,      vr14,     vr13         // t61
    vssub.h       vr6,      vr9,      vr10         // t62
    vsadd.h       vr7,      vr9,      vr10         // t63

    vldrepl.w     vr20,     t0,       32           // 4076
    vldrepl.w     vr21,     t0,       36           // 401
    vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10
    vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11
    vssrarni.h.w  vr10,     vr9,      12           // t62a
    vssrarni.h.w  vr11,     vr13,     12           // t33a

    vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1
    vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6
    vneg.w        vr9,      vr9
    vneg.w        vr1,      vr1
    vssrarni.h.w  vr6,      vr13,     12           // t61a
    vssrarni.h.w  vr1,      vr9,      12           // t34a

    vsadd.h       vr2,      vr0,      vr3          // t32a
    vssub.h       vr5,      vr0,      vr3          // t35a
    vsadd.h       vr9,      vr11,     vr1          // t33
    vssub.h       vr13,     vr11,     vr1          // t34
    vssub.h       vr0,      vr7,      vr4          // t60a
    vsadd.h       vr3,      vr7,      vr4          // t63a
    vssub.h       vr1,      vr10,     vr6          // t61
    vsadd.h       vr11,     vr10,     vr6          // t62

    vldrepl.w     vr20,     t0,       40           // 4017
    vldrepl.w     vr21,     t0,       44           // 799

    vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4
    vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7
    vssrarni.h.w  vr4,      vr8,      12           // t61a
    vssrarni.h.w  vr7,      vr12,     12           // t34a

    vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6
    vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10
    vssrarni.h.w  vr6,      vr8,      12           // t60
    vssrarni.h.w  vr10,     vr12,     12           // t35

    vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3
.endm // dct64_step1
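
// dct64_step1_lsx turns four loaded inputs into one eight-value octet of
// the 64-point odd half (t32a..t35 / t60..t63a and friends) and parks it at
// t6; three more invocations with t0 advanced by 48 and t6 by 128 cover the
// remaining octets.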

    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
.macro dct64_step2_lsx
    vld           vr0,      t5,       0            // t32a
    vld           vr2,      t4,       0            // t63a
    vld           vr3,      t5,       16*8         // t56a
    vld           vr1,      t4,       16*8         // t39a
    vld           vr4,      t5,       16*16        // t40a
    vld           vr6,      t4,       16*16        // t55a
    vld           vr7,      t5,       16*24        // t48a
    vld           vr5,      t4,       16*24        // t47a

    vsadd.h       vr8,      vr0,      vr1          // t32
    vssub.h       vr9,      vr0,      vr1          // t39
    vsadd.h       vr10,     vr2,      vr3          // t63
    vssub.h       vr11,     vr2,      vr3          // t56
    vssub.h       vr12,     vr5,      vr4          // t40
    vsadd.h       vr13,     vr5,      vr4          // t47
    vsadd.h       vr14,     vr7,      vr6          // t48
    vssub.h       vr15,     vr7,      vr6          // t55

    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vmul_vmadd_w  vr11, vr9, vr21, vr20, vr0, vr2
    vmul_vmsub_w  vr11, vr9, vr20, vr21, vr1, vr3
    vssrarni.h.w  vr2,      vr0,      12           // t56a
    vssrarni.h.w  vr3,      vr1,      12           // t39a

    vmul_vmadd_w  vr15, vr12, vr21, vr20, vr0, vr4
    vmul_vmsub_w  vr15, vr12, vr20, vr21, vr1, vr5
    vneg.w        vr0,      vr0
    vneg.w        vr4,      vr4
    vssrarni.h.w  vr5,      vr1,      12           // t55a
    vssrarni.h.w  vr4,      vr0,      12           // t40a

    vsadd.h       vr9,      vr8,      vr13         // t32a
    vssub.h       vr11,     vr8,      vr13         // t47a
    vsadd.h       vr6,      vr3,      vr4          // t39
    vssub.h       vr7,      vr3,      vr4          // t40
    vssub.h       vr12,     vr10,     vr14         // t48a
    vsadd.h       vr15,     vr10,     vr14         // t63a
    vssub.h       vr0,      vr2,      vr5          // t55
    vsadd.h       vr1,      vr2,      vr5          // t56

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13
    vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4
    vssrarni.h.w  vr13,     vr8,      12           // t40a
    vssrarni.h.w  vr4,      vr3,      12           // t55a
    vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10
    vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14
    vssrarni.h.w  vr10,     vr8,      12           // t47
    vssrarni.h.w  vr14,     vr3,      12           // t48

    // t32a t39 t40a t47  t48  t55a t56 t63a
    // vr9  vr6 vr13 vr10 vr14 vr4  vr1 vr15
    vst           vr9,      t5,       0            // t32a
    vst           vr6,      t4,       0            // t39
    vst           vr13,     t5,       16*8         // t40a
    vst           vr10,     t4,       16*8         // t47
    vst           vr14,     t5,       16*16        // t48
    vst           vr4,      t4,       16*16        // t55a
    vst           vr1,      t5,       16*24        // t56
    vst           vr15,     t4,       16*24        // t63a
.endm // dct64_step2_lsx
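
// dct64_step2_lsx combines mirrored octets in place: per the offsets above,
// t5 walks forward from t32a while t4 walks backward from t63a, so each
// call folds one t32/t39/t40/t47/t48/t55/t56/t63 lane group and writes the
// results back to the same storage.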

.macro dct64_step3_lsx
    //                t0   t1   t2   t3   t4    t5    t6    t7
    vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17

    vld           vr9,      t5,       16*24    // t56
    vld           vr6,      t5,       16*24+16 // t57a
    vld           vr13,     t5,       16*24+32 // t58
    vld           vr10,     t5,       16*24+48 // t59a
    vld           vr14,     t4,       16*24-48 // t60
    vld           vr4,      t4,       16*24-32 // t61a
    vld           vr1,      t4,       16*24-16 // t62
    vld           vr15,     t4,       16*24    // t63a

    vsadd.h       vr20,     vr2,      vr15     // c[0]
    vssub.h       vr21,     vr2,      vr15     // c[63]
    vsadd.h       vr22,     vr3,      vr1      // c[1]
    vssub.h       vr23,     vr3,      vr1      // c[62]
    vsadd.h       vr24,     vr7,      vr4      // c[2]
    vssub.h       vr25,     vr7,      vr4      // c[61]
    vsadd.h       vr26,     vr8,      vr14     // c[3]
    vssub.h       vr27,     vr8,      vr14     // c[60]

    vsadd.h       vr28,     vr11,     vr10     // c[4]
    vssub.h       vr29,     vr11,     vr10     // c[59]
    vsadd.h       vr30,     vr12,     vr13     // c[5]
    vssub.h       vr31,     vr12,     vr13     // c[58]
    vsadd.h       vr2,      vr16,     vr6      // c[6]
    vssub.h       vr15,     vr16,     vr6      // c[57]
    vsadd.h       vr1,      vr17,     vr9      // c[7]
    vssub.h       vr3,      vr17,     vr9      // c[56]
.endm // dct64_step3_lsx
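
// dct64_step3_lsx pairs the even half (eight rows loaded from t3) with the
// finished t56..t63 values to form c[0..7] and c[56..63] in registers,
// leaving them for step4 (store path) or step5 (add-to-dst path).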

.macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1

    dct64_step3_lsx

.ifnb \transpose8x8
    LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
                       vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
                       vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13

    LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
                       vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
                       vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
.endif

.ifnb \shift
.irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
     vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
     vsrari.h     \i,       \i,       \shift
.endr
.endif

    vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1

    vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

.endm // dct64_step4_lsx
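
// step4 = step3 plus an optional 8x8 transpose and an optional rounding
// shift, followed by two vst_x8 stores through t7: one for the c[0..7]
// block at \start0 and one for the mirrored c[56..63] block at \start1.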

.macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7

    fld.d         f4,       t0,       0
    fldx.d        f5,       t0,       a1
    fld.d         f6,       t6,       0
    fldx.d        f7,       t6,       a1
    alsl.d        t0,       a1,       t0,    2
    alsl.d        t6,       a1,       t6,    2
    fld.d         f8,       t0,       0
    fldx.d        f9,       t0,       a1
    fld.d         f10,      t6,       0
    fldx.d        f11,      t6,       a1

.irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11
    vsllwil.hu.bu   \i,      \i,       0
.endr

    vsrari.h      vr20,     \in0,     4
    vsrari.h      vr22,     \in1,     4
    vsrari.h      vr24,     \in2,     4
    vsrari.h      vr26,     \in3,     4
    vsrari.h      vr28,     \in4,     4
    vsrari.h      vr30,     \in5,     4
    vsrari.h      vr2,      \in6,     4
    vsrari.h      vr1,      \in7,     4

    vadd.h        vr4,      vr4,      vr20
    vadd.h        vr5,      vr5,      vr22
    vadd.h        vr6,      vr6,      vr24
    vadd.h        vr7,      vr7,      vr26
    vadd.h        vr8,      vr8,      vr28
    vadd.h        vr9,      vr9,      vr30
    vadd.h        vr10,     vr10,     vr2
    vadd.h        vr11,     vr11,     vr1

    vssrani.bu.h  vr5,      vr4,      0
    vssrani.bu.h  vr7,      vr6,      0
    vssrani.bu.h  vr9,      vr8,      0
    vssrani.bu.h  vr11,     vr10,     0

    vstelm.d      vr5,      t1,       0,     0
    vstelm.d      vr5,      t2,       0,     1

    alsl.d        t1,       a1,       t1,    1
    alsl.d        t2,       a1,       t2,    1
    vstelm.d      vr7,      t1,       0,     0
    vstelm.d      vr7,      t2,       0,     1

    alsl.d        t1,       a1,       t1,    1
    alsl.d        t2,       a1,       t2,    1
    vstelm.d      vr9,      t1,       0,     0
    vstelm.d      vr9,      t2,       0,     1

    alsl.d        t1,       a1,       t1,    1
    alsl.d        t2,       a1,       t2,    1
    vstelm.d      vr11,     t1,       0,     0
    vstelm.d      vr11,     t2,       0,     1
.endm // dct64_step5_lsx
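
// step5 is the final reconstruction: load eight 8-pixel rows (fld.d/fldx.d
// pairs off t0/t6), widen to u16, add round2(coef, 4), saturating-narrow,
// and store each 8-byte half row via vstelm.d through the t1/t2 pointers.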

.macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1
    vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    dct_8x16_tx64_core_lsx

    vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24

    vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7

    la.local      t0,       idct_coeffs

    vldrepl.w     vr20,     t0,       64           // 201
    vldrepl.w     vr21,     t0,       68           // 4091
    vsllwil.w.h   vr22,     vr0,      0
    vexth.w.h     vr23,     vr0
    vmul.w        vr8,      vr22,     vr21
    vmul.w        vr9,      vr23,     vr21
    vmul.w        vr0,      vr22,     vr20
    vmul.w        vr10,     vr23,     vr20
    vssrarni.h.w  vr9,      vr8,      12           // t31a
    vssrarni.h.w  vr10,     vr0,      12           // t16a

    vldrepl.w     vr20,     t0,       72           // 3035
    vldrepl.w     vr21,     t0,       76           // 2751
    vsllwil.w.h   vr22,     vr7,      0
    vexth.w.h     vr23,     vr7
    vneg.w        vr21,     vr21
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr0,      vr23,     vr20
    vmul.w        vr7,      vr22,     vr21
    vmul.w        vr30,     vr23,     vr21
    vssrarni.h.w  vr0,      vr8,      12           // t30a
    vssrarni.h.w  vr30,     vr7,      12           // t17a

    vldrepl.w     vr20,     t0,       80           // 1751
    vldrepl.w     vr21,     t0,       84           // 3703
    vsllwil.w.h   vr22,     vr4,      0
    vexth.w.h     vr23,     vr4
    vmul.w        vr8,      vr22,     vr21
    vmul.w        vr7,      vr23,     vr21
    vmul.w        vr4,      vr22,     vr20
    vmul.w        vr19,     vr23,     vr20
    vssrarni.h.w  vr7,      vr8,      12           // t29a
    vssrarni.h.w  vr19,     vr4,      12           // t18a

    vldrepl.w     vr20,     t0,       88           // 3857
    vldrepl.w     vr21,     t0,       92           // 1380
    vsllwil.w.h   vr22,     vr3,      0
    vexth.w.h     vr23,     vr3
    vneg.w        vr21,     vr21
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr4,      vr23,     vr20
    vmul.w        vr3,      vr22,     vr21
    vmul.w        vr26,     vr23,     vr21
    vssrarni.h.w  vr4,      vr8,      12           // t28a
    vssrarni.h.w  vr26,     vr3,      12           // t19a

    vldrepl.w     vr20,     t0,       96           // 995
    vldrepl.w     vr21,     t0,       100          // 3973
    vsllwil.w.h   vr22,     vr2,      0
    vexth.w.h     vr23,     vr2
    vmul.w        vr8,      vr22,     vr21
    vmul.w        vr3,      vr23,     vr21
    vmul.w        vr2,      vr22,     vr20
    vmul.w        vr27,     vr23,     vr20
    vssrarni.h.w  vr3,      vr8,      12           // t27a
    vssrarni.h.w  vr27,     vr2,      12           // t20a

    vldrepl.w     vr20,     t0,       104          // 3513
    vldrepl.w     vr21,     t0,       108          // 2106
    vsllwil.w.h   vr22,     vr5,      0
    vexth.w.h     vr23,     vr5
    vneg.w        vr21,     vr21
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr2,      vr23,     vr20
    vmul.w        vr5,      vr22,     vr21
    vmul.w        vr28,     vr23,     vr21
    vssrarni.h.w  vr2,      vr8,      12           // t26a
    vssrarni.h.w  vr28,     vr5,      12           // t21a

    vldrepl.w     vr20,     t0,       112          // 2440 -> 1220
    vldrepl.w     vr21,     t0,       116          // 3290 -> 1645
    vsllwil.w.h   vr22,     vr6,      0
    vexth.w.h     vr23,     vr6
    vmul.w        vr8,      vr22,     vr21
    vmul.w        vr5,      vr23,     vr21
    vmul.w        vr6,      vr22,     vr20
    vmul.w        vr25,     vr23,     vr20
    vssrarni.h.w  vr5,      vr8,      12           // t25a
    vssrarni.h.w  vr25,     vr6,      12           // t22a

    vldrepl.w     vr20,     t0,       120          // 4052
    vldrepl.w     vr21,     t0,       124          // 601
    vsllwil.w.h   vr22,     vr1,      0
    vexth.w.h     vr23,     vr1
    vneg.w        vr21,     vr21
    vmul.w        vr8,      vr22,     vr20
    vmul.w        vr6,      vr23,     vr20
    vmul.w        vr1,      vr22,     vr21
    vmul.w        vr24,     vr23,     vr21
    vssrarni.h.w  vr6,      vr8,      12           // t24a
    vssrarni.h.w  vr24,     vr1,      12           // t23a

    vsadd.h       vr1,      vr10,     vr30         // t16
    vssub.h       vr29,     vr10,     vr30         // t17
    vssub.h       vr8,      vr26,     vr19         // t18
    vsadd.h       vr31,     vr26,     vr19         // t19
    vsadd.h       vr10,     vr27,     vr28         // t20
    vssub.h       vr30,     vr27,     vr28         // t21
    vssub.h       vr19,     vr24,     vr25         // t22
    vsadd.h       vr26,     vr24,     vr25         // t23
    vsadd.h       vr27,     vr6,      vr5          // t24
    vssub.h       vr28,     vr6,      vr5          // t25
    vssub.h       vr24,     vr3,      vr2          // t26
    vsadd.h       vr25,     vr3,      vr2          // t27
    vsadd.h       vr5,      vr4,      vr7          // t28
    vssub.h       vr6,      vr4,      vr7          // t29
    vssub.h       vr2,      vr9,      vr0          // t30
    vsadd.h       vr3,      vr9,      vr0          // t31

    vldrepl.w     vr20,     t0,       16           // 799
    vldrepl.w     vr21,     t0,       20           // 4017
    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
    vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
    vssrarni.h.w  vr7,      vr4,      12           // t30a
    vssrarni.h.w  vr0,      vr11,     12           // t17a
    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
    vneg.w        vr4,      vr4
    vneg.w        vr9,      vr9
    vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
    vssrarni.h.w  vr9,      vr4,      12           // t18a
    vssrarni.h.w  vr2,      vr11,     12           // t29a

    vldrepl.w     vr20,     t0,       24           // 3406 -> 1703
    vldrepl.w     vr21,     t0,       28           // 2276 -> 1138
    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
    vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
    vssrarni.h.w  vr29,     vr4,      12           // t26a
    vssrarni.h.w  vr6,      vr11,     12           // t21a

    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
    vneg.w        vr4,      vr4
    vneg.w        vr8,      vr8
    vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr8,      vr4,      12           // t22a
    vssrarni.h.w  vr24,     vr11,     12           // t25a

    vsadd.h       vr4,      vr1,      vr31         // t16a
    vssub.h       vr30,     vr1,      vr31         // t19a
    vsadd.h       vr19,     vr0,      vr9          // t17
    vssub.h       vr28,     vr0,      vr9          // t18
    vssub.h       vr1,      vr26,     vr10         // t20a
    vsadd.h       vr31,     vr26,     vr10         // t23a
    vssub.h       vr0,      vr8,      vr6          // t21
    vsadd.h       vr9,      vr8,      vr6          // t22
    vsadd.h       vr10,     vr27,     vr25         // t24a
    vssub.h       vr26,     vr27,     vr25         // t27a
    vsadd.h       vr6,      vr24,     vr29         // t25
    vssub.h       vr8,      vr24,     vr29         // t26
    vssub.h       vr25,     vr3,      vr5          // t28a
    vsadd.h       vr27,     vr3,      vr5          // t31a
    vssub.h       vr24,     vr7,      vr2          // t29
    vsadd.h       vr29,     vr7,      vr2          // t30

    vldrepl.w     vr20,     t0,       8            // 1567
    vldrepl.w     vr21,     t0,       12           // 3784
    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
    vssrarni.h.w  vr5,      vr3,      12           // t29a
    vssrarni.h.w  vr2,      vr11,     12           // t18a

    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
    vssrarni.h.w  vr7,      vr3,      12           // t28
    vssrarni.h.w  vr24,     vr11,     12           // t19

    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
    vneg.w        vr3,      vr3
    vneg.w        vr28,     vr28
    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
    vssrarni.h.w  vr28,     vr3,      12           // t20
    vssrarni.h.w  vr25,     vr11,     12           // t27

    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
    vneg.w        vr3,      vr3
    vneg.w        vr30,     vr30
    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
    vssrarni.h.w  vr30,     vr3,      12           // t21a
    vssrarni.h.w  vr1,      vr11,     12           // t26a

    vsadd.h       vr3,      vr4,      vr31         // t16
    vssub.h       vr26,     vr4,      vr31         // t23
    vsadd.h       vr0,      vr19,     vr9          // t17a
    vssub.h       vr8,      vr19,     vr9          // t22a
    vsadd.h       vr4,      vr2,      vr30         // t18
    vssub.h       vr31,     vr2,      vr30         // t21
    vsadd.h       vr9,      vr24,     vr28         // t19a
    vssub.h       vr19,     vr24,     vr28         // t20a
    vssub.h       vr2,      vr27,     vr10         // t24
    vsadd.h       vr30,     vr27,     vr10         // t31
    vssub.h       vr24,     vr29,     vr6          // t25a
    vsadd.h       vr28,     vr29,     vr6          // t30a
    vssub.h       vr10,     vr5,      vr1          // t26
    vsadd.h       vr27,     vr5,      vr1          // t29
    vssub.h       vr6,      vr7,      vr25         // t27a
    vsadd.h       vr29,     vr7,      vr25         // t28a

    vldrepl.w     vr20,     t0,       0            // 2896
    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
    vssrarni.h.w  vr5,      vr1,      12           // t20
    vssrarni.h.w  vr7,      vr11,     12           // t27

    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
    vssrarni.h.w  vr25,     vr1,      12           // t21a
    vssrarni.h.w  vr6,      vr11,     12           // t26a

    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
    vssrarni.h.w  vr19,     vr1,      12           // t22
    vssrarni.h.w  vr10,     vr11,     12           // t25

    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
    vssrarni.h.w  vr31,     vr1,      12           // t23a
    vssrarni.h.w  vr8,      vr11,     12           // t24a

    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3

    vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr30         // c[0]
    vssub.h       vr2,      vr11,     vr30         // c[31]
    vsadd.h       vr24,     vr12,     vr28         // c[1]
    vssub.h       vr26,     vr12,     vr28         // c[30]
    vsadd.h       vr11,     vr13,     vr27         // c[2]
    vssub.h       vr30,     vr13,     vr27         // c[29]
    vsadd.h       vr12,     vr14,     vr29         // c[3]
    vssub.h       vr28,     vr14,     vr29         // c[28]
    vsadd.h       vr13,     vr15,     vr7          // c[4]
    vssub.h       vr27,     vr15,     vr7          // c[27]
    vsadd.h       vr14,     vr16,     vr6          // c[5]
    vssub.h       vr29,     vr16,     vr6          // c[26]
    vsadd.h       vr7,      vr17,     vr10         // c[6]
    vssub.h       vr15,     vr17,     vr10         // c[25]
    vsadd.h       vr6,      vr18,     vr8          // c[7]
    vssub.h       vr16,     vr18,     vr8          // c[24]

    vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

    vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2

    vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18

    vsadd.h       vr1,      vr11,     vr31         // c[8]
    vssub.h       vr2,      vr11,     vr31         // c[23]
    vsadd.h       vr24,     vr12,     vr19         // c[9]
    vssub.h       vr26,     vr12,     vr19         // c[22]
    vsadd.h       vr11,     vr13,     vr25         // c[10]
    vssub.h       vr30,     vr13,     vr25         // c[21]
    vsadd.h       vr12,     vr14,     vr5          // c[11]
    vssub.h       vr28,     vr14,     vr5          // c[20]
    vsadd.h       vr13,     vr15,     vr9          // c[12]
    vssub.h       vr27,     vr15,     vr9          // c[19]
    vsadd.h       vr14,     vr16,     vr4          // c[13]
    vssub.h       vr29,     vr16,     vr4          // c[18]
    vsadd.h       vr7,      vr17,     vr0          // c[14]
    vssub.h       vr15,     vr17,     vr0          // c[17]
    vsadd.h       vr6,      vr18,     vr3          // c[15]
    vssub.h       vr16,     vr18,     vr3          // c[16]

    vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6

    vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
.endm // dct_8x32_tx64_new_lsx
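
// dct_8x32_tx64_new_lsx: the 32-point analogue of the tx64 cores above.
// Because inputs 32..63 are zero, the first sixteen rotations are
// single-input multiplies; the results are merged with the 8x16 half
// parked at t3, and the 32 outputs are written back around t3 with the
// upper halves in reversed (mirrored) order.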

function inv_txfm_add_dct_dct_64x64_8bpc_lsx
    bnez          a3,       .NO_HAS_DCONLY_64x64

    ld.h          t2,       a2,       0
    vldi          vr0,      0x8b5
    vreplgr2vr.w  vr1,      t2
    vldi          vr20,     0x880
    vmul.w        vr2,      vr0,      vr1
    st.h          zero,     a2,       0
    vsrari.w      vr2,      vr2,      8
    vld           vr3,      a0,       48
    vsrari.w      vr2,      vr2,      2
    vld           vr1,      a0,       16
    vmadd.w       vr20,     vr2,      vr0
    vld           vr2,      a0,       32
    vssrarni.h.w  vr20,     vr20,     12
    vld           vr0,      a0,       0
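
    // DC-only 64x64: with vr0/vr20 as vldi constant splats, the chain
    // above computes tmp = round2(round2(dc * c, 8), 2) and then the
    // replicated offset vr20 = sat16(round2(bias + tmp * c, 12)),
    // interleaved with the first four dst loads to hide latency.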

    vsllwil.hu.bu vr4,      vr0,      0
    vsllwil.hu.bu vr5,      vr1,      0
    vsllwil.hu.bu vr6,      vr2,      0
    vsllwil.hu.bu vr7,      vr3,      0
    vexth.hu.bu   vr0,      vr0
    vexth.hu.bu   vr1,      vr1
    vexth.hu.bu   vr2,      vr2
    vexth.hu.bu   vr3,      vr3
    vadd.h        vr8,      vr4,      vr20
    vadd.h        vr9,      vr0,      vr20
    vadd.h        vr10,     vr5,      vr20
    vadd.h        vr11,     vr1,      vr20
    vadd.h        vr12,     vr6,      vr20
    vadd.h        vr13,     vr2,      vr20
    vadd.h        vr14,     vr7,      vr20
    vadd.h        vr15,     vr3,      vr20
    vssrani.bu.h  vr9,      vr8,      0
    vssrani.bu.h  vr11,     vr10,     0
    vssrani.bu.h  vr13,     vr12,     0
    vssrani.bu.h  vr15,     vr14,     0
    vst           vr9,      a0,       0
    vst           vr11,     a0,       16
    vst           vr13,     a0,       32
    vst           vr15,     a0,       48

.rept 63
    add.d         a0,       a0,       a1
    vld           vr0,      a0,       0
    vld           vr1,      a0,       16
    vld           vr2,      a0,       32
    vld           vr3,      a0,       48
    vsllwil.hu.bu vr4,      vr0,      0
    vsllwil.hu.bu vr5,      vr1,      0
    vsllwil.hu.bu vr6,      vr2,      0
    vsllwil.hu.bu vr7,      vr3,      0
    vexth.hu.bu   vr0,      vr0
    vexth.hu.bu   vr1,      vr1
    vexth.hu.bu   vr2,      vr2
    vexth.hu.bu   vr3,      vr3
    vadd.h        vr8,      vr4,      vr20
    vadd.h        vr9,      vr0,      vr20
    vadd.h        vr10,     vr5,      vr20
    vadd.h        vr11,     vr1,      vr20
    vadd.h        vr12,     vr6,      vr20
    vadd.h        vr13,     vr2,      vr20
    vadd.h        vr14,     vr7,      vr20
    vadd.h        vr15,     vr3,      vr20
    vssrani.bu.h  vr9,      vr8,      0
    vssrani.bu.h  vr11,     vr10,     0
    vssrani.bu.h  vr13,     vr12,     0
    vssrani.bu.h  vr15,     vr14,     0
    vst           vr9,      a0,       0
    vst           vr11,     a0,       16
    vst           vr13,     a0,       32
    vst           vr15,     a0,       48
.endr
    b             .DCT_DCT_64X64_END
.NO_HAS_DCONLY_64x64:

    malloc_space  64*32*2+512+512

    addi.d        t7,       sp,       64

.macro dct64x64_core1_lsx in0, in1, in2
    addi.d        t2,       a2,       \in0
    addi.d        t7,       t7,       \in1
    li.w          t4,       64*32*2+64
    add.d         t3,       sp,       t4
    addi.d        t6,       t3,       512
    add.d         t5,       t6,       zero

    dct_8x32_tx64_new_lsx 0, 256, 128, 256

    la.local      t0,       idct64_coeffs

    addi.d        t2,       a2,       \in2         // 32 ...
    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
    vld           vr0,      t2,       128*0        // in1
    vld           vr1,      t2,       128*15       // in31
    vld           vr2,      t2,       128*8        // in17
    vld           vr3,      t2,       128*7        // in15
    dct64_step1_lsx

    addi.d        t0,       t0,       48
    addi.d        t6,       t6,       128
    // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
    vld           vr0,      t2,       128*3        // in7
    vld           vr1,      t2,       128*12       // in25
    vld           vr2,      t2,       128*11       // in23
    vld           vr3,      t2,       128*4        // in9
    dct64_step1_lsx

    addi.d        t0,       t0,       48
    addi.d        t6,       t6,       128
    // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
    vld           vr0,      t2,       128*2        // in5
    vld           vr1,      t2,       128*13       // in27
    vld           vr2,      t2,       128*10       // in21
    vld           vr3,      t2,       128*5        // in11
    dct64_step1_lsx

    addi.d        t0,       t0,       48
    addi.d        t6,       t6,       128
    // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
    vld           vr0,      t2,       128*1        // in3
    vld           vr1,      t2,       128*14       // in29
    vld           vr2,      t2,       128*9        // in19
    vld           vr3,      t2,       128*6        // in13
    dct64_step1_lsx

    la.local      t0,       idct_coeffs
    addi.d        t4,       t5,       16*7
    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
    dct64_step2_lsx

    li.w          t4,       64*32*2+64+512
    add.d         t5,       t4,       sp
    addi.d        t4,       t5,       16*7
    dct64_step4_lsx transpose8x8, 2, 0, 128, 112, 128

    addi.d        t3,       t3,       128
    addi.d        t4,       t4,       -16*8
    addi.d        t5,       t5,       -16*8
    dct64_step4_lsx transpose8x8, 2, 16, 128, 96, 128

    addi.d        t5,       t5,       -16*8
    addi.d        t4,       t4,       -16*8
    addi.d        t3,       t3,       128
    dct64_step4_lsx transpose8x8, 2, 32, 128, 80, 128

    addi.d        t5,       t5,       -16*8
    addi.d        t4,       t4,       -16*8
    addi.d        t3,       t3,       128
    dct64_step4_lsx transpose8x8, 2, 48, 128, 64, 128
.endm
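
    // dct64x64_core1_lsx: first (column) pass over one 8-coefficient-wide
    // strip. The even half goes through dct_8x32_tx64_new_lsx, the odd half
    // through the four dct64_step1/step2 groups, then step4 transposes and
    // stores the rows (shifted by 2) into the stack buffer at t7.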

    dct64x64_core1_lsx 0, 0, 64

    dct64x64_core1_lsx 16, 128*8, 64+16

    dct64x64_core1_lsx 32, 128*8, 64+16*2

    dct64x64_core1_lsx 48, 128*8, 64+16*3

    vreplgr2vr.h  vr31,     zero
.irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024, 1040, 1056, 1072, 1088, 1104, 1120, 1136, 1152, 1168, 1184, 1200, 1216, 1232, 1248, 1264, 1280, 1296, 1312, 1328, 1344, 1360, 1376, 1392, 1408, 1424, 1440, 1456, 1472, 1488, 1504, 1520, 1536, 1552, 1568, 1584, 1600, 1616, 1632, 1648, 1664, 1680, 1696, 1712, 1728, 1744, 1760, 1776, 1792, 1808, 1824, 1840, 1856, 1872, 1888, 1904, 1920, 1936, 1952, 1968, 1984, 2000, 2016, 2032
    vst           vr31,     a2,       \i
.endr

.macro dct64x64_core2_lsx in0, in1
    addi.d        t2,       sp,       64+\in0
    addi.d        t7,       sp,       64+\in0
    li.w          t4,       64*32*2+64
    add.d         t3,       sp,       t4
    addi.d        t6,       t3,       512
    add.d         t5,       t6,       zero

    addi.d        t2, t2, 1024
    addi.d        t2, t2, 1024
    dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512

    la.local      t0,       idct64_coeffs

    addi.d        t2,       sp,       64+64*2+\in0
    addi.d        t4,       t2,       256*7
    addi.d        t4,       t4,       256

    vld           vr0,      t2,       256*0        // in1
    vld           vr1,      t4,       256*7        // in31
    vld           vr2,      t4,       256*0        // in17
    vld           vr3,      t2,       256*7        // in15
    dct64_step1_lsx

    addi.d        t0,       t0,       48
    addi.d        t6,       t6,       128
    vld           vr0,      t2,       256*3        // in7
    vld           vr1,      t4,       256*4        // in25
    vld           vr2,      t4,       256*3        // in23
    vld           vr3,      t2,       256*4        // in9
    dct64_step1_lsx

    addi.d        t0,        t0,       48
    addi.d        t6,        t6,       128
    vld           vr0,       t2,       256*2       // in5
    vld           vr1,       t4,       256*5       // in27
    vld           vr2,       t4,       256*2       // in21
    vld           vr3,       t2,       256*5       // in11
    dct64_step1_lsx

    addi.d        t0,        t0,       48
    addi.d        t6,        t6,       128
    vld           vr0,       t2,       256*1       // in3
    vld           vr1,       t4,       256*6       // in29
    vld           vr2,       t4,       256*1       // in19
    vld           vr3,       t2,       256*6       // in13
    dct64_step1_lsx

    la.local      t0,       idct_coeffs
    addi.d        t4,       t5,       16*7
    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
    dct64_step2_lsx

    addi.d        t5,       t5,       16
    addi.d        t4,       t4,       -16
    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
    dct64_step2_lsx

    li.w          t4,       64*32*2+64+512
    add.d         t5,       t4,       sp
    addi.d        t4,       t5,       16*7
    addi.d        a0,       a0,       \in1
    // 0 - 7, 56 - 63
    dct64_step3_lsx

    li.w          t8,       0
    mul.w         t0,       t8,       a1
    add.d         t0,       a0,       t0
    alsl.d        t6,       a1,       t0,      1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1

    li.w          t8,       56
    mul.w         t0,       t8,       a1
    add.d         t0,       a0,       t0
    alsl.d        t6,       a1,       t0,      1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

    // 8 - 15, 48 - 55
    addi.d        t3,       t3,       128
    addi.d        t4,       t4,       -16*8
    addi.d        t5,       t5,       -16*8
    dct64_step3_lsx

    li.w          t8,       8
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1

    li.w          t8,       48
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

    // 16 - 23, 40 - 47
    addi.d        t3,       t3,       128
    addi.d        t4,       t4,       -16*8
    addi.d        t5,       t5,       -16*8
    dct64_step3_lsx

    li.w          t8,       16
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1

    li.w          t8,       40
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21

    // 24 - 31, 32 - 39
    addi.d        t3,       t3,       128
    addi.d        t4,       t4,       -16*8
    addi.d        t5,       t5,       -16*8
    dct64_step3_lsx

    li.w          t8,       24
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1

    li.w          t8,       32
    mul.w         t0,       t8,       a1
    add.d         t0,       t0,       a0
    alsl.d        t6,       a1,       t0,     1
    addi.d        t1,       t0,       0
    add.d         t2,       t0,       a1
    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
.endm
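
    // dct64x64_core2_lsx: second (row) pass over one 16-byte slice (\in0)
    // of the stack intermediate. Instead of storing, each step3 octet is
    // added straight into the destination via dct64_step5_lsx, visiting
    // row pairs in the mirrored order 0/56, 8/48, 16/40, 24/32.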

    dct64x64_core2_lsx 16*0, 0

    dct64x64_core2_lsx 16*1, 8

    dct64x64_core2_lsx 16*2, 8

    dct64x64_core2_lsx 16*3, 8

    dct64x64_core2_lsx 16*4, 8

    dct64x64_core2_lsx 16*5, 8

    dct64x64_core2_lsx 16*6, 8

    dct64x64_core2_lsx 16*7, 8

    free_space 64*32*2+512+512
.DCT_DCT_64X64_END:
endfunc