/******************************************************************************
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2023, Nathan Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/riscv/asm.S"

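// Commentary added for readability (inferred from the callers below, not part
// of the original source): a0 = dst, a1 = dst stride, a2 = coefficient buffer,
// a3 = eob, while a4/a5 carry the first- and second-pass transform routines
// loaded by the def_fn_* macros further down.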
function inv_txfm_add_4x4_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 4, e16, mf2, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  addi t0, t0, 8
  vle16.v v2, (t0)
  addi t0, t0, 8
  vle16.v v3, (t0)

  jalr t0, a4

  vmv.v.x v4, zero

  vsseg4e16.v v0, (a2)
  vle16.v v0, (a2)
  vse16.v v4, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  vse16.v v4, (t0)
  addi t0, t0, 8
  vle16.v v2, (t0)
  vse16.v v4, (t0)
  addi t0, t0, 8
  vle16.v v3, (t0)
  vse16.v v4, (t0)

  jalr t0, a5

  vssra.vi v0, v0, 4
  vssra.vi v1, v1, 4
  vssra.vi v2, v2, 4
  vssra.vi v3, v3, 4

itx_4x4_end:
  vsetvli zero, zero, e8, mf4, ta, ma
  vle8.v v4, (a0)
  add t0, a0, a1
  vle8.v v5, (t0)
  add t0, t0, a1
  vle8.v v6, (t0)
  add t0, t0, a1
  vle8.v v7, (t0)

  vwaddu.wv v0, v0, v4
  vwaddu.wv v1, v1, v5
  vwaddu.wv v2, v2, v6
  vwaddu.wv v3, v3, v7

  vsetvli zero, zero, e16, mf2, ta, ma
  vmax.vx v0, v0, zero
  vmax.vx v1, v1, zero
  vmax.vx v2, v2, zero
  vmax.vx v3, v3, zero

  vsetvli zero, zero, e8, mf4, ta, ma

  vnclipu.wi v4, v0, 0
  vnclipu.wi v5, v1, 0
  vnclipu.wi v6, v2, 0
  vnclipu.wi v7, v3, 0

  vse8.v v4, (a0)
  add a0, a0, a1
  vse8.v v5, (a0)
  add a0, a0, a1
  vse8.v v6, (a0)
  add a0, a0, a1
  vse8.v v7, (a0)

  ret
endfunc

function inv_identity_e16_x4_rvv, export=1, ext=v
  li t1, (5793-4096)*8
  vsmul.vx v4, v0, t1
  vsmul.vx v5, v1, t1
  vsmul.vx v6, v2, t1
  vsmul.vx v7, v3, t1

  vsadd.vv v0, v0, v4
  vsadd.vv v1, v1, v5
  vsadd.vv v2, v2, v6
  vsadd.vv v3, v3, v7

  jr t0
endfunc

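// Added note: iwht_4 is the 4-point inverse Walsh-Hadamard transform used by
// the lossless wht_wht 4x4 path below; it only needs adds, subtracts and one
// arithmetic >>1, so no rounding constants are involved.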
.macro iwht_4
  vadd.vv v0, v0, v1
  vsub.vv v5, v2, v3
  vsub.vv v4, v0, v5
  vsra.vi v4, v4, 1
  vsub.vv v2, v4, v1
  vsub.vv v1, v4, v3
  vadd.vv v3, v5, v2
  vsub.vv v0, v0, v1
.endm

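// Added note: 4-point inverse DCT. The even half scales (o0 +/- o2) by
// 2896/4096 (~1/sqrt(2)); the odd half rotates (o1, o3) by the pair
// (1567, 3784)/4096. Intermediates are widened to 32 bit, rounded by adding
// 2048 and narrowed back with >>12.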
.macro idct_4 o0, o1, o2, o3
  li t1, 2896
  li t2, 1567
  li t3, 3784

  vwmul.vx v16, \o0, t1
  vwmul.vx v18, \o0, t1
  vwmacc.vx v16, t1, \o2
  neg t1, t1
  vwmacc.vx v18, t1, \o2

  vwmul.vx v20, \o1, t3
  neg t3, t3
  vwmul.vx v22, \o1, t2
  vwmacc.vx v20, t2, \o3
  vwmacc.vx v22, t3, \o3

  li t1, 2048

  vwadd.wx v16, v16, t1
  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1

  vnsra.wi v16, v16, 12
  vnsra.wi v18, v18, 12
  vnsra.wi v20, v20, 12
  vnsra.wi v22, v22, 12

  vsadd.vv \o0, v16, v20
  vsadd.vv \o1, v18, v22
  vssub.vv \o2, v18, v22
  vssub.vv \o3, v16, v20
.endm

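// Added note: 4-point inverse ADST. \lm2/\lm select the LMUL used for the
// widened 32-bit stage and for the 16-bit results, so the same body serves
// both the mf2 variants and the m1 (x4w) variants defined below.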
.macro iadst_4 o0, o1, o2, o3, lm2, lm
  li t1, 1321
  li t2, 3803
  li t3, 2482

  vwmul.vx v16, v0, t1
  vwmul.vx v18, v0, t3
  neg t1, t1
  vwmacc.vx v16, t2, v2
  vwmacc.vx v18, t1, v2
  neg t2, t2
  vwmacc.vx v16, t3, v3
  vwmacc.vx v18, t2, v3

  vwsub.vv v20,  v0, v2
  vwadd.wv v20, v20, v3

  li t1, 3344
  vwmul.vx v22, v1, t1

  vsetvli zero, zero, e32, \lm2, ta, ma

  vmul.vx v20, v20, t1

  vadd.vv v24, v16, v18
  vadd.vv v16, v16, v22
  vadd.vv v18, v18, v22
  vsub.vv v22, v24, v22

  li t1, 2048

  vadd.vx v16, v16, t1
  vadd.vx v18, v18, t1
  vadd.vx v20, v20, t1
  vadd.vx v22, v22, t1

  vsetvli zero, zero, e16, \lm, ta, ma

  vnsra.wi \o0, v16, 12
  vnsra.wi \o1, v18, 12
  vnsra.wi \o2, v20, 12
  vnsra.wi \o3, v22, 12
.endm

function inv_dct_e16_x4_rvv, export=1, ext=v
  idct_4 v0, v1, v2, v3
  jr t0
endfunc

function inv_adst_e16_x4_rvv, export=1, ext=v
  iadst_4 v0, v1, v2, v3, m1, mf2
  jr t0
endfunc

function inv_flipadst_e16_x4_rvv, export=1, ext=v
  iadst_4 v3, v2, v1, v0, m1, mf2
  jr t0
endfunc

function inv_adst_e16_x4w_rvv, export=1, ext=v
  iadst_4 v0, v1, v2, v3, m2, m1
  jr t0
endfunc

function inv_flipadst_e16_x4w_rvv, export=1, ext=v
  iadst_4 v3, v2, v1, v0, m2, m1
  jr t0
endfunc

function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 4, e16, mf2, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  addi t0, t0, 8
  vle16.v v2, (t0)
  addi t0, t0, 8
  vle16.v v3, (t0)

  vsra.vi v0, v0, 2
  vsra.vi v1, v1, 2
  vsra.vi v2, v2, 2
  vsra.vi v3, v3, 2

  iwht_4

  vmv.v.x v4, zero

  vsseg4e16.v v0, (a2)
  vle16.v v0, (a2)
  vse16.v v4, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  vse16.v v4, (t0)
  addi t0, t0, 8
  vle16.v v2, (t0)
  vse16.v v4, (t0)
  addi t0, t0, 8
  vle16.v v3, (t0)
  vse16.v v4, (t0)

  iwht_4

  j itx_4x4_end
endfunc

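// Added note: def_fn_4x4 emits one public entry point per transform pair. For
// dct_dct, a zero eob (a3) appears to take the DC-only shortcut below, which
// broadcasts the single scaled coefficient instead of running both passes.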
.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
  beqz a3, 1f
.endif
  la a4, inv_\txfm1\()_e16_x4_rvv
  la a5, inv_\txfm2\()_e16_x4_rvv
  j inv_txfm_add_4x4_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
  csrw vxrm, zero
  vsetivli zero, 4, e16, mf2, ta, ma
  ld t2, (a2)
  li t1, 2896*8
  vmv.v.x v0, t2
  vsmul.vx v0, v0, t1
  sd x0, (a2)
  vsmul.vx v0, v0, t1
  vssra.vi v0, v0, 4
  vmv.v.v v1, v0
  vmv.v.v v2, v0
  vmv.v.v v3, v0
  j itx_4x4_end
.endif
endfunc
.endm

def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct
def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst

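// Added note: def_fn_8x8_base generates two 8x8 drivers. The identity_ variant
// can jump straight to the shared epilog because the x2 identity scaling and
// the intermediate >>1 would cancel (see the comment inside the macro).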
.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 16
  vle16.v v1, (t0)
  addi t0, t0, 16
  vle16.v v2, (t0)
  addi t0, t0, 16
  vle16.v v3, (t0)
  addi t0, t0, 16
  vle16.v v4, (t0)
  addi t0, t0, 16
  vle16.v v5, (t0)
  addi t0, t0, 16
  vle16.v v6, (t0)
  addi t0, t0, 16
  vle16.v v7, (t0)

.ifc \variant, identity_
  // The identity vsadd.vv and downshift vssra.vi 1 cancel out

  j L(itx_8x8_epilog)
.else
  jalr t0, a4

  vssra.vi v0, v0, 1
  vssra.vi v1, v1, 1
  vssra.vi v2, v2, 1
  vssra.vi v3, v3, 1
  vssra.vi v4, v4, 1
  vssra.vi v5, v5, 1
  vssra.vi v6, v6, 1
  vssra.vi v7, v7, 1

L(itx_8x8_epilog):
  vsseg8e16.v v0, (a2)
  vle16.v v0, (a2)
  addi t0, a2, 16
  vle16.v v1, (t0)
  addi t0, t0, 16
  vle16.v v2, (t0)
  addi t0, t0, 16
  vle16.v v3, (t0)
  addi t0, t0, 16
  vle16.v v4, (t0)
  addi t0, t0, 16
  vle16.v v5, (t0)
  addi t0, t0, 16
  vle16.v v6, (t0)
  addi t0, t0, 16
  vle16.v v7, (t0)

  jalr t0, a5

  vssra.vi v0, v0, 4
  vssra.vi v1, v1, 4
  vssra.vi v2, v2, 4
  vssra.vi v3, v3, 4
  vssra.vi v4, v4, 4
  vssra.vi v5, v5, 4
  vssra.vi v6, v6, 4
  vssra.vi v7, v7, 4

  li t1, 64
  vsetvli zero, t1, e16, m8, ta, ma
  vmv.v.x v8, zero
  vse16.v v8, (a2)

itx_8x8_end:
  vsetivli zero, 8, e8, mf2, ta, ma
  vle8.v v8, (a0)
  add t0, a0, a1
  vle8.v v9, (t0)
  add t0, t0, a1
  vle8.v v10, (t0)
  add t0, t0, a1
  vle8.v v11, (t0)
  add t0, t0, a1
  vle8.v v12, (t0)
  add t0, t0, a1
  vle8.v v13, (t0)
  add t0, t0, a1
  vle8.v v14, (t0)
  add t0, t0, a1
  vle8.v v15, (t0)

  vwaddu.wv v0, v0, v8
  vwaddu.wv v1, v1, v9
  vwaddu.wv v2, v2, v10
  vwaddu.wv v3, v3, v11
  vwaddu.wv v4, v4, v12
  vwaddu.wv v5, v5, v13
  vwaddu.wv v6, v6, v14
  vwaddu.wv v7, v7, v15

  vsetvli zero, zero, e16, m1, ta, ma
  vmax.vx v0, v0, zero
  vmax.vx v1, v1, zero
  vmax.vx v2, v2, zero
  vmax.vx v3, v3, zero
  vmax.vx v4, v4, zero
  vmax.vx v5, v5, zero
  vmax.vx v6, v6, zero
  vmax.vx v7, v7, zero

  vsetvli zero, zero, e8, mf2, ta, ma

  vnclipu.wi v8, v0, 0
  vnclipu.wi v9, v1, 0
  vnclipu.wi v10, v2, 0
  vnclipu.wi v11, v3, 0
  vnclipu.wi v12, v4, 0
  vnclipu.wi v13, v5, 0
  vnclipu.wi v14, v6, 0
  vnclipu.wi v15, v7, 0

  vse8.v v8, (a0)
  add a0, a0, a1
  vse8.v v9, (a0)
  add a0, a0, a1
  vse8.v v10, (a0)
  add a0, a0, a1
  vse8.v v11, (a0)
  add a0, a0, a1
  vse8.v v12, (a0)
  add a0, a0, a1
  vse8.v v13, (a0)
  add a0, a0, a1
  vse8.v v14, (a0)
  add a0, a0, a1
  vse8.v v15, (a0)

  ret
.endif
endfunc
.endm

def_fn_8x8_base identity_
def_fn_8x8_base

function inv_identity_e16_x8_rvv, export=1, ext=v
  vsadd.vv v0, v0, v0
  vsadd.vv v1, v1, v1
  vsadd.vv v2, v2, v2
  vsadd.vv v3, v3, v3
  vsadd.vv v4, v4, v4
  vsadd.vv v5, v5, v5
  vsadd.vv v6, v6, v6
  vsadd.vv v7, v7, v7

  jr t0
endfunc

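// Added note: 8-point inverse DCT. The even outputs reuse idct_4; the odd half
// applies the rotation pairs (799, 4017)/4096 and (3406, 2276)/4096 plus a
// final 2896/4096 butterfly, with the same +2048, >>12 rounding as above.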
.macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7
  idct_4 \o0, \o2, \o4, \o6

  li t1, 799
  li t2, 4017
  li t3, 3406
  li t4, 2276

  vwmul.vx v22, \o1, t2
  neg t2, t2
  vwmul.vx v16, \o1, t1
  vwmacc.vx v22, t1, \o7
  vwmacc.vx v16, t2, \o7

  vwmul.vx v20, \o5, t4
  neg t4, t4
  vwmul.vx v18, \o5, t3
  vwmacc.vx v20, t3, \o3
  vwmacc.vx v18, t4, \o3

  li t1, 2048

  vwadd.wx v16, v16, t1
  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1

  vnsra.wi v16, v16, 12
  vnsra.wi v18, v18, 12
  vnsra.wi v20, v20, 12
  vnsra.wi v22, v22, 12

  vssub.vv \o7, v22, v20
  vsadd.vv v22, v22, v20
  vssub.vv \o1, v16, v18
  vsadd.vv v16, v16, v18

  li t2, 2896

  vwmul.vx v18, \o7, t2
  vwmul.vx v20, \o7, t2
  vwmacc.vx v20, t2, \o1
  neg t2, t2
  vwmacc.vx v18, t2, \o1

  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1

  vnsra.wi v18, v18, 12
  vnsra.wi v20, v20, 12

  vssub.vv \o7, \o0, v22
  vsadd.vv \o0, \o0, v22
  vssub.vv v17, \o2, v20
  vsadd.vv \o1, \o2, v20
  vssub.vv \o5, \o4, v18
  vsadd.vv \o2, \o4, v18
  vssub.vv \o4, \o6, v16
  vsadd.vv \o3, \o6, v16
  vmv.v.v \o6, v17
.endm

.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
  li t1, 4076
  li t2, 401
  li t3, 3612
  li t4, 1931
  li t5, 2598
  li t6, 3166

  vwmul.vx v16, v7, t1
  neg t1, t1
  vwmul.vx v18, v7, t2
  vwmacc.vx v16, t2, v0
  vwmacc.vx v18, t1, v0

  vwmul.vx v20, v5, t3
  neg t3, t3
  vwmul.vx v22, v5, t4
  vwmacc.vx v20, t4, v2
  vwmacc.vx v22, t3, v2

  vwmul.vx v24, v3, t5
  neg t5, t5
  vwmul.vx v26, v3, t6
  vwmacc.vx v24, t6, v4
  vwmacc.vx v26, t5, v4

  li t1, 2048
  li t2, 1189
  li t3, 3920
  li t4, 1567
  li t5, 3784
  li t6, 2896

  vwmul.vx v28, v1, t2
  neg t2, t2
  vwmul.vx v30, v1, t3
  vwmacc.vx v28, t3, v6
  vwmacc.vx v30, t2, v6

  vwadd.wx v16, v16, t1
  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1
  vwadd.wx v24, v24, t1
  vwadd.wx v26, v26, t1
  vwadd.wx v28, v28, t1
  vwadd.wx v30, v30, t1

  vnsra.wi v16, v16, 12
  vnsra.wi v18, v18, 12
  vnsra.wi v20, v20, 12
  vnsra.wi v22, v22, 12
  vnsra.wi v24, v24, 12
  vnsra.wi v26, v26, 12
  vnsra.wi v28, v28, 12
  vnsra.wi v30, v30, 12

  vssub.vv  v4, v16, v24
  vsadd.vv v16, v16, v24
  vsadd.vv  v1, v18, v26
  vsadd.vv  v2, v20, v28
  vsadd.vv  v3, v22, v30
  vssub.vv  v5, v18, v26
  vssub.vv  v6, v20, v28
  vssub.vv v30, v22, v30

  vsadd.vv \o0, v16, v2
  vsadd.vv \o7,  v1, v3
  vssub.vv  v2, v16, v2
  vssub.vv  v3,  v1, v3

  vwmul.vx v16,  v4, t5
  vwmul.vx v18,  v4, t4
  vwmul.vx v20, v30, t5
  vwmul.vx v22, v30, t4
  vwmacc.vx v16, t4, v5
  neg t4, t4
  vwmacc.vx v22, t5, v6
  neg t5, t5
  vwmacc.vx v20, t4, v6
  vwmacc.vx v18, t5, v5

  vwadd.wx v16, v16, t1
  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1

  vnsra.wi v16, v16, 12
  vnsra.wi v18, v18, 12
  vnsra.wi v20, v20, 12
  vnsra.wi v22, v22, 12

  vsadd.vv \o1, v16, v20
  vsadd.vv \o6, v18, v22
  vssub.vv v16, v16, v20
  vssub.vv v17, v18, v22

  vwmul.vx v18, v2, t6
  vwmul.vx v20, v2, t6
  vwmul.vx v22, v16, t6
  vwmul.vx v24, v16, t6
  vwmacc.vx v18, t6, v3
  vwmacc.vx v22, t6, v17
  neg t6, t6
  vwmacc.vx v20, t6, v3
  vwmacc.vx v24, t6, v17

  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1
  vwadd.wx v24, v24, t1

  vnsra.wi \o3, v18, 12
  vnsra.wi \o4, v20, 12
  vnsra.wi \o2, v22, 12
  vnsra.wi \o5, v24, 12

  vmv.v.x v16, zero
  vssub.vv \o1, v16, \o1
  vssub.vv \o3, v16, \o3
  vssub.vv \o5, v16, \o5
  vssub.vv \o7, v16, \o7
.endm

function inv_dct_e16_x8_rvv, export=1, ext=v
  idct_8 v0, v1, v2, v3, v4, v5, v6, v7
  jr t0
endfunc

function inv_adst_e16_x8_rvv, export=1, ext=v
  iadst_8 v0, v1, v2, v3, v4, v5, v6, v7
  jr t0
endfunc

function inv_flipadst_e16_x8_rvv, export=1, ext=v
  iadst_8 v7, v6, v5, v4, v3, v2, v1, v0
  jr t0
endfunc

.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
  beqz a3, 1f
.endif
  la a5, inv_\txfm2\()_e16_x8_rvv
.ifc \txfm1, identity
  j inv_txfm_identity_add_8x8_rvv
.else
  la a4, inv_\txfm1\()_e16_x8_rvv
  j inv_txfm_add_8x8_rvv
.endif
.ifc \txfm1\()_\txfm2, dct_dct
1:
  csrw vxrm, zero
  vsetivli zero, 8, e16, m1, ta, ma
  ld t2, (a2)
  li t1, 2896*8
  vmv.v.x v0, t2
  vsmul.vx v0, v0, t1
  sd x0, (a2)
  vssra.vi v0, v0, 1
  vsmul.vx v0, v0, t1
  vssra.vi v0, v0, 4
  vmv.v.v v1, v0
  vmv.v.v v2, v0
  vmv.v.v v3, v0
  vmv.v.v v4, v0
  vmv.v.v v5, v0
  vmv.v.v v6, v0
  vmv.v.v v7, v0
  j itx_8x8_end
.endif
endfunc
.endm

def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst

function inv_txfm_add_4x8_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 16
  vle16.v v1, (t0)
  addi t0, t0, 16
  vle16.v v2, (t0)
  addi t0, t0, 16
  vle16.v v3, (t0)

  li t1, 2896*8
.irp i, 0, 1, 2, 3
  vsmul.vx v\i, v\i, t1
.endr

  jalr t0, a4

  vsseg4e16.v v0, (a2)

  vsetivli zero, 4, e16, mf2, ta, ma
  vmv.v.x v8, zero
  vle16.v v0, (a2)
  vse16.v v8, (a2)
.irp i, 1, 2, 3, 4, 5, 6, 7
  addi a2, a2, 8
  vle16.v v\i, (a2)
  vse16.v v8, (a2)
.endr

  jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vssra.vi v\i, v\i, 4
.endr

  vsetvli zero, zero, e8, mf4, ta, ma
  vle8.v v8, (a0)
  add t0, a0, a1
  vle8.v v9, (t0)
.irp i, 10, 11, 12, 13, 14, 15
  add t0, t0, a1
  vle8.v v\i, (t0)
.endr

  vwaddu.wv v0, v0,  v8
  vwaddu.wv v1, v1,  v9
  vwaddu.wv v2, v2, v10
  vwaddu.wv v3, v3, v11
  vwaddu.wv v4, v4, v12
  vwaddu.wv v5, v5, v13
  vwaddu.wv v6, v6, v14
  vwaddu.wv v7, v7, v15

  vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vmax.vx v\i, v\i, zero
.endr

  vsetvli zero, zero, e8, mf4, ta, ma

  vnclipu.wi  v8, v0, 0
  vnclipu.wi  v9, v1, 0
  vnclipu.wi v10, v2, 0
  vnclipu.wi v11, v3, 0
  vnclipu.wi v12, v4, 0
  vnclipu.wi v13, v5, 0
  vnclipu.wi v14, v6, 0
  vnclipu.wi v15, v7, 0

  vse8.v v8, (a0)
.irp i, 9, 10, 11, 12, 13, 14, 15
  add a0, a0, a1
  vse8.v v\i, (a0)
.endr

  ret
endfunc

function inv_txfm_add_8x4_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 4, e16, mf2, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7
  addi t0, t0, 8
  vle16.v v\i, (t0)
.endr

  li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vsmul.vx v\i, v\i, t1
.endr

  jalr t0, a4

  vsseg8e16.v v0, (a2)

  vsetivli zero, 8, e16, m1, ta, ma
  vmv.v.x v4, zero
  vle16.v v0, (a2)
  vse16.v v4, (a2)
.irp i, 1, 2, 3
  addi a2, a2, 16
  vle16.v v\i, (a2)
  vse16.v v4, (a2)
.endr

  jalr t0, a5

  vssra.vi v0, v0, 4
  vssra.vi v1, v1, 4
  vssra.vi v2, v2, 4
  vssra.vi v3, v3, 4

  vsetvli zero, zero, e8, mf2, ta, ma
  vle8.v v4, (a0)
  add t0, a0, a1
  vle8.v v5, (t0)
  add t0, t0, a1
  vle8.v v6, (t0)
  add t0, t0, a1
  vle8.v v7, (t0)

  vwaddu.wv v0, v0, v4
  vwaddu.wv v1, v1, v5
  vwaddu.wv v2, v2, v6
  vwaddu.wv v3, v3, v7

  vsetvli zero, zero, e16, m1, ta, ma
  vmax.vx v0, v0, zero
  vmax.vx v1, v1, zero
  vmax.vx v2, v2, zero
  vmax.vx v3, v3, zero

  vsetvli zero, zero, e8, mf2, ta, ma

  vnclipu.wi v4, v0, 0
  vnclipu.wi v5, v1, 0
  vnclipu.wi v6, v2, 0
  vnclipu.wi v7, v3, 0

  vse8.v v4, (a0)
  add a0, a0, a1
  vse8.v v5, (a0)
  add a0, a0, a1
  vse8.v v6, (a0)
  add a0, a0, a1
  vse8.v v7, (a0)

  ret
endfunc

/* Define symbols used in the .if comparisons below */
.equ dct, 1
.equ identity, 2
.equ adst, 3
.equ flipadst, 4

.macro def_fn_48 w, h, txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
  la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.else
  la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
  la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
  la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
  j inv_txfm_add_\w\()x\h\()_rvv
endfunc
.endm

.macro def_fns_48 w, h
def_fn_48 \w, \h, dct, dct
def_fn_48 \w, \h, identity, identity
def_fn_48 \w, \h, dct, adst
def_fn_48 \w, \h, dct, flipadst
def_fn_48 \w, \h, dct, identity
def_fn_48 \w, \h, adst, dct
def_fn_48 \w, \h, adst, adst
def_fn_48 \w, \h, adst, flipadst
def_fn_48 \w, \h, flipadst, dct
def_fn_48 \w, \h, flipadst, adst
def_fn_48 \w, \h, flipadst, flipadst
def_fn_48 \w, \h, identity, dct
def_fn_48 \w, \h, adst, identity
def_fn_48 \w, \h, flipadst, identity
def_fn_48 \w, \h, identity, adst
def_fn_48 \w, \h, identity, flipadst
.endm

def_fns_48 4, 8
def_fns_48 8, 4

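// Added note: the x16 identity scales each coefficient by roughly 2*sqrt(2):
// the two vsadd terms contribute 2x, and the vsmul (a rounding multiply with
// an implicit >>15) contributes 2*(5793-4096)/4096 ~ 0.83x of the input.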
function inv_identity_e16_x16_rvv, export=1, ext=v
  li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v16, v\i, t1
  vsadd.vv v\i, v\i, v\i
  vsadd.vv v\i, v\i, v16
.endr
  jr t0
endfunc

function inv_dct_e16_x16_rvv, export=1, ext=v
  idct_8 v0, v2, v4, v6, v8, v10, v12, v14

  li t1, 401
  li t2, 4076
  li t3, 3166
  li t4, 2598

  vwmul.vx v30, v1, t2
  neg t2, t2
  vwmul.vx v16, v1, t1
  vwmacc.vx v30, t1, v15
  vwmacc.vx v16, t2, v15

  vwmul.vx v28, v9, t4
  neg t4, t4
  vwmul.vx v18, v9, t3
  vwmacc.vx v28, t3, v7
  vwmacc.vx v18, t4, v7

  li t1, 1931
  li t2, 3612
  li t3, 3920
  li t4, 1189

  vwmul.vx v26, v5, t2
  neg t2, t2
  vwmul.vx v20, v5, t1
  vwmacc.vx v26, t1, v11
  vwmacc.vx v20, t2, v11

  vwmul.vx v24, v13, t4
  neg t4, t4
  vwmul.vx v22, v13, t3
  vwmacc.vx v24, t3, v3
  vwmacc.vx v22, t4, v3

  li t1, 2048
  li t2, 2896
  li t3, 1567
  li t4, 3784

  vwadd.wx v16, v16, t1
  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1
  vwadd.wx v24, v24, t1
  vwadd.wx v26, v26, t1
  vwadd.wx v28, v28, t1
  vwadd.wx v30, v30, t1

  vnsra.wi v16, v16, 12
  vnsra.wi v18, v18, 12
  vnsra.wi v20, v20, 12
  vnsra.wi v22, v22, 12
  vnsra.wi v24, v24, 12
  vnsra.wi v26, v26, 12
  vnsra.wi v28, v28, 12
  vnsra.wi v30, v30, 12

  vssub.vv  v3, v16, v18
  vsadd.vv v16, v16, v18
  vssub.vv  v5, v22, v20
  vsadd.vv v22, v22, v20
  vssub.vv v11, v24, v26
  vsadd.vv v24, v24, v26
  vssub.vv v13, v30, v28
  vsadd.vv v30, v30, v28

  vwmul.vx v28, v13, t4
  neg t4, t4
  vwmul.vx v18, v13, t3
  vwmul.vx v26, v11, t3
  vwmacc.vx v28, t3, v3
  neg t3, t3
  vwmul.vx v20, v11, t4
  vwmacc.vx v18, t4, v3
  vwmacc.vx v20, t3, v5
  vwmacc.vx v26, t4, v5

  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v26, v26, t1
  vwadd.wx v28, v28, t1

  vnsra.wi v18, v18, 12
  vnsra.wi v20, v20, 12
  vnsra.wi v26, v26, 12
  vnsra.wi v28, v28, 12

  vssub.vv  v5, v18, v20
  vsadd.vv v18, v18, v20
  vssub.vv v11, v28, v26
  vsadd.vv v28, v28, v26

  vssub.vv  v7, v16, v22
  vsadd.vv v16, v16, v22
  vssub.vv  v9, v30, v24
  vsadd.vv v30, v30, v24

  vwmul.vx v20, v11, t2
  vwmul.vx v22,  v9, t2
  vwmul.vx v24,  v9, t2
  vwmul.vx v26, v11, t2
  vwmacc.vx v24, t2, v7
  vwmacc.vx v26, t2, v5
  neg t2, t2
  vwmacc.vx v20, t2, v5
  vwmacc.vx v22, t2, v7

  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1
  vwadd.wx v24, v24, t1
  vwadd.wx v26, v26, t1

  vnsra.wi v20, v20, 12
  vnsra.wi v22, v22, 12
  vnsra.wi v24, v24, 12
  vnsra.wi v26, v26, 12

  vssub.vv v15,  v0, v30
  vsadd.vv  v0,  v0, v30
  vssub.vv v17,  v2, v28
  vsadd.vv  v1,  v2, v28
  vssub.vv v13,  v4, v26
  vsadd.vv  v2,  v4, v26
  vssub.vv v19,  v6, v24
  vsadd.vv  v3,  v6, v24
  vssub.vv v11,  v8, v22
  vsadd.vv  v4,  v8, v22
  vsadd.vv  v5, v10, v20
  vssub.vv v10, v10, v20
  vssub.vv  v9, v12, v18
  vsadd.vv  v6, v12, v18
  vssub.vv  v8, v14, v16
  vsadd.vv  v7, v14, v16
  vmv.v.v v14, v17
  vmv.v.v v12, v19

  jr t0
endfunc

.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
  li t1, 4091
  li t2, 201
  li t3, 3973
  li t4, 995

  vwmul.vx v16, v15, t1
  neg t1, t1
  vwmul.vx v18, v15, t2
  vwmacc.vx v16, t2, v0
  vwmacc.vx v18, t1, v0

  vwmul.vx v20, v13, t3
  neg t3, t3
  vwmul.vx v22, v13, t4
  vwmacc.vx v20, t4, v2
  vwmacc.vx v22, t3, v2

  li t1, 3703
  li t2, 1751
  li t3, 3290
  li t4, 2440

  vwmul.vx v24, v11, t1
  neg t1, t1
  vwmul.vx v26, v11, t2
  vwmacc.vx v24, t2, v4
  vwmacc.vx v26, t1, v4

  vwmul.vx v28, v9, t3
  neg t3, t3
  vwmul.vx v30, v9, t4
  vwmacc.vx v28, t4, v6
  vwmacc.vx v30, t3, v6

  li t1, 2048

  vwadd.wx v16, v16, t1
  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1
  vwadd.wx v24, v24, t1
  vwadd.wx v26, v26, t1
  vwadd.wx v28, v28, t1
  vwadd.wx v30, v30, t1

  vnsra.wi  v0, v16, 12
  vnsra.wi v18, v18, 12
  vnsra.wi  v2, v20, 12
  vnsra.wi v22, v22, 12
  vnsra.wi  v4, v24, 12
  vnsra.wi v26, v26, 12
  vnsra.wi  v6, v28, 12
  vnsra.wi v30, v30, 12

  li t1, 2751
  li t2, 3035
  li t3, 2106
  li t4, 3513

  vwmul.vx v16, v7, t1
  neg t1, t1
  vwmul.vx v20, v7, t2
  vwmacc.vx v16, t2, v8
  vwmacc.vx v20, t1, v8

  vwmul.vx v24, v5, t3
  neg t3, t3
  vwmul.vx v28, v5, t4
  vwmacc.vx v24, t4, v10
  vwmacc.vx v28, t3, v10

  li t1, 2048

  vwadd.wx v16, v16, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v24, v24, t1
  vwadd.wx v28, v28, t1

  vnsra.wi v16, v16, 12
  vnsra.wi  v9, v20, 12
  vnsra.wi v24, v24, 12
  vnsra.wi v11, v28, 12

  vssub.vv  v8,  v0, v16
  vsadd.vv  v0,  v0, v16
  vssub.vv v10,  v2, v24
  vsadd.vv  v2,  v2, v24

  li t1, 1380
  li t2, 3857
  li t3, 601
  li t4, 4052

  vwmul.vx v16, v3, t1
  neg t1, t1
  vwmul.vx v20, v3, t2
  vwmacc.vx v16, t2, v12
  vwmacc.vx v20, t1, v12

  vwmul.vx v24, v1, t3
  neg t3, t3
  vwmul.vx v28, v1, t4
  vwmacc.vx v24, t4, v14
  vwmacc.vx v28, t3, v14

  li t1, 2048

  vwadd.wx v16, v16, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v24, v24, t1
  vwadd.wx v28, v28, t1

  vnsra.wi v16, v16, 12
  vnsra.wi v13, v20, 12
  vnsra.wi v24, v24, 12
  vnsra.wi v15, v28, 12

  vssub.vv v12,  v4, v16
  vsadd.vv v16,  v4, v16
  vssub.vv v14,  v6, v24
  vsadd.vv v20,  v6, v24

  vsadd.vv  v1, v18,  v9
  vssub.vv  v9, v18,  v9
  vsadd.vv  v3, v22, v11
  vssub.vv v11, v22, v11
  vsadd.vv v18, v26, v13
  vssub.vv v13, v26, v13
  vsadd.vv v22, v30, v15
  vssub.vv v15, v30, v15

  vssub.vv v4, v0, v16
  vsadd.vv v0, v0, v16
  vssub.vv v5, v1, v18
  vsadd.vv v1, v1, v18
  vssub.vv v6, v2, v20
  vsadd.vv v2, v2, v20
  vssub.vv v7, v3, v22
  vsadd.vv v3, v3, v22

  li t1, 799
  li t2, 4017
  li t3, 3406
  li t4, 2276

  vwmul.vx v16,  v8, t2
  vwmul.vx v18,  v8, t1
  vwmul.vx v20, v10, t4
  vwmul.vx v22, v10, t3
  vwmul.vx v24, v13, t2
  vwmul.vx v26, v13, t1
  vwmul.vx v28, v15, t4
  vwmul.vx v30, v15, t3
  vwmacc.vx v16, t1,  v9
  neg t1, t1
  vwmacc.vx v20, t3, v11
  neg t3, t3
  vwmacc.vx v26, t2, v12
  neg t2, t2
  vwmacc.vx v30, t4, v14
  neg t4, t4
  vwmacc.vx v18, t2,  v9
  vwmacc.vx v22, t4, v11
  vwmacc.vx v24, t1, v12
  vwmacc.vx v28, t3, v14

  li t1, 2048
  li t2, 2896
  li t3, 1567
  li t4, 3784

  vwadd.wx v16, v16, t1
  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1
  vwadd.wx v24, v24, t1
  vwadd.wx v26, v26, t1
  vwadd.wx v28, v28, t1
  vwadd.wx v30, v30, t1

  vnsra.wi v16, v16, 12
  vnsra.wi v18, v18, 12
  vnsra.wi v20, v20, 12
  vnsra.wi v22, v22, 12
  vnsra.wi v24, v24, 12
  vnsra.wi v26, v26, 12
  vnsra.wi v28, v28, 12
  vnsra.wi v30, v30, 12

  vsadd.vv  v8, v16, v24
  vsadd.vv  v9, v18, v26
  vsadd.vv v10, v20, v28
  vsadd.vv v11, v22, v30
  vssub.vv v12, v16, v24
  vssub.vv v13, v18, v26
  vssub.vv v14, v20, v28
  vssub.vv v15, v22, v30

  vwmul.vx v16,  v4, t4
  vwmul.vx v18,  v4, t3
  vwmul.vx v20,  v7, t4
  vwmul.vx v22,  v7, t3
  vwmul.vx v24, v12, t4
  vwmul.vx v26, v12, t3
  vwmul.vx v28, v15, t4
  vwmul.vx v30, v15, t3
  vwmacc.vx v16, t3,  v5
  vwmacc.vx v22, t4,  v6
  vwmacc.vx v24, t3, v13
  neg t3, t3
  vwmacc.vx v30, t4, v14
  neg t4, t4
  vwmacc.vx v20, t3,  v6
  vwmacc.vx v28, t3, v14
  vwmacc.vx v18, t4,  v5
  vwmacc.vx v26, t4, v13

  vwadd.wx v16, v16, t1
  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1
  vwadd.wx v24, v24, t1
  vwadd.wx v26, v26, t1
  vwadd.wx v28, v28, t1
  vwadd.wx v30, v30, t1

  vnsra.wi v16, v16, 12
  vnsra.wi v18, v18, 12
  vnsra.wi v20, v20, 12
  vnsra.wi v22, v22, 12
  vnsra.wi v24, v24, 12
  vnsra.wi v26, v26, 12
  vnsra.wi v28, v28, 12
  vnsra.wi v30, v30, 12

.ifc \o0, v0
  vsadd.vv \o14, v9, v11
  vssub.vv  v11, v9, v11
  vssub.vv   v9, v1,  v3
  vsadd.vv \o15, v1,  v3
  vsadd.vv  \o1, v8, v10
  vssub.vv  v10, v8, v10
  vssub.vv   v8, v0,  v2
  vsadd.vv  \o0, v0,  v2
.else
  vsadd.vv  \o1, v8, v10
  vssub.vv  v10, v8, v10
  vssub.vv   v8, v0,  v2
  vsadd.vv  \o0, v0,  v2
  vsadd.vv   v2, v9, v11
  vssub.vv  v11, v9, v11
  vssub.vv   v9, v1,  v3
  vsadd.vv \o15, v1,  v3
  vmv.v.v  \o14, v2
.endif

  vsadd.vv  \o3, v16, v20
  vssub.vv   v6, v16, v20
  vsadd.vv \o12, v18, v22
  vssub.vv   v7, v18, v22
  vsadd.vv  \o2, v24, v28
  vssub.vv  v24, v24, v28
  vsadd.vv \o13, v26, v30
  vssub.vv  v26, v26, v30

  neg t3, t2

  vwmul.vx v28, v24, t2
  vwmul.vx v30, v24, t2
  vwmacc.vx v28, t2, v26
  vwmacc.vx v30, t3, v26

  vwmul.vx v24, v10, t2
  vwmul.vx v26, v10, t2
  vwmacc.vx v24, t2, v11
  vwmacc.vx v26, t3, v11

  vwmul.vx v20, v6, t2
  vwmul.vx v22, v6, t2
  vwmacc.vx v20, t2, v7
  vwmacc.vx v22, t3, v7

  vwmul.vx v16, v8, t2
  vwmul.vx v18, v8, t2
  vwmacc.vx v16, t2, v9
  vwmacc.vx v18, t3, v9

  vwadd.wx v16, v16, t1
  vwadd.wx v18, v18, t1
  vwadd.wx v20, v20, t1
  vwadd.wx v22, v22, t1
  vwadd.wx v24, v24, t1
  vwadd.wx v26, v26, t1
  vwadd.wx v28, v28, t1
  vwadd.wx v30, v30, t1

  vnsra.wi  \o7, v16, 12
  vnsra.wi  \o8, v18, 12
  vnsra.wi  \o4, v20, 12
  vnsra.wi \o11, v22, 12
  vnsra.wi  \o6, v24, 12
  vnsra.wi  \o9, v26, 12
  vnsra.wi  \o5, v28, 12
  vnsra.wi \o10, v30, 12

  vmv.v.x v16, zero
  vssub.vv  \o1, v16,  \o1
  vssub.vv  \o3, v16,  \o3
  vssub.vv  \o5, v16,  \o5
  vssub.vv  \o7, v16,  \o7
  vssub.vv  \o9, v16,  \o9
  vssub.vv \o11, v16, \o11
  vssub.vv \o13, v16, \o13
  vssub.vv \o15, v16, \o15
.endm

function inv_adst_e16_x16_rvv, export=1, ext=v
  iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
  jr t0
endfunc

function inv_flipadst_e16_x16_rvv, export=1, ext=v
  iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0
  jr t0
endfunc

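// Added note: horizontal 16x8 helper. Input vectors are loaded from (t4) with
// stride t6 and zeroed as they are consumed; the results are written back
// transposed through strided stores to (t5), advancing t5 by 2 bytes per
// register, so the vertical pass can read whole columns. Returns through a7.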
.macro def_horz_16 variant
function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
  vmv.v.x v16, zero
  vle16.v v0, (t4)
  vse16.v v16, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  add t4, t4, t6
  vle16.v v\i, (t4)
  vse16.v v16, (t4)
.endr
.ifc \variant, _identity
  li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v16, v\i, t1
  vsra.vi v16, v16, 1
  vaadd.vv v\i, v\i, v16
.endr
  j L(horz_16x8_epilog)
.else
  jalr t0, a4
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 2
.endr
L(horz_16x8_epilog):
  vsse16.v v0, (t5), t6
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  addi t5, t5, 2
  vsse16.v v\i, (t5), t6
.endr
  jr a7
.endif
endfunc
.endm

def_horz_16 _identity
def_horz_16

function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
  vsetivli zero, 8, e16, m1, ta, ma

  vle16.v v0, (t4)
.irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  add t4, t4, t6
  vle16.v v\i, (t4)
.endr

  jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 4
.endr

  vsetivli zero, 8, e8, mf2, ta, ma

  vle8.v v16, (t5)
  add t0, t5, a1
  vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
  add t0, t0, a1
  vle8.v v\i, (t0)
.endr

  vwaddu.wv v0, v0, v16
  vwaddu.wv v1, v1, v17
  vwaddu.wv v2, v2, v18
  vwaddu.wv v3, v3, v19
  vwaddu.wv v4, v4, v20
  vwaddu.wv v5, v5, v21
  vwaddu.wv v6, v6, v22
  vwaddu.wv v7, v7, v23
  vwaddu.wv v8, v8, v24
  vwaddu.wv v9, v9, v25
  vwaddu.wv v10, v10, v26
  vwaddu.wv v11, v11, v27
  vwaddu.wv v12, v12, v28
  vwaddu.wv v13, v13, v29
  vwaddu.wv v14, v14, v30
  vwaddu.wv v15, v15, v31

  vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vmax.vx v\i, v\i, zero
.endr

  vsetvli zero, zero, e8, mf2, ta, ma
  vnclipu.wi v16, v0, 0
  vnclipu.wi v17, v1, 0
  vnclipu.wi v18, v2, 0
  vnclipu.wi v19, v3, 0
  vnclipu.wi v20, v4, 0
  vnclipu.wi v21, v5, 0
  vnclipu.wi v22, v6, 0
  vnclipu.wi v23, v7, 0
  vnclipu.wi v24, v8, 0
  vnclipu.wi v25, v9, 0
  vnclipu.wi v26, v10, 0
  vnclipu.wi v27, v11, 0
  vnclipu.wi v28, v12, 0
  vnclipu.wi v29, v13, 0
  vnclipu.wi v30, v14, 0
  vnclipu.wi v31, v15, 0

  vse8.v v16, (t5)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
  add t5, t5, a1
  vse8.v v\i, (t5)
.endr

  jr a7
endfunc

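// Added note: the 16x16 transform runs the horizontal helper twice into a
// 16*32-byte stack scratch buffer and then the vertical 8x16 helper over two
// 8-column halves of it. a7 carries the eob_half threshold from def_fn_16x16;
// when eob (a3) is below it, the second horizontal half is skipped and its
// part of the scratch buffer is zero-filled instead.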
function inv_txfm_add_16x16_rvv, export=1, ext=v
  csrw vxrm, zero
  vsetivli zero, 8, e16, m1, ta, ma
  addi sp, sp, -16*32
.irp i, 8, 0
  addi t4, a2, \i*2
  addi t5, sp, \i*16*2
.if \i == 8
  blt a3, a7, 1f
.endif
  li t6, 16*2
  jalr a7, a6
.if \i == 8
  j 2f
1:
  li t1, 64
  vsetvli zero, t1, e16, m8, ta, ma
  vmv.v.x v0, zero
  vse16.v v0, (t5)
  addi t5, t5, 128
  vse16.v v0, (t5)
  vsetivli zero, 8, e16, m1, ta, ma
2:
.endif
.endr
.irp i, 0, 8
  addi t4, sp, \i*2
  addi t5, a0, \i
  li t6, 16*2
  jal a7, inv_txfm_add_vert_8x16_rvv
.endr
  addi sp, sp, 16*32
  ret
endfunc

.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
.ifc \txfm1, identity
  la a6, inv_txfm_horz_identity_16x8_rvv
.else
  la a6, inv_txfm_horz_16x8_rvv
  la a4, inv_\txfm1\()_e16_x16_rvv
.endif
  la a5, inv_\txfm2\()_e16_x16_rvv
  li a7, \eob_half
  j inv_txfm_add_16x16_rvv
endfunc
.endm

def_fn_16x16 dct, dct, 36
def_fn_16x16 identity, identity, 36
def_fn_16x16 dct, adst, 36
def_fn_16x16 dct, flipadst, 36
def_fn_16x16 dct, identity, 8
def_fn_16x16 adst, dct, 36
def_fn_16x16 adst, adst, 36
def_fn_16x16 adst, flipadst, 36
def_fn_16x16 flipadst, dct, 36
def_fn_16x16 flipadst, adst, 36
def_fn_16x16 flipadst, flipadst, 36
def_fn_16x16 identity, dct, 8

.macro def_fn_416_base variant
function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma

  blt a3, a6, 1f

  addi t0, a2, 16
  vle16.v v0, (t0)
  addi t0, t0, 32
  vle16.v v1, (t0)
  addi t0, t0, 32
  vle16.v v2, (t0)
  addi t0, t0, 32
  vle16.v v3, (t0)

.ifc \variant, identity_
  li t1, (5793-4096)*8
  vsmul.vx v8, v0, t1
  vaadd.vv v4, v0, v8
  vsmul.vx v8, v1, t1
  vaadd.vv v5, v1, v8
  vsmul.vx v8, v2, t1
  vaadd.vv v6, v2, v8
  vsmul.vx v8, v3, t1
  vaadd.vv v7, v3, v8
.else
  jalr t0, a4

  vssra.vi v4, v0, 1
  vssra.vi v5, v1, 1
  vssra.vi v6, v2, 1
  vssra.vi v7, v3, 1
.endif

  j 2f

1:
.irp i, 4, 5, 6, 7
  vmv.v.x v\i, zero
.endr

2:
  vle16.v v0, (a2)
  addi t0, a2, 32
  vle16.v v1, (t0)
  addi t0, t0, 32
  vle16.v v2, (t0)
  addi t0, t0, 32
  vle16.v v3, (t0)

.ifc \variant, identity_
  li t1, (5793-4096)*8
.irp i, 0, 1, 2, 3
  vsmul.vx v8, v\i, t1
  vaadd.vv v\i, v\i, v8
.endr

  j L(itx_4x16_epilog)
.else
  jalr t0, a4

  vssra.vi v0, v0, 1
  vssra.vi v1, v1, 1
  vssra.vi v2, v2, 1
  vssra.vi v3, v3, 1

L(itx_4x16_epilog):
  vsseg4e16.v v0, (a2)
  addi t0, a2, 64
  vsseg4e16.v v4, (t0)

  vsetivli zero, 4, e16, mf2, ta, ma

  vmv.v.x v16, zero
  vle16.v v0, (a2)
  vse16.v v16, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
  vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  addi t0, t0, 8
  vle16.v v\i, (t0)
  vse16.v v16, (t0)
.endr

  jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 4
.endr

  vsetvli zero, zero, e8, mf4, ta, ma

  vle8.v v16, (a0)
  add t0, a0, a1
  vle8.v v17, (t0)
.irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
  add t0, t0, a1
  vle8.v v\i, (t0)
.endr

  vwaddu.wv  v0,  v0, v16
  vwaddu.wv  v1,  v1, v17
  vwaddu.wv  v2,  v2, v18
  vwaddu.wv  v3,  v3, v19
  vwaddu.wv  v4,  v4, v20
  vwaddu.wv  v5,  v5, v21
  vwaddu.wv  v6,  v6, v22
  vwaddu.wv  v7,  v7, v23
  vwaddu.wv  v8,  v8, v24
  vwaddu.wv  v9,  v9, v25
  vwaddu.wv v10, v10, v26
  vwaddu.wv v11, v11, v27
  vwaddu.wv v12, v12, v28
  vwaddu.wv v13, v13, v29
  vwaddu.wv v14, v14, v30
  vwaddu.wv v15, v15, v31

  vsetvli zero, zero, e16, mf2, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vmax.vx v\i, v\i, zero
.endr

  vsetvli zero, zero, e8, mf4, ta, ma

  vnclipu.wi v16,  v0, 0
  vnclipu.wi v17,  v1, 0
  vnclipu.wi v18,  v2, 0
  vnclipu.wi v19,  v3, 0
  vnclipu.wi v20,  v4, 0
  vnclipu.wi v21,  v5, 0
  vnclipu.wi v22,  v6, 0
  vnclipu.wi v23,  v7, 0
  vnclipu.wi v24,  v8, 0
  vnclipu.wi v25,  v9, 0
  vnclipu.wi v26, v10, 0
  vnclipu.wi v27, v11, 0
  vnclipu.wi v28, v12, 0
  vnclipu.wi v29, v13, 0
  vnclipu.wi v30, v14, 0
  vnclipu.wi v31, v15, 0

  vse8.v v16, (a0)
.irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
  add a0, a0, a1
  vse8.v v\i, (a0)
.endr

  ret
.endif
endfunc

function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 4, e16, mf2, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 8
  vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  addi t0, t0, 8
  vle16.v v\i, (t0)
.endr

.ifc \variant, identity_
  li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v16, v\i, t1
  vssra.vi v16, v16, 1
  vsadd.vv v\i, v\i, v16
.endr

  j L(itx_16x4_epilog)
.else
  jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 1
.endr

L(itx_16x4_epilog):
  li t0, 32
  vssseg8e16.v v0, (a2), t0
  addi t1, a2, 16
  vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
  vsetivli zero, 8, e16, m1, ta, ma

  vmv.v.x v4, zero
  addi t0, a2, \j*2
  vle16.v v0, (t0)
  vse16.v v4, (t0)
.irp i, 1, 2, 3
  addi t0, t0, 32
  vle16.v v\i, (t0)
  vse16.v v4, (t0)
.endr

  jalr t0, a5

  vssra.vi v0, v0, 4
  vssra.vi v1, v1, 4
  vssra.vi v2, v2, 4
  vssra.vi v3, v3, 4

  vsetvli zero, zero, e8, mf2, ta, ma
  addi t0, a0, \j
  vle8.v v4, (t0)
  add t0, t0, a1
  vle8.v v5, (t0)
  add t0, t0, a1
  vle8.v v6, (t0)
  add t0, t0, a1
  vle8.v v7, (t0)

  vwaddu.wv v0, v0, v4
  vwaddu.wv v1, v1, v5
  vwaddu.wv v2, v2, v6
  vwaddu.wv v3, v3, v7

  vsetvli zero, zero, e16, m1, ta, ma
  vmax.vx v0, v0, zero
  vmax.vx v1, v1, zero
  vmax.vx v2, v2, zero
  vmax.vx v3, v3, zero

  vsetvli zero, zero, e8, mf2, ta, ma

  vnclipu.wi v4, v0, 0
  vnclipu.wi v5, v1, 0
  vnclipu.wi v6, v2, 0
  vnclipu.wi v7, v3, 0

  addi t0, a0, \j
  vse8.v v4, (t0)
  add t0, t0, a1
  vse8.v v5, (t0)
  add t0, t0, a1
  vse8.v v6, (t0)
  add t0, t0, a1
  vse8.v v7, (t0)
.endr

  ret
.endif
endfunc
.endm

def_fn_416_base identity_
def_fn_416_base

.macro def_fn_416 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
  la a4, inv_\txfm1\()_e16_x\w\()w_rvv
.elseif \txfm1 != identity
  la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
.if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
  la a5, inv_\txfm2\()_e16_x\h\()w_rvv
.else
  la a5, inv_\txfm2\()_e16_x\h\()_rvv
.endif
.if \w == 4
  li a6, \eob_half
.endif
.ifc \txfm1, identity
  j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
  j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_416 w, h
def_fn_416 \w, \h, dct, dct, 29
def_fn_416 \w, \h, identity, identity, 29
def_fn_416 \w, \h, dct, adst, 29
def_fn_416 \w, \h, dct, flipadst, 29
def_fn_416 \w, \h, dct, identity, 8
def_fn_416 \w, \h, adst, dct, 29
def_fn_416 \w, \h, adst, adst, 29
def_fn_416 \w, \h, adst, flipadst, 29
def_fn_416 \w, \h, flipadst, dct, 29
def_fn_416 \w, \h, flipadst, adst, 29
def_fn_416 \w, \h, flipadst, flipadst, 29
def_fn_416 \w, \h, identity, dct, 32
def_fn_416 \w, \h, adst, identity, 8
def_fn_416 \w, \h, flipadst, identity, 8
def_fn_416 \w, \h, identity, adst, 32
def_fn_416 \w, \h, identity, flipadst, 32
.endm

def_fns_416 4, 16
def_fns_416 16, 4

.macro def_fn_816_base variant
function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma

  blt a3, a6, 1f

  vmv.v.x v16, zero
  addi t0, a2, 16
  vle16.v v0, (t0)
  vse16.v v16, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
  addi t0, t0, 32
  vle16.v v\i, (t0)
  vse16.v v16, (t0)
.endr

  li t1, 2896*8
.ifc \variant, identity_
  vsmul.vx  v8, v0, t1
  vsmul.vx  v9, v1, t1
  vsmul.vx v10, v2, t1
  vsmul.vx v11, v3, t1
  vsmul.vx v12, v4, t1
  vsmul.vx v13, v5, t1
  vsmul.vx v14, v6, t1
  vsmul.vx v15, v7, t1
.else
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vsmul.vx v\i, v\i, t1
.endr

  jalr t0, a4

  vssra.vi  v8, v0, 1
  vssra.vi  v9, v1, 1
  vssra.vi v10, v2, 1
  vssra.vi v11, v3, 1
  vssra.vi v12, v4, 1
  vssra.vi v13, v5, 1
  vssra.vi v14, v6, 1
  vssra.vi v15, v7, 1
.endif

  j 2f

1:
.irp i, 8, 9, 10, 11, 12, 13, 14, 15
  vmv.v.x v\i, zero
.endr

2:
  vmv.v.x v16, zero
  vle16.v v0, (a2)
  vse16.v v16, (a2)
  addi t0, a2, 32
  vle16.v v1, (t0)
  vse16.v v16, (t0)
.irp i, 2, 3, 4, 5, 6, 7
  addi t0, t0, 32
  vle16.v v\i, (t0)
  vse16.v v16, (t0)
.endr

  li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
  j L(itx_8x16_epilog)
.else
  jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vssra.vi v\i, v\i, 1
.endr

L(itx_8x16_epilog):
  addi t4, sp, -8*32
  vsseg8e16.v v0, (t4)
  addi t0, t4, 8*16
  vsseg8e16.v v8, (t0)

  mv t5, a0
  li t6, 16
  jal a7, inv_txfm_add_vert_8x16_rvv

  ret
.endif
endfunc

function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v
  csrw vxrm, zero

  vsetivli zero, 8, e16, m1, ta, ma
  vle16.v v0, (a2)
  addi t0, a2, 16
  vle16.v v1, (t0)
.irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  addi t0, t0, 16
  vle16.v v\i, (t0)
.endr

  li t1, 2896*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v\i, v\i, t1
.endr

.ifc \variant, identity_
  li t1, 2*(5793-4096)*8
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vsmul.vx v16, v\i, t1
  vssra.vi v16, v16, 1
  vsadd.vv v\i, v\i, v16
.endr

  j L(itx_16x8_epilog)
.else
  jalr t0, a4

.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  vssra.vi v\i, v\i, 1
.endr

L(itx_16x8_epilog):
  li t0, 32
  vssseg8e16.v v0, (a2), t0
  addi t1, a2, 16
  vssseg8e16.v v8, (t1), t0

.irp j, 0, 8
  vsetivli zero, 8, e16, m1, ta, ma

  vmv.v.x v8, zero
  addi t0, a2, \j*2
  vle16.v v0, (t0)
  vse16.v v8, (t0)
.irp i, 1, 2, 3, 4, 5, 6, 7
  addi t0, t0, 32
  vle16.v v\i, (t0)
  vse16.v v8, (t0)
.endr

  jalr t0, a5

.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vssra.vi v\i, v\i, 4
.endr

  vsetvli zero, zero, e8, mf2, ta, ma
  addi t0, a0, \j
  vle8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
  add t0, t0, a1
  vle8.v v\i, (t0)
.endr

  vwaddu.wv v0, v0, v8
  vwaddu.wv v1, v1, v9
  vwaddu.wv v2, v2, v10
  vwaddu.wv v3, v3, v11
  vwaddu.wv v4, v4, v12
  vwaddu.wv v5, v5, v13
  vwaddu.wv v6, v6, v14
  vwaddu.wv v7, v7, v15

  vsetvli zero, zero, e16, m1, ta, ma
.irp i, 0, 1, 2, 3, 4, 5, 6, 7
  vmax.vx v\i, v\i, zero
.endr

  vsetvli zero, zero, e8, mf2, ta, ma

  vnclipu.wi  v8, v0, 0
  vnclipu.wi  v9, v1, 0
  vnclipu.wi v10, v2, 0
  vnclipu.wi v11, v3, 0
  vnclipu.wi v12, v4, 0
  vnclipu.wi v13, v5, 0
  vnclipu.wi v14, v6, 0
  vnclipu.wi v15, v7, 0

  addi t0, a0, \j
  vse8.v v8, (t0)
.irp i, 9, 10, 11, 12, 13, 14, 15
  add t0, t0, a1
  vse8.v v\i, (t0)
.endr
.endr

  ret
.endif
endfunc
.endm

def_fn_816_base identity_
def_fn_816_base

.macro def_fn_816 w, h, txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
.ifnc \txfm1, identity
  la a4, inv_\txfm1\()_e16_x\w\()_rvv
.endif
  la a5, inv_\txfm2\()_e16_x\h\()_rvv
.if \w == 8
  li a6, \eob_half
.endif
.ifc \txfm1, identity
  j inv_txfm_identity_add_\w\()x\h\()_rvv
.else
  j inv_txfm_add_\w\()x\h\()_rvv
.endif
endfunc
.endm

.macro def_fns_816 w, h
def_fn_816 \w, \h, dct, dct, 43
def_fn_816 \w, \h, identity, identity, 43
def_fn_816 \w, \h, dct, adst, 43
def_fn_816 \w, \h, dct, flipadst, 43
def_fn_816 \w, \h, dct, identity, 8
def_fn_816 \w, \h, adst, dct, 43
def_fn_816 \w, \h, adst, adst, 43
def_fn_816 \w, \h, adst, flipadst, 43
def_fn_816 \w, \h, flipadst, dct, 43
def_fn_816 \w, \h, flipadst, adst, 43
def_fn_816 \w, \h, flipadst, flipadst, 43
def_fn_816 \w, \h, identity, dct, 64
def_fn_816 \w, \h, adst, identity, 8
def_fn_816 \w, \h, flipadst, identity, 8
def_fn_816 \w, \h, identity, adst, 64
def_fn_816 \w, \h, identity, flipadst, 64
.endm

def_fns_816 8, 16
def_fns_816 16, 8