/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

/*
static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
                              const pixel *src, const ptrdiff_t src_stride,
                              const int16_t *const abcd, int mx, int my
                              HIGHBD_DECL_SUFFIX)
*/
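
/*
 * The addi/srai/addi/slli sequence in FILTER_WARP_RND_P below mirrors the
 * per-pixel filter fetch of the C reference: round the 1/1024-unit
 * coordinate, bias by 64 rows, and scale by 8 because each entry of
 * dav1d_mc_warp_filter holds eight 1-byte taps (hence the "<< 3" byte
 * offset). A hedged C sketch:
 *
 *     const int8_t *filter = dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
 */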
.macro FILTER_WARP_RND_P_LSX in0, in1, in2, in3, out0, out1, out2, out3
    vbsrl.v         vr2,    \in0,     \in1
    vbsrl.v         vr20,   \in0,     \in2
    addi.w          t4,     \in3,     512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr1,    t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    addi.w          t4,     t3,       512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr29,   t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    vilvl.d         vr2,    vr20,     vr2
    vilvl.d         vr1,    vr29,     vr1
    vmulwev.h.bu.b  vr3,    vr2,      vr1
    vmulwod.h.bu.b  vr20,   vr2,      vr1
    vilvl.d         vr2,    vr20,     vr3
    vhaddw.w.h      vr2,    vr2,      vr2
    vhaddw.d.w      vr2,    vr2,      vr2
    vhaddw.q.d      vr2,    vr2,      vr2
    vilvh.d         vr3,    vr20,     vr3
    vhaddw.w.h      vr3,    vr3,      vr3
    vhaddw.d.w      vr3,    vr3,      vr3
    vhaddw.q.d      vr3,    vr3,      vr3
    vextrins.w      \out0,  vr2,      \out1
    vextrins.w      \out2,  vr3,      \out3
.endm

.macro FILTER_WARP_CLIP_LSX in0, in1, in2, out0, out1
    add.w           \in0,     \in0,    \in1
    addi.w          t6,       \in0,    512
    srai.w          t6,       t6,      10
    addi.w          t6,       t6,      64
    slli.w          t6,       t6,      3
    fldx.d          f1,       t5,      t6
    vsllwil.h.b     vr1,      vr1,     0
    vmulwev.w.h     vr3,      \in2,    vr1
    vmaddwod.w.h    vr3,      \in2,    vr1
    vhaddw.d.w      vr3,      vr3,     vr3
    vhaddw.q.d      vr3,      vr3,     vr3
    vextrins.w      \out0,    vr3,     \out1
.endm
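
/*
 * Vertical counterpart: step tmy by abcd[2] (passed in via \in1), fetch
 * the 8-tap filter with the same rounding as above, widen it to 16 bits,
 * and dot-product it against the eight 16-bit intermediates held in \in2.
 */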

const warp_sh
.rept 2
.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
.endr
.rept 2
.byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
.endr
endconst
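
/*
 * warp_sh drives the vshuf.b "row slide" between vertical filter steps:
 * byte indices 2..17 drop the oldest 16-bit row from the 8-row window,
 * while the tail half-word (18,19 here, advanced via vaddi.bu and patched
 * in with vextrins.h) pulls successive rows from the freshly filtered
 * block.
 */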

.macro warp_lsx t, shift
function warp_affine_8x8\t\()_8bpc_lsx
    addi.d          sp,       sp,      -64
    fst.d           f24,      sp,      0
    fst.d           f25,      sp,      8
    fst.d           f26,      sp,      16
    fst.d           f27,      sp,      24
    fst.d           f28,      sp,      32
    fst.d           f29,      sp,      40
    fst.d           f30,      sp,      48
    fst.d           f31,      sp,      56

    la.local        t4,       warp_sh
    ld.h            t0,       a4,      0   // abcd[0]
    ld.h            t1,       a4,      2   // abcd[1]

    alsl.w          t2,       a3,      a3,     1   // t2 = 3 * src_stride
    addi.w          t3,       a5,      0           // tmx = mx
    la.local        t5,       dav1d_mc_warp_filter
    sub.d           a2,       a2,      t2          // src -= 3 * src_stride
    addi.d          a2,       a2,      -3          // src -= 3
    vld             vr0,      a2,      0
    vld             vr30,     t4,      0           // row-slide shuffle pattern
    vld             vr31,     t4,      32          // tail indices 18, 19

    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00

    add.w           a5,       a5,      t1   // mx += abcd[1]
    or              t3,       a5,      a5   // tmx = mx
    add.d           a2,       a2,      a3   // src += src_stride
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr12, 0x00, vr13, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr14, 0x00, vr15, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr16, 0x00, vr17, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr18, 0x00, vr19, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr12, 0x10, vr13, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr14, 0x10, vr15, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr16, 0x10, vr17, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr18, 0x10, vr19, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr12, 0x20, vr13, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr14, 0x20, vr15, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr16, 0x20, vr17, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr18, 0x20, vr19, 0x20

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr12, 0x30, vr13, 0x30
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr14, 0x30, vr15, 0x30
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr16, 0x30, vr17, 0x30
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr18, 0x30, vr19, 0x30

    // pack the 32-bit horizontal sums down to 16 bits:
    // (x + 4) >> 3, i.e. shift by 7 - intermediate_bits (4 for 8bpc)
    vsrarni.h.w       vr12,     vr4,     3
    vsrarni.h.w       vr13,     vr5,     3
    vsrarni.h.w       vr14,     vr6,     3
    vsrarni.h.w       vr15,     vr7,     3
    vsrarni.h.w       vr16,     vr8,     3
    vsrarni.h.w       vr17,     vr9,     3
    vsrarni.h.w       vr18,     vr10,    3
    vsrarni.h.w       vr19,     vr11,    3

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr4, 0x00, vr5, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr6, 0x00, vr7, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr8, 0x00, vr9, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr10, 0x00, vr11, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr4, 0x10, vr5, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr6, 0x10, vr7, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr8, 0x10, vr9, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr10, 0x10, vr11, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr4, 0x20, vr5, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr6, 0x20, vr7, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr8, 0x20, vr9, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr10, 0x20, vr11, 0x20

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr4, 0x30, vr5, 0x30
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr6, 0x30, vr7, 0x30
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr8, 0x30, vr9, 0x30
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr10, 0x30, vr11, 0x30

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr21, 0x00, vr22, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr23, 0x00, vr24, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr25, 0x00, vr26, 0x00
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr27, 0x00, vr28, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr21, 0x10, vr22, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr23, 0x10, vr24, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr25, 0x10, vr26, 0x10
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr27, 0x10, vr28, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    FILTER_WARP_RND_P_LSX   vr0, 0, 1, a5, vr21, 0x20, vr22, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 2, 3, t3, vr23, 0x20, vr24, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 4, 5, t3, vr25, 0x20, vr26, 0x20
    FILTER_WARP_RND_P_LSX   vr0, 6, 7, t3, vr27, 0x20, vr28, 0x20

    vsrarni.h.w     vr21,     vr4,     3
    vsrarni.h.w     vr22,     vr5,     3
    vsrarni.h.w     vr23,     vr6,     3
    vsrarni.h.w     vr24,     vr7,     3
    vsrarni.h.w     vr25,     vr8,     3
    vsrarni.h.w     vr26,     vr9,     3
    vsrarni.h.w     vr27,     vr10,    3
    vsrarni.h.w     vr28,     vr11,    3

    addi.w          t2,       a6,      0   // my
    ld.h            t7,       a4,      4   // abcd[2]
    ld.h            t8,       a4,      6   // abcd[3]

.ifnb \t
    slli.d          a1,       a1,      1    // prep: tmp_stride is in int16_t units
.endif

    FILTER_WARP_CLIP_LSX  t2, zero, vr12,  vr4, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr13,  vr4, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr14,  vr4, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr15,  vr4, 0x30
    FILTER_WARP_CLIP_LSX  t2, t7,   vr16,  vr5, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr17,  vr5, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr18,  vr5, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr19,  vr5, 0x30
.ifnb \t
    vssrarni.h.w    vr5,      vr4,     \shift
    vst             vr5,      a0,      0
.else
    vssrarni.hu.w   vr5,      vr4,     \shift
    vssrlni.bu.h    vr5,      vr5,     0
    fst.d           f5,       a0,      0
.endif

    vshuf.b         vr12,     vr21,    vr12,   vr30
    vshuf.b         vr13,     vr22,    vr13,   vr30
    vshuf.b         vr14,     vr23,    vr14,   vr30
    vshuf.b         vr15,     vr24,    vr15,   vr30
    vshuf.b         vr16,     vr25,    vr16,   vr30
    vshuf.b         vr17,     vr26,    vr17,   vr30
    vshuf.b         vr18,     vr27,    vr18,   vr30
    vshuf.b         vr19,     vr28,    vr19,   vr30
    vextrins.h      vr30,     vr31,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LSX  t2, zero, vr12,  vr4, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr13,  vr4, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr14,  vr4, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr15,  vr4, 0x30
    FILTER_WARP_CLIP_LSX  t2, t7,   vr16,  vr5, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr17,  vr5, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr18,  vr5, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr19,  vr5, 0x30
.ifnb \t
    vssrarni.h.w    vr5,      vr4,     \shift
    vstx            vr5,      a0,      a1
.else
    vssrarni.hu.w   vr5,      vr4,     \shift
    vssrlni.bu.h    vr5,      vr5,     0
    fstx.d          f5,       a0,      a1
.endif

    vaddi.bu        vr31,     vr31,    2
    vshuf.b         vr12,     vr21,    vr12,   vr30
    vshuf.b         vr13,     vr22,    vr13,   vr30
    vshuf.b         vr14,     vr23,    vr14,   vr30
    vshuf.b         vr15,     vr24,    vr15,   vr30
    vshuf.b         vr16,     vr25,    vr16,   vr30
    vshuf.b         vr17,     vr26,    vr17,   vr30
    vshuf.b         vr18,     vr27,    vr18,   vr30
    vshuf.b         vr19,     vr28,    vr19,   vr30
    vextrins.h      vr30,     vr31,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LSX  t2, zero, vr12,  vr4, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr13,  vr4, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr14,  vr4, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr15,  vr4, 0x30
    FILTER_WARP_CLIP_LSX  t2, t7,   vr16,  vr5, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr17,  vr5, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr18,  vr5, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr19,  vr5, 0x30
    alsl.d          a0,       a1,      a0,   1
.ifnb \t
    vssrarni.h.w    vr5,      vr4,     \shift
    vst             vr5,      a0,      0
.else
    vssrarni.hu.w   vr5,      vr4,     \shift
    vssrlni.bu.h    vr5,      vr5,     0
    fst.d           f5,       a0,      0
.endif

    vaddi.bu        vr31,     vr31,    2
    vshuf.b         vr12,     vr21,    vr12,   vr30
    vshuf.b         vr13,     vr22,    vr13,   vr30
    vshuf.b         vr14,     vr23,    vr14,   vr30
    vshuf.b         vr15,     vr24,    vr15,   vr30
    vshuf.b         vr16,     vr25,    vr16,   vr30
    vshuf.b         vr17,     vr26,    vr17,   vr30
    vshuf.b         vr18,     vr27,    vr18,   vr30
    vshuf.b         vr19,     vr28,    vr19,   vr30
    vextrins.h      vr30,     vr31,    0x70

    add.w           a6,       a6,       t8
    addi.w          t2,       a6,       0
    FILTER_WARP_CLIP_LSX  t2, zero, vr12,  vr4, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr13,  vr4, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr14,  vr4, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr15,  vr4, 0x30
    FILTER_WARP_CLIP_LSX  t2, t7,   vr16,  vr5, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr17,  vr5, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr18,  vr5, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr19,  vr5, 0x30
.ifnb \t
    vssrarni.h.w    vr5,      vr4,      \shift
    vstx            vr5,      a0,       a1
.else
    vssrarni.hu.w   vr5,      vr4,      \shift
    vssrlni.bu.h    vr5,      vr5,      0
    fstx.d          f5,       a0,       a1
.endif

    vaddi.bu        vr31,     vr31,    2
    vshuf.b         vr12,     vr21,    vr12,   vr30
    vshuf.b         vr13,     vr22,    vr13,   vr30
    vshuf.b         vr14,     vr23,    vr14,   vr30
    vshuf.b         vr15,     vr24,    vr15,   vr30
    vshuf.b         vr16,     vr25,    vr16,   vr30
    vshuf.b         vr17,     vr26,    vr17,   vr30
    vshuf.b         vr18,     vr27,    vr18,   vr30
    vshuf.b         vr19,     vr28,    vr19,   vr30
    vextrins.h      vr30,     vr31,    0x70

    add.w           a6,       a6,       t8
    addi.w          t2,       a6,       0
    FILTER_WARP_CLIP_LSX  t2, zero, vr12,  vr4, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr13,  vr4, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr14,  vr4, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr15,  vr4, 0x30
    FILTER_WARP_CLIP_LSX  t2, t7,   vr16,  vr5, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr17,  vr5, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr18,  vr5, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr19,  vr5, 0x30
    alsl.d          a0,       a1,       a0,   1
.ifnb \t
    vssrarni.h.w    vr5,      vr4,      \shift
    vst             vr5,      a0,       0
.else
    vssrarni.hu.w   vr5,      vr4,      \shift
    vssrlni.bu.h    vr5,      vr5,      0
    fst.d           f5,       a0,       0
.endif

    vaddi.bu        vr31,     vr31,    2
    vshuf.b         vr12,     vr21,    vr12,   vr30
    vshuf.b         vr13,     vr22,    vr13,   vr30
    vshuf.b         vr14,     vr23,    vr14,   vr30
    vshuf.b         vr15,     vr24,    vr15,   vr30
    vshuf.b         vr16,     vr25,    vr16,   vr30
    vshuf.b         vr17,     vr26,    vr17,   vr30
    vshuf.b         vr18,     vr27,    vr18,   vr30
    vshuf.b         vr19,     vr28,    vr19,   vr30
    vextrins.h      vr30,     vr31,    0x70

    add.w           a6,       a6,       t8
    addi.w          t2,       a6,       0
    FILTER_WARP_CLIP_LSX  t2, zero, vr12,  vr4, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr13,  vr4, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr14,  vr4, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr15,  vr4, 0x30
    FILTER_WARP_CLIP_LSX  t2, t7,   vr16,  vr5, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr17,  vr5, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr18,  vr5, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr19,  vr5, 0x30
.ifnb \t
    vssrarni.h.w    vr5,      vr4,      \shift
    vstx            vr5,      a0,       a1
.else
    vssrarni.hu.w   vr5,      vr4,      \shift
    vssrlni.bu.h    vr5,      vr5,      0
    fstx.d          f5,       a0,       a1
.endif

    vaddi.bu        vr31,     vr31,    2
    vshuf.b         vr12,     vr21,    vr12,   vr30
    vshuf.b         vr13,     vr22,    vr13,   vr30
    vshuf.b         vr14,     vr23,    vr14,   vr30
    vshuf.b         vr15,     vr24,    vr15,   vr30
    vshuf.b         vr16,     vr25,    vr16,   vr30
    vshuf.b         vr17,     vr26,    vr17,   vr30
    vshuf.b         vr18,     vr27,    vr18,   vr30
    vshuf.b         vr19,     vr28,    vr19,   vr30
    vextrins.h      vr30,     vr31,    0x70

    add.w           a6,       a6,       t8
    addi.w          t2,       a6,       0
    FILTER_WARP_CLIP_LSX  t2, zero, vr12,  vr4, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr13,  vr4, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr14,  vr4, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr15,  vr4, 0x30
    FILTER_WARP_CLIP_LSX  t2, t7,   vr16,  vr5, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr17,  vr5, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr18,  vr5, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr19,  vr5, 0x30
    alsl.d          a0,       a1,       a0,   1
.ifnb \t
    vssrarni.h.w    vr5,      vr4,      \shift
    vst             vr5,      a0,       0
.else
    vssrarni.hu.w   vr5,      vr4,      \shift
    vssrlni.bu.h    vr5,      vr5,      0
    fst.d           f5,       a0,       0
.endif

    vshuf.b         vr12,     vr21,    vr12,   vr30
    vshuf.b         vr13,     vr22,    vr13,   vr30
    vshuf.b         vr14,     vr23,    vr14,   vr30
    vshuf.b         vr15,     vr24,    vr15,   vr30
    vshuf.b         vr16,     vr25,    vr16,   vr30
    vshuf.b         vr17,     vr26,    vr17,   vr30
    vshuf.b         vr18,     vr27,    vr18,   vr30
    vshuf.b         vr19,     vr28,    vr19,   vr30

    add.w           a6,       a6,       t8
    addi.w          t2,       a6,       0
    FILTER_WARP_CLIP_LSX  t2, zero, vr12,  vr4, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr13,  vr4, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr14,  vr4, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr15,  vr4, 0x30
    FILTER_WARP_CLIP_LSX  t2, t7,   vr16,  vr5, 0x00
    FILTER_WARP_CLIP_LSX  t2, t7,   vr17,  vr5, 0x10
    FILTER_WARP_CLIP_LSX  t2, t7,   vr18,  vr5, 0x20
    FILTER_WARP_CLIP_LSX  t2, t7,   vr19,  vr5, 0x30
.ifnb \t
    vssrarni.h.w    vr5,      vr4,      \shift
    vstx            vr5,      a0,       a1
.else
    vssrarni.hu.w   vr5,      vr4,      \shift
    vssrlni.bu.h    vr5,      vr5,      0
    fstx.d          f5,       a0,       a1
.endif

    fld.d           f24,      sp,       0
    fld.d           f25,      sp,       8
    fld.d           f26,      sp,       16
    fld.d           f27,      sp,       24
    fld.d           f28,      sp,       32
    fld.d           f29,      sp,       40
    fld.d           f30,      sp,       48
    fld.d           f31,      sp,       56
    addi.d          sp,       sp,       64
endfunc
.endm

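/*
 * Two instantiations: the empty suffix builds warp_affine_8x8_8bpc_lsx
 * (pixel output, final rounding shift 11 = 7 + intermediate_bits), and
 * "t" builds warp_affine_8x8t_8bpc_lsx (16-bit intermediates for compound
 * prediction, final shift 7).
 */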
warp_lsx , 11
warp_lsx t, 7

.macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3
    xvshuf.b        xr2,    \in0,     \in0,     \in2

    addi.w          t4,     \in1,     512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr3,    t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    addi.w          t4,     t3,       512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr4,    t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    addi.w          t4,     t3,       512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr5,    t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    addi.w          t4,     t3,       512
    srai.w          t4,     t4,       10
    addi.w          t4,     t4,       64
    slli.w          t4,     t4,       3
    vldx            vr6,    t5,       t4
    add.w           t3,     t3,       t0   // tmx += abcd[0]

    xvinsve0.d      xr3,    xr5,      1
    xvinsve0.d      xr3,    xr4,      2
    xvinsve0.d      xr3,    xr6,      3

    xvmulwev.h.bu.b xr4,    xr2,      xr3
    xvmulwod.h.bu.b xr5,    xr2,      xr3
    xvilvl.d        xr2,    xr5,      xr4
    xvilvh.d        xr3,    xr5,      xr4
    xvhaddw.w.h     xr2,    xr2,      xr2
    xvhaddw.w.h     xr3,    xr3,      xr3
    xvhaddw.d.w     xr2,    xr2,      xr2
    xvhaddw.d.w     xr3,    xr3,      xr3
    xvhaddw.q.d     xr2,    xr2,      xr2
    xvhaddw.q.d     xr3,    xr3,      xr3

    xvextrins.w     \out0,  xr2,      \out1
    xvextrins.w     \out2,  xr3,      \out3
.endm

.macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1
    add.w           \in0,     \in0,    \in1
    addi.w          t6,       \in0,    512
    srai.w          t6,       t6,      10
    addi.w          t6,       t6,      64
    slli.w          t6,       t6,      3
    fldx.d          f1,       t5,      t6

    add.w           t2,       t2,      t7
    addi.w          t6,       t2,      512
    srai.w          t6,       t6,      10
    addi.w          t6,       t6,      64
    slli.w          t6,       t6,      3
    fldx.d          f2,       t5,      t6

    vilvl.d         vr0,      vr2,     vr1
    vext2xv.h.b     xr0,      xr0
    xvmulwev.w.h    xr3,      \in2,    xr0
    xvmaddwod.w.h   xr3,      \in2,    xr0
    xvhaddw.d.w     xr3,      xr3,     xr3
    xvhaddw.q.d     xr3,      xr3,     xr3
    xvextrins.w     \out0,    xr3,     \out1
.endm

const shuf0
.byte  0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
.byte  1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10
endconst
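
/*
 * shuf0 feeds xvshuf.b in the LASX horizontal pass: each 128-bit lane
 * picks two overlapping 8-byte tap windows (offsets 0 and 2 in the low
 * lane, 1 and 3 in the high lane), so one shuffle covers four output
 * positions; the "+4" copy built below in xr9 covers offsets 4..7.
 */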

.macro warp_lasx t, shift
function warp_affine_8x8\t\()_8bpc_lasx
    addi.d          sp,       sp,      -16
    ld.h            t0,       a4,      0   // abcd[0]
    ld.h            t1,       a4,      2   // abcd[1]
    fst.d           f24,      sp,      0
    fst.d           f25,      sp,      8

    alsl.w          t2,       a3,      a3,     1
    addi.w          t3,       a5,      0
    la.local        t4,       warp_sh
    la.local        t5,       dav1d_mc_warp_filter
    sub.d           a2,       a2,      t2
    addi.d          a2,       a2,      -3
    vld             vr0,      a2,      0
    xvld            xr24,     t4,      0
    xvld            xr25,     t4,      32
    la.local        t2,       shuf0
    xvld            xr1,      t2,      0
    xvpermi.q       xr0,      xr0,     0x00
    xvaddi.bu       xr9,      xr1,     4
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30

    xvsrarni.h.w    xr12,     xr7,     3
    xvsrarni.h.w    xr13,     xr8,     3
    xvsrarni.h.w    xr14,     xr10,    3
    xvsrarni.h.w    xr15,     xr11,    3

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10

    add.w           a5,       a5,      t1
    or              t3,       a5,      a5
    add.d           a2,       a2,      a3
    vld             vr0,      a2,      0
    xvpermi.q       xr0,      xr0,     0x00
    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20
    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20

    xvsrarni.h.w    xr16,     xr7,     3
    xvsrarni.h.w    xr17,     xr8,     3
    xvsrarni.h.w    xr18,     xr10,    3
    xvsrarni.h.w    xr19,     xr11,    3

    addi.w          t2,       a6,      0   // my
    ld.h            t7,       a4,      4   // abcd[2]
    ld.h            t8,       a4,      6   // abcd[3]

.ifnb \t
    slli.d          a1,       a1,      1
.endif

    // y = 0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30

    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30

.ifnb \t
    xvssrarni.h.w   xr21,     xr20,     \shift
    xvpermi.q       xr22,     xr21,     0x01
    vilvl.h         vr23,     vr22,     vr21
    vilvh.h         vr21,     vr22,     vr21
    vst             vr23,     a0,       0
    vstx            vr21,     a0,       a1
.else
    xvssrarni.hu.w   xr21,    xr20,     \shift
    xvssrlni.bu.h    xr22,    xr21,     0
    xvpermi.q        xr23,    xr22,     0x01
    vilvl.b          vr21,    vr23,     vr22
    fst.d            f21,     a0,       0
    add.d            a0,      a0,       a1
    vstelm.d         vr21,    a0,       0,     1
.endif

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30

.ifnb \t
    xvssrarni.h.w   xr21,     xr20,     \shift
    alsl.d          a0,       a1,       a0,     1
    xvpermi.q       xr22,     xr21,     0x01
    vilvl.h         vr23,     vr22,     vr21
    vilvh.h         vr21,     vr22,     vr21
    vst             vr23,     a0,       0
    vstx            vr21,     a0,       a1
.else
    xvssrarni.hu.w   xr21,    xr20,     \shift
    xvssrlni.bu.h    xr22,    xr21,     0
    xvpermi.q        xr23,    xr22,     0x01
    vilvl.b          vr21,    vr23,     vr22
    add.d            a0,      a0,       a1
    fst.d            f21,     a0,       0
    add.d            a0,      a0,       a1
    vstelm.d         vr21,    a0,       0,     1
.endif

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30

.ifnb \t
    xvssrarni.h.w   xr21,     xr20,     \shift
    alsl.d          a0,       a1,       a0,     1
    xvpermi.q       xr22,     xr21,     0x01
    vilvl.h         vr23,     vr22,     vr21
    vilvh.h         vr21,     vr22,     vr21
    vst             vr23,     a0,       0
    vstx            vr21,     a0,       a1
.else
    xvssrarni.hu.w   xr21,    xr20,     \shift
    xvssrlni.bu.h    xr22,    xr21,     0
    xvpermi.q        xr23,    xr22,     0x01
    vilvl.b          vr21,    vr23,     vr22
    add.d            a0,      a0,       a1
    fst.d            f21,     a0,       0
    add.d            a0,      a0,       a1
    vstelm.d         vr21,    a0,       0,     1
.endif

    xvaddi.bu        xr25,     xr25,    2
    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24
    xvextrins.h      xr24,     xr25,    0x70

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30

    xvshuf.b         xr12,     xr16,    xr12,   xr24
    xvshuf.b         xr13,     xr17,    xr13,   xr24
    xvshuf.b         xr14,     xr18,    xr14,   xr24
    xvshuf.b         xr15,     xr19,    xr15,   xr24

    add.w           a6,       a6,      t8
    addi.w          t2,       a6,      0
    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30

.ifnb \t
    xvssrarni.h.w   xr21,     xr20,     \shift
    alsl.d          a0,       a1,       a0,     1
    xvpermi.q       xr22,     xr21,     0x01
    vilvl.h         vr23,     vr22,     vr21
    vilvh.h         vr21,     vr22,     vr21
    vst             vr23,     a0,       0
    vstx            vr21,     a0,       a1
.else
    xvssrarni.hu.w   xr21,    xr20,     \shift
    xvssrlni.bu.h    xr22,    xr21,     0
    xvpermi.q        xr23,    xr22,     0x01
    vilvl.b          vr21,    vr23,     vr22
    add.d            a0,      a0,       a1
    fst.d            f21,     a0,       0
    add.d            a0,      a0,       a1
    vstelm.d         vr21,    a0,       0,     1
.endif
    fld.d            f24,     sp,       0
    fld.d            f25,     sp,       8
    addi.d           sp,      sp,       16
endfunc
.endm

warp_lasx , 11
warp_lasx t, 7

/*
static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
                    const int16_t *tmp1, const int16_t *tmp2,
                    const int w, int h,
                    const int weight HIGHBD_DECL_SUFFIX)
*/
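
/*
 * A hedged sketch of the 8bpc rounding implemented below, matching the
 * shift defines that follow (intermediate_bits = 4, PREP_BIAS = 0 for
 * 8bpc):
 *
 *     avg:   dst = clip_u8((t1 + t2 + 16) >> 5)
 *     w_avg: dst = clip_u8((t1 * w + t2 * (16 - w) + 128) >> 8)
 */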

#define bpc8_sh     5     // sh = intermediate_bits + 1
#define bpcw8_sh    8     // sh = intermediate_bits + 4

#define bpc_sh   bpc8_sh
#define bpcw_sh  bpcw8_sh

function avg_8bpc_lsx
    addi.d        t8,     a0,     0

    clz.w         t0,     a4
    li.w          t1,     24
    sub.w         t0,     t0,      t1
    la.local      t1,     .AVG_LSX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0  // The jump addresses are relative to .AVG_LSX_JRTABLE
    add.d         t1,     t1,      t2 // Get absolute address
    jirl          $r0,    t1,      0

    .align   3
.AVG_LSX_JRTABLE:
    .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE
    .hword .AVG_W64_LSX  - .AVG_LSX_JRTABLE
    .hword .AVG_W32_LSX  - .AVG_LSX_JRTABLE
    .hword .AVG_W16_LSX  - .AVG_LSX_JRTABLE
    .hword .AVG_W8_LSX   - .AVG_LSX_JRTABLE
    .hword .AVG_W4_LSX   - .AVG_LSX_JRTABLE
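    /* clz(w) maps the power-of-two width to a table slot: w = 128..4
     * gives clz(w) = 24..29, so (clz(w) - 24) indexes the half-word
     * offsets above from W128 down to W4. The other jump tables below
     * reuse the same dispatch. */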
966
967.AVG_W4_LSX:
968    vld           vr0,    a2,     0
969    vld           vr1,    a3,     0
970    vadd.h        vr2,    vr0,    vr1
971    vssrarni.bu.h vr3,    vr2,    bpc_sh
972    vstelm.w      vr3,    a0,     0,    0
973    add.d         a0,     a0,     a1
974    vstelm.w      vr3,    a0,     0,    1
975    addi.w        a5,     a5,     -2
976    addi.d        a2,     a2,     16
977    addi.d        a3,     a3,     16
978    add.d         a0,     a0,     a1
979    blt           zero,   a5,     .AVG_W4_LSX
980    b             .AVG_END_LSX
981
982.AVG_W8_LSX:
983    vld           vr0,    a2,     0
984    vld           vr2,    a2,     16
985    vld           vr1,    a3,     0
986    vld           vr3,    a3,     16
987    vadd.h        vr4,    vr0,    vr1
988    vadd.h        vr5,    vr2,    vr3
989    vssrarni.bu.h vr5,    vr4,    bpc_sh
990    addi.w        a5,     a5,     -2
991    addi.d        a2,     a2,     32
992    vstelm.d      vr5,    a0,     0,    0
993    add.d         a0,     a0,     a1
994    vstelm.d      vr5,    a0,     0,    1
995    addi.d        a3,     a3,     32
996    add.d         a0,     a0,     a1
997    blt           zero,   a5,     .AVG_W8_LSX
998    b             .AVG_END_LSX
999
1000.AVG_W16_LSX:
1001    vld           vr0,    a2,     0
1002    vld           vr2,    a2,     16
1003    vld           vr1,    a3,     0
1004    vld           vr3,    a3,     16
1005    vadd.h        vr4,    vr0,    vr1
1006    vadd.h        vr5,    vr2,    vr3
1007    vssrarni.bu.h vr5,    vr4,    bpc_sh
1008    addi.w        a5,     a5,     -1
1009    addi.d        a2,     a2,     32
1010    vst           vr5,    a0,     0
1011    addi.d        a3,     a3,     32
1012    add.d         a0,     a0,     a1
1013    blt           zero,   a5,     .AVG_W16_LSX
1014    b             .AVG_END_LSX
1015
1016.AVG_W32_LSX:
1017    vld           vr0,    a2,     0
1018    vld           vr2,    a2,     16
1019    vld           vr4,    a2,     32
1020    vld           vr6,    a2,     48
1021    vld           vr1,    a3,     0
1022    vld           vr3,    a3,     16
1023    vld           vr5,    a3,     32
1024    vld           vr7,    a3,     48
1025    vadd.h        vr0,    vr0,    vr1
1026    vadd.h        vr2,    vr2,    vr3
1027    vadd.h        vr4,    vr4,    vr5
1028    vadd.h        vr6,    vr6,    vr7
1029    vssrarni.bu.h vr2,    vr0,    bpc_sh
1030    vssrarni.bu.h vr6,    vr4,    bpc_sh
1031    addi.w        a5,     a5,     -1
1032    addi.d        a2,     a2,     64
1033    vst           vr2,    a0,     0
1034    vst           vr6,    a0,     16
1035    addi.d        a3,     a3,     64
1036    add.d         a0,     a0,     a1
1037    blt           zero,   a5,     .AVG_W32_LSX
1038    b             .AVG_END_LSX
1039
1040.AVG_W64_LSX:
1041.rept 4
1042    vld           vr0,    a2,     0
1043    vld           vr2,    a2,     16
1044    vld           vr1,    a3,     0
1045    vld           vr3,    a3,     16
1046    vadd.h        vr0,    vr0,    vr1
1047    vadd.h        vr2,    vr2,    vr3
1048    vssrarni.bu.h vr2,    vr0,    bpc_sh
1049    addi.d        a2,     a2,     32
1050    addi.d        a3,     a3,     32
1051    vst           vr2,    a0,     0
1052    addi.d        a0,     a0,     16
1053.endr
1054    addi.w        a5,     a5,     -1
1055    add.d         t8,     t8,     a1
1056    add.d         a0,     t8,     zero
1057    blt           zero,   a5,     .AVG_W64_LSX
1058    b             .AVG_END_LSX
1059
1060.AVG_W128_LSX:
1061.rept 8
1062    vld           vr0,    a2,     0
1063    vld           vr2,    a2,     16
1064    vld           vr1,    a3,     0
1065    vld           vr3,    a3,     16
1066    vadd.h        vr0,    vr0,    vr1
1067    vadd.h        vr2,    vr2,    vr3
1068    vssrarni.bu.h vr2,    vr0,    bpc_sh
1069    addi.d        a2,     a2,     32
1070    addi.d        a3,     a3,     32
1071    vst           vr2,    a0,     0
1072    addi.d        a0,     a0,     16
1073.endr
1074    addi.w        a5,     a5,     -1
1075    add.d         t8,     t8,     a1
1076    add.d         a0,     t8,     zero
1077    blt           zero,   a5,     .AVG_W128_LSX
1078.AVG_END_LSX:
1079endfunc
1080
1081function avg_8bpc_lasx
1082    clz.w         t0,     a4
1083    li.w          t1,     24
1084    sub.w         t0,     t0,      t1
1085    la.local      t1,     .AVG_LASX_JRTABLE
1086    alsl.d        t0,     t0,      t1,    1
1087    ld.h          t2,     t0,      0
1088    add.d         t1,     t1,      t2
1089    jirl          $r0,    t1,      0
1090
1091    .align   3
1092.AVG_LASX_JRTABLE:
1093    .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE
1094    .hword .AVG_W64_LASX  - .AVG_LASX_JRTABLE
1095    .hword .AVG_W32_LASX  - .AVG_LASX_JRTABLE
1096    .hword .AVG_W16_LASX  - .AVG_LASX_JRTABLE
1097    .hword .AVG_W8_LASX   - .AVG_LASX_JRTABLE
1098    .hword .AVG_W4_LASX   - .AVG_LASX_JRTABLE
1099
1100.AVG_W4_LASX:
1101    vld            vr0,    a2,     0
1102    vld            vr1,    a3,     0
1103    vadd.h         vr0,    vr0,    vr1
1104    vssrarni.bu.h  vr1,    vr0,    bpc_sh
1105    vstelm.w       vr1,    a0,     0,    0
1106    add.d          a0,     a0,     a1
1107    vstelm.w       vr1,    a0,     0,    1
1108    addi.w         a5,     a5,     -2
1109    addi.d         a2,     a2,     16
1110    addi.d         a3,     a3,     16
1111    add.d          a0,     a0,     a1
1112    blt            zero,   a5,     .AVG_W4_LASX
1113    b              .AVG_END_LASX
1114.AVG_W8_LASX:
1115    xvld           xr0,    a2,     0
1116    xvld           xr1,    a3,     0
1117    xvadd.h        xr2,    xr0,    xr1
1118    xvssrarni.bu.h xr1,    xr2,    bpc_sh
1119    xvstelm.d      xr1,    a0,     0,    0
1120    add.d          a0,     a0,     a1
1121    xvstelm.d      xr1,    a0,     0,    2
1122    addi.w         a5,     a5,     -2
1123    addi.d         a2,     a2,     32
1124    addi.d         a3,     a3,     32
1125    add.d          a0,     a1,     a0
1126    blt            zero,   a5,     .AVG_W8_LASX
1127    b              .AVG_END_LASX
1128.AVG_W16_LASX:
1129    xvld           xr0,    a2,     0
1130    xvld           xr2,    a2,     32
1131    xvld           xr1,    a3,     0
1132    xvld           xr3,    a3,     32
1133    xvadd.h        xr4,    xr0,    xr1
1134    xvadd.h        xr5,    xr2,    xr3
1135    xvssrarni.bu.h xr5,    xr4,    bpc_sh
1136    xvpermi.d      xr2,    xr5,    0xd8
1137    xvpermi.d      xr3,    xr5,    0x8d
1138    vst            vr2,    a0,     0
1139    vstx           vr3,    a0,     a1
1140    addi.w         a5,     a5,     -2
1141    addi.d         a2,     a2,     64
1142    addi.d         a3,     a3,     64
1143    alsl.d         a0,     a1,     a0,   1
1144    blt            zero,   a5,     .AVG_W16_LASX
1145    b              .AVG_END_LASX
1146.AVG_W32_LASX:
1147    xvld           xr0,    a2,     0
1148    xvld           xr2,    a2,     32
1149    xvld           xr1,    a3,     0
1150    xvld           xr3,    a3,     32
1151    xvadd.h        xr4,    xr0,    xr1
1152    xvadd.h        xr5,    xr2,    xr3
1153    xvssrarni.bu.h xr5,    xr4,    bpc_sh
1154    xvpermi.d      xr6,    xr5,    0xd8
1155    xvst           xr6,    a0,     0
1156    addi.w         a5,     a5,     -1
1157    addi.d         a2,     a2,     64
1158    addi.d         a3,     a3,     64
1159    add.d          a0,     a0,     a1
1160    blt            zero,   a5,     .AVG_W32_LASX
1161    b              .AVG_END_LASX
1162.AVG_W64_LASX:
1163    xvld           xr0,    a2,     0
1164    xvld           xr2,    a2,     32
1165    xvld           xr4,    a2,     64
1166    xvld           xr6,    a2,     96
1167    xvld           xr1,    a3,     0
1168    xvld           xr3,    a3,     32
1169    xvld           xr5,    a3,     64
1170    xvld           xr7,    a3,     96
1171    xvadd.h        xr0,    xr0,    xr1
1172    xvadd.h        xr2,    xr2,    xr3
1173    xvadd.h        xr4,    xr4,    xr5
1174    xvadd.h        xr6,    xr6,    xr7
1175    xvssrarni.bu.h xr2,    xr0,    bpc_sh
1176    xvssrarni.bu.h xr6,    xr4,    bpc_sh
1177    xvpermi.d      xr1,    xr2,    0xd8
1178    xvpermi.d      xr3,    xr6,    0xd8
1179    xvst           xr1,    a0,     0
1180    xvst           xr3,    a0,     32
1181    addi.w         a5,     a5,     -1
1182    addi.d         a2,     a2,     128
1183    addi.d         a3,     a3,     128
1184    add.d          a0,     a0,     a1
1185    blt            zero,   a5,     .AVG_W64_LASX
1186    b              .AVG_END_LASX
1187.AVG_W128_LASX:
1188    xvld           xr0,    a2,     0
1189    xvld           xr2,    a2,     32
1190    xvld           xr4,    a2,     64
1191    xvld           xr6,    a2,     96
1192    xvld           xr8,    a2,     128
1193    xvld           xr10,   a2,     160
1194    xvld           xr12,   a2,     192
1195    xvld           xr14,   a2,     224
1196    xvld           xr1,    a3,     0
1197    xvld           xr3,    a3,     32
1198    xvld           xr5,    a3,     64
1199    xvld           xr7,    a3,     96
1200    xvld           xr9,    a3,     128
1201    xvld           xr11,   a3,     160
1202    xvld           xr13,   a3,     192
1203    xvld           xr15,   a3,     224
1204    xvadd.h        xr0,    xr0,    xr1
1205    xvadd.h        xr2,    xr2,    xr3
1206    xvadd.h        xr4,    xr4,    xr5
1207    xvadd.h        xr6,    xr6,    xr7
1208    xvadd.h        xr8,    xr8,    xr9
1209    xvadd.h        xr10,   xr10,   xr11
1210    xvadd.h        xr12,   xr12,   xr13
1211    xvadd.h        xr14,   xr14,   xr15
1212    xvssrarni.bu.h xr2,    xr0,    bpc_sh
1213    xvssrarni.bu.h xr6,    xr4,    bpc_sh
1214    xvssrarni.bu.h xr10,   xr8,    bpc_sh
1215    xvssrarni.bu.h xr14,   xr12,   bpc_sh
1216    xvpermi.d      xr1,    xr2,    0xd8
1217    xvpermi.d      xr3,    xr6,    0xd8
1218    xvpermi.d      xr5,    xr10,   0xd8
1219    xvpermi.d      xr7,    xr14,   0xd8
1220    xvst           xr1,    a0,     0
1221    xvst           xr3,    a0,     32
1222    xvst           xr5,    a0,     64
1223    xvst           xr7,    a0,     96
1224    addi.w         a5,     a5,     -1
1225    addi.d         a2,     a2,     256
1226    addi.d         a3,     a3,     256
1227    add.d          a0,     a0,     a1
1228    blt            zero,   a5,     .AVG_W128_LASX
1229.AVG_END_LASX:
1230endfunc
1231
1232function w_avg_8bpc_lsx
1233    addi.d        t8,     a0,     0
1234    li.w          t2,     16
1235    sub.w         t2,     t2,     a6  // 16 - weight
1236    vreplgr2vr.h  vr21,   a6
1237    vreplgr2vr.h  vr22,   t2
1238
1239    clz.w         t0,     a4
1240    li.w          t1,     24
1241    sub.w         t0,     t0,      t1
1242    la.local      t1,     .W_AVG_LSX_JRTABLE
1243    alsl.d        t0,     t0,      t1,    1
1244    ld.h          t2,     t0,      0
1245    add.d         t1,     t1,      t2
1246    jirl          $r0,    t1,      0
1247
1248    .align   3
1249.W_AVG_LSX_JRTABLE:
1250    .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
1251    .hword .W_AVG_W64_LSX  - .W_AVG_LSX_JRTABLE
1252    .hword .W_AVG_W32_LSX  - .W_AVG_LSX_JRTABLE
1253    .hword .W_AVG_W16_LSX  - .W_AVG_LSX_JRTABLE
1254    .hword .W_AVG_W8_LSX   - .W_AVG_LSX_JRTABLE
1255    .hword .W_AVG_W4_LSX   - .W_AVG_LSX_JRTABLE
1256
1257.W_AVG_W4_LSX:
    vld           vr0,    a2,     0
    vld           vr1,    a3,     0
    vmulwev.w.h   vr2,    vr0,    vr21
    vmulwod.w.h   vr3,    vr0,    vr21
    vmaddwev.w.h  vr2,    vr1,    vr22
    vmaddwod.w.h  vr3,    vr1,    vr22
    vssrarni.hu.w vr3,    vr2,    bpcw_sh
    vssrlni.bu.h  vr1,    vr3,    0
    vpickod.w     vr4,    vr2,    vr1
    vilvl.b       vr0,    vr4,    vr1
    fst.s         f0,     a0,     0
    add.d         a0,     a0,     a1
    vstelm.w      vr0,    a0,     0,   1
    addi.w        a5,     a5,     -2
    addi.d        a2,     a2,     16
    addi.d        a3,     a3,     16
    add.d         a0,     a1,     a0
    blt           zero,   a5,     .W_AVG_W4_LSX
    b             .W_AVG_END_LSX
.W_AVG_W8_LSX:
    vld           vr0,    a2,     0
    vld           vr1,    a3,     0
    vmulwev.w.h   vr2,    vr0,    vr21
    vmulwod.w.h   vr3,    vr0,    vr21
    vmaddwev.w.h  vr2,    vr1,    vr22
    vmaddwod.w.h  vr3,    vr1,    vr22
    vssrarni.hu.w vr3,    vr2,    bpcw_sh
    vssrlni.bu.h  vr1,    vr3,    0
    vpickod.w     vr4,    vr2,    vr1
    vilvl.b       vr0,    vr4,    vr1
    fst.d         f0,     a0,     0
    addi.w        a5,     a5,     -1
    addi.d        a2,     a2,     16
    addi.d        a3,     a3,     16
    add.d         a0,     a0,     a1
    blt           zero,   a5,     .W_AVG_W8_LSX
    b             .W_AVG_END_LSX
.W_AVG_W16_LSX:
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vmulwev.w.h   vr4,    vr0,    vr21
    vmulwod.w.h   vr5,    vr0,    vr21
    vmulwev.w.h   vr6,    vr2,    vr21
    vmulwod.w.h   vr7,    vr2,    vr21
    vmaddwev.w.h  vr4,    vr1,    vr22
    vmaddwod.w.h  vr5,    vr1,    vr22
    vmaddwev.w.h  vr6,    vr3,    vr22
    vmaddwod.w.h  vr7,    vr3,    vr22
    vssrarni.hu.w vr6,    vr4,    bpcw_sh
    vssrarni.hu.w vr7,    vr5,    bpcw_sh
    vssrlrni.bu.h vr7,    vr6,    0
    vshuf4i.w     vr8,    vr7,    0x4E
    vilvl.b       vr0,    vr8,    vr7
    vst           vr0,    a0,     0
    addi.w        a5,     a5,     -1
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    add.d         a0,     a0,     a1
    blt           zero,   a5,     .W_AVG_W16_LSX
    b             .W_AVG_END_LSX
.W_AVG_W32_LSX:
.rept 2
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vmulwev.w.h   vr4,    vr0,    vr21
    vmulwod.w.h   vr5,    vr0,    vr21
    vmulwev.w.h   vr6,    vr2,    vr21
    vmulwod.w.h   vr7,    vr2,    vr21
    vmaddwev.w.h  vr4,    vr1,    vr22
    vmaddwod.w.h  vr5,    vr1,    vr22
    vmaddwev.w.h  vr6,    vr3,    vr22
    vmaddwod.w.h  vr7,    vr3,    vr22
    vssrarni.hu.w vr6,    vr4,    bpcw_sh
    vssrarni.hu.w vr7,    vr5,    bpcw_sh
    vssrlrni.bu.h vr7,    vr6,    0
    vshuf4i.w     vr8,    vr7,    0x4E
    vilvl.b       vr0,    vr8,    vr7
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a0,     a0,     16
.endr
    addi.w        a5,     a5,     -1
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    blt           zero,   a5,     .W_AVG_W32_LSX
    b             .W_AVG_END_LSX

.W_AVG_W64_LSX:
.rept 4
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vmulwev.w.h   vr4,    vr0,    vr21
    vmulwod.w.h   vr5,    vr0,    vr21
    vmulwev.w.h   vr6,    vr2,    vr21
    vmulwod.w.h   vr7,    vr2,    vr21
    vmaddwev.w.h  vr4,    vr1,    vr22
    vmaddwod.w.h  vr5,    vr1,    vr22
    vmaddwev.w.h  vr6,    vr3,    vr22
    vmaddwod.w.h  vr7,    vr3,    vr22
    vssrarni.hu.w vr6,    vr4,    bpcw_sh
    vssrarni.hu.w vr7,    vr5,    bpcw_sh
    vssrlrni.bu.h vr7,    vr6,    0
    vshuf4i.w     vr8,    vr7,    0x4E
    vilvl.b       vr0,    vr8,    vr7
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a0,     a0,     16
.endr
    addi.w        a5,     a5,     -1
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    blt           zero,   a5,     .W_AVG_W64_LSX
    b             .W_AVG_END_LSX

.W_AVG_W128_LSX:
.rept 8
    vld           vr0,    a2,     0
    vld           vr2,    a2,     16
    vld           vr1,    a3,     0
    vld           vr3,    a3,     16
    vmulwev.w.h   vr4,    vr0,    vr21
    vmulwod.w.h   vr5,    vr0,    vr21
    vmulwev.w.h   vr6,    vr2,    vr21
    vmulwod.w.h   vr7,    vr2,    vr21
    vmaddwev.w.h  vr4,    vr1,    vr22
    vmaddwod.w.h  vr5,    vr1,    vr22
    vmaddwev.w.h  vr6,    vr3,    vr22
    vmaddwod.w.h  vr7,    vr3,    vr22
    vssrarni.hu.w vr6,    vr4,    bpcw_sh
    vssrarni.hu.w vr7,    vr5,    bpcw_sh
    vssrlrni.bu.h vr7,    vr6,    0
    vshuf4i.w     vr8,    vr7,    0x4E
    vilvl.b       vr0,    vr8,    vr7
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a0,     a0,     16
.endr
    addi.w        a5,     a5,     -1
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    blt           zero,   a5,     .W_AVG_W128_LSX
.W_AVG_END_LSX:
endfunc

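/* The w_avg kernels blend two intermediate buffers with a 4-bit weight.
 * A minimal C sketch of the per-pixel math for the 8bpc case (assuming
 * bpcw_sh == 8 and dav1d's usual 4-bit intermediate precision, so the
 * rounding term is 128):
 *
 *   static inline uint8_t w_avg_px(int16_t t1, int16_t t2, int weight)
 *   {
 *       const int v = (t1 * weight + t2 * (16 - weight) + 128) >> 8;
 *       return v < 0 ? 0 : v > 255 ? 255 : v;   // iclip_pixel()
 *   }
 *
 * The vmulwev/vmulwod pairs compute the widening products on the even and
 * odd halfword lanes separately; vssrarni rounds, shifts and saturates,
 * and the trailing shuffles restore the original pixel order.
 */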
function w_avg_8bpc_lasx
    addi.d        t8,     a0,     0
    li.w          t2,     16
    sub.w         t2,     t2,     a6  // 16 - weight
    xvreplgr2vr.h xr21,   a6
    xvreplgr2vr.h xr22,   t2

    clz.w         t0,     a4
    li.w          t1,     24
    sub.w         t0,     t0,      t1
    la.local      t1,     .W_AVG_LASX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0
    add.d         t1,     t1,      t2
    jirl          $r0,    t1,      0

    .align   3
.W_AVG_LASX_JRTABLE:
    .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W64_LASX  - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W32_LASX  - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W16_LASX  - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W8_LASX   - .W_AVG_LASX_JRTABLE
    .hword .W_AVG_W4_LASX   - .W_AVG_LASX_JRTABLE

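/* Width dispatch: w is a power of two in [4, 128], so clz(w) - 24 maps
 * w == 128 to entry 0 and w == 4 to entry 5 of the table above. An
 * illustrative C equivalent of the lookup (GNU computed goto, sketch
 * only):
 *
 *   const int16_t *tab = jrtable;                    // self-relative offsets
 *   const int idx = __builtin_clz((unsigned)w) - 24;
 *   goto *((const char *)tab + tab[idx]);
 */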
.W_AVG_W4_LASX:
    vld            vr0,    a2,     0
    vld            vr1,    a3,     0
    xvpermi.d      xr2,    xr0,    0xD8
    xvpermi.d      xr3,    xr1,    0xD8
    xvilvl.h       xr4,    xr3,    xr2
    xvmulwev.w.h   xr0,    xr4,    xr21
    xvmaddwod.w.h  xr0,    xr4,    xr22
    xvssrarni.hu.w xr1,    xr0,    bpcw_sh
    xvssrlni.bu.h  xr0,    xr1,    0
    fst.s          f0,     a0,     0
    add.d          a0,     a0,     a1
    xvstelm.w      xr0,    a0,     0,     4
    addi.w         a5,     a5,     -2
    addi.d         a2,     a2,     16
    addi.d         a3,     a3,     16
    add.d          a0,     a1,     a0
    blt            zero,   a5,     .W_AVG_W4_LASX
    b              .W_AVG_END_LASX

.W_AVG_W8_LASX:
    xvld           xr0,    a2,     0
    xvld           xr1,    a3,     0
    xvmulwev.w.h   xr2,    xr0,    xr21
    xvmulwod.w.h   xr3,    xr0,    xr21
    xvmaddwev.w.h  xr2,    xr1,    xr22
    xvmaddwod.w.h  xr3,    xr1,    xr22
    xvssrarni.hu.w xr3,    xr2,    bpcw_sh
    xvssrlni.bu.h  xr1,    xr3,    0
    xvpickod.w     xr4,    xr2,    xr1
    xvilvl.b       xr0,    xr4,    xr1
    xvstelm.d      xr0,    a0,     0,     0
    add.d          a0,     a0,     a1
    xvstelm.d      xr0,    a0,     0,     2
    addi.w         a5,     a5,     -2
    addi.d         a2,     a2,     32
    addi.d         a3,     a3,     32
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .W_AVG_W8_LASX
    b              .W_AVG_END_LASX

.W_AVG_W16_LASX:
    xvld           xr0,    a2,     0
    xvld           xr1,    a3,     0
    xvmulwev.w.h   xr2,    xr0,    xr21
    xvmulwod.w.h   xr3,    xr0,    xr21
    xvmaddwev.w.h  xr2,    xr1,    xr22
    xvmaddwod.w.h  xr3,    xr1,    xr22
    xvssrarni.hu.w xr3,    xr2,    bpcw_sh
    xvssrlni.bu.h  xr1,    xr3,    0
    xvpickod.w     xr4,    xr2,    xr1
    xvilvl.b       xr0,    xr4,    xr1
    xvpermi.d      xr1,    xr0,    0xD8
    vst            vr1,    a0,     0
    addi.w         a5,     a5,     -1
    addi.d         a2,     a2,     32
    addi.d         a3,     a3,     32
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .W_AVG_W16_LASX
    b              .W_AVG_END_LASX

.W_AVG_W32_LASX:
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvmulwev.w.h   xr4,    xr0,    xr21
    xvmulwod.w.h   xr5,    xr0,    xr21
    xvmulwev.w.h   xr6,    xr2,    xr21
    xvmulwod.w.h   xr7,    xr2,    xr21
    xvmaddwev.w.h  xr4,    xr1,    xr22
    xvmaddwod.w.h  xr5,    xr1,    xr22
    xvmaddwev.w.h  xr6,    xr3,    xr22
    xvmaddwod.w.h  xr7,    xr3,    xr22
    xvssrarni.hu.w xr6,    xr4,    bpcw_sh
    xvssrarni.hu.w xr7,    xr5,    bpcw_sh
    xvssrlni.bu.h  xr7,    xr6,    0
    xvshuf4i.w     xr8,    xr7,    0x4E
    xvilvl.b       xr9,    xr8,    xr7
    xvpermi.d      xr0,    xr9,    0xD8
    xvst           xr0,    a0,     0
    addi.w         a5,     a5,     -1
    addi.d         a2,     a2,     64
    addi.d         a3,     a3,     64
    add.d          a0,     a0,     a1
    blt            zero,   a5,     .W_AVG_W32_LASX
    b              .W_AVG_END_LASX

.W_AVG_W64_LASX:
.rept 2
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvmulwev.w.h   xr4,    xr0,    xr21
    xvmulwod.w.h   xr5,    xr0,    xr21
    xvmulwev.w.h   xr6,    xr2,    xr21
    xvmulwod.w.h   xr7,    xr2,    xr21
    xvmaddwev.w.h  xr4,    xr1,    xr22
    xvmaddwod.w.h  xr5,    xr1,    xr22
    xvmaddwev.w.h  xr6,    xr3,    xr22
    xvmaddwod.w.h  xr7,    xr3,    xr22
    xvssrarni.hu.w xr6,    xr4,    bpcw_sh
    xvssrarni.hu.w xr7,    xr5,    bpcw_sh
    xvssrlni.bu.h  xr7,    xr6,    0
    xvshuf4i.w     xr8,    xr7,    0x4E
    xvilvl.b       xr9,    xr8,    xr7
    xvpermi.d      xr0,    xr9,    0xD8
    xvst           xr0,    a0,     0
    addi.d         a2,     a2,     64
    addi.d         a3,     a3,     64
    addi.d         a0,     a0,     32
.endr
    addi.w         a5,     a5,     -1
    add.d          t8,     t8,     a1
    add.d          a0,     t8,     zero
    blt            zero,   a5,     .W_AVG_W64_LASX
    b              .W_AVG_END_LASX

.W_AVG_W128_LASX:
.rept 4
    xvld           xr0,    a2,     0
    xvld           xr2,    a2,     32
    xvld           xr1,    a3,     0
    xvld           xr3,    a3,     32
    xvmulwev.w.h   xr4,    xr0,    xr21
    xvmulwod.w.h   xr5,    xr0,    xr21
    xvmulwev.w.h   xr6,    xr2,    xr21
    xvmulwod.w.h   xr7,    xr2,    xr21
    xvmaddwev.w.h  xr4,    xr1,    xr22
    xvmaddwod.w.h  xr5,    xr1,    xr22
    xvmaddwev.w.h  xr6,    xr3,    xr22
    xvmaddwod.w.h  xr7,    xr3,    xr22
    xvssrarni.hu.w xr6,    xr4,    bpcw_sh
    xvssrarni.hu.w xr7,    xr5,    bpcw_sh
    xvssrlni.bu.h  xr7,    xr6,    0
    xvshuf4i.w     xr8,    xr7,    0x4E
    xvilvl.b       xr9,    xr8,    xr7
    xvpermi.d      xr0,    xr9,    0xD8
    xvst           xr0,    a0,     0
    addi.d         a2,     a2,     64
    addi.d         a3,     a3,     64
    addi.d         a0,     a0,     32
.endr

    addi.w         a5,     a5,     -1
    add.d          t8,     t8,     a1
    add.d          a0,     t8,     zero
    blt            zero,   a5,     .W_AVG_W128_LASX
.W_AVG_END_LASX:
endfunc

#undef bpc_sh
#undef bpcw_sh

#define mask_sh         10
/*
static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
                   const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                   const uint8_t *mask HIGHBD_DECL_SUFFIX)
*/
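/* A minimal C sketch of the per-pixel blend for 8bpc, with mask_sh == 10
 * as defined above (the +512 rounding is folded into vssrarni):
 *
 *   for (int x = 0; x < w; x++) {
 *       const int m = mask[x];                            // 0 .. 64
 *       const int v = (tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10;
 *       dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
 *   }
 *
 * vr21/xr21 below holds the constant 64 used to form the 64 - m term.
 */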
function mask_8bpc_lsx
    vldi          vr21,   0x440   // 64
    vxor.v        vr19,   vr19,   vr19
    addi.d        t8,     a0,     0
    clz.w         t0,     a4
    li.w          t1,     24
    sub.w         t0,     t0,      t1
    la.local      t1,     .MASK_LSX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0
    add.d         t1,     t1,      t2
    jirl          $r0,    t1,      0

    .align   3
.MASK_LSX_JRTABLE:
    .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE
    .hword .MASK_W64_LSX  - .MASK_LSX_JRTABLE
    .hword .MASK_W32_LSX  - .MASK_LSX_JRTABLE
    .hword .MASK_W16_LSX  - .MASK_LSX_JRTABLE
    .hword .MASK_W8_LSX   - .MASK_LSX_JRTABLE
    .hword .MASK_W4_LSX   - .MASK_LSX_JRTABLE

.MASK_W4_LSX:
    vld           vr0,     a2,     0
    vld           vr1,     a3,     0
    fld.d         f22,     a6,     0

    vilvl.b       vr2,    vr19,   vr22
    vsub.h        vr3,    vr21,   vr2

    vmulwev.w.h   vr4,    vr0,    vr2
    vmulwod.w.h   vr5,    vr0,    vr2
    vmaddwev.w.h  vr4,    vr1,    vr3
    vmaddwod.w.h  vr5,    vr1,    vr3
    vssrarni.hu.w vr5,    vr4,    mask_sh
    vssrlrni.bu.h vr1,    vr5,    0
    vpickod.w     vr4,    vr2,    vr1
    vilvl.b       vr0,    vr4,    vr1
    fst.s         f0,     a0,     0
    add.d         a0,     a0,     a1
    vstelm.w      vr0,    a0,     0,    1
    addi.d        a2,     a2,     16
    addi.d        a3,     a3,     16
    addi.d        a6,     a6,     8
    add.d         a0,     a0,     a1
    addi.w        a5,     a5,     -2
    blt           zero,   a5,     .MASK_W4_LSX
    b             .MASK_END_LSX
.MASK_W8_LSX:
    vld           vr0,    a2,     0
    vld           vr10,   a2,     16
    vld           vr1,    a3,     0
    vld           vr11,   a3,     16
    vld           vr22,   a6,     0

    vilvl.b       vr2,    vr19,   vr22
    vilvh.b       vr12,   vr19,   vr22
    vsub.h        vr3,    vr21,   vr2
    vsub.h        vr13,   vr21,   vr12

    vmulwev.w.h   vr4,    vr0,    vr2
    vmulwod.w.h   vr5,    vr0,    vr2
    vmulwev.w.h   vr14,   vr10,   vr12
    vmulwod.w.h   vr15,   vr10,   vr12
    vmaddwev.w.h  vr4,    vr1,    vr3
    vmaddwod.w.h  vr5,    vr1,    vr3
    vmaddwev.w.h  vr14,   vr11,   vr13
    vmaddwod.w.h  vr15,   vr11,   vr13
    vssrarni.hu.w vr14,   vr4,    mask_sh
    vssrarni.hu.w vr15,   vr5,    mask_sh
    vssrlrni.bu.h vr15,   vr14,   0
    vshuf4i.w     vr6,    vr15,   0x4E
    vilvl.b       vr0,    vr6,    vr15
    fst.d         f0,     a0,     0
    add.d         a0,     a0,     a1
    vstelm.d      vr0,    a0,     0,   1
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a6,     a6,     16
    add.d         a0,     a0,     a1
    addi.w        a5,     a5,     -2
    blt           zero,   a5,     .MASK_W8_LSX
    b             .MASK_END_LSX

.MASK_W16_LSX:
    vld           vr0,    a2,     0
    vld           vr10,   a2,     16
    vld           vr1,    a3,     0
    vld           vr11,   a3,     16
    vld           vr22,   a6,     0

    vilvl.b       vr2,    vr19,   vr22
    vilvh.b       vr12,   vr19,   vr22
    vsub.h        vr3,    vr21,   vr2
    vsub.h        vr13,   vr21,   vr12

    vmulwev.w.h   vr4,    vr0,    vr2
    vmulwod.w.h   vr5,    vr0,    vr2
    vmulwev.w.h   vr14,   vr10,   vr12
    vmulwod.w.h   vr15,   vr10,   vr12
    vmaddwev.w.h  vr4,    vr1,    vr3
    vmaddwod.w.h  vr5,    vr1,    vr3
    vmaddwev.w.h  vr14,   vr11,   vr13
    vmaddwod.w.h  vr15,   vr11,   vr13
    vssrarni.hu.w vr14,   vr4,    mask_sh
    vssrarni.hu.w vr15,   vr5,    mask_sh
    vssrlrni.bu.h vr15,   vr14,   0
    vshuf4i.w     vr6,    vr15,   0x4E
    vilvl.b       vr0,    vr6,    vr15
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a6,     a6,     16
    add.d         a0,     a0,     a1
    addi.w        a5,     a5,     -1
    blt           zero,   a5,     .MASK_W16_LSX
    b             .MASK_END_LSX
.MASK_W32_LSX:
.rept 2
    vld           vr0,    a2,     0
    vld           vr10,   a2,     16
    vld           vr1,    a3,     0
    vld           vr11,   a3,     16
    vld           vr22,   a6,     0
    vilvl.b       vr2,    vr19,   vr22
    vilvh.b       vr12,   vr19,   vr22
    vsub.h        vr3,    vr21,   vr2
    vsub.h        vr13,   vr21,   vr12
    vmulwev.w.h   vr4,    vr0,    vr2
    vmulwod.w.h   vr5,    vr0,    vr2
    vmulwev.w.h   vr14,   vr10,   vr12
    vmulwod.w.h   vr15,   vr10,   vr12
    vmaddwev.w.h  vr4,    vr1,    vr3
    vmaddwod.w.h  vr5,    vr1,    vr3
    vmaddwev.w.h  vr14,   vr11,   vr13
    vmaddwod.w.h  vr15,   vr11,   vr13
    vssrarni.hu.w vr14,   vr4,    mask_sh
    vssrarni.hu.w vr15,   vr5,    mask_sh
    vssrlrni.bu.h vr15,   vr14,   0
    vshuf4i.w     vr6,    vr15,   0x4E
    vilvl.b       vr0,    vr6,    vr15
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a6,     a6,     16
    addi.d        a0,     a0,     16
.endr
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    addi.w        a5,     a5,     -1
    blt           zero,   a5,     .MASK_W32_LSX
    b             .MASK_END_LSX
.MASK_W64_LSX:
.rept 4
    vld           vr0,    a2,     0
    vld           vr10,   a2,     16
    vld           vr1,    a3,     0
    vld           vr11,   a3,     16
    vld           vr22,   a6,     0
    vilvl.b       vr2,    vr19,   vr22
    vilvh.b       vr12,   vr19,   vr22
    vsub.h        vr3,    vr21,   vr2
    vsub.h        vr13,   vr21,   vr12
    vmulwev.w.h   vr4,    vr0,    vr2
    vmulwod.w.h   vr5,    vr0,    vr2
    vmulwev.w.h   vr14,   vr10,   vr12
    vmulwod.w.h   vr15,   vr10,   vr12
    vmaddwev.w.h  vr4,    vr1,    vr3
    vmaddwod.w.h  vr5,    vr1,    vr3
    vmaddwev.w.h  vr14,   vr11,   vr13
    vmaddwod.w.h  vr15,   vr11,   vr13
    vssrarni.hu.w vr14,   vr4,    mask_sh
    vssrarni.hu.w vr15,   vr5,    mask_sh
    vssrlrni.bu.h vr15,   vr14,   0
    vshuf4i.w     vr6,    vr15,   0x4E
    vilvl.b       vr0,    vr6,    vr15
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a6,     a6,     16
    addi.d        a0,     a0,     16
.endr
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    addi.w        a5,     a5,     -1
    blt           zero,   a5,     .MASK_W64_LSX
    b             .MASK_END_LSX
.MASK_W128_LSX:
.rept 8
    vld           vr0,    a2,     0
    vld           vr10,   a2,     16
    vld           vr1,    a3,     0
    vld           vr11,   a3,     16
    vld           vr22,   a6,     0
    vilvl.b       vr2,    vr19,   vr22
    vilvh.b       vr12,   vr19,   vr22
    vsub.h        vr3,    vr21,   vr2
    vsub.h        vr13,   vr21,   vr12
    vmulwev.w.h   vr4,    vr0,    vr2
    vmulwod.w.h   vr5,    vr0,    vr2
    vmulwev.w.h   vr14,   vr10,   vr12
    vmulwod.w.h   vr15,   vr10,   vr12
    vmaddwev.w.h  vr4,    vr1,    vr3
    vmaddwod.w.h  vr5,    vr1,    vr3
    vmaddwev.w.h  vr14,   vr11,   vr13
    vmaddwod.w.h  vr15,   vr11,   vr13
    vssrarni.hu.w vr14,   vr4,    mask_sh
    vssrarni.hu.w vr15,   vr5,    mask_sh
    vssrlrni.bu.h vr15,   vr14,   0
    vshuf4i.w     vr6,    vr15,   0x4E
    vilvl.b       vr0,    vr6,    vr15
    vst           vr0,    a0,     0
    addi.d        a2,     a2,     32
    addi.d        a3,     a3,     32
    addi.d        a6,     a6,     16
    addi.d        a0,     a0,     16
.endr
    add.d         t8,     t8,     a1
    add.d         a0,     t8,     zero
    addi.w        a5,     a5,     -1
    blt           zero,   a5,     .MASK_W128_LSX
.MASK_END_LSX:
endfunc

function mask_8bpc_lasx
    xvldi         xr21,   0x440   // 64
    xvxor.v       xr19,   xr19,   xr19
    addi.d        t8,     a0,     0
    clz.w         t0,     a4
    li.w          t1,     24
    sub.w         t0,     t0,      t1
    la.local      t1,     .MASK_LASX_JRTABLE
    alsl.d        t0,     t0,      t1,    1
    ld.h          t2,     t0,      0
    add.d         t1,     t1,      t2
    jirl          $r0,    t1,      0

    .align   3
.MASK_LASX_JRTABLE:
    .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE
    .hword .MASK_W64_LASX  - .MASK_LASX_JRTABLE
    .hword .MASK_W32_LASX  - .MASK_LASX_JRTABLE
    .hword .MASK_W16_LASX  - .MASK_LASX_JRTABLE
    .hword .MASK_W8_LASX   - .MASK_LASX_JRTABLE
    .hword .MASK_W4_LASX   - .MASK_LASX_JRTABLE

.MASK_W4_LASX:
    vld            vr0,    a2,     0
    vld            vr1,    a3,     0
    fld.d          f22,    a6,     0

    vilvl.h        vr4,    vr1,    vr0
    vilvh.h        vr14,   vr1,    vr0
    vilvl.b        vr2,    vr19,   vr22
    vsub.h         vr3,    vr21,   vr2
    xvpermi.q      xr14,   xr4,    0x20
    vilvl.h        vr5,    vr3,    vr2
    vilvh.h        vr15,   vr3,    vr2
    xvpermi.q      xr15,   xr5,    0x20
    xvmulwev.w.h   xr0,    xr14,   xr15
    xvmaddwod.w.h  xr0,    xr14,   xr15
    xvssrarni.hu.w xr1,    xr0,    mask_sh
    xvssrlni.bu.h  xr2,    xr1,    0
    fst.s          f2,     a0,     0
    add.d          a0,     a0,     a1
    xvstelm.w      xr2,    a0,     0,    4

    addi.d         a2,     a2,     16
    addi.d         a3,     a3,     16
    addi.d         a6,     a6,     8
    add.d          a0,     a0,     a1
    addi.w         a5,     a5,     -2
    blt            zero,   a5,     .MASK_W4_LASX
    b              .MASK_END_LASX

.MASK_W8_LASX:
    xvld           xr0,    a2,      0
    xvld           xr1,    a3,      0
    vld            vr22,   a6,      0

    vext2xv.hu.bu  xr2,    xr22
    xvsub.h        xr3,    xr21,    xr2
    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvssrarni.hu.w xr5,    xr4,     mask_sh
    xvssrlni.bu.h  xr1,    xr5,     0
    xvpickod.w     xr4,    xr2,     xr1
    xvilvl.b       xr0,    xr4,     xr1
    fst.d          f0,     a0,      0
    add.d          a0,     a0,      a1
    xvstelm.d      xr0,    a0,      0,    2

    addi.d         a2,     a2,      32
    addi.d         a3,     a3,      32
    addi.d         a6,     a6,      16
    add.d          a0,     a0,      a1
    addi.w         a5,     a5,      -2
    blt            zero,   a5,      .MASK_W8_LASX
    b              .MASK_END_LASX

.MASK_W16_LASX:
    xvld           xr0,    a2,      0
    xvld           xr1,    a3,      0
    vld            vr22,   a6,      0

    vext2xv.hu.bu  xr2,    xr22
    xvsub.h        xr3,    xr21,    xr2
    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvssrarni.hu.w xr5,    xr4,     mask_sh
    xvssrlni.bu.h  xr1,    xr5,     0
    xvpickod.w     xr4,    xr2,     xr1
    xvilvl.b       xr0,    xr4,     xr1
    xvpermi.d      xr1,    xr0,     0xD8
    vst            vr1,    a0,      0

    addi.d         a2,     a2,      32
    addi.d         a3,     a3,      32
    addi.d         a6,     a6,      16
    add.d          a0,     a0,      a1
    addi.w         a5,     a5,      -1
    blt            zero,   a5,      .MASK_W16_LASX
    b              .MASK_END_LASX
.MASK_W32_LASX:
    xvld           xr0,    a2,      0
    xvld           xr10,   a2,      32
    xvld           xr1,    a3,      0
    xvld           xr11,   a3,      32
    xvld           xr22,   a6,      0
    vext2xv.hu.bu  xr2,    xr22
    xvpermi.q      xr4,    xr22,    0x01
    vext2xv.hu.bu  xr12,   xr4
    xvsub.h        xr3,    xr21,    xr2
    xvsub.h        xr13,   xr21,    xr12

    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmulwev.w.h   xr14,   xr10,    xr12
    xvmulwod.w.h   xr15,   xr10,    xr12
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvmaddwev.w.h  xr14,   xr11,    xr13
    xvmaddwod.w.h  xr15,   xr11,    xr13
    xvssrarni.hu.w xr14,   xr4,     mask_sh
    xvssrarni.hu.w xr15,   xr5,     mask_sh
    xvssrlni.bu.h  xr15,   xr14,    0
    xvshuf4i.w     xr6,    xr15,    0x4E
    xvilvl.b       xr1,    xr6,     xr15
    xvpermi.d      xr0,    xr1,     0xD8
    xvst           xr0,    a0,      0

    addi.d         a2,     a2,      64
    addi.d         a3,     a3,      64
    addi.d         a6,     a6,      32
    add.d          a0,     a0,      a1
    addi.w         a5,     a5,      -1
    blt            zero,   a5,      .MASK_W32_LASX
    b              .MASK_END_LASX

.MASK_W64_LASX:
.rept 2
    xvld           xr0,    a2,      0
    xvld           xr10,   a2,      32
    xvld           xr1,    a3,      0
    xvld           xr11,   a3,      32
    xvld           xr22,   a6,      0
    vext2xv.hu.bu  xr2,    xr22
    xvpermi.q      xr4,    xr22,    0x01
    vext2xv.hu.bu  xr12,   xr4
    xvsub.h        xr3,    xr21,    xr2
    xvsub.h        xr13,   xr21,    xr12

    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmulwev.w.h   xr14,   xr10,    xr12
    xvmulwod.w.h   xr15,   xr10,    xr12
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvmaddwev.w.h  xr14,   xr11,    xr13
    xvmaddwod.w.h  xr15,   xr11,    xr13
    xvssrarni.hu.w xr14,   xr4,     mask_sh
    xvssrarni.hu.w xr15,   xr5,     mask_sh
    xvssrlni.bu.h  xr15,   xr14,    0
    xvshuf4i.w     xr6,    xr15,    0x4E
    xvilvl.b       xr1,    xr6,     xr15
    xvpermi.d      xr0,    xr1,     0xD8
    xvst           xr0,    a0,      0
    addi.d         a2,     a2,      64
    addi.d         a3,     a3,      64
    addi.d         a6,     a6,      32
    addi.d         a0,     a0,      32
.endr
    add.d          t8,     t8,     a1
    add.d          a0,     t8,     zero
    addi.w         a5,     a5,      -1
    blt            zero,   a5,      .MASK_W64_LASX
    b              .MASK_END_LASX

.MASK_W128_LASX:
.rept 4
    xvld           xr0,    a2,      0
    xvld           xr10,   a2,      32
    xvld           xr1,    a3,      0
    xvld           xr11,   a3,      32
    xvld           xr22,   a6,      0
    vext2xv.hu.bu  xr2,    xr22
    xvpermi.q      xr4,    xr22,    0x01
    vext2xv.hu.bu  xr12,   xr4
    xvsub.h        xr3,    xr21,    xr2
    xvsub.h        xr13,   xr21,    xr12

    xvmulwev.w.h   xr4,    xr0,     xr2
    xvmulwod.w.h   xr5,    xr0,     xr2
    xvmulwev.w.h   xr14,   xr10,    xr12
    xvmulwod.w.h   xr15,   xr10,    xr12
    xvmaddwev.w.h  xr4,    xr1,     xr3
    xvmaddwod.w.h  xr5,    xr1,     xr3
    xvmaddwev.w.h  xr14,   xr11,    xr13
    xvmaddwod.w.h  xr15,   xr11,    xr13
    xvssrarni.hu.w xr14,   xr4,     mask_sh
    xvssrarni.hu.w xr15,   xr5,     mask_sh
    xvssrlni.bu.h  xr15,   xr14,    0
    xvshuf4i.w     xr6,    xr15,    0x4E
    xvilvl.b       xr1,    xr6,     xr15
    xvpermi.d      xr0,    xr1,     0xD8
    xvst           xr0,    a0,      0

    addi.d         a2,     a2,      64
    addi.d         a3,     a3,      64
    addi.d         a6,     a6,      32
    addi.d         a0,     a0,      32
.endr
    add.d          t8,     t8,     a1
    add.d          a0,     t8,     zero
    addi.w         a5,     a5,      -1
    blt            zero,   a5,      .MASK_W128_LASX
.MASK_END_LASX:
endfunc

/*
static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
                     const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
                     uint8_t *mask, const int sign,
                     const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
*/
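/* w_mask_420 blends like mask_8bpc but derives the mask from the two
 * intermediates and writes it back 2x2-subsampled. A C sketch of the
 * per-pixel math for 8bpc, using dav1d's imin()/iclip_pixel() helpers
 * (38 and 64 are the constants loaded via vldi 0x426/0x440 below):
 *
 *   const int m = imin(38 + ((abs(tmp1[x] - tmp2[x]) + 8) >> 8), 64);
 *   dst[x] = iclip_pixel((tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10);
 *   // per 2x2 block of m values:
 *   // mask[x >> 1] = (m00 + m01 + m10 + m11 + 2 - sign) >> 2;
 */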
function w_mask_420_8bpc_lsx
    addi.d        sp,      sp,    -24
    fst.d         f24,     sp,    0
    fst.d         f25,     sp,    8
    fst.d         f26,     sp,    16
    vldi          vr20,    0x440
    vreplgr2vr.h  vr21,    a7
    vldi          vr22,    0x426

    clz.w         t0,      a4
    li.w          t1,      24
    sub.w         t0,      t0,      t1
    la.local      t1,      .WMASK420_LSX_JRTABLE
    alsl.d        t0,      t0,      t1,    1
    ld.h          t8,      t0,      0
    add.d         t1,      t1,      t8
    jirl          $r0,     t1,      0

    .align   3
.WMASK420_LSX_JRTABLE:
    .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W64_LSX  - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W32_LSX  - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W16_LSX  - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W8_LSX   - .WMASK420_LSX_JRTABLE
    .hword .WMASK420_W4_LSX   - .WMASK420_LSX_JRTABLE

.WMASK420_W4_LSX:
    vld           vr0,     a2,       0
    vld           vr1,     a2,       16
    vld           vr2,     a3,       0
    vld           vr3,     a3,       16
    addi.w        a5,      a5,       -4

    vabsd.h       vr4,     vr0,      vr2
    vabsd.h       vr5,     vr1,      vr3
    vaddi.hu      vr4,     vr4,      8
    vaddi.hu      vr5,     vr5,      8
    vsrli.h       vr4,     vr4,      8
    vsrli.h       vr5,     vr5,      8
    vadd.h        vr4,     vr4,      vr22
    vadd.h        vr5,     vr5,      vr22
    vmin.hu       vr6,     vr4,      vr20
    vmin.hu       vr7,     vr5,      vr20
    vsub.h        vr8,     vr20,     vr6
    vsub.h        vr9,     vr20,     vr7
    vmulwev.w.h   vr4,     vr6,      vr0
    vmulwod.w.h   vr5,     vr6,      vr0
    vmulwev.w.h   vr10,    vr7,      vr1
    vmulwod.w.h   vr11,    vr7,      vr1
    vmaddwev.w.h  vr4,     vr8,      vr2
    vmaddwod.w.h  vr5,     vr8,      vr2
    vmaddwev.w.h  vr10,    vr9,      vr3
    vmaddwod.w.h  vr11,    vr9,      vr3
    vilvl.w       vr0,     vr5,      vr4
    vilvh.w       vr1,     vr5,      vr4
    vilvl.w       vr2,     vr11,     vr10
    vilvh.w       vr3,     vr11,     vr10
    vssrarni.hu.w vr1,     vr0,      10
    vssrarni.hu.w vr3,     vr2,      10
    vssrlni.bu.h  vr3,     vr1,      0
    vstelm.w      vr3,     a0,       0,    0
    add.d         a0,      a0,       a1
    vstelm.w      vr3,     a0,       0,    1
    add.d         a0,      a0,       a1
    vstelm.w      vr3,     a0,       0,    2
    add.d         a0,      a0,       a1
    vstelm.w      vr3,     a0,       0,    3
    add.d         a0,      a0,       a1
    vpickev.h     vr0,     vr7,      vr6
    vpickod.h     vr1,     vr7,      vr6
    vadd.h        vr0,     vr0,      vr1
    vshuf4i.h     vr0,     vr0,      0xd8
    vhaddw.w.h    vr2,     vr0,      vr0
    vpickev.h     vr2,     vr2,      vr2
    vsub.h        vr2,     vr2,      vr21
    vaddi.hu      vr2,     vr2,      2
    vssrani.bu.h  vr2,     vr2,      2
    vstelm.w      vr2,     a6,       0,    0

    addi.d        a2,      a2,       32
    addi.d        a3,      a3,       32
    addi.d        a6,      a6,       4
    blt           zero,    a5,       .WMASK420_W4_LSX
    b             .END_W420

.WMASK420_W8_LSX:
    vld           vr0,     a2,       0
    vld           vr1,     a2,       16
    vld           vr2,     a3,       0
    vld           vr3,     a3,       16
    addi.w        a5,      a5,       -2

    vabsd.h       vr4,     vr0,      vr2
    vabsd.h       vr5,     vr1,      vr3
    vaddi.hu      vr4,     vr4,      8
    vaddi.hu      vr5,     vr5,      8
    vsrli.h       vr4,     vr4,      8
    vsrli.h       vr5,     vr5,      8
    vadd.h        vr4,     vr4,      vr22
    vadd.h        vr5,     vr5,      vr22
    vmin.hu       vr6,     vr4,      vr20
    vmin.hu       vr7,     vr5,      vr20
    vsub.h        vr8,     vr20,     vr6
    vsub.h        vr9,     vr20,     vr7
    vmulwev.w.h   vr4,     vr6,      vr0
    vmulwod.w.h   vr5,     vr6,      vr0
    vmulwev.w.h   vr10,    vr7,      vr1
    vmulwod.w.h   vr11,    vr7,      vr1
    vmaddwev.w.h  vr4,     vr8,      vr2
    vmaddwod.w.h  vr5,     vr8,      vr2
    vmaddwev.w.h  vr10,    vr9,      vr3
    vmaddwod.w.h  vr11,    vr9,      vr3
    vssrarni.hu.w vr10,    vr4,      10
    vssrarni.hu.w vr11,    vr5,      10
    vssrlni.bu.h  vr11,    vr10,     0
    vshuf4i.w     vr0,     vr11,     0x4E
    vilvl.b       vr3,     vr0,      vr11
    vstelm.d      vr3,     a0,       0,     0
    add.d         a0,      a0,       a1
    vstelm.d      vr3,     a0,       0,     1
    add.d         a0,      a0,       a1
    vpickev.h     vr0,     vr7,      vr6
    vpickod.h     vr1,     vr7,      vr6
    vadd.h        vr0,     vr0,      vr1
    vilvh.d       vr2,     vr0,      vr0
    vadd.h        vr2,     vr2,      vr0
    vsub.h        vr2,     vr2,      vr21
    vaddi.hu      vr2,     vr2,      2
    vssrani.bu.h  vr2,     vr2,      2
    vstelm.w      vr2,     a6,       0,     0

    addi.d        a2,      a2,       32
    addi.d        a3,      a3,       32
    addi.d        a6,      a6,       4
    blt           zero,    a5,       .WMASK420_W8_LSX
    b             .END_W420

.WMASK420_W16_LSX:
    vld           vr0,     a2,       0
    vld           vr1,     a2,       16
    alsl.d        a2,      a4,       a2,    1
    vld           vr2,     a2,       0
    vld           vr3,     a2,       16
    vld           vr4,     a3,       0
    vld           vr5,     a3,       16
    alsl.d        a3,      a4,       a3,    1
    vld           vr6,     a3,       0
    vld           vr7,     a3,       16

    vabsd.h       vr8,     vr0,      vr4
    vabsd.h       vr9,     vr1,      vr5
    vabsd.h       vr10,    vr2,      vr6
    vabsd.h       vr11,    vr3,      vr7
    vaddi.hu      vr8,     vr8,      8
    vaddi.hu      vr9,     vr9,      8
    vaddi.hu      vr10,    vr10,     8
    vaddi.hu      vr11,    vr11,     8
    vsrli.h       vr8,     vr8,      8
    vsrli.h       vr9,     vr9,      8
    vsrli.h       vr10,    vr10,     8
    vsrli.h       vr11,    vr11,     8
    vadd.h        vr8,     vr8,      vr22
    vadd.h        vr9,     vr9,      vr22
    vadd.h        vr10,    vr10,     vr22
    vadd.h        vr11,    vr11,     vr22
    vmin.hu       vr12,    vr8,      vr20
    vmin.hu       vr13,    vr9,      vr20
    vmin.hu       vr14,    vr10,     vr20
    vmin.hu       vr15,    vr11,     vr20
    vsub.h        vr16,    vr20,     vr12
    vsub.h        vr17,    vr20,     vr13
    vsub.h        vr18,    vr20,     vr14
    vsub.h        vr19,    vr20,     vr15
    vmulwev.w.h   vr8,     vr12,     vr0
    vmulwod.w.h   vr9,     vr12,     vr0
    vmulwev.w.h   vr10,    vr13,     vr1
    vmulwod.w.h   vr11,    vr13,     vr1
    vmulwev.w.h   vr23,    vr14,     vr2
    vmulwod.w.h   vr24,    vr14,     vr2
    vmulwev.w.h   vr25,    vr15,     vr3
    vmulwod.w.h   vr26,    vr15,     vr3
    vmaddwev.w.h  vr8,     vr16,     vr4
    vmaddwod.w.h  vr9,     vr16,     vr4
    vmaddwev.w.h  vr10,    vr17,     vr5
    vmaddwod.w.h  vr11,    vr17,     vr5
    vmaddwev.w.h  vr23,    vr18,     vr6
    vmaddwod.w.h  vr24,    vr18,     vr6
    vmaddwev.w.h  vr25,    vr19,     vr7
    vmaddwod.w.h  vr26,    vr19,     vr7
    vssrarni.hu.w vr10,    vr8,      10
    vssrarni.hu.w vr11,    vr9,      10
    vssrarni.hu.w vr25,    vr23,     10
    vssrarni.hu.w vr26,    vr24,     10
    vssrlni.bu.h  vr11,    vr10,     0
    vssrlni.bu.h  vr26,    vr25,     0
    vshuf4i.w     vr0,     vr11,     0x4E
    vshuf4i.w     vr1,     vr26,     0x4E
    vilvl.b       vr3,     vr0,      vr11
    vilvl.b       vr7,     vr1,      vr26
    vst           vr3,     a0,       0
    vstx          vr7,     a0,       a1
    vpickev.h     vr0,     vr13,     vr12
    vpickod.h     vr1,     vr13,     vr12
    vpickev.h     vr2,     vr15,     vr14
    vpickod.h     vr3,     vr15,     vr14
    vadd.h        vr4,     vr0,      vr1
    vadd.h        vr5,     vr2,      vr3
    vadd.h        vr4,     vr4,      vr5
    vsub.h        vr4,     vr4,      vr21
    vssrarni.bu.h vr4,     vr4,      2
    vstelm.d      vr4,     a6,       0,    0

    alsl.d        a2,      a4,       a2,   1
    alsl.d        a3,      a4,       a3,   1
    alsl.d        a0,      a1,       a0,   1
    addi.d        a6,      a6,       8
    addi.w        a5,      a5,       -2
    blt           zero,    a5,       .WMASK420_W16_LSX
    b             .END_W420

.WMASK420_W32_LSX:
.WMASK420_W64_LSX:
.WMASK420_W128_LSX:

.LOOP_W32_420_LSX:
    add.d         t1,       a2,       zero
    add.d         t2,       a3,       zero
    add.d         t3,       a0,       zero
    add.d         t4,       a6,       zero
    alsl.d        t5,       a4,       t1,     1
    alsl.d        t6,       a4,       t2,     1
    or            t7,       a4,       a4

.W32_420_LSX:
    vld           vr0,      t1,       0
    vld           vr1,      t1,       16
    vld           vr2,      t2,       0
    vld           vr3,      t2,       16
    vld           vr4,      t5,       0
    vld           vr5,      t5,       16
    vld           vr6,      t6,       0
    vld           vr7,      t6,       16
    addi.d        t1,       t1,       32
    addi.d        t2,       t2,       32
    addi.d        t5,       t5,       32
    addi.d        t6,       t6,       32
    addi.w        t7,       t7,       -16
    vabsd.h       vr8,      vr0,      vr2
    vabsd.h       vr9,      vr1,      vr3
    vabsd.h       vr10,     vr4,      vr6
    vabsd.h       vr11,     vr5,      vr7
    vaddi.hu      vr8,      vr8,      8
    vaddi.hu      vr9,      vr9,      8
    vaddi.hu      vr10,     vr10,     8
    vaddi.hu      vr11,     vr11,     8
    vsrli.h       vr8,      vr8,      8
    vsrli.h       vr9,      vr9,      8
    vsrli.h       vr10,     vr10,     8
    vsrli.h       vr11,     vr11,     8
    vadd.h        vr8,      vr8,      vr22
    vadd.h        vr9,      vr9,      vr22
    vadd.h        vr10,     vr10,     vr22
    vadd.h        vr11,     vr11,     vr22
    vmin.hu       vr12,     vr8,      vr20
    vmin.hu       vr13,     vr9,      vr20
    vmin.hu       vr14,     vr10,     vr20
    vmin.hu       vr15,     vr11,     vr20
    vsub.h        vr16,     vr20,     vr12
    vsub.h        vr17,     vr20,     vr13
    vsub.h        vr18,     vr20,     vr14
    vsub.h        vr19,     vr20,     vr15
    vmulwev.w.h   vr8,      vr12,     vr0
    vmulwod.w.h   vr9,      vr12,     vr0
    vmulwev.w.h   vr10,     vr13,     vr1
    vmulwod.w.h   vr11,     vr13,     vr1
    vmulwev.w.h   vr23,     vr14,     vr4
    vmulwod.w.h   vr24,     vr14,     vr4
    vmulwev.w.h   vr25,     vr15,     vr5
    vmulwod.w.h   vr26,     vr15,     vr5
    vmaddwev.w.h  vr8,      vr16,     vr2
    vmaddwod.w.h  vr9,      vr16,     vr2
    vmaddwev.w.h  vr10,     vr17,     vr3
    vmaddwod.w.h  vr11,     vr17,     vr3
    vmaddwev.w.h  vr23,     vr18,     vr6
    vmaddwod.w.h  vr24,     vr18,     vr6
    vmaddwev.w.h  vr25,     vr19,     vr7
    vmaddwod.w.h  vr26,     vr19,     vr7
    vssrarni.hu.w vr10,     vr8,      10
    vssrarni.hu.w vr11,     vr9,      10
    vssrarni.hu.w vr25,     vr23,     10
    vssrarni.hu.w vr26,     vr24,     10
    vssrlni.bu.h  vr11,     vr10,     0
    vssrlni.bu.h  vr26,     vr25,     0
    vshuf4i.w     vr8,      vr11,     0x4E
    vshuf4i.w     vr9,      vr26,     0x4E
    vilvl.b       vr3,      vr8,      vr11
    vilvl.b       vr7,      vr9,      vr26
    vst           vr3,      t3,       0
    vstx          vr7,      a1,       t3
    addi.d        t3,       t3,       16
    vpickev.h     vr8,      vr13,     vr12
    vpickod.h     vr9,      vr13,     vr12
    vpickev.h     vr10,     vr15,     vr14
    vpickod.h     vr11,     vr15,     vr14
    vadd.h        vr8,      vr8,      vr9
    vadd.h        vr10,     vr10,     vr11
    vadd.h        vr12,     vr8,      vr10
    vsub.h        vr12,     vr12,     vr21
    vssrarni.bu.h vr12,     vr12,     2
    vstelm.d      vr12,     t4,       0,     0
    addi.d        t4,       t4,       8
    bne           t7,       zero,     .W32_420_LSX

    alsl.d        a2,       a4,       a2,     2
    alsl.d        a3,       a4,       a3,     2
    alsl.d        a0,       a1,       a0,     1
    srai.w        t8,       a4,       1
    add.d         a6,       a6,       t8
    addi.w        a5,       a5,       -2
    blt           zero,     a5,       .LOOP_W32_420_LSX

.END_W420:
    fld.d            f24,     sp,    0
    fld.d            f25,     sp,    8
    fld.d            f26,     sp,    16
    addi.d           sp,      sp,    24
endfunc

function w_mask_420_8bpc_lasx
    xvldi          xr20,    0x440
    xvreplgr2vr.h  xr21,    a7
    xvldi          xr22,    0x426

    clz.w          t0,      a4
    li.w           t1,      24
    sub.w          t0,      t0,      t1
    la.local       t1,      .WMASK420_LASX_JRTABLE
    alsl.d         t0,      t0,      t1,    1
    ld.h           t8,      t0,      0
    add.d          t1,      t1,      t8
    jirl           $r0,     t1,      0

    .align   3
.WMASK420_LASX_JRTABLE:
    .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W64_LASX  - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W32_LASX  - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W16_LASX  - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W8_LASX   - .WMASK420_LASX_JRTABLE
    .hword .WMASK420_W4_LASX   - .WMASK420_LASX_JRTABLE

.WMASK420_W4_LASX:
    xvld           xr0,     a2,     0
    xvld           xr1,     a3,     0
    addi.w         a5,      a5,     -4

    xvabsd.h       xr2,     xr0,    xr1
    xvaddi.hu      xr2,     xr2,    8
    xvsrli.h       xr2,     xr2,    8
    xvadd.h        xr2,     xr2,    xr22
    xvmin.hu       xr3,     xr2,    xr20
    xvsub.h        xr4,     xr20,   xr3
    xvmulwev.w.h   xr5,     xr3,    xr0
    xvmulwod.w.h   xr6,     xr3,    xr0
    xvmaddwev.w.h  xr5,     xr4,    xr1
    xvmaddwod.w.h  xr6,     xr4,    xr1
    xvilvl.w       xr7,     xr6,    xr5
    xvilvh.w       xr8,     xr6,    xr5
    xvssrarni.hu.w xr8,     xr7,    10
    xvssrlni.bu.h  xr9,     xr8,    0
    vstelm.w       vr9,     a0,     0,     0
    add.d          a0,      a0,     a1
    vstelm.w       vr9,     a0,     0,     1
    add.d          a0,      a0,     a1
    xvstelm.w      xr9,     a0,     0,     4
    add.d          a0,      a0,     a1
    xvstelm.w      xr9,     a0,     0,     5
    add.d          a0,      a0,     a1

    xvhaddw.w.h    xr3,     xr3,    xr3
    xvpermi.d      xr4,     xr3,    0xb1
    xvadd.h        xr3,     xr3,    xr4
    xvpickev.h     xr3,     xr3,    xr3
    xvsub.h        xr3,     xr3,    xr21
    xvssrarni.bu.h xr3,     xr3,    2
    vstelm.h       vr3,     a6,     0,     0
    xvstelm.h      xr3,     a6,     2,     8

    addi.d         a2,     a2,      32
    addi.d         a3,     a3,      32
    addi.d         a6,     a6,      4
    blt            zero,   a5,      .WMASK420_W4_LASX
    b              .END_W420_LASX

.WMASK420_W8_LASX:
    xvld           xr0,      a2,     0
    xvld           xr1,      a2,     32
    xvld           xr2,      a3,     0
    xvld           xr3,      a3,     32
    addi.w         a5,       a5,     -4

    xvabsd.h       xr4,      xr0,    xr2
    xvabsd.h       xr5,      xr1,    xr3
    xvaddi.hu      xr4,      xr4,    8
    xvaddi.hu      xr5,      xr5,    8
    xvsrli.h       xr4,      xr4,    8
    xvsrli.h       xr5,      xr5,    8
    xvadd.h        xr4,      xr4,    xr22
    xvadd.h        xr5,      xr5,    xr22
    xvmin.hu       xr6,      xr4,    xr20
    xvmin.hu       xr7,      xr5,    xr20
    xvsub.h        xr8,      xr20,   xr6
    xvsub.h        xr9,      xr20,   xr7
    xvmulwev.w.h   xr10,     xr6,    xr0
    xvmulwod.w.h   xr11,     xr6,    xr0
    xvmulwev.w.h   xr12,     xr7,    xr1
    xvmulwod.w.h   xr13,     xr7,    xr1
    xvmaddwev.w.h  xr10,     xr8,    xr2
    xvmaddwod.w.h  xr11,     xr8,    xr2
    xvmaddwev.w.h  xr12,     xr9,    xr3
    xvmaddwod.w.h  xr13,     xr9,    xr3
    xvssrarni.hu.w xr12,     xr10,   10
    xvssrarni.hu.w xr13,     xr11,   10
    xvssrlni.bu.h  xr13,     xr12,   0
    xvshuf4i.w     xr1,      xr13,   0x4E
    xvilvl.b       xr17,     xr1,    xr13
    vstelm.d       vr17,     a0,     0,     0
    add.d          a0,       a0,     a1
    xvstelm.d      xr17,     a0,     0,     2
    add.d          a0,       a0,     a1
    xvstelm.d      xr17,     a0,     0,     1
    add.d          a0,       a0,     a1
    xvstelm.d      xr17,     a0,     0,     3
    add.d          a0,       a0,     a1

    xvhaddw.w.h    xr6,      xr6,    xr6
    xvhaddw.w.h    xr7,      xr7,    xr7
    xvpickev.h     xr8,      xr7,    xr6
    xvpermi.q      xr9,      xr8,    0x01
    vadd.h         vr8,      vr8,    vr9
    vsub.h         vr8,      vr8,    vr21
    vssrarni.bu.h  vr8,      vr8,    2
    vstelm.d       vr8,      a6,     0,    0
    addi.d         a2,       a2,     64
    addi.d         a3,       a3,     64
    addi.d         a6,       a6,     8
    blt            zero,     a5,     .WMASK420_W8_LASX
    b              .END_W420_LASX

.WMASK420_W16_LASX:
    xvld           xr0,      a2,     0
    xvld           xr1,      a2,     32
    xvld           xr2,      a3,     0
    xvld           xr3,      a3,     32
    addi.w         a5,       a5,     -2

    xvabsd.h       xr4,      xr0,    xr2
    xvabsd.h       xr5,      xr1,    xr3
    xvaddi.hu      xr4,      xr4,    8
    xvaddi.hu      xr5,      xr5,    8
    xvsrli.h       xr4,      xr4,    8
    xvsrli.h       xr5,      xr5,    8
    xvadd.h        xr4,      xr4,    xr22
    xvadd.h        xr5,      xr5,    xr22
    xvmin.hu       xr4,      xr4,    xr20
    xvmin.hu       xr5,      xr5,    xr20
    xvsub.h        xr6,      xr20,   xr4
    xvsub.h        xr7,      xr20,   xr5
    xvmulwev.w.h   xr8,      xr4,    xr0
    xvmulwod.w.h   xr9,      xr4,    xr0
    xvmulwev.w.h   xr10,     xr5,    xr1
    xvmulwod.w.h   xr11,     xr5,    xr1
    xvmaddwev.w.h  xr8,      xr6,    xr2
    xvmaddwod.w.h  xr9,      xr6,    xr2
    xvmaddwev.w.h  xr10,     xr7,    xr3
    xvmaddwod.w.h  xr11,     xr7,    xr3
    xvssrarni.hu.w xr10,     xr8,    10
    xvssrarni.hu.w xr11,     xr9,    10
    xvssrlni.bu.h  xr11,     xr10,   0
    xvshuf4i.w     xr8,      xr11,   0x4E
    xvilvl.b       xr15,     xr8,    xr11
    xvpermi.d      xr16,     xr15,   0xd8
    vst            vr16,     a0,     0
    add.d          a0,       a0,     a1
    xvpermi.q      xr16,     xr16,   0x01
    vst            vr16,     a0,     0
    add.d          a0,       a0,     a1

    xvhaddw.w.h    xr4,      xr4,    xr4
    xvhaddw.w.h    xr5,      xr5,    xr5
    xvadd.h        xr4,      xr5,    xr4
    xvpickev.h     xr6,      xr4,    xr4
    xvpermi.d      xr7,      xr6,    0x08
    vsub.h         vr7,      vr7,    vr21
    vssrarni.bu.h  vr7,      vr7,    2
    vstelm.d       vr7,      a6,     0,    0

    addi.d         a2,       a2,     64
    addi.d         a3,       a3,     64
    addi.d         a6,       a6,     8
    blt            zero,     a5,     .WMASK420_W16_LASX
    b              .END_W420_LASX

.WMASK420_W32_LASX:
.WMASK420_W64_LASX:
.WMASK420_W128_LASX:

.LOOP_W32_420_LASX:
    add.d          t1,       a2,       zero
    add.d          t2,       a3,       zero
    add.d          t3,       a0,       zero
    add.d          t4,       a6,       zero
    alsl.d         t5,       a4,       t1,     1
    alsl.d         t6,       a4,       t2,     1
    or             t7,       a4,       a4
.W32_420_LASX:
    xvld           xr0,      t1,       0
    xvld           xr1,      t2,       0
    xvld           xr2,      t5,       0
    xvld           xr3,      t6,       0
    addi.d         t1,       t1,       32
    addi.d         t2,       t2,       32
    addi.d         t5,       t5,       32
    addi.d         t6,       t6,       32
    addi.w         t7,       t7,       -16
    xvabsd.h       xr4,      xr0,      xr1
    xvabsd.h       xr5,      xr2,      xr3
    xvaddi.hu      xr4,      xr4,      8
    xvaddi.hu      xr5,      xr5,      8
    xvsrli.h       xr4,      xr4,      8
    xvsrli.h       xr5,      xr5,      8
    xvadd.h        xr4,      xr4,      xr22
    xvadd.h        xr5,      xr5,      xr22
    xvmin.hu       xr6,      xr4,      xr20
    xvmin.hu       xr7,      xr5,      xr20
    xvsub.h        xr8,      xr20,     xr6
    xvsub.h        xr9,      xr20,     xr7
    xvmulwev.w.h   xr10,     xr6,      xr0
    xvmulwod.w.h   xr11,     xr6,      xr0
    xvmulwev.w.h   xr12,     xr7,      xr2
    xvmulwod.w.h   xr13,     xr7,      xr2
    xvmaddwev.w.h  xr10,     xr8,      xr1
    xvmaddwod.w.h  xr11,     xr8,      xr1
    xvmaddwev.w.h  xr12,     xr9,      xr3
    xvmaddwod.w.h  xr13,     xr9,      xr3
    xvssrarni.hu.w xr12,     xr10,     10
    xvssrarni.hu.w xr13,     xr11,     10
    xvssrlni.bu.h  xr13,     xr12,     0
    xvshuf4i.w     xr10,     xr13,     0x4E
    xvilvl.b       xr17,     xr10,     xr13
    xvpermi.d      xr18,     xr17,     0x08
    xvpermi.d      xr19,     xr17,     0x0d
    vst            vr18,     t3,       0
    vstx           vr19,     t3,       a1
    addi.d         t3,       t3,       16

    xvhaddw.w.h    xr6,      xr6,      xr6
    xvhaddw.w.h    xr7,      xr7,      xr7
    xvadd.h        xr6,      xr7,      xr6
    xvpickev.h     xr7,      xr6,      xr6
    xvpermi.d      xr8,      xr7,      0x08
    vsub.h         vr9,      vr8,      vr21
    vssrarni.bu.h  vr9,      vr9,      2
    vstelm.d       vr9,      t4,       0,      0
    addi.d         t4,       t4,       8
    bne            t7,       zero,     .W32_420_LASX

    alsl.d         a2,       a4,       a2,     2
    alsl.d         a3,       a4,       a3,     2
    alsl.d         a0,       a1,       a0,     1
    srai.w         t8,       a4,       1
    add.d          a6,       a6,       t8
    addi.w         a5,       a5,       -2
    blt            zero,     a5,       .LOOP_W32_420_LASX

.END_W420_LASX:
endfunc

#undef mask_sh

.macro  vhaddw.d.h  in0
    vhaddw.w.h  \in0,  \in0,  \in0
    vhaddw.d.w  \in0,  \in0,  \in0
.endm
.macro  vhaddw.q.w  in0
    vhaddw.d.w  \in0,  \in0,  \in0
    vhaddw.q.d  \in0,  \in0,  \in0
.endm
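/* The two shorthands above chain pairwise horizontal adds: vhaddw.d.h
 * reduces each group of four halfwords to one doubleword sum, and
 * vhaddw.q.w reduces each group of four words to one quadword sum; the
 * 8-tap filter macros below use them to finish their dot products. */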
.macro PUT_H_8W in0
    vbsrl.v          vr2,    \in0,  1
    vbsrl.v          vr3,    \in0,  2
    vbsrl.v          vr4,    \in0,  3
    vbsrl.v          vr5,    \in0,  4
    vbsrl.v          vr6,    \in0,  5
    vbsrl.v          vr7,    \in0,  6
    vbsrl.v          vr10,   \in0,  7
    vilvl.d          vr2,    vr2,   \in0
    vilvl.d          vr3,    vr4,   vr3
    vilvl.d          vr4,    vr6,   vr5
    vilvl.d          vr5,    vr10,  vr7
    vdp2.h.bu.b      \in0,   vr2,   vr8
    vdp2.h.bu.b      vr2,    vr3,   vr8
    vdp2.h.bu.b      vr3,    vr4,   vr8
    vdp2.h.bu.b      vr4,    vr5,   vr8
    vhaddw.d.h       \in0
    vhaddw.d.h       vr2
    vhaddw.d.h       vr3
    vhaddw.d.h       vr4
    vpickev.w        \in0,   vr2,   \in0
    vpickev.w        vr2,    vr4,   vr3
    vpickev.h        \in0,   vr2,   \in0
    vadd.h           \in0,   \in0,  vr9
.endm
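/* PUT_H_8W filters one 8-pixel row horizontally: the vbsrl.v ladder forms
 * the eight overlapping 8-tap source windows, vdp2.h.bu.b takes the
 * u8 x s8 dot products, vhaddw.d.h completes each 8-tap sum, and the
 * pick/pack sequence plus the vr9 add (a bias presumably set up by the
 * caller) packs the eight results into one halfword vector. */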
.macro FILTER_8TAP_4W in0
    vbsrl.v          vr10,   \in0,  1
    vbsrl.v          vr11,   \in0,  2
    vbsrl.v          vr12,   \in0,  3
    vilvl.d          vr10,   vr10,  \in0
    vilvl.d          vr11,   vr12,  vr11
    vdp2.h.bu.b      vr7,    vr10,  vr8
    vdp2.h.bu.b      vr10,   vr11,  vr8
    vhaddw.d.h       vr7
    vhaddw.d.h       vr10
    vpickev.w        \in0,   vr10,  vr7
.endm
.macro FILTER_8TAP_8W in0
    vbsrl.v         vr10,    \in0,  1
    vbsrl.v         vr11,    \in0,  2
    vbsrl.v         vr12,    \in0,  3
    vbsrl.v         vr13,    \in0,  4
    vbsrl.v         vr14,    \in0,  5
    vbsrl.v         vr15,    \in0,  6
    vbsrl.v         vr16,    \in0,  7
    vilvl.d         vr10,    vr10,  \in0
    vilvl.d         vr11,    vr12,  vr11
    vilvl.d         vr12,    vr14,  vr13
    vilvl.d         vr13,    vr16,  vr15
    vdp2.h.bu.b     vr14,    vr10,  vr8
    vdp2.h.bu.b     vr15,    vr11,  vr8
    vdp2.h.bu.b     vr16,    vr12,  vr8
    vdp2.h.bu.b     vr17,    vr13,  vr8
    vhaddw.d.h      vr14
    vhaddw.d.h      vr15
    vhaddw.d.h      vr16
    vhaddw.d.h      vr17
    vpickev.w       vr13,    vr15,  vr14
    vpickev.w       vr14,    vr17,  vr16
    vpickev.h       \in0,    vr14,  vr13 // x0 ... x7
    vsrari.h        \in0,    \in0,  2
.endm
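/* FILTER_8TAP_4W/8W follow the same windowing and dot-product scheme as
 * PUT_H_8W; FILTER_8TAP_8W additionally rounds the packed result by 2
 * (vsrari.h), which matches dav1d's intermediate precision for a
 * following vertical pass. */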
.macro FILTER_8TAP_8W_CLIP_STORE
    vdp2.w.h        vr12,    vr0,   vr9
    vdp2.w.h        vr13,    vr1,   vr9
    vdp2.w.h        vr14,    vr2,   vr9
    vdp2.w.h        vr15,    vr3,   vr9
    vdp2.w.h        vr16,    vr4,   vr9
    vdp2.w.h        vr17,    vr5,   vr9
    vdp2.w.h        vr18,    vr6,   vr9
    vdp2.w.h        vr19,    vr7,   vr9
    vhaddw.q.w      vr12
    vhaddw.q.w      vr13
    vhaddw.q.w      vr14
    vhaddw.q.w      vr15
    vhaddw.q.w      vr16
    vhaddw.q.w      vr17
    vhaddw.q.w      vr18
    vhaddw.q.w      vr19
    vpackev.w       vr12,    vr13,  vr12
    vpackev.w       vr13,    vr15,  vr14
    vpackev.d       vr12,    vr13,  vr12
    vpackev.w       vr14,    vr17,  vr16
    vpackev.w       vr15,    vr19,  vr18
    vpackev.d       vr13,    vr15,  vr14
    vssrarni.hu.w   vr13,    vr12,  10
    vssrani.bu.h    vr13,    vr13,  0
    vstelm.d        vr13,    a0,    0,   0
    add.d           a0,      a0,    a1
.endm
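/* FILTER_8TAP_8W_CLIP_STORE runs the vertical 8-tap filter held in vr9
 * over the eight column registers vr0..vr7 (vdp2.w.h plus the vhaddw.q.w
 * reduction yields one full 8-tap sum per column), then rounds by 10,
 * saturates to the 8-bit pixel range and stores one 8-pixel row. */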
.macro VEXTRINS_Hx8 in0
    vextrins.h      vr0,     \in0,  0x70
    vextrins.h      vr1,     \in0,  0x71
    vextrins.h      vr2,     \in0,  0x72
    vextrins.h      vr3,     \in0,  0x73
    vextrins.h      vr4,     \in0,  0x74
    vextrins.h      vr5,     \in0,  0x75
    vextrins.h      vr6,     \in0,  0x76
    vextrins.h      vr7,     \in0,  0x77
.endm
.macro VBSRL_Vx8
    vbsrl.v         vr0,     vr0,   2
    vbsrl.v         vr1,     vr1,   2
    vbsrl.v         vr2,     vr2,   2
    vbsrl.v         vr3,     vr3,   2
    vbsrl.v         vr4,     vr4,   2
    vbsrl.v         vr5,     vr5,   2
    vbsrl.v         vr6,     vr6,   2
    vbsrl.v         vr7,     vr7,   2
.endm
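/* VEXTRINS_Hx8 inserts a new filtered value into lane 7 of each of the
 * eight column registers and VBSRL_Vx8 shifts every column down by one
 * halfword; together they implement the sliding eight-row window used by
 * the vertical filter. */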
2746
2747.macro PUT_8TAP_8BPC_LSX lable
2748    li.w             t0,     4
2749    la.local         t6,     dav1d_mc_subpel_filters
2750    slli.d           t2,     a3,    1  //src_stride*2
2751    add.d            t3,     t2,    a3 //src_stride*3
2752    slli.d           t4,     t2,    1  //src_stride*4
2753
2754    bnez             a6,     .l_\lable\()put_h //mx
2755    bnez             a7,     .l_\lable\()put_v //my
2756
    clz.w            t1,     a4
    li.w             t5,     24
    sub.w            t1,     t1,    t5
    la.local         t5,     .l_\lable\()put_hv0_jtable
    alsl.d           t1,     t1,    t5,   3
    ld.d             t6,     t1,    0
    add.d            t5,     t5,    t6
    jirl             $r0,    t5,    0

    .align   3
.l_\lable\()put_hv0_jtable:
    .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_64w  - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_32w  - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_16w  - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_8w   - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_4w   - .l_\lable\()put_hv0_jtable
    .dword .l_\lable\()put_hv0_2w   - .l_\lable\()put_hv0_jtable

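/*
 * hv0 (no horizontal or vertical filter): a straight copy, two rows per
 * iteration, specialized per width.
 */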
.l_\lable\()put_hv0_2w:
    vldrepl.h        vr0,    a2,    0
    add.d            a2,     a2,    a3
    vldrepl.h        vr1,    a2,    0
    vstelm.h         vr0,    a0,    0,     0
    add.d            a0,     a0,    a1
    vstelm.h         vr1,    a0,    0,     0
    add.d            a2,     a2,    a3
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_2w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_4w:
    fld.s            f0,     a2,    0
    fldx.s           f1,     a2,    a3
    fst.s            f0,     a0,    0
    fstx.s           f1,     a0,    a1
    alsl.d           a2,     a3,    a2,    1
    alsl.d           a0,     a1,    a0,    1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_4w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_8w:
    fld.d            f0,     a2,    0
    fldx.d           f1,     a2,    a3
    fst.d            f0,     a0,    0
    fstx.d           f1,     a0,    a1
    alsl.d           a2,     a3,    a2,    1
    alsl.d           a0,     a1,    a0,    1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_8w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_16w:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    vst              vr0,    a0,    0
    vstx             vr1,    a0,    a1
    alsl.d           a2,     a3,    a2,    1
    alsl.d           a0,     a1,    a0,    1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_16w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_32w:
    vld              vr0,    a2,    0
    vld              vr1,    a2,    16
    add.d            a2,     a2,    a3
    vld              vr2,    a2,    0
    vld              vr3,    a2,    16
    vst              vr0,    a0,    0
    vst              vr1,    a0,    16
    add.d            a0,     a0,    a1
    vst              vr2,    a0,    0
    vst              vr3,    a0,    16
    add.d            a2,     a2,    a3
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_32w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_64w:
    vld              vr0,    a2,    0
    vld              vr1,    a2,    16
    vld              vr2,    a2,    32
    vld              vr3,    a2,    48
    add.d            a2,     a2,    a3
    vld              vr4,    a2,    0
    vld              vr5,    a2,    16
    vld              vr6,    a2,    32
    vld              vr7,    a2,    48
    add.d            a2,     a2,    a3
    vst              vr0,    a0,    0
    vst              vr1,    a0,    16
    vst              vr2,    a0,    32
    vst              vr3,    a0,    48
    add.d            a0,     a0,    a1
    vst              vr4,    a0,    0
    vst              vr5,    a0,    16
    vst              vr6,    a0,    32
    vst              vr7,    a0,    48
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_64w
    b                .l_\lable\()end_put_8tap
.l_\lable\()put_hv0_128w:
    vld              vr0,    a2,    0
    vld              vr1,    a2,    16
    vld              vr2,    a2,    32
    vld              vr3,    a2,    48
    vld              vr4,    a2,    64
    vld              vr5,    a2,    80
    vld              vr6,    a2,    96
    vld              vr7,    a2,    112
    add.d            a2,     a2,    a3
    vld              vr8,    a2,    0
    vld              vr9,    a2,    16
    vld              vr10,   a2,    32
    vld              vr11,   a2,    48
    vld              vr12,   a2,    64
    vld              vr13,   a2,    80
    vld              vr14,   a2,    96
    vld              vr15,   a2,    112
    add.d            a2,     a2,    a3
    vst              vr0,    a0,    0
    vst              vr1,    a0,    16
    vst              vr2,    a0,    32
    vst              vr3,    a0,    48
    vst              vr4,    a0,    64
    vst              vr5,    a0,    80
    vst              vr6,    a0,    96
    vst              vr7,    a0,    112
    add.d            a0,     a0,    a1
    vst              vr8,    a0,    0
    vst              vr9,    a0,    16
    vst              vr10,   a0,    32
    vst              vr11,   a0,    48
    vst              vr12,   a0,    64
    vst              vr13,   a0,    80
    vst              vr14,   a0,    96
    vst              vr15,   a0,    112
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv0_128w
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_h:
    bnez             a7,     .l_\lable\()put_hv // if (fh && fv)
    ld.d             t5,     sp,    0  //filter_type
    andi             t1,     t5,    3
    blt              t0,     a4,    .l_\lable\()put_h_idx_fh
    andi             t1,     t5,    1
    addi.w           t1,     t1,    3

.l_\lable\()put_h_idx_fh:
    addi.w           t5,     zero,  120
    mul.w            t1,     t1,    t5
    addi.w           t5,     a6,    -1
    slli.w           t5,     t5,    3
    add.w            t1,     t1,    t5
    add.d            t1,     t6,    t1 //fh's offset
    vldrepl.d        vr8,    t1,    0
    addi.d           a2,     a2,    -3
    li.w             t1,     34
    vreplgr2vr.h     vr9,    t1

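/*
 * At this point vr8 holds the eight signed horizontal taps and vr9 the
 * rounding bias 34. The index math above corresponds roughly to the
 * following C, assuming the dav1d_mc_subpel_filters layout of
 * 15 subpel positions x 8 taps (= 120 bytes) per filter set:
 *
 *   int idx = (w > 4) ? (filter_type & 3) : 3 + (filter_type & 1);
 *   const int8_t *fh = (const int8_t *)dav1d_mc_subpel_filters
 *                    + idx * 120 + (mx - 1) * 8;
 *
 * Each h-only output pixel is then produced as
 * sat_u8((sum + 34) >> 6), via vadd.h + vssrani.bu.h below.
 */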
    clz.w            t1,     a4
    li.w             t5,     24
    sub.w            t1,     t1,    t5
    la.local         t5,     .l_\lable\()put_h_jtable
    alsl.d           t1,     t1,    t5,   3
    ld.d             t6,     t1,    0
    add.d            t5,     t5,    t6
    jirl             $r0,    t5,    0

    .align   3
.l_\lable\()put_h_jtable:
    .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_64w  - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_32w  - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_16w  - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_8w   - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_4w   - .l_\lable\()put_h_jtable
    .dword .l_\lable\()put_h_2w   - .l_\lable\()put_h_jtable

.l_\lable\()put_h_2w:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    add.d            a2,     a2,    t2

    vbsrl.v          vr2,    vr0,   1
    vilvl.d          vr0,    vr2,   vr0
    vdp2.h.bu.b      vr2,    vr0,   vr8
    vhaddw.w.h       vr0,    vr2,   vr2
    vhaddw.d.w       vr0,    vr0,   vr0
    vbsrl.v          vr2,    vr1,   1
    vilvl.d          vr1,    vr2,   vr1
    vdp2.h.bu.b      vr2,    vr1,   vr8
    vhaddw.w.h       vr1,    vr2,   vr2
    vhaddw.d.w       vr1,    vr1,   vr1
    vpickev.w        vr0,    vr1,   vr0
    vpickev.h        vr0,    vr0,   vr0
    vadd.h           vr0,    vr0,   vr9
    vssrani.bu.h     vr0,    vr0,   6

    vstelm.h         vr0,    a0,    0,   0
    add.d            a0,     a0,    a1
    vstelm.h         vr0,    a0,    0,   1
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_h_2w
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_h_4w:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    add.d            a2,     a2,    t2

    vbsrl.v          vr2,    vr0,   1
    vbsrl.v          vr3,    vr0,   2
    vbsrl.v          vr4,    vr0,   3
    vilvl.d          vr0,    vr2,   vr0 //x0 x1
    vilvl.d          vr2,    vr4,   vr3 //x2 x3
    vdp2.h.bu.b      vr3,    vr0,   vr8
    vdp2.h.bu.b      vr4,    vr2,   vr8
    vhaddw.w.h       vr0,    vr3,   vr3
    vhaddw.d.w       vr0,    vr0,   vr0
    vhaddw.w.h       vr2,    vr4,   vr4
    vhaddw.d.w       vr2,    vr2,   vr2
    vpickev.w        vr5,    vr2,   vr0
    vbsrl.v          vr2,    vr1,   1
    vbsrl.v          vr3,    vr1,   2
    vbsrl.v          vr4,    vr1,   3
    vilvl.d          vr0,    vr2,   vr1 //x0 x1
    vilvl.d          vr2,    vr4,   vr3 //x2 x3
    vdp2.h.bu.b      vr3,    vr0,   vr8
    vdp2.h.bu.b      vr4,    vr2,   vr8
    vhaddw.w.h       vr0,    vr3,   vr3
    vhaddw.d.w       vr0,    vr0,   vr0
    vhaddw.w.h       vr2,    vr4,   vr4
    vhaddw.d.w       vr2,    vr2,   vr2
    vpickev.w        vr6,    vr2,   vr0
    vpickev.h        vr0,    vr6,   vr5
    vadd.h           vr0,    vr0,   vr9
    vssrani.bu.h     vr0,    vr0,   6

    vstelm.w         vr0,    a0,    0,    0
    add.d            a0,     a0,    a1
    vstelm.w         vr0,    a0,    0,    1
    add.d            a0,     a0,    a1
    addi.d           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_h_4w
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_h_8w:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    add.d            a2,     a2,    t2
    PUT_H_8W         vr0
    PUT_H_8W         vr1
    vssrani.bu.h     vr1,    vr0,   6
    vstelm.d         vr1,    a0,    0,    0
    add.d            a0,     a0,    a1
    vstelm.d         vr1,    a0,    0,    1
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_h_8w
    b                .l_\lable\()end_put_8tap

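/*
 * Widths >= 16 reuse the 8-wide kernel as vertical strips of 8 columns:
 * t0/t8 remember the strip's src/dst base and t5 the row count; after
 * each strip, src/dst advance by 8 columns until a4 (w) is exhausted.
 */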
.l_\lable\()put_h_16w:
.l_\lable\()put_h_32w:
.l_\lable\()put_h_64w:
.l_\lable\()put_h_128w:
    addi.d           t0,     a2,    0 //src
    addi.w           t5,     a5,    0 //h
    addi.d           t8,     a0,    0 //dst
.l_\lable\()put_h_16w_loop:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    add.d            a2,     a2,    t2
    PUT_H_8W         vr0
    PUT_H_8W         vr1
    vssrani.bu.h     vr1,    vr0,   6
    vstelm.d         vr1,    a0,    0,   0
    add.d            a0,     a0,    a1
    vstelm.d         vr1,    a0,    0,   1
    add.d            a0,     a0,    a1
    addi.d           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_h_16w_loop
    addi.d           a2,     t0,    8
    addi.d           t0,     t0,    8
    addi.d           a0,     t8,    8
    addi.d           t8,     t8,    8
    addi.w           a5,     t5,    0
    addi.w           a4,     a4,    -8
    bnez             a4,     .l_\lable\()put_h_16w_loop
    b                .l_\lable\()end_put_8tap

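/*
 * Vertical-only path. The filter index comes from filter_type >> 2, and
 * the small-block fallback check uses the block height (h <= 4) rather
 * than the width. Seven source rows are pre-loaded and byte-interleaved
 * into per-column histories; each loop iteration slides two new rows in
 * with vextrins.b/vbsrl.v and emits two output rows.
 */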
.l_\lable\()put_v:
    ld.d             t1,     sp,    0  //filter_type
    srli.w           t1,     t1,    2
    blt              t0,     a5,    .l_\lable\()put_v_idx_fv
    andi             t1,     t1,    1
    addi.w           t1,     t1,    3

.l_\lable\()put_v_idx_fv:
    addi.w           t5,     zero,  120
    mul.w            t1,     t1,    t5
    addi.w           t5,     a7,    -1
    slli.w           t5,     t5,    3
    add.w            t1,     t1,    t5
    add.d            t1,     t6,    t1 //fv's offset
    vldrepl.d        vr8,    t1,    0
    sub.d            a2,     a2,    t3

    clz.w            t1,     a4
    li.w             t5,     24
    sub.w            t1,     t1,    t5
    la.local         t5,     .l_\lable\()put_v_jtable
    alsl.d           t1,     t1,    t5,   3
    ld.d             t6,     t1,    0
    add.d            t5,     t5,    t6
    jirl             $r0,    t5,    0

    .align   3
.l_\lable\()put_v_jtable:
    .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_64w  - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_32w  - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_16w  - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_8w   - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_4w   - .l_\lable\()put_v_jtable
    .dword .l_\lable\()put_v_2w   - .l_\lable\()put_v_jtable

.l_\lable\()put_v_2w:
    fld.s            f0,     a2,    0
    fldx.s           f1,     a2,    a3
    fldx.s           f2,     a2,    t2
    add.d            a2,     a2,    t3
    fld.s            f3,     a2,    0
    fldx.s           f4,     a2,    a3
    fldx.s           f5,     a2,    t2
    fldx.s           f6,     a2,    t3
    add.d            a2,     a2,    t4
    vilvl.b          vr0,    vr1,   vr0
    vilvl.b          vr1,    vr3,   vr2
    vilvl.b          vr2,    vr5,   vr4
    vilvl.b          vr3,    vr7,   vr6
    vilvl.h          vr0,    vr1,   vr0
    vilvl.h          vr1,    vr3,   vr2
    vilvl.w          vr0,    vr1,   vr0

.l_\lable\()put_v_2w_loop:
    fld.s            f7,     a2,    0  //h0
    fldx.s           f10,    a2,    a3 //h1
    add.d            a2,     a2,    t2

    vextrins.b       vr0,    vr7,   0x70
    vextrins.b       vr0,    vr7,   0xf1
    vbsrl.v          vr1,    vr0,   1
    vextrins.b       vr1,    vr10,  0x70
    vextrins.b       vr1,    vr10,  0xf1
    vdp2.h.bu.b      vr10,   vr0,   vr8
    vdp2.h.bu.b      vr11,   vr1,   vr8
    vbsrl.v          vr0,    vr1,   1
    vhaddw.d.h       vr10
    vhaddw.d.h       vr11
    vpickev.w        vr10,   vr11,  vr10
    vssrarni.hu.w    vr10,   vr10,  6
    vssrani.bu.h     vr10,   vr10,  0

    vstelm.h         vr10,   a0,    0,   0
    add.d            a0,     a0,    a1
    vstelm.h         vr10,   a0,    0,   1
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_v_2w_loop
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_v_4w:
    fld.s            f0,     a2,    0
    fldx.s           f1,     a2,    a3
    fldx.s           f2,     a2,    t2
    add.d            a2,     a2,    t3
    fld.s            f3,     a2,    0
    fldx.s           f4,     a2,    a3
    fldx.s           f5,     a2,    t2
    fldx.s           f6,     a2,    t3
    add.d            a2,     a2,    t4

    vilvl.b          vr0,    vr1,   vr0
    vilvl.b          vr1,    vr3,   vr2
    vilvl.b          vr2,    vr5,   vr4
    vilvl.b          vr3,    vr7,   vr6
    vilvl.h          vr0,    vr1,   vr0
    vilvl.h          vr1,    vr3,   vr2
    vilvl.w          vr2,    vr1,   vr0
    vilvh.w          vr3,    vr1,   vr0

.l_\lable\()put_v_4w_loop:
    fld.s            f7,     a2,    0
    fldx.s           f10,    a2,    a3
    add.d            a2,     a2,    t2

    vextrins.b       vr2,    vr7,   0x70
    vextrins.b       vr2,    vr7,   0xf1 //x0x1(h0)
    vbsrl.v          vr4,    vr2,   1
    vextrins.b       vr4,    vr10,  0x70
    vextrins.b       vr4,    vr10,  0xf1 //x0x1(h1)
    vdp2.h.bu.b      vr11,   vr2,   vr8
    vdp2.h.bu.b      vr12,   vr4,   vr8
    vbsrl.v          vr2,    vr4,   1

    vextrins.b       vr3,    vr7,   0x72
    vextrins.b       vr3,    vr7,   0xf3 //x2x3(h0)
    vbsrl.v          vr4,    vr3,   1
    vextrins.b       vr4,    vr10,  0x72
    vextrins.b       vr4,    vr10,  0xf3 //x2x3(h1)
    vdp2.h.bu.b      vr13,   vr3,   vr8
    vdp2.h.bu.b      vr14,   vr4,   vr8
    vbsrl.v          vr3,    vr4,   1

    vhaddw.d.h       vr11
    vhaddw.d.h       vr12
    vhaddw.d.h       vr13
    vhaddw.d.h       vr14

    vpickev.w        vr11,   vr13,  vr11
    vpickev.w        vr12,   vr14,  vr12
    vpickev.h        vr11,   vr12,  vr11
    vssrarni.bu.h    vr11,   vr11,  6
    vstelm.w         vr11,   a0,    0,   0
    add.d            a0,     a0,    a1
    vstelm.w         vr11,   a0,    0,   1
    add.d            a0,     a0,    a1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_v_4w_loop
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_v_8w:
.l_\lable\()put_v_16w:
.l_\lable\()put_v_32w:
.l_\lable\()put_v_64w:
.l_\lable\()put_v_128w:
    addi.d           t0,     a2,    0 //src
    addi.d           t5,     a5,    0 //h
    addi.d           t8,     a0,    0 //dst
.l_\lable\()put_v_8w_loop0:
    fld.d            f0,     a2,    0
    fldx.d           f1,     a2,    a3
    fldx.d           f2,     a2,    t2
    add.d            a2,     a2,    t3
    fld.d            f3,     a2,    0
    fldx.d           f4,     a2,    a3
    fldx.d           f5,     a2,    t2
    fldx.d           f6,     a2,    t3
    add.d            a2,     a2,    t4

    vilvl.b          vr0,    vr1,   vr0
    vilvl.b          vr1,    vr3,   vr2
    vilvl.b          vr2,    vr5,   vr4
    vilvl.b          vr3,    vr7,   vr6
    vilvl.h          vr4,    vr1,   vr0
    vilvh.h          vr5,    vr1,   vr0
    vilvl.h          vr6,    vr3,   vr2
    vilvh.h          vr7,    vr3,   vr2
    vilvl.w          vr0,    vr6,   vr4 // x0x1
    vilvh.w          vr1,    vr6,   vr4 // x2x3
    vilvl.w          vr2,    vr7,   vr5 // x4x5
    vilvh.w          vr3,    vr7,   vr5 // x6x7
.l_\lable\()put_v_8w_loop:
    fld.d            f7,     a2,    0
    fldx.d           f10,    a2,    a3
    add.d            a2,     a2,    t2
    //h0
    vextrins.b       vr0,    vr7,   0x70
    vextrins.b       vr0,    vr7,   0xf1
    vextrins.b       vr1,    vr7,   0x72
    vextrins.b       vr1,    vr7,   0xf3
    vextrins.b       vr2,    vr7,   0x74
    vextrins.b       vr2,    vr7,   0xf5
    vextrins.b       vr3,    vr7,   0x76
    vextrins.b       vr3,    vr7,   0xf7
    vdp2.h.bu.b      vr11,   vr0,   vr8
    vdp2.h.bu.b      vr12,   vr1,   vr8
    vdp2.h.bu.b      vr13,   vr2,   vr8
    vdp2.h.bu.b      vr14,   vr3,   vr8
    vhaddw.d.h       vr11
    vhaddw.d.h       vr12
    vhaddw.d.h       vr13
    vhaddw.d.h       vr14
    vpickev.w        vr11,   vr12,  vr11
    vpickev.w        vr12,   vr14,  vr13
    vpickev.h        vr11,   vr12,  vr11
    vssrarni.bu.h    vr11,   vr11,  6
    fst.d            f11,    a0,    0
    add.d            a0,     a0,    a1
    //h1
    vbsrl.v          vr0,    vr0,   1
    vbsrl.v          vr1,    vr1,   1
    vbsrl.v          vr2,    vr2,   1
    vbsrl.v          vr3,    vr3,   1
    vextrins.b       vr0,    vr10,  0x70
    vextrins.b       vr0,    vr10,  0xf1
    vextrins.b       vr1,    vr10,  0x72
    vextrins.b       vr1,    vr10,  0xf3
    vextrins.b       vr2,    vr10,  0x74
    vextrins.b       vr2,    vr10,  0xf5
    vextrins.b       vr3,    vr10,  0x76
    vextrins.b       vr3,    vr10,  0xf7
    vdp2.h.bu.b      vr11,   vr0,   vr8
    vdp2.h.bu.b      vr12,   vr1,   vr8
    vdp2.h.bu.b      vr13,   vr2,   vr8
    vdp2.h.bu.b      vr14,   vr3,   vr8
    vhaddw.d.h       vr11
    vhaddw.d.h       vr12
    vhaddw.d.h       vr13
    vhaddw.d.h       vr14
    vpickev.w        vr11,   vr12,  vr11
    vpickev.w        vr12,   vr14,  vr13
    vpickev.h        vr11,   vr12,  vr11
    vssrarni.bu.h    vr11,   vr11,  6
    fst.d            f11,    a0,    0
    add.d            a0,     a0,    a1
    vbsrl.v          vr0,    vr0,   1
    vbsrl.v          vr1,    vr1,   1
    vbsrl.v          vr2,    vr2,   1
    vbsrl.v          vr3,    vr3,   1
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_v_8w_loop
    addi.d           a2,     t0,    8
    addi.d           t0,     t0,    8
    addi.d           a0,     t8,    8
    addi.d           t8,     t8,    8
    addi.d           a5,     t5,    0
    addi.w           a4,     a4,    -8
    bnez             a4,     .l_\lable\()put_v_8w_loop0
    b                .l_\lable\()end_put_8tap

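/*
 * Combined h+v path: each row is first horizontally filtered and kept
 * at intermediate precision (vsrari.h ..., 2); the vertical 8-tap then
 * runs on those halfword intermediates and the result is rounded,
 * shifted by 10 and saturated back to pixels (vssrarni.hu.w ..., 10 +
 * vssrani.bu.h). The vertical taps are widened to halfwords with
 * vexth.h.b so vdp2.w.h can consume them.
 */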
.l_\lable\()put_hv:
    ld.d             t5,     sp,    0  //filter_type
    andi             t1,     t5,    3
    blt              t0,     a4,    .l_\lable\()put_hv_idx_fh
    andi             t1,     t5,    1
    addi.w           t1,     t1,    3
.l_\lable\()put_hv_idx_fh:
    addi.w           t5,     zero,  120
    mul.w            t1,     t1,    t5
    addi.w           t5,     a6,    -1
    slli.w           t5,     t5,    3
    add.w            t1,     t1,    t5
    add.d            t1,     t6,    t1 //fh's offset
    vldrepl.d        vr8,    t1,    0
    ld.d             t1,     sp,    0  //filter_type
    srli.w           t1,     t1,    2
    blt              t0,     a5,    .l_\lable\()put_hv_idx_fv
    andi             t1,     t1,    1
    addi.w           t1,     t1,    3
.l_\lable\()put_hv_idx_fv:
    addi.w           t5,     zero,  120
    mul.w            t1,     t1,    t5
    addi.w           t5,     a7,    -1
    slli.w           t5,     t5,    3
    add.w            t1,     t1,    t5
    add.d            t1,     t6,    t1 //fv's offset
    vldrepl.d        vr9,    t1,    0
    vexth.h.b        vr9,    vr9

    sub.d            a2,     a2,    t3
    addi.d           a2,     a2,    -3

    clz.w            t1,     a4
    li.w             t5,     24
    sub.w            t1,     t1,    t5
    la.local         t5,     .l_\lable\()put_hv_jtable
    alsl.d           t1,     t1,    t5,   3
    ld.d             t6,     t1,    0
    add.d            t5,     t5,    t6
    jirl             $r0,    t5,    0

    .align   3
.l_\lable\()put_hv_jtable:
    .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_64w  - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_32w  - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_16w  - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_8w   - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_4w   - .l_\lable\()put_hv_jtable
    .dword .l_\lable\()put_hv_2w   - .l_\lable\()put_hv_jtable

.l_\lable\()put_hv_2w:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    vldx             vr2,    a2,    t2
    add.d            a2,     a2,    t3
    vld              vr3,    a2,    0
    vldx             vr4,    a2,    a3
    vldx             vr5,    a2,    t2
    vldx             vr6,    a2,    t3
    add.d            a2,     a2,    t4

    vbsrl.v          vr10,   vr0,   1
    vbsrl.v          vr11,   vr1,   1
    vbsrl.v          vr12,   vr2,   1
    vbsrl.v          vr13,   vr3,   1
    vbsrl.v          vr14,   vr4,   1
    vbsrl.v          vr15,   vr5,   1
    vbsrl.v          vr16,   vr6,   1
    vilvl.d          vr0,    vr10,  vr0
    vilvl.d          vr1,    vr11,  vr1
    vilvl.d          vr2,    vr12,  vr2
    vilvl.d          vr3,    vr13,  vr3
    vilvl.d          vr4,    vr14,  vr4
    vilvl.d          vr5,    vr15,  vr5
    vilvl.d          vr6,    vr16,  vr6
    vdp2.h.bu.b      vr10,   vr0,   vr8
    vdp2.h.bu.b      vr11,   vr1,   vr8
    vdp2.h.bu.b      vr12,   vr2,   vr8
    vdp2.h.bu.b      vr13,   vr3,   vr8
    vdp2.h.bu.b      vr14,   vr4,   vr8
    vdp2.h.bu.b      vr15,   vr5,   vr8
    vdp2.h.bu.b      vr16,   vr6,   vr8
    vhaddw.d.h       vr10
    vhaddw.d.h       vr11
    vhaddw.d.h       vr12
    vhaddw.d.h       vr13
    vhaddw.d.h       vr14
    vhaddw.d.h       vr15
    vhaddw.d.h       vr16

    vpackev.w        vr10,   vr11,  vr10
    vpackev.w        vr12,   vr13,  vr12
    vpackod.d        vr11,   vr12,  vr10
    vpackev.d        vr10,   vr12,  vr10

    vpackev.w        vr12,   vr15,  vr14
    vpackev.w        vr16,   vr17,  vr16
    vpackod.d        vr13,   vr16,  vr12
    vpackev.d        vr12,   vr16,  vr12

    vpickev.h        vr10,   vr12,  vr10 //0 1 2  3  4  5  6  * (h0)
    vpickev.h        vr11,   vr13,  vr11 //8 9 10 11 12 13 14 * (h1)
    vsrari.h         vr10,   vr10,  2
    vsrari.h         vr11,   vr11,  2
.l_\lable\()put_hv_2w_loop:
    vld              vr7,    a2,    0
    vldx             vr12,   a2,    a3
    add.d            a2,     a2,    t2

    vbsrl.v          vr1,    vr7,   1
    vbsrl.v          vr2,    vr12,  1
    vilvl.d          vr0,    vr1,   vr7
    vilvl.d          vr1,    vr2,   vr12
    vdp2.h.bu.b      vr2,    vr0,   vr8
    vdp2.h.bu.b      vr3,    vr1,   vr8
    vhaddw.d.h       vr2
    vhaddw.d.h       vr3
    vpickev.w        vr2,    vr3,   vr2
    vpickev.h        vr2,    vr2,   vr2
    vsrari.h         vr2,    vr2,   2
    vextrins.h       vr10,   vr2,   0x70 //0 1 2 3 4 5 6 7
    vextrins.h       vr11,   vr2,   0x71
    vbsrl.v          vr12,   vr10,  2
    vbsrl.v          vr13,   vr11,  2
    vextrins.h       vr12,   vr2,   0x72 //1 2 3 4 5 6 7 8
    vextrins.h       vr13,   vr2,   0x73
    vdp2.w.h         vr0,    vr10,  vr9
    vdp2.w.h         vr1,    vr11,  vr9
    vdp2.w.h         vr2,    vr12,  vr9
    vdp2.w.h         vr3,    vr13,  vr9
    vhaddw.q.w       vr0
    vhaddw.q.w       vr1
    vhaddw.q.w       vr2
    vhaddw.q.w       vr3
    vpackev.w        vr0,    vr1,   vr0
    vpackev.w        vr1,    vr3,   vr2
    vpackev.d        vr0,    vr1,   vr0
    vssrarni.hu.w    vr0,    vr0,   10
    vssrani.bu.h     vr0,    vr0,   0
    vbsrl.v          vr10,   vr12,  2
    vbsrl.v          vr11,   vr13,  2
    vstelm.h         vr0,    a0,    0,   0
    add.d            a0,     a0,    a1
    vstelm.h         vr0,    a0,    0,   1
    add.d            a0,     a0,    a1
    addi.d           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv_2w_loop
    b                .l_\lable\()end_put_8tap

.l_\lable\()put_hv_4w:
    vld              vr0,    a2,    0
    vldx             vr1,    a2,    a3
    vldx             vr2,    a2,    t2
    add.d            a2,     a2,    t3
    vld              vr3,    a2,    0
    vldx             vr4,    a2,    a3
    vldx             vr5,    a2,    t2
    vldx             vr6,    a2,    t3
    add.d            a2,     a2,    t4
    FILTER_8TAP_4W   vr0 //x0 x1 x2 x3
    FILTER_8TAP_4W   vr1
    FILTER_8TAP_4W   vr2
    FILTER_8TAP_4W   vr3
    FILTER_8TAP_4W   vr4
    FILTER_8TAP_4W   vr5
    FILTER_8TAP_4W   vr6
    vpackev.h        vr0,    vr1,   vr0
    vpackev.h        vr1,    vr3,   vr2
    vpackev.h        vr2,    vr5,   vr4
    vpackev.h        vr3,    vr7,   vr6
    vilvl.w          vr4,    vr1,   vr0
    vilvh.w          vr5,    vr1,   vr0
    vilvl.w          vr6,    vr3,   vr2
    vilvh.w          vr7,    vr3,   vr2
    vilvl.d          vr0,    vr6,   vr4 //0 1 2 3 4 5 6 *
    vilvh.d          vr1,    vr6,   vr4
    vilvl.d          vr2,    vr7,   vr5
    vilvh.d          vr3,    vr7,   vr5
    vsrari.h         vr0,    vr0,   2
    vsrari.h         vr1,    vr1,   2
    vsrari.h         vr2,    vr2,   2
    vsrari.h         vr3,    vr3,   2
.l_\lable\()put_hv_4w_loop:
    vld              vr4,    a2,    0
    vldx             vr5,    a2,    a3
    add.d            a2,     a2,    t2
    FILTER_8TAP_4W   vr4
    FILTER_8TAP_4W   vr5
    vpickev.h        vr4,    vr5,   vr4
    vsrari.h         vr4,    vr4,   2
    vextrins.h       vr0,    vr4,   0x70
    vextrins.h       vr1,    vr4,   0x71
    vextrins.h       vr2,    vr4,   0x72
    vextrins.h       vr3,    vr4,   0x73
    vbsrl.v          vr5,    vr0,   2
    vbsrl.v          vr6,    vr1,   2
    vbsrl.v          vr7,    vr2,   2
    vbsrl.v          vr10,   vr3,   2
    vextrins.h       vr5,    vr4,   0x74
    vextrins.h       vr6,    vr4,   0x75
    vextrins.h       vr7,    vr4,   0x76
    vextrins.h       vr10,   vr4,   0x77
    vdp2.w.h         vr11,   vr0,   vr9
    vdp2.w.h         vr12,   vr1,   vr9
    vdp2.w.h         vr13,   vr2,   vr9
    vdp2.w.h         vr14,   vr3,   vr9
    vhaddw.q.w       vr11
    vhaddw.q.w       vr12
    vhaddw.q.w       vr13
    vhaddw.q.w       vr14
    vpackev.w        vr0,    vr12,  vr11
    vpackev.w        vr1,    vr14,  vr13
    vpackev.d        vr0,    vr1,   vr0
    vdp2.w.h         vr11,   vr5,   vr9
    vdp2.w.h         vr12,   vr6,   vr9
    vdp2.w.h         vr13,   vr7,   vr9
    vdp2.w.h         vr14,   vr10,  vr9
    vhaddw.q.w       vr11
    vhaddw.q.w       vr12
    vhaddw.q.w       vr13
    vhaddw.q.w       vr14
    vpackev.w        vr1,    vr12,  vr11
    vpackev.w        vr2,    vr14,  vr13
    vpackev.d        vr1,    vr2,   vr1
    vssrarni.hu.w    vr1,    vr0,   10
    vssrani.bu.h     vr1,    vr1,   0
    vstelm.w         vr1,    a0,    0,    0
    add.d            a0,     a0,    a1
    vstelm.w         vr1,    a0,    0,    1
    add.d            a0,     a0,    a1
    vbsrl.v          vr0,    vr5,   2
    vbsrl.v          vr1,    vr6,   2
    vbsrl.v          vr2,    vr7,   2
    vbsrl.v          vr3,    vr10,  2
    addi.w           a5,     a5,    -2
    bnez             a5,     .l_\lable\()put_hv_4w_loop
    b                .l_\lable\()end_put_8tap

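/*
 * 8-column hv strips: seven rows are horizontally filtered with
 * FILTER_8TAP_8W, transposed so vr0-vr7 hold per-column histories, and
 * each iteration then filters two more rows and feeds them through the
 * VEXTRINS_Hx8 / FILTER_8TAP_8W_CLIP_STORE / VBSRL_Vx8 sliding window
 * defined above.
 */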
.l_\lable\()put_hv_8w:
.l_\lable\()put_hv_16w:
.l_\lable\()put_hv_32w:
.l_\lable\()put_hv_64w:
.l_\lable\()put_hv_128w:
    addi.d          t0,      a2,    0 //src
    addi.d          t5,      a5,    0 //h
    addi.d          t8,      a0,    0 //dst
.l_\lable\()put_hv_8w_loop0:
    vld             vr0,     a2,    0
    vldx            vr1,     a2,    a3
    vldx            vr2,     a2,    t2
    add.d           a2,      a2,    t3
    vld             vr3,     a2,    0
    vldx            vr4,     a2,    a3
    vldx            vr5,     a2,    t2
    vldx            vr6,     a2,    t3
    add.d           a2,      a2,    t4
    FILTER_8TAP_8W  vr0
    FILTER_8TAP_8W  vr1
    FILTER_8TAP_8W  vr2
    FILTER_8TAP_8W  vr3
    FILTER_8TAP_8W  vr4
    FILTER_8TAP_8W  vr5
    FILTER_8TAP_8W  vr6
    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7,\
                       vr10,vr11,vr12,vr13,vr14,vr15,vr16,vr17
.l_\lable\()put_hv_8w_loop:
    vld             vr20,    a2,    0
    vldx            vr21,    a2,    a3
    add.d           a2,      a2,    t2
    FILTER_8TAP_8W  vr20
    FILTER_8TAP_8W  vr21
    VEXTRINS_Hx8    vr20
    FILTER_8TAP_8W_CLIP_STORE
    VBSRL_Vx8
    VEXTRINS_Hx8    vr21
    FILTER_8TAP_8W_CLIP_STORE
    VBSRL_Vx8
    addi.w          a5,      a5,    -2
    bnez            a5,      .l_\lable\()put_hv_8w_loop
    addi.d          a2,      t0,    8
    addi.d          t0,      t0,    8
    addi.d          a0,      t8,    8
    addi.d          t8,      t8,    8
    addi.d          a5,      t5,    0
    addi.w          a4,      a4,    -8
    bnez            a4,      .l_\lable\()put_hv_8w_loop0
.l_\lable\()end_put_8tap:
.endm

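/*
 * Wrappers for each horizontal/vertical filter pairing. The value
 * stored at sp[0] encodes the 2D filter type as
 * (vertical << 2) | horizontal, with 0 = regular, 1 = smooth,
 * 2 = sharp; the macro reads the horizontal type from bits [1:0] and
 * the vertical type from bits [3:2], so each function name orders the
 * pairing as <horizontal>_<vertical>.
 */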
function put_8tap_regular_8bpc_lsx
    addi.d   sp, sp,  -16
    st.d   zero, sp,  0
    PUT_8TAP_8BPC_LSX 0
    addi.d   sp, sp,  16
endfunc

function put_8tap_smooth_regular_8bpc_lsx
    addi.d   sp, sp,  -16
    li.w     t0, 1
    st.d     t0, sp,  0
    PUT_8TAP_8BPC_LSX 1
    addi.d   sp, sp,  16
endfunc

function put_8tap_sharp_regular_8bpc_lsx
    addi.d   sp, sp,  -16
    li.w     t0, 2
    st.d     t0, sp,  0
    PUT_8TAP_8BPC_LSX 2
    addi.d   sp, sp,  16
endfunc

function put_8tap_regular_smooth_8bpc_lsx
    addi.d   sp, sp,  -16
    li.w     t0, 4
    st.d     t0, sp,  0
    PUT_8TAP_8BPC_LSX 4
    addi.d   sp, sp,  16
endfunc

function put_8tap_smooth_8bpc_lsx
    addi.d   sp, sp,  -16
    li.w     t0, 5
    st.d     t0, sp,  0
    PUT_8TAP_8BPC_LSX 5
    addi.d   sp, sp,  16
endfunc

function put_8tap_sharp_smooth_8bpc_lsx
    addi.d   sp, sp,  -16
    li.w     t0, 6
    st.d     t0, sp,  0
    PUT_8TAP_8BPC_LSX 6
    addi.d   sp, sp,  16
endfunc

function put_8tap_regular_sharp_8bpc_lsx
    addi.d   sp, sp,  -16
    li.w     t0, 8
    st.d     t0, sp,  0
    PUT_8TAP_8BPC_LSX 8
    addi.d   sp, sp,  16
endfunc

function put_8tap_smooth_sharp_8bpc_lsx
    addi.d   sp, sp,  -16
    li.w     t0, 9
    st.d     t0, sp,  0
    PUT_8TAP_8BPC_LSX 9
    addi.d   sp, sp,  16
endfunc

function put_8tap_sharp_8bpc_lsx
    addi.d   sp, sp,  -16
    li.w     t0, 10
    st.d     t0, sp,  0
    PUT_8TAP_8BPC_LSX 10
    addi.d   sp, sp,  16
endfunc

const shufb1
.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8,0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
endconst

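/*
 * shufb1 holds, per 128-bit lane, the byte indices 0..7 followed by
 * 1..8. SHUFB first builds {bytes 0..15, bytes 2..17} of \in0 across
 * the two lanes (xvbsrl.v + xvpermi.q), then applies shufb1 per lane,
 * yielding four overlapping 8-byte windows at source offsets 0..3 -
 * one window per 8-tap dot product / output pixel.
 */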
.macro SHUFB in0, in1, tmp, out
    xvbsrl.v  \tmp, \in0, 2
    xvpermi.q \tmp, \in0, 0x20
    xvshuf.b  \out, \tmp, \tmp, \in1
.endm

.macro HADDWDH in0
    xvhaddw.w.h \in0, \in0, \in0
    xvhaddw.d.w \in0, \in0, \in0
.endm

.macro HADDWQW in0
    xvhaddw.d.w \in0, \in0, \in0
    xvhaddw.q.d \in0, \in0, \in0
.endm

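/*
 * PREP_W16_H: horizontal 8-tap over 16 output pixels of one row; the
 * 32-bit sums are repacked to halfwords and rounded to the intermediate
 * precision with xvsrari.h ..., 2 (prep keeps 16-bit intermediates,
 * there is no final narrowing to pixels).
 */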
.macro PREP_W16_H in0
    xvbsrl.v         xr4,    \in0,    4
    xvbsrl.v         xr5,    \in0,    8
    xvpermi.q        xr9,    \in0,    0x31
    xvpackev.d       xr5,     xr9,    xr5
    xvbsrl.v         xr6,     xr5,    4
    SHUFB           \in0,     xr23,   xr9,   \in0
    SHUFB            xr4,     xr23,   xr9,    xr4
    SHUFB            xr5,     xr23,   xr9,    xr5
    SHUFB            xr6,     xr23,   xr9,    xr6
    xvdp2.h.bu.b     xr10,   \in0,    xr22
    xvdp2.h.bu.b     xr11,    xr4,    xr22
    xvdp2.h.bu.b     xr12,    xr5,    xr22
    xvdp2.h.bu.b     xr13,    xr6,    xr22
    HADDWDH          xr10
    HADDWDH          xr11
    HADDWDH          xr12
    HADDWDH          xr13
    xvpickev.w       xr10,    xr11,   xr10
    xvpickev.w       xr11,    xr13,   xr12
    xvpermi.d        xr10,    xr10,   0xd8
    xvpermi.d        xr11,    xr11,   0xd8
    xvpickev.h       xr10,    xr11,   xr10
    xvpermi.d        xr10,    xr10,   0xd8
    xvsrari.h       \in0,     xr10,   2
.endm

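/*
 * PREP_8TAP_8BPC_LASX: 8bpc 8-tap prep. Argument registers as consumed
 * below (standard LoongArch calling convention):
 *   a0 = tmp (int16_t intermediates), a1 = src, a2 = src_stride,
 *   a3 = w, a4 = h, a5 = mx, a6 = my, a7 = filter_type
 * (filter_type arrives in a register here, unlike the put macro above,
 * which reads it from sp[0]).
 */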
.macro PREP_8TAP_8BPC_LASX lable
    li.w             t0,     4
    la.local         t6,     dav1d_mc_subpel_filters
    la.local         t7,     shufb1
    xvld             xr23,   t7,    0
    slli.d           t2,     a2,    1  //src_stride*2
    add.d            t3,     t2,    a2 //src_stride*3
    slli.d           t4,     t2,    1  //src_stride*4

    bnez             a5,     .l_\lable\()h //mx
    bnez             a6,     .l_\lable\()v //my

    clz.w            t1,     a3
    li.w             t5,     24
    sub.w            t1,     t1,    t5
    la.local         t5,     .l_\lable\()prep_hv0_jtable
    alsl.d           t1,     t1,    t5,   1
    ld.h             t8,     t1,    0
    add.d            t5,     t5,    t8
    jirl             $r0,    t5,    0

    .align   3
.l_\lable\()prep_hv0_jtable:
    .hword .l_\lable\()hv0_128w - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_64w  - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_32w  - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_16w  - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_8w   - .l_\lable\()prep_hv0_jtable
    .hword .l_\lable\()hv0_4w   - .l_\lable\()prep_hv0_jtable

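/*
 * hv0 prep (no filtering): tmp[x] = src[x] << 4, widening bytes to
 * halfwords (xvsllwil.hu.bu / vext2xv.hu.bu + shift), four rows per
 * iteration.
 */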
.l_\lable\()hv0_4w:
    fld.s            f0,     a1,    0
    fldx.s           f1,     a1,    a2
    fldx.s           f2,     a1,    t2
    fldx.s           f3,     a1,    t3
    add.d            a1,     a1,    t4
    xvpackev.w       xr0,    xr1,   xr0
    xvpackev.w       xr1,    xr3,   xr2
    xvpermi.q        xr0,    xr1,   0x02
    xvsllwil.hu.bu   xr0,    xr0,   4
    xvst             xr0,    a0,    0
    addi.d           a0,     a0,    32
    addi.d           a4,     a4,    -4
    bnez             a4,     .l_\lable\()hv0_4w
    b                .l_\lable\()end_pre_8tap
.l_\lable\()hv0_8w:
    fld.d            f0,     a1,    0
    fldx.d           f1,     a1,    a2
    fldx.d           f2,     a1,    t2
    fldx.d           f3,     a1,    t3
    add.d            a1,     a1,    t4
    xvpermi.q        xr0,    xr1,   0x02
    xvpermi.q        xr2,    xr3,   0x02
    xvsllwil.hu.bu   xr0,    xr0,   4
    xvsllwil.hu.bu   xr2,    xr2,   4
    xvst             xr0,    a0,    0
    xvst             xr2,    a0,    32
    addi.d           a0,     a0,    64
    addi.d           a4,     a4,    -4
    bnez             a4,     .l_\lable\()hv0_8w
    b                .l_\lable\()end_pre_8tap
.l_\lable\()hv0_16w:
    vld              vr0,    a1,    0
    vldx             vr1,    a1,    a2
    vldx             vr2,    a1,    t2
    vldx             vr3,    a1,    t3
    add.d            a1,     a1,    t4
    vext2xv.hu.bu    xr0,    xr0
    vext2xv.hu.bu    xr1,    xr1
    vext2xv.hu.bu    xr2,    xr2
    vext2xv.hu.bu    xr3,    xr3
    xvslli.h         xr0,    xr0,   4
    xvslli.h         xr1,    xr1,   4
    xvslli.h         xr2,    xr2,   4
    xvslli.h         xr3,    xr3,   4
    xvst             xr0,    a0,    0
    xvst             xr1,    a0,    32
    xvst             xr2,    a0,    64
    xvst             xr3,    a0,    96
    addi.d           a0,     a0,    128
    addi.d           a4,     a4,    -4
    bnez             a4,     .l_\lable\()hv0_16w
    b                .l_\lable\()end_pre_8tap
.l_\lable\()hv0_32w:
    xvld             xr0,    a1,    0
    xvldx            xr1,    a1,    a2
    xvldx            xr2,    a1,    t2
    xvldx            xr3,    a1,    t3
    add.d            a1,     a1,    t4
    xvpermi.d        xr4,    xr0,   0xD8
    xvpermi.d        xr5,    xr1,   0xD8
    xvpermi.d        xr6,    xr2,   0xD8
    xvpermi.d        xr7,    xr3,   0xD8
    xvpermi.d        xr10,   xr0,   0x32
    xvpermi.d        xr11,   xr1,   0x32
    xvpermi.d        xr12,   xr2,   0x32
    xvpermi.d        xr13,   xr3,   0x32
    xvsllwil.hu.bu   xr0,    xr4,   4
    xvsllwil.hu.bu   xr1,    xr5,   4
    xvsllwil.hu.bu   xr2,    xr6,   4
    xvsllwil.hu.bu   xr3,    xr7,   4
    xvsllwil.hu.bu   xr4,    xr10,  4
    xvsllwil.hu.bu   xr5,    xr11,  4
    xvsllwil.hu.bu   xr6,    xr12,  4
    xvsllwil.hu.bu   xr7,    xr13,  4
    xvst             xr0,    a0,    0
    xvst             xr4,    a0,    32
    xvst             xr1,    a0,    64
    xvst             xr5,    a0,    96
    xvst             xr2,    a0,    128
    xvst             xr6,    a0,    160
    xvst             xr3,    a0,    192
    xvst             xr7,    a0,    224
    addi.d           a0,     a0,    256
    addi.d           a4,     a4,    -4
    bnez             a4,     .l_\lable\()hv0_32w
    b                .l_\lable\()end_pre_8tap
.l_\lable\()hv0_64w:
.l_\lable\()hv0_128w:
    addi.d           t0,     a1,    0
    addi.d           t5,     a4,    0
    srli.w           t7,     a3,    5
    slli.w           t7,     t7,    6
    addi.d           t8,     a0,    0
.l_\lable\()hv0_32_loop:
    xvld             xr0,    a1,    0
    xvldx            xr1,    a1,    a2
    xvldx            xr2,    a1,    t2
    xvldx            xr3,    a1,    t3
    add.d            a1,     a1,    t4
    xvpermi.d        xr4,    xr0,   0xD8
    xvpermi.d        xr5,    xr1,   0xD8
    xvpermi.d        xr6,    xr2,   0xD8
    xvpermi.d        xr7,    xr3,   0xD8
    xvpermi.d        xr10,   xr0,   0x32
    xvpermi.d        xr11,   xr1,   0x32
    xvpermi.d        xr12,   xr2,   0x32
    xvpermi.d        xr13,   xr3,   0x32
    xvsllwil.hu.bu   xr0,    xr4,   4
    xvsllwil.hu.bu   xr1,    xr5,   4
    xvsllwil.hu.bu   xr2,    xr6,   4
    xvsllwil.hu.bu   xr3,    xr7,   4
    xvsllwil.hu.bu   xr4,    xr10,  4
    xvsllwil.hu.bu   xr5,    xr11,  4
    xvsllwil.hu.bu   xr6,    xr12,  4
    xvsllwil.hu.bu   xr7,    xr13,  4
    xvst             xr0,    a0,    0
    xvst             xr4,    a0,    32
    add.d            t1,     a0,    t7
    xvst             xr1,    t1,    0
    xvst             xr5,    t1,    32
    add.d            t1,     t1,    t7
    xvst             xr2,    t1,    0
    xvst             xr6,    t1,    32
    add.d            t1,     t1,    t7
    xvst             xr3,    t1,    0
    xvst             xr7,    t1,    32
    add.d            a0,     t1,    t7
    addi.d           a4,     a4,    -4
    bnez             a4,     .l_\lable\()hv0_32_loop
    addi.d           a1,     t0,    32
    addi.d           t0,     t0,    32
    addi.d           a0,     t8,    64
    addi.d           t8,     t8,    64
    addi.d           a4,     t5,    0
    addi.d           a3,     a3,    -32
    bnez             a3,     .l_\lable\()hv0_32_loop
    b                .l_\lable\()end_pre_8tap

.l_\lable\()h:
    bnez             a6,     .l_\lable\()hv // if (fh && fv)

    andi             t1,    a7,    3
    blt              t0,    a3,    .l_\lable\()h_idx_fh
    andi             t1,    a7,    1
    addi.w           t1,    t1,    3
.l_\lable\()h_idx_fh:
    addi.w           t5,    zero,  120
    mul.w            t1,    t1,    t5
    addi.w           t5,    a5,    -1
    slli.w           t5,    t5,    3
    add.w            t1,    t1,    t5
    add.d            t1,    t6,    t1 //fh's offset
    xvldrepl.d       xr22,  t1,    0

    addi.d           a1,     a1,    -3
    clz.w            t1,     a3
    li.w             t5,     24
    sub.w            t1,     t1,    t5
    la.local         t5,     .l_\lable\()prep_h_jtable
    alsl.d           t1,     t1,    t5,   1
    ld.h             t8,     t1,    0
    add.d            t5,     t5,    t8
    jirl             $r0,    t5,    0

    .align   3
.l_\lable\()prep_h_jtable:
    .hword .l_\lable\()h_128w - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_64w  - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_32w  - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_16w  - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_8w   - .l_\lable\()prep_h_jtable
    .hword .l_\lable\()h_4w   - .l_\lable\()prep_h_jtable

.l_\lable\()h_4w:
    xvld             xr0,    a1,    0
    xvldx            xr1,    a1,    a2
    xvldx            xr2,    a1,    t2
    xvldx            xr3,    a1,    t3
    add.d            a1,     a1,    t4

    SHUFB            xr0,    xr23,  xr9,   xr0
    SHUFB            xr1,    xr23,  xr9,   xr1
    SHUFB            xr2,    xr23,  xr9,   xr2
    SHUFB            xr3,    xr23,  xr9,   xr3

    xvdp2.h.bu.b     xr10,   xr0,   xr22
    xvdp2.h.bu.b     xr12,   xr1,   xr22
    xvdp2.h.bu.b     xr14,   xr2,   xr22
    xvdp2.h.bu.b     xr16,   xr3,   xr22

    HADDWDH          xr10    //h0 mid0 mid1 mid2 mid3
    HADDWDH          xr12    //h1 mid4 mid5 mid6 mid7
    HADDWDH          xr14    //h2
    HADDWDH          xr16    //h3

    xvpickev.w       xr10,   xr12,    xr10
    xvpickev.w       xr14,   xr16,    xr14
    xvpermi.d        xr10,   xr10,    0xd8
    xvpermi.d        xr14,   xr14,    0xd8
    xvpickev.h       xr10,   xr14,    xr10
    xvpermi.d        xr10,   xr10,    0xd8
    xvsrari.h        xr10,   xr10,    2

    xvst             xr10,   a0,      0
    addi.d           a0,     a0,      32
    addi.w           a4,     a4,      -4
    bnez             a4,     .l_\lable\()h_4w
    b                .l_\lable\()end_pre_8tap

.l_\lable\()h_8w:
    xvld             xr0,    a1,      0
    xvldx            xr2,    a1,      a2
    xvldx            xr4,    a1,      t2
    xvldx            xr6,    a1,      t3
    add.d            a1,     a1,      t4

    xvbsrl.v         xr1,    xr0,     4
    xvbsrl.v         xr3,    xr2,     4
    xvbsrl.v         xr5,    xr4,     4
    xvbsrl.v         xr7,    xr6,     4

    SHUFB            xr0,    xr23,    xr9,    xr10
    SHUFB            xr1,    xr23,    xr9,    xr11
    SHUFB            xr2,    xr23,    xr9,    xr12
    SHUFB            xr3,    xr23,    xr9,    xr13
    SHUFB            xr4,    xr23,    xr9,    xr14
    SHUFB            xr5,    xr23,    xr9,    xr15
    SHUFB            xr6,    xr23,    xr9,    xr16
    SHUFB            xr7,    xr23,    xr9,    xr17

    xvdp2.h.bu.b     xr0,    xr10,    xr22
    xvdp2.h.bu.b     xr1,    xr11,    xr22
    xvdp2.h.bu.b     xr2,    xr12,    xr22
    xvdp2.h.bu.b     xr3,    xr13,    xr22
    xvdp2.h.bu.b     xr4,    xr14,    xr22
    xvdp2.h.bu.b     xr5,    xr15,    xr22
    xvdp2.h.bu.b     xr6,    xr16,    xr22
    xvdp2.h.bu.b     xr7,    xr17,    xr22

    HADDWDH          xr0
    HADDWDH          xr1
    HADDWDH          xr2
    HADDWDH          xr3
    HADDWDH          xr4
    HADDWDH          xr5
    HADDWDH          xr6
    HADDWDH          xr7

    xvpickev.w       xr0,    xr1,    xr0
    xvpickev.w       xr2,    xr3,    xr2
    xvpermi.d        xr0,    xr0,    0xd8
    xvpermi.d        xr2,    xr2,    0xd8
    xvpickev.h       xr0,    xr2,    xr0
    xvpermi.d        xr0,    xr0,    0xd8
    xvsrari.h        xr0,    xr0,    2

    xvpickev.w       xr4,    xr5,    xr4
    xvpickev.w       xr6,    xr7,    xr6
    xvpermi.d        xr4,    xr4,    0xd8
    xvpermi.d        xr6,    xr6,    0xd8
    xvpickev.h       xr4,    xr6,    xr4
    xvpermi.d        xr4,    xr4,    0xd8
    xvsrari.h        xr4,    xr4,    2

    xvst             xr0,    a0,     0
    xvst             xr4,    a0,     32
    addi.d           a0,     a0,     64
    addi.d           a4,     a4,     -4
    bnez             a4,     .l_\lable\()h_8w
    b                .l_\lable\()end_pre_8tap

.l_\lable\()h_16w:
    xvld             xr0,    a1,     0
    xvldx            xr1,    a1,     a2
    xvldx            xr2,    a1,     t2
    xvldx            xr3,    a1,     t3
    add.d            a1,     a1,     t4

    PREP_W16_H       xr0
    PREP_W16_H       xr1
    PREP_W16_H       xr2
    PREP_W16_H       xr3

    xvst             xr0,    a0,     0
    xvst             xr1,    a0,     32
    xvst             xr2,    a0,     64
    xvst             xr3,    a0,     96

    addi.d           a0,     a0,     128
    addi.w           a4,     a4,     -4
    bnez             a4,     .l_\lable\()h_16w
    b                .l_\lable\()end_pre_8tap

.l_\lable\()h_32w:
.l_\lable\()h_64w:
.l_\lable\()h_128w:
    addi.d           t0,     a1,     0 //src
    addi.d           t5,     a4,     0 //h
    srli.w           t7,     a3,     4 //w
    slli.w           t7,     t7,     5 //store offset
    addi.d           t8,     a0,     0 //dst
.l_\lable\()h_16_loop:
    xvld             xr0,    a1,     0
    xvldx            xr1,    a1,     a2
    xvldx            xr2,    a1,     t2
    xvldx            xr3,    a1,     t3
    add.d            a1,     a1,     t4

    PREP_W16_H       xr0
    PREP_W16_H       xr1
    PREP_W16_H       xr2
    PREP_W16_H       xr3

    xvst             xr0,    a0,     0
    xvstx            xr1,    a0,     t7
    slli.w           t1,     t7,     1
    xvstx            xr2,    a0,     t1
    add.w            t1,     t1,     t7
    xvstx            xr3,    a0,     t1
    slli.w           t1,     t7,     2
    add.d            a0,     a0,     t1
    addi.d           a4,     a4,     -4
    bnez             a4,     .l_\lable\()h_16_loop

    addi.d           a1,     t0,     16
    addi.d           t0,     t0,     16
    addi.d           a0,     t8,     32
    addi.d           t8,     t8,     32
    addi.d           a4,     t5,     0
    addi.d           a3,     a3,     -16
    bnez             a3,     .l_\lable\()h_16_loop
    b                .l_\lable\()end_pre_8tap
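
/*
 * Combined h+v prep: the same two-stage scheme as the put path, but the
 * output stays 16-bit: the vertical 32-bit sums are rounded with
 * xvsrari.w ..., 6 and packed to halfwords instead of being narrowed to
 * pixels. The vertical taps are sign-extended to halfwords via
 * xvsllwil.h.b with a shift of 0.
 */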
.l_\lable\()hv:
    andi             t1,    a7,    3
    blt              t0,    a3,    .l_\lable\()hv_idx_fh
    andi             t1,    a7,    1
    addi.w           t1,    t1,    3
.l_\lable\()hv_idx_fh:
    addi.w           t5,    zero,  120
    mul.w            t1,    t1,    t5
    addi.w           t5,    a5,    -1
    slli.w           t5,    t5,    3
    add.w            t1,    t1,    t5
    add.d            t1,    t6,    t1 //fh's offset
    xvldrepl.d       xr22,  t1,    0
    srli.w           a7,    a7,    2
    blt              t0,    a4,    .l_\lable\()hv_idx_fv
    andi             a7,    a7,    1
    addi.w           a7,    a7,    3
.l_\lable\()hv_idx_fv:
    addi.w           t5,     zero,  120
    mul.w            a7,     a7,    t5
    addi.w           t5,     a6,    -1
    slli.w           t5,     t5,    3
    add.w            a7,     a7,    t5
    add.d            a7,     t6,    a7 //fv's offset
    xvldrepl.d       xr8,    a7,    0
    xvsllwil.h.b     xr8,    xr8,   0

    sub.d            a1,     a1,     t3
    addi.d           a1,     a1,     -3
    beq              a3,     t0,     .l_\lable\()hv_4w
    b                .l_\lable\()hv_8w
.l_\lable\()hv_4w:
    xvld             xr0,    a1,     0
    xvldx            xr1,    a1,     a2
    xvldx            xr2,    a1,     t2
    xvldx            xr3,    a1,     t3
    add.d            a1,     a1,     t4
    xvld             xr4,    a1,     0
    xvldx            xr5,    a1,     a2
    xvldx            xr6,    a1,     t2

    SHUFB            xr0,    xr23,   xr9,   xr0
    SHUFB            xr1,    xr23,   xr9,   xr1
    SHUFB            xr2,    xr23,   xr9,   xr2
    SHUFB            xr3,    xr23,   xr9,   xr3

    SHUFB            xr4,    xr23,   xr9,   xr4
    SHUFB            xr5,    xr23,   xr9,   xr5
    SHUFB            xr6,    xr23,   xr9,   xr6

    xvdp2.h.bu.b     xr10,   xr0,    xr22
    xvdp2.h.bu.b     xr11,   xr1,    xr22
    xvdp2.h.bu.b     xr12,   xr2,    xr22
    xvdp2.h.bu.b     xr13,   xr3,    xr22

    xvdp2.h.bu.b     xr14,   xr4,    xr22
    xvdp2.h.bu.b     xr15,   xr5,    xr22
    xvdp2.h.bu.b     xr16,   xr6,    xr22

    HADDWDH          xr10    //h0 mid0 mid1 mid2 mid3
    HADDWDH          xr11    //h1 mid4 mid5 mid6 mid7
    HADDWDH          xr12    //h2
    HADDWDH          xr13    //h3

    xvpackev.w       xr10,   xr11,   xr10
    xvpackev.w       xr12,   xr13,   xr12
    xvpackev.d       xr11,   xr12,   xr10
    xvpackod.d       xr10,   xr12,   xr10
    xvpickev.h       xr11,   xr10,   xr11
    xvsrari.h        xr11,   xr11,   2

    HADDWDH          xr14    //h4
    HADDWDH          xr15    //h5
    HADDWDH          xr16    //h6

    xvpackev.w       xr14,   xr15,   xr14
    xvpackev.w       xr16,   xr17,   xr16
    xvpackev.d       xr17,   xr16,   xr14
    xvpackod.d       xr14,   xr16,   xr14
    xvpickev.h       xr13,   xr14,   xr17
    xvsrari.h        xr13,   xr13,   2

    xvpackev.d       xr18,   xr13,   xr11 //0 4 8 12 16 20 24 *  2 6 10 14 18 22 26 *
    xvpackod.d       xr19,   xr13,   xr11 //1 5 9 13 17 21 25 *  3 7 11 15 19 23 27 *
.l_\lable\()hv_w4_loop:
    xvldx            xr0,    a1,     t3
    add.d            a1,     a1,     t4
    xvld             xr1,    a1,     0
    xvldx            xr2,    a1,     a2
    xvldx            xr3,    a1,     t2

    SHUFB            xr0,    xr23,   xr9,   xr0
    SHUFB            xr1,    xr23,   xr9,   xr1
    SHUFB            xr2,    xr23,   xr9,   xr2
    SHUFB            xr3,    xr23,   xr9,   xr3

    xvdp2.h.bu.b     xr10,   xr0,    xr22
    xvdp2.h.bu.b     xr12,   xr1,    xr22
    xvdp2.h.bu.b     xr14,   xr2,    xr22
    xvdp2.h.bu.b     xr16,   xr3,    xr22

    HADDWDH          xr10    //h0 mid0 mid1 mid2 mid3
    HADDWDH          xr12    //h1 mid4 mid5 mid6 mid7
    HADDWDH          xr14    //h2
    HADDWDH          xr16    //h3

    xvpackev.w       xr10,   xr12,    xr10
    xvpackev.w       xr14,   xr16,    xr14
    xvpackev.d       xr12,   xr14,    xr10
    xvpackod.d       xr10,   xr14,    xr10
    xvpickev.h       xr12,   xr10,    xr12
    xvsrari.h        xr12,   xr12,    2

    xvextrins.h      xr18,   xr12,    0x70 //0 4 8 12 16 20 24  0(x0)   2 6 10 14 18 22 26  2(x2)
    xvextrins.h      xr19,   xr12,    0x74 //1 5 9 13 17 21 25  0(x1)   3 7 11 15 19 23 27  2(x3)

    xvdp2.w.h        xr0,    xr18,    xr8
    xvdp2.w.h        xr2,    xr19,    xr8
    HADDWQW          xr0
    HADDWQW          xr2
    xvpackev.w       xr0,    xr2,     xr0

    xvbsrl.v         xr18,   xr18,    2
    xvbsrl.v         xr19,   xr19,    2
    xvextrins.h      xr18,   xr12,    0x71
    xvextrins.h      xr19,   xr12,    0x75
    xvdp2.w.h        xr2,    xr18,    xr8
    xvdp2.w.h        xr4,    xr19,    xr8
    HADDWQW          xr2
    HADDWQW          xr4
    xvpackev.w       xr2,    xr4,     xr2

    xvbsrl.v         xr18,   xr18,    2
    xvbsrl.v         xr19,   xr19,    2
    xvextrins.h      xr18,   xr12,    0x72
    xvextrins.h      xr19,   xr12,    0x76
    xvdp2.w.h        xr4,    xr18,    xr8
    xvdp2.w.h        xr9,    xr19,    xr8
    HADDWQW          xr4
    HADDWQW          xr9
    xvpackev.w       xr4,    xr9,     xr4

    xvbsrl.v         xr18,   xr18,    2
    xvbsrl.v         xr19,   xr19,    2
    xvextrins.h      xr18,   xr12,    0x73
    xvextrins.h      xr19,   xr12,    0x77
    xvdp2.w.h        xr9,    xr18,    xr8
    xvdp2.w.h        xr11,   xr19,    xr8
    HADDWQW          xr9
    HADDWQW          xr11
    xvpackev.w       xr9,    xr11,    xr9

    xvpackev.d       xr0,    xr2,     xr0
    xvpackev.d       xr4,    xr9,     xr4
    xvsrari.w        xr0,    xr0,     6
    xvsrari.w        xr4,    xr4,     6
    xvpermi.d        xr0,    xr0,     0xd8
    xvpermi.d        xr4,    xr4,     0xd8
    xvpickev.h       xr0,    xr4,     xr0
    xvpermi.d        xr0,    xr0,     0xd8
    xvst             xr0,    a0,      0
    addi.d           a0,     a0,      32

    xvbsrl.v         xr18,   xr18,    2
    xvbsrl.v         xr19,   xr19,    2

    addi.d           a4,     a4,      -4
    bnez             a4,     .l_\lable\()hv_w4_loop
    b                .l_\lable\()end_pre_8tap

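// hv path for w >= 8: the block is processed in columns of 8 pixels.
// t7 = (w / 8) * 16 is the row stride of the int16_t tmp buffer in bytes
// (i.e. w * 2); t0, t5 and t8 back up the src pointer, the row count and the
// dst pointer for the next column strip.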
.l_\lable\()hv_8w:
    addi.d           t0,     a1,      0
    addi.d           t5,     a4,      0
    srli.w           t7,     a3,      3
    slli.w           t7,     t7,      4 // store offset
    addi.d           t8,     a0,      0
.l_\lable\()hv_8w_loop0:
    xvld             xr0,    a1,      0
    xvldx            xr2,    a1,      a2
    xvldx            xr4,    a1,      t2
    xvldx            xr6,    a1,      t3

    add.d            a1,     a1,      t4
    xvld             xr10,   a1,      0
    xvldx            xr11,   a1,      a2
    xvldx            xr12,   a1,      t2

    xvbsrl.v         xr1,    xr0,     4
    xvbsrl.v         xr3,    xr2,     4
    xvbsrl.v         xr5,    xr4,     4
    xvbsrl.v         xr7,    xr6,     4

    SHUFB            xr0,    xr23,    xr9,    xr13
    SHUFB            xr1,    xr23,    xr9,    xr14
    SHUFB            xr2,    xr23,    xr9,    xr15
    SHUFB            xr3,    xr23,    xr9,    xr16
    SHUFB            xr4,    xr23,    xr9,    xr17
    SHUFB            xr5,    xr23,    xr9,    xr18
    SHUFB            xr6,    xr23,    xr9,    xr19
    SHUFB            xr7,    xr23,    xr9,    xr20

    xvdp2.h.bu.b     xr0,    xr13,    xr22
    xvdp2.h.bu.b     xr1,    xr14,    xr22
    xvdp2.h.bu.b     xr2,    xr15,    xr22
    xvdp2.h.bu.b     xr3,    xr16,    xr22
    xvdp2.h.bu.b     xr4,    xr17,    xr22
    xvdp2.h.bu.b     xr5,    xr18,    xr22
    xvdp2.h.bu.b     xr6,    xr19,    xr22
    xvdp2.h.bu.b     xr7,    xr20,    xr22

    HADDWDH          xr0
    HADDWDH          xr1
    HADDWDH          xr2
    HADDWDH          xr3
    HADDWDH          xr4
    HADDWDH          xr5
    HADDWDH          xr6
    HADDWDH          xr7

    xvpackev.w       xr0,    xr2,    xr0
    xvpackev.w       xr2,    xr6,    xr4
    xvpackev.d       xr16,   xr2,    xr0
    xvpackod.d       xr0,    xr2,    xr0
    xvpickev.h       xr0,    xr0,    xr16
    xvsrari.h        xr0,    xr0,    2   // 0 8 16 24  1 9 17 25  2 10 18 26  3 11 19 27

    xvpackev.w       xr1,    xr3,    xr1
    xvpackev.w       xr3,    xr7,    xr5
    xvpackev.d       xr16,   xr3,    xr1
    xvpackod.d       xr1,    xr3,    xr1
    xvpickev.h       xr1,    xr1,    xr16
    xvsrari.h        xr1,    xr1,    2   // 4 12 20 28  5 13 21 29  6 14 22 30  7 15 23 31

    xvbsrl.v         xr13,   xr10,    4
    xvbsrl.v         xr14,   xr11,    4
    xvbsrl.v         xr15,   xr12,    4

    SHUFB            xr10,   xr23,   xr9,    xr10
    SHUFB            xr13,   xr23,   xr9,    xr13
    SHUFB            xr11,   xr23,   xr9,    xr11
    SHUFB            xr14,   xr23,   xr9,    xr14
    SHUFB            xr12,   xr23,   xr9,    xr12
    SHUFB            xr15,   xr23,   xr9,    xr15

    xvdp2.h.bu.b     xr4,    xr10,   xr22
    xvdp2.h.bu.b     xr5,    xr13,   xr22
    xvdp2.h.bu.b     xr6,    xr11,   xr22
    xvdp2.h.bu.b     xr7,    xr14,   xr22
    xvdp2.h.bu.b     xr9,    xr12,   xr22
    xvdp2.h.bu.b     xr10,   xr15,   xr22

    HADDWDH          xr4
    HADDWDH          xr5
    HADDWDH          xr6
    HADDWDH          xr7
    HADDWDH          xr9
    HADDWDH          xr10

    xvpackev.w       xr4,    xr6,    xr4
    xvpackev.w       xr9,    xr12,   xr9
    xvpackev.d       xr16,   xr9,    xr4
    xvpackod.d       xr11,   xr9,    xr4
    xvpickev.h       xr2,    xr11,   xr16
    xvsrari.h        xr2,    xr2,    2   // 32 40 48 *  33 41 49 *  34 42 50 *  35 43 51 *

    xvpackev.w       xr5,    xr7,    xr5
    xvpackev.w       xr10,   xr12,   xr10
    xvpackev.d       xr16,   xr10,   xr5
    xvpackod.d       xr11,   xr10,   xr5
    xvpickev.h       xr3,    xr11,   xr16
    xvsrari.h        xr3,    xr3,    2   // 36 44 52 *  37 45 53 *  38 46 54 *  39 47 55 *

    xvpackev.d       xr18,   xr2,    xr0 // 0 8 16 24 32 40 48 *  2 10 18 26 34 42 50 *
    xvpackod.d       xr19,   xr2,    xr0 // 1 9 17 25 33 41 49 *  3 11 19 27 35 43 51 *
    xvpackev.d       xr20,   xr3,    xr1 // 4 12 20 28 36 44 52 *  6 14 22 30 38 46 54 *
    xvpackod.d       xr21,   xr3,    xr1 // 5 13 21 29 37 45 53 *  7 15 23 31 39 47 55 *

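// hv 8-wide inner loop: xr18-xr21 carry the seven preceding horizontally
// filtered rows, interleaved per output column. Each iteration filters 4 new
// rows, splices them in with xvextrins, and evaluates four vertical 8-tap
// passes (//h - 1 .. //h - 4) before the windows are shifted down.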
.l_\lable\()hv_8w_loop:
    xvldx            xr0,    a1,     t3
    add.d            a1,     a1,     t4
    xvld             xr2,    a1,     0
    xvldx            xr4,    a1,     a2
    xvldx            xr6,    a1,     t2

    xvbsrl.v         xr1,    xr0,    4
    xvbsrl.v         xr3,    xr2,    4
    xvbsrl.v         xr5,    xr4,    4
    xvbsrl.v         xr7,    xr6,    4

    SHUFB            xr0,    xr23,   xr9,   xr0
    SHUFB            xr1,    xr23,   xr9,   xr1
    SHUFB            xr2,    xr23,   xr9,   xr2
    SHUFB            xr3,    xr23,   xr9,   xr3
    SHUFB            xr4,    xr23,   xr9,   xr4
    SHUFB            xr5,    xr23,   xr9,   xr5
    SHUFB            xr6,    xr23,   xr9,   xr6
    SHUFB            xr7,    xr23,   xr9,   xr7

    xvdp2.h.bu.b     xr10,   xr0,    xr22
    xvdp2.h.bu.b     xr11,   xr1,    xr22
    xvdp2.h.bu.b     xr12,   xr2,    xr22
    xvdp2.h.bu.b     xr13,   xr3,    xr22
    xvdp2.h.bu.b     xr14,   xr4,    xr22
    xvdp2.h.bu.b     xr15,   xr5,    xr22
    xvdp2.h.bu.b     xr16,   xr6,    xr22
    xvdp2.h.bu.b     xr17,   xr7,    xr22

    HADDWDH          xr10
    HADDWDH          xr11
    HADDWDH          xr12
    HADDWDH          xr13
    HADDWDH          xr14
    HADDWDH          xr15
    HADDWDH          xr16
    HADDWDH          xr17

    xvpackev.w       xr0,    xr12,   xr10
    xvpackev.w       xr2,    xr16,   xr14
    xvpackev.d       xr9,    xr2,    xr0
    xvpackod.d       xr0,    xr2,    xr0
    xvpickev.h       xr0,    xr0,    xr9
    xvsrari.h        xr0,    xr0,    2   // 56 64 72 80  57 65 73 81  58 66 74 82  59 67 75 83

    xvpackev.w       xr1,    xr13,   xr11
    xvpackev.w       xr3,    xr17,   xr15
    xvpackev.d       xr9,    xr3,    xr1
    xvpackod.d       xr1,    xr3,    xr1
    xvpickev.h       xr1,    xr1,    xr9
    xvsrari.h        xr1,    xr1,    2   // 60 68 76 84  61 69 77 85  62 70 78 86  63 71 79 87

    xvextrins.h      xr18,   xr0,    0x70 // 0 8 16 24 32 40 48 (56)  2 10 18 26 34 42 50 (58)
    xvextrins.h      xr19,   xr0,    0x74 // 1 9 17 25 33 41 49 (57)  3 11 19 27 35 43 51 (59)
    xvextrins.h      xr20,   xr1,    0x70
    xvextrins.h      xr21,   xr1,    0x74

    //h - 1
    xvdp2.w.h        xr10,   xr18,   xr8
    xvdp2.w.h        xr11,   xr19,   xr8
    xvdp2.w.h        xr12,   xr20,   xr8
    xvdp2.w.h        xr13,   xr21,   xr8

    HADDWQW          xr10
    HADDWQW          xr11
    HADDWQW          xr12
    HADDWQW          xr13

    xvpackev.w       xr2,    xr11,   xr10 //0 1 * * 2 3 * *
    xvpackev.w       xr3,    xr13,   xr12 //4 5 * * 6 7 * *
    xvpackev.d       xr2,    xr3,    xr2  //0 1 4 5  2 3 6 7
    //h - 2
    xvbsrl.v         xr4,    xr18,   2
    xvbsrl.v         xr5,    xr19,   2
    xvbsrl.v         xr6,    xr20,   2
    xvbsrl.v         xr7,    xr21,   2
    xvextrins.h      xr4,    xr0,    0x71
    xvextrins.h      xr5,    xr0,    0x75
    xvextrins.h      xr6,    xr1,    0x71
    xvextrins.h      xr7,    xr1,    0x75

    xvdp2.w.h        xr10,   xr4,    xr8
    xvdp2.w.h        xr11,   xr5,    xr8
    xvdp2.w.h        xr12,   xr6,    xr8
    xvdp2.w.h        xr13,   xr7,    xr8

    HADDWQW          xr10
    HADDWQW          xr11
    HADDWQW          xr12
    HADDWQW          xr13

    xvpackev.w       xr14,   xr11,   xr10
    xvpackev.w       xr15,   xr13,   xr12
    xvpackev.d       xr14,   xr15,   xr14 //8 9 12 13  10 11 14 15
    //h - 3
    xvbsrl.v         xr4,    xr4,    2
    xvbsrl.v         xr5,    xr5,    2
    xvbsrl.v         xr6,    xr6,    2
    xvbsrl.v         xr7,    xr7,    2
    xvextrins.h      xr4,    xr0,    0x72
    xvextrins.h      xr5,    xr0,    0x76
    xvextrins.h      xr6,    xr1,    0x72
    xvextrins.h      xr7,    xr1,    0x76

    xvdp2.w.h        xr10,   xr4,    xr8
    xvdp2.w.h        xr11,   xr5,    xr8
    xvdp2.w.h        xr12,   xr6,    xr8
    xvdp2.w.h        xr13,   xr7,    xr8

    HADDWQW          xr10
    HADDWQW          xr11
    HADDWQW          xr12
    HADDWQW          xr13

    xvpackev.w       xr15,   xr11,   xr10
    xvpackev.w       xr16,   xr13,   xr12
    xvpackev.d       xr15,   xr16,   xr15 //16 17 20 21  18 19 22 23
    //h - 4
    xvbsrl.v         xr4,    xr4,    2
    xvbsrl.v         xr5,    xr5,    2
    xvbsrl.v         xr6,    xr6,    2
    xvbsrl.v         xr7,    xr7,    2
    xvextrins.h      xr4,    xr0,    0x73
    xvextrins.h      xr5,    xr0,    0x77
    xvextrins.h      xr6,    xr1,    0x73
    xvextrins.h      xr7,    xr1,    0x77

    xvdp2.w.h        xr10,   xr4,    xr8
    xvdp2.w.h        xr11,   xr5,    xr8
    xvdp2.w.h        xr12,   xr6,    xr8
    xvdp2.w.h        xr13,   xr7,    xr8

    HADDWQW          xr10
    HADDWQW          xr11
    HADDWQW          xr12
    HADDWQW          xr13

    xvpackev.w       xr16,   xr11,   xr10
    xvpackev.w       xr17,   xr13,   xr12
    xvpackev.d       xr16,   xr17,   xr16 //24 25 28 29  26 27 30 31

    xvsrari.w        xr2,    xr2,    6
    xvsrari.w        xr14,   xr14,   6
    xvsrari.w        xr15,   xr15,   6
    xvsrari.w        xr16,   xr16,   6

    xvpermi.d        xr2,    xr2,    0xd8
    xvpermi.d        xr14,   xr14,   0xd8
    xvpermi.d        xr15,   xr15,   0xd8
    xvpermi.d        xr16,   xr16,   0xd8
    xvpickev.h       xr2,    xr14,   xr2
    xvpickev.h       xr3,    xr16,   xr15
    xvpermi.d        xr2,    xr2,    0xd8
    xvpermi.d        xr3,    xr3,    0xd8

    xvpermi.q        xr10,   xr2,    0x31
    xvpermi.q        xr11,   xr3,    0x31

    vst              vr2,    a0,     0
    vstx             vr10,   a0,     t7 // a0 + 1 * t7
    slli.w           t1,     t7,     1  // t1 = 2 * t7
    vstx             vr3,    a0,     t1
    add.w            t1,     t1,     t7 // t1 = 3 * t7
    vstx             vr11,   a0,     t1
    slli.w           t1,     t7,     2  // t1 = 4 * t7
    add.d            a0,     a0,     t1

    xvbsrl.v         xr18,   xr4,    2
    xvbsrl.v         xr19,   xr5,    2
    xvbsrl.v         xr20,   xr6,    2
    xvbsrl.v         xr21,   xr7,    2

    addi.d           a4,     a4,     -4
    bnez             a4,     .l_\lable\()hv_8w_loop

    addi.d           a1,     t0,     8
    addi.d           t0,     t0,     8
    addi.d           a0,     t8,     16
    addi.d           t8,     t8,     16
    addi.d           a4,     t5,     0
    addi.d           a3,     a3,    -8
    bnez             a3,     .l_\lable\()hv_8w_loop0
    b                .l_\lable\()end_pre_8tap
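// Vertical-only path: a7 >> 2 extracts the vertical filter type. Each filter
// row of the subpel table is 15 entries * 8 taps = 120 bytes, plus
// (my - 1) * 8 for the subpel position; t6 is assumed to hold the table base.
// When a4 <= t0 (t0 presumably holds 4), the short-filter rows 3 + (type & 1)
// are selected instead, mirroring the C side's
// h > 4 ? type : 3 + (type & 1) choice.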
.l_\lable\()v:

    srli.w           a7,    a7,    2
    blt              t0,    a4,    .l_\lable\()v_idx_fv
    andi             a7,    a7,    1
    addi.w           a7,    a7,    3
.l_\lable\()v_idx_fv:
    addi.w           t5,     zero,  120
    mul.w            a7,     a7,    t5
    addi.w           t5,     a6,    -1
    slli.w           t5,     t5,    3
    add.w            a7,     a7,    t5
    add.d            a7,     t6,    a7 //fv's offset
    xvldrepl.d       xr8,    a7,     0

    sub.d            a1,     a1,     t3
    beq              a3,     t0,     .l_\lable\()v_4w
    blt              t0,     a3,     .l_\lable\()v_8w
.l_\lable\()v_4w:
    fld.s            f0,     a1,     0
    fldx.s           f1,     a1,     a2
    fldx.s           f2,     a1,     t2
    add.d            a1,     a1,     t3
    fld.s            f3,     a1,     0
    fldx.s           f4,     a1,     a2
    fldx.s           f5,     a1,     t2
    fldx.s           f6,     a1,     t3

    xvilvl.b         xr0,    xr1,    xr0 // 0 1  8 9  16 17 24 25
    xvilvl.b         xr1,    xr3,    xr2 // 2 3 10 11 18 19 26 27
    xvilvl.b         xr2,    xr5,    xr4 // 4 5 12 13 20 21 28 29
    xvilvl.b         xr3,    xr7,    xr6 // 6 7 14 15 22 23 30 31
    xvilvl.h         xr0,    xr1,    xr0 // 0 1 2 3  8  9  10 11  16 17 18 19  24 25 26 27
    xvilvl.h         xr1,    xr3,    xr2 // 4 5 6 7  12 13 14 15  20 21 22 23  28 29 30 31
    xvilvl.w         xr2,    xr1,    xr0
    xvilvh.w         xr0,    xr1,    xr0
    xvpermi.q        xr0,    xr2,    0x20

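// v 4-wide loop: xr0 is the interleaved byte window of the last 8 input rows
// per column. Each iteration splices in 4 freshly loaded rows one at a time
// (xvextrins), runs four 8-tap vertical dot products against the taps in xr8,
// rounds with a shift of 2 and stores 4 rows * 4 px of int16 (32 bytes).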
.l_\lable\()v_4w_loop:
    add.d            a1,     a1,     t4
    fld.s            f7,     a1,     0  //h0
    fldx.s           f10,    a1,     a2 //h1
    fldx.s           f11,    a1,     t2 //h2
    fldx.s           f12,    a1,     t3 //h3

    xvbsrl.v         xr9,    xr7,    2
    xvpermi.q        xr9,    xr7,    0x20
    xvextrins.b      xr0,    xr9,    0x70
    xvextrins.b      xr0,    xr9,    0xf1

    xvbsrl.v         xr1,    xr0,    1
    xvbsrl.v         xr7,    xr10,   2
    xvpermi.q        xr7,    xr10,   0x20
    xvextrins.b      xr1,    xr7,    0x70
    xvextrins.b      xr1,    xr7,    0xf1

    xvbsrl.v         xr2,    xr1,    1
    xvbsrl.v         xr7,    xr11,   2
    xvpermi.q        xr7,    xr11,   0x20
    xvextrins.b      xr2,    xr7,    0x70
    xvextrins.b      xr2,    xr7,    0xf1

    xvbsrl.v         xr3,    xr2,    1
    xvbsrl.v         xr7,    xr12,   2
    xvpermi.q        xr7,    xr12,   0x20
    xvextrins.b      xr3,    xr7,    0x70
    xvextrins.b      xr3,    xr7,    0xf1
    xvbsrl.v         xr4,    xr3,    1

    xvdp2.h.bu.b     xr10,   xr0,    xr8
    xvdp2.h.bu.b     xr11,   xr1,    xr8
    xvdp2.h.bu.b     xr12,   xr2,    xr8
    xvdp2.h.bu.b     xr13,   xr3,    xr8
    HADDWDH          xr10
    HADDWDH          xr11
    HADDWDH          xr12
    HADDWDH          xr13
    xvpickev.w       xr10,   xr11,   xr10
    xvpickev.w       xr11,   xr13,   xr12
    xvpermi.d        xr10,   xr10,   0xd8
    xvpermi.d        xr11,   xr11,   0xd8
    xvpickev.h       xr10,   xr11,   xr10
    xvpermi.d        xr10,   xr10,   0xd8
    xvsrari.h        xr10,   xr10,   2

    xvaddi.bu        xr0,    xr4,    0

    xvst             xr10,   a0,     0
    addi.d           a0,     a0,     32
    addi.w           a4,     a4,     -4
    bnez             a4,     .l_\lable\()v_4w_loop
    b                .l_\lable\()end_pre_8tap

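// v path for w >= 8: the same vertical filter, applied in 4-pixel column
// strips; t7 = (w / 4) * 8 is the tmp row stride in bytes (w * 2), and each
// output row is stored as one 8-byte (4 * int16) element via xvstelm.d.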
.l_\lable\()v_8w:
    addi.d           t0,     a1,     0
    addi.d           t5,     a4,     0
    srli.w           t7,     a3,     2
    slli.w           t7,     t7,     3
    addi.d           t8,     a0,     0
.l_\lable\()v_8w_loop0:
    fld.s            f0,     a1,     0
    fldx.s           f1,     a1,     a2
    fldx.s           f2,     a1,     t2
    add.d            a1,     a1,     t3
    fld.s            f3,     a1,     0
    fldx.s           f4,     a1,     a2
    fldx.s           f5,     a1,     t2
    fldx.s           f6,     a1,     t3

    xvilvl.b         xr0,    xr1,    xr0 // 0 1  8 9  16 17 24 25
    xvilvl.b         xr1,    xr3,    xr2 // 2 3 10 11 18 19 26 27
    xvilvl.b         xr2,    xr5,    xr4 // 4 5 12 13 20 21 28 29
    xvilvl.b         xr3,    xr7,    xr6 // 6 7 14 15 22 23 30 31
    xvilvl.h         xr0,    xr1,    xr0 // 0 1 2 3  8  9  10 11  16 17 18 19  24 25 26 27
    xvilvl.h         xr1,    xr3,    xr2 // 4 5 6 7  12 13 14 15  20 21 22 23  28 29 30 31
    xvilvl.w         xr2,    xr1,    xr0
    xvilvh.w         xr0,    xr1,    xr0
    xvpermi.q        xr0,    xr2,    0x20

.l_\lable\()v_8w_loop:
    add.d            a1,     a1,     t4
    fld.s            f7,     a1,     0  //h0
    fldx.s           f10,    a1,     a2 //h1
    fldx.s           f11,    a1,     t2 //h2
    fldx.s           f12,    a1,     t3 //h3

    xvbsrl.v         xr9,    xr7,    2
    xvpermi.q        xr9,    xr7,    0x20
    xvextrins.b      xr0,    xr9,    0x70
    xvextrins.b      xr0,    xr9,    0xf1

    xvbsrl.v         xr1,    xr0,    1
    xvbsrl.v         xr7,    xr10,   2
    xvpermi.q        xr7,    xr10,   0x20
    xvextrins.b      xr1,    xr7,    0x70
    xvextrins.b      xr1,    xr7,    0xf1

    xvbsrl.v         xr2,    xr1,    1
    xvbsrl.v         xr7,    xr11,   2
    xvpermi.q        xr7,    xr11,   0x20
    xvextrins.b      xr2,    xr7,    0x70
    xvextrins.b      xr2,    xr7,    0xf1

    xvbsrl.v         xr3,    xr2,    1
    xvbsrl.v         xr7,    xr12,   2
    xvpermi.q        xr7,    xr12,   0x20
    xvextrins.b      xr3,    xr7,    0x70
    xvextrins.b      xr3,    xr7,    0xf1
    xvbsrl.v         xr4,    xr3,    1

    xvdp2.h.bu.b     xr10,   xr0,    xr8
    xvdp2.h.bu.b     xr11,   xr1,    xr8
    xvdp2.h.bu.b     xr12,   xr2,    xr8
    xvdp2.h.bu.b     xr13,   xr3,    xr8
    HADDWDH          xr10
    HADDWDH          xr11
    HADDWDH          xr12
    HADDWDH          xr13
    xvpickev.w       xr10,   xr11,   xr10
    xvpickev.w       xr11,   xr13,   xr12
    xvpermi.d        xr10,   xr10,   0xd8
    xvpermi.d        xr11,   xr11,   0xd8
    xvpickev.h       xr10,   xr11,   xr10
    xvpermi.d        xr10,   xr10,   0xd8
    xvsrari.h        xr10,   xr10,   2

    xvaddi.bu        xr0,    xr4,    0

    xvstelm.d        xr10,   a0,     0,    0
    add.d            a0,     a0,     t7
    xvstelm.d        xr10,   a0,     0,    1
    add.d            a0,     a0,     t7
    xvstelm.d        xr10,   a0,     0,    2
    add.d            a0,     a0,     t7
    xvstelm.d        xr10,   a0,     0,    3
    add.d            a0,     a0,     t7
    addi.w           a4,     a4,     -4
    bnez             a4,     .l_\lable\()v_8w_loop

    addi.d           a1,     t0,     4
    addi.d           t0,     t0,     4
    addi.d           a0,     t8,     8
    addi.d           t8,     t8,     8
    addi.d           a4,     t5,     0
    addi.d           a3,     a3,     -4
    bnez             a3,     .l_\lable\()v_8w_loop0

.l_\lable\()end_pre_8tap:
.endm

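/*
 * Entry points. For reference, these are expected to match dav1d's mct
 * prototype (src/mc.h):
 * void (name)(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
 *             int w, int h, int mx, int my HIGHBD_DECL_SUFFIX)
 * The constant loaded into a7 packs the filter pair as
 * horizontal_type | (vertical_type << 2), with regular = 0, smooth = 1 and
 * sharp = 2, hence the values 0, 1, 2, 4, 5, 6, 8, 9, 10 below.
 */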
function prep_8tap_regular_8bpc_lasx
    addi.w a7, zero, 0
    PREP_8TAP_8BPC_LASX 0
endfunc

function prep_8tap_smooth_regular_8bpc_lasx
    addi.w a7, zero, 1
    PREP_8TAP_8BPC_LASX 1
endfunc

function prep_8tap_sharp_regular_8bpc_lasx
    addi.w a7, zero, 2
    PREP_8TAP_8BPC_LASX 2
endfunc

function prep_8tap_regular_smooth_8bpc_lasx
    addi.w a7, zero, 4
    PREP_8TAP_8BPC_LASX 4
endfunc

function prep_8tap_smooth_8bpc_lasx
    addi.w a7, zero, 5
    PREP_8TAP_8BPC_LASX 5
endfunc

function prep_8tap_sharp_smooth_8bpc_lasx
    addi.w a7, zero, 6
    PREP_8TAP_8BPC_LASX 6
endfunc

function prep_8tap_regular_sharp_8bpc_lasx
    addi.w a7, zero, 8
    PREP_8TAP_8BPC_LASX 8
endfunc

function prep_8tap_smooth_sharp_8bpc_lasx
    addi.w a7, zero, 9
    PREP_8TAP_8BPC_LASX 9
endfunc

function prep_8tap_sharp_8bpc_lasx
    addi.w a7, zero, 10
    PREP_8TAP_8BPC_LASX 10
endfunc