/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
27
28#include "src/loongarch/loongson_asm.S"
29
/*
 * FILTER_W4 DIR, TYPE -- 4-tap loop filter over 4 byte pixels on one edge.
 *
 * DIR   h: the taps p1 p0 q0 q1 are the 4 consecutive bytes at a0 - 2 in each
 *          of 4 rows (rows are a1 bytes apart); they are transposed after the
 *          load and transposed back before the store.
 *       otherwise: the taps are the 4 rows a0 - 2*a1 .. a0 + a1, 4 bytes each.
 * TYPE  only used to build the exit label, which must be unique per expansion.
 *
 * In:   a0 = address of q0, a1 = row stride in bytes,
 *       vr2 / vr3 / vr4 = per-byte thresholds compared as H (hev), E and I.
 * Out:  p1 p0 q0 q1 are written back; lanes whose fm mask is clear get their
 *       original value. Nothing is stored if fm is zero for all 4 lanes.
 * Clobbers: t5, vr6-vr20, vr26, vr29.
 * NOTE(review): vr26/vr29 alias f26/f29; saving them is presumably done by the
 * function that expands this macro -- not visible here, confirm.
 */
.macro FILTER_W4 DIR, TYPE
.ifc \DIR, h
    addi.d           t5,     a0,    -2
    fld.s            f6,     t5,     0  //p1 p0 q0 q1
    fldx.s           f7,     t5,     a1
    alsl.d           t5,     a1,     t5,    1
    fld.s            f8,     t5,     0
    fldx.s           f9,     t5,     a1

    // 4x4 byte transpose: one tap per register, one row per byte lane
    vilvl.b          vr6,    vr7,    vr6
    vilvl.b          vr7,    vr9,    vr8
    vilvl.h          vr6,    vr7,    vr6 //p1p1p1p1
    vbsrl.v          vr7,    vr6,    4   //p0p0p0p0
    vbsrl.v          vr8,    vr7,    4   //q0q0q0q0
    vbsrl.v          vr9,    vr8,    4   //q1q1q1q1
.else
    sub.d            t5,     a0,     a1
    fld.s            f7,     t5,     0   //p0
    sub.d            t5,     t5,     a1
    fld.s            f6,     t5,     0   //p1
    fld.s            f8,     a0,     0   //q0
    fldx.s           f9,     a0,     a1  //q1
.endif

    vabsd.bu         vr10,   vr6,    vr7 // (p1 - p0)
    vabsd.bu         vr11,   vr9,    vr8 // (q1 - q0)
    vabsd.bu         vr12,   vr7,    vr8 // (p0 - q0)
    vabsd.bu         vr13,   vr6,    vr9 // (p1 - q1)

    vmax.bu          vr14,   vr10,   vr11
    vsle.bu          vr15,   vr14,   vr4  //abs(p1 - p0) <= I && abs(q1 - q0) <= I
    vsadd.bu         vr16,   vr12,   vr12
    vsrli.b          vr17,   vr13,   1
    vsadd.bu         vr16,   vr16,   vr17 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
    vsle.bu          vr16,   vr16,   vr3  // ... <= E
    vand.v           vr20,   vr15,   vr16 //fm

    // the 4 active lanes are word 0: leave early when none passes fm
    vpickve2gr.wu    t5,     vr20,   0
    beqz             t5,     .END_FILTER_\DIR\()\TYPE\()_W4

    vslt.bu          vr16,   vr2,    vr14 //hev = max(...) > H

    vsllwil.hu.bu    vr17,   vr6,    0
    vsllwil.hu.bu    vr18,   vr9,    0
    vsub.h           vr17,   vr17,   vr18
    vssrani.b.h      vr17,   vr17,   0    //f = iclip_diff(p1 - q1)

    vand.v           vr17,   vr17,   vr16 //f is kept only where hev
    vsllwil.h.b      vr18,   vr17,   0

    vsllwil.hu.bu    vr10,   vr8,    0
    vsllwil.hu.bu    vr11,   vr7,    0
    vsub.h           vr10,   vr10,   vr11

    vsadd.h          vr11,   vr10,   vr10
    vsadd.h          vr10,   vr10,   vr11 //3 * (q0 - p0)
    vsadd.h          vr10,   vr10,   vr18 //f = iclip_diff(3 * (q0 - p0) + f);
    vssrani.b.h      vr10,   vr10,   0
    vsllwil.h.b      vr10,   vr10,   0

    vaddi.hu         vr11,   vr10,   4
    vaddi.hu         vr12,   vr10,   3
    li.w             t5,     127
    vreplgr2vr.h     vr13,   t5
    vmin.h           vr11,   vr11,   vr13
    vmin.h           vr12,   vr12,   vr13
    vsrai.h          vr11,   vr11,   3 //f1 = min(f + 4, 127) >> 3
    vsrai.h          vr12,   vr12,   3 //f2 = min(f + 3, 127) >> 3

    vsllwil.hu.bu    vr13,   vr7,    0 //p0
    vsllwil.hu.bu    vr14,   vr8,    0 //q0
    vsadd.h          vr13,   vr13,   vr12
    vssub.h          vr14,   vr14,   vr11
    vssrani.bu.h     vr13,   vr13,   0 //dst-1
    vssrani.bu.h     vr14,   vr14,   0 //dst+0

    vsrari.h         vr15,   vr11,   1 //f = (f1 + 1) >> 1
    vsllwil.hu.bu    vr18,   vr6,    0 //p1
    vsllwil.hu.bu    vr19,   vr9,    0 //q1
    vsadd.h          vr18,   vr18,   vr15
    vssub.h          vr19,   vr19,   vr15
    vssrani.bu.h     vr18,   vr18,   0 //dst-2
    vssrani.bu.h     vr19,   vr19,   0 //dst+1
    // p1/q1 keep their original value where hev is set
    vbitsel.v        vr26,   vr18,   vr6,    vr16
    vbitsel.v        vr29,   vr19,   vr9,    vr16

    // commit the filtered taps only where fm is set
    vbitsel.v        vr6,    vr6,    vr26,   vr20
    vbitsel.v        vr7,    vr7,    vr13,   vr20
    vbitsel.v        vr8,    vr8,    vr14,   vr20
    vbitsel.v        vr9,    vr9,    vr29,   vr20

.ifc \DIR, h
    // transpose back to one row (p1 p0 q0 q1) per 32-bit word
    vilvl.b          vr6,    vr7,    vr6
    vilvl.b          vr9,    vr9,    vr8
    vilvl.h          vr6,    vr9,    vr6

    addi.d           t5,     a0,    -2
    vstelm.w         vr6,    t5,     0,      0
    add.d            t5,     t5,     a1
    vstelm.w         vr6,    t5,     0,      1
    add.d            t5,     t5,     a1
    vstelm.w         vr6,    t5,     0,      2
    add.d            t5,     t5,     a1
    vstelm.w         vr6,    t5,     0,      3
.else
    fst.s            f8,     a0,     0
    fstx.s           f9,     a0,     a1
    sub.d            t5,     a0,     a1
    fst.s            f7,     t5,     0
    sub.d            t5,     t5,     a1
    fst.s            f6,     t5,     0
.endif
.END_FILTER_\DIR\()\TYPE\()_W4:
.endm
147
/*
 * FILTER_W6 DIR, TYPE -- 6-tap loop filter over 4 byte pixels on one edge.
 *
 * DIR   h: 8 bytes are loaded at a0 - 3 from each of 4 rows (a1 bytes apart),
 *          of which p2 p1 p0 q0 q1 q2 are used; the 4 bytes p1 p0 q0 q1 are
 *          stored back at a0 - 2.
 *       otherwise: the taps are the 6 rows a0 - 3*a1 .. a0 + 2*a1, 4 bytes
 *          each; the rows a0 - 2*a1 .. a0 + a1 are stored back.
 * TYPE  only used to build the exit label, which must be unique per expansion.
 *
 * In:   a0 = address of q0, a1 = row stride in bytes,
 *       vr2 / vr3 / vr4 = per-byte thresholds compared as H (hev), E and I.
 * Out:  p1 p0 q0 q1 are written back: the flat (smoothing) result where
 *       flat8in is set, the narrow-filter result elsewhere, and the original
 *       value where fm is clear. Nothing is stored if fm is zero for all
 *       4 lanes.
 * Clobbers: t5, vr0-vr3, vr6-vr29. vr2 and vr3 do NOT hold H and E any more
 *       on exit (vr2 becomes the hev mask, vr3 the constant 127).
 * NOTE(review): vr24-vr29 alias f24-f29; saving them is presumably done by the
 * function that expands this macro -- not visible here, confirm.
 */
.macro FILTER_W6 DIR, TYPE
.ifc \DIR, h
    addi.d           t5,     a0,    -3
    fld.d            f6,     t5,     0 //p2 p1 p0 q0 q1 q2
    fldx.d           f7,     t5,     a1
    alsl.d           t5,     a1,     t5,    1
    fld.d            f8,     t5,     0
    fldx.d           f9,     t5,     a1

    // 4 rows x 8 columns byte transpose: one tap per register
    vilvl.b          vr6,    vr7,    vr6
    vilvl.b          vr7,    vr9,    vr8
    vilvh.h          vr10,   vr7,    vr6 //q1
    vilvl.h          vr6,    vr7,    vr6 //p2

    vbsrl.v          vr7,    vr6,    4 //p1
    vbsrl.v          vr8,    vr7,    4 //p0
    vbsrl.v          vr9,    vr8,    4 //q0
    vbsrl.v          vr11,   vr10,   4 //q2
.else
    // Only 4 pixels per row are tested and stored below, so load 4 bytes per
    // row (same width as the stores) instead of reading past the column.
    alsl.d           t5,     a1,     a1,    1
    sub.d            t5,     a0,     t5      //a0 - 3 * stride
    fld.s            f6,     t5,     0       //p2
    fldx.s           f7,     t5,     a1      //p1
    alsl.d           t5,     a1,     t5,    1
    fld.s            f8,     t5,     0       //p0
    fldx.s           f9,     t5,     a1      //q0
    alsl.d           t5,     a1,     t5,    1
    fld.s            f10,    t5,     0       //q1
    fldx.s           f11,    t5,     a1      //q2
.endif

    vabsd.bu         vr12,   vr7,    vr8 //abs(p1-p0)
    vabsd.bu         vr13,   vr10,   vr9 //abs(q1-q0)
    vmax.bu          vr14,   vr12,   vr13
    vslt.bu          vr2,    vr2,    vr14 //hev (overwrites the H threshold)
    vabsd.bu         vr12,   vr6,    vr7 //abs(p2-p1)
    vmax.bu          vr12,   vr12,   vr14
    vabsd.bu         vr13,   vr11,   vr10 //abs(q2-q1)
    vmax.bu          vr12,   vr12,   vr13
    vsle.bu          vr0,    vr12,   vr4 // <=I

    vabsd.bu         vr13,   vr8,    vr9 //abs(p0-q0)
    vsadd.bu         vr13,   vr13,   vr13
    vabsd.bu         vr15,   vr7,    vr10
    vsrli.b          vr15,   vr15,   1
    vsadd.bu         vr13,   vr13,   vr15
    vsle.bu          vr13,   vr13,   vr3 //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
    vand.v           vr0,    vr0,    vr13 //fm

    // the 4 active lanes are word 0: leave early when none passes fm
    vpickve2gr.wu    t5,     vr0,    0
    beqz             t5,     .END_FILTER_\DIR\()\TYPE\()_W6

    vabsd.bu         vr12,   vr6,    vr8 //abs(p2-p0)
    vabsd.bu         vr13,   vr11,   vr9 //abs(q2-q0)
    vmax.bu          vr12,   vr12,   vr14
    vmax.bu          vr12,   vr12,   vr13
    vxor.v           vr13,   vr13,   vr13
    vaddi.bu         vr13,   vr13,   1    //flatness limit 1
    vsle.bu          vr1,    vr12,   vr13 //flat8in

    //6789 10 11 --expand to h
    vsllwil.hu.bu    vr12,   vr6,    0 //p2
    vsllwil.hu.bu    vr13,   vr7,    0 //p1
    vsllwil.hu.bu    vr14,   vr8,    0 //p0
    vsllwil.hu.bu    vr15,   vr9,    0 //q0
    vsllwil.hu.bu    vr16,   vr10,   0 //q1
    vsllwil.hu.bu    vr17,   vr11,   0 //q2

    //dst-2 = 3*p2 + 2*p1 + 2*p0 + q0
    vsadd.hu         vr18,   vr12,   vr12
    vsadd.hu         vr18,   vr18,   vr12
    vsadd.hu         vr18,   vr18,   vr13
    vsadd.hu         vr18,   vr18,   vr13
    vsadd.hu         vr18,   vr18,   vr14
    vsadd.hu         vr18,   vr18,   vr14
    vsadd.hu         vr18,   vr18,   vr15

    //dst-1 = dst-2 sum + q0 + q1 - 2*p2
    vsadd.hu         vr19,   vr18,   vr15
    vsadd.hu         vr19,   vr19,   vr16
    vssub.hu         vr19,   vr19,   vr12
    vssub.hu         vr19,   vr19,   vr12

    //dst+0 = dst-1 sum + q2 + q1 - p2 - p1
    vsadd.hu         vr20,   vr19,   vr17
    vsadd.hu         vr20,   vr20,   vr16
    vssub.hu         vr20,   vr20,   vr12
    vssub.hu         vr20,   vr20,   vr13

    //dst+1 = dst+0 sum + 2*q2 - p1 - p0
    vsadd.hu         vr21,   vr20,   vr17
    vsadd.hu         vr21,   vr21,   vr17
    vssub.hu         vr21,   vr21,   vr13
    vssub.hu         vr21,   vr21,   vr14

    // (sum + 4) >> 3
    vsrari.h         vr18,   vr18,   3
    vsrari.h         vr19,   vr19,   3
    vsrari.h         vr20,   vr20,   3
    vsrari.h         vr21,   vr21,   3

    // narrow filter, used where flat8in is clear
    vsub.h           vr22,   vr13,   vr16
    vssrani.b.h      vr22,   vr22,   0
    vand.v           vr22,   vr22,   vr2
    vsllwil.h.b      vr22,   vr22,   0 //f = iclip_diff(p1 - q1);

    vsub.h           vr23,   vr15,   vr14
    vsadd.h          vr24,   vr23,   vr23
    vsadd.h          vr23,   vr23,   vr24
    vsadd.h          vr23,   vr23,   vr22
    vssrani.b.h      vr23,   vr23,   0
    vsllwil.h.b      vr23,   vr23,   0 //f = iclip_diff(3 * (q0 - p0) + f);

    vaddi.hu         vr24,   vr23,   4
    vaddi.hu         vr25,   vr23,   3
    li.w             t5,     127
    vreplgr2vr.h     vr3,    t5        //overwrites the E threshold
    vmin.h           vr24,   vr24,   vr3
    vmin.h           vr25,   vr25,   vr3
    vsrai.h          vr24,   vr24,   3 //f1
    vsrai.h          vr25,   vr25,   3 //f2

    vsadd.h          vr26,   vr14,   vr25 //dst-1
    vssub.h          vr27,   vr15,   vr24 //dst+0

    vsrari.h         vr24,   vr24,   1    //f = (f1 + 1) >> 1
    vsadd.h          vr28,   vr13,   vr24
    vssub.h          vr29,   vr16,   vr24
    vsllwil.h.b      vr2,    vr2,    0    //hev mask widened to halfwords
    vbitsel.v        vr28,   vr28,   vr13,   vr2 //dst-2
    vbitsel.v        vr29,   vr29,   vr16,   vr2 //dst+1

    //flat8in: take the smoothed value where set, the narrow result elsewhere
    vsllwil.h.b      vr1,    vr1,    0
    vbitsel.v        vr18,   vr28,   vr18,   vr1
    vbitsel.v        vr19,   vr26,   vr19,   vr1
    vbitsel.v        vr20,   vr27,   vr20,   vr1
    vbitsel.v        vr21,   vr29,   vr21,   vr1

    // back to bytes with unsigned saturation
    vssrani.bu.h     vr18,   vr18,   0
    vssrani.bu.h     vr19,   vr19,   0
    vssrani.bu.h     vr20,   vr20,   0
    vssrani.bu.h     vr21,   vr21,   0

    // commit only where fm is set
    vbitsel.v        vr7,    vr7,    vr18,   vr0 //p1
    vbitsel.v        vr8,    vr8,    vr19,   vr0 //p0
    vbitsel.v        vr9,    vr9,    vr20,   vr0 //q0
    vbitsel.v        vr10,   vr10,   vr21,   vr0 //q1

.ifc \DIR, h
    // transpose back to one row (p1 p0 q0 q1) per 32-bit word
    vilvl.b          vr7,    vr8,    vr7
    vilvl.b          vr9,    vr10,   vr9
    vilvl.h          vr7,    vr9,    vr7

    addi.d           t5,     a0,    -2
    vstelm.w         vr7,    t5,     0,      0
    add.d            t5,     t5,     a1
    vstelm.w         vr7,    t5,     0,      1
    add.d            t5,     t5,     a1
    vstelm.w         vr7,    t5,     0,      2
    add.d            t5,     t5,     a1
    vstelm.w         vr7,    t5,     0,      3
.else
    fst.s            f9,     a0,     0
    fstx.s           f10,    a0,     a1
    sub.d            t5,     a0,     a1
    fst.s            f8,     t5,     0
    sub.d            t5,     t5,     a1
    fst.s            f7,     t5,     0
.endif
.END_FILTER_\DIR\()\TYPE\()_W6:
.endm
319
/*
 * FILTER_W8 DIR, TYPE -- 8-tap loop filter over 4 byte pixels on one edge.
 *
 * DIR   h: the taps p3 p2 p1 p0 q0 q1 q2 q3 are the 8 consecutive bytes at
 *          a0 - 4 in each of 4 rows (a1 bytes apart); all 8 bytes per row are
 *          stored back (p3 and q3 with their original value).
 *       otherwise: the taps are the 8 rows a0 - 4*a1 .. a0 + 3*a1, 4 bytes
 *          each; the rows a0 - 3*a1 .. a0 + 2*a1 are stored back.
 * TYPE  only used to build the exit label, which must be unique per expansion.
 *
 * In:   a0 = address of q0, a1 = row stride in bytes,
 *       vr2 / vr3 / vr4 = per-byte thresholds compared as H (hev), E and I.
 * Out:  p2 .. q2 are written back: the flat (smoothing) result where flat8in
 *       is set, the narrow-filter result (p1 p0 q0 q1 only) elsewhere, and
 *       the original value where fm is clear. The early exit is taken, and
 *       nothing stored, when the p1/p0/q0/q1 part of fm is zero for all
 *       4 lanes.
 * Clobbers: t5 and every vector register vr0-vr31; the H/E/I thresholds in
 *       vr2/vr3/vr4 are destroyed.
 * NOTE(review): vr24-vr31 alias f24-f31; saving them is presumably done by the
 * function that expands this macro -- not visible here, confirm.
 */
.macro FILTER_W8 DIR, TYPE
.ifc \DIR, h
    addi.d           t5,     a0,    -4
    fld.d            f6,     t5,     0 //p3 p2 p1 p0 q0 q1 q2 q3
    fldx.d           f7,     t5,     a1
    alsl.d           t5,     a1,     t5,     1
    fld.d            f8,     t5,     0
    fldx.d           f9,     t5,     a1

    // 4 rows x 8 columns byte transpose: one tap per register
    vilvl.b          vr6,    vr7,    vr6
    vilvl.b          vr7,    vr9,    vr8
    vilvh.h          vr10,   vr7,    vr6 //q0
    vilvl.h          vr6,    vr7,    vr6 //p3
    vbsrl.v          vr7,    vr6,    4   //p2
    vbsrl.v          vr8,    vr6,    8   //p1
    vbsrl.v          vr9,    vr6,    12  //p0
    vbsrl.v          vr11,   vr10,   4   //q1
    vbsrl.v          vr12,   vr10,   8   //q2
    vbsrl.v          vr13,   vr10,   12  //q3
.else
    fld.s            f10,    a0,     0   //q0
    fldx.s           f11,    a0,     a1  //q1
    add.d            t5,     a0,     a1
    fldx.s           f12,    t5,     a1  //q2
    add.d            t5,     t5,     a1
    fldx.s           f13,    t5,     a1  //q3
    sub.d            t5,     a0,     a1
    fld.s            f9,     t5,     0   //p0
    sub.d            t5,     t5,     a1
    fld.s            f8,     t5,     0   //p1
    sub.d            t5,     t5,     a1
    fld.s            f7,     t5,     0   //p2
    sub.d            t5,     t5,     a1
    fld.s            f6,     t5,     0   //p3
.endif

    vabsd.bu         vr14,   vr8,    vr9  //p1-p0
    vabsd.bu         vr15,   vr11,   vr10 //q1-q0
    vabsd.bu         vr16,   vr9,    vr10 //p0-q0
    vabsd.bu         vr17,   vr8,    vr11 //p1-q1
    vabsd.bu         vr18,   vr7,    vr8  //p2-p1
    vabsd.bu         vr19,   vr12,   vr11 //q2-q1
    vabsd.bu         vr20,   vr6,    vr7  //p3-p2
    vabsd.bu         vr21,   vr13,   vr12 //q3-q2

    vmax.bu          vr22,   vr14,   vr15
    vsle.bu          vr23,   vr22,   vr4  //abs(p1 - p0) <= I && abs(q1 - q0) <= I
    vsadd.bu         vr16,   vr16,   vr16
    vsrli.b          vr17,   vr17,   1
    vsadd.bu         vr16,   vr16,   vr17
    vsle.bu          vr16,   vr16,   vr3  //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
    vand.v           vr16,   vr16,   vr23 //fm

    // early exit on the partial mask; the p2/p3/q2/q3 terms are added below
    vpickve2gr.wu    t5,     vr16,   0
    beqz             t5,     .END_FILTER_\DIR\()\TYPE\()_W8

    vmax.bu          vr23,   vr18,   vr19
    vmax.bu          vr23,   vr23,   vr20
    vmax.bu          vr23,   vr23,   vr21
    vsle.bu          vr23,   vr23,   vr4
    vand.v           vr16,   vr16,   vr23 //fm

    vabsd.bu         vr17,   vr7,    vr9  //abs(p2-p0)
    vabsd.bu         vr18,   vr12,   vr10 //abs(q2-q0)
    vmax.bu          vr17,   vr17,   vr14
    vmax.bu          vr17,   vr17,   vr15
    vmax.bu          vr17,   vr17,   vr18
    vabsd.bu         vr18,   vr6,    vr9  //abs(p3 - p0)
    vabsd.bu         vr19,   vr13,   vr10 //abs(q3 - q0)
    vmax.bu          vr17,   vr17,   vr18
    vmax.bu          vr17,   vr17,   vr19

    vxor.v           vr5,    vr5,    vr5
    vaddi.bu         vr5,    vr5,    1    //F
    vsle.bu          vr17,   vr17,   vr5  //flat8in

    // widen the taps to halfwords (vr3/vr4/vr5 are reused from here on)
    vsllwil.hu.bu    vr0,    vr6,    0 //p3
    vsllwil.hu.bu    vr1,    vr7,    0 //p2
    vsllwil.hu.bu    vr27,   vr8,    0 //p1
    vsllwil.hu.bu    vr3,    vr9,    0 //p0
    vsllwil.hu.bu    vr4,    vr10,   0 //q0
    vsllwil.hu.bu    vr5,    vr11,   0 //q1
    vsllwil.hu.bu    vr14,   vr12,   0 //q2
    vsllwil.hu.bu    vr15,   vr13,   0 //q3

    // pair sums shared by the six flat outputs
    vsadd.hu         vr18,   vr0,    vr0  //p3+p3
    vsadd.hu         vr19,   vr15,   vr15 //q3+q3
    vsadd.hu         vr20,   vr0,    vr1  //p3+p2
    vsadd.hu         vr21,   vr1,    vr27 //p2+p1
    vsadd.hu         vr28,   vr27,   vr3  //p1+p0
    vsadd.hu         vr23,   vr3,    vr4  //p0+q0
    vsadd.hu         vr24,   vr4,    vr5  //q0+q1
    vsadd.hu         vr25,   vr5,    vr14 //q1+q2
    vsadd.hu         vr26,   vr14,   vr15 //q2+q3

    // dst-3 = 3*p3 + 2*p2 + p1 + p0 + q0
    vsadd.hu         vr29,   vr18,   vr20
    vsadd.hu         vr29,   vr29,   vr21
    vsadd.hu         vr29,   vr29,   vr23

    // dst-2 = 2*p3 + p2 + 2*p1 + p0 + q0 + q1
    vsadd.hu         vr30,   vr18,   vr21
    vsadd.hu         vr30,   vr30,   vr28
    vsadd.hu         vr30,   vr30,   vr24

    // dst-1 = p3 + p2 + p1 + 2*p0 + q0 + q1 + q2
    vsadd.hu         vr31,   vr20,   vr28
    vsadd.hu         vr31,   vr31,   vr23
    vsadd.hu         vr31,   vr31,   vr25

    // dst+0 = p2 + p1 + p0 + 2*q0 + q1 + q2 + q3
    vsadd.hu         vr18,   vr21,   vr23
    vsadd.hu         vr18,   vr18,   vr24
    vsadd.hu         vr18,   vr18,   vr26

    //dst+1 = p1 + p0 + q0 + 2*q1 + q2 + 2*q3
    vsadd.hu         vr20,   vr28,   vr24
    vsadd.hu         vr20,   vr20,   vr25
    vsadd.hu         vr20,   vr20,   vr19

    //dst+2 = p0 + q0 + q1 + 2*q2 + 3*q3
    vsadd.hu         vr21,   vr23,   vr25
    vsadd.hu         vr21,   vr21,   vr26
    vsadd.hu         vr21,   vr21,   vr19

    // (sum + 4) >> 3, narrowed to unsigned bytes
    vssrarni.bu.h    vr23,   vr29,   3
    vssrarni.bu.h    vr24,   vr30,   3
    vssrarni.bu.h    vr25,   vr31,   3
    vssrarni.bu.h    vr19,   vr18,   3
    vssrarni.bu.h    vr20,   vr20,   3
    vssrarni.bu.h    vr21,   vr21,   3

    // !flat8in
    vslt.bu          vr2,    vr2,    vr22 //hev (overwrites the H threshold)

    vsub.h           vr30,   vr27,   vr5 //p1-q1
    vssrani.b.h      vr30,   vr30,   0
    vand.v           vr30,   vr30,   vr2
    vsllwil.h.b      vr30,   vr30,   0   //f = iclip_diff(p1 - q1) & hev

    vsub.h           vr31,   vr4,    vr3
    vsadd.h          vr0,    vr31,   vr31
    vsadd.h          vr31,   vr31,   vr0
    vsadd.h          vr31,   vr31,   vr30
    vssrani.b.h      vr31,   vr31,   0
    vsllwil.h.b      vr31,   vr31,   0 //f = iclip_diff(3 * (q0 - p0) + f);

    vaddi.hu         vr14,   vr31,   4
    vaddi.hu         vr15,   vr31,   3
    li.w             t5,     127
    vreplgr2vr.h     vr18,   t5
    vmin.h           vr14,   vr14,   vr18
    vmin.h           vr15,   vr15,   vr18
    vsrai.h          vr14,   vr14,   3 //f1
    vsrai.h          vr15,   vr15,   3 //f2

    vsadd.h          vr3,    vr3,    vr15
    vssub.h          vr4,    vr4,    vr14
    vssrani.bu.h     vr3,    vr3,    0 //dst-1
    vssrani.bu.h     vr4,    vr4,    0 //dst+0

    vsrari.h         vr14,   vr14,   1 //f = (f1 + 1) >> 1
    vsadd.h          vr18,   vr27,   vr14
    vssub.h          vr26,   vr5,    vr14
    vssrani.bu.h     vr18,   vr18,   0 //dst-2
    vssrani.bu.h     vr26,   vr26,   0 //dst+1

    // p1/q1 keep their original value where hev is set
    vbitsel.v        vr27,   vr18,   vr8,   vr2 //dst-2
    vbitsel.v        vr28,   vr26,   vr11,  vr2 //dst+1

    // flat8in set: smoothed value; clear: narrow result (p2/q2 untouched)
    vbitsel.v        vr23,   vr7,    vr23,  vr17 //dst-3 (p2)
    vbitsel.v        vr24,   vr27,   vr24,  vr17 //dst-2
    vbitsel.v        vr25,   vr3,    vr25,  vr17 //dst-1
    vbitsel.v        vr19,   vr4,    vr19,  vr17 //dst+0
    vbitsel.v        vr20,   vr28,   vr20,  vr17 //dst+1
    vbitsel.v        vr21,   vr12,   vr21,  vr17 //dst+2

    // commit only where fm is set
    vbitsel.v        vr7,    vr7,    vr23,  vr16 //-3
    vbitsel.v        vr8,    vr8,    vr24,  vr16 //-2
    vbitsel.v        vr9,    vr9,    vr25,  vr16 //-1
    vbitsel.v        vr10,   vr10,   vr19,  vr16 //+0
    vbitsel.v        vr11,   vr11,   vr20,  vr16 //+1
    vbitsel.v        vr12,   vr12,   vr21,  vr16 //+2

.ifc \DIR, h
    // transpose back to one 8-byte row per doubleword
    vilvl.b          vr6,    vr7,    vr6
    vilvl.b          vr8,    vr9,    vr8
    vilvl.b          vr10,   vr11,   vr10
    vilvl.b          vr12,   vr13,   vr12
    vilvl.h          vr6,    vr8,    vr6  //p3p2p1p0 -- -- --
    vilvl.h          vr10,   vr12,   vr10 //q0q1q2q3 -- -- --
    vilvl.w          vr0,    vr10,   vr6  //p3p2p1p0q0q1q2q3 --
    vilvh.w          vr1,    vr10,   vr6  //--

    addi.d           t5,     a0,     -4
    vstelm.d         vr0,    t5,     0,     0
    add.d            t5,     t5,     a1
    vstelm.d         vr0,    t5,     0,     1
    add.d            t5,     t5,     a1
    vstelm.d         vr1,    t5,     0,     0
    add.d            t5,     t5,     a1
    vstelm.d         vr1,    t5,     0,     1
.else
    alsl.d           t5,     a1,     a1,    1
    sub.d            t5,     a0,     t5      //a0 - 3 * stride
    fst.s            f7,     t5,     0       //p2
    fstx.s           f8,     t5,     a1      //p1
    add.d            t5,     t5,     a1
    fstx.s           f9,     t5,     a1      //p0

    fst.s            f10,    a0,     0       //q0
    add.d            t5,     a0,     a1
    fst.s            f11,    t5,     0       //q1
    fstx.s           f12,    t5,     a1      //q2
.endif
.END_FILTER_\DIR\()\TYPE\()_W8:
.endm
537
538.macro FILTER_W16 DIR, TYPE
539.ifc \DIR, h
540    addi.d           t5,     a0,    -7
541    vld              vr6,    t5,     0 //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
542    vldx             vr7,    t5,     a1
543    add.d            t5,     t5,     a1
544    vldx             vr8,    t5,     a1
545    add.d            t5,     t5,     a1
546    vldx             vr9,    t5,     a1
547
548    vilvl.b          vr10,   vr7,    vr6
549    vilvh.b          vr11,   vr7,    vr6
550    vilvl.b          vr12,   vr9,    vr8
551    vilvh.b          vr13,   vr9,    vr8
552    vilvl.h          vr6,    vr12,   vr10
553    vilvh.h          vr10,   vr12,   vr10 //p2---
554    vilvl.h          vr15,   vr13,   vr11 //q1---
555    vilvh.h          vr19,   vr13,   vr11
556
557    vbsrl.v          vr7,    vr6,    4    //p5---
558    vbsrl.v          vr8,    vr6,    8    //p4---
559    vbsrl.v          vr9,    vr6,    12   //p3---
560    vbsrl.v          vr12,   vr10,   4    //p1---
561    vbsrl.v          vr13,   vr10,   8    //p0---
562    vbsrl.v          vr14,   vr10,   12   //q0---
563    vbsrl.v          vr16,   vr15,   4    //q2---
564    vbsrl.v          vr17,   vr15,   8    //q3---
565    vbsrl.v          vr18,   vr15,   12   //q4---
566    vbsrl.v          vr20,   vr19,   4    //q6---
567.else
568    slli.d           t5,     a1,     3
569    sub.d            t5,     a0,     t5
570    fldx.s           f6,     t5,     a1  //p6
571    alsl.d           t5,     a1,     t5,    1
572    fld.s            f7,     t5,     0   //p5
573    fldx.s           f8,     t5,     a1  //p4
574    alsl.d           t5,     a1,     t5,    1
575    fld.s            f9,     t5,     0   //p3
576    fldx.s           f10,    t5,     a1  //p2
577    alsl.d           t5,     a1,     t5,    1
578    fld.s            f12,    t5,     0   //p1
579    fldx.s           f13,    t5,     a1  //p0
580    alsl.d           t5,     a1,     t5,    1
581    fld.s            f14,    t5,     0   //q0
582    fldx.s           f15,    t5,     a1  //q1
583    alsl.d           t5,     a1,     t5,    1
584    fld.s            f16,    t5,     0   //q2
585    fldx.s           f17,    t5,     a1  //q3
586    alsl.d           t5,     a1,     t5,    1
587    fld.s            f18,    t5,     0   //q4
588    fldx.s           f19,    t5,     a1  //q5
589    add.d            t5,     t5,     a1
590    fldx.s           f20,    t5,     a1  //q6
591
592    //temp store
593    addi.d           sp,     sp,    -96
594    fst.d            f7,     sp,     0
595    fst.d            f8,     sp,     8
596    fst.d            f9,     sp,     16
597    fst.d            f10,    sp,     24
598    fst.d            f12,    sp,     32
599    fst.d            f13,    sp,     40
600    fst.d            f14,    sp,     48
601    fst.d            f15,    sp,     56
602    fst.d            f16,    sp,     64
603    fst.d            f17,    sp,     72
604    fst.d            f18,    sp,     80
605    fst.d            f19,    sp,     88
606.endif
607
608    vabsd.bu         vr21,   vr12,   vr13 //abs(p1-p0)
609    vabsd.bu         vr22,   vr15,   vr14 //abs(q1-q0)
610    vmax.bu          vr0,    vr21,   vr22
611    vslt.bu          vr2,    vr2,    vr0  //hev
612    vabsd.bu         vr1,    vr10,   vr12 //abs(p2-p1)
613    vmax.bu          vr0,    vr0,    vr1
614    vabsd.bu         vr1,    vr16,   vr15 //abs(q2-q1)
615    vmax.bu          vr0,    vr0,    vr1
616    vabsd.bu         vr1,    vr9,    vr10 //abs(p3-p2)
617    vmax.bu          vr0,    vr0,    vr1
618    vabsd.bu         vr1,    vr17,   vr16 //abs(q3-q2)
619    vmax.bu          vr0,    vr0,    vr1
620    vsle.bu          vr0,    vr0,    vr4  //vr4 released I
621    vabsd.bu         vr1,    vr13,   vr14 //abs(p0-q0)
622    vsadd.bu         vr1,    vr1,    vr1
623    vabsd.bu         vr4,    vr12,   vr15 //abs(p1-q1)
624    vsrli.b          vr4,    vr4,    1
625    vsadd.bu         vr1,    vr1,    vr4  //abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
626    vsle.bu          vr1,    vr1,    vr3  //vr3 released E
627    vand.v           vr0,    vr0,    vr1  //fm
628
629    vpickve2gr.wu    t5,     vr0,    0
630    beqz             t5,     .END_FILTER_\DIR\()\TYPE\()_W16
631
632    vabsd.bu         vr1,    vr6,    vr13 //abs(p6-p0)
633    vabsd.bu         vr4,    vr7,    vr13 //abs(p5-p0)
634    vmax.bu          vr1,    vr1,    vr4
635    vabsd.bu         vr4,    vr8,    vr13 //abs(p4-p0)
636    vmax.bu          vr1,    vr1,    vr4
637    vabsd.bu         vr4,    vr18,   vr14 //abs(q4-q0)
638    vmax.bu          vr1,    vr1,    vr4
639    vabsd.bu         vr4,    vr19,   vr14 //abs(q5-q0)
640    vmax.bu          vr1,    vr1,    vr4
641    vabsd.bu         vr4,    vr20,   vr14
642    vmax.bu          vr1,    vr1,    vr4
643    vxor.v           vr5,    vr5,    vr5
644    vaddi.bu         vr5,    vr5,    1    //F
645    vsle.bu          vr1,    vr1,    vr5  //flat8out
646
647    vabsd.bu         vr3,    vr10,   vr13 //abs(p2-p0)
648    vmax.bu          vr3,    vr3,    vr21
649    vmax.bu          vr3,    vr3,    vr22
650    vabsd.bu         vr4,    vr16,   vr14 //abs(q2-q0)
651    vmax.bu          vr3,    vr3,    vr4
652    vabsd.bu         vr4,    vr9,    vr13 //abs(p3-p0)
653    vmax.bu          vr3,    vr3,    vr4
654    vabsd.bu         vr4,    vr17,   vr14 //abs(q3-q0)
655    vmax.bu          vr3,    vr3,    vr4
656    vsle.bu          vr3,    vr3,    vr5  //flatin released vr5
657
658    vsllwil.hu.bu    vr6,    vr6,    0    //p6
659    vsllwil.hu.bu    vr7,    vr7,    0    //p5
660    vsllwil.hu.bu    vr8,    vr8,    0    //p4
661    vsllwil.hu.bu    vr9,    vr9,    0    //p3
662    vsllwil.hu.bu    vr10,   vr10,   0    //p2
663    vsllwil.hu.bu    vr12,   vr12,   0    //p1
664    vsllwil.hu.bu    vr13,   vr13,   0    //p0
665    vsllwil.hu.bu    vr14,   vr14,   0    //q0
666    vsllwil.hu.bu    vr15,   vr15,   0    //q1
667    vsllwil.hu.bu    vr16,   vr16,   0    //q2
668    vsllwil.hu.bu    vr17,   vr17,   0    //q3
669    vsllwil.hu.bu    vr18,   vr18,   0    //q4
670    vsllwil.hu.bu    vr19,   vr19,   0    //q5
671    vsllwil.hu.bu    vr20,   vr20,   0    //q6
672
673    //dst-6
674    vslli.w          vr21,   vr6,    3
675    vssub.hu         vr21,   vr21,   vr6
676    vsadd.hu         vr21,   vr21,   vr7
677    vsadd.hu         vr21,   vr21,   vr7
678    vsadd.hu         vr21,   vr21,   vr8
679    vsadd.hu         vr21,   vr21,   vr8
680    vsadd.hu         vr21,   vr21,   vr9
681    vsadd.hu         vr21,   vr21,   vr10
682    vsadd.hu         vr21,   vr21,   vr12
683    vsadd.hu         vr21,   vr21,   vr13
684    vsadd.hu         vr21,   vr21,   vr14
685
686    //dst-5
687    vsadd.hu         vr22,   vr21,   vr15
688    vsadd.hu         vr22,   vr22,   vr9
689    vssub.hu         vr22,   vr22,   vr6
690    vssub.hu         vr22,   vr22,   vr6
691
692    //dst-4
693    vsadd.hu         vr23,   vr22,   vr16
694    vsadd.hu         vr23,   vr23,   vr10
695    vssub.hu         vr23,   vr23,   vr7
696    vssub.hu         vr23,   vr23,   vr6
697
698    //dst-3
699    vsadd.hu         vr24,   vr23,   vr12
700    vsadd.hu         vr24,   vr24,   vr17
701    vssub.hu         vr24,   vr24,   vr6
702    vssub.hu         vr24,   vr24,   vr8
703
704    //dst-2
705    vsadd.hu         vr25,   vr24,   vr18
706    vsadd.hu         vr25,   vr25,   vr13
707    vssub.hu         vr25,   vr25,   vr6
708    vssub.hu         vr25,   vr25,   vr9
709
710    //dst-1
711    vsadd.hu         vr26,   vr25,   vr19
712    vsadd.hu         vr26,   vr26,   vr14
713    vssub.hu         vr26,   vr26,   vr6
714    vssub.hu         vr26,   vr26,   vr10
715
716    //dst+0
717    vsadd.hu         vr27,   vr26,   vr20
718    vsadd.hu         vr27,   vr27,   vr15
719    vssub.hu         vr27,   vr27,   vr6
720    vssub.hu         vr27,   vr27,   vr12
721
722    //dst+1
723    vsadd.hu         vr28,   vr27,   vr20
724    vsadd.hu         vr28,   vr28,   vr16
725    vssub.hu         vr28,   vr28,   vr7
726    vssub.hu         vr28,   vr28,   vr13
727
728    //dst+2
729    vsadd.hu         vr29,   vr28,   vr20
730    vsadd.hu         vr29,   vr29,   vr17
731    vssub.hu         vr29,   vr29,   vr8
732    vssub.hu         vr29,   vr29,   vr14
733
734    //dst+3
735    vsadd.hu         vr30,   vr29,   vr20
736    vsadd.hu         vr30,   vr30,   vr18
737    vssub.hu         vr30,   vr30,   vr9
738    vssub.hu         vr30,   vr30,   vr15
739
740    //dst+4
741    vsadd.hu         vr31,   vr30,   vr20
742    vsadd.hu         vr31,   vr31,   vr19
743    vssub.hu         vr31,   vr31,   vr10
744    vssub.hu         vr31,   vr31,   vr16
745
746    //dst+5
747    vsadd.hu         vr11,   vr31,   vr20
748    vsadd.hu         vr11,   vr11,   vr20
749    vssub.hu         vr11,   vr11,   vr12
750    vssub.hu         vr11,   vr11,   vr17
751
752    vsrari.h         vr21,   vr21,   4
753    vsrari.h         vr22,   vr22,   4
754    vsrari.h         vr23,   vr23,   4
755    vsrari.h         vr24,   vr24,   4
756    vsrari.h         vr25,   vr25,   4
757    vsrari.h         vr26,   vr26,   4
758    vsrari.h         vr27,   vr27,   4
759    vsrari.h         vr28,   vr28,   4
760    vsrari.h         vr29,   vr29,   4
761    vsrari.h         vr30,   vr30,   4
762    vsrari.h         vr31,   vr31,   4
763    vsrari.h         vr11,   vr11,   4
764
765    vand.v           vr1,    vr1,    vr3
766    vsllwil.h.b      vr1,    vr1,    0 //expand to h
767    //(flat8out & flat8in)
768    vbitsel.v        vr21,   vr7,    vr21,    vr1  //dst-6
769    vbitsel.v        vr22,   vr8,    vr22,    vr1  //dst-5
770    vbitsel.v        vr23,   vr9,    vr23,    vr1  //dst-4
771    vbitsel.v        vr30,   vr17,   vr30,    vr1  //dst+3
772    vbitsel.v        vr31,   vr18,   vr31,    vr1  //dst+4
773    vbitsel.v        vr11,   vr19,   vr11,    vr1  //dst+5
774
775    //flat8in
776    //dst-3
777    vslli.h          vr4,    vr9,    1
778    vsadd.hu         vr4,    vr4,    vr9 //p3*3
779    vsadd.hu         vr4,    vr4,    vr10
780    vsadd.hu         vr4,    vr4,    vr10
781    vsadd.hu         vr4,    vr4,    vr12
782    vsadd.hu         vr4,    vr4,    vr13
783    vsadd.hu         vr4,    vr4,    vr14
784
785    //dst-2
786    vsadd.hu         vr5,    vr4,    vr12
787    vsadd.hu         vr5,    vr5,    vr15
788    vssub.hu         vr5,    vr5,    vr9
789    vssub.hu         vr5,    vr5,    vr10
790
791    //dst-1
792    vsadd.hu         vr18,   vr5,    vr13
793    vsadd.hu         vr18,   vr18,   vr16
794    vssub.hu         vr18,   vr18,   vr9
795    vssub.hu         vr18,   vr18,   vr12
796
797    //dst+0
798    vsadd.hu         vr7,    vr18,   vr14
799    vsadd.hu         vr7,    vr7,    vr17
800    vssub.hu         vr7,    vr7,    vr9
801    vssub.hu         vr7,    vr7,    vr13
802
803    //dst+1
804    vsadd.hu         vr8,    vr7,    vr15
805    vsadd.hu         vr8,    vr8,    vr17
806    vssub.hu         vr8,    vr8,    vr10
807    vssub.hu         vr8,    vr8,    vr14
808
809    //dst+2
810    vsadd.hu         vr9,    vr8,    vr16
811    vsadd.hu         vr9,    vr9,    vr17
812    vssub.hu         vr9,    vr9,    vr12
813    vssub.hu         vr9,    vr9,    vr15
814
815    vsrari.h         vr4,    vr4,    3
816    vsrari.h         vr5,    vr5,    3
817    vsrari.h         vr18,   vr18,   3
818    vsrari.h         vr7,    vr7,    3
819    vsrari.h         vr8,    vr8,    3
820    vsrari.h         vr9,    vr9,    3
821
822    //flat8out & flat8in
823    vbitsel.v        vr24,   vr4,    vr24,    vr1 //dst-3
824    vbitsel.v        vr25,   vr5,    vr25,    vr1 //dst-2
825    vbitsel.v        vr26,   vr18,   vr26,    vr1 //dst-1
826    vbitsel.v        vr27,   vr7,    vr27,    vr1 //dst+0
827    vbitsel.v        vr28,   vr8,    vr28,    vr1 //dst+1
828    vbitsel.v        vr29,   vr9,    vr29,    vr1 //dst+2
829
830    //!flat8in
831    vsub.h           vr17,   vr12,   vr15 //p1-q1
832    vsllwil.h.b      vr2,    vr2,    0
833    vand.v           vr17,   vr17,   vr2  //&hev
834    vssrani.b.h      vr17,   vr17,   0
835    vsllwil.h.b      vr17,   vr17,   0
836
837    vsub.h           vr7,    vr14,   vr13
838    vsadd.h          vr8,    vr7,    vr7
839    vsadd.h          vr7,    vr7,    vr8
840    vsadd.h          vr7,    vr7,    vr17
841    vssrani.b.h      vr7,    vr7,    0
842    vsllwil.h.b      vr17,   vr7,    0  //f = iclip_diff(3 * (q0 - p0) + f);
843
844    vaddi.hu         vr7,    vr17,   4
845    vaddi.hu         vr8,    vr17,   3
846    li.w             t5,     127
847    vreplgr2vr.h     vr9,    t5
848    vmin.h           vr7,    vr7,    vr9
849    vmin.h           vr8,    vr8,    vr9
850    vsrai.h          vr7,    vr7,    3  //f1
851    vsrai.h          vr8,    vr8,    3  //f2
852
853    vsadd.h          vr4,    vr13,   vr8  //dst-1
854    vssub.h          vr5,    vr14,   vr7  //dst+0
855
856    vsrari.h         vr7,    vr7,    1
857    vsadd.h          vr17,   vr12,   vr7
858    vssub.h          vr7,    vr15,   vr7
859    vbitsel.v        vr17,   vr17,   vr12,    vr2  //dst-2
860    vbitsel.v        vr7,    vr7,    vr15,    vr2  //dst+1
861
862    //flat8in or !flat8in
863    vsllwil.h.b      vr3,    vr3,    0
864    vbitsel.v        vr24,   vr10,   vr24,    vr3  //dst-3
865    vbitsel.v        vr25,   vr17,   vr25,    vr3  //dst-2
866    vbitsel.v        vr26,   vr4,    vr26,    vr3  //dst-1
867    vbitsel.v        vr27,   vr5,    vr27,    vr3  //dst+0
868    vbitsel.v        vr28,   vr7,    vr28,    vr3  //dst+1
869    vbitsel.v        vr29,   vr16,   vr29,    vr3  //dst+2
870
871.ifc \DIR, h
872    //dst-6,dst-2,dst-5,dst-1
873    vssrani.bu.h     vr25,   vr21,   0
874    vssrani.bu.h     vr26,   vr22,   0
875    vpermi.w         vr25,   vr25,   0xd8
876    vpermi.w         vr26,   vr26,   0xd8
877    vilvl.b          vr6,    vr26,   vr25 //65656565 21212121
878
879    //dst-4,dst+0,dst-3,dst+1
880    vssrani.bu.h     vr27,   vr23,   0
881    vssrani.bu.h     vr28,   vr24,   0
882    vpermi.w         vr27,   vr27,   0xd8
883    vpermi.w         vr28,   vr28,   0xd8
884    vilvl.b          vr26,   vr28,   vr27 //43434343 01010101
885
886    vilvl.h          vr21,   vr26,   vr6  //6543 -- -- --
887    vilvh.h          vr22,   vr26,   vr6  //2101 -- -- --
888    vilvl.w          vr20,   vr22,   vr21 //65432101 --
889    vilvh.w          vr22,   vr22,   vr21 //65432101 --
890    vreplvei.d       vr21,   vr20,   1
891    vreplvei.d       vr23,   vr22,   1
892
893    //dst+2,dst+4,dst+3,dst+5
894    vssrani.bu.h     vr31,   vr29,   0
895    vssrani.bu.h     vr11,   vr30,   0
896    vpermi.w         vr31,   vr31,   0xd8
897    vpermi.w         vr11,   vr11,   0xd8
898    vilvl.b          vr11,   vr11,   vr31 //23232323 45454545
899    vshuf4i.w        vr11,   vr11,   0xd8
900    vshuf4i.h        vr11,   vr11,   0xd8 //2345 -- -- --
901
902    vextrins.w       vr20,   vr11,   0x20
903    vextrins.w       vr21,   vr11,   0x21
904    vextrins.w       vr22,   vr11,   0x22
905    vextrins.w       vr23,   vr11,   0x23
906
907    addi.d           t5,     a0,     -6
908    vld              vr6,    t5,     0  //p6p5p4p3p2p1p0q0 q1q2q3q4q5q6
909    vldx             vr7,    t5,     a1
910    add.d            t5,     t5,     a1
911    vldx             vr8,    t5,     a1
912    add.d            t5,     t5,     a1
913    vldx             vr9,    t5,     a1
914
915    //expand fm to 128
916    vreplvei.b       vr10,   vr0,    0
917    vreplvei.b       vr11,   vr0,    1
918    vreplvei.b       vr12,   vr0,    2
919    vreplvei.b       vr13,   vr0,    3
920
921    vbitsel.v        vr20,   vr6,    vr20,    vr10
922    vbitsel.v        vr21,   vr7,    vr21,    vr11
923    vbitsel.v        vr22,   vr8,    vr22,    vr12
924    vbitsel.v        vr23,   vr9,    vr23,    vr13
925
926    addi.d           t5,     a0,    -6
927    vstelm.d         vr20,   t5,     0,       0
928    vstelm.w         vr20,   t5,     8,       2
929    add.d            t5,     t5,     a1
930    vstelm.d         vr21,   t5,     0,       0
931    vstelm.w         vr21,   t5,     8,       2
932    add.d            t5,     t5,     a1
933    vstelm.d         vr22,   t5,     0,       0
934    vstelm.w         vr22,   t5,     8,       2
935    add.d            t5,     t5,     a1
936    vstelm.d         vr23,   t5,     0,       0
937    vstelm.w         vr23,   t5,     8,       2
938.else
939    //reload
940    fld.d            f7,     sp,     0
941    fld.d            f8,     sp,     8
942    fld.d            f9,     sp,     16
943    fld.d            f10,    sp,     24
944    fld.d            f12,    sp,     32
945    fld.d            f13,    sp,     40
946    fld.d            f14,    sp,     48
947    fld.d            f15,    sp,     56
948    fld.d            f16,    sp,     64
949    fld.d            f17,    sp,     72
950    fld.d            f18,    sp,     80
951    fld.d            f19,    sp,     88
952
953    vssrarni.bu.h    vr21,   vr21,   0
954    vssrarni.bu.h    vr22,   vr22,   0
955    vssrarni.bu.h    vr23,   vr23,   0
956    vssrarni.bu.h    vr24,   vr24,   0
957    vssrarni.bu.h    vr25,   vr25,   0
958    vssrarni.bu.h    vr26,   vr26,   0
959    vssrarni.bu.h    vr27,   vr27,   0
960    vssrarni.bu.h    vr28,   vr28,   0
961    vssrarni.bu.h    vr29,   vr29,   0
962    vssrarni.bu.h    vr30,   vr30,   0
963    vssrarni.bu.h    vr31,   vr31,   0
964    vssrarni.bu.h    vr11,   vr11,   0
965
966    vbitsel.v        vr7,    vr7,    vr21,   vr0 //p5
967    vbitsel.v        vr8,    vr8,    vr22,   vr0 //p4
968    vbitsel.v        vr9,    vr9,    vr23,   vr0 //p3
969    vbitsel.v        vr10,   vr10,   vr24,   vr0 //p2
970    vbitsel.v        vr12,   vr12,   vr25,   vr0 //p1
971    vbitsel.v        vr13,   vr13,   vr26,   vr0 //p0
972    vbitsel.v        vr14,   vr14,   vr27,   vr0 //q0
973    vbitsel.v        vr15,   vr15,   vr28,   vr0 //q1
974    vbitsel.v        vr16,   vr16,   vr29,   vr0 //q2
975    vbitsel.v        vr17,   vr17,   vr30,   vr0 //q3
976    vbitsel.v        vr18,   vr18,   vr31,   vr0 //q4
977    vbitsel.v        vr19,   vr19,   vr11,   vr0 //q5
978
979    fst.s            f14,    a0,     0
980    fstx.s           f15,    a0,     a1
981    alsl.d           t5,     a1,     a0,     1
982    fst.s            f16,    t5,     0
983    fstx.s           f17,    t5,     a1
984    alsl.d           t5,     a1,     t5,     1
985    fst.s            f18,    t5,     0
986    fstx.s           f19,    t5,     a1
987
988    slli.w           t5,     a1,     2
989    alsl.d           t5,     a1,     t5,     1
990    sub.d            t5,     a0,     t5
991    fst.s            f7,     t5,     0
992    fstx.s           f8,     t5,     a1
993    alsl.d           t5,     a1,     t5,     1
994    fst.s            f9,     t5,     0
995    fstx.s           f10,    t5,     a1
996    alsl.d           t5,     a1,     t5,     1
997    fst.s            f12,    t5,     0
998    fstx.s           f13,    t5,     a1
999.endif
1000.END_FILTER_\DIR\()\TYPE\()_W16:
1001.ifc \DIR, v
1002    addi.d           sp,     sp,     96
1003.endif
1004.endm
1005
// PUSH_REG
// Opens a 64-byte stack frame and spills the low 64 bits of f24..f31 into it,
// one 8-byte slot per register (f24 at sp+0 up to f31 at sp+56).
// These are the fs0..fs7 registers of the LoongArch calling convention, which
// a callee has to preserve. POP_REG is the matching restore.
.macro PUSH_REG
    addi.d           sp,     sp,    -64     // frame of 8 slots * 8 bytes
    fst.d            f31,    sp,     56
    fst.d            f30,    sp,     48
    fst.d            f29,    sp,     40
    fst.d            f28,    sp,     32
    fst.d            f27,    sp,     24
    fst.d            f26,    sp,     16
    fst.d            f25,    sp,     8
    fst.d            f24,    sp,     0
.endm
// POP_REG
// Reloads the low 64 bits of f24..f31 from the 64-byte frame at sp (f24 from
// sp+0 up to f31 from sp+56) and then releases that frame.
// sp must hold the same value it had right after PUSH_REG.
.macro POP_REG
    fld.d            f31,    sp,     56
    fld.d            f30,    sp,     48
    fld.d            f29,    sp,     40
    fld.d            f28,    sp,     32
    fld.d            f27,    sp,     24
    fld.d            f26,    sp,     16
    fld.d            f25,    sp,     8
    fld.d            f24,    sp,     0
    addi.d           sp,     sp,     64     // drop the frame
.endm
1028
// LPF_FUNC DIR, TYPE
//
// Emits the function lpf_<DIR>_sb_<TYPE>_8bpc_lsx.
//   DIR  : h or v, selects how the pointers step between 4-pixel units
//   TYPE : y or uv, selects the available filter widths (y: 4/8/16, uv: 4/6)
//
// Register roles as used by this macro:
//   a0  pixel pointer; per unit it advances by 4 * a1 (h) or by 4 bytes (v)
//   a1  pixel stride
//   a2  pointer to the vmask words; words 0..2 are extracted
//   a3  pointer to the level bytes, 4 bytes per entry; per unit it advances
//       by 4 * a4 (h) or by 4 bytes (v)
//   a4  level stride, counted in 4-byte entries
//   a5  base of the lookup table: E is read at a5 + L, I at a5 + L + 64
//   t0  OR of the vmask words that matter for TYPE (units needing any filter)
//   t1  vmask word 1, t2 vmask word 2
//   t3  single-bit mask of the unit being processed, shifted left per unit
//   t4, t5, t6, t8  scratch
//
// For each bit set in t0 the level L is taken from l[0][0], falling back to
// the previous entry when that byte is zero. Units whose L is still zero are
// skipped. Otherwise H, E and I are broadcast into vr2, vr3, vr4 and the widest
// filter whose vmask bit is set gets expanded.
//
// NOTE(review): there is no explicit return instruction here; presumably
// endfunc (from loongson_asm.S) emits it - confirm.
.macro LPF_FUNC DIR, TYPE
function lpf_\DIR\()_sb_\TYPE\()_8bpc_lsx
    PUSH_REG
    vld              vr0,    a2,     0 //vmask, 16-byte load, words 0..2 used
    vpickve2gr.wu    t0,     vr0,    0
    vpickve2gr.wu    t1,     vr0,    1
    vpickve2gr.wu    t2,     vr0,    2
    li.w             t3,     1          //y
    or               t0,     t0,     t1
.ifc \TYPE, y
    or               t0,     t0,     t2 //vm
.endif
    // NOTE(review): t8 is recomputed at LOOP_NEXT before it is tested, so this
    // initial value looks unused unless a FILTER_* macro reads t8 - confirm.
    addi.w           t8,     t3,    -1
    andn             t8,     t0,     t8
    beqz             t0,     .\DIR\()\TYPE\()_END   //nothing to filter at all
.\DIR\()\TYPE\()_LOOP:
    and              t4,     t0,     t3 //vm & y
    beqz             t4,     .\DIR\()\TYPE\()_LOOP_NEXT
    vldrepl.b        vr1,    a3,     0 //l[0][0]
.ifc \DIR, h
    addi.d           t5,     a3,    -4  //previous entry: 4 bytes back
.else
    slli.d           t5,     a4,     2
    sub.d            t5,     a3,     t5 //previous entry: one level stride back
.endif
    vldrepl.b        vr2,    t5,     0 //first byte of the previous entry
    vseqi.b          vr3,    vr1,    0
    vbitsel.v        vr1,    vr1,    vr2,    vr3 //L = l[0][0] ? l[0][0] : previous
    vpickve2gr.b     t5,     vr1,    0
    beqz             t5,     .\DIR\()\TYPE\()_LOOP_NEXT //L == 0: leave unit untouched
    vsrai.b          vr2,    vr1,    4 //H
    add.d            t6,     a5,     t5
    vldrepl.b        vr3,    t6,     0 //E
    addi.d           t6,     t6,     64
    vldrepl.b        vr4,    t6,     0 //I
    // Pick the widest filter whose vmask bit is set for this unit.
.ifc \TYPE, y
    and              t5,     t2,     t3
    bnez             t5,     .FILTER_\DIR\()\TYPE\()_16
.endif
    and              t5,     t1,     t3
.ifc \TYPE, y
    bnez             t5,     .FILTER_\DIR\()\TYPE\()_8
.else
    bnez             t5,     .FILTER_\DIR\()\TYPE\()_6
.endif
    FILTER_W4 \DIR, \TYPE
    b                .\DIR\()\TYPE\()_LOOP_NEXT
.ifc \TYPE, uv
.FILTER_\DIR\()\TYPE\()_6:
    FILTER_W6 \DIR, \TYPE               //falls through to LOOP_NEXT
.endif
.ifc \TYPE, y
.FILTER_\DIR\()\TYPE\()_8:
    FILTER_W8 \DIR, \TYPE
    b                .\DIR\()\TYPE\()_LOOP_NEXT
.FILTER_\DIR\()\TYPE\()_16:
    FILTER_W16 \DIR, \TYPE              //falls through to LOOP_NEXT
.endif
.\DIR\()\TYPE\()_LOOP_NEXT:
    slli.w           t3,     t3,     1  //next unit bit
.ifc \DIR, h
    alsl.d           a0,     a1,     a0,    2  //a0 += 4 * a1
    // Full 64-bit scaling of the level stride, matching the slli.d used for
    // the same product in the v path above (a 32-bit shift would truncate it).
    alsl.d           a3,     a4,     a3,    2  //a3 += 4 * a4
.else
    addi.d           a0,     a0,     4
    addi.d           a3,     a3,     4
.endif
    addi.w           t8,     t3,    -1  //mask of the units already visited
    andn             t8,     t0,     t8 //vm bits still pending
    bnez             t8,     .\DIR\()\TYPE\()_LOOP
.\DIR\()\TYPE\()_END:
    POP_REG
endfunc
.endm
1104
// Instantiate the four loop-filter entry points, in this order:
//   lpf_h_sb_y_8bpc_lsx, lpf_v_sb_y_8bpc_lsx,
//   lpf_h_sb_uv_8bpc_lsx, lpf_v_sb_uv_8bpc_lsx
.irp plane, y, uv
LPF_FUNC h, \plane
LPF_FUNC v, \plane
.endr
1109