1/*
2 * Copyright © 2023, VideoLAN and dav1d authors
3 * Copyright © 2023, Loongson Technology Corporation Limited
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/loongarch/loongson_asm.S"
29
30#define REST_UNIT_STRIDE (400)
31
32.macro MADD_HU_BU in0, in1, out0, out1
33    vsllwil.hu.bu vr12,     \in0,     0
34    vexth.hu.bu   vr13,     \in0
35    vmadd.h       \out0,    vr12,     \in1
36    vmadd.h       \out1,    vr13,     \in1
37.endm
38
39const wiener_shuf
40.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
41endconst
42
43/*
44void wiener_filter_h_lsx(int32_t *hor_ptr,
45                         uint8_t *tmp_ptr,
46                         const int16_t filterh[8],
47                         const int w, const int h)
48*/
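/*
A rough scalar model of the per-pixel math implemented below (illustrative
only; the model name and loop bounds are assumptions, not taken from the C
sources). tmp_ptr is the edge-padded 8-bit source and hor_ptr the 32-bit
intermediate, both with a row stride of REST_UNIT_STRIDE.

static void wiener_filter_h_model(int32_t *hor_ptr, const uint8_t *tmp_ptr,
                                  const int16_t filterh[8],
                                  const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            // bias of 1 << (bitdepth + 6) plus a 128 * center-tap term
            int sum = (1 << 14) + 128 * tmp_ptr[x + 3];
            for (int k = 0; k < 7; k++)
                sum += filterh[k] * tmp_ptr[x + k];
            // round-shift by 3 and clamp to [0, (1 << 13) - 1]
            sum = (sum + 4) >> 3;
            hor_ptr[x] = sum < 0 ? 0 : sum > (1 << 13) - 1 ? (1 << 13) - 1 : sum;
        }
        tmp_ptr += REST_UNIT_STRIDE;
        hor_ptr += REST_UNIT_STRIDE;
    }
}
*/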
49function wiener_filter_h_8bpc_lsx
50    addi.d        sp,       sp,       -40
51    fst.d         f24,      sp,       0
52    fst.d         f25,      sp,       8
53    fst.d         f26,      sp,       16
54    fst.d         f27,      sp,       24
55    fst.d         f28,      sp,       32
56    li.w          t7,       1<<14          // 1 << (bitdepth + 6) rounding bias; the clip limit used below is half this
57
58    la.local      t1,       wiener_shuf
59    vld           vr4,      t1,       0
60    vld           vr14,     a2,       0    // filter[0][k]
61    vreplvei.h    vr21,     vr14,     0
62    vreplvei.h    vr22,     vr14,     1
63    vreplvei.h    vr23,     vr14,     2
64    vreplvei.h    vr24,     vr14,     3
65    vreplvei.h    vr25,     vr14,     4
66    vreplvei.h    vr26,     vr14,     5
67    vreplvei.h    vr27,     vr14,     6
68    vreplgr2vr.w  vr0,      t7
69
70.WIENER_FILTER_H_H:
71    addi.w        a4,       a4,       -1    // h
72    addi.w        t0,       a3,       0     // w
73    addi.d        t1,       a1,       0     // tmp_ptr
74    addi.d        t2,       a0,       0     // hor_ptr
75
76.WIENER_FILTER_H_W:
77    addi.w        t0,       t0,       -16
78    vld           vr5,      t1,       0
79    vld           vr13,     t1,       16
80
81    vsubi.bu      vr14,     vr4,      2
82    vsubi.bu      vr15,     vr4,      1
83    vshuf.b       vr6,      vr13,     vr5,     vr14  // 1 ... 8, 9 ... 16
84    vshuf.b       vr7,      vr13,     vr5,     vr15  // 2 ... 9, 10 ... 17
85    vshuf.b       vr8,      vr13,     vr5,     vr4   // 3 ... 10, 11 ... 18
86    vaddi.bu      vr14,     vr4,      1
87    vaddi.bu      vr15,     vr4,      2
88    vshuf.b       vr9,      vr13,     vr5,     vr14  // 4 ... 11, 12 ... 19
89    vshuf.b       vr10,     vr13,     vr5,     vr15  // 5 ... 12, 13 ... 20
90    vaddi.bu      vr14,     vr4,      3
91    vshuf.b       vr11,     vr13,     vr5,     vr14  // 6 ... 13, 14 ... 21
92
93    vsllwil.hu.bu vr15,     vr8,      0    //  3  4  5  6  7  8  9 10
94    vexth.hu.bu   vr16,     vr8            // 11 12 13 14 15 16 17 18
95    vsllwil.wu.hu vr17,     vr15,     0    //  3  4  5  6
96    vexth.wu.hu   vr18,     vr15           //  7  8  9 10
97    vsllwil.wu.hu vr19,     vr16,     0    // 11 12 13 14
98    vexth.wu.hu   vr20,     vr16           // 15 16 17 18
99    vslli.w       vr17,     vr17,     7
100    vslli.w       vr18,     vr18,     7
101    vslli.w       vr19,     vr19,     7
102    vslli.w       vr20,     vr20,     7
103    vxor.v        vr15,     vr15,     vr15
104    vxor.v        vr14,     vr14,     vr14
105
106    MADD_HU_BU    vr5,   vr21,  vr14,  vr15
107    MADD_HU_BU    vr6,   vr22,  vr14,  vr15
108    MADD_HU_BU    vr7,   vr23,  vr14,  vr15
109    MADD_HU_BU    vr8,   vr24,  vr14,  vr15
110    MADD_HU_BU    vr9,   vr25,  vr14,  vr15
111    MADD_HU_BU    vr10,  vr26,  vr14,  vr15
112    MADD_HU_BU    vr11,  vr27,  vr14,  vr15
113
114    vsllwil.w.h   vr5,      vr14,     0   //  0  1  2  3
115    vexth.w.h     vr6,      vr14          //  4  5  6  7
116    vsllwil.w.h   vr7,      vr15,     0   //  8  9 10 11
117    vexth.w.h     vr8,      vr15          // 12 13 14 15
118    vadd.w        vr17,     vr17,     vr5
119    vadd.w        vr18,     vr18,     vr6
120    vadd.w        vr19,     vr19,     vr7
121    vadd.w        vr20,     vr20,     vr8
122    vadd.w        vr17,     vr17,     vr0
123    vadd.w        vr18,     vr18,     vr0
124    vadd.w        vr19,     vr19,     vr0
125    vadd.w        vr20,     vr20,     vr0
126
127    vsrli.w       vr1,      vr0,      1
128    vsubi.wu      vr1,      vr1,      1
129    vxor.v        vr3,      vr3,      vr3
130    vsrari.w      vr17,     vr17,     3
131    vsrari.w      vr18,     vr18,     3
132    vsrari.w      vr19,     vr19,     3
133    vsrari.w      vr20,     vr20,     3
134    vclip.w       vr17,     vr17,     vr3,     vr1
135    vclip.w       vr18,     vr18,     vr3,     vr1
136    vclip.w       vr19,     vr19,     vr3,     vr1
137    vclip.w       vr20,     vr20,     vr3,     vr1
138
139    vst           vr17,     t2,       0
140    vst           vr18,     t2,       16
141    vst           vr19,     t2,       32
142    vst           vr20,     t2,       48
143    addi.d        t1,       t1,       16
144    addi.d        t2,       t2,       64
145    blt           zero,     t0,       .WIENER_FILTER_H_W
146
147    addi.d        a1,       a1,       REST_UNIT_STRIDE
148    addi.d        a0,       a0,       (REST_UNIT_STRIDE << 2)
149    bnez          a4,       .WIENER_FILTER_H_H
150
151    fld.d         f24,      sp,       0
152    fld.d         f25,      sp,       8
153    fld.d         f26,      sp,       16
154    fld.d         f27,      sp,       24
155    fld.d         f28,      sp,       32
156    addi.d        sp,       sp,       40
157endfunc
158
159.macro APPLY_FILTER in0, in1, in2
160    alsl.d         t7,      \in0,     \in1,    2
161    vld            vr10,    t7,       0
162    vld            vr11,    t7,       16
163    vld            vr12,    t7,       32
164    vld            vr13,    t7,       48
165    vmadd.w        vr14,    vr10,     \in2
166    vmadd.w        vr15,    vr11,     \in2
167    vmadd.w        vr16,    vr12,     \in2
168    vmadd.w        vr17,    vr13,     \in2
169.endm
170
171.macro wiener_filter_v_8bpc_core_lsx
172    vreplgr2vr.w  vr14,     t6
173    vreplgr2vr.w  vr15,     t6
174    vreplgr2vr.w  vr16,     t6
175    vreplgr2vr.w  vr17,     t6
176
177    addi.w        t7,       t2,       0      // j + index k
178    mul.w         t7,       t7,       t8     // (j + index) * REST_UNIT_STRIDE
179    add.w         t7,       t7,       t4     // (j + index) * REST_UNIT_STRIDE + i
180
181    APPLY_FILTER  t7, a2, vr2
182    APPLY_FILTER  t8, t7, vr3
183    APPLY_FILTER  t8, t7, vr4
184    APPLY_FILTER  t8, t7, vr5
185    APPLY_FILTER  t8, t7, vr6
186    APPLY_FILTER  t8, t7, vr7
187    APPLY_FILTER  t8, t7, vr8
188    vssrarni.hu.w vr15,     vr14,     11
189    vssrarni.hu.w vr17,     vr16,     11
190    vssrlni.bu.h  vr17,     vr15,     0
191.endm
192
193/*
194void wiener_filter_v_lsx(uint8_t *p,
195                         const ptrdiff_t p_stride,
196                         const int32_t *hor,
197                         const int16_t filterv[8],
198                         const int w, const int h)
199*/
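/*
Rough scalar model (illustrative; names are assumptions): each output pixel
is a 7-tap vertical filter over the 32-bit intermediate, biased by -(1 << 18),
round-shifted by 11 and saturated to 8 bits.

static void wiener_filter_v_model(uint8_t *p, const ptrdiff_t stride,
                                  const int32_t *hor, const int16_t filterv[8],
                                  const int w, const int h)
{
    for (int j = 0; j < h; j++)
        for (int i = 0; i < w; i++) {
            int sum = -(1 << 18);
            for (int k = 0; k < 7; k++)
                sum += filterv[k] * hor[(j + k) * REST_UNIT_STRIDE + i];
            sum = (sum + (1 << 10)) >> 11;
            p[j * stride + i] = sum < 0 ? 0 : sum > 255 ? 255 : sum;
        }
}
*/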
200function wiener_filter_v_8bpc_lsx
201    li.w          t6,       -(1 << 18)
202
203    li.w          t8,       REST_UNIT_STRIDE
204    ld.h          t0,       a3,       0
205    ld.h          t1,       a3,       2
206    vreplgr2vr.w  vr2,      t0
207    vreplgr2vr.w  vr3,      t1
208    ld.h          t0,       a3,       4
209    ld.h          t1,       a3,       6
210    vreplgr2vr.w  vr4,      t0
211    vreplgr2vr.w  vr5,      t1
212    ld.h          t0,       a3,       8
213    ld.h          t1,       a3,       10
214    vreplgr2vr.w  vr6,      t0
215    vreplgr2vr.w  vr7,      t1
216    ld.h          t0,       a3,       12
217    vreplgr2vr.w  vr8,      t0
218
219    andi          t1,       a4,       0xf
220    sub.w         t0,       a4,       t1    // w-w%16
221    or            t2,       zero,     zero  // j
222    or            t4,       zero,     zero
223    beqz          t0,       .WIENER_FILTER_V_W_LT16
224
225.WIENER_FILTER_V_H:
226    andi          t1,       a4,       0xf
227    add.d         t3,       zero,     a0     // p
228    or            t4,       zero,     zero   // i
229
230.WIENER_FILTER_V_W:
231
232    wiener_filter_v_8bpc_core_lsx
233
234    mul.w         t5,       t2,       a1   // j * stride
235    add.w         t5,       t5,       t4   // j * stride + i
236    add.d         t3,       a0,       t5
237    addi.w        t4,       t4,       16
238    vst           vr17,     t3,       0
239    bne           t0,       t4,       .WIENER_FILTER_V_W
240
241    beqz          t1,       .WIENER_FILTER_V_W_EQ16
242
243    wiener_filter_v_8bpc_core_lsx
244
245    addi.d        t3,       t3,       16
246    andi          t1,       a4,       0xf
247
248.WIENER_FILTER_V_ST_REM:
249    vstelm.b      vr17,     t3,       0,    0
250    vbsrl.v       vr17,     vr17,     1
251    addi.d        t3,       t3,       1
252    addi.w        t1,       t1,       -1
253    bnez          t1,       .WIENER_FILTER_V_ST_REM
254.WIENER_FILTER_V_W_EQ16:
255    addi.w        t2,       t2,       1
256    blt           t2,       a5,       .WIENER_FILTER_V_H
257    b              .WIENER_FILTER_V_END
258
259.WIENER_FILTER_V_W_LT16:
260    andi          t1,       a4,       0xf
261    add.d         t3,       zero,     a0
262
263    wiener_filter_v_8bpc_core_lsx
264
265    mul.w         t5,       t2,       a1   // j * stride
266    add.d         t3,       a0,       t5
267
268.WIENER_FILTER_V_ST_REM_1:
269    vstelm.b      vr17,     t3,       0,    0
270    vbsrl.v       vr17,     vr17,     1
271    addi.d        t3,       t3,       1
272    addi.w        t1,       t1,       -1
273    bnez          t1,       .WIENER_FILTER_V_ST_REM_1
274
275    addi.w        t2,       t2,       1
276    blt           t2,       a5,       .WIENER_FILTER_V_W_LT16
277
278.WIENER_FILTER_V_END:
279endfunc
280
281/*
282void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src,
283               const int w, const int h)
284*/
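/*
Rough scalar model (illustrative; names and exact edge handling are
assumptions): this pass adds three vertically adjacent source rows into
16-bit sums and 32-bit sums of squares with the same REST_UNIT_STRIDE layout.

static void boxsum3_h_model(int32_t *sumsq, int16_t *sum,
                            const uint8_t *src, const int w, const int h)
{
    src += REST_UNIT_STRIDE;            // the first source row is skipped
    for (int y = 0; y < h - 4; y++) {
        for (int x = 1; x < w - 1; x++) {
            const int a = src[x],
                      b = src[x + REST_UNIT_STRIDE],
                      c = src[x + 2 * REST_UNIT_STRIDE];
            sum  [x + REST_UNIT_STRIDE] = a + b + c;
            sumsq[x + REST_UNIT_STRIDE] = a * a + b * b + c * c;
        }
        src += REST_UNIT_STRIDE; sum += REST_UNIT_STRIDE; sumsq += REST_UNIT_STRIDE;
    }
}
*/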
285function boxsum3_h_8bpc_lsx
286    addi.d         a2,      a2,      REST_UNIT_STRIDE
287    li.w           t0,      1
288    addi.w         a3,      a3,      -2
289    addi.w         a4,      a4,      -4
290
291.LBS3_H_H:
292    alsl.d         t1,      t0,      a1,    1     // sum_v    *sum_v = sum + x
293    alsl.d         t2,      t0,      a0,    2     // sumsq_v  *sumsq_v = sumsq + x
294    add.d          t3,      t0,      a2           // s
295    addi.w         t5,      a3,      0
296.LBS3_H_W:
297    vld            vr0,     t3,      0
298    vld            vr1,     t3,      REST_UNIT_STRIDE
299    vld            vr2,     t3,      (REST_UNIT_STRIDE<<1)
300
301    vilvl.b        vr3,     vr1,     vr0
302    vhaddw.hu.bu   vr4,     vr3,     vr3
303    vilvh.b        vr5,     vr1,     vr0
304    vhaddw.hu.bu   vr6,     vr5,     vr5
305    vsllwil.hu.bu  vr7,     vr2,     0
306    vexth.hu.bu    vr8,     vr2
307    // sum_v
308    vadd.h         vr4,     vr4,     vr7
309    vadd.h         vr6,     vr6,     vr8
310    vst            vr4,     t1,      REST_UNIT_STRIDE<<1
311    vst            vr6,     t1,      (REST_UNIT_STRIDE<<1)+16
312    addi.d         t1,      t1,      32
313    // sumsq
314    vmulwev.h.bu   vr9,     vr3,     vr3
315    vmulwod.h.bu   vr10,    vr3,     vr3
316    vmulwev.h.bu   vr11,    vr5,     vr5
317    vmulwod.h.bu   vr12,    vr5,     vr5
318    vmul.h         vr7,     vr7,     vr7
319    vmul.h         vr8,     vr8,     vr8
320    vaddwev.w.hu   vr13,    vr10,    vr9
321    vaddwod.w.hu   vr14,    vr10,    vr9
322    vilvl.w        vr3,     vr14,    vr13
323    vilvh.w        vr4,     vr14,    vr13
324    vaddwev.w.hu   vr13,    vr12,    vr11
325    vaddwod.w.hu   vr14,    vr12,    vr11
326    vilvl.w        vr15,    vr14,    vr13
327    vilvh.w        vr16,    vr14,    vr13
328    vsllwil.wu.hu  vr9,     vr7,     0
329    vexth.wu.hu    vr10,    vr7
330    vsllwil.wu.hu  vr11,    vr8,     0
331    vexth.wu.hu    vr12,    vr8
332    vadd.w         vr9,     vr9,     vr3
333    vadd.w         vr10,    vr10,    vr4
334    vadd.w         vr11,    vr11,    vr15
335    vadd.w         vr12,    vr12,    vr16
336    vst            vr9,     t2,      REST_UNIT_STRIDE<<2
337    vst            vr10,    t2,      (REST_UNIT_STRIDE<<2)+16
338    vst            vr11,    t2,      (REST_UNIT_STRIDE<<2)+32
339    vst            vr12,    t2,      (REST_UNIT_STRIDE<<2)+48
340    addi.d         t2,      t2,      64
341
342    addi.w         t5,      t5,      -16
343    addi.d         t3,      t3,      16
344    blt            zero,    t5,      .LBS3_H_W
345
346    addi.d         a0,      a0,      REST_UNIT_STRIDE<<2
347    addi.d         a1,      a1,      REST_UNIT_STRIDE<<1
348    addi.d         a2,      a2,      REST_UNIT_STRIDE
349    addi.d         a4,      a4,      -1
350    blt            zero,    a4,      .LBS3_H_H
351
352.LBS3_H_END:
353endfunc
354
355/*
356void boxsum3_v(int32_t *sumsq, coef *sum,
357               const int w, const int h)
358*/
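/*
Rough scalar model (illustrative; names are assumptions): the second boxsum3
pass replaces each element of sum/sumsq with a centred 3-tap horizontal sum
of the original values, row by row.

static void boxsum3_v_model(int32_t *sumsq, int16_t *sum,
                            const int w, const int h)
{
    sumsq += REST_UNIT_STRIDE; sum += REST_UNIT_STRIDE;
    for (int y = 0; y < h - 4; y++) {
        int a = sum[1], a2 = sumsq[1];
        int b = sum[2], b2 = sumsq[2];
        for (int x = 2; x < w - 2; x++) {
            const int c = sum[x + 1], c2 = sumsq[x + 1];
            sum  [x] = a + b + c;       // uses pre-pass values, as the asm
            sumsq[x] = a2 + b2 + c2;    // keeps overwritten neighbours in regs
            a = b; a2 = b2; b = c; b2 = c2;
        }
        sumsq += REST_UNIT_STRIDE; sum += REST_UNIT_STRIDE;
    }
}
*/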
359function boxsum3_v_8bpc_lsx
360    addi.d         a0,      a0,      (REST_UNIT_STRIDE<<2)
361    addi.d         a1,      a1,      (REST_UNIT_STRIDE<<1)
362    addi.w         a3,      a3,      -4
363    addi.w         a2,      a2,      -4
364
365.LBS3_V_H:
366    sub.w          t3,      a2,      zero
367    addi.d         t0,      a0,      4
368    addi.d         t1,      a1,      2
369    addi.d         t5,      a0,      8
370    addi.d         t6,      a1,      4
371
372    vld            vr0,      t1,      0   // a 0 1 2 3 4 5 6 7
373    vld            vr1,      t1,      2   // b 1 2 3 4 5 6 7 8
374    vld            vr2,      t1,      4   // c 2 3 4 5 6 7 8 9
375    vld            vr3,      t0,      0   // a2 0 1 2 3
376    vld            vr4,      t0,      4   // b2 1 2 3 4
377    vld            vr5,      t0,      8   // c2 2 3 4 5
378    vld            vr6,      t0,      16  //    3 4 5 6
379    vld            vr7,      t0,      20  //    4 5 6 7
380    vld            vr8,      t0,      24  //    5 6 7 8
381    vadd.h         vr9,      vr0,     vr1
382    vadd.h         vr9,      vr9,     vr2
383    vadd.w         vr10,     vr3,     vr4
384    vadd.w         vr10,     vr10,    vr5
385    vadd.w         vr11,     vr6,     vr7
386    vadd.w         vr11,     vr11,    vr8
387    vpickve2gr.h   t7,       vr2,     6
388    vpickve2gr.w   t8,       vr8,     2
389    vst            vr9,      t6,      0
390    vst            vr10,     t5,      0
391    vst            vr11,     t5,      16
392
393    addi.d         t1,       t1,      16
394    addi.d         t0,       t0,      32
395    addi.d         t5,       t5,      32
396    addi.d         t6,       t6,      16
397    addi.d         t3,       t3,      -8
398    ble            t3,       zero,    .LBS3_V_H0
399
400.LBS3_V_W8:
401    vld            vr0,      t1,      0   // a 0 1 2 3 4 5 6 7
402    vld            vr1,      t1,      2   // b 1 2 3 4 5 6 7 8
403    vld            vr2,      t1,      4   // c 2 3 4 5 6 7 8 9
404    vld            vr3,      t0,      0   // a2 0 1 2 3
405    vld            vr4,      t0,      4   // b2 1 2 3 4
406    vld            vr5,      t0,      8   // c2 2 3 4 5
407    vld            vr6,      t0,      16  //    3 4 5 6
408    vld            vr7,      t0,      20  //    4 5 6 7
409    vld            vr8,      t0,      24  //    5 6 7 8
410    vinsgr2vr.h    vr0,      t7,      0
411    vinsgr2vr.w    vr3,      t8,      0
412    vpickve2gr.h   t7,       vr2,     6
413    vpickve2gr.w   t8,       vr8,     2
414    vadd.h         vr9,      vr0,     vr1
415    vadd.w         vr10,     vr3,     vr4
416    vadd.w         vr11,     vr6,     vr7
417    vadd.h         vr9,      vr9,     vr2
418    vadd.w         vr10,     vr10,    vr5
419    vadd.w         vr11,     vr11,    vr8
420    vst            vr9,      t6,      0
421    vst            vr10,     t5,      0
422    vst            vr11,     t5,      16
423    addi.d         t3,       t3,      -8
424    addi.d         t1,       t1,      16
425    addi.d         t0,       t0,      32
426    addi.d         t5,       t5,      32
427    addi.d         t6,       t6,      16
428    blt            zero,     t3,       .LBS3_V_W8
429
430.LBS3_V_H0:
431    addi.d         a1,       a1,      REST_UNIT_STRIDE<<1
432    addi.d         a0,       a0,      REST_UNIT_STRIDE<<2
433    addi.w         a3,       a3,      -1
434    bnez           a3,       .LBS3_V_H
435
436.LBS3_V_END:
437endfunc
438
439/*
440boxsum3_selfguided_filter(int32_t *sumsq, coef *sum,
441                          const int w, const int h,
442                          const unsigned s)
443*/
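/*
Rough scalar model of the transform below (illustrative; the model name and
loop bounds are assumptions, while dav1d_sgr_x_by_x and the strength s come
from the caller). For the 3x3 box n = 9 and 455 ~= (1 << 12) / 9. AA holds
box sums of squares and BB box sums; both are rewritten in place, AA[i]
becoming the per-pixel "b" term and BB[i] the weight 256 - x.

static void boxsum3_sgf_h_model(int32_t *AA, int16_t *BB,
                                const int w, const int h, const unsigned s)
{
    for (int j = 0; j < h + 2; j++) {
        for (int i = -1; i < w + 1; i++) {
            const int a = AA[i], b = BB[i];
            const int d = a * 9 - b * b;
            const int p = d > 0 ? d : 0;
            const unsigned z = (p * s + (1 << 19)) >> 20;
            const int x = dav1d_sgr_x_by_x[z > 255 ? 255 : z];
            AA[i] = (x * b * 455 + (1 << 11)) >> 12;
            BB[i] = 256 - x;
        }
        AA += REST_UNIT_STRIDE; BB += REST_UNIT_STRIDE;
    }
}
*/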
444function boxsum3_sgf_h_8bpc_lsx
445    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
446    addi.d        a0,       a0,        12   // AA
447    addi.d        a1,       a1,        REST_UNIT_STRIDE<<1
448    addi.d        a1,       a1,        6    // BB
449    la.local      t8,       dav1d_sgr_x_by_x
450    li.w          t6,       455
451    vreplgr2vr.w  vr20,     t6
452    li.w          t6,       255
453    vreplgr2vr.w  vr22,     t6
454    vaddi.wu      vr21,     vr22,      1  // 256
455    vreplgr2vr.w  vr6,      a4
456    vldi          vr19,     0x809
457    addi.w        a2,       a2,        2  // w + 2
458    addi.w        a3,       a3,        2  // h + 2
459
460.LBS3SGF_H_H:
461    addi.w        t2,       a2,        0
462    addi.d        t0,       a0,        -4
463    addi.d        t1,       a1,        -2
464
465.LBS3SGF_H_W:
466    addi.w        t2,       t2,        -8
467    vld           vr0,      t0,        0   // AA[i]
468    vld           vr1,      t0,        16
469    vld           vr2,      t1,        0   // BB[i]
470
471    vmul.w        vr4,      vr0,       vr19 // a * n
472    vmul.w        vr5,      vr1,       vr19 // a * n
473    vsllwil.w.h   vr9,      vr2,       0
474    vexth.w.h     vr10,     vr2
475    vmsub.w       vr4,      vr9,       vr9   // p
476    vmsub.w       vr5,      vr10,      vr10   // p
477    vmaxi.w       vr4,      vr4,       0
478    vmaxi.w       vr5,      vr5,       0    // p
479    vmul.w        vr4,      vr4,       vr6  // p * s
480    vmul.w        vr5,      vr5,       vr6  // p * s
481    vsrlri.w      vr4,      vr4,       20
482    vsrlri.w      vr5,      vr5,       20   // z
483    vmin.w        vr4,      vr4,       vr22
484    vmin.w        vr5,      vr5,       vr22
485
486    vpickve2gr.w  t6,       vr4,       0
487    ldx.bu        t7,       t8,        t6
488    vinsgr2vr.w   vr7,      t7,        0
489    vpickve2gr.w  t6,       vr4,       1
490    ldx.bu        t7,       t8,        t6
491    vinsgr2vr.w   vr7,      t7,        1
492    vpickve2gr.w  t6,       vr4,       2
493    ldx.bu        t7,       t8,        t6
494    vinsgr2vr.w   vr7,      t7,        2
495    vpickve2gr.w  t6,       vr4,       3
496    ldx.bu        t7,       t8,        t6
497    vinsgr2vr.w   vr7,      t7,        3
498
499    vpickve2gr.w  t6,       vr5,       0
500    ldx.bu        t7,       t8,        t6
501    vinsgr2vr.w   vr8,      t7,        0
502    vpickve2gr.w  t6,       vr5,       1
503    ldx.bu        t7,       t8,        t6
504    vinsgr2vr.w   vr8,      t7,        1
505    vpickve2gr.w  t6,       vr5,       2
506    ldx.bu        t7,       t8,        t6
507    vinsgr2vr.w   vr8,      t7,        2
508    vpickve2gr.w  t6,       vr5,       3
509    ldx.bu        t7,       t8,        t6
510    vinsgr2vr.w   vr8,      t7,        3     // x
511
512    vmul.w        vr9,      vr7,       vr9   // x * BB[i]
513    vmul.w        vr10,     vr8,       vr10
514    vmul.w        vr9,      vr9,       vr20  // x * BB[i] * sgr_one_by_x
515    vmul.w        vr10,     vr10,      vr20
516    vsrlri.w      vr9,      vr9,       12
517    vsrlri.w      vr10,     vr10,      12
518    vsub.w        vr7,      vr21,      vr7
519    vsub.w        vr8,      vr21,      vr8
520    vpickev.h     vr8,      vr8,       vr7
521
522    vst           vr9,      t0,        0
523    vst           vr10,     t0,        16
524    vst           vr8,      t1,        0
525    addi.d        t0,       t0,        32
526    addi.d        t1,       t1,        16
527    blt           zero,     t2,        .LBS3SGF_H_W
528
529    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
530    addi.d        a1,       a1,        REST_UNIT_STRIDE<<1
531    addi.w        a3,       a3,        -1
532    bnez          a3,       .LBS3SGF_H_H
533endfunc
534
535/*
536boxsum3_selfguided_filter(coef *dst, pixel *src,
537                  int32_t *sumsq, coef *sum,
538                  const int w, const int h)
539*/
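/*
Rough scalar model (illustrative; names are assumptions) of the weighting
below. A is the sumsq plane as rewritten by boxsum3_sgf_h (per-pixel "b"
values), B the sum plane (holding 256 - x), src the padded 8-bit source and
dst an int16 buffer with a 384-element row stride.

static void boxsum3_sgf_v_model(int16_t *dst, const uint8_t *src,
                                const int32_t *A, const int16_t *B,
                                const int w, const int h)
{
    const int S = REST_UNIT_STRIDE;
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int a =
                (B[i - S] + B[i - 1] + B[i] + B[i + 1] + B[i + S]) * 4 +
                (B[i - 1 - S] + B[i + 1 - S] + B[i - 1 + S] + B[i + 1 + S]) * 3;
            const int b =
                (A[i - S] + A[i - 1] + A[i] + A[i + 1] + A[i + S]) * 4 +
                (A[i - 1 - S] + A[i + 1 - S] + A[i - 1 + S] + A[i + 1 + S]) * 3;
            dst[i] = (b + a * src[i] + (1 << 8)) >> 9;
        }
        dst += 384; src += S; A += S; B += S;
    }
}
*/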
540function boxsum3_sgf_v_8bpc_lsx
541    addi.d        a1,        a1,      (3*REST_UNIT_STRIDE+3)   // src
542    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
543    addi.d        a2,        a2,      (REST_UNIT_STRIDE<<2)+12
544    addi.d        a3,        a3,      REST_UNIT_STRIDE<<2
545    addi.d        a3,        a3,      6
546.LBS3SGF_V_H:
547    // A int32_t *sumsq
548    addi.d        t0,        a2,      -(REST_UNIT_STRIDE<<2)   // -stride
549    addi.d        t1,        a2,      0    // sumsq
550    addi.d        t2,        a2,      REST_UNIT_STRIDE<<2      // +stride
551    addi.d        t6,        a1,      0
552    addi.w        t7,        a4,      0
553    addi.d        t8,        a0,      0
554    // B coef *sum
555    addi.d        t3,        a3,      -(REST_UNIT_STRIDE<<1)   // -stride
556    addi.d        t4,        a3,      0
557    addi.d        t5,        a3,      REST_UNIT_STRIDE<<1
558
559.LBS3SGF_V_W:
560    vld           vr0,       t0,      0   // P[i - REST_UNIT_STRIDE]
561    vld           vr1,       t0,      16
562    vld           vr2,       t1,      -4  // P[i-1]
563    vld           vr3,       t1,      12
564    vld           vr4,       t2,      0   // P[i + REST_UNIT_STRIDE]
565    vld           vr5,       t2,      16
566    vld           vr6,       t1,      0   // p[i]
567    vld           vr7,       t1,      16
568    vld           vr8,       t1,      4   // p[i+1]
569    vld           vr9,       t1,      20
570
571    vld           vr10,      t0,      -4  // P[i - 1 - REST_UNIT_STRIDE]
572    vld           vr11,      t0,      12
573    vld           vr12,      t2,      -4  // P[i - 1 + REST_UNIT_STRIDE]
574    vld           vr13,      t2,      12
575    vld           vr14,      t0,      4   // P[i + 1 - REST_UNIT_STRIDE]
576    vld           vr15,      t0,      20
577    vld           vr16,      t2,      4   // P[i + 1 + REST_UNIT_STRIDE]
578    vld           vr17,      t2,      20
579
580    vadd.w        vr0,       vr2,     vr0
581    vadd.w        vr4,       vr6,     vr4
582    vadd.w        vr0,       vr0,     vr8
583    vadd.w        vr20,      vr0,     vr4
584    vslli.w       vr20,      vr20,    2      // 0 1 2 3
585    vadd.w        vr0,       vr1,     vr3
586    vadd.w        vr4,       vr5,     vr7
587    vadd.w        vr0,       vr0,     vr9
588    vadd.w        vr21,      vr0,     vr4
589    vslli.w       vr21,      vr21,    2      // 4 5 6 7
590    vadd.w        vr12,      vr10,    vr12
591    vadd.w        vr16,      vr14,    vr16
592    vadd.w        vr22,      vr12,    vr16
593    vslli.w       vr23,      vr22,    1
594    vadd.w        vr22,      vr23,    vr22
595    vadd.w        vr11,      vr11,    vr13
596    vadd.w        vr15,      vr15,    vr17
597    vadd.w        vr0,       vr11,    vr15
598    vslli.w       vr23,      vr0,     1
599    vadd.w        vr23,      vr23,    vr0
600    vadd.w        vr20,      vr20,    vr22   // b
601    vadd.w        vr21,      vr21,    vr23
602
603    // B coef *sum
604    vld           vr0,       t3,      0   // P[i - REST_UNIT_STRIDE]
605    vld           vr1,       t4,      -2  // p[i - 1]
606    vld           vr2,       t4,      0   // p[i]
607    vld           vr3,       t4,      2   // p[i + 1]
608    vld           vr4,       t5,      0   // P[i + REST_UNIT_STRIDE]
609    vld           vr5,       t3,      -2  // P[i - 1 - REST_UNIT_STRIDE]
610    vld           vr6,       t5,      -2  // P[i - 1 + REST_UNIT_STRIDE]
611    vld           vr7,       t3,      2   // P[i + 1 - REST_UNIT_STRIDE]
612    vld           vr8,       t5,      2   // P[i + 1 + REST_UNIT_STRIDE]
613    vaddwev.w.h   vr9,       vr0,     vr1
614    vaddwod.w.h   vr10,      vr0,     vr1
615    vaddwev.w.h   vr11,      vr2,     vr3
616    vaddwod.w.h   vr12,      vr2,     vr3
617    vadd.w        vr9,       vr11,    vr9
618    vadd.w        vr10,      vr12,    vr10
619    vilvl.w       vr11,      vr10,    vr9    // 0 1 2 3
620    vilvh.w       vr12,      vr10,    vr9    // 4 5 6 7
621    vsllwil.w.h   vr0,       vr4,     0
622    vexth.w.h     vr1,       vr4
623    vadd.w        vr0,       vr11,    vr0
624    vadd.w        vr1,       vr12,    vr1
625    vslli.w       vr0,       vr0,     2
626    vslli.w       vr1,       vr1,     2
627    vaddwev.w.h   vr9,       vr5,     vr6
628    vaddwod.w.h   vr10,      vr5,     vr6
629    vaddwev.w.h   vr11,      vr7,     vr8
630    vaddwod.w.h   vr12,      vr7,     vr8
631    vadd.w        vr9,       vr11,    vr9
632    vadd.w        vr10,      vr12,    vr10
633    vilvl.w       vr13,      vr10,    vr9
634    vilvh.w       vr14,      vr10,    vr9
635    vslli.w       vr15,      vr13,    1
636    vslli.w       vr16,      vr14,    1
637    vadd.w        vr15,      vr13,    vr15   // a
638    vadd.w        vr16,      vr14,    vr16
639    vadd.w        vr22,      vr0,     vr15
640    vadd.w        vr23,      vr1,     vr16
641    vld           vr0,       t6,      0      // src
642    vsllwil.hu.bu vr0,       vr0,     0
643    vsllwil.wu.hu vr1,       vr0,     0
644    vexth.wu.hu   vr2,       vr0
645    vmadd.w       vr20,      vr22,    vr1
646    vmadd.w       vr21,      vr23,    vr2
647    vssrlrni.h.w  vr21,      vr20,    9
648    vst           vr21,      t8,      0
649    addi.d        t8,        t8,      16
650
651    addi.d        t0,        t0,      32
652    addi.d        t1,        t1,      32
653    addi.d        t2,        t2,      32
654    addi.d        t3,        t3,      16
655    addi.d        t4,        t4,      16
656    addi.d        t5,        t5,      16
657    addi.d        t6,        t6,      8
658    addi.w        t7,        t7,      -8
659    blt           zero,      t7,      .LBS3SGF_V_W
660
661    addi.w        a5,        a5,      -1
662    addi.d        a0,        a0,      384*2
663    addi.d        a1,        a1,      REST_UNIT_STRIDE
664    addi.d        a3,        a3,      REST_UNIT_STRIDE<<1
665    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
666    bnez          a5,        .LBS3SGF_V_H
667endfunc
668
669#define FILTER_OUT_STRIDE (384)
670
671/*
672sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride,
673                   const int16_t *dst, const int w1,
674                   const int w, const int h);
675*/
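/*
Rough scalar model (illustrative; the model name is an assumption): blend the
self-guided result dst back into the picture with weight w1 (pixels promoted
to Q4, output rounded from Q11).

static void sgr_3x3_finish_model(uint8_t *p, const ptrdiff_t stride,
                                 const int16_t *dst, const int w1,
                                 const int w, const int h)
{
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int u = p[i] << 4;                    // pixel in Q4
            const int v = (u << 7) + w1 * (dst[i] - u); // weighted correction
            const int px = (v + (1 << 10)) >> 11;
            p[i] = px < 0 ? 0 : px > 255 ? 255 : px;
        }
        p += stride;
        dst += FILTER_OUT_STRIDE;
    }
}
*/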
676function sgr_3x3_finish_8bpc_lsx
677    vreplgr2vr.w  vr3,     a3            // w1
678    andi          t4,      a4,       0x7
679    sub.w         t5,      a4,       t4
680
681    beq           zero,    t5,       .LSGR3X3_REM
682
683.LSGR3X3_H:
684    addi.d        t0,      a0,       0
685    addi.d        t1,      a2,       0
686    addi.w        t2,      t5,       0
687    andi          t4,      a4,       0x7
688.LSGR3X3_W:
689    vld           vr0,     t0,       0
690    vld           vr1,     t1,       0
691    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
692    vsllwil.wu.hu vr4,     vr2,      0   // p
693    vexth.wu.hu   vr5,     vr2           // p
694    vslli.w       vr6,     vr4,      7
695    vslli.w       vr7,     vr5,      7
696    vsllwil.w.h   vr8,     vr1,      0   // dst
697    vexth.w.h     vr9,     vr1           // dst
698    vsub.w        vr8,     vr8,      vr4
699    vsub.w        vr9,     vr9,      vr5
700    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
701    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
702    vssrarni.hu.w vr7,     vr6,      11
703    vssrlni.bu.h  vr7,     vr7,      0
704    vstelm.d      vr7,     t0,       0,    0
705    addi.d        t0,      t0,       8
706    addi.d        t1,      t1,       16
707    addi.d        t2,      t2,       -8
708    bne           zero,    t2,       .LSGR3X3_W
709
710    beq           t4,      zero,     .LSGR3X3_NOREM
711
712    vld           vr0,     t0,       0
713    vld           vr1,     t1,       0
714    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
715    vsllwil.wu.hu vr4,     vr2,      0   // p
716    vexth.wu.hu   vr5,     vr2           // p
717    vslli.w       vr6,     vr4,      7
718    vslli.w       vr7,     vr5,      7
719    vsllwil.w.h   vr8,     vr1,      0   // dst
720    vexth.w.h     vr9,     vr1           // dst
721    vsub.w        vr8,     vr8,      vr4
722    vsub.w        vr9,     vr9,      vr5
723    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
724    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
725    vssrarni.hu.w vr7,     vr6,      11
726    vssrlni.bu.h  vr7,     vr7,      0
727
728.LSGR3X3_ST:
729    vstelm.b      vr7,     t0,       0,    0
730    addi.d        t0,      t0,       1
731    vbsrl.v       vr7,     vr7,      1
732    addi.w        t4,      t4,       -1
733    bnez          t4,      .LSGR3X3_ST
734
735.LSGR3X3_NOREM:
736    addi.w        a5,      a5,       -1
737    add.d         a0,      a0,       a1
738    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
739    bnez          a5,      .LSGR3X3_H
740    b             .LSGR3X3_END
741
742.LSGR3X3_REM:
743    andi          t4,      a4,       0x7
744    addi.d        t0,      a0,       0
745    vld           vr0,     t0,       0
746    vld           vr1,     a2,       0
747    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
748    vsllwil.wu.hu vr4,     vr2,      0   // p
749    vexth.wu.hu   vr5,     vr2           // p
750    vslli.w       vr6,     vr4,      7
751    vslli.w       vr7,     vr5,      7
752    vsllwil.w.h   vr8,     vr1,      0   // dst
753    vexth.w.h     vr9,     vr1           // dst
754    vsub.w        vr8,     vr8,      vr4
755    vsub.w        vr9,     vr9,      vr5
756    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
757    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
758    vssrarni.hu.w vr7,     vr6,      11
759    vssrlni.bu.h  vr7,     vr7,      0
760
761.LSGR3X3_REM_ST:
762    vstelm.b      vr7,     t0,       0,    0
763    addi.d        t0,      t0,       1
764    vbsrl.v       vr7,     vr7,      1
765    addi.w        t4,      t4,       -1
766    bnez          t4,      .LSGR3X3_REM_ST
767    addi.w        a5,      a5,       -1
768    add.d         a0,      a0,       a1
769    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
770    bnez          a5,      .LSGR3X3_REM
771
772.LSGR3X3_END:
773endfunc
774
775/*
776void boxsum5_h(int32_t *sumsq, coef *sum,
777             const pixel *const src,
778             const int w, const int h)
779*/
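/*
Rough scalar model (illustrative; names and exact edge handling are
assumptions): the first boxsum5 pass adds five vertically adjacent source
rows into 16-bit sums and 32-bit sums of squares, one output row below the
first input row.

static void boxsum5_h_model(int32_t *sumsq, int16_t *sum,
                            const uint8_t *src, const int w, const int h)
{
    for (int y = 0; y < h - 4; y++) {
        for (int x = 0; x < w; x++) {
            const int a = src[x],
                      b = src[x + 1 * REST_UNIT_STRIDE],
                      c = src[x + 2 * REST_UNIT_STRIDE],
                      d = src[x + 3 * REST_UNIT_STRIDE],
                      e = src[x + 4 * REST_UNIT_STRIDE];
            sum  [x + REST_UNIT_STRIDE] = a + b + c + d + e;
            sumsq[x + REST_UNIT_STRIDE] = a*a + b*b + c*c + d*d + e*e;
        }
        src += REST_UNIT_STRIDE; sum += REST_UNIT_STRIDE; sumsq += REST_UNIT_STRIDE;
    }
}
*/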
780function boxsum5_h_8bpc_lsx
781    addi.w        a4,      a4,        -4
782    addi.d        a0,      a0,        REST_UNIT_STRIDE<<2
783    addi.d        a1,      a1,        REST_UNIT_STRIDE<<1
784    li.w          t6,      1
785.LBOXSUM5_H_H:
786    addi.w        t3,      a3,        0
787    addi.d        t2,      a2,        0
788    addi.d        t0,      a0,        0
789    addi.d        t1,      a1,        0
790
791.LBOXSUM5_H_W:
792    vld           vr0,     t2,        0                   // a
793    vld           vr1,     t2,        REST_UNIT_STRIDE    // b
794    vld           vr2,     t2,        REST_UNIT_STRIDE<<1 // c
795    vld           vr3,     t2,        REST_UNIT_STRIDE*3  // d
796    vld           vr4,     t2,        REST_UNIT_STRIDE<<2 // e
797
798    vilvl.b       vr5,     vr1,       vr0
799    vilvh.b       vr6,     vr1,       vr0
800    vilvl.b       vr7,     vr3,       vr2
801    vilvh.b       vr8,     vr3,       vr2
802    //sum_v
803    vhaddw.hu.bu  vr9,     vr5,       vr5  // 0 1  2  3  4  5  6  7
804    vhaddw.hu.bu  vr10,    vr6,       vr6  // 8 9 10 11 12 13 14 15  a+b
805    vhaddw.hu.bu  vr11,    vr7,       vr7
806    vhaddw.hu.bu  vr12,    vr8,       vr8
807    vadd.h        vr9,     vr9,       vr11
808    vadd.h        vr10,    vr10,      vr12  // a + b + c + d
809    vsllwil.hu.bu vr11,    vr4,       0
810    vexth.hu.bu   vr12,    vr4
811    vadd.h        vr9,     vr9,       vr11
812    vadd.h        vr10,    vr10,      vr12
813    vst           vr9,     t1,        0
814    vst           vr10,    t1,        16
815    addi.d        t1,      t1,        32
816
817    // sumsq
818    vmulwev.h.bu  vr9,     vr5,       vr5  // a*a 0 1  2  3  4  5  6  7
819    vmulwev.h.bu  vr10,    vr6,       vr6  // a*a 8 9 10 11 12 13 14 15
820    vmulwod.h.bu  vr13,    vr5,       vr5  // b*b 0 1  2  3  4  5  6  7
821    vmulwod.h.bu  vr14,    vr6,       vr6  // b*b 8 9 10 11 12 13 14 15
822    vmulwev.h.bu  vr15,    vr7,       vr7  // c*c 0 1  2  3  4  5  6  7
823    vmulwev.h.bu  vr16,    vr8,       vr8  // c*c 8 9 10 11 12 13 14 15
824    vmulwod.h.bu  vr17,    vr7,       vr7  // d*d 0 1  2  3  4  5  6  7
825    vmulwod.h.bu  vr18,    vr8,       vr8  // d*d 8 9 10 11 12 13 14 15
826    vaddwev.w.hu  vr5,     vr9,       vr13  // 0 2 4 6
827    vaddwod.w.hu  vr6,     vr9,       vr13  // 1 3 5 7
828    vaddwev.w.hu  vr7,     vr10,      vr14  // 8 10 12 14
829    vaddwod.w.hu  vr8,     vr10,      vr14  // 9 11 13 15   a + b
830    vaddwev.w.hu  vr19,    vr15,      vr17  // 0 2 4 6
831    vaddwod.w.hu  vr20,    vr15,      vr17  // 1 3 5 7
832    vaddwev.w.hu  vr21,    vr16,      vr18  // 8 10 12 14
833    vaddwod.w.hu  vr22,    vr16,      vr18  // 9 11 13 15   c + d
834    vadd.w        vr5,     vr5,       vr19
835    vadd.w        vr6,     vr6,       vr20
836    vadd.w        vr7,     vr7,       vr21
837    vadd.w        vr8,     vr8,       vr22
838    vilvl.w       vr19,    vr6,       vr5
839    vilvh.w       vr20,    vr6,       vr5
840    vilvl.w       vr21,    vr8,       vr7
841    vilvh.w       vr22,    vr8,       vr7
842    vmul.h        vr11,    vr11,      vr11
843    vmul.h        vr12,    vr12,      vr12
844    vsllwil.wu.hu vr0,     vr11,      0
845    vexth.wu.hu   vr1,     vr11
846    vsllwil.wu.hu vr2,     vr12,      0
847    vexth.wu.hu   vr3,     vr12
848    vadd.w        vr19,    vr19,      vr0
849    vadd.w        vr20,    vr20,      vr1
850    vadd.w        vr21,    vr21,      vr2
851    vadd.w        vr22,    vr22,      vr3
852    vst           vr19,    t0,        0
853    vst           vr20,    t0,        16
854    vst           vr21,    t0,        32
855    vst           vr22,    t0,        48
856    addi.d        t0,      t0,        64
857    addi.d        t2,      t2,        16
858    addi.w        t3,      t3,        -16
859    blt           zero,    t3,        .LBOXSUM5_H_W
860
861    addi.d        a0,      a0,        REST_UNIT_STRIDE<<2
862    addi.d        a1,      a1,        REST_UNIT_STRIDE<<1
863    addi.d        a2,      a2,        REST_UNIT_STRIDE
864    addi.d        a4,      a4,        -1
865    bnez          a4,      .LBOXSUM5_H_H
866endfunc
867
868/*
869void boxsum5_v(int32_t *sumsq, coef *sum,
870               const int w, const int h)
871*/
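/*
Rough scalar model (illustrative; names are assumptions): the second boxsum5
pass replaces each element with a centred 5-tap horizontal sum of the
original values, row by row.

static void boxsum5_v_model(int32_t *sumsq, int16_t *sum,
                            const int w, const int h)
{
    sumsq += REST_UNIT_STRIDE; sum += REST_UNIT_STRIDE;
    for (int y = 0; y < h - 4; y++) {
        int a  = sum[0],   b  = sum[1],   c  = sum[2],   d  = sum[3];
        int a2 = sumsq[0], b2 = sumsq[1], c2 = sumsq[2], d2 = sumsq[3];
        for (int x = 2; x < w - 2; x++) {
            const int e = sum[x + 2], e2 = sumsq[x + 2];
            sum  [x] = a + b + c + d + e;        // sums of pre-pass values
            sumsq[x] = a2 + b2 + c2 + d2 + e2;
            a = b; b = c; c = d; d = e;
            a2 = b2; b2 = c2; c2 = d2; d2 = e2;
        }
        sumsq += REST_UNIT_STRIDE; sum += REST_UNIT_STRIDE;
    }
}
*/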
872function boxsum5_v_8bpc_lsx
873    addi.d         a0,      a0,      (REST_UNIT_STRIDE<<2)
874    addi.d         a1,      a1,      (REST_UNIT_STRIDE<<1)
875    addi.w         a3,      a3,      -4
876    addi.w         a2,      a2,      -4
877
878.LBOXSUM5_V_H:
879    addi.w         t3,      a2,      0
880    addi.d         t0,      a0,      0
881    addi.d         t1,      a1,      0
882    addi.d         t2,      a0,      8
883    addi.d         t3,      a1,      4
884    addi.d         t4,      a2,      0
885
886    vld            vr0,     t1,      0   // a 0 1 2 3 4 5 6 7
887    vld            vr1,     t1,      2   // b 1 2 3 4 5 6 7 8
888    vld            vr2,     t1,      4   // c 2
889    vld            vr3,     t1,      6   // d 3
890    vld            vr4,     t1,      8   // e 4 5 6 7 8 9 10 11
891    vadd.h         vr5,     vr0,     vr1
892    vadd.h         vr6,     vr2,     vr3
893    vpickve2gr.w   t5,      vr4,     2
894    vadd.h         vr5,     vr5,     vr6
895    vadd.h         vr5,     vr5,     vr4
896    vst            vr5,     t3,      0
897
898    vld            vr0,     t0,      0  // 0 1 2 3   a
899    vld            vr1,     t0,      4  // 1 2 3 4   b
900    vld            vr2,     t0,      8  // 2 3 4 5   c
901    vld            vr3,     t0,      12 // 3 4 5 6   d
902    vld            vr4,     t0,      16 // 4 5 6 7   e  a
903    vld            vr5,     t0,      20 // 5 6 7 8      b
904    vld            vr6,     t0,      24 // 6 7 8 9      c
905    vld            vr7,     t0,      28 // 7 8 9 10     d
906    vld            vr8,     t0,      32 // 8 9 10 11    e
907
908    vadd.w         vr9,     vr0,     vr1
909    vadd.w         vr10,    vr2,     vr3
910    vadd.w         vr9,     vr9,     vr10
911    vadd.w         vr9,     vr9,     vr4
912    vadd.w         vr10,    vr4,     vr5
913    vadd.w         vr11,    vr6,     vr7
914    vadd.w         vr10,    vr10,    vr8
915    vadd.w         vr10,    vr10,    vr11
916    vst            vr9,     t2,      0
917    vst            vr10,    t2,      16
918
919    addi.d         t3,      t3,      16
920    addi.d         t1,      t1,      16
921    addi.d         t0,      t0,      32
922    addi.d         t2,      t2,      32
923    addi.w         t4,      t4,      -8
924    ble            t4,      zero,    .LBOXSUM5_V_H1
925
926.LBOXSUM5_V_W:
927    vld            vr0,     t1,      0   // a 0 1 2 3 4 5 6 7
928    vld            vr1,     t1,      2   // b 1 2 3 4 5 6 7 8
929    vld            vr2,     t1,      4   // c 2
930    vld            vr3,     t1,      6   // d 3
931    vld            vr4,     t1,      8   // e 4 5 6 7 8 9 10 11
932    vinsgr2vr.w    vr0,     t5,      0
933    vpickve2gr.w   t5,      vr4,     2
934    vextrins.h     vr1,     vr0,     0x01
935    vadd.h         vr5,     vr0,     vr1
936    vadd.h         vr6,     vr2,     vr3
937    vadd.h         vr5,     vr5,     vr6
938    vadd.h         vr5,     vr5,     vr4
939    vst            vr5,     t3,      0
940
941    vaddi.hu       vr0,     vr8,     0  // 8  9 10 11  a
942    vld            vr1,     t0,      4  // 9 10 11 12  b
943    vld            vr2,     t0,      8  // 10 11 12 13 c
944    vld            vr1,     t0,      4  // 9 10 11 12   b
945    vld            vr2,     t0,      8  // 10 11 12 13  c
946    vld            vr3,     t0,      12 // 11 12 13 14  d
947    vld            vr4,     t0,      16 // 12 13 14 15  e  a
948    vld            vr5,     t0,      20 // 13 14 15 16     b
949    vld            vr6,     t0,      24 // 14 15 16 17     c
950    vld            vr7,     t0,      28 // 15 16 17 18     d
951    vld            vr8,     t0,      32 // 16 17 18 19     e
950    vextrins.w     vr1,     vr0,     0x01
951    vadd.w         vr9,     vr0,     vr1
952    vadd.w         vr10,    vr2,     vr3
953    vadd.w         vr9,     vr9,     vr10
954    vadd.w         vr9,     vr9,     vr4
955    vadd.w         vr10,    vr4,     vr5
956    vadd.w         vr11,    vr6,     vr7
957    vadd.w         vr10,    vr10,    vr8
958    vadd.w         vr10,    vr10,    vr11
959    vst            vr9,     t2,      0
960    vst            vr10,    t2,      16
961
962    addi.d         t3,      t3,      16
963    addi.d         t1,      t1,      16
964    addi.d         t0,      t0,      32
965    addi.d         t2,      t2,      32
966    addi.w         t4,      t4,      -8
967    blt            zero,    t4,      .LBOXSUM5_V_W
968
969.LBOXSUM5_V_H1:
970    addi.d         a1,       a1,      REST_UNIT_STRIDE<<1
971    addi.d         a0,       a0,      REST_UNIT_STRIDE<<2
972    addi.w         a3,       a3,      -1
973    bnez           a3,       .LBOXSUM5_V_H
974endfunc
975
976/*
977selfguided_filter(int32_t *sumsq, coef *sum,
978                  const int w, const int h,
979                  const unsigned s)
980*/
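/*
Same per-element transform as in boxsum3_sgf_h above (see the model there),
but for the 5x5 box: n = 25, the reciprocal is 164 ~= (1 << 12) / 25, and
only every second row is visited (sumsq/sum advance by two rows per iteration
and h is decremented by 2). Illustrative scalar form of the inner step:

    const int d = AA[i] * 25 - BB[i] * BB[i];
    const int p = d > 0 ? d : 0;
    const unsigned z = (p * s + (1 << 19)) >> 20;
    const int x = dav1d_sgr_x_by_x[z > 255 ? 255 : z];
    AA[i] = (x * BB[i] * 164 + (1 << 11)) >> 12;
    BB[i] = 256 - x;
*/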
981function boxsum5_sgf_h_8bpc_lsx
982    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
983    addi.d        a0,       a0,        12   // AA
984    addi.d        a1,       a1,        REST_UNIT_STRIDE<<1
985    addi.d        a1,       a1,        6    // BB
986    la.local      t8,       dav1d_sgr_x_by_x
987    li.w          t6,       164
988    vreplgr2vr.w  vr20,     t6
989    li.w          t6,       255
990    vreplgr2vr.w  vr22,     t6
991    vaddi.wu      vr21,     vr22,      1  // 256
992    vreplgr2vr.w  vr6,      a4
993    vldi          vr19,     0x819
994    addi.w        a2,       a2,        2  // w + 2
995    addi.w        a3,       a3,        2  // h + 2
996
997.LBS5SGF_H_H:
998    addi.w        t2,       a2,        0
999    addi.d        t0,       a0,        -4
1000    addi.d        t1,       a1,        -2
1001
1002.LBS5SGF_H_W:
1003    vld           vr0,      t0,        0   // AA[i]
1004    vld           vr1,      t0,        16
1005    vld           vr2,      t1,        0   // BB[i]
1006
1007    vmul.w        vr4,      vr0,       vr19 // a * n
1008    vmul.w        vr5,      vr1,       vr19 // a * n
1009    vsllwil.w.h   vr9,      vr2,       0
1010    vexth.w.h     vr10,     vr2
1011    vmsub.w       vr4,      vr9,       vr9   // p
1012    vmsub.w       vr5,      vr10,      vr10   // p
1013    vmaxi.w       vr4,      vr4,       0
1014    vmaxi.w       vr5,      vr5,       0    // p
1015    vmul.w        vr4,      vr4,       vr6  // p * s
1016    vmul.w        vr5,      vr5,       vr6  // p * s
1017    vsrlri.w      vr4,      vr4,       20
1018    vsrlri.w      vr5,      vr5,       20   // z
1019    vmin.w        vr4,      vr4,       vr22
1020    vmin.w        vr5,      vr5,       vr22
1021
1022    // load table data
1023    vpickve2gr.w  t6,       vr4,       0
1024    ldx.bu        t7,       t8,        t6
1025    vinsgr2vr.w   vr7,      t7,        0
1026    vpickve2gr.w  t6,       vr4,       1
1027    ldx.bu        t7,       t8,        t6
1028    vinsgr2vr.w   vr7,      t7,        1
1029    vpickve2gr.w  t6,       vr4,       2
1030    ldx.bu        t7,       t8,        t6
1031    vinsgr2vr.w   vr7,      t7,        2
1032    vpickve2gr.w  t6,       vr4,       3
1033    ldx.bu        t7,       t8,        t6
1034    vinsgr2vr.w   vr7,      t7,        3
1035
1036    vpickve2gr.w  t6,       vr5,       0
1037    ldx.bu        t7,       t8,        t6
1038    vinsgr2vr.w   vr8,      t7,        0
1039    vpickve2gr.w  t6,       vr5,       1
1040    ldx.bu        t7,       t8,        t6
1041    vinsgr2vr.w   vr8,      t7,        1
1042    vpickve2gr.w  t6,       vr5,       2
1043    ldx.bu        t7,       t8,        t6
1044    vinsgr2vr.w   vr8,      t7,        2
1045    vpickve2gr.w  t6,       vr5,       3
1046    ldx.bu        t7,       t8,        t6
1047    vinsgr2vr.w   vr8,      t7,        3     // x
1048
1049    vmul.w        vr9,      vr7,       vr9   // x * BB[i]
1050    vmul.w        vr10,     vr8,       vr10
1051    vmul.w        vr9,      vr9,       vr20  // x * BB[i] * sgr_one_by_x
1052    vmul.w        vr10,     vr10,      vr20
1053    vsrlri.w      vr9,      vr9,       12
1054    vsrlri.w      vr10,     vr10,      12
1055    vsub.w        vr7,      vr21,      vr7
1056    vsub.w        vr8,      vr21,      vr8
1057    vpickev.h     vr8,      vr8,       vr7
1058    vst           vr9,      t0,        0
1059    vst           vr10,     t0,        16
1060    vst           vr8,      t1,        0
1061    addi.d        t0,       t0,        32
1062    addi.d        t1,       t1,        16
1063    addi.w        t2,       t2,        -8
1064    blt           zero,     t2,        .LBS5SGF_H_W
1065
1066    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
1067    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
1068    addi.d        a1,       a1,        REST_UNIT_STRIDE<<2
1069    addi.w        a3,       a3,        -2
1070    blt           zero,     a3,        .LBS5SGF_H_H
1071endfunc
1072
1073/*
1074selfguided_filter(coef *dst, pixel *src,
1075                  int32_t *sumsq, coef *sum,
1076                  const int w, const int h)
1077*/
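/*
Rough scalar model (illustrative; names are assumptions) of the two-row
weighting below, writing S for REST_UNIT_STRIDE. A is the sumsq plane as
rewritten by boxsum5_sgf_h, B the sum plane (256 - x values), src the padded
8-bit source and dst an int16 buffer with a 384-element row stride. Rows are
processed in pairs: 6/5 vertical+diagonal weights on the even row, 6/5
horizontal weights on the odd row.

for (int j = 0; j < h - 1; j += 2) {
    for (int i = 0; i < w; i++) {      // even row
        const int a = (B[i - S] + B[i + S]) * 6 +
                      (B[i - 1 - S] + B[i + 1 - S] + B[i - 1 + S] + B[i + 1 + S]) * 5;
        const int b = (A[i - S] + A[i + S]) * 6 +
                      (A[i - 1 - S] + A[i + 1 - S] + A[i - 1 + S] + A[i + 1 + S]) * 5;
        dst[i] = (b + a * src[i] + (1 << 8)) >> 9;
    }
    for (int i = 0; i < w; i++) {      // odd row
        const int a = B[i + S] * 6 + (B[i - 1 + S] + B[i + 1 + S]) * 5;
        const int b = A[i + S] * 6 + (A[i - 1 + S] + A[i + 1 + S]) * 5;
        dst[384 + i] = (b + a * src[i + S] + (1 << 7)) >> 8;
    }
    dst += 2 * 384; src += 2 * S; A += 2 * S; B += 2 * S;
}
// a trailing row, when h is odd, is handled like the even-row case above
*/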
1078function boxsum5_sgf_v_8bpc_lsx
1079    addi.d        a1,        a1,       3*REST_UNIT_STRIDE+3       // src
1080    addi.d        a2,        a2,       (2*REST_UNIT_STRIDE+3)<<1  // A
1081    addi.d        a2,        a2,       (2*REST_UNIT_STRIDE+3)<<1
1082    addi.d        a3,        a3,       (2*REST_UNIT_STRIDE+3)<<1  // B
1083    addi.w        a5,        a5,       -1
1084    vldi          vr10,      0x806
1085    vldi          vr11,      0x805
1086    vldi          vr22,      0x406
1087
1088.LBS5SGF_V_H:
1089    addi.d        t0,        a0,       0
1090    addi.d        t1,        a1,       0
1091    addi.d        t2,        a2,       0
1092    addi.d        t3,        a3,       0
1093    addi.w        t4,        a4,       0
1094
1095    addi.d        t5,        a0,       384*2
1096    addi.d        t6,        a1,       REST_UNIT_STRIDE
1097    addi.d        t7,        a2,       REST_UNIT_STRIDE<<2
1098    addi.d        t8,        a3,       REST_UNIT_STRIDE<<1   // B
1099.LBS5SGF_V_W:
1100    // a
1101    vld           vr0,       t3,       -REST_UNIT_STRIDE*2
1102    vld           vr1,       t3,       REST_UNIT_STRIDE*2
1103    vld           vr2,       t3,       (-REST_UNIT_STRIDE-1)*2
1104    vld           vr3,       t3,       (REST_UNIT_STRIDE-1)*2
1105    vld           vr4,       t3,       (1-REST_UNIT_STRIDE)*2
1106    vld           vr5,       t3,       (1+REST_UNIT_STRIDE)*2
1107    vaddwev.w.h   vr6,       vr0,      vr1
1108    vaddwod.w.h   vr7,       vr0,      vr1
1109    vmul.w        vr6,       vr6,      vr10
1110    vmul.w        vr7,       vr7,      vr10
1111    vaddwev.w.h   vr8,       vr2,      vr3
1112    vaddwod.w.h   vr9,       vr2,      vr3
1113    vaddwev.w.h   vr12,      vr4,      vr5
1114    vaddwod.w.h   vr13,      vr4,      vr5
1115    vadd.w        vr8,       vr8,      vr12
1116    vadd.w        vr9,       vr9,      vr13
1117    vmadd.w       vr6,       vr8,      vr11
1118    vmadd.w       vr7,       vr9,      vr11
1119    vilvl.w       vr18,      vr7,      vr6
1120    vilvh.w       vr19,      vr7,      vr6
1121    // b
1122    vld           vr0,       t2,       -REST_UNIT_STRIDE*4
1123    vld           vr1,       t2,       -REST_UNIT_STRIDE*4+16
1124    vld           vr2,       t2,       REST_UNIT_STRIDE*4
1125    vld           vr3,       t2,       REST_UNIT_STRIDE*4+16
1126    vld           vr4,       t2,       (-REST_UNIT_STRIDE-1)*4
1127    vld           vr5,       t2,       (-REST_UNIT_STRIDE-1)*4+16
1128    vld           vr8,       t2,       (REST_UNIT_STRIDE-1)*4
1129    vld           vr9,       t2,       (REST_UNIT_STRIDE-1)*4+16
1130    vld           vr12,      t2,       (1-REST_UNIT_STRIDE)*4
1131    vld           vr13,      t2,       (1-REST_UNIT_STRIDE)*4+16
1132    vld           vr14,      t2,       (1+REST_UNIT_STRIDE)*4
1133    vld           vr15,      t2,       (1+REST_UNIT_STRIDE)*4+16
1134    vadd.w        vr0,       vr0,      vr2  // 0 1 2 3
1135    vadd.w        vr1,       vr1,      vr3  // 4 5 6 7
1136    vmul.w        vr20,      vr0,      vr10
1137    vmul.w        vr21,      vr1,      vr10
1138    vadd.w        vr4,       vr4,      vr8  // 0 1 2 3
1139    vadd.w        vr5,       vr5,      vr9  // 4 5 6 7
1140    vadd.w        vr12,      vr12,     vr14
1141    vadd.w        vr13,      vr13,     vr15
1142    vadd.w        vr12,      vr12,     vr4
1143    vadd.w        vr13,      vr13,     vr5
1144    vmadd.w       vr20,      vr12,     vr11
1145    vmadd.w       vr21,      vr13,     vr11
1146    vld           vr2,       t1,       0
1147    vsllwil.hu.bu vr2,       vr2,      0
1148    vsllwil.wu.hu vr3,       vr2,      0
1149    vexth.wu.hu   vr4,       vr2
1150    vmadd.w       vr20,      vr18,     vr3
1151    vmadd.w       vr21,      vr19,     vr4
1152    vssrlrni.h.w  vr21,      vr20,     9
1153    vst           vr21,      t0,       0
1154
1155    addi.d        t1,        t1,       8
1156    addi.d        t2,        t2,       32
1157    addi.d        t3,        t3,       16
1158
1159    // a
1160    vld           vr0,       t8,       0
1161    vld           vr1,       t8,       -2
1162    vld           vr2,       t8,       2
1163    vmulwev.w.h   vr3,       vr0,      vr22
1164    vmulwod.w.h   vr4,       vr0,      vr22
1165    vaddwev.w.h   vr5,       vr1,      vr2
1166    vaddwod.w.h   vr6,       vr1,      vr2
1167    vmadd.w       vr3,       vr5,      vr11
1168    vmadd.w       vr4,       vr6,      vr11
1169    vilvl.w       vr19,      vr4,      vr3
1170    vilvh.w       vr20,      vr4,      vr3
1171    // b
1172    vld           vr0,       t7,       0
1173    vld           vr1,       t7,       -4
1174    vld           vr2,       t7,       4
1175    vld           vr5,       t7,       16
1176    vld           vr6,       t7,       12
1177    vld           vr7,       t7,       20
1178    vmul.w        vr8,       vr0,      vr10
1179    vmul.w        vr9,       vr5,      vr10
1180    vadd.w        vr12,      vr1,      vr2
1181    vadd.w        vr13,      vr6,      vr7
1182    vmadd.w       vr8,       vr12,     vr11
1183    vmadd.w       vr9,       vr13,     vr11
1184    vld           vr2,       t6,       0
1185    vsllwil.hu.bu vr2,       vr2,      0
1186    vsllwil.wu.hu vr3,       vr2,      0
1187    vexth.wu.hu   vr4,       vr2
1188    vmadd.w       vr8,       vr19,     vr3
1189    vmadd.w       vr9,       vr20,     vr4
1190    vssrlrni.h.w  vr9,       vr8,      8
1191    vst           vr9,       t0,       384*2
1192
1193    addi.d        t0,        t0,       16
1194    addi.d        t8,        t8,       16
1195    addi.d        t7,        t7,       32
1196    addi.d        t6,        t6,       8
1197    addi.w        t4,        t4,       -8
1198    blt           zero,      t4,       .LBS5SGF_V_W
1199
1200    addi.w        a5,        a5,       -2
1201    addi.d        a0,        a0,       384*4                // dst
1202    addi.d        a1,        a1,       REST_UNIT_STRIDE<<1  // src
1203    addi.d        a2,        a2,       REST_UNIT_STRIDE<<2  //
1204    addi.d        a2,        a2,       REST_UNIT_STRIDE<<2
1205    addi.d        a3,        a3,       REST_UNIT_STRIDE<<2  //
1206    blt           zero,      a5,       .LBS5SGF_V_H
1207    bnez          a5,        .LBS5SGF_END
1208.LBS5SGF_V_W1:
1209    // a
1210    vld           vr0,       a3,       -REST_UNIT_STRIDE*2
1211    vld           vr1,       a3,       REST_UNIT_STRIDE*2
1212    vld           vr2,       a3,       (-REST_UNIT_STRIDE-1)*2
1213    vld           vr3,       a3,       (REST_UNIT_STRIDE-1)*2
1214    vld           vr4,       a3,       (1-REST_UNIT_STRIDE)*2
1215    vld           vr5,       a3,       (1+REST_UNIT_STRIDE)*2
1216    vaddwev.w.h   vr6,       vr0,      vr1
1217    vaddwod.w.h   vr7,       vr0,      vr1
1218    vmul.w        vr6,       vr6,      vr10
1219    vmul.w        vr7,       vr7,      vr10
1220    vaddwev.w.h   vr8,       vr2,      vr3
1221    vaddwod.w.h   vr9,       vr2,      vr3
1222    vaddwev.w.h   vr12,      vr4,      vr5
1223    vaddwod.w.h   vr13,      vr4,      vr5
1224    vadd.w        vr8,       vr8,      vr12
1225    vadd.w        vr9,       vr9,      vr13
1226    vmadd.w       vr6,       vr8,      vr11
1227    vmadd.w       vr7,       vr9,      vr11
1228    vilvl.w       vr18,      vr7,      vr6
1229    vilvh.w       vr19,      vr7,      vr6
1230    // b
1231    vld           vr0,       a2,       -REST_UNIT_STRIDE*4
1232    vld           vr1,       a2,       -REST_UNIT_STRIDE*4+16
1233    vld           vr2,       a2,       REST_UNIT_STRIDE*4
1234    vld           vr3,       a2,       REST_UNIT_STRIDE*4+16
1235    vld           vr4,       a2,       (-REST_UNIT_STRIDE-1)*4
1236    vld           vr5,       a2,       (-REST_UNIT_STRIDE-1)*4+16
1237    vld           vr8,       a2,       (REST_UNIT_STRIDE-1)*4
1238    vld           vr9,       a2,       (REST_UNIT_STRIDE-1)*4+16
1239    vld           vr12,      a2,       (1-REST_UNIT_STRIDE)*4
1240    vld           vr13,      a2,       (1-REST_UNIT_STRIDE)*4+16
1241    vld           vr14,      a2,       (1+REST_UNIT_STRIDE)*4
1242    vld           vr15,      a2,       (1+REST_UNIT_STRIDE)*4+16
1243    vadd.w        vr0,       vr0,      vr2  // 0 1 2 3
1244    vadd.w        vr1,       vr1,      vr3  // 4 5 6 7
1245    vmul.w        vr20,      vr0,      vr10
1246    vmul.w        vr21,      vr1,      vr10
1247    vadd.w        vr4,       vr4,      vr8  // 0 1 2 3
1248    vadd.w        vr5,       vr5,      vr9  // 4 5 6 7
1249    vadd.w        vr12,      vr12,     vr14
1250    vadd.w        vr13,      vr13,     vr15
1251    vadd.w        vr12,      vr12,     vr4
1252    vadd.w        vr13,      vr13,     vr5
1253    vmadd.w       vr20,      vr12,     vr11
1254    vmadd.w       vr21,      vr13,     vr11
1255    vld           vr2,       a1,       0
1256    vsllwil.hu.bu vr2,       vr2,      0
1257    vsllwil.wu.hu vr3,       vr2,      0
1258    vexth.wu.hu   vr4,       vr2
1259    vmadd.w       vr20,      vr18,     vr3
1260    vmadd.w       vr21,      vr19,     vr4
1261    vssrlrni.h.w  vr21,      vr20,     9
1262    vst           vr21,      a0,       0
1263    addi.d        a3,        a3,       16
1264    addi.d        a2,        a2,       32
1265    addi.d        a1,        a1,       8
1266    addi.d        a0,        a0,       16
1267    addi.w        a4,        a4,       -8
1268    blt           zero,      a4,       .LBS5SGF_V_W1
1269.LBS5SGF_END:
1270endfunc
1271
1272/*
1273void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride,
1274                              const int16_t *dst0, const int16_t *dst1,
1275                              const int w0, const int w1,
1276                              const int w, const int h);
1277*/
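/*
Rough scalar model (illustrative; the model name is an assumption): blend
both self-guided results into the picture with weights w0 and w1.

static void sgr_mix_finish_model(uint8_t *p, const ptrdiff_t stride,
                                 const int16_t *dst0, const int16_t *dst1,
                                 const int w0, const int w1,
                                 const int w, const int h)
{
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int u = p[i] << 4;
            const int v = (u << 7) + w0 * (dst0[i] - u) + w1 * (dst1[i] - u);
            const int px = (v + (1 << 10)) >> 11;
            p[i] = px < 0 ? 0 : px > 255 ? 255 : px;
        }
        p += stride;
        dst0 += FILTER_OUT_STRIDE;
        dst1 += FILTER_OUT_STRIDE;
    }
}
*/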
1278function sgr_mix_finish_8bpc_lsx
1279    vreplgr2vr.w  vr3,     a4            // w0
1280    vreplgr2vr.w  vr13,    a5            // w1
1281    andi          t4,      a6,       0x7
1282    sub.w         t5,      a6,       t4
1283
1284    beq           zero,    t5,      .LSGRMIX_REM
1285
1286.LSGRMIX_H:
1287    addi.d        t0,      a0,       0
1288    addi.d        t1,      a2,       0   // dst0
1289    addi.d        t3,      a3,       0   // dst1
1290    addi.w        t2,      t5,       0
1291    andi          t4,      a6,       0x7
1292.LSGRMIX_W:
1293    vld           vr0,     t0,       0
1294    vld           vr1,     t1,       0
1295    vld           vr10,    t3,       0
1296    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
1297    vsllwil.wu.hu vr4,     vr2,      0   // u 0 1 2 3
1298    vexth.wu.hu   vr5,     vr2           // u 4 5 6 7
1299    vslli.w       vr6,     vr4,      7
1300    vslli.w       vr7,     vr5,      7
1301    vsllwil.w.h   vr8,     vr1,      0   // dst0
1302    vexth.w.h     vr9,     vr1           // dst0
1303    vsub.w        vr8,     vr8,      vr4
1304    vsub.w        vr9,     vr9,      vr5
1305    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
1306    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
1307
1308    vsllwil.w.h   vr11,    vr10,     0    // dst1
1309    vexth.w.h     vr12,    vr10           // dst1
1310    vsub.w        vr11,    vr11,     vr4
1311    vsub.w        vr12,    vr12,     vr5
1312    vmadd.w       vr6,     vr11,     vr13
1313    vmadd.w       vr7,     vr12,     vr13
1314
1315    vssrarni.hu.w vr7,     vr6,      11
1316    vssrlni.bu.h  vr7,     vr7,      0
1317    vstelm.d      vr7,     t0,       0,    0
1318    addi.d        t0,      t0,       8
1319    addi.d        t1,      t1,       16
1320    addi.d        t3,      t3,       16
1321    addi.d        t2,      t2,       -8
1322    bne           zero,    t2,       .LSGRMIX_W
1323
1324    beq           t4,      zero,     .LSGRMIX_W8
1325
1326    vld           vr0,     t0,       0
1327    vld           vr1,     t1,       0
1328    vld           vr10,    t3,       0
1329    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
1330    vsllwil.wu.hu vr4,     vr2,      0   // p
1331    vexth.wu.hu   vr5,     vr2           // p
1332    vslli.w       vr6,     vr4,      7
1333    vslli.w       vr7,     vr5,      7
1334    vsllwil.w.h   vr8,     vr1,      0   // dst
1335    vexth.w.h     vr9,     vr1           // dst
1336    vsub.w        vr8,     vr8,      vr4
1337    vsub.w        vr9,     vr9,      vr5
1338    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
1339    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
1340
1341    vsllwil.w.h   vr11,    vr10,     0    // dst1
1342    vexth.w.h     vr12,    vr10           // dst1
1343    vsub.w        vr11,    vr11,     vr4
1344    vsub.w        vr12,    vr12,     vr5
1345    vmadd.w       vr6,     vr11,     vr13
1346    vmadd.w       vr7,     vr12,     vr13
1347
1348    vssrarni.hu.w vr7,     vr6,      11
1349    vssrlni.bu.h  vr7,     vr7,      0
1350
1351.LSGRMIX_ST:
1352    vstelm.b      vr7,     t0,       0,    0
1353    addi.d        t0,      t0,       1
1354    vbsrl.v       vr7,     vr7,      1
1355    addi.w        t4,      t4,       -1
1356    bnez          t4,      .LSGRMIX_ST
1357
1358.LSGRMIX_W8:
1359    addi.w        a7,      a7,       -1
1360    add.d         a0,      a0,       a1
1361    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
1362    addi.d        a3,      a3,       (FILTER_OUT_STRIDE<<1)
1363    bnez          a7,      .LSGRMIX_H
1364    b             .LSGR_MIX_END
1365
1366.LSGRMIX_REM:
1367    andi          t4,      a6,       0x7
1368    vld           vr0,     a0,       0
1369    vld           vr1,     a2,       0
1370    vld           vr10,    a3,       0
1371    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
1372    vsllwil.wu.hu vr4,     vr2,      0   // p
1373    vexth.wu.hu   vr5,     vr2           // p
1374    vslli.w       vr6,     vr4,      7
1375    vslli.w       vr7,     vr5,      7
1376    vsllwil.w.h   vr8,     vr1,      0   // dst
1377    vexth.w.h     vr9,     vr1           // dst
1378    vsub.w        vr8,     vr8,      vr4
1379    vsub.w        vr9,     vr9,      vr5
1380    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
1381    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
1382
1383    vsllwil.w.h   vr11,    vr10,     0    // dst1
1384    vexth.w.h     vr12,    vr10           // dst1
1385    vsub.w        vr11,    vr11,     vr4
1386    vsub.w        vr12,    vr12,     vr5
1387    vmadd.w       vr6,     vr11,     vr13
1388    vmadd.w       vr7,     vr12,     vr13
1389
1390    vssrarni.hu.w vr7,     vr6,      11
1391    vssrlni.bu.h  vr7,     vr7,      0
1392    addi.d        t0,      a0,       0
1393.LSGRMIX_REM_ST:
1394    vstelm.b      vr7,     t0,       0,    0
1395    addi.d        t0,      t0,       1
1396    vbsrl.v       vr7,     vr7,      1
1397    addi.w        t4,      t4,       -1
1398    bnez          t4,      .LSGRMIX_REM_ST
1399
1400    addi.w        a7,      a7,       -1
1401    add.d         a0,      a0,       a1
1402    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
1403    addi.d        a3,      a3,       (FILTER_OUT_STRIDE<<1)
1404    bnez          a7,      .LSGRMIX_REM
1405
1406.LSGR_MIX_END:
1407endfunc
1408