/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "loongson_asm.S"

30const min_prob
31  .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
32endconst
34.macro decode_symbol_adapt w
35    addi.d          sp,      sp,     -48
36    addi.d          a4,      a0,      24
37    vldrepl.h       vr0,     a4,      0    //rng
38    fst.s           f0,      sp,      0    //val==0
39    vld             vr1,     a1,      0    //cdf
40.if \w == 16
41    li.w            t4,      16
42    vldx            vr11,    a1,      t4
43.endif
44    addi.d          a6,      a0,      16
45    vldrepl.d       vr2,     a6,      0    //dif
46    addi.d          t0,      a0,      32
47    ld.w            t1,      t0,      0    //allow_update_cdf
48    la.local        t2,      min_prob
49    addi.d          t2,      t2,      32
50    addi.w          t3,      a2,      1
51    slli.w          t3,      t3,      1
52    sub.d           t2,      t2,      t3
53    vld             vr3,     t2,      0    //min_prob
54.if \w == 16
55    vldx            vr13,    t2,      t4
56.endif
57    vsrli.h         vr4,     vr0,     8    //r = s->rng >> 8
58    vslli.h         vr4,     vr4,     8    //r << 8
59    vsrli.h         vr5,     vr1,     6
60    vslli.h         vr5,     vr5,     7
61.if \w == 16
62    vsrli.h         vr15,    vr11,    6
63    vslli.h         vr15,    vr15,    7
64.endif
65    vmuh.hu         vr5,     vr4,     vr5
66    vadd.h          vr5,     vr5,     vr3  //v
67.if \w == 16
68    vmuh.hu         vr15,    vr4,     vr15
69    vadd.h          vr15,    vr15,    vr13
70.endif
71    addi.d          t8,      sp,      4
72    vst             vr5,     t8,      0    //store v
73.if \w == 16
74    vstx            vr15,    t8,      t4
75.endif
76    vreplvei.h      vr20,    vr2,     3    //c
77    vssub.hu        vr6,     vr5,     vr20 //c >=v
78    vseqi.h         vr6,     vr6,     0
79.if \w == 16
80    vssub.hu        vr16,    vr15,    vr20 //c >=v
81    vseqi.h         vr16,    vr16,    0
82    vpickev.b       vr21,    vr16,    vr6
83.endif
84.if \w <= 8
85    vmskltz.h       vr10,    vr6
86.else
87    vmskltz.b       vr10,    vr21
88.endif
89    beqz            t1,      .renorm\()\w
90
91    // update_cdf
92    alsl.d          t1,      a2,      a1,   1
93    ld.h            t2,      t1,      0    //count
94    srli.w          t3,      t2,      4    //count >> 4
95    addi.w          t3,      t3,      4
96    li.w            t5,      2
97    sltu            t5,      t5,      a2
98    add.w           t3,      t3,      t5   //rate
99    sltui           t5,      t2,      32
100    add.w           t2,      t2,      t5   //count + (count < 32)
101    vreplgr2vr.h    vr9,     t3
102    vseq.h          vr7,     vr7,     vr7
103    vavgr.hu        vr5,     vr6,     vr7  //i >= val ? -1 : 32768
104    vsub.h          vr5,     vr5,     vr1
105    vsub.h          vr8,     vr1,     vr6
106.if \w == 16
107    vavgr.hu        vr15,    vr16,    vr7
108    vsub.h          vr15,    vr15,    vr11
109    vsub.h          vr18,    vr11,    vr16
110.endif
111    vsra.h          vr5,     vr5,     vr9
112    vadd.h          vr8,     vr8,     vr5
113.if \w == 4
114    fst.d           f8,      a1,      0
115.else
116    vst             vr8,     a1,      0
117.endif
118.if \w == 16
119    vsra.h          vr15,    vr15,    vr9
120    vadd.h          vr18,    vr18,    vr15
121    vstx            vr18,    a1,      t4
122.endif
123    st.h            t2,      t1,      0
124
125.renorm\()\w:
126    vpickve2gr.h    t3,      vr10,    0
127    ctz.w           a7,      t3            // ret
128    alsl.d          t3,      a7,      t8,      1
129    ld.hu           t4,      t3,      0    // v
130    addi.d          t3,      t3,      -2
131    ld.hu           t5,      t3,      0    // u
132    sub.w           t5,      t5,      t4   // rng
133    slli.d          t4,      t4,      48
134    vpickve2gr.d    t6,      vr2,     0
135    sub.d           t6,      t6,      t4   // dif
136    clz.w           t4,      t5            // d
137    xori            t4,      t4,      16   // d
138    sll.d           t6,      t6,      t4
139    addi.d          a5,      a0,      28   // cnt
140    ld.w            t0,      a5,      0
141    sll.w           t5,      t5,      t4
142    sub.w           t7,      t0,      t4   // cnt-d
143    st.w            t5,      a4,      0    // store rng
144    bgeu            t0,      t4,      9f
145
146    // refill
147    ld.d            t0,      a0,      0    // buf_pos
148    ld.d            t1,      a0,      8    // buf_end
149    addi.d          t2,      t0,      8
150    bltu            t1,      t2,      2f
151
152    ld.d            t3,      t0,      0    // next_bits
153    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
154    nor             t3,      t3,      t3
155    sub.w           t2,      zero,    t1
156    revb.d          t3,      t3            // next_bits = bswap(next_bits)
157    srli.w          t2,      t2,      3    // num_bytes_read
158    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
159    b               3f
1601:
161    addi.w          t3,      t7,      -48
162    srl.d           t3,      t3,      t3   // pad with ones
163    b               4f
1642:
165    bgeu            t0,      t1,      1b
166    ld.d            t3,      t1,      -8   // next_bits
167    sub.w           t2,      t2,      t1
168    sub.w           t1,      t1,      t0   // num_bytes_left
169    slli.w          t2,      t2,      3
170    srl.d           t3,      t3,      t2
171    addi.w          t2,      t7,      -48
172    nor             t3,      t3,      t3
173    sub.w           t4,      zero,    t2
174    revb.d          t3,      t3
175    srli.w          t4,      t4,      3
176    srl.d           t3,      t3,      t2
177    sltu            t2,      t1,      t4
178    maskeqz         t1,      t1,      t2
179    masknez         t2,      t4,      t2
180    or              t2,      t2,      t1   // num_bytes_read
1813:
182    slli.w          t1,      t2,      3
183    add.d           t0,      t0,      t2
184    add.w           t7,      t7,      t1   // cnt += num_bits_read
185    st.d            t0,      a0,      0
1864:
187    or              t6,      t6,      t3   // dif |= next_bits
1889:
189    st.w            t7,      a5,      0    // store cnt
190    st.d            t6,      a6,      0    // store dif
191    move            a0,      a7
192    addi.d          sp,      sp,      48
193.endm
195function msac_decode_symbol_adapt4_lsx
196    decode_symbol_adapt 4
197endfunc
199function msac_decode_symbol_adapt8_lsx
200    decode_symbol_adapt 8
201endfunc
203function msac_decode_symbol_adapt16_lsx
204    decode_symbol_adapt 16
205endfunc
207function msac_decode_bool_lsx
208    ld.w            t0,      a0,      24   // rng
209    srli.w          a1,      a1,      6
210    ld.d            t1,      a0,      16   // dif
211    srli.w          t2,      t0,      8    // r >> 8
212    mul.w           t2,      t2,      a1
213    ld.w            a5,      a0,      28   // cnt
214    srli.w          t2,      t2,      1
215    addi.w          t2,      t2,      4    // v
216    slli.d          t3,      t2,      48   // vw
217    sltu            t4,      t1,      t3
218    move            t8,      t4            // ret
219    xori            t4,      t4,      1
220    maskeqz         t6,      t3,      t4   // if (ret) vw
221    sub.d           t6,      t1,      t6   // dif
222    slli.w          t5,      t2,      1
223    sub.w           t5,      t0,      t5   // r - 2v
224    maskeqz         t7,      t5,      t4   // if (ret) r - 2v
225    add.w           t5,      t2,      t7   // v(rng)
226
227    // renorm
228    clz.w           t4,      t5            // d
229    xori            t4,      t4,      16   // d
230    sll.d           t6,      t6,      t4
231    sll.w           t5,      t5,      t4
232    sub.w           t7,      a5,      t4   // cnt-d
233    st.w            t5,      a0,      24   // store rng
234    bgeu            a5,      t4,      9f
235
236    // refill
237    ld.d            t0,      a0,      0    // buf_pos
238    ld.d            t1,      a0,      8    // buf_end
239    addi.d          t2,      t0,      8
240    bltu            t1,      t2,      2f
241
242    ld.d            t3,      t0,      0    // next_bits
243    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
244    nor             t3,      t3,      t3
245    sub.w           t2,      zero,    t1
246    revb.d          t3,      t3            // next_bits = bswap(next_bits)
247    srli.w          t2,      t2,      3    // num_bytes_read
248    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
249    b               3f
2501:
251    addi.w          t3,      t7,      -48
252    srl.d           t3,      t3,      t3   // pad with ones
253    b               4f
2542:
255    bgeu            t0,      t1,      1b
256    ld.d            t3,      t1,      -8   // next_bits
257    sub.w           t2,      t2,      t1
258    sub.w           t1,      t1,      t0   // num_bytes_left
259    slli.w          t2,      t2,      3
260    srl.d           t3,      t3,      t2
261    addi.w          t2,      t7,      -48
262    nor             t3,      t3,      t3
263    sub.w           t4,      zero,    t2
264    revb.d          t3,      t3
265    srli.w          t4,      t4,      3
266    srl.d           t3,      t3,      t2
267    sltu            t2,      t1,      t4
268    maskeqz         t1,      t1,      t2
269    masknez         t2,      t4,      t2
270    or              t2,      t2,      t1   // num_bytes_read
2713:
272    slli.w          t1,      t2,      3
273    add.d           t0,      t0,      t2
274    add.w           t7,      t7,      t1   // cnt += num_bits_read
275    st.d            t0,      a0,      0
2764:
277    or              t6,      t6,      t3   // dif |= next_bits
2789:
279    st.w            t7,      a0,      28   // store cnt
280    st.d            t6,      a0,      16   // store dif
281    move            a0,      t8
282endfunc
284function msac_decode_bool_adapt_lsx
285    ld.hu           a3,      a1,      0    // cdf[0] /f
286    ld.w            t0,      a0,      24   // rng
287    ld.d            t1,      a0,      16   // dif
288    srli.w          t2,      t0,      8    // r >> 8
289    srli.w          a7,      a3,      6
290    mul.w           t2,      t2,      a7
291    ld.w            a4,      a0,      32   // allow_update_cdf
292    ld.w            a5,      a0,      28   // cnt
293    srli.w          t2,      t2,      1
294    addi.w          t2,      t2,      4    // v
295    slli.d          t3,      t2,      48   // vw
296    sltu            t4,      t1,      t3
297    move            t8,      t4            // bit
298    xori            t4,      t4,      1
299    maskeqz         t6,      t3,      t4   // if (ret) vw
300    sub.d           t6,      t1,      t6   // dif
301    slli.w          t5,      t2,      1
302    sub.w           t5,      t0,      t5   // r - 2v
303    maskeqz         t7,      t5,      t4   // if (ret) r - 2v
304    add.w           t5,      t2,      t7   // v(rng)
305    beqz            a4,      .renorm
306
307    // update_cdf
308    ld.hu           t0,      a1,      2    // cdf[1]
309    srli.w          t1,      t0,      4
310    addi.w          t1,      t1,      4    // rate
311    sltui           t2,      t0,      32   // count < 32
312    add.w           t0,      t0,      t2   // count + (count < 32)
313    sub.w           a3,      a3,      t8   // cdf[0] -= bit
314    slli.w          t4,      t8,      15
315    sub.w           t7,      a3,      t4   // cdf[0] - bit - 32768
316    sra.w           t7,      t7,      t1   // (cdf[0] - bit - 32768) >> rate
317    sub.w           t7,      a3,      t7   // cdf[0]
318    st.h            t7,      a1,      0
319    st.h            t0,      a1,      2
320
321.renorm:
322    clz.w           t4,      t5            // d
323    xori            t4,      t4,      16   // d
324    sll.d           t6,      t6,      t4
325    sll.w           t5,      t5,      t4
326    sub.w           t7,      a5,      t4   // cnt-d
327    st.w            t5,      a0,      24   // store rng
328    bgeu            a5,      t4,      9f
329
330    // refill
331    ld.d            t0,      a0,      0    // buf_pos
332    ld.d            t1,      a0,      8    // buf_end
333    addi.d          t2,      t0,      8
334    bltu            t1,      t2,      2f
335
336    ld.d            t3,      t0,      0    // next_bits
337    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
338    nor             t3,      t3,      t3
339    sub.w           t2,      zero,    t1
340    revb.d          t3,      t3            // next_bits = bswap(next_bits)
341    srli.w          t2,      t2,      3    // num_bytes_read
342    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
343    b               3f
3441:
345    addi.w          t3,      t7,      -48
346    srl.d           t3,      t3,      t3   // pad with ones
347    b               4f
3482:
349    bgeu            t0,      t1,      1b
350    ld.d            t3,      t1,      -8   // next_bits
351    sub.w           t2,      t2,      t1
352    sub.w           t1,      t1,      t0   // num_bytes_left
353    slli.w          t2,      t2,      3
354    srl.d           t3,      t3,      t2
355    addi.w          t2,      t7,      -48
356    nor             t3,      t3,      t3
357    sub.w           t4,      zero,    t2
358    revb.d          t3,      t3
359    srli.w          t4,      t4,      3
360    srl.d           t3,      t3,      t2
361    sltu            t2,      t1,      t4
362    maskeqz         t1,      t1,      t2
363    masknez         t2,      t4,      t2
364    or              t2,      t2,      t1   // num_bytes_read
3653:
366    slli.w          t1,      t2,      3
367    add.d           t0,      t0,      t2
368    add.w           t7,      t7,      t1   // cnt += num_bits_read
369    st.d            t0,      a0,      0
3704:
371    or              t6,      t6,      t3   // dif |= next_bits
3729:
373    st.w            t7,      a0,      28   // store cnt
374    st.d            t6,      a0,      16   // store dif
375    move            a0,      t8
376endfunc