/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */ 27 28#include "loongson_asm.S" 29 30const min_prob 31 .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 32endconst 33 34.macro decode_symbol_adapt w 35 addi.d sp, sp, -48 36 addi.d a4, a0, 24 37 vldrepl.h vr0, a4, 0 //rng 38 fst.s f0, sp, 0 //val==0 39 vld vr1, a1, 0 //cdf 40.if \w == 16 41 li.w t4, 16 42 vldx vr11, a1, t4 43.endif 44 addi.d a6, a0, 16 45 vldrepl.d vr2, a6, 0 //dif 46 addi.d t0, a0, 32 47 ld.w t1, t0, 0 //allow_update_cdf 48 la.local t2, min_prob 49 addi.d t2, t2, 32 50 addi.w t3, a2, 1 51 slli.w t3, t3, 1 52 sub.d t2, t2, t3 53 vld vr3, t2, 0 //min_prob 54.if \w == 16 55 vldx vr13, t2, t4 56.endif 57 vsrli.h vr4, vr0, 8 //r = s->rng >> 8 58 vslli.h vr4, vr4, 8 //r << 8 59 vsrli.h vr5, vr1, 6 60 vslli.h vr5, vr5, 7 61.if \w == 16 62 vsrli.h vr15, vr11, 6 63 vslli.h vr15, vr15, 7 64.endif 65 vmuh.hu vr5, vr4, vr5 66 vadd.h vr5, vr5, vr3 //v 67.if \w == 16 68 vmuh.hu vr15, vr4, vr15 69 vadd.h vr15, vr15, vr13 70.endif 71 addi.d t8, sp, 4 72 vst vr5, t8, 0 //store v 73.if \w == 16 74 vstx vr15, t8, t4 75.endif 76 vreplvei.h vr20, vr2, 3 //c 77 vssub.hu vr6, vr5, vr20 //c >=v 78 vseqi.h vr6, vr6, 0 79.if \w == 16 80 vssub.hu vr16, vr15, vr20 //c >=v 81 vseqi.h vr16, vr16, 0 82 vpickev.b vr21, vr16, vr6 83.endif 84.if \w <= 8 85 vmskltz.h vr10, vr6 86.else 87 vmskltz.b vr10, vr21 88.endif 89 beqz t1, .renorm\()\w 90 91 // update_cdf 92 alsl.d t1, a2, a1, 1 93 ld.h t2, t1, 0 //count 94 srli.w t3, t2, 4 //count >> 4 95 addi.w t3, t3, 4 96 li.w t5, 2 97 sltu t5, t5, a2 98 add.w t3, t3, t5 //rate 99 sltui t5, t2, 32 100 add.w t2, t2, t5 //count + (count < 32) 101 vreplgr2vr.h vr9, t3 102 vseq.h vr7, vr7, vr7 103 vavgr.hu vr5, vr6, vr7 //i >= val ? 
-1 : 32768 104 vsub.h vr5, vr5, vr1 105 vsub.h vr8, vr1, vr6 106.if \w == 16 107 vavgr.hu vr15, vr16, vr7 108 vsub.h vr15, vr15, vr11 109 vsub.h vr18, vr11, vr16 110.endif 111 vsra.h vr5, vr5, vr9 112 vadd.h vr8, vr8, vr5 113.if \w == 4 114 fst.d f8, a1, 0 115.else 116 vst vr8, a1, 0 117.endif 118.if \w == 16 119 vsra.h vr15, vr15, vr9 120 vadd.h vr18, vr18, vr15 121 vstx vr18, a1, t4 122.endif 123 st.h t2, t1, 0 124 125.renorm\()\w: 126 vpickve2gr.h t3, vr10, 0 127 ctz.w a7, t3 // ret 128 alsl.d t3, a7, t8, 1 129 ld.hu t4, t3, 0 // v 130 addi.d t3, t3, -2 131 ld.hu t5, t3, 0 // u 132 sub.w t5, t5, t4 // rng 133 slli.d t4, t4, 48 134 vpickve2gr.d t6, vr2, 0 135 sub.d t6, t6, t4 // dif 136 clz.w t4, t5 // d 137 xori t4, t4, 16 // d 138 sll.d t6, t6, t4 139 addi.d a5, a0, 28 // cnt 140 ld.w t0, a5, 0 141 sll.w t5, t5, t4 142 sub.w t7, t0, t4 // cnt-d 143 st.w t5, a4, 0 // store rng 144 bgeu t0, t4, 9f 145 146 // refill 147 ld.d t0, a0, 0 // buf_pos 148 ld.d t1, a0, 8 // buf_end 149 addi.d t2, t0, 8 150 bltu t1, t2, 2f 151 152 ld.d t3, t0, 0 // next_bits 153 addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) 154 nor t3, t3, t3 155 sub.w t2, zero, t1 156 revb.d t3, t3 // next_bits = bswap(next_bits) 157 srli.w t2, t2, 3 // num_bytes_read 158 srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) 159 b 3f 1601: 161 addi.w t3, t7, -48 162 srl.d t3, t3, t3 // pad with ones 163 b 4f 1642: 165 bgeu t0, t1, 1b 166 ld.d t3, t1, -8 // next_bits 167 sub.w t2, t2, t1 168 sub.w t1, t1, t0 // num_bytes_left 169 slli.w t2, t2, 3 170 srl.d t3, t3, t2 171 addi.w t2, t7, -48 172 nor t3, t3, t3 173 sub.w t4, zero, t2 174 revb.d t3, t3 175 srli.w t4, t4, 3 176 srl.d t3, t3, t2 177 sltu t2, t1, t4 178 maskeqz t1, t1, t2 179 masknez t2, t4, t2 180 or t2, t2, t1 // num_bytes_read 1813: 182 slli.w t1, t2, 3 183 add.d t0, t0, t2 184 add.w t7, t7, t1 // cnt += num_bits_read 185 st.d t0, a0, 0 1864: 187 or t6, t6, t3 // dif |= next_bits 1889: 189 st.w t7, a5, 0 // store cnt 190 st.d t6, a6, 0 // 
store dif 191 move a0, a7 192 addi.d sp, sp, 48 193.endm 194 195function msac_decode_symbol_adapt4_lsx 196 decode_symbol_adapt 4 197endfunc 198 199function msac_decode_symbol_adapt8_lsx 200 decode_symbol_adapt 8 201endfunc 202 203function msac_decode_symbol_adapt16_lsx 204 decode_symbol_adapt 16 205endfunc 206 207function msac_decode_bool_lsx 208 ld.w t0, a0, 24 // rng 209 srli.w a1, a1, 6 210 ld.d t1, a0, 16 // dif 211 srli.w t2, t0, 8 // r >> 8 212 mul.w t2, t2, a1 213 ld.w a5, a0, 28 // cnt 214 srli.w t2, t2, 1 215 addi.w t2, t2, 4 // v 216 slli.d t3, t2, 48 // vw 217 sltu t4, t1, t3 218 move t8, t4 // ret 219 xori t4, t4, 1 220 maskeqz t6, t3, t4 // if (ret) vw 221 sub.d t6, t1, t6 // dif 222 slli.w t5, t2, 1 223 sub.w t5, t0, t5 // r - 2v 224 maskeqz t7, t5, t4 // if (ret) r - 2v 225 add.w t5, t2, t7 // v(rng) 226 227 // renorm 228 clz.w t4, t5 // d 229 xori t4, t4, 16 // d 230 sll.d t6, t6, t4 231 sll.w t5, t5, t4 232 sub.w t7, a5, t4 // cnt-d 233 st.w t5, a0, 24 // store rng 234 bgeu a5, t4, 9f 235 236 // refill 237 ld.d t0, a0, 0 // buf_pos 238 ld.d t1, a0, 8 // buf_end 239 addi.d t2, t0, 8 240 bltu t1, t2, 2f 241 242 ld.d t3, t0, 0 // next_bits 243 addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) 244 nor t3, t3, t3 245 sub.w t2, zero, t1 246 revb.d t3, t3 // next_bits = bswap(next_bits) 247 srli.w t2, t2, 3 // num_bytes_read 248 srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) 249 b 3f 2501: 251 addi.w t3, t7, -48 252 srl.d t3, t3, t3 // pad with ones 253 b 4f 2542: 255 bgeu t0, t1, 1b 256 ld.d t3, t1, -8 // next_bits 257 sub.w t2, t2, t1 258 sub.w t1, t1, t0 // num_bytes_left 259 slli.w t2, t2, 3 260 srl.d t3, t3, t2 261 addi.w t2, t7, -48 262 nor t3, t3, t3 263 sub.w t4, zero, t2 264 revb.d t3, t3 265 srli.w t4, t4, 3 266 srl.d t3, t3, t2 267 sltu t2, t1, t4 268 maskeqz t1, t1, t2 269 masknez t2, t4, t2 270 or t2, t2, t1 // num_bytes_read 2713: 272 slli.w t1, t2, 3 273 add.d t0, t0, t2 274 add.w t7, t7, t1 // cnt += num_bits_read 275 st.d t0, a0, 0 
2764: 277 or t6, t6, t3 // dif |= next_bits 2789: 279 st.w t7, a0, 28 // store cnt 280 st.d t6, a0, 16 // store dif 281 move a0, t8 282endfunc 283 284function msac_decode_bool_adapt_lsx 285 ld.hu a3, a1, 0 // cdf[0] /f 286 ld.w t0, a0, 24 // rng 287 ld.d t1, a0, 16 // dif 288 srli.w t2, t0, 8 // r >> 8 289 srli.w a7, a3, 6 290 mul.w t2, t2, a7 291 ld.w a4, a0, 32 // allow_update_cdf 292 ld.w a5, a0, 28 // cnt 293 srli.w t2, t2, 1 294 addi.w t2, t2, 4 // v 295 slli.d t3, t2, 48 // vw 296 sltu t4, t1, t3 297 move t8, t4 // bit 298 xori t4, t4, 1 299 maskeqz t6, t3, t4 // if (ret) vw 300 sub.d t6, t1, t6 // dif 301 slli.w t5, t2, 1 302 sub.w t5, t0, t5 // r - 2v 303 maskeqz t7, t5, t4 // if (ret) r - 2v 304 add.w t5, t2, t7 // v(rng) 305 beqz a4, .renorm 306 307 // update_cdf 308 ld.hu t0, a1, 2 // cdf[1] 309 srli.w t1, t0, 4 310 addi.w t1, t1, 4 // rate 311 sltui t2, t0, 32 // count < 32 312 add.w t0, t0, t2 // count + (count < 32) 313 sub.w a3, a3, t8 // cdf[0] -= bit 314 slli.w t4, t8, 15 315 sub.w t7, a3, t4 // cdf[0] - bit - 32768 316 sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate 317 sub.w t7, a3, t7 // cdf[0] 318 st.h t7, a1, 0 319 st.h t0, a1, 2 320 321.renorm: 322 clz.w t4, t5 // d 323 xori t4, t4, 16 // d 324 sll.d t6, t6, t4 325 sll.w t5, t5, t4 326 sub.w t7, a5, t4 // cnt-d 327 st.w t5, a0, 24 // store rng 328 bgeu a5, t4, 9f 329 330 // refill 331 ld.d t0, a0, 0 // buf_pos 332 ld.d t1, a0, 8 // buf_end 333 addi.d t2, t0, 8 334 bltu t1, t2, 2f 335 336 ld.d t3, t0, 0 // next_bits 337 addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64) 338 nor t3, t3, t3 339 sub.w t2, zero, t1 340 revb.d t3, t3 // next_bits = bswap(next_bits) 341 srli.w t2, t2, 3 // num_bytes_read 342 srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63) 343 b 3f 3441: 345 addi.w t3, t7, -48 346 srl.d t3, t3, t3 // pad with ones 347 b 4f 3482: 349 bgeu t0, t1, 1b 350 ld.d t3, t1, -8 // next_bits 351 sub.w t2, t2, t1 352 sub.w t1, t1, t0 // num_bytes_left 353 slli.w t2, t2, 3 354 srl.d t3, 
t3, t2 355 addi.w t2, t7, -48 356 nor t3, t3, t3 357 sub.w t4, zero, t2 358 revb.d t3, t3 359 srli.w t4, t4, 3 360 srl.d t3, t3, t2 361 sltu t2, t1, t4 362 maskeqz t1, t1, t2 363 masknez t2, t4, t2 364 or t2, t2, t1 // num_bytes_read 3653: 366 slli.w t1, t2, 3 367 add.d t0, t0, t2 368 add.w t7, t7, t1 // cnt += num_bits_read 369 st.d t0, a0, 0 3704: 371 or t6, t6, t3 // dif |= next_bits 3729: 373 st.w t7, a0, 28 // store cnt 374 st.d t6, a0, 16 // store dif 375 move a0, t8 376endfunc 377