;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;------------------------------------------------------------------------------
; QUANTIZE_FN suffix, num_gprs
;
;   Parameter 1 (suffix)   : appended to "quantize_" to form the function
;                            name; the value b_32x32 selects the 32x32
;                            variant of the arithmetic (see below).
;   Parameter 2 (num_gprs) : general-purpose register count handed to cglobal.
;
; Syntax: NASM with the x86inc abstraction layer (x86-64 only: registers up
; to m14 are used).  cglobal is invoked with 0 auto-loaded arguments, so each
; named argument is fetched through its "<name>mp" alias when it is needed.
;
; Arguments, as they are used by the code below:
;   coeff    in   coefficients.  16-bit words, or 32-bit dwords that are
;                 packed to words with signed saturation when
;                 CONFIG_VP9_HIGHBITDEPTH is set.
;   ncoeff   in   number of coefficients; consumed 16 at a time.
;   skip     in   non-zero: zero qcoeff/dqcoeff, store eob = 0 and return.
;   zbin, round, quant, shift, dequant
;            in   each is loaded as one 16-byte vector of words.  All eight
;                 lanes apply to coefficients 0..7; afterwards the upper
;                 qword is duplicated (punpckhqdq) and used for every
;                 remaining coefficient.
;                 NOTE(review): presumably lane 0 is the DC value and the
;                 other lanes the AC value - confirm against the caller.
;   qcoeff   out  quantized coefficients (same element width as coeff).
;   dqcoeff  out  dequantized coefficients (same element width as coeff).
;   eob      out  16-bit word: max of iscan[i] + 1 over all lanes whose
;                 dequantized value is non-zero, or 0 if there is none.
;   scan     in   never referenced by this code.
;   iscan    in   16-bit words, one per coefficient.
;
; Per-coefficient arithmetic (16-bit lanes):
;   mask = abs(c) > zbin - 1
;   tmp  = saturating_add(abs(c), round)
;   qc   = sign(c) * (((((tmp * quant) >> 16) + tmp) * shift) >> 16) & mask
;   dqc  = qc * dequant                        (low 16 bits)
; In the b_32x32 variant zbin and round are replaced by (x + 1) >> 1, shift
; is doubled, and dqc = sign(c) * ((abs(qc) * dequant) >> 1).
;
; Vector register roles:
;   m0 = zbin - 1      m1 = round       m2 = quant      m3 = dequant
;   m4 = shift (holds -1 in every lane while the constants are set up)
;   m5 = zero          m9/m10 = c[i]    m7/m12 = zbin masks
;   m8 = running eob maximum (generic path)    m6/m11/m13/m14 = temporaries
;
; Assumptions visible in the code:
;   * All loads/stores use mova, x86inc's aligned move, so every buffer
;     presumably has to be suitably aligned (the zero fill stores 32 bytes
;     at a time through ymm registers) - TODO confirm with callers.
;   * The generic path always processes at least two groups of 16
;     coefficients: it enters .ac_only_loop without testing the counter.
;   * The skip path is a do-while loop: it always clears at least one group
;     of 16 coefficients.
;------------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan

  vzeroupper

  ; If we can skip this block, then just zero the output
  cmp         skipmp, 0
  jne         .blank

%ifnidn %1, b_32x32

  ; Special case for ncoeff == 16, as it is frequent and we can save on
  ; not setting up a loop.
  cmp         ncoeffmp, 16
  jne         .generic

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Special case of ncoeff == 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.single:

  movifnidn   coeffq, coeffmp
  movifnidn   zbinq, zbinmp
  mova        m0, [zbinq]                     ; m0 = zbin

  ; Get DC and first 15 AC coeffs - in this special case, that is all.
%if CONFIG_VP9_HIGHBITDEPTH
  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
  mova        m9, [coeffq]
  packssdw    m9, [coeffq+16]                 ; m9 = c[i]
  mova        m10, [coeffq+32]
  packssdw    m10, [coeffq+48]                ; m10 = c[i]
%else
  mova        m9, [coeffq]                    ; m9 = c[i]
  mova        m10, [coeffq+16]                ; m10 = c[i]
%endif

  mov         r0, eobmp                       ; Output pointer
  mov         r1, qcoeffmp                    ; Output pointer
  mov         r2, dqcoeffmp                   ; Output pointer

  pxor        m5, m5                          ; m5 = dedicated zero

  pcmpeqw     m4, m4                          ; All word lanes -1
  paddw       m0, m4                          ; m0 = zbin - 1

  pabsw       m6, m9                          ; m6 = abs(m9)
  pabsw       m11, m10                        ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                      ; m7 = c[i] >= zbin
  punpckhqdq  m0, m0                          ; upper qword of zbin-1 for the rest
  pcmpgtw     m12, m11, m0                    ; m12 = c[i] >= zbin

  ; Check if all coeffs are less than zbin. If yes, we just write zeros
  ; to the outputs and we are done.
  por         m14, m7, m12
  ptest       m14, m14
  jnz         .single_nonzero

  ; 32-byte stores through ymm5.
  ; NOTE(review): relies on the pxor above also clearing the upper half of
  ; ymm5 (true for a VEX-encoded pxor) - confirm x86inc emits the VEX form.
%if CONFIG_VP9_HIGHBITDEPTH
  mova        [r1   ], ymm5
  mova        [r1+32], ymm5
  mova        [r2   ], ymm5
  mova        [r2+32], ymm5
%else
  mova        [r1], ymm5
  mova        [r2], ymm5
%endif
  mov         [r0], word 0                    ; eob = 0

  vzeroupper
  RET

.single_nonzero:

  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
  movifnidn   r4, roundmp
  movifnidn   r5, quantmp
  mov         r3, dequantmp
  mov         r6, shiftmp
  mova        m1, [r4]                        ; m1 = round
  mova        m2, [r5]                        ; m2 = quant
  mova        m3, [r3]                        ; m3 = dequant
  mova        m4, [r6]                        ; m4 = shift

  mov         r3, iscanmp

  ; r0..r3 now hold eob, qcoeff, dqcoeff and iscan; rename them accordingly.
  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  paddsw      m6, m1                          ; m6 += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                         ; m11 += round
  pmulhw      m8, m6, m2                      ; m8 = m6*q>>16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2                    ; m13 = m11*q>>16
  paddw       m8, m6                          ; m8 += m6
  paddw       m13, m11                        ; m13 += m11
  pmulhw      m8, m4                          ; m8 = m8*qsh>>16
  punpckhqdq  m4, m4
  pmulhw      m13, m4                         ; m13 = m13*qsh>>16
  psignw      m8, m9                          ; m8 = reinsert sign
  psignw      m13, m10                        ; m13 = reinsert sign
  pand        m8, m7                          ; zero the lanes below zbin
  pand        m13, m12

%if CONFIG_VP9_HIGHBITDEPTH
  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff.
  ; Upper four words: interleave with their sign mask (0 > x).
  ; Lower four words: pmovsxwd.
  pcmpgtw     m6, m5, m8
  punpckhwd   m6, m8, m6
  pmovsxwd    m11, m8
  mova        [qcoeffq   ], m11
  mova        [qcoeffq+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [qcoeffq+32], m11
  mova        [qcoeffq+48], m6
%else
  mova        [qcoeffq   ], m8
  mova        [qcoeffq+16], m13
%endif

  pmullw      m8, m3                          ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3
  pmullw      m13, m3                         ; dqc[i] = qc[i] * q

%if CONFIG_VP9_HIGHBITDEPTH
  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  pcmpgtw     m6, m5, m8
  punpckhwd   m6, m8, m6
  pmovsxwd    m11, m8
  mova        [dqcoeffq   ], m11
  mova        [dqcoeffq+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [dqcoeffq+32], m11
  mova        [dqcoeffq+48], m6
%else
  mova        [dqcoeffq   ], m8
  mova        [dqcoeffq+16], m13
%endif

  mova        m6, [iscanq]                    ; m6 = scan[i]
  mova        m11, [iscanq+16]                ; m11 = scan[i]

  pcmpeqw     m8, m8, m5                      ; m8 = dqc[i] == 0
  pcmpeqw     m13, m13, m5                    ; m13 = dqc[i] == 0
  psubw       m6, m6, m7                      ; m6 = scan[i] + 1 (masked lanes)
  psubw       m11, m11, m12                   ; m11 = scan[i] + 1 (masked lanes)
  pandn       m8, m8, m6                      ; m8 = max(eob)
  pandn       m13, m13, m11                   ; m13 = max(eob)
  pmaxsw      m8, m8, m13

  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  movq        rax, m8
  mov         [eobq], ax

  vzeroupper
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of ncoeff != 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.generic:

%endif ; %ifnidn %1, b_32x32

DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
            qcoeff, dqcoeff, dequant, eob, scan, iscan

  ; Actual quantization loop - setup pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp
  movifnidn   zbinq, zbinmp
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  mova        m0, [zbinq]                     ; m0 = zbin
  mova        m1, [roundq]                    ; m1 = round
  mova        m2, [quantq]                    ; m2 = quant
  mova        m3, [r2]                        ; m3 = dequant
  pcmpeqw     m4, m4                          ; All lanes -1
%ifidn %1, b_32x32
  psubw       m0, m4                          ; m0 = zbin + 1
  psubw       m1, m4                          ; m1 = round + 1
  psrlw       m0, 1                           ; m0 = (m0 + 1) / 2
  psrlw       m1, 1                           ; m1 = (m1 + 1) / 2
%endif
  paddw       m0, m4                          ; m0 = m0 - 1 (m4 is all -1)

  mov         r2, shiftmp
  mov         r3, qcoeffmp
  mova        m4, [r2]                        ; m4 = shift
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, b_32x32
  psllw       m4, 1                           ; m4 = shift * 2
%endif
  pxor        m5, m5                          ; m5 = dedicated zero

  ; r3/r4/r5 now hold qcoeff, dqcoeff and iscan; d1..d5 are placeholders.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob

  ; Point every array at its end and count ncoeff up from -ncoeff to zero,
  ; so a single add both advances the index and sets the loop flags.
%if CONFIG_VP9_HIGHBITDEPTH
  lea         coeffq, [  coeffq+ncoeffq*4]
  lea         qcoeffq, [ qcoeffq+ncoeffq*4]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*4]
%else
  lea         coeffq, [  coeffq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
%endif
  lea         iscanq, [  iscanq+ncoeffq*2]
  neg         ncoeffq

  ; get DC and first 15 AC coeffs
%if CONFIG_VP9_HIGHBITDEPTH
  ; coeff stored as 32bit numbers & require 16bit numbers
  mova        m9, [coeffq+ncoeffq*4+ 0]
  packssdw    m9, [coeffq+ncoeffq*4+16]
  mova        m10, [coeffq+ncoeffq*4+32]
  packssdw    m10, [coeffq+ncoeffq*4+48]
%else
  mova        m9, [coeffq+ncoeffq*2+ 0]       ; m9 = c[i]
  mova        m10, [coeffq+ncoeffq*2+16]      ; m10 = c[i]
%endif

  pabsw       m6, m9                          ; m6 = abs(m9)
  pabsw       m11, m10                        ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                      ; m7 = c[i] >= zbin
  punpckhqdq  m0, m0                          ; upper qword of zbin-1 from here on
  pcmpgtw     m12, m11, m0                    ; m12 = c[i] >= zbin

  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
  por         m14, m7, m12
  ptest       m14, m14
  jnz         .first_nonzero

%if CONFIG_VP9_HIGHBITDEPTH
  mova        [qcoeffq+ncoeffq*4   ], ymm5
  mova        [qcoeffq+ncoeffq*4+32], ymm5
  mova        [dqcoeffq+ncoeffq*4   ], ymm5
  mova        [dqcoeffq+ncoeffq*4+32], ymm5
%else
  mova        [qcoeffq+ncoeffq*2], ymm5
  mova        [dqcoeffq+ncoeffq*2], ymm5
%endif

  add         ncoeffq, mmsize

  ; Switch the remaining constants to their upper qword for the loop, and
  ; start the running eob maximum at zero.
  punpckhqdq  m1, m1
  punpckhqdq  m2, m2
  punpckhqdq  m3, m3
  punpckhqdq  m4, m4
  pxor        m8, m8

  jmp         .ac_only_loop

.first_nonzero:

  paddsw      m6, m1                          ; m6 += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                         ; m11 += round
  pmulhw      m8, m6, m2                      ; m8 = m6*q>>16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2                    ; m13 = m11*q>>16
  paddw       m8, m6                          ; m8 += m6
  paddw       m13, m11                        ; m13 += m11
  pmulhw      m8, m4                          ; m8 = m8*qsh>>16
  punpckhqdq  m4, m4
  pmulhw      m13, m4                         ; m13 = m13*qsh>>16
  psignw      m8, m9                          ; m8 = reinsert sign
  psignw      m13, m10                        ; m13 = reinsert sign
  pand        m8, m7                          ; zero the lanes below zbin
  pand        m13, m12

%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  pcmpgtw     m6, m5, m8
  punpckhwd   m6, m8, m6
  pmovsxwd    m11, m8
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
%else
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%endif

%ifidn %1, b_32x32
  ; 32x32: multiply magnitudes, halve, then restore the sign afterwards.
  pabsw       m8, m8
  pabsw       m13, m13
%endif
  pmullw      m8, m3                          ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3
  pmullw      m13, m3                         ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m8, 1
  psrlw       m13, 1
  psignw      m8, m9
  psignw      m13, m10
%endif

%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  pcmpgtw     m6, m5, m8
  punpckhwd   m6, m8, m6
  pmovsxwd    m11, m8
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6
%else
  mova        [dqcoeffq+ncoeffq*2+ 0], m8
  mova        [dqcoeffq+ncoeffq*2+16], m13
%endif

  pcmpeqw     m8, m5                          ; m8 = dqc[i] == 0
  pcmpeqw     m13, m5                         ; m13 = dqc[i] == 0
  mova        m6, [iscanq+ncoeffq*2]          ; m6 = scan[i]
  mova        m11, [iscanq+ncoeffq*2+16]      ; m11 = scan[i]
  psubw       m6, m7                          ; m6 = scan[i] + 1 (masked lanes)
  psubw       m11, m12                        ; m11 = scan[i] + 1 (masked lanes)
  pandn       m8, m6                          ; m8 = max(eob)
  pandn       m13, m11                        ; m13 = max(eob)
  pmaxsw      m8, m13
  add         ncoeffq, mmsize

  ; Entered without testing ncoeffq: at least one more group of 16
  ; coefficients is expected here.
.ac_only_loop:

%if CONFIG_VP9_HIGHBITDEPTH
  ; pack coeff from 32bit to 16bit array
  mova        m9, [coeffq+ncoeffq*4+ 0]
  packssdw    m9, [coeffq+ncoeffq*4+16]
  mova        m10, [coeffq+ncoeffq*4+32]
  packssdw    m10, [coeffq+ncoeffq*4+48]
%else
  mova        m9, [coeffq+ncoeffq*2+ 0]       ; m9 = c[i]
  mova        m10, [coeffq+ncoeffq*2+16]      ; m10 = c[i]
%endif

  pabsw       m6, m9                          ; m6 = abs(m9)
  pabsw       m11, m10                        ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                      ; m7 = c[i] >= zbin
  pcmpgtw     m12, m11, m0                    ; m12 = c[i] >= zbin

  ; Check if all coeffs are less than zbin. If yes, skip this iteration
  ; and just write zeros, as that is what the result would be.
  por         m14, m7, m12
  ptest       m14, m14
  jnz         .rest_nonzero

%if CONFIG_VP9_HIGHBITDEPTH
  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
  mova        [qcoeffq+ncoeffq*4+32], ymm5
  mova        [dqcoeffq+ncoeffq*4+ 0], ymm5
  mova        [dqcoeffq+ncoeffq*4+32], ymm5
%else
  mova        [qcoeffq+ncoeffq*2+ 0], ymm5
  mova        [dqcoeffq+ncoeffq*2+ 0], ymm5
%endif
  add         ncoeffq, mmsize
  jnz         .ac_only_loop

  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  movq        rax, m8
  mov         [r2], ax
  vzeroupper
  RET

.rest_nonzero:
  ; m8 carries the running eob maximum, so m14 is the work register here.
  paddsw      m6, m1                          ; m6 += round
  paddsw      m11, m1                         ; m11 += round
  pmulhw      m14, m6, m2                     ; m14 = m6*q>>16
  pmulhw      m13, m11, m2                    ; m13 = m11*q>>16
  paddw       m14, m6                         ; m14 += m6
  paddw       m13, m11                        ; m13 += m11
  pmulhw      m14, m4                         ; m14 = m14*qsh>>16
  pmulhw      m13, m4                         ; m13 = m13*qsh>>16
  psignw      m14, m9                         ; m14 = reinsert sign
  psignw      m13, m10                        ; m13 = reinsert sign
  pand        m14, m7                         ; zero the lanes below zbin
  pand        m13, m12

%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  pcmpgtw     m6, m5, m14
  punpckhwd   m6, m14, m6
  pmovsxwd    m11, m14
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
%else
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%endif

%ifidn %1, b_32x32
  ; 32x32: multiply magnitudes, halve, then restore the sign afterwards.
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                         ; dqc[i] = qc[i] * q
  pmullw      m13, m3                         ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m14, 1
  psrlw       m13, 1
  psignw      m14, m9
  psignw      m13, m10
%endif

%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  pcmpgtw     m6, m5, m14
  punpckhwd   m6, m14, m6
  pmovsxwd    m11, m14
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pcmpgtw     m6, m5, m13
  punpckhwd   m6, m13, m6
  pmovsxwd    m11, m13
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6
%else
  mova        [dqcoeffq+ncoeffq*2+ 0], m14
  mova        [dqcoeffq+ncoeffq*2+16], m13
%endif

  pcmpeqw     m14, m5                         ; m14 = dqc[i] == 0
  pcmpeqw     m13, m5                         ; m13 = dqc[i] == 0
  mova        m6, [iscanq+ncoeffq*2+ 0]       ; m6 = scan[i]
  mova        m11, [iscanq+ncoeffq*2+16]      ; m11 = scan[i]
  psubw       m6, m7                          ; m6 = scan[i] + 1 (masked lanes)
  psubw       m11, m12                        ; m11 = scan[i] + 1 (masked lanes)
  pandn       m14, m6                         ; m14 = max(eob)
  pandn       m13, m11                        ; m13 = max(eob)
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jnz         .ac_only_loop

  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  movq        rax, m8
  mov         [r2], ax
  vzeroupper
  RET

  ; Skip-block, i.e. just write all zeroes
.blank:

DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
            qcoeff, dqcoeff, dequant, eob, scan, iscan

  mov         r0, dqcoeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, qcoeffmp
  mov         r3, eobmp

  ; r0..r3 now hold dqcoeff, ncoeff, qcoeff and eob.
DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob

  ; Same end-pointer / negative-index scheme as the quantization loop.
%if CONFIG_VP9_HIGHBITDEPTH
  lea         dqcoeffq, [dqcoeffq+ncoeffq*4]
  lea         qcoeffq, [ qcoeffq+ncoeffq*4]
%else
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
%endif

  neg         ncoeffq
  pxor        m7, m7                          ; m7 = zero

  ; Each pass clears 16 coefficients in both output arrays.
.blank_loop:
%if CONFIG_VP9_HIGHBITDEPTH
  mova        [dqcoeffq+ncoeffq*4+ 0], ymm7
  mova        [dqcoeffq+ncoeffq*4+32], ymm7
  mova        [qcoeffq+ncoeffq*4+ 0], ymm7
  mova        [qcoeffq+ncoeffq*4+32], ymm7
%else
  mova        [dqcoeffq+ncoeffq*2+ 0], ymm7
  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
%endif
  add         ncoeffq, mmsize
  jl          .blank_loop                     ; loop while the index is negative

  mov         [eobq], word 0                  ; eob = 0

  vzeroupper
  RET
%endmacro

INIT_XMM avx
QUANTIZE_FN b, 7
QUANTIZE_FN b_32x32, 7

END