;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_1: times 8 dw 1

SECTION .text

;------------------------------------------------------------------------------
; QUANTIZE_FN suffix, num_gprs
;
; Emits quantize_<suffix> through x86inc's cglobal.  Named arguments, in
; order: coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff,
; dequant, zbin_oq, eob, scan, iscan.  scan is declared but never read.
;
; What the generated function does:
;   skip != 0 : zero-fills qcoeff and dqcoeff and stores 0 to eob.
;   skip == 0 : for every 16-bit coefficient c, with t = sat(abs(c) + round),
;       pass    = abs(c) > zbin + zbin_oq - 1
;       q       = (((t * quant) >> 16) + t) * shift >> 16
;       qcoeff  = pass ? sign(c) * q : 0
;       dqcoeff = low 16 bits of qcoeff * dequant
;       eob     = max over lanes with dqcoeff != 0 of (iscan + pass)
;
; The first 8 coefficients use the zbin/round/quant/shift/dequant vectors as
; loaded; every later group of 8 uses the upper four words of each vector
; duplicated into both halves (presumably lane 0 = DC, other lanes = AC;
; confirm against the table layout used by the callers).
;
; The b_32x32 variant halves zbin and round with rounding, doubles shift,
; halves abs(qcoeff) * dequant before restoring the sign, and skips the
; arithmetic for any group of 16 coefficients in which no lane passes zbin.
;
; 16 coefficients are handled per iteration with no tail handling, and the
; first 16 are processed unconditionally, so ncoeff is presumably a positive
; multiple of 16.  All vector accesses use mova, so the buffers are presumably
; 16-byte aligned (confirm against callers).
;
; eob is stored as a 16-bit value on both exits.  The non-skip exit used to
; store the full-width register, which wrote beyond the 16 bits that the skip
; exit treats as the size of eob.
;------------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp         dword skipm, 0
  jne         .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp                   ; skip already tested; r2 is free
  movifnidn   zbinq, zbinmp
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  movd        m4, dword zbin_oqm              ; m4 = zbin_oq
  mova        m0, [zbinq]                     ; m0 = zbin
  punpcklwd   m4, m4
  mova        m1, [roundq]                    ; m1 = round
  pshufd      m4, m4, 0                       ; broadcast zbin_oq to all words
  mova        m2, [quantq]                    ; m2 = quant
  paddw       m0, m4                          ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
  pcmpeqw     m5, m5
  psrlw       m5, 15                          ; m5 = 1 in every word
  paddw       m0, m5
  paddw       m1, m5
  psrlw       m0, 1                           ; m0 = (m0 + 1) / 2
  psrlw       m1, 1                           ; m1 = (m1 + 1) / 2
%endif
  mova        m3, [r2q]                       ; m3 = dequant
  psubw       m0, [pw_1]                      ; so that "> m0" means ">= zbin"
  mov         r2, shiftmp
  mov         r3, qcoeffmp
  mova        m4, [r2]                        ; m4 = shift
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, b_32x32
  psllw       m4, 1
%endif
  pxor        m5, m5                          ; m5 = dedicated zero
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; point every buffer at its end and count ncoeff up from -n to 0
  lea         coeffq, [  coeffq+ncoeffq*2]
  lea         iscanq, [  iscanq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg         ncoeffq

  ; get DC and first 15 AC coeffs
  mova        m9, [  coeffq+ncoeffq*2+ 0]     ; m9 = c[i]
  mova        m10, [  coeffq+ncoeffq*2+16]    ; m10 = c[i]
  pabsw       m6, m9                          ; m6 = abs(m9)
  pabsw       m11, m10                        ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                      ; m7 = c[i] >= zbin
  punpckhqdq  m0, m0                          ; keep only the upper-half zbin
  pcmpgtw     m12, m11, m0                    ; m12 = c[i] >= zbin
  paddsw      m6, m1                          ; m6 += round
  punpckhqdq  m1, m1                          ; keep only the upper-half round
  paddsw      m11, m1                         ; m11 += round
  pmulhw      m8, m6, m2                      ; m8 = m6*q>>16
  punpckhqdq  m2, m2                          ; keep only the upper-half quant
  pmulhw      m13, m11, m2                    ; m13 = m11*q>>16
  paddw       m8, m6                          ; m8 += m6
  paddw       m13, m11                        ; m13 += m11
  pmulhw      m8, m4                          ; m8 = m8*qsh>>16
  punpckhqdq  m4, m4                          ; keep only the upper-half shift
  pmulhw      m13, m4                         ; m13 = m13*qsh>>16
  psignw      m8, m9                          ; m8 = reinsert sign
  psignw      m13, m10                        ; m13 = reinsert sign
  pand        m8, m7                          ; zero lanes that failed zbin
  pand        m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw       m8, m8
  pabsw       m13, m13
%endif
  pmullw      m8, m3                          ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3                          ; keep only the upper-half dequant
  pmullw      m13, m3                         ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m8, 1                           ; halve the magnitude ...
  psrlw       m13, 1
  psignw      m8, m9                          ; ... then restore the sign
  psignw      m13, m10
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m8
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m8, m5                          ; m8 = (dqc[i] == 0)
  pcmpeqw     m13, m5                         ; m13 = (dqc[i] == 0)
  mova        m6, [  iscanq+ncoeffq*2+ 0]     ; m6 = iscan[i]
  mova        m11, [  iscanq+ncoeffq*2+16]    ; m11 = iscan[i]
  psubw       m6, m7                          ; m6 = iscan[i] + 1 where zbin passed
  psubw       m11, m12                        ; m11 = iscan[i] + 1 where zbin passed
  pandn       m8, m6                          ; m8 = eob candidate, 0 if dqc == 0
  pandn       m13, m11                        ; m13 = eob candidate, 0 if dqc == 0
  pmaxsw      m8, m13                         ; m8 = running per-lane max
  add         ncoeffq, mmsize
  jz          .accumulate_eob

.ac_only_loop:
  mova        m9, [  coeffq+ncoeffq*2+ 0]     ; m9 = c[i]
  mova        m10, [  coeffq+ncoeffq*2+16]    ; m10 = c[i]
  pabsw       m6, m9                          ; m6 = abs(m9)
  pabsw       m11, m10                        ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                      ; m7 = c[i] >= zbin
  pcmpgtw     m12, m11, m0                    ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; nothing passes zbin in these 16 lanes: just store zeros
  pmovmskb    r6, m7
  pmovmskb    r2, m12
  or          r6, r2
  jz          .skip_iter
%endif
  paddsw      m6, m1                          ; m6 += round
  paddsw      m11, m1                         ; m11 += round
  pmulhw      m14, m6, m2                     ; m14 = m6*q>>16
  pmulhw      m13, m11, m2                    ; m13 = m11*q>>16
  paddw       m14, m6                         ; m14 += m6
  paddw       m13, m11                        ; m13 += m11
  pmulhw      m14, m4                         ; m14 = m14*qsh>>16
  pmulhw      m13, m4                         ; m13 = m13*qsh>>16
  psignw      m14, m9                         ; m14 = reinsert sign
  psignw      m13, m10                        ; m13 = reinsert sign
  pand        m14, m7                         ; zero lanes that failed zbin
  pand        m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                         ; dqc[i] = qc[i] * q
  pmullw      m13, m3                         ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m14, 1                          ; halve the magnitude ...
  psrlw       m13, 1
  psignw      m14, m9                         ; ... then restore the sign
  psignw      m13, m10
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m14
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m14, m5                         ; m14 = (dqc[i] == 0)
  pcmpeqw     m13, m5                         ; m13 = (dqc[i] == 0)
  mova        m6, [  iscanq+ncoeffq*2+ 0]     ; m6 = iscan[i]
  mova        m11, [  iscanq+ncoeffq*2+16]    ; m11 = iscan[i]
  psubw       m6, m7                          ; m6 = iscan[i] + 1 where zbin passed
  psubw       m11, m12                        ; m11 = iscan[i] + 1 where zbin passed
  pandn       m14, m6                         ; m14 = eob candidate, 0 if dqc == 0
  pandn       m13, m11                        ; m13 = eob candidate, 0 if dqc == 0
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl          .ac_only_loop

%ifidn %1, b_32x32
  jmp         .accumulate_eob
.skip_iter:
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova        [dqcoeffq+ncoeffq*2+ 0], m5
  mova        [dqcoeffq+ncoeffq*2+16], m5
  add         ncoeffq, mmsize
  jl          .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  pextrw      r6, m8, 0
  mov         [r2], r6w                       ; 16-bit store, same width as .blank
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov         r0, dqcoeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, qcoeffmp
  mov         r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
  neg         ncoeffq
  pxor        m7, m7
.blank_loop:
  mova        [dqcoeffq+ncoeffq*2+ 0], m7
  mova        [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add         ncoeffq, mmsize
  jl          .blank_loop
  mov         word [eobq], 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FN b, 7
QUANTIZE_FN b_32x32, 7

;------------------------------------------------------------------------------
; QUANTIZE_FP suffix, num_gprs
;
; Emits quantize_<suffix> with the same argument list as QUANTIZE_FN.  zbin,
; shift, zbin_oq and scan are never dereferenced by this macro.
;
; What the generated function does:
;   skip != 0 : zero-fills qcoeff and dqcoeff and stores 0 to eob.
;   skip == 0 : for every 16-bit coefficient c, with t = sat(abs(c) + round),
;       qcoeff  = sign(c) * ((t * quant) >> 16)
;       dqcoeff = low 16 bits of qcoeff * dequant
;       eob     = max over lanes with dqcoeff != 0 of (iscan + 1)
;
; As in QUANTIZE_FN, the first 8 coefficients use the round/quant/dequant
; vectors as loaded and later groups use the duplicated upper four words.
;
; The fp_32x32 variant halves round with rounding, doubles quant, halves
; abs(qcoeff) * dequant before restoring the sign, and writes zeros for any
; group of 16 coefficients (after the first) in which no abs(c) exceeds
; the upper-half dequant shifted right by 2.
;
; eob is stored as a 16-bit value on both exits (see QUANTIZE_FN).
;------------------------------------------------------------------------------
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp         dword skipm, 0
  jne         .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp                   ; skip already tested; r2 is free
  movifnidn   zbinq, zbinmp                   ; loaded but not used afterwards
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  mova        m1, [roundq]                    ; m1 = round
  mova        m2, [quantq]                    ; m2 = quant
%ifidn %1, fp_32x32
  pcmpeqw     m5, m5
  psrlw       m5, 15                          ; m5 = 1 in every word
  paddw       m1, m5
  psrlw       m1, 1                           ; m1 = (m1 + 1) / 2
%endif
  mova        m3, [r2q]                       ; m3 = dequant
  mov         r3, qcoeffmp
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, fp_32x32
  psllw       m2, 1
%endif
  pxor        m5, m5                          ; m5 = dedicated zero
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; point every buffer at its end and count ncoeff up from -n to 0
  lea         coeffq, [  coeffq+ncoeffq*2]
  lea         iscanq, [  iscanq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg         ncoeffq

  ; get DC and first 15 AC coeffs
  mova        m9, [  coeffq+ncoeffq*2+ 0]     ; m9 = c[i]
  mova        m10, [  coeffq+ncoeffq*2+16]    ; m10 = c[i]
  pabsw       m6, m9                          ; m6 = abs(m9)
  pabsw       m11, m10                        ; m11 = abs(m10)
  pcmpeqw     m7, m7                          ; m7 = -1 in every word

  paddsw      m6, m1                          ; m6 += round
  punpckhqdq  m1, m1                          ; keep only the upper-half round
  paddsw      m11, m1                         ; m11 += round
  pmulhw      m8, m6, m2                      ; m8 = m6*q>>16
  punpckhqdq  m2, m2                          ; keep only the upper-half quant
  pmulhw      m13, m11, m2                    ; m13 = m11*q>>16
  psignw      m8, m9                          ; m8 = reinsert sign
  psignw      m13, m10                        ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw       m8, m8
  pabsw       m13, m13
%endif
  pmullw      m8, m3                          ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3                          ; keep only the upper-half dequant
  pmullw      m13, m3                         ; dqc[i] = qc[i] * q
%ifidn %1, fp_32x32
  psrlw       m8, 1                           ; halve the magnitude ...
  psrlw       m13, 1
  psignw      m8, m9                          ; ... then restore the sign
  psignw      m13, m10
  psrlw       m0, m3, 2                       ; m0 = dequant >> 2, loop skip threshold
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m8
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m8, m5                          ; m8 = (dqc[i] == 0)
  pcmpeqw     m13, m5                         ; m13 = (dqc[i] == 0)
  mova        m6, [  iscanq+ncoeffq*2+ 0]     ; m6 = iscan[i]
  mova        m11, [  iscanq+ncoeffq*2+16]    ; m11 = iscan[i]
  psubw       m6, m7                          ; m6 = iscan[i] + 1
  psubw       m11, m7                         ; m11 = iscan[i] + 1
  pandn       m8, m6                          ; m8 = eob candidate, 0 if dqc == 0
  pandn       m13, m11                        ; m13 = eob candidate, 0 if dqc == 0
  pmaxsw      m8, m13                         ; m8 = running per-lane max
  add         ncoeffq, mmsize
  jz          .accumulate_eob

.ac_only_loop:
  mova        m9, [  coeffq+ncoeffq*2+ 0]     ; m9 = c[i]
  mova        m10, [  coeffq+ncoeffq*2+16]    ; m10 = c[i]
  pabsw       m6, m9                          ; m6 = abs(m9)
  pabsw       m11, m10                        ; m11 = abs(m10)
%ifidn %1, fp_32x32
  ; no abs(c[i]) above the threshold in these 16 lanes: just store zeros
  pcmpgtw     m7, m6, m0
  pcmpgtw     m12, m11, m0
  pmovmskb    r6, m7
  pmovmskb    r2, m12

  or          r6, r2
  jz          .skip_iter
%endif
  pcmpeqw     m7, m7                          ; m7 = -1 in every word

  paddsw      m6, m1                          ; m6 += round
  paddsw      m11, m1                         ; m11 += round
  pmulhw      m14, m6, m2                     ; m14 = m6*q>>16
  pmulhw      m13, m11, m2                    ; m13 = m11*q>>16
  psignw      m14, m9                         ; m14 = reinsert sign
  psignw      m13, m10                        ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                         ; dqc[i] = qc[i] * q
  pmullw      m13, m3                         ; dqc[i] = qc[i] * q
%ifidn %1, fp_32x32
  psrlw       m14, 1                          ; halve the magnitude ...
  psrlw       m13, 1
  psignw      m14, m9                         ; ... then restore the sign
  psignw      m13, m10
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m14
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m14, m5                         ; m14 = (dqc[i] == 0)
  pcmpeqw     m13, m5                         ; m13 = (dqc[i] == 0)
  mova        m6, [  iscanq+ncoeffq*2+ 0]     ; m6 = iscan[i]
  mova        m11, [  iscanq+ncoeffq*2+16]    ; m11 = iscan[i]
  psubw       m6, m7                          ; m6 = iscan[i] + 1
  psubw       m11, m7                         ; m11 = iscan[i] + 1
  pandn       m14, m6                         ; m14 = eob candidate, 0 if dqc == 0
  pandn       m13, m11                        ; m13 = eob candidate, 0 if dqc == 0
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl          .ac_only_loop

%ifidn %1, fp_32x32
  jmp         .accumulate_eob
.skip_iter:
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova        [dqcoeffq+ncoeffq*2+ 0], m5
  mova        [dqcoeffq+ncoeffq*2+16], m5
  add         ncoeffq, mmsize
  jl          .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  pextrw      r6, m8, 0
  mov         [r2], r6w                       ; 16-bit store, same width as .blank
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov         r0, dqcoeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, qcoeffmp
  mov         r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
  neg         ncoeffq
  pxor        m7, m7
.blank_loop:
  mova        [dqcoeffq+ncoeffq*2+ 0], m7
  mova        [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add         ncoeffq, mmsize
  jl          .blank_loop
  mov         word [eobq], 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FP fp, 7
QUANTIZE_FP fp_32x32, 7