;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%define private_prefix av1

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_1: times 8 dw 1

SECTION .text

;-----------------------------------------------------------------------------
; QUANTIZE_FP name, num_gprs
;
; Emits quantize_<name> (symbol prefix supplied by private_prefix / x86inc).
; Syntax: NASM + x86inc abstraction layer; x86-64 only (uses m9..m14).
;
; Named arguments, in order (none are auto-loaded; cglobal is told "0 args"):
;   coeff, ncoeff, skip, zbin, round, quant, shift,
;   qcoeff, dqcoeff, dequant, eob, scan, iscan
;
; What the code does:
;   * skip != 0 : zero qcoeff[] and dqcoeff[], store 0 to *eob.
;   * otherwise, per 16-bit lane:
;       q          = sign(c) * (((|c| + round) * quant) >> 16)
;       qcoeff[i]  = q
;       dqcoeff[i] = q * dequant                      (fp variant)
;       dqcoeff[i] = sign(c) * ((|q| * dequant) >> 1) (fp_32x32 variant)
;     After the first 16 coefficients, a group of 16 is written as all
;     zeroes when no |c| in it exceeds dequant>>1 (dequant>>2 for fp_32x32).
;     *eob = max(iscan[i] + 1) over lanes whose dequantized value is nonzero.
;   * fp_32x32 additionally uses round = (round + 1) >> 1 and quant <<= 1.
;
; Register roles in the main path:
;   r0 = coeff end, r1 = -remaining coeffs, r3 = qcoeff end,
;   r4 = dqcoeff end, r5 = iscan end, r2/r6 = scratch
;   m0 = skip threshold, m1 = round, m2 = quant, m3 = dequant,
;   m5 = zero, m7 = all-ones (used as -1), m8 = running eob maximum
;
; Assumptions (not checkable from this file - confirm against callers):
;   * coefficient, qcoeff, dqcoeff and iscan entries are 16 bits wide
;     (all addressing is index*2 and all arithmetic is word-wise);
;   * ncoeff is a positive multiple of 16 (each step handles 2 vectors);
;   * all vector-accessed buffers satisfy mova's alignment requirement;
;   * round/quant/dequant hold the first-coefficient value in the low lanes
;     and the value for every other coefficient in the high qword.
;   zbin, shift and scan are accepted but their contents are never read.
;-----------------------------------------------------------------------------
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan
  cmp                 dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn           coeffq, coeffmp
  movifnidn           ncoeffq, ncoeffmp
  mov                 r2, dequantmp           ; skip already tested, r2 is free
  movifnidn           zbinq, zbinmp           ; loaded, but zbin is never read
  movifnidn           roundq, roundmp
  movifnidn           quantq, quantmp
  mova                m1, [roundq]            ; m1 = round
  mova                m2, [quantq]            ; m2 = quant
%ifidn %1, fp_32x32
  pcmpeqw             m5, m5
  psrlw               m5, 15                  ; m5 = 1 in every word
  paddw               m1, m5
  psrlw               m1, 1                   ; m1 = (round + 1) / 2
%endif
  mova                m3, [r2q]               ; m3 = dequant
  mov                 r3, qcoeffmp            ; round/quant vectors are loaded,
  mov                 r4, dqcoeffmp           ; so r3..r5 can be reused as the
  mov                 r5, iscanmp             ; output / iscan pointers
%ifidn %1, fp_32x32
  psllw               m2, 1                   ; m2 = quant * 2
%endif
  pxor                m5, m5                  ; m5 = dedicated zero

  ; Point every array at its end and count a negative index up towards zero.
  lea                 coeffq, [coeffq+ncoeffq*2]
  lea                 r5q, [r5q+ncoeffq*2]
  lea                 r3q, [r3q+ncoeffq*2]
  lea                 r4q, [r4q+ncoeffq*2]
  neg                 ncoeffq

  ; get DC and first 15 AC coeffs
  mova                m9, [coeffq+ncoeffq*2+ 0]  ; m9  = c[i]
  mova                m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw               m6, m9                  ; m6  = abs(m9)
  pabsw               m11, m10                ; m11 = abs(m10)
  pcmpeqw             m7, m7                  ; m7 = all ones (-1 per word)

  paddsw              m6, m1                  ; m6 += round
  punpckhqdq          m1, m1                  ; high qword -> both halves
  paddsw              m11, m1                 ; m11 += round
  pmulhw              m8, m6, m2              ; m8 = m6*q>>16
  punpckhqdq          m2, m2                  ; high qword -> both halves
  pmulhw              m13, m11, m2            ; m13 = m11*q>>16
  psignw              m8, m9                  ; m8 = reinsert sign
  psignw              m13, m10                ; m13 = reinsert sign
  mova                [r3q+ncoeffq*2+ 0], m8  ; store qcoeff
  mova                [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw               m8, m8                  ; work on magnitudes so the
  pabsw               m13, m13                ; >>1 below is a logical shift
%endif
  pmullw              m8, m3                  ; r4[i] = r3[i] * dequant
  punpckhqdq          m3, m3                  ; high qword -> both halves
  pmullw              m13, m3                 ; r4[i] = r3[i] * dequant
%ifidn %1, fp_32x32
  psrlw               m8, 1
  psrlw               m13, 1
  psignw              m8, m9                  ; reinsert sign after the shift
  psignw              m13, m10
  psrlw               m0, m3, 2               ; m0 = skip threshold, dequant/4
%else
  psrlw               m0, m3, 1               ; m0 = skip threshold, dequant/2
%endif
  mova                [r4q+ncoeffq*2+ 0], m8  ; store dqcoeff
  mova                [r4q+ncoeffq*2+16], m13
  pcmpeqw             m8, m5                  ; m8  = (dqcoeff == 0) mask
  pcmpeqw             m13, m5                 ; m13 = (dqcoeff == 0) mask
  mova                m6, [r5q+ncoeffq*2+ 0]  ; m6  = iscan[i]
  mova                m11, [r5q+ncoeffq*2+16] ; m11 = iscan[i]
  psubw               m6, m7                  ; m6  = iscan[i] + 1
  psubw               m11, m7                 ; m11 = iscan[i] + 1
  pandn               m8, m6                  ; keep iscan+1 on nonzero lanes
  pandn               m13, m11
  pmaxsw              m8, m13                 ; m8 = running max(eob)
  add                 ncoeffq, mmsize         ; 16 coefficients consumed
  jz .accumulate_eob

.ac_only_loop:
  mova                m9, [coeffq+ncoeffq*2+ 0]  ; m9  = c[i]
  mova                m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw               m6, m9                  ; m6  = abs(m9)
  pabsw               m11, m10                ; m11 = abs(m10)

  ; If none of the 16 magnitudes exceeds the threshold, emit zeroes directly.
  pcmpgtw             m7, m6, m0
  pcmpgtw             m12, m11, m0
  pmovmskb            r6d, m7
  pmovmskb            r2d, m12

  or                  r6, r2
  jz .skip_iter

  pcmpeqw             m7, m7                  ; restore m7 = all ones

  paddsw              m6, m1                  ; m6 += round
  paddsw              m11, m1                 ; m11 += round
  pmulhw              m14, m6, m2             ; m14 = m6*q>>16
  pmulhw              m13, m11, m2            ; m13 = m11*q>>16
  psignw              m14, m9                 ; m14 = reinsert sign
  psignw              m13, m10                ; m13 = reinsert sign
  mova                [r3q+ncoeffq*2+ 0], m14 ; store qcoeff
  mova                [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw               m14, m14
  pabsw               m13, m13
%endif
  pmullw              m14, m3                 ; r4[i] = r3[i] * dequant
  pmullw              m13, m3                 ; r4[i] = r3[i] * dequant
%ifidn %1, fp_32x32
  psrlw               m14, 1
  psrlw               m13, 1
  psignw              m14, m9
  psignw              m13, m10
%endif
  mova                [r4q+ncoeffq*2+ 0], m14 ; store dqcoeff
  mova                [r4q+ncoeffq*2+16], m13
  pcmpeqw             m14, m5                 ; m14 = (dqcoeff == 0) mask
  pcmpeqw             m13, m5                 ; m13 = (dqcoeff == 0) mask
  mova                m6, [r5q+ncoeffq*2+ 0]  ; m6  = iscan[i]
  mova                m11, [r5q+ncoeffq*2+16] ; m11 = iscan[i]
  psubw               m6, m7                  ; m6  = iscan[i] + 1
  psubw               m11, m7                 ; m11 = iscan[i] + 1
  pandn               m14, m6                 ; keep iscan+1 on nonzero lanes
  pandn               m13, m11
  pmaxsw              m8, m14                 ; fold into running max(eob)
  pmaxsw              m8, m13
  add                 ncoeffq, mmsize
  jl .ac_only_loop

  jmp .accumulate_eob
.skip_iter:
  mova                [r3q+ncoeffq*2+ 0], m5  ; qcoeff  = 0
  mova                [r3q+ncoeffq*2+16], m5
  mova                [r4q+ncoeffq*2+ 0], m5  ; dqcoeff = 0
  mova                [r4q+ncoeffq*2+16], m5
  add                 ncoeffq, mmsize
  jl .ac_only_loop

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                 r2, eobmp
  pshufd              m7, m8, 0xe             ; high qword -> low qword
  pmaxsw              m8, m7
  pshuflw             m7, m8, 0xe             ; words 2,3 -> words 0,1
  pmaxsw              m8, m7
  pshuflw             m7, m8, 0x1             ; word 1 -> word 0
  pmaxsw              m8, m7
  pextrw              r6, m8, 0
  ; Store exactly one word. The .blank path writes the same destination with
  ; "mov word", so a full-register store here would overrun it by 6 bytes.
  mov                 [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                 r0, dqcoeffmp
  movifnidn           ncoeffq, ncoeffmp
  mov                 r2, qcoeffmp
  mov                 r3, eobmp

  lea                 r0q, [r0q+ncoeffq*2]
  lea                 r2q, [r2q+ncoeffq*2]
  neg                 ncoeffq
  pxor                m7, m7
.blank_loop:
  mova                [r0q+ncoeffq*2+ 0], m7  ; dqcoeff = 0
  mova                [r0q+ncoeffq*2+16], m7
  mova                [r2q+ncoeffq*2+ 0], m7  ; qcoeff  = 0
  mova                [r2q+ncoeffq*2+16], m7
  add                 ncoeffq, mmsize
  jl .blank_loop
  mov                 word [r3q], 0           ; *eob = 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FP fp, 7
QUANTIZE_FP fp_32x32, 7