;******************************************************************************
;* SIMD optimized Opus encoder DSP function
;*
;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "config.asm"
%include "libavutil/x86/x86util.asm"

%ifdef __NASM_VER__
%use "smartalign"
ALIGNMODE p6
%endif

SECTION_RODATA 64

const_float_abs_mask:   times 8 dd 0x7fffffff
const_align_abs_edge:   times 8 dd 0

const_float_0_5:        times 8 dd 0.5
const_float_1:          times 8 dd 1.0
const_float_sign_mask:  times 8 dd 0x80000000

const_int32_offsets:
                        %rep 8
                                dd $-const_int32_offsets
                        %endrep

SECTION .text

;
; Setup a high register to be used
; for holding memory constants
;
; %1 - mov* opcode to be used
; %2 - the register to be used, assumes it is >= mm8
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the constant in the form
; "addps m0, mm_const_name" and it would be turned into:
; "addps m0, [const_name]" on 32 bit arch or
; "addps m0, m8" on 64 bit arch
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%if num_mmregs > 8
    %define  mm_%3   %2
    %{1}     %2, [%3]           ; movaps m8, [const_name]
%else
    %define  mm_%3  [%3]
%endif
%endmacro

;
; Set Position Independent Code
;     Base address of a constant
; %1 - the lea opcode to be used
; %2 - the register to be used, if PIC is set
; %3 - name of the constant.
;
; Subsequent opcodes are going to use the base address in the form
; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
; "movaps m0, [r5 + r4]" if PIC is enabled
; "movaps m0, [constant_name + r4]" if text relocations are used
%macro SET_PIC_BASE 3 ; lea_op, reg, const_label
%ifdef PIC
    %{1}     %2, [%3]           ; lea r5, [rip+const]
    %define  pic_base_%3 %2
%else
    %define  pic_base_%3 %3
%endif
%endmacro

%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
    addps    m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
    pxor     m1, m1             ; max_idx
    xorps    m3, m3             ; p_max
    xor      r4d, r4d
align 16
%%distortion_search:
    movd     xm2, dword r4d     ; movd zero extends
%ifidn %1,add
    movaps   m4, [tmpY + r4]    ; y[i]
    movaps   m5, [tmpX + r4]    ; X[i]

  %if USE_APPROXIMATION == 1
    xorps    m0, m0
    cmpps    m0, m0, m5, 4      ; m0 = (X[i] != 0.0)
  %endif

    addps    m4, m6             ; m4 = Syy_new = y[i] + Syy_norm
    addps    m5, m7             ; m5 = Sxy_new = X[i] + Sxy_norm

  %if USE_APPROXIMATION == 1
    andps    m5, m0             ; if (X[i] == 0) Sxy_new = 0; prevents the approximation error from setting pulses in the array padding.
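    ; In the zeroed array padding (and any lane where X[i] == 0) this forces
    ; Sxy_new, and therefore p, to exactly 0, so the rsqrtps approximation
    ; error below can never make such a lane win the max search.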
  %endif

%else
    movaps   m5, [tmpY + r4]    ; m5 = y[i]

    xorps    m0, m0             ; m0 = 0;
    cmpps    m0, m0, m5, 1      ; m0 = (0 < y[i])

    subps    m4, m6, m5         ; m4 = Syy_new = Syy_norm - y[i]
    subps    m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
    andps    m5, m0             ; (0 < y[i]) ? m5 : 0
%endif

%if USE_APPROXIMATION == 1
    rsqrtps  m4, m4
    mulps    m5, m4             ; m5 = p = Sxy_new*approx(1/sqrt(Syy))
%else
    mulps    m5, m5
    divps    m5, m4             ; m5 = p = Sxy_new*Sxy_new/Syy
%endif
    VPBROADCASTD m2, xm2        ; m2 = i (all lanes get the same value; the per-lane offsets are added later)

    cmpps    m0, m3, m5, 1      ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
    maxps    m3, m5             ; m3 = max(p_max, p)
                                ; maxps here is faster than blendvps, despite blend having lower latency.

    pand     m2, m0             ; this version seems faster than sse41 pblendvb
    pmaxsw   m1, m2             ; SSE2 signed word, so it works for N < 32768/4

    add      r4d, mmsize
    cmp      r4d, Nd
    jb       %%distortion_search

    por      m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
    movdqa   m4, m1             ; needed for the aligned y[max_idx] += 1; processing

%if mmsize >= 32
; Merge parallel maximums round 8 (4 vs 4)

    vextractf128 xm5, ym3, 1    ; xmm5 = ymm3[1x128] = ymm3[255..128b]
    cmpps    xm0, xm3, xm5, 1   ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )

    vextracti128 xm2, ym1, 1    ; xmm2 = ymm1[1x128] = ymm1[255..128b]
    BLENDVPS xm3, xm5, xm0      ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
    PBLENDVB xm1, xm2, xm0      ; p       = m0 ? p[1x128]       : p[0x128]
%endif

; Merge parallel maximums round 4 (2 vs 2)
                                ; m3 = p[3210]
    movhlps  xm5, xm3           ; m5 = p[xx32]
    cmpps    xm0, xm3, xm5, 1   ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )

    pshufd   xm2, xm1, q3232
    BLENDVPS xm3, xm5, xm0      ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
    PBLENDVB xm1, xm2, xm0      ; p       = m0 ? p[3,2]       : p[1,0]

; Merge parallel maximums final round (1 vs 1)
    shufps   xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
    cmpss    xm0, xm3, 5        ; m0 = !(p[1] < p[0]) = ( p[1] >= p[0] )

    pshufd   xm2, xm1, q1111
    PBLENDVB xm1, xm2, xm0

    movd     dword r4d, xm1     ; zero extends to the rest of r4q

    VBROADCASTSS m3, [tmpX + r4]
    %{1}ps   m7, m3             ; Sxy += X[max_idx]

    VBROADCASTSS m5, [tmpY + r4]
    %{1}ps   m6, m5             ; Syy += Y[max_idx]

    ; We have to update a single element in Y[i].
    ; However, writing 4 bytes and then doing a 16 byte load in the inner loop
    ; could cause a stall due to breaking write forwarding.
    VPBROADCASTD m1, xm1
    pcmpeqd  m1, m1, m4         ; exactly one element matches max_idx and this finds it

    and      r4d, ~(mmsize-1)   ; align the address down, so the value pointed to by max_idx is inside a mmsize load
    movaps   m5, [tmpY + r4]    ; m5 = Y[y3...ym...y0]
    andps    m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
    %{1}ps   m5, m1             ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
    movaps   [tmpY + r4], m5    ; Y[max_idx] +-= 1.0;
%endmacro

;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal
;
%ifdef PIC
  %define num_pic_regs 1
%else
  %define num_pic_regs 0
%endif

;
; Pyramid Vector Quantization Search implementation
;
; float * inX   - Unaligned (SIMD) access; it will be overread,
;                 but the extra data is masked away.
; int32 * outY  - Should be an aligned and padded buffer.
;                 It is used as a temp buffer.
; uint32 K      - Number of pulses to have after quantization.
; uint32 N      - Number of vector elements. Must be 0 < N < 256
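;
; For orientation, a rough C-level outline of what the kernel below computes,
; as illustrative pseudocode only (the variable names here are ad hoc and are
; not taken from the C reference code):
;
;   Sx = sum(fabsf(X[i]));                        // first pass: L1 norm
;   res = K / Sx;
;   y[i] = lrintf(res * fabsf(X[i]));             // initial integer guess
;   pulses = sum(y[i]); Sxy = sum(fabsf(X[i])*y[i]); Syy = sum(y[i]*y[i]);
;   while (pulses != K) {                         // greedy, one pulse per iteration:
;       // adding a pulse at i turns Sxy into Sxy + fabsf(X[i]) and
;       // Syy into Syy + 2*y[i] + 1; removing one gives Sxy - fabsf(X[i])
;       // and Syy - 2*y[i] + 1; pick the i maximizing p = Sxy_new*Sxy_new/Syy_new
;       y[best_i] += 1;                           // or -= 1 while pulses > K
;   }
;   outY[i] = (int)copysignf(y[i], X[i]);         // second pass restores the signs
;   return Syy;                                   // Syy = sum(y[i]*y[i])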
;
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq

    movaps   m0, [const_float_abs_mask]
    shl      Nd, 2              ; N *= sizeof(float); the 32 bit operation also zeroes the high 32 bits in 64 bit mode.
    mov      r4d, Nd

    neg      r4d
    and      r4d, mmsize-1

    SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
    movups   m2, [pic_base_const_align_abs_edge + r4 - mmsize]

    add      Nd, r4d            ; N = align(N, mmsize)

    lea      r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
    movups   m1, [inXq + r4]
    andps    m1, m2
    movaps   [tmpX + r4], m1    ; Sx = abs( X[N-1] )

align 16
%%loop_abs_sum:
    sub      r4d, mmsize
    jc       %%end_loop_abs_sum

    movups   m2, [inXq + r4]
    andps    m2, m0

    movaps   [tmpX + r4], m2    ; tmpX[i] = abs( X[i] )
    addps    m1, m2             ; Sx += abs( X[i] )
    jmp      %%loop_abs_sum

align 16
%%end_loop_abs_sum:

    HSUMPS   m1, m2             ; m1 = Sx

    xorps    m0, m0
    comiss   xm0, xm1
    jz       %%zero_input       ; if (Sx == 0) goto zero_input

    cvtsi2ss xm0, dword Kd      ; m0 = K
%if USE_APPROXIMATION == 1
    rcpss    xm1, xm1           ; m1 = approx(1/Sx)
    mulss    xm0, xm1           ; m0 = K*(1/Sx)
%else
    divss    xm0, xm1           ; b = K/Sx
                                ; b = K/max_x
%endif

    VBROADCASTSS m0, xm0

    lea      r4d, [Nd - mmsize]
    pxor     m5, m5             ; Sy  ( Sum of abs( y[i] ) )
    xorps    m6, m6             ; Syy ( Sum of y[i]*y[i]   )
    xorps    m7, m7             ; Sxy ( Sum of X[i]*y[i]   )
align 16
%%loop_guess:
    movaps   m1, [tmpX + r4]    ; m1   = X[i]
    mulps    m2, m0, m1         ; m2   = res*X[i]
    cvtps2dq m2, m2             ; yt   = (int)lrintf( res*X[i] )
    paddd    m5, m2             ; Sy  += yt
    cvtdq2ps m2, m2             ; yt   = (float)yt
    mulps    m1, m2             ; m1   = X[i]*yt
    movaps   [tmpY + r4], m2    ; y[i] = m2
    addps    m7, m1             ; Sxy += m1;
    mulps    m2, m2             ; m2   = yt*yt
    addps    m6, m2             ; Syy += m2

    sub      r4d, mmsize
    jnc      %%loop_guess

    HSUMPS   m6, m1             ; Syy_norm
    HADDD    m5, m4             ; pulses

    movd     dword r4d, xm5     ; zero extends to the rest of r4q

    sub      Kd, r4d            ; K -= pulses; the 32 bit operation also zeroes the high 32 bits in 64 bit mode.
    jz       %%finish           ; K - pulses == 0

    SET_HI_REG_MM_CONSTANT movaps, m8,  const_float_0_5
    SET_HI_REG_MM_CONSTANT movaps, m9,  const_float_1
    SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
    ; Use Syy/2 in the distortion parameter calculations.
    ; This saves the pre- and post-calculation needed to correct the Y[] values.
    ; The precision is the same, since the float mantissa is normalized.
    ; The SQRT approximation does differ.
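    ; With Syy_norm = Syy/2, adding a pulse at i changes it by
    ; ((y+1)^2 - y^2)/2 = y[i] + 0.5, and removing one changes it by
    ; ((y-1)^2 - y^2)/2 = -(y[i] - 0.5); that is exactly the per-iteration
    ; +0.5 plus the +/- y[i] from tmpY that PULSES_SEARCH applies. The common
    ; factor of 2 in p is the same in every lane, so the argmax is unchanged.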
    HSUMPS   m7, m0             ; Sxy_norm
    mulps    m6, mm_const_float_0_5

    jc       %%remove_pulses_loop ; K - pulses < 0

align 16                        ; K - pulses > 0
%%add_pulses_loop:

    PULSES_SEARCH add           ; m6 Syy_norm ; m7 Sxy_norm

    sub      Kd, 1
    jnz      %%add_pulses_loop

    addps    m6, m6             ; Syy *= 2

    jmp      %%finish

align 16
%%remove_pulses_loop:

    PULSES_SEARCH sub           ; m6 Syy_norm ; m7 Sxy_norm

    add      Kd, 1
    jnz      %%remove_pulses_loop

    addps    m6, m6             ; Syy *= 2

align 16
%%finish:
    lea      r4d, [Nd - mmsize]
    movaps   m2, [const_float_sign_mask]

align 16
%%restore_sign_loop:
    movaps   m0, [tmpY + r4]    ; m0 = Y[i]
    movups   m1, [inXq + r4]    ; m1 = X[i]
    andps    m1, m2             ; m1 = sign(X[i])
    orps     m0, m1             ; m0 = Y[i]*sign
    cvtps2dq m3, m0             ; m3 = (int)m0
    movaps   [outYq + r4], m3

    sub      r4d, mmsize
    jnc      %%restore_sign_loop
%%return:

%if ARCH_X86_64 == 0            ; sbrdsp
    movss    r0m, xm6           ; return (float)Syy_norm
    fld      dword r0m
%else
    movaps   m0, m6             ; return (float)Syy_norm
%endif

    RET

align 16
%%zero_input:
    lea      r4d, [Nd - mmsize]
    xorps    m0, m0
%%zero_loop:
    movaps   [outYq + r4], m0
    sub      r4d, mmsize
    jnc      %%zero_loop

    movaps   m6, [const_float_1]
    jmp      %%return
%endmacro

; If 1, use a float op that gives half precision but executes in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full precision code about 2% slower.
; Opus also uses an rsqrt approximation in its intrinsics code.
%define USE_APPROXIMATION 1

INIT_XMM sse2
PVQ_FAST_SEARCH _approx

INIT_XMM sse4
PVQ_FAST_SEARCH _approx

%define USE_APPROXIMATION 0

INIT_XMM avx
PVQ_FAST_SEARCH _exact
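
; For reference, the per-lane distortion metric computed by the variants above,
; written as illustrative C-like pseudocode:
;
;   p = Sxy_new * approx_rsqrt(Syy_new);    /* _approx: rsqrtps, ~half precision */
;   p = (Sxy_new * Sxy_new) / Syy_new;      /* _exact:  mulps + divps            */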