;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; Register roles shared by every kernel in this file
;   m0      running sum of absolute differences, four 32-bit lanes
;   m1..m4  ref data, then per-lane |src - ref|
;   m5, m7  scratch
;   m6      all zero; used to widen 16-bit lanes to 32 bits
;   n_rowsd remaining loop iterations
;-----------------------------------------------------------------------------

; HIGH_SAD_FN width, height, gprs, type [, xmm_regs]
;
; Emits the function header and the common pointer/stride setup.
;   Arg 1: Width
;   Arg 2: Height
;   Arg 3: Register layout selector, 5 or 7. With 7 the extra registers
;          src_stride3 / ref_stride3 are declared and loaded with 3 * stride.
;   Arg 4: Type of function: 0 = normal sad, 1 = avg, anything else = skip rows
;   Arg 5: Number of xmm registers (defaults to 7; the 8xh kernels pass 8)
;
; For the avg variant with the 7-register layout there is no named n_rows
; argument; n_rowsd is aliased to r7d when AOM_ARCH_X86_64 is set and to
; "dword r0m" otherwise.
%macro HIGH_SAD_FN 4-5 7
%if %4 == 1 ; avg
  %if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
  %else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
    %if AOM_ARCH_X86_64
      %define n_rowsd r7d
    %else ; x86-32
      %define n_rowsd dword r0m
    %endif ; x86-32/64
  %endif ; %3 == 5/7
%elif %4 == 0 ; normal sad
  %if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
  %else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
  %endif ; %3 == 5/7
%else ; skip rows
  %if %3 == 5
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
  %else ; %3 == 7
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                                         src_stride3, ref_stride3, n_rows
  %endif ; %3 == 5/7
%endif ; avg/sad/skip

%if %4 == 2 ; every other row is skipped, so step by two strides
    lea          src_strided, [src_strided*2]
    lea          ref_strided, [ref_strided*2]
%endif
    movsxdifnidn src_strideq, src_strided
    movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
    lea          src_stride3q, [src_strideq*3]
    lea          ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
    ; convert src, ref & second_pred to short ptrs (from byte ptrs)
    shl          srcq, 1
    shl          refq, 1
%if %4 == 1
    shl          second_predq, 1
%endif
%endmacro

; HIGH_SAD_BEGIN height, rows_per_iteration, type
;
; Loads the loop counter and clears the accumulator (m0) and the zero
; register (m6). The skip-row variant (type 2) visits half as many rows, so
; its iteration count is halved.
%macro HIGH_SAD_BEGIN 3
%if %3 == 2
    mov          n_rowsd, %1/(%2*2)
%else
    mov          n_rowsd, %1/%2
%endif
    pxor         m0, m0
    pxor         m6, m6
%endmacro

; HIGH_SAD_AVG4 type
;
; avg variant only (type 1): replaces m1..m4 with the rounded word average of
; themselves and the next four mmsize-byte chunks of second_pred, then moves
; second_pred past those chunks. Expands to nothing for other types.
%macro HIGH_SAD_AVG4 1
%if %1 == 1
    pavgw        m1, [second_predq+mmsize*0]
    pavgw        m2, [second_predq+mmsize*1]
    pavgw        m3, [second_predq+mmsize*2]
    pavgw        m4, [second_predq+mmsize*3]
    lea          second_predq, [second_predq+mmsize*4]
%endif
%endmacro

; HIGH_SAD_ABSDIFF_A reg, src_mem
;
; reg = |src_mem - reg| per unsigned 16-bit lane. The two saturating
; subtractions leave zero in whichever direction would go negative, so OR-ing
; them yields the absolute difference. src_mem is read with mova and as a
; direct psubusw memory operand. Clobbers m5.
; NOTE(review): presumably callers guarantee 16-byte aligned src rows for the
; widths using this form - confirm.
%macro HIGH_SAD_ABSDIFF_A 2
    mova         m5, %2
    psubusw      m5, %1
    psubusw      %1, %2
    por          %1, m5
%endmacro

; HIGH_SAD_ABSDIFF_U reg, src_mem
;
; Same result as HIGH_SAD_ABSDIFF_A, but src_mem is fetched once with movu
; and reg is copied to m7 first. Clobbers m5 and m7.
%macro HIGH_SAD_ABSDIFF_U 2
    mova         m7, %1
    movu         m5, %2
    psubusw      %1, m5
    psubusw      m5, m7
    por          %1, m5
%endmacro

; HIGH_SAD_FOLD4
;
; Collapses the four difference registers m1..m4 into m1 and m3:
;   m1 += m2, m3 += m4          (16-bit lanes)
;   upper half added onto lower (movhlps + paddw)
;   low four words zero-extended to 32 bits against m6
; Afterwards m1 and m3 are ready to be added to m0 with paddd.
; Clobbers m2 and m4.
%macro HIGH_SAD_FOLD4 0
    paddw        m1, m2
    paddw        m3, m4
    movhlps      m2, m1
    movhlps      m4, m3
    paddw        m1, m2
    paddw        m3, m4
    punpcklwd    m1, m6
    punpcklwd    m3, m6
%endmacro

; HIGH_SAD_FINISH type
;
; Horizontally adds the four 32-bit lanes of m0, doubles the total for the
; skip-row variant (type 2), and returns it in eax.
%macro HIGH_SAD_FINISH 1
    movhlps      m1, m0
    paddd        m0, m1
    punpckldq    m0, m6
    movhlps      m1, m0
    paddd        m0, m1
%if %1 == 2 ; we skipped rows, so we need to double the sad
    pslld        m0, 1
%endif
    movd         eax, m0
    RET
%endmacro

; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; One row per loop iteration, handled as two 64-byte halves.
%macro HIGH_SAD64XN 1-2 0
    HIGH_SAD_FN 64, %1, 5, %2
    HIGH_SAD_BEGIN %1, 1, %2

.loop:
    ; first half of each row
    movu         m1, [refq]
    movu         m2, [refq+16]
    movu         m3, [refq+32]
    movu         m4, [refq+48]
    HIGH_SAD_AVG4 %2
    HIGH_SAD_ABSDIFF_A m1, [srcq]
    HIGH_SAD_ABSDIFF_A m2, [srcq+16]
    HIGH_SAD_ABSDIFF_A m3, [srcq+32]
    HIGH_SAD_ABSDIFF_A m4, [srcq+48]
    HIGH_SAD_FOLD4
    paddd        m0, m1
    paddd        m0, m3
    ; second half of each row
    movu         m1, [refq+64]
    movu         m2, [refq+80]
    movu         m3, [refq+96]
    movu         m4, [refq+112]
    HIGH_SAD_AVG4 %2
    HIGH_SAD_ABSDIFF_A m1, [srcq+64]
    HIGH_SAD_ABSDIFF_A m2, [srcq+80]
    HIGH_SAD_ABSDIFF_A m3, [srcq+96]
    HIGH_SAD_ABSDIFF_A m4, [srcq+112]
    HIGH_SAD_FOLD4
    ; pointer advances are interleaved with the accumulation
    lea          refq, [refq+ref_strideq*2]
    paddd        m0, m1
    lea          srcq, [srcq+src_strideq*2]
    paddd        m0, m3

    dec          n_rowsd
    jg .loop

    HIGH_SAD_FINISH %2
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
%endif

; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; One 64-byte row per loop iteration.
%macro HIGH_SAD32XN 1-2 0
    HIGH_SAD_FN 32, %1, 5, %2
    HIGH_SAD_BEGIN %1, 1, %2

.loop:
    movu         m1, [refq]
    movu         m2, [refq+16]
    movu         m3, [refq+32]
    movu         m4, [refq+48]
    HIGH_SAD_AVG4 %2
    HIGH_SAD_ABSDIFF_A m1, [srcq]
    HIGH_SAD_ABSDIFF_A m2, [srcq+16]
    HIGH_SAD_ABSDIFF_A m3, [srcq+32]
    HIGH_SAD_ABSDIFF_A m4, [srcq+48]
    HIGH_SAD_FOLD4
    lea          refq, [refq+ref_strideq*2]
    paddd        m0, m1
    lea          srcq, [srcq+src_strideq*2]
    paddd        m0, m3
    dec          n_rowsd
    jg .loop

    HIGH_SAD_FINISH %2
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD32XN 8 ; highbd_sad32x8_sse2
HIGH_SAD32XN 8, 1 ; highbd_sad32x8_avg_sse2
%endif

; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; Two 32-byte rows per loop iteration: m1/m2 hold the first row, m3/m4 the
; row one stride further on.
%macro HIGH_SAD16XN 1-2 0
    HIGH_SAD_FN 16, %1, 5, %2
    HIGH_SAD_BEGIN %1, 2, %2

.loop:
    movu         m1, [refq]
    movu         m2, [refq+16]
    movu         m3, [refq+ref_strideq*2]
    movu         m4, [refq+ref_strideq*2+16]
    HIGH_SAD_AVG4 %2
    HIGH_SAD_ABSDIFF_A m1, [srcq]
    HIGH_SAD_ABSDIFF_A m2, [srcq+16]
    HIGH_SAD_ABSDIFF_A m3, [srcq+src_strideq*2]
    HIGH_SAD_ABSDIFF_A m4, [srcq+src_strideq*2+16]
    HIGH_SAD_FOLD4
    lea          refq, [refq+ref_strideq*4]
    paddd        m0, m1
    lea          srcq, [srcq+src_strideq*4]
    paddd        m0, m3
    dec          n_rowsd
    jg .loop

    HIGH_SAD_FINISH %2
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
HIGH_SAD16XN 4 ; highbd_sad16x4_sse2
HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
%endif

; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; Four 16-byte rows per loop iteration, one row per register. Both src and
; ref are loaded with movu, which is why this kernel needs m7 as an extra
; scratch register and asks HIGH_SAD_FN for 8 xmm registers.
%macro HIGH_SAD8XN 1-2 0
    HIGH_SAD_FN 8, %1, 7, %2, 8
    HIGH_SAD_BEGIN %1, 4, %2

.loop:
    movu         m1, [refq]
    movu         m2, [refq+ref_strideq*2]
    movu         m3, [refq+ref_strideq*4]
    movu         m4, [refq+ref_stride3q*2]
    HIGH_SAD_AVG4 %2

    HIGH_SAD_ABSDIFF_U m1, [srcq]
    HIGH_SAD_ABSDIFF_U m2, [srcq+src_strideq*2]
    HIGH_SAD_ABSDIFF_U m3, [srcq+src_strideq*4]
    HIGH_SAD_ABSDIFF_U m4, [srcq+src_stride3q*2]

    HIGH_SAD_FOLD4
    lea          refq, [refq+ref_strideq*8]
    paddd        m0, m1
    lea          srcq, [srcq+src_strideq*8]
    paddd        m0, m3
    dec          n_rowsd
    jg .loop

    HIGH_SAD_FINISH %2
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
%endif

; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; Four 8-byte rows per loop iteration. Rows 0 and 2 are word-interleaved into
; one register and rows 1 and 3 into another; src, ref and second_pred all
; use the same pairing, so lanes stay matched for the per-lane difference.
%macro HIGH_SAD4XN 1-2 0
    HIGH_SAD_FN 4, %1, 7, %2
    HIGH_SAD_BEGIN %1, 4, %2

.loop:
    ; ref: m1 = rows 0|2, m2 = rows 1|3
    movq         m1, [refq]
    movq         m2, [refq+ref_strideq*2]
    movq         m3, [refq+ref_strideq*4]
    movq         m4, [refq+ref_stride3q*2]
    punpcklwd    m1, m3
    punpcklwd    m2, m4
%if %2 == 1
    ; second_pred rows are 8 bytes apart; pair them the same way, then average
    movq         m3, [second_predq+8*0]
    movq         m5, [second_predq+8*2]
    punpcklwd    m3, m5
    movq         m4, [second_predq+8*1]
    movq         m5, [second_predq+8*3]
    punpcklwd    m4, m5
    lea          second_predq, [second_predq+8*4]
    pavgw        m1, m3
    pavgw        m2, m4
%endif
    ; src rows 0|2 against m1
    movq         m5, [srcq]
    movq         m3, [srcq+src_strideq*4]
    punpcklwd    m5, m3
    movdqa       m3, m1
    psubusw      m1, m5
    psubusw      m5, m3
    por          m1, m5
    ; src rows 1|3 against m2
    movq         m5, [srcq+src_strideq*2]
    movq         m4, [srcq+src_stride3q*2]
    punpcklwd    m5, m4
    movdqa       m4, m2
    psubusw      m2, m5
    psubusw      m5, m4
    por          m2, m5
    ; combine both pairs, then widen both halves to 32 bits and accumulate
    paddw        m1, m2
    movdqa       m2, m1
    punpcklwd    m1, m6
    punpckhwd    m2, m6
    lea          refq, [refq+ref_strideq*8]
    paddd        m0, m1
    lea          srcq, [srcq+src_strideq*8]
    paddd        m0, m2
    dec          n_rowsd
    jg .loop

    HIGH_SAD_FINISH %2
%endmacro

INIT_XMM sse2
HIGH_SAD4XN 8 ; highbd_sad4x8_sse2
HIGH_SAD4XN 4 ; highbd_sad4x4_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
%endif