;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; HIGH_SAD_FN: declares one SAD function and emits the setup code shared by
; every kernel in this file.
;
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers: 5, or 7 when the kernel also
;        needs the src_stride3/ref_stride3 registers
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
; Arg 5: Number of xmm registers (default 7). 8xh needs 8, others only need 7
;
; After the cglobal declaration the macro doubles both strides for the
; skip-rows type, runs movsxdifnidn on both strides (x86inc helper, presumably
; a sign-extension on 64-bit builds), computes stride*3 when 7 registers were
; requested, and shifts src, ref and (for avg) second_pred left by one bit.
%macro HIGH_SAD_FN 4-5 7
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%elif %4 == 1 ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
; No named n_rows argument in this variant: the row counter is mapped by hand
; onto r7d (64-bit) or onto the r0m memory operand (32-bit).
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%else ; %4 == 2, skip rows
%if %3 == 5
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; skipping rows: step over every other row by doubling the strides
  lea           src_strided, [src_strided*2]
  lea           ref_strided, [ref_strided*2]
%endif
  movsxdifnidn  src_strideq, src_strided
  movsxdifnidn  ref_strideq, ref_strided
%if %3 == 7
  lea           src_stride3q, [src_strideq*3]
  lea           ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl           srcq, 1
  shl           refq, 1
%if %4 == 1
  shl           second_predq, 1
%endif
%endmacro

; HIGH_SAD_INIT: loads the loop counter and clears the accumulators.
; Arg 1: Height
; Arg 2: Number of rows consumed per loop iteration
; Arg 3: Type of function (see HIGH_SAD_FN); type 2 halves the iteration count
; On exit m0 (dword accumulator) and m6 (zero constant) are both zero.
%macro HIGH_SAD_INIT 3
%if %3 == 2 ; skip rows, so divide number of rows by 2
  mov           n_rowsd, %1/(%2*2)
%else
  mov           n_rowsd, %1/%2
%endif
  pxor          m0, m0
  pxor          m6, m6
%endmacro

; HIGH_SAD_AVG_PRED4: averages m1..m4 (pavgw) with the next four mmsize-byte
; chunks of second_pred, then advances second_predq past those chunks.
%macro HIGH_SAD_AVG_PRED4 0
  pavgw         m1, [second_predq+mmsize*0]
  pavgw         m2, [second_predq+mmsize*1]
  pavgw         m3, [second_predq+mmsize*2]
  pavgw         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endmacro

; HIGH_SAD_ABSDIFF_A: per-word absolute difference against a source memory
; operand that is read with mova and as a direct psubusw operand.
; Arg 1: register holding the reference words; replaced by the result
; Arg 2: source memory operand
; psubusw saturates at zero, so OR-ing both subtraction orders leaves the
; absolute difference. Clobbers m5.
%macro HIGH_SAD_ABSDIFF_A 2
  mova          m5, %2
  psubusw       m5, %1
  psubusw       %1, %2
  por           %1, m5
%endmacro

; HIGH_SAD_ABSDIFF_U: same result as HIGH_SAD_ABSDIFF_A, but the source is
; fetched once with movu and the reference is copied to a scratch register.
; Arg 1: register holding the reference words; replaced by the result
; Arg 2: source memory operand
; Clobbers m5 and m7.
%macro HIGH_SAD_ABSDIFF_U 2
  mova          m7, %1
  movu          m5, %2
  psubusw       %1, m5
  psubusw       m5, m7
  por           %1, m5
%endmacro

; HIGH_SAD_FOLD4: collapses the word differences in m1..m4 into two registers
; of dword partial sums (m1 and m3). Pairs are added (m1+m2, m3+m4), the upper
; four words are added onto the lower four, and the lower four words are
; widened to dwords by interleaving with the zero register m6.
; Clobbers m2 and m4.
%macro HIGH_SAD_FOLD4 0
  paddw         m1, m2
  paddw         m3, m4
  movhlps       m2, m1
  movhlps       m4, m3
  paddw         m1, m2
  paddw         m3, m4
  punpcklwd     m1, m6
  punpcklwd     m3, m6
%endmacro

; HIGH_SAD_ACCUM_ADVANCE: adds the partial sums in m1 and m3 to m0 and moves
; refq/srcq forward, with the pointer updates interleaved between the adds.
; Arg 1: multiplier applied to each stride register for the pointer advance
%macro HIGH_SAD_ACCUM_ADVANCE 1
  lea           refq, [refq+ref_strideq*%1]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*%1]
  paddd         m0, m3
%endmacro

; HIGH_SAD_REDUCE: sums the four dword lanes of m0 into its lowest dword.
; Arg 1: Type of function (see HIGH_SAD_FN); type 2 doubles the total
; Requires m6 == 0. Clobbers m1.
%macro HIGH_SAD_REDUCE 1
  movhlps       m1, m0
  paddd         m0, m1
  punpckldq     m0, m6
  movhlps       m1, m0
  paddd         m0, m1
%if %1 == 2 ; we skipped rows, so we need to double the sad
  pslld         m0, 1
%endif
%endmacro

; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
; Arg 1: Height
; Arg 2: Type of function (default 0), forwarded to HIGH_SAD_FN
; One row is handled per loop iteration, as two halves of four registers each.
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
  HIGH_SAD_INIT %1, 1, %2

.loop:
  ; first half of each row
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+32]
  movu          m4, [refq+48]
%if %2 == 1
  HIGH_SAD_AVG_PRED4
%endif
  HIGH_SAD_ABSDIFF_A m1, [srcq]
  HIGH_SAD_ABSDIFF_A m2, [srcq+16]
  HIGH_SAD_ABSDIFF_A m3, [srcq+32]
  HIGH_SAD_ABSDIFF_A m4, [srcq+48]
  HIGH_SAD_FOLD4
  paddd         m0, m1
  paddd         m0, m3
  ; second half of each row
  movu          m1, [refq+64]
  movu          m2, [refq+80]
  movu          m3, [refq+96]
  movu          m4, [refq+112]
%if %2 == 1
  HIGH_SAD_AVG_PRED4
%endif
  HIGH_SAD_ABSDIFF_A m1, [srcq+64]
  HIGH_SAD_ABSDIFF_A m2, [srcq+80]
  HIGH_SAD_ABSDIFF_A m3, [srcq+96]
  HIGH_SAD_ABSDIFF_A m4, [srcq+112]
  HIGH_SAD_FOLD4
  HIGH_SAD_ACCUM_ADVANCE 2

  dec           n_rowsd
  jg .loop

  HIGH_SAD_REDUCE %2
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2

; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                                 uint8_t *ref, int ref_stride);
; Arg 1: Height
; Arg 2: Type of function (default 0), forwarded to HIGH_SAD_FN
; One row (four registers) is handled per loop iteration.
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
  HIGH_SAD_INIT %1, 1, %2

.loop:
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+32]
  movu          m4, [refq+48]
%if %2 == 1
  HIGH_SAD_AVG_PRED4
%endif
  HIGH_SAD_ABSDIFF_A m1, [srcq]
  HIGH_SAD_ABSDIFF_A m2, [srcq+16]
  HIGH_SAD_ABSDIFF_A m3, [srcq+32]
  HIGH_SAD_ABSDIFF_A m4, [srcq+48]
  HIGH_SAD_FOLD4
  HIGH_SAD_ACCUM_ADVANCE 2
  dec           n_rowsd
  jg .loop

  HIGH_SAD_REDUCE %2
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 8 ; highbd_sad32x8_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
HIGH_SAD32XN 8, 1 ; highbd_sad32x8_avg_sse2
HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
HIGH_SAD32XN 8, 2 ; highbd_sad_skip_32x8_sse2

; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                                   uint8_t *ref, int ref_stride);
; Arg 1: Height
; Arg 2: Type of function (default 0), forwarded to HIGH_SAD_FN
; Two rows are handled per loop iteration: m1/m2 hold the current row and
; m3/m4 the row one stride further on.
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  HIGH_SAD_INIT %1, 2, %2

.loop:
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+ref_strideq*2]
  movu          m4, [refq+ref_strideq*2+16]
%if %2 == 1
  ; second_pred is read as four consecutive chunks, i.e. two packed rows
  pavgw         m1, [second_predq+mmsize*0]
  pavgw         m2, [second_predq+16]
  pavgw         m3, [second_predq+mmsize*2]
  pavgw         m4, [second_predq+mmsize*2+16]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  HIGH_SAD_ABSDIFF_A m1, [srcq]
  HIGH_SAD_ABSDIFF_A m2, [srcq+16]
  HIGH_SAD_ABSDIFF_A m3, [srcq+src_strideq*2]
  HIGH_SAD_ABSDIFF_A m4, [srcq+src_strideq*2+16]
  HIGH_SAD_FOLD4
  HIGH_SAD_ACCUM_ADVANCE 4
  dec           n_rowsd
  jg .loop

  HIGH_SAD_REDUCE %2
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 4 ; highbd_sad16x4_sse2
HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
HIGH_SAD16XN 4, 1 ; highbd_sad16x4_avg_sse2
HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
; Current code fails when there are only 2 rows
; HIGH_SAD16XN 4, 2 ; highbd_sad_skip_16x4_sse2

; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
; Arg 1: Height
; Arg 2: Type of function (default 0), forwarded to HIGH_SAD_FN
; Four rows are handled per loop iteration, one register per row. Source rows
; are fetched with movu, which costs the extra scratch register m7 (hence the
; 8 xmm registers requested from HIGH_SAD_FN).
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2, 8
  HIGH_SAD_INIT %1, 4, %2

.loop:
  movu          m1, [refq]
  movu          m2, [refq+ref_strideq*2]
  movu          m3, [refq+ref_strideq*4]
  movu          m4, [refq+ref_stride3q*2]
%if %2 == 1
  HIGH_SAD_AVG_PRED4
%endif
  HIGH_SAD_ABSDIFF_U m1, [srcq]

  HIGH_SAD_ABSDIFF_U m2, [srcq+src_strideq*2]

  HIGH_SAD_ABSDIFF_U m3, [srcq+src_strideq*4]

  HIGH_SAD_ABSDIFF_U m4, [srcq+src_stride3q*2]

  HIGH_SAD_FOLD4
  HIGH_SAD_ACCUM_ADVANCE 8
  dec           n_rowsd
  jg .loop

  HIGH_SAD_REDUCE %2
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
; Current code fails when there are only 2 rows
; HIGH_SAD8XN 4, 2 ; highbd_sad_skip_8x4_sse2

; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
; Arg 1: Height
; Arg 2: Type of function (default 0), forwarded to HIGH_SAD_FN
; Four rows are handled per loop iteration. Each row is an 8-byte movq load;
; rows 0 and 2 are interleaved into one register and rows 1 and 3 into
; another, and src, ref and second_pred all use that same pairing so that
; matching words line up.
%macro HIGH_SAD4XN 1-2 0
  HIGH_SAD_FN 4, %1, 7, %2
  HIGH_SAD_INIT %1, 4, %2

.loop:
  ; reference: m1 = rows 0 and 2 interleaved, m2 = rows 1 and 3 interleaved
  movq          m1, [refq]
  movq          m2, [refq+ref_strideq*2]
  movq          m3, [refq+ref_strideq*4]
  movq          m4, [refq+ref_stride3q*2]
  punpcklwd     m1, m3
  punpcklwd     m2, m4
%if %2 == 1
  ; second_pred: four consecutive 8-byte chunks, paired the same way
  movq          m3, [second_predq+8*0]
  movq          m5, [second_predq+8*2]
  punpcklwd     m3, m5
  movq          m4, [second_predq+8*1]
  movq          m5, [second_predq+8*3]
  punpcklwd     m4, m5
  lea           second_predq, [second_predq+8*4]
  pavgw         m1, m3
  pavgw         m2, m4
%endif
  ; source rows 0 and 2 against m1
  movq          m5, [srcq]
  movq          m3, [srcq+src_strideq*4]
  punpcklwd     m5, m3
  movdqa        m3, m1
  psubusw       m1, m5
  psubusw       m5, m3
  por           m1, m5
  ; source rows 1 and 3 against m2
  movq          m5, [srcq+src_strideq*2]
  movq          m4, [srcq+src_stride3q*2]
  punpcklwd     m5, m4
  movdqa        m4, m2
  psubusw       m2, m5
  psubusw       m5, m4
  por           m2, m5
  ; add both registers, then widen all eight word sums to dwords via m6 (zero)
  paddw         m1, m2
  movdqa        m2, m1
  punpcklwd     m1, m6
  punpckhwd     m2, m6
  lea           refq, [refq+ref_strideq*8]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*8]
  paddd         m0, m2
  dec           n_rowsd
  jg .loop

  HIGH_SAD_REDUCE %2
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
HIGH_SAD4XN 8 ; highbd_sad4x8_sse2
HIGH_SAD4XN 4 ; highbd_sad4x4_sse2
HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2
HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2
HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
HIGH_SAD4XN 8, 2 ; highbd_sad_skip_4x8_sse2
; Current code fails when there are only 2 rows
; HIGH_SAD4XN 4, 2 ; highbd_sad_skip_4x4_sse2