;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers: 5 when only the strides are
;        needed, 7 when the src_stride3/ref_stride3 registers are also set up
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                                  src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
                                                  ref, ref_stride, \
                                                  second_pred, \
                                                  src_stride3, ref_stride3
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; skip rows, so double the strides
lea            src_strided, [src_strided*2]
lea            ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*4]
  pavgb                 m2, [second_predq+mmsize*5]
  pavgb                 m3, [second_predq+mmsize*6]
  pavgb                 m4, [second_predq+mmsize*7]
  lea         second_predq, [second_predq+mmsize*8]
%endif
  psadbw                m1, [srcq+64]
  psadbw                m2, [srcq+80]
  psadbw                m3, [srcq+96]
  psadbw                m4, [srcq+112]

  add                 refq, ref_strideq
  add                 srcq, src_strideq

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  sub              n_rowsd, 1
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

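; For reference, a minimal C model of the three variants these macros expand
; to. This is an illustrative sketch only (C99, would need <stdint.h>):
; sad_c_model is a hypothetical name, not an aom API, and avg/skip are
; mutually exclusive here just as %2 is exactly one of 0, 1 or 2 above.
;
;   unsigned int sad_c_model(const uint8_t *src, int src_stride,
;                            const uint8_t *ref, int ref_stride,
;                            const uint8_t *second_pred, /* NULL unless avg */
;                            int w, int h, int skip) {
;     unsigned int sad = 0;
;     const int step = skip ? 2 : 1;   /* skip compares every other row */
;     for (int i = 0; i < h; i += step) {
;       for (int j = 0; j < w; ++j) {
;         int r = ref[i * ref_stride + j];
;         if (second_pred)             /* pavgb: average, rounding up */
;           r = (r + second_pred[i * w + j] + 1) >> 1;
;         const int d = src[i * src_stride + j] - r;
;         sad += d < 0 ? -d : d;
;       }
;     }
;     return skip ? 2 * sad : sad;     /* the final "pslld m0, 1" */
;   }
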
INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 128, 2  ; sad_skip_128x128_sse2
SAD128XN 64      ; sad128x64_sse2
SAD128XN 64, 1   ; sad128x64_avg_sse2
SAD128XN 64, 2   ; sad_skip_128x64_sse2


; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 128     ; sad64x128_sse2
SAD64XN 64      ; sad64x64_sse2
SAD64XN 32      ; sad64x32_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
SAD64XN 64, 1   ; sad64x64_avg_sse2
SAD64XN 32, 1   ; sad64x32_avg_sse2
SAD64XN 128, 2  ; sad_skip_64x128_sse2
SAD64XN 64, 2   ; sad_skip_64x64_sse2
SAD64XN 32, 2   ; sad_skip_64x32_sse2
%if CONFIG_REALTIME_ONLY==0
SAD64XN 16      ; sad64x16_sse2
SAD64XN 16, 1   ; sad64x16_avg_sse2
SAD64XN 16, 2   ; sad_skip_64x16_sse2
%endif

; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64     ; sad32x64_sse2
SAD32XN 32     ; sad32x32_sse2
SAD32XN 16     ; sad32x16_sse2
SAD32XN 64, 1  ; sad32x64_avg_sse2
SAD32XN 32, 1  ; sad32x32_avg_sse2
SAD32XN 16, 1  ; sad32x16_avg_sse2
SAD32XN 64, 2  ; sad_skip_32x64_sse2
SAD32XN 32, 2  ; sad_skip_32x32_sse2
SAD32XN 16, 2  ; sad_skip_32x16_sse2
%if CONFIG_REALTIME_ONLY==0
SAD32XN 8      ; sad32x8_sse2
SAD32XN 8, 1   ; sad32x8_avg_sse2
%endif

; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

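; Each pass through the loop below consumes four rows at once (offsets 0,
; 1x, 2x and 3x the stride, the last via the *_stride3 registers set up by
; SAD_FN), which is why n_rowsd above is height/4, or height/8 in the skip
; variant where SAD_FN has already doubled both strides.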
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32     ; sad16x32_sse2
SAD16XN 16     ; sad16x16_sse2
SAD16XN 8      ; sad16x8_sse2
SAD16XN 32, 1  ; sad16x32_avg_sse2
SAD16XN 16, 1  ; sad16x16_avg_sse2
SAD16XN 8, 1   ; sad16x8_avg_sse2
SAD16XN 32, 2  ; sad_skip_16x32_sse2
SAD16XN 16, 2  ; sad_skip_16x16_sse2
%if CONFIG_REALTIME_ONLY==0
SAD16XN 64     ; sad16x64_sse2
SAD16XN 4      ; sad16x4_sse2
SAD16XN 64, 1  ; sad16x64_avg_sse2
SAD16XN 64, 2  ; sad_skip_16x64_sse2
%endif

; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16     ; sad8x16_sse2
SAD8XN 8      ; sad8x8_sse2
SAD8XN 4      ; sad8x4_sse2
SAD8XN 16, 1  ; sad8x16_avg_sse2
SAD8XN 8, 1   ; sad8x8_avg_sse2
SAD8XN 16, 2  ; sad_skip_8x16_sse2
SAD8XN 8, 2   ; sad_skip_8x8_sse2
%if CONFIG_REALTIME_ONLY==0
SAD8XN 32     ; sad8x32_sse2
SAD8XN 32, 1  ; sad8x32_avg_sse2
SAD8XN 32, 2  ; sad_skip_8x32_sse2
%endif

; unsigned int aom_sad4x{4,8}_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

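; psadbw leaves one partial sum in each 64-bit half of m0, so the epilogue
; below folds the high half onto the low half (movhlps + paddd) before movd
; returns the 32-bit total in eax. Every macro in this file ends with this
; same reduction.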
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN 8      ; sad4x8_sse2
SAD4XN 4      ; sad4x4_sse2
%if CONFIG_REALTIME_ONLY==0
SAD4XN 16     ; sad4x16_sse2
SAD4XN 16, 2  ; sad_skip_4x16_sse2
%endif
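; Illustrative usage from C (an assumption for illustration: in libaom these
; symbols are normally reached through the function pointers declared in the
; generated aom_dsp_rtcd.h, where the pixel pointers are const-qualified):
;
;   unsigned int aom_sad16x16_sse2(const uint8_t *src, int src_stride,
;                                  const uint8_t *ref, int ref_stride);
;   unsigned int d = aom_sad16x16_sse2(src, src_stride, ref, ref_stride);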