;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers: 5 or 7 (the 7-register variants
;        also set up the src_stride3/ref_stride3 registers)
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                                  src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
                                                  ref, ref_stride, \
                                                  second_pred, \
                                                  src_stride3, ref_stride3
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; skip rows, so double the stride
  lea       src_strided, [src_strided*2]
  lea       ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea       src_stride3q, [src_strideq*3]
  lea       ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int aom_sad128x{64,128}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
%if %2 == 2
  mov       n_rowsd, %1/2
%else
  mov       n_rowsd, %1
%endif
  pxor      m0, m0

.loop:
  movu      m1, [refq]
  movu      m2, [refq+16]
  movu      m3, [refq+32]
  movu      m4, [refq+48]
%if %2 == 1
  pavgb     m1, [second_predq+mmsize*0]
  pavgb     m2, [second_predq+mmsize*1]
  pavgb     m3, [second_predq+mmsize*2]
  pavgb     m4, [second_predq+mmsize*3]
%endif
  psadbw    m1, [srcq]
  psadbw    m2, [srcq+16]
  psadbw    m3, [srcq+32]
  psadbw    m4, [srcq+48]

  paddd     m1, m2
  paddd     m3, m4
  paddd     m0, m1
  paddd     m0, m3

  movu      m1, [refq+64]
  movu      m2, [refq+80]
  movu      m3, [refq+96]
  movu      m4, [refq+112]
%if %2 == 1
  pavgb     m1, [second_predq+mmsize*4]
  pavgb     m2, [second_predq+mmsize*5]
  pavgb     m3, [second_predq+mmsize*6]
  pavgb     m4, [second_predq+mmsize*7]
  lea       second_predq, [second_predq+mmsize*8]
%endif
  psadbw    m1, [srcq+64]
  psadbw    m2, [srcq+80]
  psadbw    m3, [srcq+96]
  psadbw    m4, [srcq+112]

  add       refq, ref_strideq
  add       srcq, src_strideq

  paddd     m1, m2
  paddd     m3, m4
  paddd     m0, m1
  paddd     m0, m3

  sub       n_rowsd, 1
  jg        .loop

  movhlps   m1, m0
  paddd     m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld     m0, 1
%endif
  movd      eax, m0
  RET
%endmacro
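
; Each macro below is instantiated three ways: plain SAD, avg (the reference
; block is first averaged with a second predictor), and skip (SAD_FN doubles
; both strides and the row counter is halved, so only every other row is
; visited; the final pslld doubles the result back to full-height scale).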
INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 128, 2  ; sad128x128_skip_sse2
SAD128XN  64     ; sad128x64_sse2
SAD128XN  64, 1  ; sad128x64_avg_sse2
SAD128XN  64, 2  ; sad128x64_skip_sse2

; unsigned int aom_sad64x{16,32,64,128}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov       n_rowsd, %1/2
%else
  mov       n_rowsd, %1
%endif
  pxor      m0, m0
.loop:
  movu      m1, [refq]
  movu      m2, [refq+16]
  movu      m3, [refq+32]
  movu      m4, [refq+48]
%if %2 == 1
  pavgb     m1, [second_predq+mmsize*0]
  pavgb     m2, [second_predq+mmsize*1]
  pavgb     m3, [second_predq+mmsize*2]
  pavgb     m4, [second_predq+mmsize*3]
  lea       second_predq, [second_predq+mmsize*4]
%endif
  psadbw    m1, [srcq]
  psadbw    m2, [srcq+16]
  psadbw    m3, [srcq+32]
  psadbw    m4, [srcq+48]
  paddd     m1, m2
  paddd     m3, m4
  add       refq, ref_strideq
  paddd     m0, m1
  add       srcq, src_strideq
  paddd     m0, m3
  dec       n_rowsd
  jg        .loop

  movhlps   m1, m0
  paddd     m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld     m0, 1
%endif
  movd      eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 128     ; sad64x128_sse2
SAD64XN  64     ; sad64x64_sse2
SAD64XN  32     ; sad64x32_sse2
SAD64XN  16     ; sad64x16_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
SAD64XN  64, 1  ; sad64x64_avg_sse2
SAD64XN  32, 1  ; sad64x32_avg_sse2
SAD64XN  16, 1  ; sad64x16_avg_sse2
SAD64XN 128, 2  ; sad64x128_skip_sse2
SAD64XN  64, 2  ; sad64x64_skip_sse2
SAD64XN  32, 2  ; sad64x32_skip_sse2
SAD64XN  16, 2  ; sad64x16_skip_sse2

; unsigned int aom_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                          uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov       n_rowsd, %1/4
%else
  mov       n_rowsd, %1/2
%endif
  pxor      m0, m0
.loop:
  movu      m1, [refq]
  movu      m2, [refq+16]
  movu      m3, [refq+ref_strideq]
  movu      m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb     m1, [second_predq+mmsize*0]
  pavgb     m2, [second_predq+mmsize*1]
  pavgb     m3, [second_predq+mmsize*2]
  pavgb     m4, [second_predq+mmsize*3]
  lea       second_predq, [second_predq+mmsize*4]
%endif
  psadbw    m1, [srcq]
  psadbw    m2, [srcq+16]
  psadbw    m3, [srcq+src_strideq]
  psadbw    m4, [srcq+src_strideq+16]
  paddd     m1, m2
  paddd     m3, m4
  lea       refq, [refq+ref_strideq*2]
  paddd     m0, m1
  lea       srcq, [srcq+src_strideq*2]
  paddd     m0, m3
  dec       n_rowsd
  jg        .loop

  movhlps   m1, m0
  paddd     m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld     m0, 1
%endif
  movd      eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64     ; sad32x64_sse2
SAD32XN 32     ; sad32x32_sse2
SAD32XN 16     ; sad32x16_sse2
SAD32XN  8     ; sad32x8_sse2
SAD32XN 64, 1  ; sad32x64_avg_sse2
SAD32XN 32, 1  ; sad32x32_avg_sse2
SAD32XN 16, 1  ; sad32x16_avg_sse2
SAD32XN  8, 1  ; sad32x8_avg_sse2
SAD32XN 64, 2  ; sad32x64_skip_sse2
SAD32XN 32, 2  ; sad32x32_skip_sse2
SAD32XN 16, 2  ; sad32x16_skip_sse2
SAD32XN  8, 2  ; sad32x8_skip_sse2

; unsigned int aom_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov       n_rowsd, %1/8
%else
  mov       n_rowsd, %1/4
%endif
  pxor      m0, m0

.loop:
  movu      m1, [refq]
  movu      m2, [refq+ref_strideq]
  movu      m3, [refq+ref_strideq*2]
  movu      m4, [refq+ref_stride3q]
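  ; avg variant: average the reference rows with the second predictor
  ; before computing the SAD against the source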
%if %2 == 1
  pavgb     m1, [second_predq+mmsize*0]
  pavgb     m2, [second_predq+mmsize*1]
  pavgb     m3, [second_predq+mmsize*2]
  pavgb     m4, [second_predq+mmsize*3]
  lea       second_predq, [second_predq+mmsize*4]
%endif
  psadbw    m1, [srcq]
  psadbw    m2, [srcq+src_strideq]
  psadbw    m3, [srcq+src_strideq*2]
  psadbw    m4, [srcq+src_stride3q]
  paddd     m1, m2
  paddd     m3, m4
  lea       refq, [refq+ref_strideq*4]
  paddd     m0, m1
  lea       srcq, [srcq+src_strideq*4]
  paddd     m0, m3
  dec       n_rowsd
  jg        .loop

  movhlps   m1, m0
  paddd     m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld     m0, 1
%endif
  movd      eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 64     ; sad16x64_sse2
SAD16XN 32     ; sad16x32_sse2
SAD16XN 16     ; sad16x16_sse2
SAD16XN  8     ; sad16x8_sse2
SAD16XN  4     ; sad16x4_sse2
SAD16XN 64, 1  ; sad16x64_avg_sse2
SAD16XN 32, 1  ; sad16x32_avg_sse2
SAD16XN 16, 1  ; sad16x16_avg_sse2
SAD16XN  8, 1  ; sad16x8_avg_sse2
SAD16XN  4, 1  ; sad16x4_avg_sse2
SAD16XN 64, 2  ; sad16x64_skip_sse2
SAD16XN 32, 2  ; sad16x32_skip_sse2
SAD16XN 16, 2  ; sad16x16_skip_sse2
SAD16XN  8, 2  ; sad16x8_skip_sse2

; unsigned int aom_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov       n_rowsd, %1/8
%else
  mov       n_rowsd, %1/4
%endif
  pxor      m0, m0

.loop:
  movh      m1, [refq]
  movhps    m1, [refq+ref_strideq]
  movh      m2, [refq+ref_strideq*2]
  movhps    m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb     m1, [second_predq+mmsize*0]
  pavgb     m2, [second_predq+mmsize*1]
  lea       second_predq, [second_predq+mmsize*2]
%endif
  movh      m3, [srcq]
  movhps    m3, [srcq+src_strideq]
  movh      m4, [srcq+src_strideq*2]
  movhps    m4, [srcq+src_stride3q]
  psadbw    m1, m3
  psadbw    m2, m4
  lea       refq, [refq+ref_strideq*4]
  paddd     m0, m1
  lea       srcq, [srcq+src_strideq*4]
  paddd     m0, m2
  dec       n_rowsd
  jg        .loop

  movhlps   m1, m0
  paddd     m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld     m0, 1
%endif
  movd      eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 32     ; sad8x32_sse2
SAD8XN 16     ; sad8x16_sse2
SAD8XN  8     ; sad8x8_sse2
SAD8XN  4     ; sad8x4_sse2
SAD8XN 32, 1  ; sad8x32_avg_sse2
SAD8XN 16, 1  ; sad8x16_avg_sse2
SAD8XN  8, 1  ; sad8x8_avg_sse2
SAD8XN  4, 1  ; sad8x4_avg_sse2
SAD8XN 32, 2  ; sad8x32_skip_sse2
SAD8XN 16, 2  ; sad8x16_skip_sse2
SAD8XN  8, 2  ; sad8x8_skip_sse2

; unsigned int aom_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov       n_rowsd, %1/8
%else
  mov       n_rowsd, %1/4
%endif
  pxor      m0, m0

.loop:
  movd      m1, [refq]
  movd      m2, [refq+ref_strideq]
  movd      m3, [refq+ref_strideq*2]
  movd      m4, [refq+ref_stride3q]
  punpckldq m1, m2
  punpckldq m3, m4
  movlhps   m1, m3
%if %2 == 1
  pavgb     m1, [second_predq+mmsize*0]
  lea       second_predq, [second_predq+mmsize*1]
%endif
  movd      m2, [srcq]
  movd      m5, [srcq+src_strideq]
  movd      m4, [srcq+src_strideq*2]
  movd      m3, [srcq+src_stride3q]
  punpckldq m2, m5
  punpckldq m4, m3
  movlhps   m2, m4
  psadbw    m1, m2
  lea       refq, [refq+ref_strideq*4]
  paddd     m0, m1
  lea       srcq, [srcq+src_strideq*4]
  dec       n_rowsd
  jg        .loop

  movhlps   m1, m0
  paddd     m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld     m0, 1
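  ; the doubling cannot overflow: even a full 128x128 SAD (128*128*255)
  ; stays far below 2^31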
%endif
  movd      eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN 16     ; sad4x16_sse2
SAD4XN  8     ; sad4x8_sse2
SAD4XN  4     ; sad4x4_sse2
SAD4XN 16, 1  ; sad4x16_avg_sse2
SAD4XN  8, 1  ; sad4x8_avg_sse2
SAD4XN  4, 1  ; sad4x4_avg_sse2
SAD4XN 16, 2  ; sad4x16_skip_sse2
SAD4XN  8, 2  ; sad4x8_skip_sse2
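
; Note: no skip variants are instantiated for heights below 8 (16x4, 8x4,
; 4x4), since skipping every other row would leave only two rows to sum.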