;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

; cglobal, INIT_XMM, RET, mova/movu, movsxdifnidn and the m#/r# register
; names used below are not defined in this file; presumably they all come
; from x86inc.
%include "third_party/x86inc/x86inc.asm"

SECTION .text
17
; HIGH_SAD_FN width, height, gprs, type [, xmm_regs]
;
; Emits the cglobal prologue and the pointer/stride setup shared by every
; high-bitdepth SAD kernel in this file.  The caller macro continues with the
; loop body straight after the expansion.
;
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers: 5, or 7 when the kernel also
;        needs the src_stride3/ref_stride3 (3 * stride) registers.  The avg
;        variants have one more argument (second_pred): with 5 they request
;        1 + 5 registers, with 7 they request one extra register on x86-64
;        only and keep the row counter in memory on x86-32.
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
; Arg 5: Number of xmm registers (default 7). 8xh needs 8, others only need 7
;
; Names defined for the caller: src, src_stride, ref, ref_stride, n_rows
; (as n_rowsd), plus src_stride3/ref_stride3 when Arg 3 == 7 and second_pred
; when Arg 4 == 1.
%macro HIGH_SAD_FN 4-5 7
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%elif %4 == 1 ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
; The seven names above use every register requested by Arg 3, so the row
; counter is defined separately from the cglobal name list.
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
; NOTE(review): r0m is presumably x86inc's stack slot of argument 0 (src),
; reused as the counter once src has been loaded - confirm against x86inc.
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%else  ; %4 == 2, skip rows
%if %3 == 5
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2  ; double the stride if we are skipping rows
  lea          src_strided, [src_strided*2]
  lea          ref_strided, [ref_strided*2]
%endif
  ; widen the 32-bit stride arguments to pointer width
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  ; stride * 3, so a kernel can address a fourth row without extra adds
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl                 srcq, 1
  shl                 refq, 1
%if %4 == 1
  shl         second_predq, 1
%endif
%endmacro
72
; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD64XN height [, type]
;   type 0 (default): highbd_sad64x<height>
;   type 1:           highbd_sad64x<height>_avg   (extra second_pred argument)
;   type 2:           highbd_sad_skip_64x<height> (every other row, sum doubled)
;
; Register roles:
;   m0     running sum, four 32-bit lanes
;   m1-m4  reference words (averaged with second_pred for type 1), then the
;          absolute differences against src
;   m5     scratch copy of the source words
;   m6     zero, used to widen 16-bit partial sums to 32 bits
;
; Each iteration consumes one row (128 bytes of 16-bit samples) in two
; 64-byte halves.  ref is read with movu; src is read with mova.
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
%if %2 == 2  ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  ; first half of each row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  ; ref = avg(ref, second_pred); second_pred is read linearly and advanced
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  ; |src - ref| per word: saturating subtract in both directions, then OR
  ; (one of the two results is always zero)
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  ; reduce in 16 bits: add register pairs, fold the high half onto the low
  ; half, then zero-extend the four low words to dwords and accumulate
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  paddd                 m0, m1
  paddd                 m0, m3
  ; second half of each row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  mova                  m5, [srcq+64]
  psubusw               m5, m1
  psubusw               m1, [srcq+64]
  por                   m1, m5
  mova                  m5, [srcq+80]
  psubusw               m5, m2
  psubusw               m2, [srcq+80]
  por                   m2, m5
  mova                  m5, [srcq+96]
  psubusw               m5, m3
  psubusw               m3, [srcq+96]
  por                   m3, m5
  mova                  m5, [srcq+112]
  psubusw               m5, m4
  psubusw               m4, [srcq+112]
  por                   m4, m5
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance one row: stride * 2 bytes because samples are 16-bit
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3

  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
179
; 64-wide kernels; the 64x16 sizes are omitted from realtime-only builds.
INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
%endif
192
; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD32XN height [, type]
;   type 0 (default): highbd_sad32x<height>
;   type 1:           highbd_sad32x<height>_avg   (extra second_pred argument)
;   type 2:           highbd_sad_skip_32x<height> (every other row, sum doubled)
;
; Register roles:
;   m0     running sum, four 32-bit lanes
;   m1-m4  reference words (averaged with second_pred for type 1), then the
;          absolute differences against src
;   m5     scratch copy of the source words
;   m6     zero, used to widen 16-bit partial sums to 32 bits
;
; Each iteration consumes one row (64 bytes of 16-bit samples).  ref is read
; with movu; src is read with mova.
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
%if %2 == 2  ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  ; ref = avg(ref, second_pred); second_pred is read linearly and advanced
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  ; |src - ref| per word: saturating subtract in both directions, then OR
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+32]
  psubusw               m5, m3
  psubusw               m3, [srcq+32]
  por                   m3, m5
  mova                  m5, [srcq+48]
  psubusw               m5, m4
  psubusw               m4, [srcq+48]
  por                   m4, m5
  ; reduce in 16 bits, then zero-extend to dwords and accumulate into m0
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance one row: stride * 2 bytes because samples are 16-bit
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
259
; 32-wide kernels; the 32x8 sizes are omitted from realtime-only builds.
INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD32XN  8 ; highbd_sad32x8_sse2
HIGH_SAD32XN  8, 1 ; highbd_sad32x8_avg_sse2
%endif
274
; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD16XN height [, type]
;   type 0 (default): highbd_sad16x<height>
;   type 1:           highbd_sad16x<height>_avg   (extra second_pred argument)
;   type 2:           highbd_sad_skip_16x<height> (every other row, sum doubled)
;
; Register roles:
;   m0     running sum, four 32-bit lanes
;   m1,m2  first row of the pair; m3,m4 second row (reference words, then
;          absolute differences against src)
;   m5     scratch copy of the source words
;   m6     zero, used to widen 16-bit partial sums to 32 bits
;
; Each iteration consumes two rows of 32 bytes.  ref is read with movu; src is
; read with mova.
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  ; n_rows counts loop iterations (two rows each), hence height/2; the skip
  ; variant visits half as many rows, hence height/4
%if %2 == 2  ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_strideq*2+16]
%if %2 == 1
  ; second_pred is read linearly: 32 bytes per row, two rows per iteration
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+16]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*2+16]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  ; |src - ref| per word: saturating subtract in both directions, then OR
  mova                  m5, [srcq]
  psubusw               m5, m1
  psubusw               m1, [srcq]
  por                   m1, m5
  mova                  m5, [srcq+16]
  psubusw               m5, m2
  psubusw               m2, [srcq+16]
  por                   m2, m5
  mova                  m5, [srcq+src_strideq*2]
  psubusw               m5, m3
  psubusw               m3, [srcq+src_strideq*2]
  por                   m3, m5
  mova                  m5, [srcq+src_strideq*2+16]
  psubusw               m5, m4
  psubusw               m4, [srcq+src_strideq*2+16]
  por                   m4, m5
  ; reduce in 16 bits, then zero-extend to dwords and accumulate into m0
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance two rows: stride * 4 bytes because samples are 16-bit
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
341
; 16-wide kernels; the 16x64 and 16x4 sizes are omitted from realtime-only
; builds.
INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
HIGH_SAD16XN  4 ; highbd_sad16x4_sse2
HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
%endif
357
; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD8XN height [, type]
;   type 0 (default): highbd_sad8x<height>
;   type 1:           highbd_sad8x<height>_avg   (extra second_pred argument)
;   type 2:           highbd_sad_skip_8x<height> (every other row, sum doubled)
;
; Register roles:
;   m0     running sum, four 32-bit lanes
;   m1-m4  one row each: reference words, then absolute differences
;   m5     source words for the current row
;   m6     zero, used to widen 16-bit partial sums to 32 bits
;   m7     copy of the reference row (why this macro requests 8 xmm registers)
;
; Each iteration consumes four rows of 16 bytes.  Both src and ref are read
; with movu here.
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2, 8
  ; n_rows counts loop iterations (four rows each), hence height/4; the skip
  ; variant visits half as many rows, hence height/8
%if %2 == 2  ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  ; rows 0..3: byte offsets stride*2, stride*4 and (3*stride)*2
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq*2]
  movu                  m3, [refq+ref_strideq*4]
  movu                  m4, [refq+ref_stride3q*2]
%if %2 == 1
  ; second_pred is read linearly: 16 bytes per row, four rows per iteration
  pavgw                 m1, [second_predq+mmsize*0]
  pavgw                 m2, [second_predq+mmsize*1]
  pavgw                 m3, [second_predq+mmsize*2]
  pavgw                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  ; |ref - src| per word: saturating subtract in both directions, then OR;
  ; m7 keeps the original ref row for the second subtraction
  mova                  m7, m1
  movu                  m5, [srcq]
  psubusw               m1, m5
  psubusw               m5, m7
  por                   m1, m5

  mova                  m7, m2
  movu                  m5, [srcq+src_strideq*2]
  psubusw               m2, m5
  psubusw               m5, m7
  por                   m2, m5

  mova                  m7, m3
  movu                  m5, [srcq+src_strideq*4]
  psubusw               m3, m5
  psubusw               m5, m7
  por                   m3, m5

  mova                  m7, m4
  movu                  m5, [srcq+src_stride3q*2]
  psubusw               m4, m5
  psubusw               m5, m7
  por                   m4, m5

  ; reduce in 16 bits, then zero-extend to dwords and accumulate into m0
  paddw                 m1, m2
  paddw                 m3, m4
  movhlps               m2, m1
  movhlps               m4, m3
  paddw                 m1, m2
  paddw                 m3, m4
  punpcklwd             m1, m6
  punpcklwd             m3, m6
  ; advance four rows: stride * 8 bytes because samples are 16-bit
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
432
; 8-wide kernels; the 8x32 sizes are omitted from realtime-only builds.
INIT_XMM sse2
HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
%endif
445
; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD4XN height [, type]
;   type 0 (default): highbd_sad4x<height>
;   type 1:           highbd_sad4x<height>_avg   (extra second_pred argument)
;   type 2:           highbd_sad_skip_4x<height> (every other row, sum doubled)
;
; Each iteration consumes four rows of 8 bytes.  Rows 0 and 2 are interleaved
; word-by-word into one register and rows 1 and 3 into another; src and
; second_pred are interleaved the same way so the lanes line up.
;
; Register roles:
;   m0     running sum, four 32-bit lanes
;   m1     ref rows 0/2, then their absolute differences
;   m2     ref rows 1/3, then their absolute differences
;   m3-m5  scratch (row loads, copies of ref, src words)
;   m6     zero, used to widen 16-bit partial sums to 32 bits
%macro HIGH_SAD4XN 1-2 0
  HIGH_SAD_FN 4, %1, 7, %2
  ; n_rows counts loop iterations (four rows each), hence height/4; the skip
  ; variant visits half as many rows, hence height/8
%if %2 == 2  ; skip rows, so divide number of rows by 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0
  pxor                  m6, m6

.loop:
  movq                  m1, [refq]
  movq                  m2, [refq+ref_strideq*2]
  movq                  m3, [refq+ref_strideq*4]
  movq                  m4, [refq+ref_stride3q*2]
  punpcklwd             m1, m3                ; m1 = ref rows 0 and 2
  punpcklwd             m2, m4                ; m2 = ref rows 1 and 3
%if %2 == 1
  ; second_pred is read linearly, 8 bytes per row, and paired 0/2 and 1/3
  movq                  m3, [second_predq+8*0]
  movq                  m5, [second_predq+8*2]
  punpcklwd             m3, m5
  movq                  m4, [second_predq+8*1]
  movq                  m5, [second_predq+8*3]
  punpcklwd             m4, m5
  lea         second_predq, [second_predq+8*4]
  pavgw                 m1, m3
  pavgw                 m2, m4
%endif
  ; src rows 0 and 2, then |ref - src| via two saturating subtracts and OR
  movq                  m5, [srcq]
  movq                  m3, [srcq+src_strideq*4]
  punpcklwd             m5, m3
  movdqa                m3, m1
  psubusw               m1, m5
  psubusw               m5, m3
  por                   m1, m5
  ; src rows 1 and 3
  movq                  m5, [srcq+src_strideq*2]
  movq                  m4, [srcq+src_stride3q*2]
  punpcklwd             m5, m4
  movdqa                m4, m2
  psubusw               m2, m5
  psubusw               m5, m4
  por                   m2, m5
  ; add the two registers in 16 bits, then zero-extend both halves to dwords
  ; and accumulate into m0
  paddw                 m1, m2
  movdqa                m2, m1
  punpcklwd             m1, m6
  punpckhwd             m2, m6
  ; advance four rows: stride * 8 bytes because samples are 16-bit
  lea                 refq, [refq+ref_strideq*8]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*8]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  ; horizontal add of the four dword lanes of m0 into lane 0
  movhlps               m1, m0
  paddd                 m0, m1
  punpckldq             m0, m6
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2  ; we skipped rows, so we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
512
; 4-wide kernels; the 4x16 sizes are omitted from realtime-only builds.
INIT_XMM sse2
HIGH_SAD4XN  8 ; highbd_sad4x8_sse2
HIGH_SAD4XN  4 ; highbd_sad4x4_sse2
%if CONFIG_REALTIME_ONLY==0
HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
%endif
520