;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                                  src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; skip rows so double the stride
lea           src_strided, [src_strided*2]
lea           ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro
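
; For orientation, a rough sketch of what a typical SAD_FN expansion looks
; like (register naming and allocation come from x86inc.asm's cglobal, so the
; exact registers depend on the target ABI; this is illustrative only):
;
;   SAD_FN 64, 64, 5, 0  ->  cglobal sad64x64, 4, 5, 5, src, src_stride, \
;                                    ref, ref_stride, n_rows
;                            movsxdifnidn src_strideq, src_strided
;                            movsxdifnidn ref_strideq, ref_strided
;
; With arg 3 == 7 the macro additionally sets up src_stride3q/ref_stride3q
; (3x the strides) so the narrower block loops can address four rows per
; iteration, and with arg 4 == 2 (skip) it doubles both strides before the
; sign-extension.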

; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*4]
  pavgb                 m2, [second_predq+mmsize*5]
  pavgb                 m3, [second_predq+mmsize*6]
  pavgb                 m4, [second_predq+mmsize*7]
  lea         second_predq, [second_predq+mmsize*8]
%endif
  psadbw                m1, [srcq+64]
  psadbw                m2, [srcq+80]
  psadbw                m3, [srcq+96]
  psadbw                m4, [srcq+112]

  add                 refq, ref_strideq
  add                 srcq, src_strideq

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  sub              n_rowsd, 1
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
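
; Note on the loop above: each iteration covers one 128-pixel row as two
; 64-byte halves (four unaligned 16-byte loads each). psadbw produces two
; 64-bit partial sums per register, which are accumulated into m0; after the
; loop, movhlps/paddd fold the high half of m0 into the low half before the
; result is moved to eax. In the avg variant (%2 == 1) the reference is first
; averaged with second_pred, and in the skip variant (%2 == 2) SAD_FN doubled
; the strides, so only every other row is summed and the total is doubled
; with pslld to approximate the full SAD.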

INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 128, 2  ; sad_skip_128x128_sse2
SAD128XN 64      ; sad128x64_sse2
SAD128XN 64, 1   ; sad128x64_avg_sse2
SAD128XN 64, 2   ; sad_skip_128x64_sse2
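
; Each invocation above emits one cglobal symbol matching the C prototype in
; the comment before the macro, e.g. (illustrative only; real callers
; normally reach these kernels through the aom_dsp run-time CPU dispatch
; rather than calling the _sse2 symbol directly):
;   unsigned int sad = aom_sad128x128_sse2(src, src_stride, ref, ref_stride);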


; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
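
; The 64-wide loop above follows the same scheme as SAD128XN but covers one
; 64-pixel row per iteration (four 16-byte loads), so n_rows is the full
; block height for the normal/avg cases and half the height when skipping.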

INIT_XMM sse2
SAD64XN 128     ; sad64x128_sse2
SAD64XN  64     ; sad64x64_sse2
SAD64XN  32     ; sad64x32_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
SAD64XN  64, 1  ; sad64x64_avg_sse2
SAD64XN  32, 1  ; sad64x32_avg_sse2
SAD64XN 128, 2  ; sad_skip_64x128_sse2
SAD64XN  64, 2  ; sad_skip_64x64_sse2
SAD64XN  32, 2  ; sad_skip_64x32_sse2
%if CONFIG_REALTIME_ONLY==0
SAD64XN  16     ; sad64x16_sse2
SAD64XN  16, 1  ; sad64x16_avg_sse2
SAD64XN  16, 2  ; sad_skip_64x16_sse2
%endif

; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
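
; The 32-wide loop above processes two rows per iteration (two 16-byte loads
; per row, the second row addressed via ref_strideq/src_strideq), which is
; why n_rows is initialized to half the height (a quarter when skipping).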

INIT_XMM sse2
SAD32XN 64    ; sad32x64_sse2
SAD32XN 32    ; sad32x32_sse2
SAD32XN 16    ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
SAD32XN 64, 2 ; sad_skip_32x64_sse2
SAD32XN 32, 2 ; sad_skip_32x32_sse2
SAD32XN 16, 2 ; sad_skip_32x16_sse2
%if CONFIG_REALTIME_ONLY==0
SAD32XN  8    ; sad32x8_sse2
SAD32XN  8, 1 ; sad32x8_avg_sse2
%endif

; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
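
; The 16-wide loop above processes four rows per iteration, using the
; ref_stride3q/src_stride3q registers precomputed by SAD_FN (arg 3 == 7) to
; reach the fourth row, so n_rows is a quarter of the height (an eighth when
; skipping).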

INIT_XMM sse2
SAD16XN 32    ; sad16x32_sse2
SAD16XN 16    ; sad16x16_sse2
SAD16XN  8    ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2
SAD16XN 32, 2 ; sad_skip_16x32_sse2
SAD16XN 16, 2 ; sad_skip_16x16_sse2
%if CONFIG_REALTIME_ONLY==0
SAD16XN 64    ; sad16x64_sse2
SAD16XN  4    ; sad16x4_sse2
SAD16XN 64, 1 ; sad16x64_avg_sse2
SAD16XN 64, 2 ; sad_skip_16x64_sse2
%endif

; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
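
; The 8-wide loop above packs two 8-byte rows into each XMM register with
; movh/movhps before the psadbw, so four rows are consumed per iteration and
; n_rows is again a quarter of the height (an eighth when skipping).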

INIT_XMM sse2
SAD8XN 16    ; sad8x16_sse2
SAD8XN  8    ; sad8x8_sse2
SAD8XN  4    ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN 16, 2 ; sad_skip_8x16_sse2
SAD8XN  8, 2 ; sad_skip_8x8_sse2
%if CONFIG_REALTIME_ONLY==0
SAD8XN 32    ; sad8x32_sse2
SAD8XN 32, 1 ; sad8x32_avg_sse2
SAD8XN 32, 2 ; sad_skip_8x32_sse2
%endif

; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro
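
; The 4-wide loop above gathers four 4-byte rows into a single XMM register
; with movd/punpckldq/movlhps for both ref and src, so one psadbw per
; iteration covers four rows; n_rows is a quarter of the height (an eighth
; when skipping).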

INIT_XMM sse2
SAD4XN  8 ; sad4x8_sse2
SAD4XN  4 ; sad4x4_sse2
%if CONFIG_REALTIME_ONLY==0
SAD4XN 16 ; sad4x16_sse2
SAD4XN 16, 2 ; sad_skip_4x16_sse2
%endif