• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

14%include "aom_ports/x86_abi_support.asm"
15
;-----------------------------------------------------------------------
; HIGH_GET_PARAM_4
; Argument and constant setup for the loops built on HIGH_APPLY_FILTER_4.
; State on exit:
;   rsi  = arg(0) src_ptr            rdi = arg(2) output_ptr
;   rax  = arg(1) pixels_per_line    rdx = arg(3) out_pitch
;   rcx  = arg(4) output_height
;   xmm4 = filter words 3 and 4 interleaved (k3 k4 k3 k4 ...)
;   xmm3 = 0x40 in every dword (rounding term)
;   xmm5 = (1 << bps) - 1 in every word, xmm2 = 0 (clamp bounds)
; xmm1 is overwritten as scratch.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM_4 0
    ; Clamp bounds, derived from the bit depth in arg(6).
    mov         rdx, 0x00010001
    movsxd      rcx, DWORD PTR arg(6)       ; bps
    movq        xmm5, rdx
    movq        xmm2, rcx                   ; shift count for psllw
    pshufd      xmm5, xmm5, 0b              ; word value 1 in every lane
    movdqa      xmm1, xmm5                  ; keep the ones for the subtract
    psllw       xmm5, xmm2                  ; 1 << bps
    psubw       xmm5, xmm1                  ; upper bound: (1 << bps) - 1
    pxor        xmm2, xmm2                  ; lower bound: 0

    ; Filter taps. movdqa faults unless the table is 16-byte aligned.
    mov         rdx, arg(5)                 ; filter ptr
    movdqa      xmm3, [rdx]
    pshuflw     xmm4, xmm3, 11111111b       ; low words <- word 3 (k3)
    psrldq      xmm3, 8                     ; bring words 4..7 down
    pshuflw     xmm3, xmm3, 0b              ; low words <- old word 4 (k4)
    punpcklwd   xmm4, xmm3                  ; k3 k4 pairs for pmaddwd

    ; Rounding term; xmm3 is free again once the taps are packed.
    mov         rcx, 0x00000040
    movq        xmm3, rcx
    pshufd      xmm3, xmm3, 0               ; 0x40 in every dword

    ; Pointers, pitches and row count for the loop.
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr
    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endm
45
;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
; Filters one row of 4 words and steps to the next row.
; In:   xmm0, xmm1 = the two source vectors (low 4 words of each are used)
;       xmm4 = tap pairs, xmm3 = rounding, xmm5 / xmm2 = clamp max / min
;       rax / rdx = source / output pitch, scaled by 2 when added
; %1:   non-zero -> the result is averaged (pavgw) with the 4 words
;       already stored at [rdi] before being written back
; Out:  8 bytes written to [rdi]; rsi and rdi advanced; rcx decremented.
;       Flags are those of the final `dec rcx`, for the caller's jnz.
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1

    punpcklwd   xmm0, xmm1                  ; pair up the two inputs per lane
    lea         rsi, [rsi + 2*rax]          ; source is consumed: next row

    pmaddwd     xmm0, xmm4                  ; in0*k3 + in1*k4 per dword
    paddd       xmm0, xmm3                  ; + rounding term
    psrad       xmm0, 7                     ; >> 7
    packssdw    xmm0, xmm0                  ; back to words (signed saturate)

    ; keep the result inside [xmm2, xmm5]
    pminsw      xmm0, xmm5
    pmaxsw      xmm0, xmm2

%if %1
    movq        xmm1, [rdi]                 ; current destination words
    pavgw       xmm0, xmm1                  ; rounded average with them
%endif

    movq        [rdi], xmm0                 ; store 4 words
    lea         rdi, [rdi + 2*rdx]          ; next output row
    dec         rcx                         ; rows left; sets ZF for caller
%endm
69
;-----------------------------------------------------------------------
; HIGH_GET_PARAM
; Argument and constant setup for the loops built on HIGH_APPLY_FILTER_8
; and HIGH_APPLY_FILTER_16. The caller must have %define'd `max` and `min`
; as 16-byte-aligned memory operands before expanding this macro, because
; they are written with movdqa.
; State on exit:
;   rsi  = arg(0) src_ptr            rdi = arg(2) output_ptr
;   rax  = arg(1) pixels_per_line    rdx = arg(3) out_pitch
;   rcx  = arg(4) output_height
;   xmm7 = filter words 3 and 4 interleaved (k3k4k3k4k3k4k3k4)
;   xmm4 = 0x40 in every dword (rounding term)
;   max  = (1 << bps) - 1 in every word (also left in xmm3)
;   min  = 0 (also left in xmm5)
; xmm1 and xmm6 are overwritten as scratch.
;-----------------------------------------------------------------------
%macro HIGH_GET_PARAM 0
    ; Clamp bounds, derived from the bit depth in arg(6).
    mov         rdx, 0x00010001
    movsxd      rcx, DWORD PTR arg(6)       ; bps
    movq        xmm3, rdx
    movq        xmm5, rcx                   ; shift count for psllw
    pshufd      xmm3, xmm3, 0b              ; word value 1 in every lane
    movdqa      xmm1, xmm3                  ; keep the ones for the subtract
    psllw       xmm3, xmm5                  ; 1 << bps
    psubw       xmm3, xmm1                  ; upper bound: (1 << bps) - 1
    pxor        xmm5, xmm5                  ; lower bound: 0

    movdqa      max, xmm3                   ; spill both bounds to the
    movdqa      min, xmm5                   ; caller-provided slots

    ; Filter taps. movdqa faults unless the table is 16-byte aligned.
    mov         rdx, arg(5)                 ; filter ptr
    movdqa      xmm6, [rdx]
    pshuflw     xmm7, xmm6, 11111111b       ; low words <- word 3 (k3)
    pshufhw     xmm6, xmm6, 0b              ; high words <- word 4 (k4)
    psrldq      xmm6, 8                     ; move the k4 copies down
    punpcklwd   xmm7, xmm6                  ; k3 k4 pairs for pmaddwd

    ; Rounding term.
    mov         rcx, 0x00000040
    movq        xmm4, rcx
    pshufd      xmm4, xmm4, 0               ; 0x40 in every dword

    ; Pointers, pitches and row count for the loop.
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr
    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endm
103
;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg
; Filters one row of 8 words and steps to the next row.
; In:   xmm0, xmm1 = the two source vectors
;       xmm7 = tap pairs, xmm4 = rounding, max / min = clamp bounds
;       rax / rdx = source / output pitch, scaled by 2 when added
; %1:   non-zero -> the result is averaged (pavgw) with the 8 words
;       already stored at [rdi] before being written back
; Out:  16 bytes written to [rdi]; rsi and rdi advanced; rcx decremented.
;       Flags are those of the final `dec rcx`, for the caller's jnz.
; Scratch: xmm6 (and xmm1 when %1 is non-zero).
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 1
    movdqa      xmm6, xmm0                  ; copy for the upper half

    ; lower four lanes
    punpcklwd   xmm0, xmm1                  ; pair up the two inputs
    pmaddwd     xmm0, xmm7                  ; in0*k3 + in1*k4 per dword
    paddd       xmm0, xmm4                  ; + rounding term
    psrad       xmm0, 7                     ; >> 7

    ; upper four lanes
    punpckhwd   xmm6, xmm1
    pmaddwd     xmm6, xmm7
    paddd       xmm6, xmm4
    psrad       xmm6, 7

    packssdw    xmm0, xmm6                  ; both halves back to 8 words

    ; keep the result inside [min, max]
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi]                 ; current destination words
    pavgw       xmm0, xmm1                  ; rounded average with them
%endif
    movdqu      [rdi], xmm0                 ; store 8 words

    lea         rsi, [rsi + 2*rax]          ; next source row
    lea         rdi, [rdi + 2*rdx]          ; next output row
    dec         rcx                         ; rows left; sets ZF for caller
%endm
131
;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_16 avg
; Filters one row of 16 words and steps to the next row.
; In:   xmm0, xmm1 = the two source vectors for words 0..7
;       xmm2, xmm3 = the two source vectors for words 8..15
;       xmm7 = tap pairs, xmm4 = rounding, max / min = clamp bounds
;       rax / rdx = source / output pitch, scaled by 2 when added
; %1:   non-zero -> the result is averaged (pavgw) with the 16 words
;       already stored at [rdi] before being written back
; Out:  32 bytes written to [rdi]; rsi and rdi advanced; rcx decremented.
;       Flags are those of the final `dec rcx`, for the caller's jnz.
; Scratch: xmm5, xmm6 (and xmm1, xmm3 when %1 is non-zero).
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_16 1
    ; ---- words 0..7: xmm0 / xmm1, upper half worked in xmm5 ----
    movdqa      xmm5, xmm0
    punpckhwd   xmm5, xmm1                  ; upper four input pairs
    punpcklwd   xmm0, xmm1                  ; lower four input pairs
    pmaddwd     xmm5, xmm7                  ; in0*k3 + in1*k4 per dword
    pmaddwd     xmm0, xmm7
    paddd       xmm5, xmm4                  ; + rounding term
    paddd       xmm0, xmm4
    psrad       xmm5, 7                     ; >> 7
    psrad       xmm0, 7
    packssdw    xmm0, xmm5                  ; back to 8 words
    pminsw      xmm0, max                   ; clamp to [min, max]
    pmaxsw      xmm0, min

    ; ---- words 8..15: xmm2 / xmm3, upper half worked in xmm6 ----
    movdqa      xmm6, xmm2
    punpckhwd   xmm6, xmm3
    punpcklwd   xmm2, xmm3
    pmaddwd     xmm6, xmm7
    pmaddwd     xmm2, xmm7
    paddd       xmm6, xmm4
    paddd       xmm2, xmm4
    psrad       xmm6, 7
    psrad       xmm2, 7
    packssdw    xmm2, xmm6                  ; back to 8 words
    pminsw      xmm2, max                   ; clamp to [min, max]
    pmaxsw      xmm2, min

%if %1
    movdqu      xmm1, [rdi]                 ; current destination words
    movdqu      xmm3, [rdi + 16]
    pavgw       xmm0, xmm1                  ; rounded average with them
    pavgw       xmm2, xmm3
%endif
    movdqu      [rdi], xmm0                 ; store words 0..7
    movdqu      [rdi + 16], xmm2            ; store words 8..15

    lea         rsi, [rsi + 2*rax]          ; next source row
    lea         rdi, [rdi + 2*rdx]          ; next output row
    dec         rcx                         ; rows left; sets ZF for caller
%endm
177
178SECTION .text
179
;-----------------------------------------------------------------------
; aom_highbd_filter_block1d4_v2_sse2
; Two-tap filter producing 4 words per output row. Each output row
; combines the source row at [rsi] with the row one pitch further on,
; at [rsi + 2*rax].
; Arguments, as fetched by HIGH_GET_PARAM_4:
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
; The row counter is decremented before it is tested, so the loop body
; always runs at least once.
;-----------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d4_v2_sse2)
sym(aom_highbd_filter_block1d4_v2_sse2):
    ; prolog: frame pointer, argument shadowing, saved rsi/rdi
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4
.next_row:
    movq        xmm1, [rsi + 2*rax]         ; 4 words from the following row
    movq        xmm0, [rsi]                 ; 4 words from the current row

    HIGH_APPLY_FILTER_4 0                   ; no averaging with destination
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
203
;-----------------------------------------------------------------------
; aom_highbd_filter_block1d8_v2_sse2
; Two-tap filter producing 8 words per output row. Each output row
; combines the source row at [rsi] with the row one pitch further on,
; at [rsi + 2*rax].
; Arguments, as fetched by HIGH_GET_PARAM:
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
; Uses 32 bytes of aligned stack for the clamp bounds `max` and `min`.
; The row counter is decremented before it is tested, so the loop body
; always runs at least once.
;-----------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d8_v2_sse2)
sym(aom_highbd_filter_block1d8_v2_sse2):
    ; prolog: frame pointer, argument shadowing, saved xmm and rsi/rdi
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    ; Two 16-byte scratch slots on an aligned stack.
    ; NOTE(review): ALIGN_STACK is defined in x86_abi_support.asm; the
    ; `pop rsp` after the loop presumably reloads the stack pointer it
    ; saved -- confirm there.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 2
    %define min [rsp + 16 * 1]
    %define max [rsp + 16 * 0]

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2*rax]         ; 8 words from the following row
    movdqu      xmm0, [rsi]                 ; 8 words from the current row

    HIGH_APPLY_FILTER_8 0                   ; no averaging with destination
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; release the scratch slots and the alignment adjustment
    add         rsp, 16 * 2
    pop         rsp

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
237
;-----------------------------------------------------------------------
; aom_highbd_filter_block1d16_v2_sse2
; Two-tap filter producing 16 words per output row. Each output row
; combines the source row at [rsi] with the row one pitch further on,
; at [rsi + 2*rax].
; Arguments, as fetched by HIGH_GET_PARAM:
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
; Uses 32 bytes of aligned stack for the clamp bounds `max` and `min`.
; The row counter is decremented before it is tested, so the loop body
; always runs at least once.
;-----------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d16_v2_sse2)
sym(aom_highbd_filter_block1d16_v2_sse2):
    ; prolog: frame pointer, argument shadowing, saved xmm and rsi/rdi
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    ; Two 16-byte scratch slots on an aligned stack.
    ; NOTE(review): ALIGN_STACK is defined in x86_abi_support.asm; the
    ; `pop rsp` after the loop presumably reloads the stack pointer it
    ; saved -- confirm there.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 2
    %define min [rsp + 16 * 1]
    %define max [rsp + 16 * 0]

    HIGH_GET_PARAM
.next_row:
    ; words 0..7 of the current and the following row
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rsi + 2*rax]
    ; words 8..15 of the current and the following row
    movdqu      xmm2, [rsi + 16]
    movdqu      xmm3, [rsi + 2*rax + 16]

    HIGH_APPLY_FILTER_16 0                  ; no averaging with destination
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; release the scratch slots and the alignment adjustment
    add         rsp, 16 * 2
    pop         rsp

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
273
;-----------------------------------------------------------------------
; aom_highbd_filter_block1d4_h2_sse2
; Two-tap filter producing 4 words per output row. Each output word
; combines a source word with the word 2 bytes after it in the same row.
; Arguments, as fetched by HIGH_GET_PARAM_4:
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
; Each iteration reads a full 16 bytes at [rsi], although only the low
; five words feed the result.
; The row counter is decremented before it is tested, so the loop body
; always runs at least once.
;-----------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d4_h2_sse2)
sym(aom_highbd_filter_block1d4_h2_sse2):
    ; prolog: frame pointer, argument shadowing, saved rsi/rdi
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi

    HIGH_GET_PARAM_4
.next_row:
    movdqu      xmm1, [rsi]                 ; 8 source words
    movdqa      xmm0, xmm1                  ; xmm0 = words starting at [rsi]
    psrldq      xmm1, 2                     ; xmm1 = same row, one word later

    HIGH_APPLY_FILTER_4 0                   ; no averaging with destination
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
298
;-----------------------------------------------------------------------
; aom_highbd_filter_block1d8_h2_sse2
; Two-tap filter producing 8 words per output row. Each output word
; combines a source word with the word 2 bytes after it in the same row.
; Arguments, as fetched by HIGH_GET_PARAM:
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
; Uses 32 bytes of aligned stack for the clamp bounds `max` and `min`.
; The row counter is decremented before it is tested, so the loop body
; always runs at least once.
;-----------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d8_h2_sse2)
sym(aom_highbd_filter_block1d8_h2_sse2):
    ; prolog: frame pointer, argument shadowing, saved xmm and rsi/rdi
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi

    ; Two 16-byte scratch slots on an aligned stack.
    ; NOTE(review): ALIGN_STACK is defined in x86_abi_support.asm; the
    ; `pop rsp` after the loop presumably reloads the stack pointer it
    ; saved -- confirm there.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 2
    %define min [rsp + 16 * 1]
    %define max [rsp + 16 * 0]

    HIGH_GET_PARAM
.next_row:
    movdqu      xmm1, [rsi + 2]             ; same row, one word later
    movdqu      xmm0, [rsi]                 ; 8 source words

    HIGH_APPLY_FILTER_8 0                   ; no averaging with destination
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; release the scratch slots and the alignment adjustment
    add         rsp, 16 * 2
    pop         rsp

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
332
;-----------------------------------------------------------------------
; aom_highbd_filter_block1d16_h2_sse2
; Two-tap filter producing 16 words per output row. Each output word
; combines a source word with the word 2 bytes after it in the same row.
; Arguments, as fetched by HIGH_GET_PARAM:
;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
; Uses 32 bytes of aligned stack for the clamp bounds `max` and `min`.
; The row counter is decremented before it is tested, so the loop body
; always runs at least once.
;-----------------------------------------------------------------------
globalsym(aom_highbd_filter_block1d16_h2_sse2)
sym(aom_highbd_filter_block1d16_h2_sse2):
    ; prolog: frame pointer, argument shadowing, saved xmm and rsi/rdi
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi

    ; Two 16-byte scratch slots on an aligned stack.
    ; NOTE(review): ALIGN_STACK is defined in x86_abi_support.asm; the
    ; `pop rsp` after the loop presumably reloads the stack pointer it
    ; saved -- confirm there.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 2
    %define min [rsp + 16 * 1]
    %define max [rsp + 16 * 0]

    HIGH_GET_PARAM
.next_row:
    ; words 8..15 and their one-word-later neighbours
    movdqu      xmm2, [rsi + 16]
    movdqu      xmm3, [rsi + 18]
    ; words 0..7 and their one-word-later neighbours
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rsi + 2]

    HIGH_APPLY_FILTER_16 0                  ; no averaging with destination
    jnz         .next_row                   ; ZF comes from the macro's dec rcx

    ; release the scratch slots and the alignment adjustment
    add         rsp, 16 * 2
    pop         rsp

    ; epilog: undo the prolog in reverse order
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
368