;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Processes four rows of four pixels. For every row, four bytes read
; from s are zero-extended to 16-bit words, added with signed saturation
; to four words read from q, clamped to the unsigned byte range and
; written as four bytes to d.
;
;   s      - byte input; consecutive rows are 16 bytes apart
;   q      - 16-bit input; consecutive rows are 32 bytes apart
;   d      - byte output; consecutive rows are `stride` bytes apart
;   stride - output row pitch (sign-extended from 32 bits)
;
; Scratch: rax, rdx, mm0-mm4. rsi and rdi are saved and restored.
; No emms is issued in this routine.
;-----------------------------------------------------------------------

; RECON_B_ROW_MMX mmreg, row, dst_address
;   Handles one row. Expects rsi = s, rdx = q and mm0 = 0.
;   %1 = MMX work register, %2 = row index (0-3),
;   %3 = address expression for the destination row.
%macro RECON_B_ROW_MMX 3
        movd        %1, [rsi + 16*%2]       ; four input bytes
        punpcklbw   %1, mm0                 ; bytes -> words (high byte 0)
        paddsw      %1, [rdx + 32*%2]       ; add the row's four words
        packuswb    %1, mm0                 ; words -> bytes, clamped 0..255
        movd        [%3], %1                ; write four output bytes
%endmacro

global sym(vp8_recon_b_mmx)
sym(vp8_recon_b_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        pxor        mm0, mm0                ; zero, used by unpack and pack
        movsxd      rax, dword ptr arg(3)   ; stride
        mov         rdx, arg(1)             ; q
        mov         rsi, arg(0)             ; s
        mov         rdi, arg(2)             ; d

        RECON_B_ROW_MMX mm1, 0, rdi
        RECON_B_ROW_MMX mm2, 1, rdi + rax
        RECON_B_ROW_MMX mm3, 2, rdi + 2*rax

        ; row 3 lives at d + 3*stride, which a single addressing mode
        ; cannot express, so step the base pointer first
        lea         rdi, [rdi + 2*rax]
        RECON_B_ROW_MMX mm4, 3, rdi + rax

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
60
61
;-----------------------------------------------------------------------
; void copy_mem8x8_mmx(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies eight rows of eight bytes from src to dst using one 64-bit MMX
; move per row. Rows are handled in groups of 3, 3 and 2; within each
; group every row is loaded before any row of that group is stored.
;
;   src        - first source row
;   src_stride - source row pitch in bytes (sign-extended from 32 bits)
;   dst        - first destination row
;   dst_stride - destination row pitch in bytes (sign-extended from 32 bits)
;
; Scratch: rax, rcx, mm0-mm5. rsi and rdi are saved and restored.
; No emms is issued in this routine.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem8x8_mmx)
sym(vp8_copy_mem8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0)             ; src
        movsxd      rax, dword ptr arg(1)   ; src_stride
        mov         rdi, arg(2)             ; dst
        movsxd      rcx, dword ptr arg(3)   ; dst_stride

        ; ---- rows 0-2 ----
        movq        mm0, [rsi]
        movq        mm1, [rsi + rax]
        movq        mm2, [rsi + rax*2]

        movq        [rdi], mm0
        movq        [rdi + rcx], mm1
        movq        [rdi + rcx*2], mm2

        ; step both pointers down three rows
        lea         rsi, [rsi + rax*2]
        add         rsi, rax
        lea         rdi, [rdi + rcx*2]
        add         rdi, rcx

        ; ---- rows 3-5 ----
        movq        mm3, [rsi]
        movq        mm4, [rsi + rax]
        movq        mm5, [rsi + rax*2]

        movq        [rdi], mm3
        movq        [rdi + rcx], mm4
        movq        [rdi + rcx*2], mm5

        ; step both pointers down three more rows
        lea         rsi, [rsi + rax*2]
        add         rsi, rax
        lea         rdi, [rdi + rcx*2]
        add         rdi, rcx

        ; ---- rows 6-7 ----
        movq        mm0, [rsi]
        movq        mm1, [rsi + rax]

        movq        [rdi], mm0
        movq        [rdi + rcx], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
123
124
;-----------------------------------------------------------------------
; void copy_mem8x4_mmx(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies four rows of eight bytes from src to dst using one 64-bit MMX
; move per row. Rows 0-2 are all loaded and then all stored; row 3 is
; loaded and stored afterwards.
;
;   src        - first source row
;   src_stride - source row pitch in bytes (sign-extended from 32 bits)
;   dst        - first destination row
;   dst_stride - destination row pitch in bytes (sign-extended from 32 bits)
;
; Scratch: rax, rcx, mm0-mm3. rsi and rdi are saved and restored.
; No emms is issued in this routine.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem8x4_mmx)
sym(vp8_copy_mem8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0)             ; src
        movsxd      rax, dword ptr arg(1)   ; src_stride
        mov         rdi, arg(2)             ; dst
        movsxd      rcx, dword ptr arg(3)   ; dst_stride

        ; ---- rows 0-2 ----
        movq        mm0, [rsi]
        movq        mm1, [rsi + rax]
        movq        mm2, [rsi + rax*2]

        movq        [rdi], mm0
        movq        [rdi + rcx], mm1
        movq        [rdi + rcx*2], mm2

        ; ---- row 3 ----
        ; move each base to row 1 so that base + 2*pitch addresses row 3
        add         rsi, rax
        add         rdi, rcx

        movq        mm3, [rsi + rax*2]
        movq        [rdi + rcx*2], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
167
168
;-----------------------------------------------------------------------
; void copy_mem16x16_mmx(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies sixteen rows of sixteen bytes from src to dst. Each row is moved
; as two 64-bit MMX halves (offsets 0 and 8). The work is laid out as
; five groups of three rows followed by one final row; the groups are
; expanded at assembly time, so the emitted code is straight-line.
; Within a group all six loads are issued before the six stores.
;
;   src        - first source row
;   src_stride - source row pitch in bytes (sign-extended from 32 bits)
;   dst        - first destination row
;   dst_stride - destination row pitch in bytes (sign-extended from 32 bits)
;
; Scratch: rax, rcx, mm0-mm5. rsi and rdi are saved and restored.
; No emms is issued in this routine.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_mmx)
sym(vp8_copy_mem16x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi, arg(0)             ; src
        movsxd      rax, dword ptr arg(1)   ; src_stride

        mov         rdi, arg(2)             ; dst
        movsxd      rcx, dword ptr arg(3)   ; dst_stride

        ; rows 0-14: five groups of three rows
%rep 5
        ; load three source rows (low half, then high half of each)
        movq        mm0, [rsi]
        movq        mm3, [rsi + 8]
        movq        mm1, [rsi + rax]
        movq        mm4, [rsi + rax + 8]
        movq        mm2, [rsi + rax*2]
        movq        mm5, [rsi + rax*2 + 8]

        ; source pointer += 3 rows
        lea         rsi, [rsi + rax*2]
        add         rsi, rax

        ; store the three rows
        movq        [rdi], mm0
        movq        [rdi + 8], mm3
        movq        [rdi + rcx], mm1
        movq        [rdi + rcx + 8], mm4
        movq        [rdi + rcx*2], mm2
        movq        [rdi + rcx*2 + 8], mm5

        ; destination pointer += 3 rows
        lea         rdi, [rdi + rcx*2]
        add         rdi, rcx
%endrep

        ; row 15
        movq        mm0, [rsi]
        movq        mm3, [rsi + 8]

        movq        [rdi], mm0
        movq        [rdi + 8], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
322