;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

section .text
    global sym(vp8_short_fdct4x4_mmx)
    global sym(vp8_short_fdct8x4_wmt)

%define         DCTCONSTANTSBITS         (16)
%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<16)
%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<16)
%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<16)

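; Note: the three cosine constants above are Q16 fixed point (scaled by 1<<16,
; not 1<<15). A minimal C sketch that reproduces the values (illustrative only,
; not part of this file) might be:
;
;     #include <math.h>
;     #include <stdio.h>
;     int main(void) {
;         /* round(cos(k*pi/8) * 65536) for k = 1, 2, 3 -> 60547, 46341, 25080 */
;         for (int k = 1; k <= 3; k++)
;             printf("%d\n", (int)(cos(k * M_PI / 8.0) * 65536.0 + 0.5));
;         return 0;
;     }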

;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
sym(vp8_short_fdct4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog
        mov     rsi,    arg(0) ;input
        mov     rdi,    arg(1) ;output

        lea     rdx,    [GLOBAL(dct_const_mmx)]
        movsxd  rax,    dword ptr arg(2) ;pitch

        lea     rcx,    [rsi + rax*2]
        ; read the input data
        movq    mm0,    [rsi]
        movq    mm1,    [rsi + rax]

        movq    mm2,    [rcx]
        movq    mm3,    [rcx + rax]
        ; get the constants
        ; shift left by 3 (multiply by 8) to keep precision
        psllw   mm0,    3
        psllw   mm1,    3

        psllw   mm2,    3
        psllw   mm3,    3

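        ; A brief note on the idiom used below: punpcklwd/punpckhwd interleave
        ; 16-bit words from pairs of rows, then punpckldq/punpckhdq interleave
        ; the resulting 32-bit pairs, giving a full 4x4 word transpose. After
        ; it, each MMX register holds one column of the block, so the 4-point
        ; butterfly that follows runs four 1-D transforms in parallel, one per
        ; 16-bit lane. The lane comments are written as row-then-column digits.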
        ; transpose for the second stage
        movq    mm4,    mm0         ; 00 01 02 03
        movq    mm5,    mm2         ; 20 21 22 23

        punpcklwd   mm0,    mm1     ; 00 10 01 11
        punpckhwd   mm4,    mm1     ; 02 12 03 13

        punpcklwd   mm2,    mm3     ; 20 30 21 31
        punpckhwd   mm5,    mm3     ; 22 32 23 33


        movq        mm1,    mm0     ; 00 10 01 11
        punpckldq   mm0,    mm2     ; 00 10 20 30

        punpckhdq   mm1,    mm2     ; 01 11 21 31

        movq        mm2,    mm4     ; 02 12 03 13
        punpckldq   mm2,    mm5     ; 02 12 22 32

        punpckhdq   mm4,    mm5     ; 03 13 23 33
        movq        mm3,    mm4


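        ; A rough scalar sketch of the 4-point stage implemented below (my
        ; reading of the code, for reference only). mulhi() is the signed high
        ; 16 bits of a 16x16 product, i.e. what pmulhw computes:
        ;
        ;     a = x0 + x3;  b = x1 + x2;  c = x1 - x2;  d = x0 - x3;
        ;     y0 = (a + b) + mulhi(a + b, C2);         /* ~ (a+b)*cos(2*pi/8) */
        ;     y2 = (a - b) + mulhi(a - b, C2);         /* ~ (a-b)*cos(2*pi/8) */
        ;     y1 = mulhi(c, C3) + (d + mulhi(d, C1));  /* ~ c*cos(3*pi/8) + d*cos(pi/8) */
        ;     y3 = mulhi(d, C3) - (c + mulhi(c, C1));  /* ~ d*cos(3*pi/8) - c*cos(pi/8) */
        ;
        ; C1 and C2 exceed 0x7fff, so pmulhw treats them as negative; adding
        ; the original value back afterwards recovers the intended unsigned
        ; Q16 multiply: x + mulhi(x, C - 65536) ~= x*C/65536.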
        ; first stage
        movq    mm5,    mm0
        movq    mm4,    mm1

        paddw   mm0,    mm3         ; a = 0 + 3
        paddw   mm1,    mm2         ; b = 1 + 2

        psubw   mm4,    mm2         ; c = 1 - 2
        psubw   mm5,    mm3         ; d = 0 - 3


        ; output 0 and 2
        movq    mm6,    [rdx +  16] ; c2
        movq    mm2,    mm0         ; a

        paddw   mm0,    mm1         ; a + b
        psubw   mm2,    mm1         ; a - b

        movq    mm1,    mm0         ; a + b
        pmulhw  mm0,    mm6         ; 00 01 02 03

        paddw   mm0,    mm1         ; output 00 01 02 03
        pmulhw  mm6,    mm2         ; 20 21 22 23

        paddw   mm2,    mm6         ; output 20 21 22 23

        ; output 1 and 3
        movq    mm6,    [rdx +  8]  ; c1
        movq    mm7,    [rdx + 24]  ; c3

        movq    mm1,    mm4         ; c
        movq    mm3,    mm5         ; d

        pmulhw  mm1,    mm7         ; c * c3
        pmulhw  mm3,    mm6         ; d * c1

        paddw   mm3,    mm5         ; d * c1 rounded
        paddw   mm1,    mm3         ; output 10 11 12 13

        movq    mm3,    mm4         ; c
        pmulhw  mm5,    mm7         ; d * c3

        pmulhw  mm4,    mm6         ; c * c1
        paddw   mm3,    mm4         ; c * c1 rounded

        psubw   mm5,    mm3         ; output 30 31 32 33
        movq    mm3,    mm5


        ; done with vertical
        ; transpose for the second stage
        movq    mm4,    mm0         ; 00 01 02 03
        movq    mm5,    mm2         ; 20 21 22 23

        punpcklwd   mm0,    mm1     ; 00 10 01 11
        punpckhwd   mm4,    mm1     ; 02 12 03 13

        punpcklwd   mm2,    mm3     ; 20 30 21 31
        punpckhwd   mm5,    mm3     ; 22 32 23 33


        movq        mm1,    mm0     ; 00 10 01 11
        punpckldq   mm0,    mm2     ; 00 10 20 30

        punpckhdq   mm1,    mm2     ; 01 11 21 31

        movq        mm2,    mm4     ; 02 12 03 13
        punpckldq   mm2,    mm5     ; 02 12 22 32

        punpckhdq   mm4,    mm5     ; 03 13 23 33
        movq        mm3,    mm4


        ; first stage
        movq    mm5,    mm0
        movq    mm4,    mm1

        paddw   mm0,    mm3         ; a = 0 + 3
        paddw   mm1,    mm2         ; b = 1 + 2

        psubw   mm4,    mm2         ; c = 1 - 2
        psubw   mm5,    mm3         ; d = 0 - 3


        ; output 0 and 2
        movq    mm6,    [rdx +  16] ; c2
        movq    mm2,    mm0         ; a
        paddw   mm0,    mm1         ; a + b

        psubw   mm2,    mm1         ; a - b

        movq    mm1,    mm0         ; a + b
        pmulhw  mm0,    mm6         ; 00 01 02 03

        paddw   mm0,    mm1         ; output 00 01 02 03
        pmulhw  mm6,    mm2         ; 20 21 22 23

        paddw   mm2,    mm6         ; output 20 21 22 23


        ; output 1 and 3
        movq    mm6,    [rdx +  8]  ; c1
        movq    mm7,    [rdx + 24]  ; c3

        movq    mm1,    mm4         ; c
        movq    mm3,    mm5         ; d

        pmulhw  mm1,    mm7         ; c * c3
        pmulhw  mm3,    mm6         ; d * c1

        paddw   mm3,    mm5         ; d * c1 rounded
        paddw   mm1,    mm3         ; output 10 11 12 13

        movq    mm3,    mm4         ; c
        pmulhw  mm5,    mm7         ; d * c3

        pmulhw  mm4,    mm6         ; c * c1
        paddw   mm3,    mm4         ; c * c1 rounded

        psubw   mm5,    mm3         ; output 30 31 32 33
        movq    mm3,    mm5
        ; done with vertical

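        ; The pcmpeqw/psrlw/psllw sequence below builds the constant 4 in
        ; every 16-bit lane without a memory load (all-ones -> 1 -> 4); adding
        ; it before the arithmetic shift right by 3 gives a rounded
        ; (x + 4) >> 3 descale of each coefficient.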
        pcmpeqw mm4,    mm4
        pcmpeqw mm5,    mm5
        psrlw   mm4,    15
        psrlw   mm5,    15

        psllw   mm4,    2
        psllw   mm5,    2

        paddw   mm0,    mm4
        paddw   mm1,    mm5
        paddw   mm2,    mm4
        paddw   mm3,    mm5

        psraw   mm0, 3
        psraw   mm1, 3
        psraw   mm2, 3
        psraw   mm3, 3

        movq        [rdi   ],   mm0
        movq        [rdi+ 8],   mm1
        movq        [rdi+16],   mm2
        movq        [rdi+24],   mm3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


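; The _wmt (SSE2) variant below appears to apply the same two-pass transform
; to eight columns at once, i.e. two horizontally adjacent 4x4 blocks: the low
; halves of xmm0-xmm3 are written to output[0..15] and the high halves to
; output[16..31] (see the movq/psrldq stores at the end of the routine).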
;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
sym(vp8_short_fdct8x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog
        mov         rsi,    arg(0) ;input
        mov         rdi,    arg(1) ;output

        lea         rdx,    [GLOBAL(dct_const_xmm)]
        movsxd      rax,    dword ptr arg(2) ;pitch

        lea         rcx,    [rsi + rax*2]
        ; read the input data
        movdqa      xmm0,       [rsi]
        movdqa      xmm2,       [rsi + rax]

        movdqa      xmm4,       [rcx]
        movdqa      xmm3,       [rcx + rax]
        ; get the constants
        ; shift left by 3 (multiply by 8) to keep precision
        psllw       xmm0,        3
        psllw       xmm2,        3

        psllw       xmm4,        3
        psllw       xmm3,        3

        ; transpose for the second stage
        movdqa      xmm1,       xmm0         ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4         ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2         ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2         ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3         ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3         ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0         ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4         ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4         ; 02 12 22 32 03 13 23 33


        movdqa      xmm4,       xmm1         ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5         ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5         ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2         ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1         ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1         ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0         ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4         ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4         ; 01 11 21 31 05 15 25 35

        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3         ; a = 0 + 3
        paddw       xmm1,       xmm2         ; b = 1 + 2

        psubw       xmm4,       xmm2         ; c = 1 - 2
        psubw       xmm5,       xmm3         ; d = 0 - 3


        ; output 0 and 2
        movdqa      xmm6,       [rdx +  32] ; c2
        movdqa      xmm2,       xmm0         ; a

        paddw       xmm0,       xmm1         ; a + b
        psubw       xmm2,       xmm1         ; a - b

        movdqa      xmm1,       xmm0         ; a + b
        pmulhw      xmm0,       xmm6         ; 00 01 02 03

        paddw       xmm0,       xmm1         ; output 00 01 02 03
        pmulhw      xmm6,       xmm2         ; 20 21 22 23

        paddw       xmm2,       xmm6         ; output 20 21 22 23

        ; output 1 and 3
        movdqa      xmm6,       [rdx + 16]  ; c1
        movdqa      xmm7,       [rdx + 48]  ; c3

        movdqa      xmm1,       xmm4         ; c
        movdqa      xmm3,       xmm5         ; d

        pmulhw      xmm1,       xmm7         ; c * c3
        pmulhw      xmm3,       xmm6         ; d * c1

        paddw       xmm3,       xmm5         ; d * c1 rounded
        paddw       xmm1,       xmm3         ; output 10 11 12 13

        movdqa      xmm3,       xmm4         ; c
        pmulhw      xmm5,       xmm7         ; d * c3

        pmulhw      xmm4,       xmm6         ; c * c1
        paddw       xmm3,       xmm4         ; c * c1 rounded

        psubw       xmm5,       xmm3         ; output 30 31 32 33
        movdqa      xmm3,       xmm5


        ; done with vertical
        ; transpose for the second stage
        movdqa      xmm4,       xmm2         ; 02 12 22 32 06 16 26 36
        movdqa      xmm2,       xmm1         ; 01 11 21 31 05 15 25 35

        movdqa      xmm1,       xmm0         ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm4         ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm2         ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm1,       xmm2         ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm4,       xmm3         ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37

        movdqa      xmm2,       xmm0         ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm4         ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm2,       xmm4         ; 20 21 22 23 30 31 32 33


        movdqa      xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
        punpckldq   xmm4,       xmm5         ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm1,       xmm5         ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm2         ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm1         ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm2,       xmm1         ; 20 21 22 23 24 25 26 27

        movdqa      xmm1,       xmm0         ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm4         ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm1,       xmm4         ; 10 11 12 13 14 15 16 17

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3         ; a = 0 + 3
        paddw       xmm1,       xmm2         ; b = 1 + 2

        psubw       xmm4,       xmm2         ; c = 1 - 2
        psubw       xmm5,       xmm3         ; d = 0 - 3


        ; output 0 and 2
        movdqa      xmm6,       [rdx +  32] ; c2
        movdqa      xmm2,       xmm0         ; a

        paddw       xmm0,       xmm1         ; a + b
        psubw       xmm2,       xmm1         ; a - b

        movdqa      xmm1,       xmm0         ; a + b
        pmulhw      xmm0,       xmm6         ; 00 01 02 03

        paddw       xmm0,       xmm1         ; output 00 01 02 03
        pmulhw      xmm6,       xmm2         ; 20 21 22 23

        paddw       xmm2,       xmm6         ; output 20 21 22 23

        ; output 1 and 3
        movdqa      xmm6,       [rdx + 16]  ; c1
        movdqa      xmm7,       [rdx + 48]  ; c3

        movdqa      xmm1,       xmm4         ; c
        movdqa      xmm3,       xmm5         ; d

        pmulhw      xmm1,       xmm7         ; c * c3
        pmulhw      xmm3,       xmm6         ; d * c1

        paddw       xmm3,       xmm5         ; d * c1 rounded
        paddw       xmm1,       xmm3         ; output 10 11 12 13

        movdqa      xmm3,       xmm4         ; c
        pmulhw      xmm5,       xmm7         ; d * c3

        pmulhw      xmm4,       xmm6         ; c * c1
        paddw       xmm3,       xmm4         ; c * c1 rounded

        psubw       xmm5,       xmm3         ; output 30 31 32 33
        movdqa      xmm3,       xmm5
        ; done with vertical


        pcmpeqw     xmm4,       xmm4
        pcmpeqw     xmm5,       xmm5
        psrlw       xmm4,       15
        psrlw       xmm5,       15

        psllw       xmm4,       2
        psllw       xmm5,       2

        paddw       xmm0,       xmm4
        paddw       xmm1,       xmm5
        paddw       xmm2,       xmm4
        paddw       xmm3,       xmm5

        psraw       xmm0,       3
        psraw       xmm1,       3
        psraw       xmm2,       3
        psraw       xmm3,       3

        movq        QWORD PTR[rdi   ],   xmm0
        movq        QWORD PTR[rdi+ 8],   xmm1
        movq        QWORD PTR[rdi+16],   xmm2
        movq        QWORD PTR[rdi+24],   xmm3

        psrldq      xmm0,       8
        psrldq      xmm1,       8
        psrldq      xmm2,       8
        psrldq      xmm3,       8

        movq        QWORD PTR[rdi+32],   xmm0
        movq        QWORD PTR[rdi+40],   xmm1
        movq        QWORD PTR[rdi+48],   xmm2
        movq        QWORD PTR[rdi+56],   xmm3
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
;static const unsigned int dct1st_stage_rounding_mmx[2] =
align 16
dct1st_stage_rounding_mmx:
    times 2 dd 8192


;static const unsigned int dct2nd_stage_rounding_mmx[2] =
align 16
dct2nd_stage_rounding_mmx:
    times 2 dd 32768


;static const short dct_matrix[4][4]=
align 16
dct_matrix:
    times 4 dw 23170

    dw  30274
    dw  12540
    dw -12540
    dw -30274

    dw 23170
    times 2 dw -23170
    dw 23170

    dw  12540
    dw -30274
    dw  30274
    dw -12540

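; Note: the two routines above only reference dct_const_mmx and dct_const_xmm
; below; dct1st_stage_rounding_mmx, dct2nd_stage_rounding_mmx and dct_matrix
; do not appear to be used anywhere in this file.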
;static const unsigned short dct_const_mmx[4 * 4]=
align 16
dct_const_mmx:
    times 4 dw 0
    times 4 dw 60547
    times 4 dw 46341
    times 4 dw 25080


;static const unsigned short dct_const_xmm[8 * 4]=
align 16
dct_const_xmm:
    times 8 dw 0
    times 8 dw 60547
    times 8 dw 46341
    times 8 dw 25080