• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; Simple IDCT MMX
3;
4; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5;
6; Conversion from gcc syntax to x264asm syntax with minimal modifications
7; by James Darnley <jdarnley@obe.tv>.
8;
9; This file is part of FFmpeg.
10;
11; FFmpeg is free software; you can redistribute it and/or
12; modify it under the terms of the GNU Lesser General Public
13; License as published by the Free Software Foundation; either
14; version 2.1 of the License, or (at your option) any later version.
15;
16; FFmpeg is distributed in the hope that it will be useful,
17; but WITHOUT ANY WARRANTY; without even the implied warranty of
18; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19; Lesser General Public License for more details.
20;
21; You should have received a copy of the GNU Lesser General Public
22; License along with FFmpeg; if not, write to the Free Software
23; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24;/
25
26%include "libavutil/x86/x86util.asm"
27
28%if ARCH_X86_32
29SECTION_RODATA
30
31cextern pb_80
32
; wm1010: word mask with words 1 and 3 set and words 0 and 2 clear.
;         DC_COND_IDCT ANDs it with its first input qword so that words 0
;         and 2 of that qword are ignored by the macro's zero test.
; d40000: dword pair (4 << 16, 0). It is added on the short (%%1) path of
;         DC_COND_IDCT, after the input has been shifted left by 16 and
;         before the arithmetic right shift by 13.
33wm1010: dw 0, 0xffff, 0, 0xffff
34d40000: dd 4 << 16, 0
35
36; 23170.475006
37; 22725.260826
38; 21406.727617
39; 19265.545870
40; 16384.000000
41; 12872.826198
42; 8866.956905
43; 4520.335430
44
; C0..C7: 16-bit fixed-point constants, Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14)
; rounded to the nearest integer. The exception is C4, which is defined as
; 16383, one below the exact value 16384 (hence the "- 0.5" in its comment).
; NOTE(review): C0 is not used by the coeffs table or by any macro in the
; part of the file reviewed here -- confirm whether it is needed elsewhere.
45%define C0 23170 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46%define C1 22725 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
47%define C2 21407 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
48%define C3 19266 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
49%define C4 16383 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
50%define C5 12873 ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
51%define C6 8867  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
52%define C7 4520  ; cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
53
; ROW_SHIFT is used to build the value 1 << (ROW_SHIFT - 1) stored in the
; first two qwords of the coeffs table.
; NOTE(review): COL_SHIFT is not referenced in the part of the file reviewed
; here, and the meaning of the trailing "6" is not evident -- confirm.
54%define ROW_SHIFT 11
55%define COL_SHIFT 20 ; 6
56
; coeffs: table of 16-bit words addressed by the macros as [coeffs + N].
; Each qword is one 4-word pmaddwd / paddd operand. Words are listed here in
; memory order (word 0 first); the operand comments in the macros list them
; from the highest word to the lowest.
;   +0    (1 << (ROW_SHIFT - 1), 0) twice: bias added by Z_COND_IDCT
;   +8    (1 << (ROW_SHIFT - 1), 1), (1 << (ROW_SHIFT - 1), 0): bias added
;         by DC_COND_IDCT; the extra 1 in word 1 adds 1 << 16 to the low
;         dword only
;   +16    C4,  C4      +24    C4, -C4
;   +32    C2,  C6      +40    C6, -C2
;   +48    C1,  C3      +56    C5,  C7
;   +64    C3, -C7      +72   -C1, -C5
;   +80    C5, -C1      +88    C7,  C3
;   +96    C7, -C5      +104   C3, -C1
; The order and size of the rows must not change without updating every
; [coeffs + N] reference.
57coeffs:
58    dw 1 << (ROW_SHIFT - 1), 0
59    dw 1 << (ROW_SHIFT - 1), 0
60    dw 1 << (ROW_SHIFT - 1), 1
61    dw 1 << (ROW_SHIFT - 1), 0
62
63    dw C4,  C4,  C4,  C4
64    dw C4, -C4,  C4, -C4
65
66    dw C2,  C6,  C2,  C6
67    dw C6, -C2,  C6, -C2
68
69    dw C1,  C3,  C1,  C3
70    dw C5,  C7,  C5,  C7
71
72    dw C3, -C7,  C3, -C7
73    dw -C1, -C5, -C1, -C5
74
75    dw C5, -C1,  C5, -C1
76    dw C7,  C3,  C7,  C3
77
78    dw C7, -C5,  C7, -C5
79    dw C3, -C1,  C3, -C1
80
81SECTION .text
82
; DC_COND_IDCT in0, in1, in2, in3, dst, unused, shift
;
; Loads four qwords from blockq + %1..%4 and writes four result qwords to
; [%5], [8 + %5], [16 + %5] and [24 + %5].
;   %1-%4  byte offsets from blockq of the input qwords. Per the operand
;          comments they hold the coefficient pairs (0,4), (2,6), (1,3) and
;          (5,7) of two input vectors (upper-case / lower-case letters).
;   %5     destination address expression (used inside [] with +8/+16/+24)
;   %6     not referenced by the macro body
;   %7     arithmetic right-shift count applied to every 32-bit result
; Full path output layout (high dword pair, low dword pair of each qword):
;   [%5]      A1+B1, A0+B0      [8 + %5]   A3+B3, A2+B2
;   [16 + %5] A2-B2, A3-B3      [24 + %5]  A0-B0, A1-B1
; If every input word other than words 0 and 2 of the first qword is zero,
; the short path at %%1 is taken instead.
; Overwrites mm0-mm7, t0d and the flags.
83%macro DC_COND_IDCT 7
84    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
85    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
86    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
87    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
    ; Zero test: OR together all inputs except words 0 and 2 of mm0 (masked
    ; out by wm1010). packssdw folds the 64-bit OR into 32 bits; signed
    ; saturation never turns a non-zero dword into zero, so t0d is zero only
    ; if all tested words were zero.
88    movq            mm4, [wm1010]
89    pand            mm4, mm0
90    por             mm4, mm1
91    por             mm4, mm2
92    por             mm4, mm3
93    packssdw        mm4, mm4
94    movd            t0d, mm4
95    or              t0d, t0d
96    jz              %%1
    ; Full path. Even part A0..A3 is built from mm0/mm1, odd part B0..B3
    ; from mm2/mm3. The bias at [coeffs + 8] is added to both C4 terms.
97    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
98    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
99    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
100    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
101    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
102    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
103    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
104    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
105    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
106    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
107    paddd           mm4, [coeffs + 8]
108    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
109    paddd           mm4, mm5            ; A0             a0
110    psubd           mm6, mm5            ; A3             a3
111    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
112    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
113    paddd           mm0, [coeffs + 8]
114    paddd           mm1, mm0            ; A1             a1
    ; A2 = 2*x - A1, where x is the biased -C4R4+C4R0 term held in mm0.
115    paddd           mm0, mm0
116    psubd           mm0, mm1            ; A2             a2
117    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
118    paddd           mm7, mm5            ; B0             b0
119    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
120    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
121    paddd           mm7, mm4            ; A0+B0          a0+b0
122    paddd           mm4, mm4            ; 2A0            2a0
123    psubd           mm4, mm7            ; A0-B0          a0-b0
124    paddd           mm5, mm2            ; B1             b1
125    psrad           mm7, %7
126    psrad           mm4, %7
127    movq            mm2, mm1            ; A1             a1
128    paddd           mm1, mm5            ; A1+B1          a1+b1
129    psubd           mm2, mm5            ; A1-B1          a1-b1
130    psrad           mm1, %7
131    psrad           mm2, %7
132    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
133    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
134    movq           [%5], mm7
    ; mm2 no longer holds the (1,3) input pair, so it is reloaded into mm1
    ; for B2 and B3.
135    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
136    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
137    movq      [24 + %5], mm2
138    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
139    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
140    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
141    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
142    movq            mm2, mm0            ; A2             a2
143    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
144    paddd           mm4, mm7            ; B2             b2
145    paddd           mm2, mm4            ; A2+B2          a2+b2
146    psubd           mm0, mm4            ; A2-B2          a2-b2
147    psrad           mm2, %7
148    psrad           mm0, %7
149    movq            mm4, mm6            ; A3             a3
150    paddd           mm3, mm1            ; B3             b3
151    paddd           mm6, mm3            ; A3+B3          a3+b3
152    psubd           mm4, mm3            ; A3-B3          a3-b3
153    psrad           mm6, %7
154    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
155    movq       [8 + %5], mm2
156    psrad           mm4, %7
157    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
158    movq      [16 + %5], mm4
159    jmp             %%2
    ; Short path: only words 0 and 2 of mm0 may be non-zero. Shifting each
    ; dword left by 16 moves them into the upper halves; after adding d40000
    ; (4 << 16 in the low dword only) the arithmetic shift by 13 leaves each
    ; value scaled by 8. The packed pair is replicated into all four output
    ; qwords. %7 is not used on this path.
160%%1:
161    pslld           mm0, 16
162    paddd           mm0, [d40000]
163    psrad           mm0, 13
164    packssdw        mm0, mm0
165    movq           [%5], mm0
166    movq       [8 + %5], mm0
167    movq      [16 + %5], mm0
168    movq      [24 + %5], mm0
169%%2:
170%endmacro
171
; Z_COND_IDCT in0, in1, in2, in3, dst, unused, shift, skip_label
;
; Loads four qwords from blockq + %1..%4. If all of them are zero it jumps
; to %8 without storing anything; otherwise it writes four result qwords to
; [%5], [8 + %5], [16 + %5] and [24 + %5].
;   %1-%4  byte offsets from blockq of the input qwords. Per the operand
;          comments they hold the coefficient pairs (0,4), (2,6), (1,3) and
;          (5,7) of two input vectors (upper-case / lower-case letters).
;   %5     destination address expression (used inside [] with +8/+16/+24)
;   %6     not referenced by the macro body
;   %7     arithmetic right-shift count applied to every 32-bit result
;   %8     label jumped to when all four input qwords are zero
; Output layout (high dword pair, low dword pair of each qword):
;   [%5]      A1+B1, A0+B0      [8 + %5]   A3+B3, A2+B2
;   [16 + %5] A2-B2, A3-B3      [24 + %5]  A0-B0, A1-B1
; The bias added to the C4 terms is the qword at [coeffs].
; Overwrites mm0-mm7, t0d and the flags.
172%macro Z_COND_IDCT 8
173    movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
174    movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
175    movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
176    movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
    ; Zero test over all 16 input words. packssdw folds the 64-bit OR into
    ; 32 bits; signed saturation never turns a non-zero dword into zero.
177    movq            mm4, mm0
178    por             mm4, mm1
179    por             mm4, mm2
180    por             mm4, mm3
181    packssdw        mm4, mm4
182    movd            t0d, mm4
183    or              t0d, t0d
184    jz               %8
    ; Even part A0..A3 from mm0/mm1, odd part B0..B3 from mm2/mm3.
185    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
186    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
187    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
188    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
189    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
190    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
191    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
192    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
193    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
194    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
195    paddd           mm4, [coeffs]
196    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
197    paddd           mm4, mm5            ; A0             a0
198    psubd           mm6, mm5            ; A3             a3
199    movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
200    pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
201    paddd           mm0, [coeffs]
202    paddd           mm1, mm0            ; A1             a1
    ; A2 = 2*x - A1, where x is the biased -C4R4+C4R0 term held in mm0.
203    paddd           mm0, mm0
204    psubd           mm0, mm1            ; A2             a2
205    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
206    paddd           mm7, mm5            ; B0             b0
207    movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
208    pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
209    paddd           mm7, mm4            ; A0+B0          a0+b0
210    paddd           mm4, mm4            ; 2A0            2a0
211    psubd           mm4, mm7            ; A0-B0          a0-b0
212    paddd           mm5, mm2            ; B1             b1
213    psrad           mm7, %7
214    psrad           mm4, %7
215    movq            mm2, mm1            ; A1             a1
216    paddd           mm1, mm5            ; A1+B1          a1+b1
217    psubd           mm2, mm5            ; A1-B1          a1-b1
218    psrad           mm1, %7
219    psrad           mm2, %7
220    packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
221    packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
222    movq           [%5], mm7
    ; mm2 no longer holds the (1,3) input pair, so it is reloaded into mm1
    ; for B2 and B3.
223    movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
224    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
225    movq      [24 + %5], mm2
226    pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
227    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
228    pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
229    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
230    movq            mm2, mm0            ; A2             a2
231    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
232    paddd           mm4, mm7            ; B2             b2
233    paddd           mm2, mm4            ; A2+B2          a2+b2
234    psubd           mm0, mm4            ; A2-B2          a2-b2
235    psrad           mm2, %7
236    psrad           mm0, %7
237    movq            mm4, mm6            ; A3             a3
238    paddd           mm3, mm1            ; B3             b3
239    paddd           mm6, mm3            ; A3+B3          a3+b3
240    psubd           mm4, mm3            ; A3-B3          a3-b3
241    psrad           mm6, %7
242    packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
243    movq       [8 + %5], mm2
244    psrad           mm4, %7
245    packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
246    movq      [16 + %5], mm4
247%endmacro
248
; IDCT1 src0, src1, src2, src3, dst, shift
;
; Variant that reads all four source operands.
;   %1-%4  movq source operands, used exactly as written (the caller
;          supplies any brackets). Per the operand comments they hold the
;          coefficient pairs (0,4), (2,6), (1,3) and (5,7) of two input
;          vectors (upper-case / lower-case letters).
;   %5     destination address expression (used inside [] with an offset)
;   %6     arithmetic right-shift count applied to every 32-bit result
; Each result pair is packed to two 16-bit words with signed saturation and
; stored with movd at these byte offsets from %5:
;     0: A0+B0    16: A1+B1    32: A2+B2    48: A3+B3
;    64: A3-B3    80: A2-B2    96: A1-B1   112: A0-B0
; No bias is added before the shift. Overwrites mm0-mm7.
249%macro IDCT1 6
250    movq            mm0, %1             ; R4     R0      r4      r0
251    movq            mm1, %2             ; R6     R2      r6      r2
252    movq            mm2, %3             ; R3     R1      r3      r1
253    movq            mm3, %4             ; R7     R5      r7      r5
    ; Even part: A0..A3 end up in mm4, mm0, mm5, mm6.
254    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
255    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
256    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
257    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
258    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
259    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
260    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
261    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
262    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
263    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
264    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
265    paddd           mm4, mm5            ; A0             a0
266    psubd           mm6, mm5            ; A3             a3
267    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
268    paddd           mm0, mm1            ; A1             a1
269    psubd           mm5, mm1            ; A2             a2
    ; Odd part B0/B1, combined with A0/A1.
270    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
271    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
272    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
273    paddd           mm7, mm1            ; B0             b0
274    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
275    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
276    paddd           mm7, mm4            ; A0+B0          a0+b0
277    paddd           mm4, mm4            ; 2A0            2a0
278    psubd           mm4, mm7            ; A0-B0          a0-b0
279    paddd           mm1, mm2            ; B1             b1
280    psrad           mm7, %6
281    psrad           mm4, %6
282    movq            mm2, mm0            ; A1             a1
283    paddd           mm0, mm1            ; A1+B1          a1+b1
284    psubd           mm2, mm1            ; A1-B1          a1-b1
285    psrad           mm0, %6
286    psrad           mm2, %6
287    packssdw        mm7, mm7            ; A0+B0  a0+b0
288    movd           [%5], mm7
289    packssdw        mm0, mm0            ; A1+B1  a1+b1
290    movd      [16 + %5], mm0
291    packssdw        mm2, mm2            ; A1-B1  a1-b1
292    movd      [96 + %5], mm2
293    packssdw        mm4, mm4            ; A0-B0  a0-b0
294    movd     [112 + %5], mm4
    ; mm2 was overwritten above, so the (1,3) pair is reloaded for B2/B3.
295    movq            mm0, %3             ; R3     R1      r3      r1
296    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
297    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
298    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
299    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
300    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
301    movq            mm2, mm5            ; A2             a2
302    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
303    paddd           mm4, mm7            ; B2             b2
304    paddd           mm2, mm4            ; A2+B2          a2+b2
305    psubd           mm5, mm4            ; A2-B2          a2-b2
306    psrad           mm2, %6
307    psrad           mm5, %6
308    movq            mm4, mm6            ; A3             a3
309    paddd           mm3, mm0            ; B3             b3
310    paddd           mm6, mm3            ; A3+B3          a3+b3
311    psubd           mm4, mm3            ; A3-B3          a3-b3
312    psrad           mm6, %6
313    psrad           mm4, %6
314    packssdw        mm2, mm2            ; A2+B2  a2+b2
315    packssdw        mm6, mm6            ; A3+B3  a3+b3
316    movd      [32 + %5], mm2
317    packssdw        mm4, mm4            ; A3-B3  a3-b3
318    packssdw        mm5, mm5            ; A2-B2  a2-b2
319    movd      [48 + %5], mm6
320    movd      [64 + %5], mm4
321    movd      [80 + %5], mm5
322%endmacro
323
; IDCT2 src0, src1, unused, src3, dst, shift
;
; Variant that never reads %3 (the (1,3) coefficient pair): the odd part is
; built from %4 alone, so
;   B0 = C7R7+C5R5   B1 = -C5R7-C1R5   B2 = C3R7+C7R5   B3 = -C1R7+C3R5
; NOTE(review): presumably meant for inputs whose (1,3) pair is known to be
; zero -- confirm at the call site.
;   %1,%2,%4  movq source operands, used exactly as written; per the operand
;             comments the coefficient pairs (0,4), (2,6) and (5,7)
;   %5        destination address expression (used inside [] with an offset)
;   %6        arithmetic right-shift count applied to every 32-bit result
; Each result pair is packed to two 16-bit words with signed saturation and
; stored with movd at these byte offsets from %5:
;     0: A0+B0    16: A1+B1    32: A2+B2    48: A3+B3
;    64: A3-B3    80: A2-B2    96: A1-B1   112: A0-B0
; No bias is added before the shift. Overwrites mm0-mm7.
324%macro IDCT2 6
325    movq            mm0, %1             ; R4     R0      r4      r0
326    movq            mm1, %2             ; R6     R2      r6      r2
327    movq            mm3, %4             ; R7     R5      r7      r5
    ; Even part: A0..A3 end up in mm4, mm0, mm5, mm6.
328    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
329    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
330    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
331    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
332    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
333    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
334    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
335    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
336    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
337    paddd           mm4, mm5            ; A0             a0
338    psubd           mm6, mm5            ; A3             a3
339    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
340    paddd           mm0, mm1            ; A1             a1
341    psubd           mm5, mm1            ; A2             a2
    ; Odd part: mm1 = B0 and mm7 = B1 come straight from the (5,7) pair.
342    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
343    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
344    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
345    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
346    paddd           mm1, mm4            ; A0+B0          a0+b0
347    paddd           mm4, mm4            ; 2A0            2a0
348    psubd           mm4, mm1            ; A0-B0          a0-b0
349    psrad           mm1, %6
350    psrad           mm4, %6
351    movq            mm2, mm0            ; A1             a1
352    paddd           mm0, mm7            ; A1+B1          a1+b1
353    psubd           mm2, mm7            ; A1-B1          a1-b1
354    psrad           mm0, %6
355    psrad           mm2, %6
356    packssdw        mm1, mm1            ; A0+B0  a0+b0
357    movd           [%5], mm1
358    packssdw        mm0, mm0            ; A1+B1  a1+b1
359    movd      [16 + %5], mm0
360    packssdw        mm2, mm2            ; A1-B1  a1-b1
361    movd      [96 + %5], mm2
362    packssdw        mm4, mm4            ; A0-B0  a0-b0
363    movd     [112 + %5], mm4
    ; mm1 = B2 and mm3 = B3, again from the (5,7) pair only.
364    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
365    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
366    movq            mm2, mm5            ; A2             a2
367    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
368    paddd           mm2, mm1            ; A2+B2          a2+b2
369    psubd           mm5, mm1            ; A2-B2          a2-b2
370    psrad           mm2, %6
371    psrad           mm5, %6
372    movq            mm1, mm6            ; A3             a3
373    paddd           mm6, mm3            ; A3+B3          a3+b3
374    psubd           mm1, mm3            ; A3-B3          a3-b3
375    psrad           mm6, %6
376    psrad           mm1, %6
377    packssdw        mm2, mm2            ; A2+B2  a2+b2
378    packssdw        mm6, mm6            ; A3+B3  a3+b3
379    movd      [32 + %5], mm2
380    packssdw        mm1, mm1            ; A3-B3  a3-b3
381    packssdw        mm5, mm5            ; A2-B2  a2-b2
382    movd      [48 + %5], mm6
383    movd      [64 + %5], mm1
384    movd      [80 + %5], mm5
385%endmacro
386
; IDCT3 src0, unused, unused, src3, dst, shift
;
; Variant that never reads %2 or %3 (the (2,6) and (1,3) coefficient pairs):
;   A0 = A3 = C4R4+C4R0      A1 = A2 = -C4R4+C4R0
;   B0 = C7R7+C5R5   B1 = -C5R7-C1R5   B2 = C3R7+C7R5   B3 = -C1R7+C3R5
; NOTE(review): presumably meant for inputs whose (2,6) and (1,3) pairs are
; known to be zero -- confirm at the call site.
;   %1,%4  movq source operands, used exactly as written; per the operand
;          comments the coefficient pairs (0,4) and (5,7)
;   %5     destination address expression (used inside [] with an offset)
;   %6     arithmetic right-shift count applied to every 32-bit result
; Each result pair is packed to two 16-bit words with signed saturation and
; stored with movd at these byte offsets from %5:
;     0: A0+B0    16: A1+B1    32: A2+B2    48: A3+B3
;    64: A3-B3    80: A2-B2    96: A1-B1   112: A0-B0
; No bias is added before the shift. Overwrites mm0-mm7.
387%macro IDCT3 6
388    movq            mm0, %1             ; R4     R0      r4      r0
389    movq            mm3, %4             ; R7     R5      r7      r5
390    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
391    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
392    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
393    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    ; With no (2,6) terms the copies in mm6 and mm5 serve as A3 and A2.
394    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
395    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
    ; Odd part: mm1 = B0 and mm7 = B1 come straight from the (5,7) pair.
396    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
397    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
398    movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
399    pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
400    paddd           mm1, mm4            ; A0+B0          a0+b0
401    paddd           mm4, mm4            ; 2A0            2a0
402    psubd           mm4, mm1            ; A0-B0          a0-b0
403    psrad           mm1, %6
404    psrad           mm4, %6
405    movq            mm2, mm0            ; A1             a1
406    paddd           mm0, mm7            ; A1+B1          a1+b1
407    psubd           mm2, mm7            ; A1-B1          a1-b1
408    psrad           mm0, %6
409    psrad           mm2, %6
410    packssdw        mm1, mm1            ; A0+B0  a0+b0
411    movd           [%5], mm1
412    packssdw        mm0, mm0            ; A1+B1  a1+b1
413    movd      [16 + %5], mm0
414    packssdw        mm2, mm2            ; A1-B1  a1-b1
415    movd      [96 + %5], mm2
416    packssdw        mm4, mm4            ; A0-B0  a0-b0
417    movd     [112 + %5], mm4
    ; mm1 = B2 and mm3 = B3, again from the (5,7) pair only.
418    movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
419    pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
420    movq            mm2, mm5            ; A2             a2
421    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
422    paddd           mm2, mm1            ; A2+B2          a2+b2
423    psubd           mm5, mm1            ; A2-B2          a2-b2
424    psrad           mm2, %6
425    psrad           mm5, %6
426    movq            mm1, mm6            ; A3             a3
427    paddd           mm6, mm3            ; A3+B3          a3+b3
428    psubd           mm1, mm3            ; A3-B3          a3-b3
429    psrad           mm6, %6
430    psrad           mm1, %6
431    packssdw        mm2, mm2            ; A2+B2  a2+b2
432    packssdw        mm6, mm6            ; A3+B3  a3+b3
433    movd      [32 + %5], mm2
434    packssdw        mm1, mm1            ; A3-B3  a3-b3
435    packssdw        mm5, mm5            ; A2-B2  a2-b2
436    movd      [48 + %5], mm6
437    movd      [64 + %5], mm1
438    movd      [80 + %5], mm5
439%endmacro
440
; IDCT4 src0, unused, src2, src3, dst, shift
;
; Variant that never reads %2 (the (2,6) coefficient pair):
;   A0 = A3 = C4R4+C4R0      A1 = A2 = -C4R4+C4R0
; The odd part B0..B3 is built from both %3 and %4.
; NOTE(review): presumably meant for inputs whose (2,6) pair is known to be
; zero -- confirm at the call site.
;   %1,%3,%4  movq source operands, used exactly as written; per the operand
;             comments the coefficient pairs (0,4), (1,3) and (5,7)
;   %5        destination address expression (used inside [] with an offset)
;   %6        arithmetic right-shift count applied to every 32-bit result
; Each result pair is packed to two 16-bit words with signed saturation and
; stored with movd at these byte offsets from %5:
;     0: A0+B0    16: A1+B1    32: A2+B2    48: A3+B3
;    64: A3-B3    80: A2-B2    96: A1-B1   112: A0-B0
; No bias is added before the shift. Overwrites mm0-mm7.
441%macro IDCT4 6
442    movq            mm0, %1             ; R4     R0      r4      r0
443    movq            mm2, %3             ; R3     R1      r3      r1
444    movq            mm3, %4             ; R7     R5      r7      r5
445    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
446    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
447    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
448    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    ; With no (2,6) terms the copies in mm6 and mm5 serve as A3 and A2.
449    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
450    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
451    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
452    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
453    movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
454    pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
455    pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
456    paddd           mm7, mm1            ; B0             b0
457    movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
458    pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
459    paddd           mm7, mm4            ; A0+B0          a0+b0
460    paddd           mm4, mm4            ; 2A0            2a0
461    psubd           mm4, mm7            ; A0-B0          a0-b0
462    paddd           mm1, mm2            ; B1             b1
463    psrad           mm7, %6
464    psrad           mm4, %6
465    movq            mm2, mm0            ; A1             a1
466    paddd           mm0, mm1            ; A1+B1          a1+b1
467    psubd           mm2, mm1            ; A1-B1          a1-b1
468    psrad           mm0, %6
469    psrad           mm2, %6
470    packssdw        mm7, mm7            ; A0+B0  a0+b0
471    movd           [%5], mm7
472    packssdw        mm0, mm0            ; A1+B1  a1+b1
473    movd      [16 + %5], mm0
474    packssdw        mm2, mm2            ; A1-B1  a1-b1
475    movd      [96 + %5], mm2
476    packssdw        mm4, mm4            ; A0-B0  a0-b0
477    movd     [112 + %5], mm4
    ; mm2 was overwritten above, so the (1,3) pair is reloaded for B2/B3.
478    movq            mm0, %3             ; R3     R1      r3      r1
479    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
480    pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
481    movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
482    pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
483    pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
484    movq            mm2, mm5            ; A2             a2
485    pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
486    paddd           mm4, mm7            ; B2             b2
487    paddd           mm2, mm4            ; A2+B2          a2+b2
488    psubd           mm5, mm4            ; A2-B2          a2-b2
489    psrad           mm2, %6
490    psrad           mm5, %6
491    movq            mm4, mm6            ; A3             a3
492    paddd           mm3, mm0            ; B3             b3
493    paddd           mm6, mm3            ; A3+B3          a3+b3
494    psubd           mm4, mm3            ; A3-B3          a3-b3
495    psrad           mm6, %6
496    psrad           mm4, %6
497    packssdw        mm2, mm2            ; A2+B2  a2+b2
498    packssdw        mm6, mm6            ; A3+B3  a3+b3
499    movd      [32 + %5], mm2
500    packssdw        mm4, mm4            ; A3-B3  a3-b3
501    packssdw        mm5, mm5            ; A2-B2  a2-b2
502    movd      [48 + %5], mm6
503    movd      [64 + %5], mm4
504    movd      [80 + %5], mm5
505%endmacro
506
; IDCT5 src0, unused, src2, unused, dst, shift
;
; Variant that never reads %2 or %4 (the (2,6) and (5,7) coefficient pairs):
;   A0 = A3 = C4R4+C4R0      A1 = A2 = -C4R4+C4R0
;   B0 = C3R3+C1R1   B1 = -C7R3+C3R1   B2 = -C1R3+C5R1   B3 = -C5R3+C7R1
; NOTE(review): presumably meant for inputs whose (2,6) and (5,7) pairs are
; known to be zero -- confirm at the call site.
;   %1,%3  movq source operands, used exactly as written; per the operand
;          comments the coefficient pairs (0,4) and (1,3)
;   %5     destination address expression (used inside [] with an offset)
;   %6     arithmetic right-shift count applied to every 32-bit result
; Each result pair is packed to two 16-bit words with signed saturation and
; stored with movd at these byte offsets from %5:
;     0: A0+B0    16: A1+B1    32: A2+B2    48: A3+B3
;    64: A3-B3    80: A2-B2    96: A1-B1   112: A0-B0
; No bias is added before the shift. Overwrites mm0-mm7.
507%macro IDCT5 6
508    movq            mm0, %1             ; R4     R0      r4      r0
509    movq            mm2, %3             ; R3     R1      r3      r1
510    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
511    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
512    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
513    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
    ; With no (2,6) terms the copies in mm6 and mm5 serve as A3 and A2.
514    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
    ; mm7 = B0 and mm3 = B1 come straight from the (1,3) pair.
515    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
516    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
517    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
518    movq            mm3, [coeffs + 64]
519    pmaddwd         mm3, mm2            ; -C7R3+C3R1     -C7r3+C3r1
520    paddd           mm7, mm4            ; A0+B0          a0+b0
521    paddd           mm4, mm4            ; 2A0            2a0
522    psubd           mm4, mm7            ; A0-B0          a0-b0
523    psrad           mm7, %6
524    psrad           mm4, %6
525    movq            mm1, mm0            ; A1             a1
526    paddd           mm0, mm3            ; A1+B1          a1+b1
527    psubd           mm1, mm3            ; A1-B1          a1-b1
528    psrad           mm0, %6
529    psrad           mm1, %6
530    packssdw        mm7, mm7            ; A0+B0  a0+b0
531    movd           [%5], mm7
532    packssdw        mm0, mm0            ; A1+B1  a1+b1
533    movd      [16 + %5], mm0
534    packssdw        mm1, mm1            ; A1-B1  a1-b1
535    movd      [96 + %5], mm1
536    packssdw        mm4, mm4            ; A0-B0  a0-b0
537    movd     [112 + %5], mm4
    ; mm4 = B2 and mm2 = B3, again from the (1,3) pair only.
538    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
539    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
540    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
541    movq            mm1, mm5            ; A2             a2
542    paddd           mm1, mm4            ; A2+B2          a2+b2
543    psubd           mm5, mm4            ; A2-B2          a2-b2
544    psrad           mm1, %6
545    psrad           mm5, %6
546    movq            mm4, mm6            ; A3             a3
547    paddd           mm6, mm2            ; A3+B3          a3+b3
548    psubd           mm4, mm2            ; A3-B3          a3-b3
549    psrad           mm6, %6
550    psrad           mm4, %6
551    packssdw        mm1, mm1            ; A2+B2  a2+b2
552    packssdw        mm6, mm6            ; A3+B3  a3+b3
553    movd      [32 + %5], mm1
554    packssdw        mm4, mm4            ; A3-B3  a3-b3
555    packssdw        mm5, mm5            ; A2-B2  a2-b2
556    movd      [48 + %5], mm6
557    movd      [64 + %5], mm4
558    movd      [80 + %5], mm5
559%endmacro
560
; IDCT6 src0_addr, src1_addr, unused, unused, dst, shift
;
; Even-part-only variant: %3 and %4 are never read and no B terms exist, so
; the outputs are just A0..A3. Unlike the other IDCTn macros, %1 and %2 are
; address expressions (used inside []), and two qwords are read from each
; ([%1], [8 + %1], [%2], [8 + %2]), so every store is a full qword of four
; 16-bit results.
; NOTE(review): presumably meant for inputs whose (1,3) and (5,7) pairs are
; known to be zero -- confirm at the call site.
;   %1     address expression of the (0,4) coefficient pairs
;   %2     address expression of the (2,6) coefficient pairs
;   %5     destination address expression (used inside [] with an offset)
;   %6     arithmetic right-shift count applied to every 32-bit result
; Byte offsets written, relative to %5 (results saturated to signed 16 bit):
;     0 and 112: A0     16 and 96: A1     32 and 80: A2     48 and 64: A3
; No bias is added before the shift. Overwrites mm0-mm7.
561%macro IDCT6 6
562    movq            mm0, [%1]           ; R4     R0      r4      r0
563    movq            mm1, [%2]           ; R6     R2      r6      r2
    ; First qword pair: A0..A3 end up in mm4, mm0, mm5, mm6.
564    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
565    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
566    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
567    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
568    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
569    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
570    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
571    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
572    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
573    paddd           mm4, mm5            ; A0             a0
574    psubd           mm6, mm5            ; A3             a3
575    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
576    paddd           mm0, mm1            ; A1             a1
577    psubd           mm5, mm1            ; A2             a2
    ; Second qword pair ([8 + %1], [8 + %2]): A0..A3 end up in mm7, mm3,
    ; mm2, mm1.
578    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
579    movq            mm3, [8 + %2]       ; R6     R2      r6      r2
580    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
581    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
582    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
583    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
584    movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
585    pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
586    pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
587    paddd           mm7, mm1            ; A0             a0
588    paddd           mm1, mm1            ; 2C0            2c0
589    psubd           mm1, mm7            ; A3             a3
590    paddd           mm3, mm2            ; A1             a1
591    paddd           mm2, mm2            ; 2C1            2c1
592    psubd           mm2, mm3            ; A2             a2
    ; Shift, then pack the first-pair result (low half) with the second-pair
    ; result (high half) and store each packed qword at two offsets.
593    psrad           mm4, %6
594    psrad           mm7, %6
595    psrad           mm3, %6
596    packssdw        mm4, mm7            ; A0     a0
597    movq           [%5], mm4
598    psrad           mm0, %6
599    packssdw        mm0, mm3            ; A1     a1
600    movq      [16 + %5], mm0
601    movq      [96 + %5], mm0
602    movq     [112 + %5], mm4
603    psrad           mm5, %6
604    psrad           mm6, %6
605    psrad           mm2, %6
606    packssdw        mm5, mm2            ; A2     a2
607    movq      [32 + %5], mm5
608    psrad           mm1, %6
609    packssdw        mm6, mm1            ; A3     a3
610    movq      [48 + %5], mm6
611    movq      [64 + %5], mm6
612    movq      [80 + %5], mm5
613%endmacro
614
; IDCT7 src04, src26, src13, unused, dst, shift
;
; Second-pass butterfly producing eight output rows for two adjacent
; 16-bit columns.  In the inline comments an upper-case name (R0, A0, ...)
; refers to the high dword lane and a lower-case name (r0, a0, ...) to the
; low dword lane of the same MMX register.
;
;   %1  qword source operand, words laid out as  R4 R0 r4 r0
;   %2  qword source operand, words laid out as  R6 R2 r6 r2
;   %3  qword source operand, words laid out as  R3 R1 r3 r1
;   %4  never read by this macro (presumably the R7/R5 input, which this
;       variant treats as absent - TODO confirm against Z_COND_IDCT)
;   %5  destination address expression; 4 bytes are stored at %5 + 16*k
;       for k = 0..7
;   %6  arithmetic right shift applied to every 32-bit sum before packing
;
; Result layout (A = even part from %1/%2, B = odd part from %3):
;   [%5 +   0] = A0+B0    [%5 + 112] = A0-B0
;   [%5 +  16] = A1+B1    [%5 +  96] = A1-B1
;   [%5 +  32] = A2+B2    [%5 +  80] = A2-B2
;   [%5 +  48] = A3+B3    [%5 +  64] = A3-B3
;
; Every value is packed with signed saturation (packssdw) before the store.
; Clobbers mm0-mm7.
615%macro IDCT7 6
616    movq            mm0, %1             ; R4     R0      r4      r0
617    movq            mm1, %2             ; R6     R2      r6      r2
618    movq            mm2, %3             ; R3     R1      r3      r1
619    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
620    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
621    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
622    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
623    movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
624    pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
625    movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
626    pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
627    movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
628    movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
629    pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1  (= B0 b0)
630    paddd           mm4, mm5            ; A0             a0
631    psubd           mm6, mm5            ; A3             a3
632    movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
633    paddd           mm0, mm1            ; A1             a1
634    psubd           mm5, mm1            ; A2             a2
635    movq            mm1, [coeffs + 64]  ; presumably -C7 C3 -C7 C3, judging by the product below
636    pmaddwd         mm1, mm2            ; -C7R3+C3R1     -C7r3+C3r1  (= B1 b1)
637    paddd           mm7, mm4            ; A0+B0          a0+b0
638    paddd           mm4, mm4            ; 2A0            2a0
639    psubd           mm4, mm7            ; A0-B0          a0-b0  (2A0 - (A0+B0))
640    psrad           mm7, %6             ; scale A0+B0
641    psrad           mm4, %6             ; scale A0-B0
642    movq            mm3, mm0            ; A1             a1
643    paddd           mm0, mm1            ; A1+B1          a1+b1
644    psubd           mm3, mm1            ; A1-B1          a1-b1
645    psrad           mm0, %6             ; scale A1+B1
646    psrad           mm3, %6             ; scale A1-B1
647    packssdw        mm7, mm7            ; A0+B0  a0+b0
648    movd           [%5], mm7
649    packssdw        mm0, mm0            ; A1+B1  a1+b1
650    movd      [16 + %5], mm0
651    packssdw        mm3, mm3            ; A1-B1  a1-b1
652    movd      [96 + %5], mm3
653    packssdw        mm4, mm4            ; A0-B0  a0-b0
654    movd     [112 + %5], mm4
655    movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
656    pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1  (= B2 b2)
657    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1  (= B3 b3)
658    movq            mm3, mm5            ; A2             a2
659    paddd           mm3, mm4            ; A2+B2          a2+b2
660    psubd           mm5, mm4            ; A2-B2          a2-b2
661    psrad           mm3, %6             ; scale A2+B2
662    psrad           mm5, %6             ; scale A2-B2
663    movq            mm4, mm6            ; A3             a3
664    paddd           mm6, mm2            ; A3+B3          a3+b3
665    psubd           mm4, mm2            ; A3-B3          a3-b3
666    psrad           mm6, %6             ; scale A3+B3
667    packssdw        mm3, mm3            ; A2+B2  a2+b2
668    movd      [32 + %5], mm3
669    psrad           mm4, %6             ; scale A3-B3
670    packssdw        mm6, mm6            ; A3+B3  a3+b3
671    movd      [48 + %5], mm6
672    packssdw        mm4, mm4            ; A3-B3  a3-b3
673    packssdw        mm5, mm5            ; A2-B2  a2-b2
674    movd      [64 + %5], mm4
675    movd      [80 + %5], mm5
676%endmacro
677
; IDCT8 src, unused, unused, unused, dst, shift
;
; Second-pass variant that reads only the first input: the 16 bytes at
; [%1] and [8 + %1], i.e. R4/R0 word pairs for four adjacent columns.
; For every column it forms
;     S = (C4*R4 + C4*R0) >> %6      and      D = (-C4*R4 + C4*R0) >> %6
; packs both with signed saturation and replicates them over all eight
; output rows:
;     S -> [%5 + 0], [%5 + 48], [%5 + 64], [%5 + 112]
;     D -> [%5 + 16], [%5 + 32], [%5 + 80], [%5 + 96]
;
;   %1      source address expression (unbracketed); 16 bytes are read
;   %2-%4   never read by this macro
;   %5      destination address expression (unbracketed); 8 bytes are
;           stored at %5 + 16*k for k = 0..7
;   %6      arithmetic right shift applied before packing
;
; Clobbers mm0, mm1, mm2, mm4, mm5 and mm7.
678%macro IDCT8 6
679    movq            mm0, [%1]           ; R4     R0      r4      r0
680    movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
681    pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
682    movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
683    pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
684    psrad           mm4, %6
685    psrad           mm0, %6
686    movq            mm2, [8 + %1]       ; R4     R0      r4      r0  (next two columns)
687    movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
688    pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
689    movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
690    pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
692    psrad           mm1, %6
693    packssdw        mm4, mm1            ; A0     a0   (S for all four columns)
694    movq           [%5], mm4
695    psrad           mm2, %6
696    packssdw        mm0, mm2            ; A1     a1   (D for all four columns)
697    movq      [16 + %5], mm0
698    movq      [96 + %5], mm0
699    movq     [112 + %5], mm4
700    movq      [32 + %5], mm0
701    movq      [48 + %5], mm4
702    movq      [64 + %5], mm4
703    movq      [80 + %5], mm0
704%endmacro
705
; IDCT
;
; Complete two-pass transform of the 8x8 coefficient block addressed by
; blockq, using the 128 bytes at [rsp .. rsp + 127] as intermediate storage.
;
; Pass 1: DC_COND_IDCT / Z_COND_IDCT process four input groups (arguments
;         0..24, 32..56, 64..88 and 96..120) and write to rsp + 0, + 32,
;         + 64 and + 96 with shift 11 (the value of ROW_SHIFT defined at the
;         top of the file).
; Pass 2: one of IDCT1..IDCT8 reads the intermediate data and stores the
;         result at blockq + offset with shift 20 (the value of COL_SHIFT).
;
; Both helper macros are defined earlier in the file.  The branch map below
; assumes Z_COND_IDCT jumps to the label given as its last argument when its
; input group is entirely zero - TODO confirm against its definition.
;
;   group 32..56   group 64..88   group 96..120    pass-2 macro   label
;   non-zero       non-zero       non-zero         IDCT1          (fall through)
;   zero           non-zero       non-zero         IDCT2          %%4
;   zero           zero           non-zero         IDCT3          %%6
;   non-zero       zero           non-zero         IDCT4          %%2
;   non-zero       zero           zero             IDCT5          %%3
;   zero           non-zero       zero             IDCT6          %%5
;   non-zero       non-zero       zero             IDCT7          %%1
;   zero           zero           zero             IDCT8          %%7
;
; IDCT1-5 and IDCT7 are expanded four times with memory operands and
; destination steps of 4 bytes; IDCT6 and IDCT8 are expanded twice with
; unbracketed address expressions and destination steps of 8 bytes.
; Every path ends at %%9.
706%macro IDCT 0
    ; Pass 1: four input groups -> intermediate buffer on the stack.
707    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, 11
708    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, 11, %%4
709    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, 11, %%2
710    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%1
711
    ; No branch taken above: general pass 2.
712    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
713    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
714    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
715    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
716    jmp %%9
717
718    ALIGN 16
    ; Reached from the Z_COND_IDCT on group 32..56; groups 64..88 and
    ; 96..120 still have to be processed.
719    %%4:
720    Z_COND_IDCT 64,  72,  80,  88, rsp + 64, null, 11, %%6
721    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%5
722
723    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
724    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
725    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
726    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
727    jmp %%9
728
729    ALIGN 16
    ; Reached from %%4 via the Z_COND_IDCT on group 64..88; only group
    ; 96..120 is left to process.
730    %%6:
731    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%7
732
733    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
734    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
735    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
736    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
737    jmp %%9
738
739    ALIGN 16
    ; Reached from the top-level Z_COND_IDCT on group 64..88; group 96..120
    ; is still to be processed.
740    %%2:
741    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%3
742
743    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
744    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
745    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
746    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
747    jmp %%9
748
749    ALIGN 16
    ; Reached from %%2 via the Z_COND_IDCT on group 96..120.
750    %%3:
751
752    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
753    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
754    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
755    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
756    jmp %%9
757
758    ALIGN 16
    ; Reached from %%4 via the Z_COND_IDCT on group 96..120.  IDCT6 takes
    ; unbracketed addresses and is expanded only twice.
759    %%5:
760
761    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
762    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
763    jmp %%9
764
765    ALIGN 16
    ; Reached from the top-level Z_COND_IDCT on group 96..120.  IDCT7 never
    ; reads its fourth argument.
766    %%1:
767
768    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
769    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
770    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
771    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
772    jmp %%9
773
774    ALIGN 16
    ; Reached from %%6 via the Z_COND_IDCT on group 96..120.  IDCT8 reads
    ; only its first argument, stores 8 bytes per row and is expanded twice;
    ; it falls through to the common exit.
775    %%7:
776
777    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
778    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
779
    ; Common exit of every pass-2 variant.
780    %%9:
781%endmacro
782
; PUT_PIXELS_CLAMPED_HALF offset
;
; Converts 64 bytes of 16-bit coefficients starting at blockq + %1 (four
; rows of eight values) to unsigned bytes with saturation (packuswb) and
; stores them as four 8-pixel rows at
;     pixelsq, pixelsq + lsizeq, pixelsq + 2*lsizeq, pixelsq + lsize3q.
;
;   %1  byte offset into the block (the callers in this file pass 0 and 64)
;
; Preconditions: lsize3q must already hold 3 * lsizeq; the macro does not
; compute it.  The loads use mova, so blockq + %1 is presumably required to
; be aligned to mmsize - TODO confirm against x86inc.
;
; mmsize == 8 : one register per row, two 8-byte loads packed into each;
;               clobbers m0-m3.
; otherwise   : one register per pair of rows (even row in the low qword,
;               odd row in the high qword, stored with movq / movhps);
;               clobbers m0-m1.
783%macro PUT_PIXELS_CLAMPED_HALF 1
784    mova     m0, [blockq+mmsize*0+%1]
785    mova     m1, [blockq+mmsize*2+%1]
786%if mmsize == 8
787    mova     m2, [blockq+mmsize*4+%1]
788    mova     m3, [blockq+mmsize*6+%1]
789%endif
    ; Pack each register with the following mmsize bytes of coefficients.
790    packuswb m0, [blockq+mmsize*1+%1]
791    packuswb m1, [blockq+mmsize*3+%1]
792%if mmsize == 8
793    packuswb m2, [blockq+mmsize*5+%1]
794    packuswb m3, [blockq+mmsize*7+%1]
795    movq           [pixelsq], m0        ; row 0
796    movq    [lsizeq+pixelsq], m1        ; row 1
797    movq  [2*lsizeq+pixelsq], m2        ; row 2
798    movq   [lsize3q+pixelsq], m3        ; row 3
799%else
800    movq           [pixelsq], m0        ; row 0 = low  qword of m0
801    movhps  [lsizeq+pixelsq], m0        ; row 1 = high qword of m0
802    movq  [2*lsizeq+pixelsq], m1        ; row 2 = low  qword of m1
803    movhps [lsize3q+pixelsq], m1        ; row 3 = high qword of m1
804%endif
805%endmacro
806
; ADD_PIXELS_CLAMPED offset
;
; Adds two rows of 16-bit coefficients (32 bytes starting at blockq + %1)
; to the two 8-pixel rows at pixelsq and pixelsq + lsizeq and writes the
; sums back in place.  Pixels are widened to words by interleaving with m4,
; added with signed saturation (paddsw) and narrowed again with unsigned
; saturation (packuswb).
;
;   %1  byte offset into the block (the caller in this file passes
;       0, 32, 64 and 96)
;
; Precondition: m4 must be all zero; it is only read here, as the high
; bytes for punpcklbw / punpckhbw.
;
; mmsize == 8 : each row needs two registers (low and high four values);
;               clobbers m0-m3 and m5-m7.
; otherwise   : each row fits in one register; clobbers m0-m3.
807%macro ADD_PIXELS_CLAMPED 1
808    mova       m0, [blockq+mmsize*0+%1]
809    mova       m1, [blockq+mmsize*1+%1]
810%if mmsize == 8
811    mova       m5, [blockq+mmsize*2+%1]
812    mova       m6, [blockq+mmsize*3+%1]
813%endif
814    movq       m2, [pixelsq]            ; eight pixels of the first row
815    movq       m3, [pixelsq+lsizeq]     ; eight pixels of the second row
816%if mmsize == 8
817    mova       m7, m2
818    punpcklbw  m2, m4                   ; first row, pixels 0-3 as words
819    punpckhbw  m7, m4                   ; first row, pixels 4-7 as words
820    paddsw     m0, m2
821    paddsw     m1, m7
822    mova       m7, m3
823    punpcklbw  m3, m4                   ; second row, pixels 0-3 as words
824    punpckhbw  m7, m4                   ; second row, pixels 4-7 as words
825    paddsw     m5, m3
826    paddsw     m6, m7
827%else
828    punpcklbw  m2, m4                   ; first row as eight words
829    punpcklbw  m3, m4                   ; second row as eight words
830    paddsw     m0, m2
831    paddsw     m1, m3
832%endif
    ; Narrow back to bytes; in the non-MMX path m0 then carries both rows.
833    packuswb   m0, m1
834%if mmsize == 8
835    packuswb   m5, m6
836    movq       [pixelsq], m0
837    movq       [pixelsq+lsizeq], m5
838%else
839    movq       [pixelsq], m0            ; first row  = low  qword
840    movhps     [pixelsq+lsizeq], m0     ; second row = high qword
841%endif
842%endmacro
843
844INIT_MMX mmx
845
; simple_idct(block)
;
; In-place inverse DCT: the IDCT macro reads the coefficients addressed by
; blockq and stores its result back at blockq + offset.
;
; cglobal arguments: 1 argument (block), 2 general-purpose registers
; (block, t0), 8 vector registers, 128 bytes of stack, which IDCT uses as
; its intermediate buffer at [rsp .. rsp + 127].
; t0 is not referenced in this function body; presumably it is scratch for
; the pass-1 macros defined earlier in the file - TODO confirm.
; NOTE(review): no emms is issued before RET although IDCT uses mm0-mm7;
; presumably the caller is responsible for it - confirm.
846cglobal simple_idct, 1, 2, 8, 128, block, t0
847    IDCT
848RET
849
; From here on m0..m7 are selected by INIT_XMM; the IDCT macro itself names
; mm0-mm7 explicitly and is therefore unaffected by this switch.
850INIT_XMM sse2
851
; simple_idct_put(pixels, lsize, block)
;
; Runs the in-place IDCT on block, then writes the eight result rows as
; clamped unsigned bytes to pixels, lsize bytes apart, overwriting what was
; there.
;
; cglobal arguments: 3 arguments, 5 general-purpose registers (pixels,
; lsize, block, lsize3, t0), 8 vector registers, 128 bytes of stack for the
; IDCT intermediate buffer.
; NOTE(review): no emms is issued after the MMX code in IDCT; presumably
; left to the caller - confirm.
852cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
853    IDCT
854    lea lsize3q, [lsizeq*3]             ; required by PUT_PIXELS_CLAMPED_HALF
855    PUT_PIXELS_CLAMPED_HALF 0           ; rows 0-3 (block bytes 0..63)
856    lea pixelsq, [pixelsq+lsizeq*4]     ; advance four rows
857    PUT_PIXELS_CLAMPED_HALF 64          ; rows 4-7 (block bytes 64..127)
858RET
859
; simple_idct_add(pixels, lsize, block)
;
; Runs the in-place IDCT on block, then adds the eight result rows to the
; existing pixels (rows lsize bytes apart) and clamps each sum to an
; unsigned byte.
;
; cglobal arguments: 3 arguments, 4 general-purpose registers (pixels,
; lsize, block, t0), 8 vector registers, 128 bytes of stack for the IDCT
; intermediate buffer.
; NOTE(review): no emms is issued after the MMX code in IDCT; presumably
; left to the caller - confirm.
860cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
861    IDCT
862    pxor       m4, m4                   ; zero register required by ADD_PIXELS_CLAMPED
863    ADD_PIXELS_CLAMPED 0                ; rows 0-1
864    lea        pixelsq, [pixelsq+lsizeq*2]
865    ADD_PIXELS_CLAMPED 32               ; rows 2-3
866    lea        pixelsq, [pixelsq+lsizeq*2]
867    ADD_PIXELS_CLAMPED 64               ; rows 4-5
868    lea        pixelsq, [pixelsq+lsizeq*2]
869    ADD_PIXELS_CLAMPED 96               ; rows 6-7
870RET
871%endif
872