• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
;// file : mmx_zoom.s
;// author : JC Hoelt <jeko@free.fr>
;//
;// history
;//	07/01/2001 : changed FEMMS to EMMS: slower, but runs on Intel machines
;//	03/01/2001 : WIDTH and HEIGHT are now variable
;//	28/12/2000 : added comments to the code, removed some useless lines
;//	27/12/2000 : reduced memory accesses, improving performance by 20%;
;//		each coefficient is now stored in 1 byte
;//	22/12/2000 : changed the data structure
;//	16/12/2000 : AT&T version
;//	14/12/2000 : unrolled the loop
;//	12/12/2000 : 64-bit memory accesses
14
15
.data

;// 64-bit (8-byte) zero constant, aligned on an 8-byte boundary so that it
;// can be fetched with a single aligned quadword load.
.p2align 3
thezero:
	.long 0x00000000
	.long 0x00000000
21
22
.text

.globl mmx_zoom				;// entry point, called from the C side

;// Symbols defined elsewhere.  GNU as treats every undefined symbol as
;// external, so these .extern lines only document the dependencies.
.extern coeffs				;// the transformation buffer
.extern expix1, expix2			;// the source and destination buffers
.extern mmx_zoom_size, zoom_width	;// sizes of the buffers

.p2align 4				;// 16-byte alignment for what follows
;// ---------------------------------------------------------------------
;// mmx_zoom -- produce mmx_zoom_size output pixels, each one a weighted
;//             sum of four source pixels (MMX).
;//
;// Syntax / target : GNU as, AT&T syntax, 32-bit x86 with MMX.
;// Arguments       : nothing is read from the stack; every input comes
;//                   from the external globals listed here.
;//   coeffs        : pointer to an array of 8-byte entries, one per output
;//                   pixel: a 32-bit byte offset into the source buffer,
;//                   followed by four 8-bit weights c1..c4.
;//   expix1        : pointer to the source buffer.
;//   expix2        : pointer to the destination buffer; 4 bytes are
;//                   written per iteration.
;//   mmx_zoom_size : number of output pixels (loop count, 0 is allowed).
;//   zoom_width    : multiplied by 4 to get the byte distance between the
;//                   first and the second source pixel pair.
;// Result          : none; eax is left pointing just past the last
;//                   coefficient entry that was consumed.
;// Clobbers        : eax, ecx, edx, mm0-mm7, flags.  ebp, ebx and esi are
;//                   saved and restored.  emms is executed before ret.
;// NOTE(review)    : the weighted sums live in 16-bit lanes, so the four
;//                   weights presumably never add up to more than 256 --
;//                   confirm against the code that fills coeffs.
;// ---------------------------------------------------------------------
mmx_zoom:
	;// ebx and esi are callee-saved in the i386 C calling convention,
	;// exactly like ebp, so all three are preserved.
	pushl	%ebp
	pushl	%ebx
	pushl	%esi

	pxor	%mm7, %mm7		/* mm7 = 0, used to widen bytes to words */

	movl	zoom_width, %ebp
	shll	$2, %ebp		/* ebp = zoom_width * 4 (bytes to the 2nd pixel pair) */

	movl	coeffs, %eax		/* eax -> current coefficient entry */
	movl	expix1, %edx		/* edx  = source buffer base */
	movl	expix2, %ebx		/* ebx -> next destination pixel */
	movl	mmx_zoom_size, %ecx	/* ecx  = pixels still to produce */

	;// Nothing to do for an empty buffer.  Without this check the
	;// decrement at the bottom of the loop would wrap around and the
	;// loop would run 2^32 times.
	testl	%ecx, %ecx
	jz	.Lmmx_zoom_done

.Lmmx_zoom_pixel:
	;// esi = source base + byte offset stored in the coefficient entry
	movl	(%eax), %esi
	addl	%edx, %esi

	;// first pair: two adjacent 32-bit pixels, P0 in the low half and
	;// P1 in the high half of the quadword
	movq	(%esi), %mm0		/* P1:P0 */
	movq	%mm0, %mm1		/* P1:P0 */

	;// the four 8-bit weights (byte lanes are listed high to low)
	movd	4(%eax), %mm6		/* 00-00-00-00-c4-c3-c2-c1 */
	punpcklbw %mm7, %mm0		/* P0 channels widened to four words */
	movq	%mm6, %mm5		/* 00-00-00-00-c4-c3-c2-c1 */
	punpckhbw %mm7, %mm1		/* P1 channels widened to four words */

	;// replicate every weight over four byte lanes ...
	punpcklbw %mm5, %mm6		/* c4-c4-c3-c3-c2-c2-c1-c1 */
	movq	%mm6, %mm4		/* c4-c4-c3-c3-c2-c2-c1-c1 */
	movq	%mm6, %mm5		/* c4-c4-c3-c3-c2-c2-c1-c1 */
	punpcklbw %mm5, %mm6		/* c2-c2-c2-c2-c1-c1-c1-c1 */
	punpckhbw %mm5, %mm4		/* c4-c4-c4-c4-c3-c3-c3-c3 */

	;// ... then widen c1 and c2 to words
	movq	%mm6, %mm3		/* c2-c2-c2-c2-c1-c1-c1-c1 */
	punpcklbw %mm7, %mm6		/* 00-c1-00-c1-00-c1-00-c1 */
	punpckhbw %mm7, %mm3		/* 00-c2-00-c2-00-c2-00-c2 */

	;// weighted sum of the first pair
	pmullw	%mm6, %mm0		/* P0 * c1, per channel */
	pmullw	%mm3, %mm1		/* P1 * c2, per channel */
	paddw	%mm1, %mm0

	;// widen c3 and c4 to words
	movq	%mm4, %mm5		/* c4-c4-c4-c4-c3-c3-c3-c3 */
	punpcklbw %mm7, %mm4		/* 00-c3-00-c3-00-c3-00-c3 */
	punpckhbw %mm7, %mm5		/* 00-c4-00-c4-00-c4-00-c4 */

	;// second pair, zoom_width * 4 bytes after the first one:
	;// P2 in the low half, P3 in the high half
	movq	(%esi,%ebp), %mm1	/* P3:P2 */
	movq	%mm1, %mm2
	punpcklbw %mm7, %mm1		/* P2 channels widened to four words */
	punpckhbw %mm7, %mm2		/* P3 channels widened to four words */

	pmullw	%mm4, %mm1		/* P2 * c3, per channel */
	pmullw	%mm5, %mm2		/* P3 * c4, per channel */

	;// add both products to the running sum
	paddw	%mm1, %mm0
	paddw	%mm2, %mm0

	;// divide every channel by 256, then pack the four words back into
	;// four bytes with unsigned saturation
	psrlw	$8, %mm0
	packuswb %mm7, %mm0

	;// store the finished pixel
	movd	%mm0, (%ebx)

	;// step to the next coefficient entry and destination pixel
	addl	$8, %eax
	addl	$4, %ebx

	decl	%ecx
	jnz	.Lmmx_zoom_pixel

.Lmmx_zoom_done:
	emms				/* leave MMX state so x87 code can run afterwards */

	popl	%esi
	popl	%ebx
	popl	%ebp

	ret
131