;// file : mmx_zoom.s
;// author : JC Hoelt <jeko@free.fr>
;//
;// history
;// 07/01/2001 : Changing FEMMS to EMMS : slower... but run on intel machines
;// 03/01/2001 : WIDTH and HEIGHT are now variable
;// 28/12/2000 : adding comments to the code, suppress some useless lines
;// 27/12/2000 : reducing memory access... improving performance by 20%
;//              coefficients are now on 1 byte
;// 22/12/2000 : Changing data structure
;// 16/12/2000 : AT&T version
;// 14/12/2000 : unrolling loop
;// 12/12/2000 : 64 bits memory access
;//
;// syntax : GNU as, AT&T operand order (source, destination)
;// target : 32-bit x86 with MMX

.text

.globl mmx_zoom                     ;// name of the function to call by C program
.extern coeffs                      ;// pointer to the transformation buffer
.extern expix1,expix2               ;// pointers to the source and destination buffers
.extern mmx_zoom_size, zoom_width   ;// number of output pixels, row width in pixels

;// ---------------------------------------------------------------------------
;// mmx_zoom
;//
;// Takes no stack arguments (presumably declared as void mmx_zoom(void) on
;// the C side - confirm against the caller). All inputs are read from globals:
;//
;//   coeffs        : pointer to a table of 8-byte entries, one per output pixel
;//                     bytes 0..3 : byte offset into the source buffer
;//                     bytes 4..7 : four 8-bit weights c1, c2, c3, c4
;//   expix1        : pointer to the source buffer (4 bytes per pixel)
;//   expix2        : pointer to the destination buffer (4 bytes per pixel)
;//   zoom_width    : row width in pixels (multiplied by 4 to get the row stride)
;//   mmx_zoom_size : number of output pixels to produce
;//
;// For every table entry, with p = expix1 + offset and stride = zoom_width * 4,
;// each of the four byte channels of the output pixel is
;//
;//   ( c1 * p[0] + c2 * p[4] + c3 * p[stride] + c4 * p[stride + 4] ) >> 8
;//
;// The sums are done on 16-bit lanes, so they wrap if the weighted sum exceeds
;// 65535. NOTE(review): presumably the four weights of an entry add up to at
;// most 256 - confirm against the code that fills the table.
;// Eight bytes are read at p and at p + stride, so both must be readable.
;//
;// A zero mmx_zoom_size returns without touching any buffer.
;//
;// Preserved : ebx, esi, ebp (saved and restored here), edi (not used), esp
;// Clobbered : eax, ecx, edx, flags, mm0-mm7 (emms is executed before ret)
;//
;// Register roles inside the loop:
;//   eax = current table entry        ebx = current destination pixel
;//   ecx = pixels left to produce     edx = source buffer base
;//   ebp = row stride in bytes        esi = address of the top-left source pixel
;//   mm7 = constant zero, used to widen bytes to words
;// ---------------------------------------------------------------------------
        .p2align 4
mmx_zoom:
        pushl   %ebp                    ;// callee-saved registers used below
        pushl   %ebx
        pushl   %esi

        movl    mmx_zoom_size, %ecx
        testl   %ecx, %ecx
        jz      .Lzoom_done             ;// nothing to do: avoid a wrapped counter

        pxor    %mm7, %mm7              ;// mm7 = 0

        movl    zoom_width, %ebp
        shll    $2, %ebp                ;// ebp = row stride in bytes

        movl    coeffs, %eax
        movl    expix1, %edx
        movl    expix2, %ebx

.Lzoom_loop:
        ;// esi <- address of the top-left source pixel for this entry
        movl    (%eax), %esi
        leal    (%edx,%esi), %esi

        ;// fetch the four weights and broadcast each one over four word lanes
        movd    4(%eax), %mm6           /* 00-00-00-00-c4-c3-c2-c1 */
        punpcklbw %mm6, %mm6            /* c4-c4-c3-c3-c2-c2-c1-c1 */
        movq    %mm6, %mm4              /* c4-c4-c3-c3-c2-c2-c1-c1 */
        punpcklbw %mm6, %mm6            /* c2-c2-c2-c2-c1-c1-c1-c1 */
        punpckhbw %mm4, %mm4            /* c4-c4-c4-c4-c3-c3-c3-c3 */
        movq    %mm6, %mm3              /* c2-c2-c2-c2-c1-c1-c1-c1 */
        movq    %mm4, %mm5              /* c4-c4-c4-c4-c3-c3-c3-c3 */
        punpcklbw %mm7, %mm6            /* 00-c1-00-c1-00-c1-00-c1 */
        punpckhbw %mm7, %mm3            /* 00-c2-00-c2-00-c2-00-c2 */
        punpcklbw %mm7, %mm4            /* 00-c3-00-c3-00-c3-00-c3 */
        punpckhbw %mm7, %mm5            /* 00-c4-00-c4-00-c4-00-c4 */

        ;// top row: two adjacent pixels, widened to words and weighted
        movq    (%esi), %mm0
        movq    %mm0, %mm1
        punpcklbw %mm7, %mm0            ;// pixel at esi
        punpckhbw %mm7, %mm1            ;// pixel at esi + 4
        pmullw  %mm6, %mm0              ;// * c1
        pmullw  %mm3, %mm1              ;// * c2
        paddw   %mm1, %mm0

        ;// bottom row: same two columns, one stride further
        movq    (%esi,%ebp), %mm1
        movq    %mm1, %mm2
        punpcklbw %mm7, %mm1            ;// pixel at esi + stride
        punpckhbw %mm7, %mm2            ;// pixel at esi + stride + 4
        pmullw  %mm4, %mm1              ;// * c3
        pmullw  %mm5, %mm2              ;// * c4
        paddw   %mm1, %mm0
        paddw   %mm2, %mm0

        ;// divide by 256, pack the four words back to bytes and store the pixel
        psrlw   $8, %mm0
        packuswb %mm7, %mm0
        movd    %mm0, (%ebx)

        ;// next table entry, next destination pixel
        addl    $8, %eax
        addl    $4, %ebx
        decl    %ecx
        jnz     .Lzoom_loop

.Lzoom_done:
        emms                            ;// leave the FPU usable for the C caller

        popl    %esi
        popl    %ebx
        popl    %ebp
        ret                             ;//The End