;
; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT *workspace);
;
; Each pass of the loop reads 8 sample bytes (one MMWORD) from each of two
; consecutive row pointers, starting at column start_col, subtracts 0x80
; from every byte, widens the signed bytes to dwords, converts them to
; floats and writes them to workspace.  The loop runs DCTSIZE/2 times.
;
; Arguments are read from the stack through ebp.  ebx, esi and edi are
; saved and restored; eax, ecx and edx are used as scratch and need not be
; preserved.  The MMX/3DNow! state is emptied with femms before returning.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; FAST_FLOAT *workspace

; TOPBYTE_TO_FLOAT reg
;   reg holds two dwords whose most significant byte is a signed sample and
;   whose lower bytes are don't-care.  The arithmetic right shift moves the
;   sample down with sign extension; pi2fd then converts both dwords to
;   floats in place.
%macro TOPBYTE_TO_FLOAT 1
    psrad       %1, (DWORD_BIT-BYTE_BIT)
    pi2fd       %1, %1
%endmacro

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)

EXTN(jsimd_convsamp_float_3dnow):
    push        ebp
    mov         ebp, esp
    push        ebx
    push        esi
    push        edi

    ; Fetch the arguments and set up the loop counter.
    mov         esi, JSAMPARRAY [sample_data]   ; esi -> current row pointer
    mov         eax, JDIMENSION [start_col]     ; eax = column offset
    mov         edi, POINTER [workspace]        ; edi -> output floats
    mov         ecx, DCTSIZE/2                  ; two rows per pass

    ; Build 0x80 in every byte of mm7 without touching memory:
    ; all-ones words -> 0xFF80 words -> signed-saturated pack to 0x80 bytes.
    pcmpeqw     mm7, mm7
    psllw       mm7, 7
    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)

    alignx      16, 7
.next_row_pair:
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; first row of the pair
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; second row of the pair

    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

    psubb       mm0, mm7                ; mm0 = samples 0..7, now signed
    psubb       mm1, mm7                ; mm1 = samples 8..F, now signed

    ; ---- first row of the pair: samples 0..7 ----
    ; The unpacks place each sample in the HIGH byte of a word, then in the
    ; HIGH word of a dword.  The low parts come from whatever the
    ; destination register held ('*' below) and are discarded by the shift
    ; inside TOPBYTE_TO_FLOAT.
    punpcklbw   mm2, mm0                ; mm2 = (*0 *1 *2 *3)
    punpckhbw   mm0, mm0                ; mm0 = (*4 *5 *6 *7)
    punpcklwd   mm4, mm2                ; mm4 = (***0 ***1)
    punpckhwd   mm2, mm2                ; mm2 = (***2 ***3)
    punpcklwd   mm5, mm0                ; mm5 = (***4 ***5)
    punpckhwd   mm0, mm0                ; mm0 = (***6 ***7)

    TOPBYTE_TO_FLOAT mm4                ; mm4 = (0 1) as floats
    TOPBYTE_TO_FLOAT mm2                ; mm2 = (2 3) as floats
    TOPBYTE_TO_FLOAT mm5                ; mm5 = (4 5) as floats
    TOPBYTE_TO_FLOAT mm0                ; mm0 = (6 7) as floats

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0

    ; ---- second row of the pair: samples 8..F ----
    ; mm4 is free again here because its previous contents were stored above.
    punpcklbw   mm3, mm1                ; mm3 = (*8 *9 *A *B)
    punpckhbw   mm1, mm1                ; mm1 = (*C *D *E *F)
    punpcklwd   mm6, mm3                ; mm6 = (***8 ***9)
    punpckhwd   mm3, mm3                ; mm3 = (***A ***B)
    punpcklwd   mm4, mm1                ; mm4 = (***C ***D)
    punpckhwd   mm1, mm1                ; mm1 = (***E ***F)

    TOPBYTE_TO_FLOAT mm6                ; mm6 = (8 9) as floats
    TOPBYTE_TO_FLOAT mm3                ; mm3 = (A B) as floats
    TOPBYTE_TO_FLOAT mm4                ; mm4 = (C D) as floats
    TOPBYTE_TO_FLOAT mm1                ; mm1 = (E F) as floats

    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1

    ; Step to the next two row pointers and the next two output rows.
    add         esi, byte 2*SIZEOF_JSAMPROW
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    dec         ecx
    jnz         near .next_row_pair

    femms                               ; empty MMX/3DNow! state

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                            FAST_FLOAT *workspace);
;
; Each pass of the loop multiplies 16 floats from workspace by the
; corresponding 16 floats from divisors, rounds the products to integers
; and stores the low 16 bits of each result to coef_block.  The loop runs
; DCTSIZE2/16 times.
;
; Rounding uses the "magic number" trick: adding 12582912.0f (bit pattern
; 0x4B400000) to a product leaves the rounded integer in the low bits of the
; sum's bit pattern, so the low word of each float lane is the value to
; store.
;
; Arguments are read from the stack through ebp.  esi and edi are saved and
; restored; eax and edx are used as scratch and need not be preserved; ebx
; and ecx are not used.  The MMX/3DNow! state is emptied with femms before
; returning.
;

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; FAST_FLOAT *divisors
%define workspace   ebp + 16            ; FAST_FLOAT *workspace

; LOAD_SCALED reg, m, n
;   Load the two floats at MMBLOCK(m,n) of the workspace (esi) into reg and
;   multiply them by the two floats at the same position of the divisors
;   table (edx).
%macro LOAD_SCALED 3
    movq        %1, MMWORD [MMBLOCK(%2,%3,esi,SIZEOF_FAST_FLOAT)]
    pfmul       %1, MMWORD [MMBLOCK(%2,%3,edx,SIZEOF_FAST_FLOAT)]
%endmacro

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)

EXTN(jsimd_quantize_float_3dnow):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         esi, POINTER [workspace]        ; esi -> input floats
    mov         edx, POINTER [divisors]         ; edx -> multipliers
    mov         edi, JCOEFPTR [coef_block]      ; edi -> output coefficients

    ; Broadcast the rounding constant into both lanes of mm7.  eax is only
    ; a temporary here; it becomes the loop counter right afterwards.
    mov         eax, 0x4B400000         ; (float)0x00C00000 (rndint_magic)
    movd        mm7, eax
    punpckldq   mm7, mm7                ; mm7 = {12582912.0F 12582912.0F}

    mov         eax, DCTSIZE2/16                ; 16 coefficients per pass
    alignx      16, 7
.next_sixteen:
    ; ---- MMBLOCK(0,*): elements 00..07 ----
    LOAD_SCALED mm0, 0, 0
    LOAD_SCALED mm1, 0, 1
    LOAD_SCALED mm2, 0, 2
    LOAD_SCALED mm3, 0, 3

    ; After the add, the low word of each float lane is the rounded result
    ; and the high word ('**') is discarded by the interleaves below.
    pfadd       mm0, mm7                ; mm0 = (00 ** 01 **)
    pfadd       mm1, mm7                ; mm1 = (02 ** 03 **)
    pfadd       mm2, mm7                ; mm2 = (04 ** 05 **)
    pfadd       mm3, mm7                ; mm3 = (06 ** 07 **)

    movq        mm4, mm0
    punpcklwd   mm0, mm1                ; mm0 = (00 02 ** **)
    punpckhwd   mm4, mm1                ; mm4 = (01 03 ** **)
    punpcklwd   mm0, mm4                ; mm0 = (00 01 02 03)

    movq        mm5, mm2
    punpcklwd   mm2, mm3                ; mm2 = (04 06 ** **)
    punpckhwd   mm5, mm3                ; mm5 = (05 07 ** **)
    punpcklwd   mm2, mm5                ; mm2 = (04 05 06 07)

    ; ---- MMBLOCK(1,*): elements 10..17 ----
    ; mm0 and mm2 keep the packed results from above until the stores;
    ; mm1, mm3, mm4 and mm5 are free for reuse.
    LOAD_SCALED mm6, 1, 0
    LOAD_SCALED mm1, 1, 1
    LOAD_SCALED mm3, 1, 2
    LOAD_SCALED mm4, 1, 3

    pfadd       mm6, mm7                ; mm6 = (10 ** 11 **)
    pfadd       mm1, mm7                ; mm1 = (12 ** 13 **)
    pfadd       mm3, mm7                ; mm3 = (14 ** 15 **)
    pfadd       mm4, mm7                ; mm4 = (16 ** 17 **)

    movq        mm5, mm6
    punpcklwd   mm6, mm1                ; mm6 = (10 12 ** **)
    punpckhwd   mm5, mm1                ; mm5 = (11 13 ** **)
    punpcklwd   mm6, mm5                ; mm6 = (10 11 12 13)

    movq        mm1, mm3                ; mm1 is no longer needed as a source
    punpcklwd   mm3, mm4                ; mm3 = (14 16 ** **)
    punpckhwd   mm1, mm4                ; mm1 = (15 17 ** **)
    punpcklwd   mm3, mm1                ; mm3 = (14 15 16 17)

    ; Write out all 16 packed coefficients.
    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

    ; Advance all three pointers by 16 elements.
    add         esi, byte 16*SIZEOF_FAST_FLOAT
    add         edx, byte 16*SIZEOF_FAST_FLOAT
    add         edi, byte 16*SIZEOF_JCOEF
    dec         eax
    jnz         near .next_sixteen

    femms                               ; empty MMX/3DNow! state

    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32