;
; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
; GLOBAL(void)
; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT *workspace);
;
; Purpose:
;   Copy sample data into the floating-point workspace, converting each
;   unsigned sample to a signed value (centered on zero) and then to float.
;
; Arguments (32-bit stack frame, addressed through ebp):
;   [ebp+8]   sample_data  array of row pointers
;   [ebp+12]  start_col    column offset applied to every row pointer
;   [ebp+16]  workspace    destination buffer of FAST_FLOAT values
;
; Register roles inside the loop:
;   esi = cursor into the row-pointer array (advances two rows per pass)
;   eax = start_col
;   edi = cursor into workspace (advances two rows of floats per pass)
;   ecx = remaining passes (DCTSIZE/2; each pass handles two rows)
;   ebx, edx = the two row pointers of the current pass
;   mm7 = 0x80 in every byte (the level-shift constant)
;
; Saved/restored: ebp, ebx, esi, edi.  ecx and edx are used without being
; saved (they need not be preserved).  MMX/3DNow! state is cleared with
; femms before returning.
;
; Lane notation in the comments below: digits/letters name samples of the
; two rows (0-7 first row, 8-F second row); '*' marks a don't-care byte
; that is discarded later by the arithmetic right shift.
; --------------------------------------------------------------------------

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)

EXTN(jsimd_convsamp_float_3dnow):
    ; --- prologue ---
    push        ebp
    mov         ebp, esp
    push        ebx
    push        esi
    push        edi

    ; Build the byte constant 0x80 in all eight lanes without touching
    ; memory: all-ones words -> 0xFF80 words -> saturating pack to bytes.
    pcmpeqw     mm7, mm7
    psllw       mm7, 7
    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x8080..80)

    ; --- load arguments ---
    mov         esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]        ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/2                  ; two rows per pass
    alignx      16, 7
.convert_rows:
    ; Fetch the two row pointers handled by this pass.
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

    ; Load eight samples from each row, starting at column start_col.
    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Level shift: subtract 0x80 from every byte (unsigned -> signed).
    psubb       mm0, mm7                ; mm0 = (01234567)
    psubb       mm1, mm7                ; mm1 = (89ABCDEF)

    ; Widen bytes to words.  Each sample lands in the HIGH byte of its
    ; word; the low byte is whatever the destination held before.
    punpcklbw   mm2, mm0                ; mm2 = (*0*1*2*3)
    punpckhbw   mm0, mm0                ; mm0 = (*4*5*6*7)
    punpcklbw   mm3, mm1                ; mm3 = (*8*9*A*B)
    punpckhbw   mm1, mm1                ; mm1 = (*C*D*E*F)

    ; Widen words to dwords for the first row.  Each sample is now the
    ; top byte of its dword.
    punpcklwd   mm4, mm2                ; mm4 = (***0***1)
    punpckhwd   mm2, mm2                ; mm2 = (***2***3)
    punpcklwd   mm5, mm0                ; mm5 = (***4***5)
    punpckhwd   mm0, mm0                ; mm0 = (***6***7)

    ; Arithmetic shift drops the don't-care bytes and sign-extends the
    ; sample; pi2fd then converts the packed int32 pair to packed float.
    psrad       mm4, (DWORD_BIT-BYTE_BIT)       ; mm4 = (0 1) as int32
    psrad       mm2, (DWORD_BIT-BYTE_BIT)       ; mm2 = (2 3) as int32
    pi2fd       mm4, mm4
    pi2fd       mm2, mm2
    psrad       mm5, (DWORD_BIT-BYTE_BIT)       ; mm5 = (4 5) as int32
    psrad       mm0, (DWORD_BIT-BYTE_BIT)       ; mm0 = (6 7) as int32
    pi2fd       mm5, mm5
    pi2fd       mm0, mm0

    ; Store the first row (eight floats) to the workspace.
    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0

    ; Same widening for the second row (mm4 is reused now that the
    ; first row has been stored).
    punpcklwd   mm6, mm3                ; mm6 = (***8***9)
    punpckhwd   mm3, mm3                ; mm3 = (***A***B)
    punpcklwd   mm4, mm1                ; mm4 = (***C***D)
    punpckhwd   mm1, mm1                ; mm1 = (***E***F)

    psrad       mm6, (DWORD_BIT-BYTE_BIT)       ; mm6 = (8 9) as int32
    psrad       mm3, (DWORD_BIT-BYTE_BIT)       ; mm3 = (A B) as int32
    pi2fd       mm6, mm6
    pi2fd       mm3, mm3
    psrad       mm4, (DWORD_BIT-BYTE_BIT)       ; mm4 = (C D) as int32
    psrad       mm1, (DWORD_BIT-BYTE_BIT)       ; mm1 = (E F) as int32
    pi2fd       mm4, mm4
    pi2fd       mm1, mm1

    ; Store the second row (eight floats) to the workspace.
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1

    ; Advance both cursors by two rows and loop.
    add         esi, byte 2*SIZEOF_JSAMPROW
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    dec         ecx
    jnz         near .convert_rows

    femms                               ; empty MMX/3DNow! state

    ; --- epilogue ---
    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
; GLOBAL(void)
; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                            FAST_FLOAT *workspace);
;
; Purpose:
;   Quantize/descale the coefficients held in workspace and store the
;   results into coef_block.  Each workspace value is multiplied by the
;   matching entry of divisors (presumably a table of reciprocals -- the
;   table is built elsewhere; confirm against the caller), rounded to an
;   integer, and written as a 16-bit word.
;
; Rounding technique:
;   The constant 12582912.0f (bit pattern 0x4B400000) is added to every
;   product.  After the add, the low 16 bits of each float's bit pattern
;   are taken as the result; the unpack sequence below gathers exactly
;   those low words.  Values too large for this trick are not handled
;   specially here.
;
; Arguments (32-bit stack frame, addressed through ebp):
;   [ebp+8]   coef_block  destination coefficient block
;   [ebp+12]  divisors    per-coefficient multipliers (FAST_FLOAT)
;   [ebp+16]  workspace   source coefficients (FAST_FLOAT)
;
; Register roles inside the loop:
;   esi = workspace cursor, edx = divisors cursor, edi = coef_block cursor
;   eax = remaining passes (DCTSIZE2/16; sixteen coefficients per pass)
;   mm7 = {12582912.0f, 12582912.0f}
;
; Saved/restored: ebp, esi, edi.  eax and edx are used without being
; saved; ebx and ecx are not used.  MMX/3DNow! state is cleared with femms
; before returning.
;
; Lane notation: "rc" names the coefficient in row r, column c of the
; current pass; '**' marks a don't-care 16-bit word.
; --------------------------------------------------------------------------

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; FAST_FLOAT *divisors
%define workspace   ebp + 16            ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)

EXTN(jsimd_quantize_float_3dnow):
    ; --- prologue ---
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    ; Broadcast the rounding constant into both float lanes of mm7.
    mov         eax, 0x4B400000         ; (float)0x00C00000 (rndint_magic)
    movd        mm7, eax
    punpckldq   mm7, mm7                ; mm7 = {12582912.0F 12582912.0F}

    ; --- load arguments ---
    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/16        ; sixteen coefficients per pass
    alignx      16, 7
.quantize_rows:
    ; ---- first row of the pass: eight products ----
    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    pfmul       mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movq        mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
    pfmul       mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
    pfmul       mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]

    ; Add the rounding constant; the wanted integer is then the low word
    ; of each dword.
    pfadd       mm0, mm7                ; mm0 = (00 ** 01 **)
    pfadd       mm1, mm7                ; mm1 = (02 ** 03 **)
    pfadd       mm2, mm7                ; mm2 = (04 ** 05 **)
    pfadd       mm3, mm7                ; mm3 = (06 ** 07 **)

    ; Gather the four low words of each register pair into one register.
    movq        mm4, mm0
    punpcklwd   mm0, mm1                ; mm0 = (00 02 ** **)
    punpckhwd   mm4, mm1                ; mm4 = (01 03 ** **)
    movq        mm5, mm2
    punpcklwd   mm2, mm3                ; mm2 = (04 06 ** **)
    punpckhwd   mm5, mm3                ; mm5 = (05 07 ** **)

    punpcklwd   mm0, mm4                ; mm0 = (00 01 02 03)
    punpcklwd   mm2, mm5                ; mm2 = (04 05 06 07)

    ; ---- second row of the pass: eight products ----
    movq        mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    pfmul       mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    pfmul       mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
    movq        mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
    pfmul       mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
    pfmul       mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

    pfadd       mm6, mm7                ; mm6 = (10 ** 11 **)
    pfadd       mm1, mm7                ; mm1 = (12 ** 13 **)
    pfadd       mm3, mm7                ; mm3 = (14 ** 15 **)
    pfadd       mm4, mm7                ; mm4 = (16 ** 17 **)

    movq        mm5, mm6
    punpcklwd   mm6, mm1                ; mm6 = (10 12 ** **)
    punpckhwd   mm5, mm1                ; mm5 = (11 13 ** **)
    movq        mm1, mm3
    punpcklwd   mm3, mm4                ; mm3 = (14 16 ** **)
    punpckhwd   mm1, mm4                ; mm1 = (15 17 ** **)

    punpcklwd   mm6, mm5                ; mm6 = (10 11 12 13)
    punpcklwd   mm3, mm1                ; mm3 = (14 15 16 17)

    ; ---- write sixteen 16-bit coefficients ----
    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

    ; Advance all three cursors by sixteen elements and loop.
    add         esi, byte 16*SIZEOF_FAST_FLOAT
    add         edx, byte 16*SIZEOF_FAST_FLOAT
    add         edi, byte 16*SIZEOF_JCOEF
    dec         eax
    jnz         near .quantize_rows

    femms                               ; empty MMX/3DNow! state

    ; --- epilogue ---
    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32