;
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (see the %defines below).
; Saved/restored : ebp, esi, edi
; Used as scratch: eax, ecx, edx, flags, mm0-mm3, mm6, mm7 (ebx is not used)
;
; Phase 1 (right-edge padding): when output_cols*2 exceeds image_width, the
;   last real sample of each of the first max_v_samp_factor input rows is
;   replicated into the bytes that follow it, so phase 2 can always read
;   whole MMWORDs.
; Phase 2 (downsample): for v_samp_factor rows, every horizontal pair of
;   input samples (a, b) becomes one output sample (a + b + bias) >> 1,
;   where bias alternates 0, 1, 0, 1, ... across the output samples.
;
; Nothing is done when width_in_blocks is 0; phase 2 is skipped when
; v_samp_factor is <= 0 (signed test).

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
    push        ebp
    mov         ebp, esp
    push        esi                     ; esi/edi are restored at .done
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols = width_in_blocks * 8 (DCTSIZE)
    jz          near .done              ; zero columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]    ; edx = image_width

    ; ---- Phase 1: pad the right edge of the input rows -------------------

    push        ecx                     ; keep output_cols for phase 2
    add         ecx, ecx                ; ecx = output_cols * 2 (padded input width)
    sub         ecx, edx                ; ecx = number of pad bytes per row
    jle         short .pad_done         ; no padding required

    mov         eax, INT [max_v_samp(ebp)]          ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; rep stosb must walk upward
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> current row pointer
    alignx      16, 7
.pad_row:
    mov         edi, JSAMPROW [esi]     ; edi = start of this row
    push        eax                     ; al is overwritten by the fill value
    push        ecx                     ; rep stosb consumes ecx

    add         edi, edx                ; edi = first byte past the real samples
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW           ; next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Phase 2: 2:1 horizontal downsample ------------------------------

    mov         eax, JDIMENSION [v_samp(ebp)]       ; eax = rows left to produce
    test        eax, eax
    jle         near .done

    pcmpeqw     mm6, mm6
    psrlw       mm6, BYTE_BIT           ; mm6 = low-byte mask in every word
    mov         edx, 0x00010000         ; bias pattern (image_width no longer needed)
    movd        mm7, edx
    punpckldq   mm7, mm7                ; mm7 = {0, 1, 0, 1} as words

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointers
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointers
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols is counted down below
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = inptr
    mov         edi, JSAMPROW [edi]     ; edi = outptr
    alignx      16, 7
.next_chunk:
    ; Each pass reads two MMWORDs of input and writes one MMWORD of output.

    ; first input MMWORD -> four word sums in mm0
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, mm0
    pand        mm0, mm6                ; even-indexed samples
    psrlw       mm2, BYTE_BIT           ; odd-indexed samples
    paddw       mm0, mm2
    paddw       mm0, mm7                ; + alternating bias
    psrlw       mm0, 1                  ; / 2

    ; second input MMWORD -> four word sums in mm1
    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm3, mm1
    pand        mm1, mm6
    psrlw       mm3, BYTE_BIT
    paddw       mm1, mm3
    paddw       mm1, mm7
    psrlw       mm1, 1

    packuswb    mm0, mm1                ; eight words -> eight output bytes
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edi, byte 1*SIZEOF_MMWORD           ; outptr
    add         esi, byte 2*SIZEOF_MMWORD           ; inptr
    sub         ecx, byte SIZEOF_MMWORD             ; output columns remaining
    jnz         short .next_chunk

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW           ; next input row pointer
    add         edi, byte SIZEOF_JSAMPROW           ; next output row pointer
    dec         eax                                 ; rows remaining
    jg          short .next_row

    emms                                ; empty MMX state

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret

; --------------------------------------------------------------------------
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
;                           JDIMENSION v_samp_factor,
;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                           JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (see the %defines below).
; Saved/restored : ebp, esi, edi
; Used as scratch: eax, ecx, edx, flags, mm0-mm7 (ebx is not used)
;
; Phase 1 (right-edge padding): when output_cols*2 exceeds image_width, the
;   last real sample of each of the first max_v_samp_factor input rows is
;   replicated into the bytes that follow it, so phase 2 can always read
;   whole MMWORDs.
; Phase 2 (downsample): for v_samp_factor output rows, two consecutive
;   input rows are consumed; every 2x2 group of input samples becomes one
;   output sample (sum of the four + bias) >> 2, where bias alternates
;   1, 2, 1, 2, ... across the output samples.
;
; Nothing is done when width_in_blocks is 0; phase 2 is skipped when
; v_samp_factor is <= 0 (signed test).

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
    push        ebp
    mov         ebp, esp
    push        esi                     ; esi/edi are restored at .done
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols = width_in_blocks * 8 (DCTSIZE)
    jz          near .done              ; zero columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]    ; edx = image_width

    ; ---- Phase 1: pad the right edge of the input rows -------------------

    push        ecx                     ; keep output_cols for phase 2
    add         ecx, ecx                ; ecx = output_cols * 2 (padded input width)
    sub         ecx, edx                ; ecx = number of pad bytes per row
    jle         short .pad_done         ; no padding required

    mov         eax, INT [max_v_samp(ebp)]          ; eax = rows left to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; rep stosb must walk upward
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> current row pointer
    alignx      16, 7
.pad_row:
    mov         edi, JSAMPROW [esi]     ; edi = start of this row
    push        eax                     ; al is overwritten by the fill value
    push        ecx                     ; rep stosb consumes ecx

    add         edi, edx                ; edi = first byte past the real samples
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row
    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW           ; next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- Phase 2: 2:1 horizontal, 2:1 vertical downsample ----------------

    mov         eax, JDIMENSION [v_samp(ebp)]       ; eax = output rows left
    test        eax, eax
    jle         near .done

    pcmpeqw     mm6, mm6
    psrlw       mm6, BYTE_BIT           ; mm6 = low-byte mask in every word
    mov         edx, 0x00020001         ; bias pattern (image_width no longer needed)
    movd        mm7, edx
    punpckldq   mm7, mm7                ; mm7 = {1, 2, 1, 2} as words

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointers
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointers
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols is counted down below
    push        edi
    push        esi

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = inptr0 (upper row)
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = inptr1 (lower row)
    mov         edi, JSAMPROW [edi]                     ; edi = outptr
    alignx      16, 7
.next_chunk:
    ; Each pass reads two MMWORDs from each input row and writes one
    ; MMWORD of output.

    ; first MMWORD of both rows -> four 2x2 sums in mm0
    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
    movq        mm4, mm0
    pand        mm0, mm6                ; upper row, even-indexed samples
    psrlw       mm4, BYTE_BIT           ; upper row, odd-indexed samples
    paddw       mm0, mm4

    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm5, mm1
    pand        mm1, mm6                ; lower row, even-indexed samples
    psrlw       mm5, BYTE_BIT           ; lower row, odd-indexed samples
    paddw       mm1, mm5

    paddw       mm0, mm1                ; upper + lower

    ; second MMWORD of both rows -> four 2x2 sums in mm2
    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
    movq        mm4, mm2
    pand        mm2, mm6
    psrlw       mm4, BYTE_BIT
    paddw       mm2, mm4

    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm5, mm3
    pand        mm3, mm6
    psrlw       mm5, BYTE_BIT
    paddw       mm3, mm5

    paddw       mm2, mm3                ; upper + lower

    ; bias, scale and pack
    paddw       mm0, mm7
    paddw       mm2, mm7
    psrlw       mm0, 2                  ; / 4
    psrlw       mm2, 2

    packuswb    mm0, mm2                ; eight words -> eight output bytes
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0

    add         edi, byte 1*SIZEOF_MMWORD           ; outptr
    add         edx, byte 2*SIZEOF_MMWORD           ; inptr0
    add         esi, byte 2*SIZEOF_MMWORD           ; inptr1
    sub         ecx, byte SIZEOF_MMWORD             ; output columns remaining
    jnz         near .next_chunk

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW         ; skip the two input rows used
    add         edi, byte 1*SIZEOF_JSAMPROW         ; next output row pointer
    dec         eax                                 ; output rows remaining
    jg          near .next_row

    emms                                ; empty MMX state

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32