;
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32

; --------------------------------------------------------------------------
; jsimd_h2v1_downsample_sse2
;
; Downsamples one component 2:1 horizontally and 1:1 vertically, with no
; smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Work done, in order:
;   1. output_cols = width_blocks * 8.  If output_cols * 2 exceeds
;      image_width, each of the first max_v_samp_factor input rows is padded
;      on the right by repeating its sample at column image_width - 1 until
;      the row is output_cols * 2 samples wide.
;   2. For each of v_samp_factor rows, every horizontal pair of input samples
;      (a, b) becomes one output sample (a + b + bias) >> 1, where bias is
;      0 for even output columns and 1 for odd ones.
;
; Arguments are read from the stack relative to ebp (first one at ebp+8).
; Rows are read and written with movdqa, so the row pointers must be
; 16-byte aligned.
; Saved/restored: ebp, esi, edi.
; Overwritten:    eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags.
;
; Note: when only 8 output columns remain in a row, a full 16-byte vector is
; still stored; its upper 8 bytes are zero.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

    align       16
    global      EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        ebp
    mov         ebp, esp
    ; ebx is not used here; ecx and edx are used as scratch without saving.
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_blocks * DCTSIZE)
    jz          near .done              ; no blocks -> nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; ---- step 1: pad the right edge of the input rows ----

    push        ecx                     ; park output_cols during the padding pass
    shl         ecx, 1                  ; ecx = output_cols * 2 = padded input width
    sub         ecx, edx                ; ecx = samples to append to each row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]      ; eax = rows still to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; rep stosb has to move forward
    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = row pointer cursor
    alignx      16, 7
.pad_row:
    push        eax                     ; al is reused for the fill value
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first sample past image_width
    mov         al, JSAMPLE [edi-1]     ; al = rightmost real sample

    rep stosb                           ; write it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW       ; advance to the next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- step 2: average horizontal pairs ----

    mov         eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows remaining
    test        eax, eax
    jle         near .done

    mov         edx, 0x00010000         ; one dword = the two words {0, 1}
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; xmm6 = all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7 = bias words {0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6 = byte mask {0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = input row-pointer cursor
    mov         edi, JSAMPARRAY [output_data(ebp)]      ; edi = output row-pointer cursor
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols, reloaded after the row
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; esi = current input row
    mov         edi, JSAMPROW [edi]     ; edi = current output row

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .cols16
    alignx      16, 7

.cols8:
    ; Less than one full output vector is left: use a single input vector,
    ; treat the second as zero, and make the column counter reach exactly 0.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         ecx, SIZEOF_XMMWORD
    jmp         short .average
    alignx      16, 7

.cols16:
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; low byte of each word  (first of each pair)
    psrlw       xmm2, BYTE_BIT          ; high byte of each word (second of each pair)
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; a + b
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; >> 1
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; narrow both word vectors into one byte vector

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD        ; output columns left in this row
    add         esi, byte 2*SIZEOF_XMMWORD      ; input advances two vectors
    add         edi, byte 1*SIZEOF_XMMWORD      ; output advances one vector
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .cols16
    test        ecx, ecx
    jnz         short .cols8

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW       ; next input row pointer
    add         edi, byte SIZEOF_JSAMPROW       ; next output row pointer
    dec         eax
    jg          near .next_row

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret

; --------------------------------------------------------------------------
; jsimd_h2v2_downsample_sse2
;
; Downsamples one component 2:1 horizontally and 2:1 vertically, with no
; smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Work done, in order:
;   1. The same right-edge padding pass as the h2v1 routine: each of the
;      first max_v_samp_factor input rows is widened to output_cols * 2
;      samples by repeating its sample at column image_width - 1.
;   2. For each of v_samp_factor output rows, two consecutive input rows are
;      consumed and every 2x2 group of samples becomes one output sample
;      (sum of the four + bias) >> 2, where bias is 1 for even output
;      columns and 2 for odd ones.
;
; Arguments are read from the stack relative to ebp (first one at ebp+8).
; Rows are read and written with movdqa, so the row pointers must be
; 16-byte aligned.
; Saved/restored: ebp, esi, edi.
; Overwritten:    eax, ecx, edx, xmm0-xmm7, flags.
;
; Note: when only 8 output columns remain in a row, a full 16-byte vector is
; still stored; its upper 8 bytes are zero.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

    align       16
    global      EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        ebp
    mov         ebp, esp
    ; ebx is not used here; ecx and edx are used as scratch without saving.
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_blocks * DCTSIZE)
    jz          near .done              ; no blocks -> nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; ---- step 1: pad the right edge of the input rows ----

    push        ecx                     ; park output_cols during the padding pass
    shl         ecx, 1                  ; ecx = output_cols * 2 = padded input width
    sub         ecx, edx                ; ecx = samples to append to each row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]      ; eax = rows still to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; rep stosb has to move forward
    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = row pointer cursor
    alignx      16, 7
.pad_row:
    push        eax                     ; al is reused for the fill value
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first sample past image_width
    mov         al, JSAMPLE [edi-1]     ; al = rightmost real sample

    rep stosb                           ; write it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW       ; advance to the next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols again

    ; ---- step 2: average 2x2 groups ----

    mov         eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows remaining
    test        eax, eax
    jle         near .done

    mov         edx, 0x00020001         ; one dword = the two words {1, 2}
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; xmm6 = all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7 = bias words {1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6 = byte mask {0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]       ; esi = input row-pointer cursor
    mov         edi, JSAMPARRAY [output_data(ebp)]      ; edi = output row-pointer cursor
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols, reloaded after the row
    push        edi
    push        esi

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = upper input row
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = lower input row
    mov         edi, JSAMPROW [edi]                     ; edi = output row

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .cols16
    alignx      16, 7

.cols8:
    ; Less than one full output vector is left: use one vector from each
    ; input row, treat the second pair as zero, and make the column counter
    ; reach exactly 0.
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         ecx, SIZEOF_XMMWORD
    jmp         short .average
    alignx      16, 7

.cols16:
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
    ; First vector of each row: horizontal pair sums, kept per row.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; low byte of each word
    psrlw       xmm4, BYTE_BIT          ; high byte of each word
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4              ; upper row pair sums
    paddw       xmm1, xmm5              ; lower row pair sums

    ; Second vector of each row: same treatment.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; add the two rows -> 2x2 sums
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; >> 2
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; narrow both word vectors into one byte vector

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD        ; output columns left in this row
    add         edx, byte 2*SIZEOF_XMMWORD      ; upper input row advances two vectors
    add         esi, byte 2*SIZEOF_XMMWORD      ; lower input row advances two vectors
    add         edi, byte 1*SIZEOF_XMMWORD      ; output advances one vector
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         near .cols16
    test        ecx, ecx
    jnz         near .cols8

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW     ; skip the two input rows just used
    add         edi, byte 1*SIZEOF_JSAMPROW     ; next output row pointer
    dec         eax
    jg          near .next_row

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       16