;
; jcsample.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (offsets defined below).
;
; Operation:
;   1. output_cols = width_in_blocks << 3.  If it is zero, return.
;   2. If output_cols * 2 > image_width, each of the first max_v_samp_factor
;      input rows is padded in place: the byte at row[image_width - 1] is
;      replicated into row[image_width .. output_cols * 2 - 1].
;   3. For each of v_samp_factor rows, every output byte is
;          (in[2*i] + in[2*i+1] + bias) >> 1
;      where bias alternates 0, 1, 0, 1, ... across output columns.
;
; Registers:
;   Preserved:  ebp, esi, edi (saved/restored here)
;   Modified:   eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags
;               (the direction flag is cleared by cld when step 2 runs)
;
; Alignment: row data is accessed with movdqa, which faults on addresses
; that are not 16-byte aligned, so every input and output row pointer must
; be 16-byte aligned.
;
; NOTE(review): the column loop always stores one full XMMWORD per
; iteration, including the remainder iteration, so output rows must have
; room for output_cols rounded up to a multiple of SIZEOF_XMMWORD bytes
; (SIZEOF_XMMWORD comes from jsimdext.inc; presumably 16 -- confirm).
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero columns

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for the main loop
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of padding bytes per row
    jle         short .expand_end       ; input already wide enough

    mov         eax, INT [max_v_samp(ebp)]
    test        eax, eax
    jle         short .expand_end       ; no rows to pad

    cld                                 ; stosb must advance edi upward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is reused as the fill byte
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first byte past the image data
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; output_cols, reloaded for each row
    push        edi                     ; output_data row-pointer cursor
    push        esi                     ; input_data row-pointer cursor

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    alignx      16, 7

.columnloop_r8:
    ; Remainder: fewer than SIZEOF_XMMWORD output columns are left.  Load a
    ; single input XMMWORD and use zero for the second one, then force ecx
    ; so that the loop terminates after this iteration.
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         ecx, SIZEOF_XMMWORD
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed bytes as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed bytes as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; horizontal pair sums
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; add alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; divide by 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; words -> bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        ecx, ecx
    jnz         short .columnloop_r8

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         eax                        ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (offsets defined below).
;
; Operation:
;   1. output_cols = width_in_blocks << 3.  If it is zero, return.
;   2. If output_cols * 2 > image_width, each of the first max_v_samp_factor
;      input rows is padded in place: the byte at row[image_width - 1] is
;      replicated into row[image_width .. output_cols * 2 - 1].
;   3. For each of v_samp_factor output rows, two consecutive input rows
;      (r0, r1) are consumed and every output byte is
;          (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2
;      where bias alternates 1, 2, 1, 2, ... across output columns.
;
; Registers:
;   Preserved:  ebp, esi, edi (saved/restored here)
;   Modified:   eax, ecx, edx, xmm0-xmm7, flags
;               (the direction flag is cleared by cld when step 2 runs)
;
; Alignment: row data is accessed with movdqa, which faults on addresses
; that are not 16-byte aligned, so every input and output row pointer must
; be 16-byte aligned.
;
; NOTE(review): the column loop always stores one full XMMWORD per
; iteration, including the remainder iteration, so output rows must have
; room for output_cols rounded up to a multiple of SIZEOF_XMMWORD bytes
; (SIZEOF_XMMWORD comes from jsimdext.inc; presumably 16 -- confirm).
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    jz          near .return            ; nothing to do for zero columns

    mov         edx, JDIMENSION [img_width(ebp)]

    ; -- expand_right_edge

    push        ecx                     ; keep output_cols for the main loop
    shl         ecx, 1                  ; output_cols * 2
    sub         ecx, edx                ; ecx = number of padding bytes per row
    jle         short .expand_end       ; input already wide enough

    mov         eax, INT [max_v_samp(ebp)]
    test        eax, eax
    jle         short .expand_end       ; no rows to pad

    cld                                 ; stosb must advance edi upward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    alignx      16, 7
.expandloop:
    push        eax                     ; al is reused as the fill byte
    push        ecx                     ; rep stosb consumes ecx

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi = first byte past the image data
    mov         al, JSAMPLE [edi-1]     ; al = last real sample of the row

    rep stosb                           ; replicate it ecx times

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
    dec         eax
    jg          short .expandloop

.expand_end:
    pop         ecx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    test        eax, eax
    jle         near .return

    mov         edx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    alignx      16, 7
.rowloop:
    push        ecx                     ; output_cols, reloaded for each row
    push        edi                     ; output_data row-pointer cursor
    push        esi                     ; input_data row-pointer cursor

    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         edi, JSAMPROW [edi]                    ; outptr

    cmp         ecx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    alignx      16, 7

.columnloop_r8:
    ; Remainder: fewer than SIZEOF_XMMWORD output columns are left.  Load a
    ; single XMMWORD from each input row and use zero for the second pair,
    ; then force ecx so that the loop terminates after this iteration.
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         ecx, SIZEOF_XMMWORD
    jmp         short .downsample
    alignx      16, 7

.columnloop:
    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.downsample:
    ; First XMMWORD of each row: horizontal pair sums as words.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-indexed bytes as words
    psrlw       xmm4, BYTE_BIT          ; odd-indexed bytes as words
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second XMMWORD of each row: same treatment.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; add the two rows together
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; add alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; divide by 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; words -> bytes

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         ecx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        ecx, ecx
    jnz         near .columnloop_r8

    pop         esi
    pop         edi
    pop         ecx

    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         eax                          ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32