;
; jcsample.asm - downsampling (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Arguments are fetched from the stack through ebp (offsets defined below).
; Saved/restored: ebp, esi, edi.  Overwritten: eax, ecx, edx, flags,
; ymm0-ymm3, ymm6, ymm7.  ebx is never touched.
; Every output sample is (even + odd + bias) >> 1 of one horizontal pair of
; input samples, where the bias alternates 0, 1 across output columns.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        ebp
    mov         ebp, esp
    ; ebx is unused, and ecx/edx are not preserved, so only esi/edi are saved.
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks * 8)
    jz          near .done              ; no output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; ---- Stage 1: pad the right edge of the input rows ----
    ; Each row is extended with copies of its last real sample until it holds
    ; 2 * output_cols samples.

    push        ecx                     ; keep output_cols for stage 2
    shl         ecx, 1                  ; ecx = 2 * output_cols
    sub         ecx, edx                ; ecx = samples to append per row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; stosb must advance edi upward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> row pointer array
    alignx      16, 7
.pad_row:
    push        eax                     ; al is borrowed for the fill value
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first byte past image_width
    mov         al, JSAMPLE [edi-1]     ; al = last real sample in this row

    rep stosb                           ; append ecx copies of al

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW   ; advance to the next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols

    ; ---- Stage 2: h2v1 downsample ----

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = rows still to produce
    test        eax, eax
    jle         near .done

    ; Loop constants (vector instructions leave the flags alone).
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6 = {0xFF 0x00 0xFF 0x00 ..}
    mov         edx, 0x00010000         ; bias pattern (image_width is dead now)
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7 = {0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7 = {xmm7, xmm7}

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointers
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointers
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols, restored per row
    push        edi
    push        esi

    mov         edi, JSAMPROW [edi]     ; edi = output sample pointer
    mov         esi, JSAMPROW [esi]     ; esi = input sample pointer

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .chunk_full
    alignx      16, 7

.chunk_tail24:
    ; Fewer than SIZEOF_YMMWORD output columns remain: ecx is 8, 16 or 24.
    ; Load only the input that exists; an xmm load clears the upper half of
    ; the ymm register.  ecx is then forced to SIZEOF_YMMWORD so the common
    ; bookkeeping below ends the row.
    cmp         ecx, 24
    jne         .chunk_tail16
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.chunk_tail16:
    cmp         ecx, 16
    jne         .chunk_tail8
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1        ; no second half
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.chunk_tail8:
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1        ; no second half
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average
    alignx      16, 7

.chunk_full:
    ; Two full YMMWORDs of input produce one YMMWORD of output.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]

.average:
    vpand       ymm2, ymm0, ymm6        ; ymm2 = even samples of first half
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0 = odd samples of first half
    vpand       ymm3, ymm1, ymm6        ; ymm3 = even samples of second half
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1 = odd samples of second half

    vpaddw      ymm0, ymm0, ymm2        ; pair sums
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1           ; / 2
    vpsrlw      ymm1, ymm1, 1

    vpackuswb   ymm0, ymm0, ymm1        ; words -> bytes, packed per 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; qword order 0,2,1,3 undoes the lane split

    ; NOTE(review): a full YMMWORD is stored even on the 8/16/24-column tail;
    ; presumably the output row has room for it -- confirm against the caller.
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    add         esi, byte 2*SIZEOF_YMMWORD  ; input pointer
    add         edi, byte 1*SIZEOF_YMMWORD  ; output pointer
    sub         ecx, byte SIZEOF_YMMWORD    ; output columns left
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .chunk_full
    test        ecx, ecx
    jnz         near .chunk_tail24

    pop         esi
    pop         edi
    pop         ecx

    add         edi, byte SIZEOF_JSAMPROW   ; next output row pointer
    add         esi, byte SIZEOF_JSAMPROW   ; next input row pointer
    dec         eax                         ; one row done
    jg          near .next_row

.done:
    vzeroupper
    pop         edi
    pop         esi
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Arguments are fetched from the stack through ebp (offsets defined below).
; Saved/restored: ebp, esi, edi.  Overwritten: eax, ecx, edx, flags,
; ymm0-ymm7.  ebx is never touched.
; Every output sample is (sum of a 2x2 input square + bias) >> 2, where the
; bias alternates 1, 2 across output columns.
;

%define img_width(b)    (b) + 8         ; JDIMENSION image_width
%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        ebp
    mov         ebp, esp
    ; ebx is unused, and ecx/edx are not preserved, so only esi/edi are saved.
    push        esi
    push        edi

    mov         ecx, JDIMENSION [width_blks(ebp)]
    shl         ecx, 3                  ; ecx = output_cols (width_in_blocks * 8)
    jz          near .done              ; no output columns: nothing to do

    mov         edx, JDIMENSION [img_width(ebp)]

    ; ---- Stage 1: pad the right edge of the input rows ----
    ; Each row is extended with copies of its last real sample until it holds
    ; 2 * output_cols samples.

    push        ecx                     ; keep output_cols for stage 2
    shl         ecx, 1                  ; ecx = 2 * output_cols
    sub         ecx, edx                ; ecx = samples to append per row
    jle         short .pad_done         ; rows are already wide enough

    mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows to pad
    test        eax, eax
    jle         short .pad_done

    cld                                 ; stosb must advance edi upward
    mov         esi, JSAMPARRAY [input_data(ebp)]  ; esi -> row pointer array
    alignx      16, 7
.pad_row:
    push        eax                     ; al is borrowed for the fill value
    push        ecx                     ; rep stosb counts ecx down to zero

    mov         edi, JSAMPROW [esi]
    add         edi, edx                ; edi -> first byte past image_width
    mov         al, JSAMPLE [edi-1]     ; al = last real sample in this row

    rep stosb                           ; append ecx copies of al

    pop         ecx
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW   ; advance to the next row pointer
    dec         eax
    jg          short .pad_row

.pad_done:
    pop         ecx                     ; ecx = output_cols

    ; ---- Stage 2: h2v2 downsample ----

    mov         eax, JDIMENSION [v_samp(ebp)]  ; eax = output rows still to produce
    test        eax, eax
    jle         near .done

    ; Loop constants (vector instructions leave the flags alone).
    mov         edx, 0x00020001         ; bias pattern (image_width is dead now)
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7 = {1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7 = {xmm7, xmm7}
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6 = {0xFF 0x00 0xFF 0x00 ..}

    mov         edi, JSAMPARRAY [output_data(ebp)]  ; edi -> output row pointers
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row pointers
    alignx      16, 7
.next_row:
    push        ecx                     ; output_cols, restored per row
    push        edi
    push        esi

    mov         edi, JSAMPROW [edi]                      ; edi = output samples
    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]    ; edx = upper input row
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]    ; esi = lower input row

    cmp         ecx, byte SIZEOF_YMMWORD
    jae         short .chunk_full
    alignx      16, 7

.chunk_tail24:
    ; Fewer than SIZEOF_YMMWORD output columns remain (8, 16 or 24, as ecx is
    ; a multiple of 8).  Load only the input that exists; an xmm load clears
    ; the upper half of the ymm register.  ecx is then forced to
    ; SIZEOF_YMMWORD so the common bookkeeping below ends the row.
    cmp         ecx, 24
    jne         .chunk_tail16
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]
    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.chunk_tail16:
    cmp         ecx, 16
    jne         .chunk_tail8
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2        ; no second half
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average

.chunk_tail8:
    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpxor       ymm2, ymm2, ymm2        ; no second half
    vpxor       ymm3, ymm3, ymm3
    mov         ecx, SIZEOF_YMMWORD
    jmp         short .average
    alignx      16, 7

.chunk_full:
    ; Two full YMMWORDs from each input row produce one YMMWORD of output.
    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]

.average:
    ; First half: horizontal pair sums of both rows, then the vertical add.
    vpand       ymm4, ymm0, ymm6        ; even samples, upper row
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd samples, upper row
    vpand       ymm5, ymm1, ymm6        ; even samples, lower row
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; odd samples, lower row
    vpaddw      ymm0, ymm0, ymm4
    vpaddw      ymm1, ymm1, ymm5
    vpaddw      ymm0, ymm0, ymm1        ; ymm0 = 2x2 sums, first half

    ; Second half: same steps on ymm2/ymm3 (ymm4/ymm5 reused as scratch).
    vpand       ymm4, ymm2, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5
    vpaddw      ymm2, ymm2, ymm3        ; ymm2 = 2x2 sums, second half

    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2           ; / 4
    vpsrlw      ymm2, ymm2, 2

    vpackuswb   ymm0, ymm0, ymm2        ; words -> bytes, packed per 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; qword order 0,2,1,3 undoes the lane split

    ; NOTE(review): a full YMMWORD is stored even on the 8/16/24-column tail;
    ; presumably the output row has room for it -- confirm against the caller.
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0

    add         edx, byte 2*SIZEOF_YMMWORD  ; upper input pointer
    add         esi, byte 2*SIZEOF_YMMWORD  ; lower input pointer
    add         edi, byte 1*SIZEOF_YMMWORD  ; output pointer
    sub         ecx, byte SIZEOF_YMMWORD    ; output columns left
    cmp         ecx, byte SIZEOF_YMMWORD
    jae         near .chunk_full
    test        ecx, ecx
    jnz         near .chunk_tail24

    pop         esi
    pop         edi
    pop         ecx

    add         edi, byte 1*SIZEOF_JSAMPROW  ; one output row produced
    add         esi, byte 2*SIZEOF_JSAMPROW  ; two input rows consumed
    dec         eax                          ; one row done
    jg          near .next_row

.done:
    vzeroupper
    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32