;
; jdsample.asm - upsampling (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_avx2)

EXTN(jconst_fancy_upsample_avx2):

PW_ONE   times 16 dw 1
PW_TWO   times 16 dw 2
PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter". This is a good compromise between
; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
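; A scalar sketch of the arithmetic performed below (for reference only;
; the array names are illustrative, not taken from this source):
;
;   out[2*i]   = (3 * in[i] + in[i-1] + 1) >> 2   ; output at the 1/4 position
;   out[2*i+1] = (3 * in[i] + in[i+1] + 2) >> 2   ; output at the 3/4 position
;
; PW_THREE supplies the center weight and PW_ONE/PW_TWO the rounding terms;
; the first and last input samples are replicated at the row edges.
;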

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)

EXTN(jsimd_h2v1_fancy_upsample_avx2):
    push        ebp
    mov         ebp, esp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        eax                     ; colctr
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    test        eax, SIZEOF_YMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
.skip:
    vpxor       ymm0, ymm0, ymm0        ; ymm0=(all 0's)
    vpcmpeqb    xmm7, xmm7, xmm7
    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
    vpand       ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]

    add         eax, byte SIZEOF_YMMWORD-1
    and         eax, byte -SIZEOF_YMMWORD
    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    vpcmpeqb    xmm6, xmm6, xmm6
    vpslldq     xmm6, xmm6, (SIZEOF_XMMWORD-1)
    vperm2i128  ymm6, ymm6, ymm6, 1     ; (---- ---- ... ---- ---- ff) MSB is ff
    vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
    jmp         short .upsample
    alignx      16, 7

.columnloop:
    vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
    vperm2i128  ymm6, ymm0, ymm6, 0x20
    vpslldq     ymm6, ymm6, 15

.upsample:
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm1=( 0 1 2 ... 29 30 31)

    vperm2i128  ymm2, ymm0, ymm1, 0x20
    vpalignr    ymm2, ymm1, ymm2, 15    ; ymm2=(-- 0 1 ... 28 29 30)
    vperm2i128  ymm4, ymm0, ymm1, 0x03
    vpalignr    ymm3, ymm4, ymm1, 1     ; ymm3=( 1 2 3 ... 30 31 --)

    vpor        ymm2, ymm2, ymm7        ; ymm2=(-1 0 1 ... 28 29 30)
    vpor        ymm3, ymm3, ymm6        ; ymm3=( 1 2 3 ... 30 31 32)

    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)

    vpunpckhbw  ymm4, ymm1, ymm0        ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm1, ymm0        ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm5, ymm4, 0x20  ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm2, ymm0        ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    vpunpcklbw  ymm6, ymm2, ymm0        ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
    vperm2i128  ymm2, ymm6, ymm5, 0x20  ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpunpckhbw  ymm6, ymm3, ymm0        ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    vpunpcklbw  ymm0, ymm3, ymm0        ; ymm0=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
    vperm2i128  ymm3, ymm0, ymm6, 0x20  ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
    vperm2i128  ymm6, ymm0, ymm6, 0x31  ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vpxor       ymm0, ymm0, ymm0        ; ymm0=(all 0's)

    vpmullw     ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
    vpaddw      ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
    vpaddw      ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]

    vpaddw      ymm2, ymm2, ymm1
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm2, ymm2, 2           ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 2           ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm3, ymm3, ymm1
    vpaddw      ymm6, ymm6, ymm4
    vpsrlw      ymm3, ymm3, 2           ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm6, ymm6, 2           ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm3, ymm3, BYTE_BIT
    vpsllw      ymm6, ymm6, BYTE_BIT
    vpor        ymm2, ymm2, ymm3        ; ymm2=OutL=( 0 1 2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm6        ; ymm5=OutH=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5

    sub         eax, byte SIZEOF_YMMWORD
    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .columnloop
    test        eax, eax
    jnz         near .columnloop_last

    pop         esi
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                     ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
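; A scalar sketch of the two-pass arithmetic performed below (for reference
; only; the variable names are illustrative, not taken from this source):
;
;   vertical pass (16-bit intermediate, weighted 3:1 toward the current row):
;     int[i] = 3 * in0[i] + in1[i]      ; in1 = row above for the upper output
;                                       ;       row, row below for the lower
;   horizontal pass:
;     out[2*i]   = (3 * int[i] + int[i-1] + 8) >> 4
;     out[2*i+1] = (3 * int[i] + int[i+1] + 7) >> 4
;
; PW_THREE, PW_SEVEN, and PW_EIGHT supply the weights and rounding terms;
; edge samples are replicated at the row boundaries.
;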

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define original_ebp        ebp + 0
%define wk(i)               ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
                                        ; ymmword wk[WK_NUM]
%define WK_NUM              4
%define gotptr              wk(0) - SIZEOF_POINTER  ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)

EXTN(jsimd_h2v2_fancy_upsample_avx2):
    push        ebp
    mov         eax, esp                ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [esp], eax
    mov         ebp, esp                ; ebp = aligned ebp
    lea         esp, [wk(0)]
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    mov         edx, eax                ; edx = original ebp
    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(edx)]  ; input_data
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        eax                     ; colctr
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1

    test        eax, SIZEOF_YMMWORD-1
    jz          short .skip
    push        edx
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
    pop         edx
.skip:
    ; -- process the first column block

    vmovdqu     ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
    vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
    vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)

    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm3, ymm2, ymm3        ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]

    vpcmpeqb    xmm7, xmm7, xmm7
    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6

    vpand       ymm1, ymm1, ymm7        ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vpand       ymm2, ymm2, ymm7        ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)

    vmovdqa     YMMWORD [wk(0)], ymm1
    vmovdqa     YMMWORD [wk(1)], ymm2

    poppic      ebx

    add         eax, byte SIZEOF_YMMWORD-1
    and         eax, byte -SIZEOF_YMMWORD
    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; -- process the last column block

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    vpcmpeqb    xmm1, xmm1, xmm1
    vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
    vperm2i128  ymm1, ymm1, ymm1, 1     ; (---- ---- ... ---- ---- ffff) MSB is ffff

    vpand       ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
    vpand       ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]

    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)

    jmp         near .upsample
    alignx      16, 7

.columnloop:
    ; -- process the next column block

    vmovdqu     ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
    vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
    vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)

    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm7, ymm2, ymm3        ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6

    vperm2i128  ymm1, ymm3, ymm1, 0x20
    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
    vperm2i128  ymm2, ymm3, ymm2, 0x20
    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)

    vmovdqa     YMMWORD [wk(2)], ymm1
    vmovdqa     YMMWORD [wk(3)], ymm2

.upsample:
    ; -- process the upper row

    vmovdqu     ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vmovdqu     ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)

    vperm2i128  ymm0, ymm1, ymm7, 0x03
    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
    vperm2i128  ymm4, ymm1, ymm3, 0x20
    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm5, ymm1, ymm7, 0x03
    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm6, ymm1, ymm3, 0x20
    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm2, ymm1, ymm3, 0x03
    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm4, ymm1, ymm3, 0x03
    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm1, ymm7, 0x20
    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(0)], ymm4

    vpmullw     ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_SEVEN)]

    vpaddw      ymm1, ymm1, ymm7
    vpaddw      ymm5, ymm5, ymm3
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm3
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm0, ymm0, BYTE_BIT
    vpsllw      ymm2, ymm2, BYTE_BIT
    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5

    ; -- process the lower row

    vmovdqu     ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
    vmovdqu     ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)

    vperm2i128  ymm7, ymm1, ymm6, 0x03
    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
    vperm2i128  ymm3, ymm1, ymm4, 0x20
    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm0, ymm1, ymm6, 0x03
    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm2, ymm1, ymm4, 0x20
    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm5, ymm1, ymm4, 0x03
    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm3, ymm1, ymm4, 0x03
    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm1, ymm6, 0x20
    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(1)], ymm3

    vpmullw     ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]

    vpaddw      ymm1, ymm1, ymm6
    vpaddw      ymm0, ymm0, ymm4
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm7, ymm7, ymm6
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpsllw      ymm5, ymm5, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0

    poppic      ebx

    sub         eax, byte SIZEOF_YMMWORD
    add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
    add         ebx, byte 1*SIZEOF_YMMWORD  ; inptr0
    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
    add         edx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .columnloop
    test        eax, eax
    jnz         near .columnloop_last

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2             ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
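; In scalar terms, this routine simply replicates each input sample
; (illustrative sketch; the array names are not taken from this source):
;
;   out[2*i] = out[2*i+1] = in[i]
;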

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

EXTN(jsimd_h2v1_upsample_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          short .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr
    mov         eax, edx                ; colctr
    alignx      16, 7
.columnloop:

    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .above_16

    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    jmp         short .nextrow

.above_16:
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_YMMWORD  ; inptr
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                     ; rowctr
    jg          short .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
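; In scalar terms, each input sample is replicated into a 2x2 block spanning
; both output rows (illustrative sketch; the array names are not taken from
; this source):
;
;   out0[2*i] = out0[2*i+1] = out1[2*i] = out1[2*i+1] = in[i]
;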

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

EXTN(jsimd_h2v2_upsample_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         eax, edx                ; colctr
    alignx      16, 7
.columnloop:

    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .above_16

    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    jmp         near .nextrow

.above_16:
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_YMMWORD  ; inptr
    add         ebx, 2*SIZEOF_YMMWORD   ; outptr0
    add         edi, 2*SIZEOF_YMMWORD   ; outptr1
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2             ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32