;
; jdsample.asm - upsampling (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_avx2)

EXTN(jconst_fancy_upsample_avx2):

; 16-bit constants, each repeated 16 times so that one copy fills a whole
; 256-bit register.  PW_THREE is the weight applied to the nearer sample;
; PW_ONE/PW_TWO are the rounding terms used by the h2v1 fancy routine and
; PW_SEVEN/PW_EIGHT those used by the h2v2 fancy routine.
PW_ONE   times 16 dw 1
PW_TWO   times 16 dw 2
PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; For every input sample s[i] two output samples are produced:
;   out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; At the left edge s[-1] is taken to be s[0]; at the right edge the sample
; following the last one is taken to be the last one.
;
; Rows are processed in blocks of SIZEOF_YMMWORD input samples; each block
; is loaded whole and yields two whole YMMWORDs of output, so the column
; count is rounded up to a full block.  When downsampled_width is not a
; multiple of SIZEOF_YMMWORD, one byte (a copy of the last sample) is written
; into the input row just past its end.
;
; Register usage inside the row loop:
;   eax  = colctr (remaining input columns)
;   ecx  = rowctr
;   esi  = inptr, edi = outptr
;   ebx  = GOT address, used by the GOTOFF() constant references
;   ymm0 = all zeros
;   ymm7 = sample to the left of the current block, in the lowest byte
;   ymm6 = sample to the right of the current block, in the highest byte
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)

EXTN(jsimd_h2v1_fancy_upsample_avx2):
    push        ebp
    mov         ebp, esp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    ; Nothing to do for a zero column count or a zero row count.
    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    ; Save the per-row state; the column loop consumes eax/esi/edi.
    push        eax                     ; colctr
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    ; If the width is not a whole number of blocks, replicate the last sample
    ; one position to the right so that it acts as its own right neighbor.
    test        eax, SIZEOF_YMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
    ; Left edge: seed the "sample -1" slot (lowest byte of ymm7) with
    ; sample 0 of the row.
    vpcmpeqb    xmm7, xmm7, xmm7
    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
    vpand       ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; Round colctr up to a whole number of blocks, then pick the entry point:
    ; more than one block -> .columnloop, exactly one -> .columnloop_last.
    add         eax, byte SIZEOF_YMMWORD-1
    and         eax, byte -SIZEOF_YMMWORD
    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; Last block of the row: no block follows, so the "sample 32" slot
    ; (highest byte of ymm6) is filled from byte 31 of this block.
    vpcmpeqb    xmm6, xmm6, xmm6
    vpslldq     xmm6, xmm6, (SIZEOF_XMMWORD-1)
    vperm2i128  ymm6, ymm6, ymm6, 1              ; (---- ---- ... ---- ---- ff) MSB is ff
    vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
    jmp         short .upsample
    alignx      16, 7

.columnloop:
    ; Another block follows: move its byte 0 into the highest byte of ymm6
    ; (low half of the next block goes to the upper lane, then shift by 15).
    vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
    vperm2i128  ymm6, ymm0, ymm6, 0x20
    vpslldq     ymm6, ymm6, 15

.upsample:
    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)

    ; Build the block shifted right by one sample (ymm2) and left by one
    ; sample (ymm3); the vacated edge bytes are zero until OR-ed below.
    vperm2i128  ymm2, ymm0, ymm1, 0x20
    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
    vperm2i128  ymm4, ymm0, ymm1, 0x03
    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)

    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)

    ; Sample 31 of this block becomes "sample -1" of the next block.
    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)

    ; Zero-extend bytes to words.  The unpack instructions work within each
    ; 128-bit lane, so vperm2i128 is used afterwards to restore sample order.
    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    vpunpcklbw  ymm0, ymm3, ymm0                ; ymm0=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
    vperm2i128  ymm3, ymm0, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vperm2i128  ymm6, ymm0, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; ymm0 was used as a temporary above; restore the zero register.
    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)

    ; ymm1/ymm4 = 3 * s[i];  ymm2/ymm5 = s[i-1] + 1;  ymm3/ymm6 = s[i+1] + 2
    vpmullw     ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
    vpaddw      ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
    vpaddw      ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]

    vpaddw      ymm2, ymm2, ymm1
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm3, ymm3, ymm1
    vpaddw      ymm6, ymm6, ymm4
    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave: the even outputs stay in the low byte of each word, the odd
    ; outputs are shifted into the high byte.
    vpsllw      ymm3, ymm3, BYTE_BIT
    vpsllw      ymm6, ymm6, BYTE_BIT
    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5

    ; Advance one input block / two output blocks and pick the next step.
    sub         eax, byte SIZEOF_YMMWORD
    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .columnloop            ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last       ; exactly one block left

    pop         esi
    pop         edi
    pop         eax

    ; One input row produces one output row.
    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Each block of SIZEOF_YMMWORD columns is processed in two passes:
;   1. vertical:   Int0[i] = 3 * row[0][i] + row[-1][i]   (upper output row)
;                  Int1[i] = 3 * row[0][i] + row[+1][i]   (lower output row)
;      The 16-bit intermediates are parked in the two output rows themselves
;      and are overwritten by the final samples in pass 2.
;   2. horizontal: out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;                  out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; At the left edge Int[-1] is taken to be Int[0]; at the right edge the value
; following the last one is taken to be the last one.
;
; When downsampled_width is not a multiple of SIZEOF_YMMWORD, one byte (a
; copy of the last sample) is written just past the end of each of the three
; input rows.  input_data[-1] and input_data[+1] are dereferenced for every
; row, so the caller must supply valid row pointers there.
;
; Register usage inside the row loop:
;   eax = colctr (remaining input columns)
;   ecx = inptr1(above), ebx = inptr0, esi = inptr1(below)
;   edx = outptr0, edi = outptr1
;   rowctr and the row-array pointers live on the stack.
; ebx doubles as the GOT base: it is saved with pushpic, reloaded from gotptr
; with movpic and restored with poppic around every group of GOTOFF()
; references.  These are macros from jsimdext.inc (not shown here);
; presumably they expand to nothing in non-PIC builds -- verify there.
;
; Work area (32-byte aligned, below the aligned ebp):
;   wk(0) / wk(1) = Int0 / Int1 value to the left of the current block
;   wk(2) / wk(3) = Int0 / Int1 value to the right of the current block
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
                                        ; ymmword wk[WK_NUM]
%define WK_NUM        4
%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)

EXTN(jsimd_h2v2_fancy_upsample_avx2):
    ; Build a 32-byte-aligned frame.  eax keeps the value ebp would have in a
    ; conventional frame (it is used below to reach the arguments) and is
    ; also stored at [aligned ebp] so that the epilogue can restore esp.
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [wk(0)]
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    ; Nothing to do for a zero column count or a zero row count.
    mov         edx, eax                ; edx = original ebp
    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(edx)]  ; input_data
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    ; Save the per-row state; every general register is reused below.
    push        eax                     ; colctr
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; If the width is not a whole number of blocks, replicate the last sample
    ; of each input row one position to the right.  edx is borrowed as the
    ; byte temporary, hence the push/pop around it.
    test        eax, SIZEOF_YMMWORD-1
    jz          short .skip
    push        edx
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         edx
.skip:
    ; -- process the first column block

    vmovdqu     ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
    vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
    vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)

    ; Zero-extend bytes to words.  The unpack instructions work within each
    ; 128-bit lane, so vperm2i128 is used afterwards to restore sample order.
    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm3, ymm2, ymm3        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]

    vpcmpeqb    xmm7, xmm7, xmm7
    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6

    ; Left edge: the "column -1" value is column 0 itself.
    vpand       ymm1, ymm1, ymm7        ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vpand       ymm2, ymm2, ymm7        ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)

    vmovdqa     YMMWORD [wk(0)], ymm1
    vmovdqa     YMMWORD [wk(1)], ymm2

    poppic      ebx

    ; Round colctr up to a whole number of blocks, then pick the entry point:
    ; more than one block -> .columnloop, exactly one -> .columnloop_last.
    add         eax, byte SIZEOF_YMMWORD-1
    and         eax, byte -SIZEOF_YMMWORD
    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; -- process the last column block

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    ; Right edge: the "column 32" value is column 31 of this block, taken
    ; from the intermediates already parked in the output rows.
    vpcmpeqb    xmm1, xmm1, xmm1
    vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
    vperm2i128  ymm1, ymm1, ymm1, 1             ; (---- ---- ... ---- ---- ffff) MSB is ffff

    vpand       ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
    vpand       ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]

    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)

    jmp         near .upsample
    alignx      16, 7

.columnloop:
    ; -- process the next column block
    ; The vertical pass runs one block ahead of the horizontal pass so that
    ; the right-hand neighbor of the current block is available.

    vmovdqu     ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
    vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
    vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)

    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    ; ymm7 is the temporary here (not ymm3 as in the first block) because the
    ; zero in ymm3 is still needed further down.
    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm7, ymm2, ymm3        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6

    ; Column 0 of the next block is the right-hand neighbor of column 31 of
    ; the current block: move it into the highest word.
    vperm2i128  ymm1, ymm3, ymm1, 0x20
    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
    vperm2i128  ymm2, ymm3, ymm2, 0x20
    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)

    vmovdqa     YMMWORD [wk(2)], ymm1
    vmovdqa     YMMWORD [wk(3)], ymm2

.upsample:
    ; -- process the upper row
    ; ebx still holds the GOT address here (both paths above did pushpic).

    vmovdqu     ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)

    vperm2i128  ymm0, ymm1, ymm7, 0x03
    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm4, ymm1, ymm3, 0x20
    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm5, ymm1, ymm7, 0x03
    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm6, ymm1, ymm3, 0x20
    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm2, ymm1, ymm3, 0x03
    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm4, ymm1, ymm3, 0x03
    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm1, ymm7, 0x20
    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; Column 31 of this block becomes "column -1" of the next block.
    vmovdqa     YMMWORD [wk(0)], ymm4

    ; ymm7/ymm3 = 3 * Int0[i]; left neighbors get +8, right neighbors get +7.
    vpmullw     ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_SEVEN)]

    vpaddw      ymm1, ymm1, ymm7
    vpaddw      ymm5, ymm5, ymm3
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm3
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave: even outputs in the low byte of each word, odd outputs in
    ; the high byte.
    vpsllw      ymm0, ymm0, BYTE_BIT
    vpsllw      ymm2, ymm2, BYTE_BIT
    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)

    ; The final samples replace the intermediates parked in outptr0.
    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5

    ; -- process the lower row

    vmovdqu     ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)

    vperm2i128  ymm7, ymm1, ymm6, 0x03
    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm3, ymm1, ymm4, 0x20
    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm0, ymm1, ymm6, 0x03
    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm2, ymm1, ymm4, 0x20
    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm5, ymm1, ymm4, 0x03
    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm3, ymm1, ymm4, 0x03
    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm1, ymm6, 0x20
    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; Column 31 of this block becomes "column -1" of the next block.
    vmovdqa     YMMWORD [wk(1)], ymm3

    vpmullw     ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
    vpaddw      ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]

    vpaddw      ymm1, ymm1, ymm6
    vpaddw      ymm0, ymm0, ymm4
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm7, ymm7, ymm6
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpsllw      ymm5, ymm5, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)

    ; The final samples replace the intermediates parked in outptr1.
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0

    poppic      ebx

    ; Advance one input block / two output blocks and pick the next step.
    sub         eax, byte SIZEOF_YMMWORD
    add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
    add         ebx, byte 1*SIZEOF_YMMWORD  ; inptr0
    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
    add         edx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
    cmp         eax, byte SIZEOF_YMMWORD
    ja          near .columnloop            ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last       ; exactly one block left

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    ; One input row produces two output rows.
    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in a row to the output.
;
; output_width is rounded up to a multiple of SIZEOF_YMMWORD and output is
; stored in whole 16- or 32-byte groups, so up to SIZEOF_YMMWORD-1 bytes past
; output_width may be written (and the matching input bytes read).
; NOTE(review): this presumably relies on the caller padding its row
; buffers -- confirm against the allocator.
;
; Register usage inside the row loop:
;   edx = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   ecx = rowctr
;   eax = colctr (remaining output columns)
;   esi = inptr, edi = outptr
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

EXTN(jsimd_h2v1_upsample_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Round the width up to whole YMMWORDs; a zero result means nothing to do.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          short .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr
    mov         eax, edx                ; colctr
    alignx      16, 7
.columnloop:

    ; More than one YMMWORD of output left -> take the 256-bit path.
    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .above_16

    ; Final YMMWORD of output: 16 input samples -> 32 output samples.
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    jmp         short .nextrow

.above_16:
    ; 32 input samples -> 64 output samples.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; The unpack instructions work within each 128-bit lane, so first reorder
    ; the quadwords to (0 2 1 3); the low/high unpacks then yield outputs
    ; 0-31 and 32-63 in order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_YMMWORD    ; inptr
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    ; One input row produces one output row.
    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          short .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in a row, and the resulting row is
; stored to two consecutive output rows.
;
; output_width is rounded up to a multiple of SIZEOF_YMMWORD and output is
; stored in whole 16- or 32-byte groups, so up to SIZEOF_YMMWORD-1 bytes past
; output_width may be written to each output row.
; NOTE(review): this presumably relies on the caller padding its row
; buffers -- confirm against the allocator.
;
; Register usage inside the row loop:
;   edx = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   ecx = rowctr (counts output rows, two per iteration)
;   eax = colctr (remaining output columns)
;   esi = inptr, ebx = outptr0, edi = outptr1
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

EXTN(jsimd_h2v2_upsample_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; Round the width up to whole YMMWORDs; a zero result means nothing to do.
    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (SIZEOF_YMMWORD-1)
    and         edx, -SIZEOF_YMMWORD
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]                    ; inptr
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         eax, edx                               ; colctr
    alignx      16, 7
.columnloop:

    ; More than one YMMWORD of output left -> take the 256-bit path.
    cmp         eax, byte SIZEOF_YMMWORD
    ja          short .above_16

    ; Final YMMWORD of output: 16 input samples -> 32 output samples,
    ; stored to both output rows.
    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    jmp         near .nextrow

.above_16:
    ; 32 input samples -> 64 output samples, stored to both output rows.
    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]

    ; Reorder the quadwords to (0 2 1 3) so that the in-lane unpacks yield
    ; outputs 0-31 and 32-63 in order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1

    sub         eax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_YMMWORD    ; inptr
    add         ebx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    ; One input row produces two output rows.
    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32