;
; jdsample.asm - upsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_avx2)

; Word constants used by the fancy (triangle filter) upsamplers in this file.
; Each entry is 16 words, i.e. one full YMM register of 16-bit lanes, and is
; addressed RIP-relative ([rel ...]) from the code below.
;   PW_THREE        weight applied to the nearer sample
;   PW_ONE/PW_TWO   rounding terms added before the ">> 2" of the h2v1 case
;   PW_EIGHT/SEVEN  rounding terms added before the ">> 4" of the h2v2 case

EXTN(jconst_fancy_upsample_avx2):

PW_ONE   times 16 dw 1
PW_TWO   times 16 dw 2
PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; For each of max_v_samp_factor rows, every input sample s[i] produces two
; output samples:
;   out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; The left neighbor of the first sample is the first sample itself, and the
; right neighbor of the last sample is the last sample itself.
;
; Rows are handled in blocks of SIZEOF_YMMWORD input samples.  When
; downsampled_width is not a multiple of SIZEOF_YMMWORD, one "dummy" sample
; is written just past the end of the input row, the input is read up to the
; end of the last block, and a whole number of output blocks is stored.
; NOTE(review): this presumes the row buffers are padded by the caller --
; confirm against the buffer allocation code.
;
; Returns immediately if downsampled_width or max_v_samp_factor is zero.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the function:
;   rax = colctr, rcx = rowctr, rsi = input row (array) ptr,
;   rdi = output row (array) ptr, dl = scratch for the dummy sample
;   ymm0 = all zeros, ymm10 = mask keeping byte 0, ymm9 = mask keeping byte 31
;   ymm7 = left neighbor carried into the next block (in byte 0)
;
; NOTE(review): xmm9 and xmm10 are written below but only "push_xmm 3" is
; issued.  xmm6-xmm15 are callee-saved in the Microsoft x64 ABI -- confirm
; in jsimdext.inc which registers push_xmm 3 actually preserves.

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)

EXTN(jsimd_h2v1_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp                ; rax = frame address (presumably for
                                        ; collect_args; see jsimdext.inc)
    mov         rbp, rsp
    push_xmm    3
    collect_args 4

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data

    ; Loop-invariant constants: zero register and the two edge masks.
    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff

    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff

.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    ; If the width is not a whole number of blocks, duplicate the last
    ; sample one position to the right so it serves as its own neighbor.
    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    ; Left neighbor of sample 0 is sample 0 itself.
    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; Round colctr up to a multiple of SIZEOF_YMMWORD.
    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; Last block: right neighbor of byte 31 is byte 31 itself.
    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    jmp         short .upsample

.columnloop:
    ; Not the last block: right neighbor of byte 31 is byte 0 of the next
    ; block, moved into byte position 31.
    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vperm2i128  ymm6, ymm0, ymm6, 0x20
    vpslldq     ymm6, ymm6, 15

.upsample:
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)

    vperm2i128  ymm2, ymm0, ymm1, 0x20
    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
    vperm2i128  ymm4, ymm0, ymm1, 0x03
    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)

    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)

    ; Carry byte 31 of this block into byte 0 as the next block's left neighbor.
    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)

    ; Widen bytes to words.  The unpacks work per 128-bit lane, so a
    ; vperm2i128 pair puts the words back into sequential order.
    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; 3 * current sample, and the neighbors plus their rounding terms.
    vpmullw     ymm1, ymm1, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm2, ymm2, [rel PW_ONE]
    vpaddw      ymm5, ymm5, [rel PW_ONE]
    vpaddw      ymm3, ymm3, [rel PW_TWO]
    vpaddw      ymm6, ymm6, [rel PW_TWO]

    vpaddw      ymm2, ymm2, ymm1
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm3, ymm3, ymm1
    vpaddw      ymm6, ymm6, ymm4
    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave: odd outputs go to the high byte of each word.
    vpsllw      ymm3, ymm3, BYTE_BIT
    vpsllw      ymm6, ymm6, BYTE_BIT
    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5

    sub         rax, byte SIZEOF_YMMWORD
    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop            ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last       ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW   ; input_data
    add         rdi, byte SIZEOF_JSAMPROW   ; output_data
    dec         rcx                         ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 4
    pop_xmm     3
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Each pass of the row loop consumes one input row (plus the rows directly
; above and below it) and produces two output rows; rowctr drops by 2.
;
; Vertical step (16-bit intermediates, one per input column c):
;   Int0[c] = 3 * row[0][c] + row[-1][c]      (upper output row)
;   Int1[c] = 3 * row[0][c] + row[+1][c]      (lower output row)
; Horizontal step, applied to each Int row:
;   out[2*c]   = (3 * Int[c] + Int[c-1] + 8) >> 4
;   out[2*c+1] = (3 * Int[c] + Int[c+1] + 7) >> 4
; At the left/right edge the missing neighbor is the edge value itself.
;
; The 16-bit intermediates are parked in the output rows themselves (they
; take exactly the space of the final output for the same block) and are
; overwritten by the final bytes.  Like the h2v1 case, a dummy sample is
; written just past the end of each of the three input rows when
; downsampled_width is not a multiple of SIZEOF_YMMWORD.
; NOTE(review): this presumes the row buffers are padded by the caller --
; confirm against the buffer allocation code.
;
; Returns immediately if downsampled_width or max_v_samp_factor is zero.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the row loop:
;   rax = colctr, rcx = inptr1(above), rbx = inptr0, rsi = inptr1(below),
;   rdx = outptr0, rdi = outptr1
;   ymm8 = all zeros, ymm10 = mask keeping word 0, ymm9 = mask keeping word 15
; Scratch slots (32-byte aligned, addressed off the aligned rbp):
;   wk(0)/wk(1) = left-neighbor word for the upper/lower row (in word 0)
;   wk(2)/wk(3) = right-neighbor word for the upper/lower row (in word 15)
;
; NOTE(review): xmm9 and xmm10 are written below but only "push_xmm 3" is
; issued.  xmm6-xmm15 are callee-saved in the Microsoft x64 ABI -- confirm
; in jsimdext.inc which registers push_xmm 3 actually preserves.

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  4

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)

EXTN(jsimd_h2v2_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp right after pushing rbp
                                             ; (address of the saved rbp)
    sub         rsp, byte 4                  ; step below the saved rbp before
    and         rsp, byte (-SIZEOF_YMMWORD)  ; aligning down to 256 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
    push_xmm    3
    collect_args 4
    push        rbx

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff

    ; If the width is not a whole number of blocks, duplicate the last
    ; sample of each of the three input rows one position to the right.
    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    push        rdx                     ; dl is used as scratch below
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]

    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6

    ; Left neighbor of column 0 is column 0 itself.
    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)

    vmovdqa     YMMWORD [wk(0)], ymm1
    vmovdqa     YMMWORD [wk(1)], ymm2

    ; Round colctr up to a multiple of SIZEOF_YMMWORD.
    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block
    ; Right neighbor of column 31 is column 31 itself.

    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]

    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)

    jmp         near .upsample

.columnloop:
    ; -- process the next column block
    ; Its intermediates are computed one block ahead so that its column 0
    ; can serve as the right neighbor of the current block's column 31.

    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]

    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6

    ; Move the next block's word 0 into word position 15.
    vperm2i128  ymm1, ymm8, ymm1, 0x20
    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
    vperm2i128  ymm2, ymm8, ymm2, 0x20
    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)

    vmovdqa     YMMWORD [wk(2)], ymm1
    vmovdqa     YMMWORD [wk(3)], ymm2

.upsample:
    ; -- process the upper row

    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vperm2i128  ymm0, ymm8, ymm7, 0x03
    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm4, ymm8, ymm3, 0x20
    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm5, ymm8, ymm7, 0x03
    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm6, ymm8, ymm3, 0x20
    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm2, ymm8, ymm3, 0x03
    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm4, ymm8, ymm3, 0x03
    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm8, ymm7, 0x20
    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(0)], ymm4   ; left neighbor for the next block

    vpmullw     ymm7, ymm7, [rel PW_THREE]
    vpmullw     ymm3, ymm3, [rel PW_THREE]
    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
    vpaddw      ymm2, ymm2, [rel PW_SEVEN]

    vpaddw      ymm1, ymm1, ymm7
    vpaddw      ymm5, ymm5, ymm3
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm3
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave: odd outputs go to the high byte of each word.
    vpsllw      ymm0, ymm0, BYTE_BIT
    vpsllw      ymm2, ymm2, BYTE_BIT
    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5

    ; -- process the lower row

    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vperm2i128  ymm7, ymm8, ymm6, 0x03
    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm3, ymm8, ymm4, 0x20
    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm0, ymm8, ymm6, 0x03
    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm2, ymm8, ymm4, 0x20
    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm5, ymm8, ymm4, 0x03
    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm3, ymm8, ymm4, 0x03
    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm8, ymm6, 0x20
    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(1)], ymm3   ; left neighbor for the next block

    vpmullw     ymm6, ymm6, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
    vpaddw      ymm5, ymm5, [rel PW_SEVEN]

    vpaddw      ymm1, ymm1, ymm6
    vpaddw      ymm0, ymm0, ymm4
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm7, ymm7, ymm6
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpsllw      ymm5, ymm5, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0

    sub         rax, byte SIZEOF_YMMWORD
    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop            ; more than one block left
    test        rax, rax
    jnz         near .columnloop_last       ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 4
    pop_xmm     3
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- unaligned rsp saved in the
                                        ; prologue (address of the saved rbp)
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in a row to the output (each byte is
; interleaved with itself).  output_width is rounded up to a multiple of
; SIZEOF_YMMWORD, so whole 32-byte output blocks are stored per row.
; NOTE(review): this presumes the row buffers are padded by the caller --
; confirm against the buffer allocation code.
;
; Returns immediately if the rounded width or max_v_samp_factor is zero.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the function:
;   rdx = output width rounded up to a multiple of SIZEOF_YMMWORD
;   rax = colctr (output samples left in the row), rcx = rowctr
;   rsi = input row (array) ptr, rdi = output row (array) ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

EXTN(jsimd_h2v1_upsample_avx2):
    push        rbp
    mov         rax, rsp                ; rax = frame address (presumably for
                                        ; collect_args; see jsimdext.inc)
    mov         rbp, rsp
    collect_args 4

    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr
    mov         rax, rdx                ; colctr
.columnloop:

    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .above_16

    ; Exactly SIZEOF_YMMWORD outputs left: 16 input bytes -> 32 output bytes.
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         short .nextrow

.above_16:
    ; More than 16 inputs left: 32 input bytes -> 64 output bytes.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; The unpacks work per 128-bit lane; swapping the two middle qwords
    ; first makes the interleaved result come out in sequential order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW   ; input_data
    add         rdi, byte SIZEOF_JSAMPROW   ; output_data
    dec         rcx                         ; rowctr
    jg          short .rowloop

.return:
    vzeroupper
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in a row, and the resulting row is
; stored to two consecutive output rows.  Each pass of the row loop consumes
; one input row and drops rowctr by 2.  output_width is rounded up to a
; multiple of SIZEOF_YMMWORD, so whole 32-byte output blocks are stored.
; NOTE(review): this presumes the row buffers are padded by the caller --
; confirm against the buffer allocation code.
;
; Returns immediately if the rounded width or max_v_samp_factor is zero.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles inside the function:
;   rdx = output width rounded up to a multiple of SIZEOF_YMMWORD
;   rax = colctr (output samples left in the row), rcx = rowctr
;   rsi = input row (array) ptr, rbx = outptr0, rdi = outptr1

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

EXTN(jsimd_h2v2_upsample_avx2):
    push        rbp
    mov         rax, rsp                ; rax = frame address (presumably for
                                        ; collect_args; see jsimdext.inc)
    mov         rbp, rsp
    collect_args 4
    push        rbx

    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]                    ; inptr
    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                               ; colctr
.columnloop:

    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .above_16

    ; Exactly SIZEOF_YMMWORD outputs left: 16 input bytes -> 32 output bytes,
    ; stored to both output rows.
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         near .nextrow

.above_16:
    ; More than 16 inputs left: 32 input bytes -> 64 output bytes per row.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; The unpacks work per 128-bit lane; swapping the two middle qwords
    ; first makes the interleaved result come out in sequential order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    add         rbx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32