;
; jdsample.asm - upsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_avx2)

; Word constants for the "fancy" (triangle filter) upsamplers below.  Each
; entry is 16 words, so one entry fills a whole YMM register operand.
;   PW_THREE        weight applied to the nearer sample (both fancy routines)
;   PW_ONE, PW_TWO  rounding terms added before the ">> 2" in the h2v1 routine
;   PW_SEVEN,
;   PW_EIGHT        rounding terms added before the ">> 4" in the h2v2 routine
EXTN(jconst_fancy_upsample_avx2):

PW_ONE   times 16 dw 1
PW_TWO   times 16 dw 2
PW_THREE times 16 dw 3
PW_SEVEN times 16 dw 7
PW_EIGHT times 16 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; For every input sample s[i] of a row, two output samples are produced:
;   out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; At the left edge s[-1] is replaced by s[0]; at the right edge the sample
; following the last one is replaced by a copy of the last one.
;
; A row is processed in blocks of SIZEOF_YMMWORD input samples, each block
; producing 2*SIZEOF_YMMWORD output bytes.  When downsampled_width is not a
; multiple of SIZEOF_YMMWORD, one "dummy" byte is WRITTEN to the input row at
; index downsampled_width, and whole blocks are still read and written.
; NOTE(review): this presumes the caller pads input and output rows
; accordingly -- confirm against the caller.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles set up by the code below:
;   rax   = colctr (columns left in the current row, rounded up per row)
;   rcx   = rowctr
;   rsi   = input_data (row pointer array) / inptr inside a row
;   rdi   = output_data (row pointer array) / outptr inside a row
;   dl    = scratch for the dummy-sample copy
;   ymm0  = all zeros
;   ymm10 = mask keeping only byte 0, ymm9 = mask keeping only byte 31
;   ymm7  = left neighbour of the block's first sample (carried across blocks)
;   ymm6  = right neighbour of the block's last sample

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)

EXTN(jsimd_h2v1_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp                ; presumably consumed by collect_args (macro not shown here)
    mov         rbp, rsp
    push_xmm    3
    collect_args 4

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data

    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff

    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff

.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    ; If the width is not a whole number of blocks, duplicate the last real
    ; sample one position further so that it serves as its own right neighbour.
    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    ; Left edge: sample 0 acts as its own left neighbour.
    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; Round colctr up to a multiple of SIZEOF_YMMWORD.
    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; Last block of the row: byte 31 of this block is its own right neighbour.
    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    jmp         short .upsample

.columnloop:
    ; Not the last block: right neighbour is byte 0 of the next block,
    ; moved into byte position 31.
    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vperm2i128  ymm6, ymm0, ymm6, 0x20
    vpslldq     ymm6, ymm6, 15

.upsample:
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)

    vperm2i128  ymm2, ymm0, ymm1, 0x20
    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
    vperm2i128  ymm4, ymm0, ymm1, 0x03
    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)

    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)

    ; Carry sample 31 forward as the next block's left neighbour.
    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)

    ; Widen bytes to words.  The unpack instructions work per 128-bit lane,
    ; so each pair is followed by vperm2i128 to restore linear sample order.
    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    ; ymm1/ymm4 = 3 * current; ymm2/ymm5 = left + 1; ymm3/ymm6 = right + 2
    vpmullw     ymm1, ymm1, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm2, ymm2, [rel PW_ONE]
    vpaddw      ymm5, ymm5, [rel PW_ONE]
    vpaddw      ymm3, ymm3, [rel PW_TWO]
    vpaddw      ymm6, ymm6, [rel PW_TWO]

    vpaddw      ymm2, ymm2, ymm1
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm3, ymm3, ymm1
    vpaddw      ymm6, ymm6, ymm4
    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave: even results stay in the low byte of each word, odd results
    ; are shifted into the high byte.
    vpsllw      ymm3, ymm3, BYTE_BIT
    vpsllw      ymm6, ymm6, BYTE_BIT
    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5

    sub         rax, byte SIZEOF_YMMWORD
    add         rsi, byte 1*SIZEOF_YMMWORD      ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD      ; outptr
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop                ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last           ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW       ; input_data
    add         rdi, byte SIZEOF_JSAMPROW       ; output_data
    dec         rcx                             ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 4
    pop_xmm     3
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Each pass of the row loop reads one input row together with the rows
; directly above and below it (input_data[-1], [0], [+1]) and writes two
; output rows; rowctr is decreased by 2 per pass.
;
; Vertical step (16-bit intermediates, parked temporarily in the output rows):
;   Int0[i] = 3 * row[0][i] + row[-1][i]      (upper output row)
;   Int1[i] = 3 * row[0][i] + row[+1][i]      (lower output row)
; Horizontal step, applied to Int0 and Int1 alike:
;   out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;   out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; At the left edge Int[-1] is replaced by Int[0]; at the right edge the value
; following the last one is replaced by a copy of the last one.
;
; When downsampled_width is not a multiple of SIZEOF_YMMWORD, one "dummy" byte
; is WRITTEN to each of the three input rows at index downsampled_width, and
; whole blocks are still read and written.
; NOTE(review): this presumes the caller pads the rows accordingly -- confirm
; against the caller.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles set up by the code below:
;   rax   = colctr, rcx = rowctr (rcx is reused as a row pointer inside a row)
;   rcx/rbx/rsi = inptr1(above) / inptr0 / inptr1(below) inside a row
;   rdx/rdi     = outptr0 / outptr1 inside a row
;   ymm8  = all zeros
;   ymm10 = mask keeping only word 0, ymm9 = mask keeping only word 15
;   wk(0), wk(1) = left-neighbour word for the upper / lower row
;   wk(2), wk(3) = right-neighbour word for the upper / lower row

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  4

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)

EXTN(jsimd_h2v2_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax                   ; remember the unaligned rsp for the epilogue
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] below rbp
    push_xmm    3
    collect_args 4
    push        rbx

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data

    ; Row-invariant constants.  Nothing inside the row loop writes
    ; ymm8/ymm9/ymm10, so they are built once here.
    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff

.rowloop:
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; If the width is not a whole number of blocks, duplicate the last real
    ; sample of all three input rows one position further.  rdx (outptr0) is
    ; saved because dl is used as the scratch byte.
    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    push        rdx
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]

    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6

    ; Left edge: word 0 acts as its own left neighbour.
    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)

    vmovdqa     YMMWORD [wk(0)], ymm1
    vmovdqa     YMMWORD [wk(1)], ymm2

    ; Round colctr up to a multiple of SIZEOF_YMMWORD.
    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block

    ; Right edge: word 31 of this block acts as its own right neighbour.
    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]

    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)

    jmp         near .upsample

.columnloop:
    ; -- process the next column block

    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]

    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6

    ; Right neighbour of the current block = word 0 of the next block,
    ; moved into word position 15.
    vperm2i128  ymm1, ymm8, ymm1, 0x20
    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
    vperm2i128  ymm2, ymm8, ymm2, 0x20
    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)

    vmovdqa     YMMWORD [wk(2)], ymm1
    vmovdqa     YMMWORD [wk(3)], ymm2

.upsample:
    ; -- process the upper row

    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vperm2i128  ymm0, ymm8, ymm7, 0x03
    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm4, ymm8, ymm3, 0x20
    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm5, ymm8, ymm7, 0x03
    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm6, ymm8, ymm3, 0x20
    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm2, ymm8, ymm3, 0x03
    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm4, ymm8, ymm3, 0x03
    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm8, ymm7, 0x20
    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(0)], ymm4   ; word 31 becomes the next block's left neighbour

    vpmullw     ymm7, ymm7, [rel PW_THREE]
    vpmullw     ymm3, ymm3, [rel PW_THREE]
    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
    vpaddw      ymm2, ymm2, [rel PW_SEVEN]

    vpaddw      ymm1, ymm1, ymm7
    vpaddw      ymm5, ymm5, ymm3
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm3
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm0, ymm0, BYTE_BIT
    vpsllw      ymm2, ymm2, BYTE_BIT
    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5

    ; -- process the lower row

    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vperm2i128  ymm7, ymm8, ymm6, 0x03
    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm3, ymm8, ymm4, 0x20
    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm0, ymm8, ymm6, 0x03
    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm2, ymm8, ymm4, 0x20
    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm5, ymm8, ymm4, 0x03
    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm3, ymm8, ymm4, 0x03
    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm8, ymm6, 0x20
    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(1)], ymm3   ; word 31 becomes the next block's left neighbour

    vpmullw     ymm6, ymm6, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
    vpaddw      ymm5, ymm5, [rel PW_SEVEN]

    vpaddw      ymm1, ymm1, ymm6
    vpaddw      ymm0, ymm0, ymm4
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm7, ymm7, ymm6
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpsllw      ymm5, ymm5, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0

    sub         rax, byte SIZEOF_YMMWORD
    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop            ; more than one block left
    test        rax, rax
    jnz         near .columnloop_last       ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 4
    pop_xmm     3
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- value saved in the prologue (rsp right after 'push rbp')
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input byte is written twice in a row to the output
; (out[2*i] = out[2*i+1] = in[i]); one output row is produced per input row.
;
; output_width is rounded up to a multiple of SIZEOF_YMMWORD and the row is
; then handled in whole blocks, so up to SIZEOF_YMMWORD-1 bytes past
; output_width may be written in each output row (and the matching input
; bytes read).
; NOTE(review): this presumes the caller pads the rows accordingly -- confirm
; against the caller.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles set up by the code below:
;   rdx = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   rax = colctr (output columns left in the current row)
;   rcx = rowctr
;   rsi = input_data (row pointer array) / inptr inside a row
;   rdi = output_data (row pointer array) / outptr inside a row

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

EXTN(jsimd_h2v1_upsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4

    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr
    mov         rax, rdx                ; colctr
.columnloop:

    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .above_16

    ; Final half-size block: one XMMWORD of input becomes one YMMWORD of
    ; output.  This path always ends the row.
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         short .nextrow

.above_16:
    ; Full block: one YMMWORD of input becomes two YMMWORDs of output.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; The unpack instructions work per 128-bit lane; reordering the quadwords
    ; to (0 2 1 3) first makes the unpacked output come out in linear order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          short .rowloop

.return:
    vzeroupper
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input byte is written twice in a row, and the resulting row is stored
; to two consecutive output rows.  Each pass of the row loop therefore
; consumes one input row pointer and two output row pointers, and rowctr is
; decreased by 2 per pass.
;
; output_width is rounded up to a multiple of SIZEOF_YMMWORD and the row is
; then handled in whole blocks, so up to SIZEOF_YMMWORD-1 bytes past
; output_width may be written in each output row (and the matching input
; bytes read).
; NOTE(review): this presumes the caller pads the rows accordingly -- confirm
; against the caller.
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
;
; Register roles set up by the code below:
;   rdx = output_width rounded up to a multiple of SIZEOF_YMMWORD
;   rax = colctr (output columns left in the current row)
;   rcx = rowctr
;   rsi = input_data (row pointer array) / inptr inside a row
;   rdi = output_data (row pointer array) / outptr1 inside a row
;   rbx = outptr0 inside a row (saved and restored around the function body)

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

EXTN(jsimd_h2v2_upsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4
    push        rbx

    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD
    jz          near .return

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]                    ; inptr
    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                               ; colctr
.columnloop:

    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .above_16

    ; Final half-size block: one XMMWORD of input becomes one YMMWORD of
    ; output in each of the two output rows.  This path always ends the row.
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         near .nextrow

.above_16:
    ; Full block: one YMMWORD of input becomes two YMMWORDs of output in each
    ; of the two output rows.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; The unpack instructions work per 128-bit lane; reordering the quadwords
    ; to (0 2 1 3) first makes the unpacked output come out in linear order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    add         rbx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32