;
; jdsample.asm - upsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_avx2)

; Word constants used by the fancy (triangle filter) upsamplers in this file.
; Every label is a run of 16 words, i.e. one 256-bit vector of 16-bit lanes.
;   PW_THREE            multiplier for the nearer sample (vpmullw operand)
;   PW_ONE,   PW_TWO    added before the ">> 2" in the h2v1 fancy routine
;   PW_SEVEN, PW_EIGHT  added before the ">> 4" in the h2v2 fancy routine

EXTN(jconst_fancy_upsample_avx2):

PW_ONE    times 16 dw 1
PW_TWO    times 16 dw 2
PW_THREE  times 16 dw 3
PW_SEVEN  times 16 dw 7
PW_EIGHT  times 16 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; For every row, each block of 32 input samples yields 64 output samples:
;
;   out[2*i]   = (3 * in[i] + in[i-1] + 1) >> 2
;   out[2*i+1] = (3 * in[i] + in[i+1] + 2) >> 2
;
; At the left edge of a row in[-1] is taken to be in[0]; at the right edge of
; the last block the neighbour is that block's own last byte.  When the width
; is not a multiple of SIZEOF_YMMWORD, one extra byte (a copy of the last
; sample) is written just past the end of the input row, so the input rows
; must be writable there.
;
; Register roles inside the loops:
;   rax   = colctr (rounded up to a multiple of SIZEOF_YMMWORD per row)
;   rcx   = rowctr
;   rsi   = input_data  (row pointer array), then inptr  within a row
;   rdi   = output_data (row pointer array), then outptr within a row
;   ymm0  = all zero
;   ymm10 = mask selecting byte 0,  ymm9 = mask selecting byte 31
;   ymm7  = left neighbour carried into the current block (in byte 0)
;   ymm6  = right neighbour of the current block (in byte 31)
;
; NOTE(review): push_xmm / pop_xmm / collect_args / uncollect_args come from
; jsimdext.inc, which is not visible here.  This routine writes xmm6..xmm10
; but only requests "push_xmm 3"; confirm that this is sufficient on ABIs
; where xmm9/xmm10 are callee-saved (Win64).
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)

EXTN(jsimd_h2v1_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp                ; presumably consumed by collect_args - confirm
    mov         rbp, rsp
    push_xmm    3
    collect_args 4

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return            ; nothing to do for zero width

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return            ; nothing to do for zero rows

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data

    ; Row-invariant vectors: zero and the two single-byte edge masks.
    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff

    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff

.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    ; Left edge: the first sample serves as its own left neighbour.
    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD        ; round colctr up to whole blocks
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; Last block of the row: its own last byte is the right neighbour.
    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    jmp         short .upsample

.columnloop:
    ; Not the last block: move byte 0 of the next block into byte 31.
    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vperm2i128  ymm6, ymm0, ymm6, 0x20
    vpslldq     ymm6, ymm6, 15

.upsample:
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)

    vperm2i128  ymm2, ymm0, ymm1, 0x20
    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
    vperm2i128  ymm4, ymm0, ymm1, 0x03
    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)

    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)

    ; Carry sample 31 into byte 0 as the next block's left neighbour.
    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)

    ; Widen bytes to words.  The unpacks work per 128-bit lane, so each pair
    ; is followed by lane permutes that restore sequential order.
    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vpmullw     ymm1, ymm1, [rel PW_THREE]      ; 3 * in[i]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm2, ymm2, [rel PW_ONE]        ; in[i-1] + 1
    vpaddw      ymm5, ymm5, [rel PW_ONE]
    vpaddw      ymm3, ymm3, [rel PW_TWO]        ; in[i+1] + 2
    vpaddw      ymm6, ymm6, [rel PW_TWO]

    vpaddw      ymm2, ymm2, ymm1
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm3, ymm3, ymm1
    vpaddw      ymm6, ymm6, ymm4
    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    ; Interleave: odd results go to the high byte of each word.
    vpsllw      ymm3, ymm3, BYTE_BIT
    vpsllw      ymm6, ymm6, BYTE_BIT
    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5

    sub         rax, byte SIZEOF_YMMWORD
    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop            ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last       ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 4
    pop_xmm     3
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Each pass of .rowloop reads three input rows (input_data[-1], [0], [+1]) and
; writes two output rows, so the row pointers above and below the current one
; must be valid.  rowctr drops by 2 per pass.
;
; Vertical step, as 16-bit intermediates:
;   Int0[i] = 3 * row[0][i] + row[-1][i]       (upper output row)
;   Int1[i] = 3 * row[0][i] + row[+1][i]       (lower output row)
; Horizontal step on either intermediate row:
;   out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;   out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;
; The intermediates are parked in the output rows themselves (32 words take
; the same 64 bytes as the 64 output samples) and overwritten by the final
; result.  When the width is not a multiple of SIZEOF_YMMWORD, one extra byte
; is written just past the end of each of the three input rows.
;
; Stack scratch, addressed through the 32-byte aligned rbp:
;   wk(0) / wk(1) = left-neighbour word for the upper / lower row (word 0)
;   wk(2) / wk(3) = right-neighbour word for the upper / lower row (word 15)
;
; Register roles inside the loops:
;   rax = colctr, rcx = rowctr (outer) / inptr1(above) (inner)
;   rbx = inptr0, rsi = inptr1(below), rdx = outptr0, rdi = outptr1
;   ymm8 = all zero, ymm10 = mask for word 0, ymm9 = mask for word 15
;
; NOTE(review): push_xmm / pop_xmm / collect_args / uncollect_args come from
; jsimdext.inc, which is not visible here.  This routine writes xmm6..xmm10
; but only requests "push_xmm 3"; confirm that this is sufficient on ABIs
; where xmm9/xmm10 are callee-saved (Win64).
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  4

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)

EXTN(jsimd_h2v2_fancy_upsample_avx2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] below rbp
    push_xmm    3
    collect_args 4
    push        rbx

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return            ; nothing to do for zero width

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return            ; nothing to do for zero rows

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data

    ; Row-invariant vectors.  ymm8/ymm9/ymm10 are only read inside the loops,
    ; so they are built once here instead of once per row.
    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
    vpcmpeqb    xmm9, xmm9, xmm9
    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff

.rowloop:
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    test        rax, SIZEOF_YMMWORD-1
    jz          short .skip
    push        rdx                     ; dl is needed as a byte temporary
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]

    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6

    ; Left edge: word 0 serves as its own left neighbour.
    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)

    vmovdqa     YMMWORD [wk(0)], ymm1
    vmovdqa     YMMWORD [wk(1)], ymm2

    add         rax, byte SIZEOF_YMMWORD-1
    and         rax, byte -SIZEOF_YMMWORD   ; round colctr up to whole blocks
    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block

    ; Right edge: the block's own last word is its right neighbour.
    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]

    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)

    jmp         near .upsample

.columnloop:
    ; -- process the next column block

    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]

    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vpmullw     ymm0, ymm0, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]

    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6

    ; Move word 0 of the next block into word 15: the current block's
    ; right neighbour.
    vperm2i128  ymm1, ymm8, ymm1, 0x20
    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
    vperm2i128  ymm2, ymm8, ymm2, 0x20
    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)

    vmovdqa     YMMWORD [wk(2)], ymm1
    vmovdqa     YMMWORD [wk(3)], ymm2

.upsample:
    ; -- process the upper row

    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vperm2i128  ymm0, ymm8, ymm7, 0x03
    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm4, ymm8, ymm3, 0x20
    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm5, ymm8, ymm7, 0x03
    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm6, ymm8, ymm3, 0x20
    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm2, ymm8, ymm3, 0x03
    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm4, ymm8, ymm3, 0x03
    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm8, ymm7, 0x20
    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(0)], ymm4   ; word 31 becomes the next block's left neighbour

    vpmullw     ymm7, ymm7, [rel PW_THREE]
    vpmullw     ymm3, ymm3, [rel PW_THREE]
    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
    vpaddw      ymm2, ymm2, [rel PW_SEVEN]

    vpaddw      ymm1, ymm1, ymm7
    vpaddw      ymm5, ymm5, ymm3
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm3
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm0, ymm0, BYTE_BIT
    vpsllw      ymm2, ymm2, BYTE_BIT
    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5

    ; -- process the lower row

    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)

    vperm2i128  ymm7, ymm8, ymm6, 0x03
    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
    vperm2i128  ymm3, ymm8, ymm4, 0x20
    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)

    vperm2i128  ymm0, ymm8, ymm6, 0x03
    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm2, ymm8, ymm4, 0x20
    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)

    vperm2i128  ymm5, ymm8, ymm4, 0x03
    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
    vperm2i128  ymm3, ymm8, ymm4, 0x03
    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
    vperm2i128  ymm1, ymm8, ymm6, 0x20
    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)

    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)

    vmovdqa     YMMWORD [wk(1)], ymm3   ; word 31 becomes the next block's left neighbour

    vpmullw     ymm6, ymm6, [rel PW_THREE]
    vpmullw     ymm4, ymm4, [rel PW_THREE]
    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
    vpaddw      ymm5, ymm5, [rel PW_SEVEN]

    vpaddw      ymm1, ymm1, ymm6
    vpaddw      ymm0, ymm0, ymm4
    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
    vpaddw      ymm7, ymm7, ymm6
    vpaddw      ymm5, ymm5, ymm4
    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpsllw      ymm5, ymm5, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0

    sub         rax, byte SIZEOF_YMMWORD
    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .columnloop            ; more than one block left
    test        rax, rax
    jnz         near .columnloop_last       ; exactly one block left

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 4
    pop_xmm     3
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in succession (each byte is unpacked
; with itself).  The column counter is output_width rounded up to a multiple
; of SIZEOF_YMMWORD, so stores may run past output_width up to that boundary.
;
; Register roles inside the loops:
;   rdx = rounded-up output width (constant), rax = colctr (output bytes left)
;   rcx = rowctr
;   rsi = input_data  (row pointer array), then inptr  within a row
;   rdi = output_data (row pointer array), then outptr within a row
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)

EXTN(jsimd_h2v1_upsample_avx2):
    push        rbp
    mov         rax, rsp                ; presumably consumed by collect_args - confirm
    mov         rbp, rsp
    collect_args 4

    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD    ; round the width up to whole ymmwords
    jz          near .return            ; nothing to do for zero width

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          short .return           ; nothing to do for zero rows

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr
    mov         rax, rdx                ; colctr
.columnloop:

    cmp         rax, byte SIZEOF_YMMWORD
    ja          near .above_16

    ; One ymmword of output left: expand 16 input bytes with xmm registers.
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         short .nextrow

.above_16:
    ; Two or more ymmwords of output left: expand 32 input bytes.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; The unpacks work per 128-bit lane, so swap the two middle qwords first
    ; to make the doubled bytes come out in sequential order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          short .rowloop

.return:
    vzeroupper
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in succession, and the expanded row is
; stored to two consecutive output rows.  rowctr therefore drops by 2 per
; input row.  The column counter is output_width rounded up to a multiple of
; SIZEOF_YMMWORD, so stores may run past output_width up to that boundary.
;
; Register roles inside the loops:
;   rdx = rounded-up output width (constant), rax = colctr (output bytes left)
;   rcx = rowctr
;   rsi = input_data  (row pointer array), then inptr within a row
;   rdi = output_data (row pointer array), then outptr1 within a row
;   rbx = outptr0 (callee-saved, hence the push/pop around the body)
;

; r10 = int max_v_samp_factor
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)

EXTN(jsimd_h2v2_upsample_avx2):
    push        rbp
    mov         rax, rsp                ; presumably consumed by collect_args - confirm
    mov         rbp, rsp
    collect_args 4
    push        rbx

    mov         edx, r11d
    add         rdx, byte (SIZEOF_YMMWORD-1)
    and         rdx, -SIZEOF_YMMWORD    ; round the width up to whole ymmwords
    jz          near .return            ; nothing to do for zero width

    mov         rcx, r10                ; rowctr
    test        rcx, rcx
    jz          near .return            ; nothing to do for zero rows

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdip, JSAMPARRAY [rdi]  ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsip, JSAMPROW [rsi]                    ; inptr
    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                                ; colctr
.columnloop:

    cmp         rax, byte SIZEOF_YMMWORD
    ja          short .above_16

    ; One ymmword of output left: expand 16 input bytes with xmm registers.
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpunpckhbw  xmm1, xmm0, xmm0
    vpunpcklbw  xmm0, xmm0, xmm0

    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    jmp         near .nextrow

.above_16:
    ; Two or more ymmwords of output left: expand 32 input bytes.
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]

    ; The unpacks work per 128-bit lane, so swap the two middle qwords first
    ; to make the doubled bytes come out in sequential order.
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhbw  ymm1, ymm0, ymm0
    vpunpcklbw  ymm0, ymm0, ymm0

    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1

    sub         rax, byte 2*SIZEOF_YMMWORD
    jz          short .nextrow

    add         rsi, byte SIZEOF_YMMWORD    ; inptr
    add         rbx, byte 2*SIZEOF_YMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32