;
; jdsample.asm - upsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

; Word constants used by the fancy upsamplers below.  Each entry is one
; XMM-sized vector of eight identical 16-bit words:
;   PW_THREE     - weight applied to the nearer sample (pmullw)
;   PW_ONE/TWO   - biases added before the ">> 2" in the h2v1 routine
;   PW_SEVEN/EIGHT - biases added before the ">> 4" in the h2v2 routine

EXTN(jconst_fancy_upsample_sse2):

PW_ONE      times 8 dw 1
PW_TWO      times 8 dw 2
PW_THREE    times 8 dw 3
PW_SEVEN    times 8 dw 7
PW_EIGHT    times 8 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; For every input sample s[i] two output samples are produced:
;   out[2i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; The left neighbor of sample 0 is sample 0 itself.  Past the right end a
; copy of the last sample is used (see the dummy sample inserted below).
;
; Rows are read and written in whole XMM blocks with movdqa, so the row
; pointers must be 16-byte aligned and the rows padded to a block multiple.
; NOTE: when the width is not a multiple of SIZEOF_XMMWORD, one byte is
; written into the *input* row just past the last sample.
;

; r10d = int max_v_samp_factor  (only the low 32 bits are read; the upper
;                                half of the register is not defined for an
;                                int argument)
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4                      ; macro from jsimdext.inc; presumably
                                        ; loads the arguments into r10-r13

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    movsxd      rcx, r10d               ; rowctr (sign-extend the 32-bit int)
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
.skip:
    pxor        xmm0, xmm0              ; xmm0=(all 0's)
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)
    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
                                        ; xmm7=sample 0 in byte 0: the "-1"
                                        ; neighbor of the first block

    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD  ; round colctr up to whole blocks
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; last block: the "16" neighbor is byte 15 of this same block
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)
    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    jmp         short .upsample

.columnloop:
    ; not the last block: the "16" neighbor is byte 0 of the next block
    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)

.upsample:
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1              ; xmm1=( 0  1  2 ... 13 14 15)
    pslldq      xmm2, 1                 ; xmm2=(--  0  1 ... 12 13 14)
    psrldq      xmm3, 1                 ; xmm3=( 1  2  3 ... 14 15 --)

    por         xmm2, xmm7              ; xmm2=(-1  0  1 ... 12 13 14)
    por         xmm3, xmm6              ; xmm3=( 1  2  3 ... 14 15 16)

    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
                                          ; ("-1" neighbor for the next block)

    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0              ; xmm1=( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm0              ; xmm4=( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0              ; xmm2=(-1  0  1  2  3  4  5  6)
    punpckhbw   xmm5, xmm0              ; xmm5=( 7  8  9 10 11 12 13 14)
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0              ; xmm3=( 1  2  3  4  5  6  7  8)
    punpckhbw   xmm6, xmm0              ; xmm6=( 9 10 11 12 13 14 15 16)

    pmullw      xmm1, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm2, [rel PW_ONE]
    paddw       xmm5, [rel PW_ONE]
    paddw       xmm3, [rel PW_TWO]
    paddw       xmm6, [rel PW_TWO]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

    ; interleave even/odd results: odd bytes go to the high half of each word
    psllw       xmm3, BYTE_BIT
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

    sub         rax, byte SIZEOF_XMMWORD
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop
    test        eax, eax
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Each input row produces two output rows.  First a vertical pass forms
;   Int0[i] = 3 * row[0][i] + row[-1][i]      (upper output row)
;   Int1[i] = 3 * row[0][i] + row[+1][i]      (lower output row)
; as 16-bit words, parked temporarily in the output rows themselves.  Then
; the horizontal pass computes, for each Int row,
;   out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;   out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; The row above input_data[0] (index -1) is read, so it must be valid.
;
; NOTE: when the width is not a multiple of SIZEOF_XMMWORD, one byte is
; written into each of the three *input* rows just past the last sample.
;

; r10d = int max_v_samp_factor  (only the low 32 bits are read; the upper
;                                half of the register is not defined for an
;                                int argument)
; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  4

; wk(0)/wk(1): "-1" neighbor word for the upper/lower row of the current block
; wk(2)/wk(3): "16" neighbor word for the upper/lower row of the current block

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]
    collect_args 4                      ; macro from jsimdext.inc; presumably
                                        ; loads the arguments into r10-r13
    push        rbx

    mov         eax, r11d               ; colctr
    test        rax, rax
    jz          near .return

    movsxd      rcx, r10d               ; rowctr (sign-extend the 32-bit int)
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rax                     ; colctr
    push        rcx
    push        rdi
    push        rsi

    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1

    test        rax, SIZEOF_XMMWORD-1
    jz          short .skip
    push        rdx                     ; dl is used as scratch below
    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
    pop         rdx
.skip:
    ; -- process the first column block

    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; xmm7=mask selecting word 0 only

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

    ; the "-1" neighbor of the first block is word 0 itself
    movdqa      XMMWORD [wk(0)], xmm1
    movdqa      XMMWORD [wk(1)], xmm2

    add         rax, byte SIZEOF_XMMWORD-1
    and         rax, byte -SIZEOF_XMMWORD  ; round colctr up to whole blocks
    cmp         rax, byte SIZEOF_XMMWORD
    ja          short .columnloop

.columnloop_last:
    ; -- process the last column block

    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; mask selecting word 7 only
    movdqa      xmm2, xmm1

    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

    jmp         near .upsample

.columnloop:
    ; -- process the next column block

    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)

    ; word 0 of the next block is the "16" neighbor of the current block
    movdqa      XMMWORD [wk(2)], xmm1
    movdqa      XMMWORD [wk(3)], xmm2

.upsample:
    ; -- process the upper row

    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm3
    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)

    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm7
    movdqa      xmm2, xmm3
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
    movdqa      xmm4, xmm3
    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(0)], xmm4     ; "-1" neighbor for the next block

    pmullw      xmm7, [rel PW_THREE]
    pmullw      xmm3, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm5, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_SEVEN]
    paddw       xmm2, [rel PW_SEVEN]

    paddw       xmm1, xmm7
    paddw       xmm5, xmm3
    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    paddw       xmm0, xmm7
    paddw       xmm2, xmm3
    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

    psllw       xmm0, BYTE_BIT
    psllw       xmm2, BYTE_BIT
    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

    ; -- process the lower row

    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)

    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(1)], xmm3     ; "-1" neighbor for the next block

    pmullw      xmm6, [rel PW_THREE]
    pmullw      xmm4, [rel PW_THREE]
    paddw       xmm1, [rel PW_EIGHT]
    paddw       xmm0, [rel PW_EIGHT]
    paddw       xmm7, [rel PW_SEVEN]
    paddw       xmm5, [rel PW_SEVEN]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

    psllw       xmm7, BYTE_BIT
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

    sub         rax, byte SIZEOF_XMMWORD
    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
    cmp         rax, byte SIZEOF_XMMWORD
    ja          near .columnloop
    test        rax, rax
    jnz         near .columnloop_last

    pop         rsi
    pop         rdi
    pop         rcx
    pop         rax

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input byte is written twice in a row (punpcklbw/punpckhbw of a
; register with itself).  The column count is output_width rounded up to a
; multiple of 2*SIZEOF_XMMWORD, and rows are accessed with movdqa, so the row
; pointers must be 16-byte aligned and the rows padded accordingly.
;

; r10d = int max_v_samp_factor  (only the low 32 bits are read; the upper
;                                half of the register is not defined for an
;                                int argument)
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4                      ; macro from jsimdext.inc; presumably
                                        ; loads the arguments into r10-r13

    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)  ; rdx = padded output width
    jz          near .return

    movsxd      rcx, r10d               ; rowctr (sign-extend the 32-bit int)
    test        rcx, rcx
    jz          short .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr
    mov         rax, rdx                ; colctr
.columnloop:
    ; the loop body is unrolled twice: one input block -> two output blocks

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0
    punpckhbw   xmm1, xmm1

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rcx                        ; rowctr
    jg          short .rowloop

.return:
    uncollect_args 4
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1
; horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input byte is written twice in a row, and the resulting row is stored
; to two consecutive output rows.  One input row is consumed per two output
; rows, so the row counter steps by 2.  The column count is output_width
; rounded up to a multiple of 2*SIZEOF_XMMWORD, and rows are accessed with
; movdqa, so the row pointers must be 16-byte aligned and the rows padded
; accordingly.
;

; r10d = int max_v_samp_factor  (only the low 32 bits are read; the upper
;                                half of the register is not defined for an
;                                int argument)
; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4                      ; macro from jsimdext.inc; presumably
                                        ; loads the arguments into r10-r13
    push        rbx

    mov         edx, r11d
    add         rdx, byte (2*SIZEOF_XMMWORD)-1
    and         rdx, byte -(2*SIZEOF_XMMWORD)  ; rdx = padded output width
    jz          near .return

    movsxd      rcx, r10d               ; rowctr (sign-extend the 32-bit int)
    test        rcx, rcx
    jz          near .return

    mov         rsi, r12                ; input_data
    mov         rdi, r13
    mov         rdi, JSAMPARRAY [rdi]   ; output_data
.rowloop:
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]                    ; inptr
    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         rax, rdx                               ; colctr
.columnloop:
    ; the loop body is unrolled twice: one input block -> two output blocks
    ; in each of the two output rows

    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0
    punpckhbw   xmm1, xmm1

    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

    sub         rax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
    jmp         short .columnloop

.nextrow:
    pop         rsi
    pop         rdi

    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         rcx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32