;
; jcsample.asm - downsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS 64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;

; Register roles after collect_args (macro from jsimdext.inc; presumably it
; spills the platform's C argument registers into r10-r15 -- see that file):
; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align 32
        GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
        push    rbp
        mov     rax, rsp
        mov     rbp, rsp
        collect_args 6

        mov     ecx, r13d
        shl     rcx, 3              ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return        ; zero-width output: nothing to do

        mov     edx, r10d           ; rdx = image_width

        ; -- expand_right_edge
        ;
        ; Replicate the rightmost valid sample of each row out to
        ; output_cols * 2 columns, so the vector loads in the column loop
        ; below can safely read (and average) samples past image_width.

        push    rcx
        shl     rcx, 1              ; output_cols * 2
        sub     rcx, rdx            ; rcx = number of padding columns to fill
        jle     short .expand_end   ; row is already wide enough

        mov     rax, r11            ; rax = max_v_samp_factor (rows to pad)
        test    rax, rax
        jle     short .expand_end

        cld                         ; rep stosb must move forward
        mov     rsi, r14            ; input_data
.expandloop:
        push    rax
        push    rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx            ; rdi -> first column past image_width
        mov     al, JSAMPLE [rdi-1] ; al = rightmost valid sample of this row

        rep stosb                   ; fill rcx padding bytes with that sample

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW   ; advance to next row pointer
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                 ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d           ; rowctr = v_samp_factor
        test    eax, eax
        jle     near .return

        ; Build the rounding bias: words alternate {0, 1} so that the pair
        ; averages below round alternately down/up, avoiding a systematic
        ; half-sample drift.
        mov     rdx, 0x00010000     ; bias pattern
        vmovd   xmm7, edx
        vpshufd xmm7, xmm7, 0x00    ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        vperm2i128 ymm7, ymm7, ymm7, 0  ; ymm7={xmm7, xmm7}
        vpcmpeqw ymm6, ymm6, ymm6
        vpsrlw  ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
                                        ; (mask selecting even-indexed bytes)

        mov     rsi, r14            ; input_data
        mov     rdi, r15            ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi] ; inptr
        mov     rdi, JSAMPROW [rdi] ; outptr

        cmp     rcx, byte SIZEOF_YMMWORD
        jae     short .columnloop   ; full 32-sample output groups remain

.columnloop_r24:
        ; Tail handling: rcx can possibly be 8, 16, 24 (output_cols is a
        ; multiple of DCTSIZE).  Load what exists, zero-pad the rest, and
        ; fall through to a full-width .downsample pass; the garbage lanes
        ; land past the row's real output but within the padded width.
        cmp     rcx, 24
        jne     .columnloop_r16
        vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
        vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
        mov     rcx, SIZEOF_YMMWORD
        jmp     short .downsample

.columnloop_r16:
        cmp     rcx, 16
        jne     .columnloop_r8
        vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
        vpxor   ymm1, ymm1, ymm1
        mov     rcx, SIZEOF_YMMWORD
        jmp     short .downsample

.columnloop_r8:
        vmovdqu xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]
        vpxor   ymm1, ymm1, ymm1
        mov     rcx, SIZEOF_YMMWORD
        jmp     short .downsample

.columnloop:
        vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
        vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
        ; Split each input vector into odd samples (high byte of each word,
        ; via shift) and even samples (low byte, via mask), then compute
        ; out = (even + odd + bias) >> 1 on 16-bit lanes.
        vpsrlw  ymm2, ymm0, BYTE_BIT    ; ymm2 = odd-indexed samples as words
        vpand   ymm0, ymm0, ymm6        ; ymm0 = even-indexed samples as words
        vpsrlw  ymm3, ymm1, BYTE_BIT
        vpand   ymm1, ymm1, ymm6

        vpaddw  ymm0, ymm0, ymm2
        vpaddw  ymm1, ymm1, ymm3
        vpaddw  ymm0, ymm0, ymm7        ; add alternating rounding bias
        vpaddw  ymm1, ymm1, ymm7
        vpsrlw  ymm0, ymm0, 1
        vpsrlw  ymm1, ymm1, 1

        ; vpackuswb packs within each 128-bit lane, interleaving the two
        ; source vectors' lanes; vpermq 0xd8 restores sequential order.
        vpackuswb ymm0, ymm0, ymm1
        vpermq  ymm0, ymm0, 0xd8

        vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

        sub     rcx, byte SIZEOF_YMMWORD    ; outcol
        add     rsi, byte 2*SIZEOF_YMMWORD  ; inptr
        add     rdi, byte 1*SIZEOF_YMMWORD  ; outptr
        cmp     rcx, byte SIZEOF_YMMWORD
        jae     short .columnloop
        test    rcx, rcx
        jnz     near .columnloop_r24        ; partial group remains

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW   ; input_data
        add     rdi, byte SIZEOF_JSAMPROW   ; output_data
        dec     rax                         ; rowctr
        jg      near .rowloop

.return:
        vzeroupper                  ; avoid AVX->SSE transition penalty in caller
        uncollect_args 6
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;

; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

        align 32
        GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
        push    rbp
        mov     rax, rsp
        mov     rbp, rsp
        collect_args 6

        mov     ecx, r13d
        shl     rcx, 3              ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return        ; zero-width output: nothing to do

        mov     edx, r10d           ; rdx = image_width

        ; -- expand_right_edge
        ;
        ; Same right-edge replication as in the h2v1 routine above, so the
        ; vector loads below never average in uninitialized bytes.

        push    rcx
        shl     rcx, 1              ; output_cols * 2
        sub     rcx, rdx            ; rcx = number of padding columns to fill
        jle     short .expand_end

        mov     rax, r11            ; rax = max_v_samp_factor (rows to pad)
        test    rax, rax
        jle     short .expand_end

        cld                         ; rep stosb must move forward
        mov     rsi, r14            ; input_data
.expandloop:
        push    rax
        push    rcx

        mov     rdi, JSAMPROW [rsi]
        add     rdi, rdx            ; rdi -> first column past image_width
        mov     al, JSAMPLE [rdi-1] ; al = rightmost valid sample of this row

        rep stosb                   ; fill rcx padding bytes with that sample

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW   ; advance to next row pointer
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                 ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d           ; rowctr = v_samp_factor
        test    rax, rax
        jle     near .return

        ; Bias words alternate {1, 2}: each output is the average of a 2x2
        ; block of four samples, so (sum + bias) >> 2 rounds alternately,
        ; avoiding a systematic drift.
        mov     rdx, 0x00020001     ; bias pattern
        vmovd   xmm7, edx
        vpcmpeqw ymm6, ymm6, ymm6
        vpshufd xmm7, xmm7, 0x00    ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
        vperm2i128 ymm7, ymm7, ymm7, 0
        vpsrlw  ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
                                        ; (mask selecting even-indexed bytes)

        mov     rsi, r14            ; input_data
        mov     rdi, r15            ; output_data
.rowloop:
        push    rcx
        push    rdi
        push    rsi

        ; Two input rows are consumed per output row.
        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
        mov     rdi, JSAMPROW [rdi] ; outptr

        cmp     rcx, byte SIZEOF_YMMWORD
        jae     short .columnloop   ; full 32-sample output groups remain

.columnloop_r24:
        ; Tail handling as in h2v1: rcx can be 8, 16, or 24.  Load the
        ; existing samples from both rows, zero-pad, run a full-width pass.
        cmp     rcx, 24
        jne     .columnloop_r16
        vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
        vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
        vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
        vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
        mov     rcx, SIZEOF_YMMWORD
        jmp     short .downsample

.columnloop_r16:
        cmp     rcx, 16
        jne     .columnloop_r8
        vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
        vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
        vpxor   ymm2, ymm2, ymm2
        vpxor   ymm3, ymm3, ymm3
        mov     rcx, SIZEOF_YMMWORD
        jmp     short .downsample

.columnloop_r8:
        vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        vpxor   ymm2, ymm2, ymm2
        vpxor   ymm3, ymm3, ymm3
        mov     rcx, SIZEOF_YMMWORD
        jmp     short .downsample

.columnloop:
        vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; row 0, group 0
        vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; row 1, group 0
        vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; row 0, group 1
        vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; row 1, group 1

.downsample:
        ; For each row vector, separate even samples (mask) from odd samples
        ; (shift) into 16-bit lanes and add them: horizontal pair sums.
        vpand   ymm4, ymm0, ymm6
        vpsrlw  ymm0, ymm0, BYTE_BIT
        vpand   ymm5, ymm1, ymm6
        vpsrlw  ymm1, ymm1, BYTE_BIT
        vpaddw  ymm0, ymm0, ymm4    ; ymm0 = row-0 pair sums (group 0)
        vpaddw  ymm1, ymm1, ymm5    ; ymm1 = row-1 pair sums (group 0)

        vpand   ymm4, ymm2, ymm6
        vpsrlw  ymm2, ymm2, BYTE_BIT
        vpand   ymm5, ymm3, ymm6
        vpsrlw  ymm3, ymm3, BYTE_BIT
        vpaddw  ymm2, ymm2, ymm4    ; ymm2 = row-0 pair sums (group 1)
        vpaddw  ymm3, ymm3, ymm5    ; ymm3 = row-1 pair sums (group 1)

        ; Combine the two rows, add the bias, and divide the 4-sample sum
        ; by 4: out = (sum of 2x2 block + bias) >> 2.
        vpaddw  ymm0, ymm0, ymm1
        vpaddw  ymm2, ymm2, ymm3
        vpaddw  ymm0, ymm0, ymm7
        vpaddw  ymm2, ymm2, ymm7
        vpsrlw  ymm0, ymm0, 2
        vpsrlw  ymm2, ymm2, 2

        ; vpackuswb packs per 128-bit lane; vpermq 0xd8 restores order.
        vpackuswb ymm0, ymm0, ymm2
        vpermq  ymm0, ymm0, 0xd8

        vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

        sub     rcx, byte SIZEOF_YMMWORD    ; outcol
        add     rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
        add     rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
        add     rdi, byte 1*SIZEOF_YMMWORD  ; outptr
        cmp     rcx, byte SIZEOF_YMMWORD
        jae     near .columnloop
        test    rcx, rcx
        jnz     near .columnloop_r24        ; partial group remains

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW ; input_data (2 rows consumed)
        add     rdi, byte 1*SIZEOF_JSAMPROW ; output_data
        dec     rax                         ; rowctr
        jg      near .rowloop

.return:
        vzeroupper                  ; avoid AVX->SSE transition penalty in caller
        uncollect_args 6
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align 32