;
; jcsample.asm - downsampling (64-bit AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;

; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx, DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d

    ; -- expand_right_edge

    push        rcx
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx
    jle         short .expand_end

    mov         rax, r11
    test        rax, rax
    jle         short .expand_end

    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax
    push        rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx
    mov         al, JSAMPLE [rdi-1]

    rep stosb

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         rdx, 0x00010000         ; bias pattern
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

.columnloop_r24:
    ; rcx can possibly be 8, 16, 24
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    vpsrlw      ymm2, ymm0, BYTE_BIT
    vpand       ymm0, ymm0, ymm6
    vpsrlw      ymm3, ymm1, BYTE_BIT
    vpand       ymm1, ymm1, ymm6

    vpaddw      ymm0, ymm0, ymm2
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1
    vpsrlw      ymm1, ymm1, 1

    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW   ; input_data
    add         rdi, byte SIZEOF_JSAMPROW   ; output_data
    dec         rax                         ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 6
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;

; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx, DCTSIZE (rcx = output_cols)
    jz          near .return

    mov         edx, r10d

    ; -- expand_right_edge

    push        rcx
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx
    jle         short .expand_end

    mov         rax, r11
    test        rax, rax
    jle         short .expand_end

    cld
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax
    push        rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx
    mov         al, JSAMPLE [rdi-1]

    rep stosb

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr
    test        rax, rax
    jle         near .return

    mov         rdx, 0x00020001         ; bias pattern
    vmovd       xmm7, edx
    vpcmpeqw    ymm6, ymm6, ymm6
    vpshufd     xmm7, xmm7, 0x00        ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx
    push        rdi
    push        rsi

    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdi, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

.columnloop_r24:
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    vpand       ymm4, ymm0, ymm6
    vpsrlw      ymm0, ymm0, BYTE_BIT
    vpand       ymm5, ymm1, ymm6
    vpsrlw      ymm1, ymm1, BYTE_BIT
    vpaddw      ymm0, ymm0, ymm4
    vpaddw      ymm1, ymm1, ymm5

    vpand       ymm4, ymm2, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5

    vpaddw      ymm0, ymm0, ymm1
    vpaddw      ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2
    vpsrlw      ymm2, ymm2, 2

    vpackuswb   ymm0, ymm0, ymm2
    vpermq      ymm0, ymm0, 0xd8

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    vzeroupper
    uncollect_args 6
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
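
; --------------------------------------------------------------------------
;
; The arithmetic that jsimd_h2v1_downsample_avx2 vectorizes can be sketched
; in C roughly as follows: each output sample is the average of two adjacent
; input samples, rounded with the alternating 0/1 bias that the kernel
; splats into ymm7.  Variable names such as inptr, outptr, and bias are
; illustrative only; they are not symbols defined in this file.
;
;   /* rough scalar sketch of the h2v1 kernel (one output row) */
;   bias = 0;                           /* alternates 0, 1, 0, 1, ... */
;   for (outcol = 0; outcol < output_cols; outcol++) {
;     *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
;     bias ^= 1;
;     inptr += 2;
;   }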
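;
; Likewise, a rough scalar sketch of jsimd_h2v2_downsample_avx2: each output
; sample averages a 2x2 block taken from two input rows, using the
; alternating 1/2 bias held in ymm7 so that rounding does not drift in one
; direction.  Again, inptr0, inptr1, outptr, and bias are illustrative names.
;
;   /* rough scalar sketch of the h2v2 kernel (one output row) */
;   bias = 1;                           /* alternates 1, 2, 1, 2, ... */
;   for (outcol = 0; outcol < output_cols; outcol++) {
;     *outptr++ = (JSAMPLE)((inptr0[0] + inptr0[1] +
;                            inptr1[0] + inptr1[1] + bias) >> 2);
;     bias ^= 3;
;     inptr0 += 2;
;     inptr1 += 2;
;   }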