;
; jcsample.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Outline:
;   1. expand_right_edge: with output_cols = width_in_blocks * 8, each of the
;      max_v_samp_factor input rows gets (2 * output_cols - image_width)
;      copies of its sample at index image_width - 1 appended via rep stosb.
;      Skipped when that count or max_v_samp_factor is <= 0.
;   2. h2v1_downsample: for each of v_samp_factor rows, every horizontal pair
;      of input samples (a, b) produces one output sample
;      (a + b + bias) >> 1, where bias alternates 0, 1 across output columns.
;
; Registers written here: rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm6, xmm7.
; NOTE(review): preservation of callee-saved registers is presumably handled
; by collect_args / uncollect_args in jsimdext.inc - confirm there.
; NOTE(review): movdqa faults on addresses that are not 16-byte aligned, so
; the row buffers are assumed to be 16-byte aligned - confirm with the caller.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; presumably consumed by collect_args
                                        ; (jsimdext.inc) - TODO confirm
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d               ; zero-extends width_in_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero columns

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the second phase
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples
    jle         short .expand_end       ; rows are already wide enough

    ; max_v_samp_factor is a C int: only the low 32 bits of its argument
    ; register are defined, so sign-extend r11d instead of reading all of r11.
    movsxd      rax, r11d               ; rax = row counter
    test        rax, rax
    jle         short .expand_end

    cld                                 ; rep stosb must advance upward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value below
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]    ; rdi = start of the current row
    add         rdi, rdx                ; rdi = &row[image_width]
    mov         al, JSAMPLE [rdi-1]     ; al = row[image_width - 1]

    rep stosb                           ; append rcx copies of al

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; next row pointer
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return

    mov         rdx, 0x00010000         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; all bits set
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols, restored for the next row
    push        rdi                     ; position in output_data
    push        rsi                     ; position in input_data

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

    ; Fewer than SIZEOF_XMMWORD output columns remain: load one input vector,
    ; zero the second, and run the shared code one last time.
    ; NOTE(review): the store below is still a full XMMWORD, so it writes
    ; past the remaining columns; presumably the output rows are padded -
    ; confirm with the caller.
.columnloop_r8:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD     ; makes the sub below leave rcx = 0
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; low byte of each word  (even samples)
    psrlw       xmm2, BYTE_BIT          ; high byte of each word (odd samples)
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; even + odd
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; / 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; words -> bytes, xmm0 low, xmm1 high

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8    ; partial vector left over

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; Outline:
;   1. expand_right_edge: with output_cols = width_in_blocks * 8, each of the
;      max_v_samp_factor input rows gets (2 * output_cols - image_width)
;      copies of its sample at index image_width - 1 appended via rep stosb.
;      Skipped when that count or max_v_samp_factor is <= 0.
;   2. h2v2_downsample: for each of v_samp_factor output rows, two
;      consecutive input rows are read and every 2x2 group of samples
;      produces one output sample (sum + bias) >> 2, where bias alternates
;      1, 2 across output columns.
;
; Registers written here: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
; NOTE(review): preservation of callee-saved registers is presumably handled
; by collect_args / uncollect_args in jsimdext.inc - confirm there.
; NOTE(review): movdqa faults on addresses that are not 16-byte aligned, so
; the row buffers are assumed to be 16-byte aligned - confirm with the caller.
;

; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    mov         rax, rsp                ; presumably consumed by collect_args
                                        ; (jsimdext.inc) - TODO confirm
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d               ; zero-extends width_in_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero columns

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols for the second phase
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding samples
    jle         short .expand_end       ; rows are already wide enough

    ; max_v_samp_factor is a C int: only the low 32 bits of its argument
    ; register are defined, so sign-extend r11d instead of reading all of r11.
    movsxd      rax, r11d               ; rax = row counter
    test        rax, rax
    jle         short .expand_end

    cld                                 ; rep stosb must advance upward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value below
    push        rcx                     ; rep stosb consumes rcx

    mov         rdip, JSAMPROW [rsi]    ; rdi = start of the current row
    add         rdi, rdx                ; rdi = &row[image_width]
    mov         al, JSAMPLE [rdi-1]     ; al = row[image_width - 1]

    rep stosb                           ; append rcx copies of al

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW  ; next row pointer
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr (zero-extended into rax)
    test        rax, rax                ; rax < 2^32 here, so only 0 exits
    jle         near .return

    mov         rdx, 0x00020001         ; bias pattern
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; all bits set
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols, restored for the next row
    push        rdi                     ; position in output_data
    push        rsi                     ; position in input_data

    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdip, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

    ; Fewer than SIZEOF_XMMWORD output columns remain: load one vector from
    ; each input row, zero the second pair, and run the shared code one last
    ; time.
    ; NOTE(review): the store below is still a full XMMWORD, so it writes
    ; past the remaining columns; presumably the output rows are padded -
    ; confirm with the caller.
.columnloop_r8:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD     ; makes the sub below leave rcx = 0
    jmp         short .downsample

.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; First vector of each row: horizontal pair sums as words.
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; low byte of each word  (even samples)
    psrlw       xmm4, BYTE_BIT          ; high byte of each word (odd samples)
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4              ; row 0: even + odd
    paddw       xmm1, xmm5              ; row 1: even + odd

    ; Second vector of each row: same treatment.
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; row 0 + row 1 (2x2 sums)
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; + alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; / 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; words -> bytes, xmm0 low, xmm2 high

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8     ; partial vector left over

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32