;
; jcsample.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; Outline:
;   1. expand_right_edge: when output_cols*2 > image_width, every one of the
;      max_v_samp_factor input rows is padded on the right by replicating its
;      last valid sample (the input rows are written to).
;   2. h2v1_downsample: for each of v_samp_factor rows, every horizontal pair
;      of input samples is reduced to (even + odd + bias) >> 1, where the bias
;      alternates 0,1 across output columns.
;
; Registers written by the visible code: rax, rcx, rdx, rsi, rdi, xmm0-xmm3,
; xmm6, xmm7, flags.  collect_args / uncollect_args come from jsimdext.inc and
; are not visible here; presumably they load r10-r15 as listed above and
; save/restore whatever the target ABI requires -- confirm there.
;
; Row data is accessed with movdqa, i.e. aligned XMMWORD loads and stores, and
; the output is always written in whole XMMWORD units.

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; NOTE(review): rax is not used below before
                                        ; being overwritten; presumably collect_args
                                        ; reads it -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; 32-bit read zero-extends width_blocks
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to do for zero output columns

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the downsample pass
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = number of padding bytes per row
        jle     short .expand_end       ; input already wide enough

        ; max_v_samp_factor is a 32-bit int, so only r11d is read: this keeps
        ; the row count independent of the upper half of the argument register
        ; and matches how the other 32-bit arguments are read in this routine.
        mov     eax, r11d               ; rax = number of rows to pad
        test    eax, eax
        jle     short .expand_end       ; zero or negative row count: no padding

        cld                             ; rep stosb must store forward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is needed for the fill value
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]     ; rdi = start of current input row
        add     rdi, rdx                ; rdi = first byte past the valid samples
        mov     al, JSAMPLE [rdi-1]     ; al = last valid sample of the row

        rep stosb                       ; replicate it rcx times to the right

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW ; next row pointer
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d               ; rowctr
        test    eax, eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern (only edx is consumed)
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx                     ; output_cols is consumed per row
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than SIZEOF_XMMWORD output columns remain: load only one
        ; XMMWORD of input and treat the second one as zero.  A full XMMWORD
        ; is still stored; the half derived from xmm1 comes out as zero.
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     rcx, SIZEOF_XMMWORD     ; so the subtraction below leaves 0
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1

        pand    xmm0, xmm6              ; even-indexed samples as words
        psrlw   xmm2, BYTE_BIT          ; odd-indexed samples as words
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT

        paddw   xmm0, xmm2              ; even + odd
        paddw   xmm1, xmm3
        paddw   xmm0, xmm7              ; + alternating 0/1 bias
        paddw   xmm1, xmm7
        psrlw   xmm0, 1                 ; / 2
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; 16 words -> 16 output samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx, rcx
        jnz     short .columnloop_r8    ; partial tail remains

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; Outline:
;   1. expand_right_edge: when output_cols*2 > image_width, every one of the
;      max_v_samp_factor input rows is padded on the right by replicating its
;      last valid sample (the input rows are written to).
;   2. h2v2_downsample: for each of v_samp_factor output rows, two consecutive
;      input rows are read and every 2x2 group of samples is reduced to
;      (sum of the four + bias) >> 2, where the bias alternates 1,2 across
;      output columns.
;
; Registers written by the visible code: rax, rcx, rdx, rsi, rdi, xmm0-xmm7,
; flags.  collect_args / uncollect_args come from jsimdext.inc and are not
; visible here; presumably they load r10-r15 as listed above and save/restore
; whatever the target ABI requires -- confirm there.
;
; Row data is accessed with movdqa, i.e. aligned XMMWORD loads and stores, and
; the output is always written in whole XMMWORD units.

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rax, rsp                ; NOTE(review): rax is not used below before
                                        ; being overwritten; presumably collect_args
                                        ; reads it -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     ecx, r13d               ; 32-bit read zero-extends width_blocks
        shl     rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to do for zero output columns

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; keep output_cols for the downsample pass
        shl     rcx, 1                  ; output_cols * 2
        sub     rcx, rdx                ; rcx = number of padding bytes per row
        jle     short .expand_end       ; input already wide enough

        ; max_v_samp_factor is a 32-bit int, so only r11d is read: this keeps
        ; the row count independent of the upper half of the argument register
        ; and matches how the other 32-bit arguments are read in this routine.
        mov     eax, r11d               ; rax = number of rows to pad
        test    eax, eax
        jle     short .expand_end       ; zero or negative row count: no padding

        cld                             ; rep stosb must store forward
        mov     rsi, r14                ; input_data
.expandloop:
        push    rax                     ; al is needed for the fill value
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]     ; rdi = start of current input row
        add     rdi, rdx                ; rdi = first byte past the valid samples
        mov     al, JSAMPLE [rdi-1]     ; al = last valid sample of the row

        rep stosb                       ; replicate it rcx times to the right

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW ; next row pointer
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d               ; rowctr
        test    rax, rax
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern (only edx is consumed)
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6
        pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14                ; input_data
        mov     rdi, r15                ; output_data
.rowloop:
        push    rcx                     ; output_cols is consumed per row
        push    rdi
        push    rsi

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

.columnloop_r8:
        ; Fewer than SIZEOF_XMMWORD output columns remain: load only one
        ; XMMWORD from each input row and treat the second pair as zero.
        ; A full XMMWORD is still stored; the half derived from xmm2/xmm3
        ; comes out as zero.
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     rcx, SIZEOF_XMMWORD     ; so the subtraction below leaves 0
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        ; Horizontal pair sums for the first XMMWORD of each row.
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        pand    xmm0, xmm6              ; even-indexed samples as words
        psrlw   xmm4, BYTE_BIT          ; odd-indexed samples as words
        pand    xmm1, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm0, xmm4              ; row 0: even + odd
        paddw   xmm1, xmm5              ; row 1: even + odd

        ; Horizontal pair sums for the second XMMWORD of each row.
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm2, xmm4
        paddw   xmm3, xmm5

        paddw   xmm0, xmm1              ; add the two rows together
        paddw   xmm2, xmm3
        paddw   xmm0, xmm7              ; + alternating 1/2 bias
        paddw   xmm2, xmm7
        psrlw   xmm0, 2                 ; / 4
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; 16 words -> 16 output samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx, rcx
        jnz     near .columnloop_r8     ; partial tail remains

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16