;
; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                      DCTELEM *workspace);
;
; Each loop iteration reads one 64-bit chunk of samples from each of four
; consecutive row pointers (starting at column start_col), widens the bytes
; to 16-bit words, adds 0xFF80 (i.e. -128) to every word, and stores the
; four resulting XMM words to four consecutive workspace rows.  The loop
; runs DCTSIZE/4 times.
;
; The workspace is written with movdqa, so it must be 16-byte aligned.
;
; Scratch registers used: rax, rbx (saved/restored here), rcx, rdx, rsi,
; rdi, xmm0-xmm3, xmm6, xmm7.
; NOTE(review): rsi/rdi and xmm6/xmm7 are callee-saved under the Microsoft
; x64 ABI; presumably collect_args/uncollect_args (jsimdext.inc, not visible
; here) take care of that -- confirm.
;

; r10 = JSAMPARRAY sample_data
; r11 = JDIMENSION start_col
; r12 = DCTELEM *workspace

        align   16
        global  EXTN(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
        push    rbp
        mov     rax,rsp                 ; set before collect_args; presumably
                                        ; used by the macro -- TODO confirm
        mov     rbp,rsp
        collect_args
        push    rbx                     ; rbx is used as a row pointer below

        pxor    xmm6,xmm6               ; xmm6=(all 0's)
        pcmpeqw xmm7,xmm7
        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

        mov     rsi, r10                ; rsi = sample_data (row pointer array)
        mov     eax, r11d               ; rax = start_col (32-bit write
                                        ; zero-extends into rax)
        mov     rdi, r12                ; rdi = workspace
        mov     rcx, DCTSIZE/4          ; rcx = iterations (4 rows each)
.convloop:
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)

        mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
        movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)

        ; Interleaving with the zero register widens each byte to a word;
        ; adding xmm7 then subtracts 128 from every word.
        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
        paddw   xmm0,xmm7
        paddw   xmm1,xmm7
        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
        paddw   xmm2,xmm7
        paddw   xmm3,xmm7

        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        add     rsi, byte 4*SIZEOF_JSAMPROW             ; next 4 row pointers
        add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; next 4 workspace rows
        dec     rcx
        jnz     short .convloop

        pop     rbx
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
;                      DCTELEM *workspace);
;
; Per 16-bit element x the loop computes
;   a = |x|                                   (sign mask: xor, then subtract)
;   a = a + correction
;   a = high 16 bits of (a * reciprocal)      (unsigned, pmulhuw)
;   a = high 16 bits of (a * scale)           (unsigned, pmulhuw)
;   result = a with the original sign of x restored
; Four XMM words are processed per iteration; the loop runs DCTSIZE2/32
; times.
;
; The divisors pointer is used as the base of three tables addressed by the
; RECIPROCAL, CORRECTION and SCALE macros below (block rows DCTSIZE*0,
; DCTSIZE*1 and DCTSIZE*2 respectively); a single pointer (rdx) is advanced
; to walk all three in step.
;
; workspace and coef_block are accessed with movdqa and divisors is used as
; a direct memory operand of paddw/pmulhuw, so all three must be 16-byte
; aligned.
;
; Scratch registers used: rax, rdx, rsi, rdi, xmm0-xmm7.
; NOTE(review): rsi/rdi and xmm6/xmm7 are callee-saved under the Microsoft
; x64 ABI; presumably collect_args/uncollect_args (jsimdext.inc, not visible
; here) take care of that -- confirm.
;

%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace

        align   16
        global  EXTN(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
        push    rbp
        mov     rax,rsp                 ; set before collect_args; presumably
                                        ; used by the macro -- TODO confirm
        mov     rbp,rsp
        collect_args

        mov     rsi, r12                ; rsi = workspace (input)
        mov     rdx, r11                ; rdx = divisors
        mov     rdi, r10                ; rdi = coef_block (output)
        mov     rax, DCTSIZE2/32        ; rax = loop counter
.quantloop:
        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm0,xmm4
        movdqa  xmm1,xmm5
        movdqa  xmm2,xmm6
        movdqa  xmm3,xmm7
        ; xmm4-xmm7 become per-word sign masks; they are kept until the end
        ; of the iteration so the sign can be re-applied to the result.
        psraw   xmm4,(WORD_BIT-1)
        psraw   xmm5,(WORD_BIT-1)
        psraw   xmm6,(WORD_BIT-1)
        psraw   xmm7,(WORD_BIT-1)
        pxor    xmm0,xmm4
        pxor    xmm1,xmm5
        pxor    xmm2,xmm6
        pxor    xmm3,xmm7
        psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
        psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
        psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
        psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;

        paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]     ; correction + roundfactor
        paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
        paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
        paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]     ; reciprocal
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
        pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]          ; scale
        pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]

        ; Re-apply the original sign using the masks saved in xmm4-xmm7.
        pxor    xmm0,xmm4
        pxor    xmm1,xmm5
        pxor    xmm2,xmm6
        pxor    xmm3,xmm7
        psubw   xmm0,xmm4
        psubw   xmm1,xmm5
        psubw   xmm2,xmm6
        psubw   xmm3,xmm7
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        add     rsi, byte 32*SIZEOF_DCTELEM
        add     rdx, byte 32*SIZEOF_DCTELEM
        add     rdi, byte 32*SIZEOF_JCOEF
        dec     rax
        jnz     near .quantloop

        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16