;
; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;

; r10 = JSAMPARRAY sample_data
; r11d = JDIMENSION start_col
; r12 = DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3
    push        rbx

    pxor        xmm6, xmm6              ; xmm6=(all 0's)
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    mov         rsi, r10
    mov         eax, r11d
    mov         rdi, r12
    mov         rcx, DCTSIZE/4
.convloop:
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)

    mov         rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 4*SIZEOF_JSAMPROW
    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         rcx
    jnz         short .convloop

    pop         rbx
    uncollect_args 3
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
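;
; Per coefficient, the loop below is roughly equivalent to the following
; scalar C sketch (the variable names are hypothetical; reciprocal[],
; correction[], and scale[] denote the three divisor-table rows selected
; by the RECIPROCAL/CORRECTION/SCALE macros defined below):
;
;   DCTELEM x = workspace[i];
;   DCTELEM sign = x >> 15;                      /* 0 if x >= 0, else -1 */
;   uint16_t t = (uint16_t)((x ^ sign) - sign);  /* t = abs(x) */
;   t += (uint16_t)correction[i];                /* pre-rounding bias */
;   t = (uint32_t)t * (uint16_t)reciprocal[i] >> 16;  /* pmulhuw */
;   t = (uint32_t)t * (uint16_t)scale[i] >> 16;       /* pmulhuw */
;   coef_block[i] = (JCOEF)((t ^ sign) - sign);  /* restore the sign */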
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3

    mov         rsi, r12
    mov         rdx, r11
    mov         rdi, r10
    mov         rax, DCTSIZE2/32
.quantloop:
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm6
    movdqa      xmm3, xmm7
    psraw       xmm4, (WORD_BIT-1)
    psraw       xmm5, (WORD_BIT-1)
    psraw       xmm6, (WORD_BIT-1)
    psraw       xmm7, (WORD_BIT-1)
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]       ; scale
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]

    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4
    psubw       xmm1, xmm5
    psubw       xmm2, xmm6
    psubw       xmm3, xmm7
    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 32*SIZEOF_DCTELEM
    add         rdx, byte 32*SIZEOF_DCTELEM
    add         rdi, byte 32*SIZEOF_JCOEF
    dec         rax
    jnz         near .quantloop

    uncollect_args 3
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32