;
; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;

; r10 = JSAMPARRAY sample_data
; r11d = JDIMENSION start_col
; r12 = DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3
    push        rbx

    pxor        xmm6, xmm6              ; xmm6=(all 0's)
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    mov         rsi, r10
    mov         eax, r11d
    mov         rdi, r12
    mov         rcx, DCTSIZE/4
.convloop:
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)

    mov         rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 4*SIZEOF_JSAMPROW
    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         rcx
    jnz         short .convloop

    pop         rbx
    uncollect_args 3
    pop         rbp
    ret
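
; Rough scalar sketch of what the routine above computes, written as
; hypothetical C for illustration only (the function name "convsamp" and the
; loop variables are illustrative; the argument names come from the prototype
; above):
;
;   void convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
;                 DCTELEM *workspace)
;   {
;     for (int row = 0; row < DCTSIZE; row++) {
;       JSAMPROW src = sample_data[row] + start_col;
;       for (int col = 0; col < DCTSIZE; col++)
;         /* zero-extend the 8-bit sample, then re-center it around zero;
;            adding 0xFF80 to a 16-bit value in [0, 255] is equivalent to
;            subtracting CENTERJSAMPLE (128) */
;         workspace[row * DCTSIZE + col] = (DCTELEM)src[col] - CENTERJSAMPLE;
;     }
;   }
;
; The SIMD version processes four rows per pass of .convloop: punpcklbw
; against xmm6 (all zeros) performs the zero extension, and paddw with the
; 0xFF80 constant in xmm7 performs the re-centering.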

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
; "How to optimize for the Pentium family of microprocessors"
; (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3

    mov         rsi, r12
    mov         rdx, r11
    mov         rdi, r10
    mov         rax, DCTSIZE2/32
.quantloop:
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm6
    movdqa      xmm3, xmm7
    psraw       xmm4, (WORD_BIT-1)
    psraw       xmm5, (WORD_BIT-1)
    psraw       xmm6, (WORD_BIT-1)
    psraw       xmm7, (WORD_BIT-1)
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]       ; scale
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]

    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4
    psubw       xmm1, xmm5
    psubw       xmm2, xmm6
    psubw       xmm3, xmm7
    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 32*SIZEOF_DCTELEM
    add         rdx, byte 32*SIZEOF_DCTELEM
    add         rdi, byte 32*SIZEOF_JCOEF
    dec         rax
    jnz         near .quantloop

    uncollect_args 3
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
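
; Rough scalar sketch of the per-coefficient arithmetic performed by
; jsimd_quantize_sse2 above, written as hypothetical C for illustration only.
; The function and variable names are illustrative; the layout of the
; divisors block (a reciprocal table, a correction table, and a scale table,
; each of DCTSIZE2 16-bit entries) is inferred from the RECIPROCAL/
; CORRECTION/SCALE macros above.
;
;   void quantize(JCOEFPTR coef_block, const DCTELEM *divisors,
;                 const DCTELEM *workspace)
;   {
;     const unsigned short *recip = (const unsigned short *)divisors;
;     const unsigned short *corr  = recip + DCTSIZE2;
;     const unsigned short *scale = recip + 2 * DCTSIZE2;
;
;     for (int i = 0; i < DCTSIZE2; i++) {
;       DCTELEM x = workspace[i];
;       int neg = (x < 0);
;       unsigned short t = (unsigned short)(neg ? -x : x);    /* pxor/psubw */
;       t = (unsigned short)(t + corr[i]);                     /* paddw      */
;       t = (unsigned short)(((unsigned)t * recip[i]) >> 16);  /* pmulhuw    */
;       t = (unsigned short)(((unsigned)t * scale[i]) >> 16);  /* pmulhuw    */
;       coef_block[i] = (JCOEF)(neg ? -(int)t : (int)t);       /* pxor/psubw */
;     }
;   }
;
; Each pmulhuw keeps only the high 16 bits of an unsigned 16x16-bit product,
; which is the ">> 16" above; dividing by a quantization step thus becomes a
; multiply by its fixed-point reciprocal, and the pxor/psubw pairs implement
; the absolute value before and the sign restoration after.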