;
; jquanti.asm - sample data conversion and quantization (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
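
; For reference, a rough scalar C sketch of what this routine computes (an
; illustration only, not part of the original source; CENTERJSAMPLE is 128
; in the IJG headers, which corresponds to the 0xFF80 words added by the
; paddw instructions below):
;
;   for (row = 0; row < DCTSIZE; row++) {
;     JSAMPROW elem = sample_data[row] + start_col;
;     for (col = 0; col < DCTSIZE; col++)
;       *workspace++ = (DCTELEM)(*elem++) - CENTERJSAMPLE;
;   }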

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    pxor        xmm6, xmm6              ; xmm6=(all 0's)
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (DCTELEM *)
    mov         ecx, DCTSIZE/4
    alignx      16, 7
.convloop:
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)

    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    add         esi, byte 4*SIZEOF_JSAMPROW
    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         ecx
    jnz         short .convloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/32
    alignx      16, 7
.quantloop:
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm6
    movdqa      xmm3, xmm7
    psraw       xmm4, (WORD_BIT-1)
    psraw       xmm5, (WORD_BIT-1)
    psraw       xmm6, (WORD_BIT-1)
    psraw       xmm7, (WORD_BIT-1)
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; scale
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]

    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4
    psubw       xmm1, xmm5
    psubw       xmm2, xmm6
    psubw       xmm3, xmm7
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    add         esi, byte 32*SIZEOF_DCTELEM
    add         edx, byte 32*SIZEOF_DCTELEM
    add         edi, byte 32*SIZEOF_JCOEF
    dec         eax
    jnz         near .quantloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
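
; For reference, a rough scalar C sketch of the quantization performed by
; jsimd_quantize_sse2 above (an illustration only, not part of the original
; source; it assumes the divisors table holds 64 reciprocals, then 64
; corrections, then 64 scale factors, as the RECIPROCAL/CORRECTION/SCALE
; macros suggest):
;
;   for (i = 0; i < DCTSIZE2; i++) {
;     int neg = workspace[i] < 0;                            /* psraw sign mask */
;     unsigned int t = neg ? -workspace[i] : workspace[i];   /* pxor + psubw = abs */
;     t = (t + (unsigned short)divisors[DCTSIZE2 + i]) & 0xFFFF;    /* paddw correction */
;     t = (t * (unsigned short)divisors[i]) >> 16;                  /* pmulhuw reciprocal */
;     t = (t * (unsigned short)divisors[2 * DCTSIZE2 + i]) >> 16;   /* pmulhuw scale */
;     coef_block[i] = (JCOEF)(neg ? -(int)t : (int)t);       /* pxor + psubw restores sign */
;   }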