;
; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION SEG_TEXT
    BITS    32

;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT * workspace);
;
; Every pass of the loop below handles two consecutive row pointers taken
; from sample_data.  From each row it reads 8 bytes starting at column
; start_col, subtracts 0x80 from every byte, sign-extends the bytes to
; dwords, converts them to single-precision floats and writes them to
; workspace.  The loop runs DCTSIZE/2 times.
;
; The stores use movaps, so workspace must be 16-byte aligned.
;
; Arguments are read from the stack relative to ebp.
; Saved and restored here : ebp, ebx, esi, edi
; Overwritten             : eax, ecx, edx, xmm0-xmm3, xmm7, flags
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT * workspace

    align   16
    global  EXTN(jsimd_convsamp_float_sse2) PRIVATE

EXTN(jsimd_convsamp_float_sse2):
    push    ebp
    mov     ebp, esp
    push    ebx
    push    esi
    push    edi
    ; ecx and edx are used below without being saved.

    ; Fetch the three arguments.
    mov     eax, JDIMENSION [start_col]         ; eax = column offset
    mov     edi, POINTER [workspace]            ; edi = output (FAST_FLOAT *)
    mov     esi, JSAMPARRAY [sample_data]       ; esi = row table (JSAMPROW *)

    ; Build the centering constant without touching memory.
    pcmpeqw  xmm7, xmm7                 ; every word = 0xFFFF
    psllw    xmm7, 7                    ; every word = 0xFF80
    packsswb xmm7, xmm7                 ; every byte = 0x80 (PB_CENTERJSAMPLE)

    mov     ecx, DCTSIZE/2              ; ecx = passes left (two rows each)
    alignx  16,7
.rowpair:
    ; Pick up the two row pointers for this pass.
    mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; first row  (JSAMPLE *)
    mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; second row (JSAMPLE *)

    ; Load 8 samples from each row.
    movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    ; Remove the 0x80 bias: unsigned bytes become signed bytes.
    psubb   xmm0, xmm7                  ; xmm0 = row0 samples (0 1 2 3 4 5 6 7)
    psubb   xmm1, xmm7                  ; xmm1 = row1 samples (8 9 A B C D E F)

    ; Duplicate each byte so that every word carries its sample in the
    ; high byte ('*' marks a don't-care byte).
    punpcklbw xmm0, xmm0                ; xmm0 = (*0 *1 *2 *3 *4 *5 *6 *7)
    punpcklbw xmm1, xmm1                ; xmm1 = (*8 *9 *A *B *C *D *E *F)

    ; Widen to dwords with the sample in the top byte.  The low word of each
    ; xmm2/xmm3 dword is whatever those registers held before; it is thrown
    ; away by the arithmetic shift that follows.  The low-half unpack must
    ; come before the high-half unpack, which overwrites its own source.
    punpcklwd xmm2, xmm0                ; xmm2 = (***0 ***1 ***2 ***3)
    punpckhwd xmm0, xmm0                ; xmm0 = (***4 ***5 ***6 ***7)
    punpcklwd xmm3, xmm1                ; xmm3 = (***8 ***9 ***A ***B)
    punpckhwd xmm1, xmm1                ; xmm1 = (***C ***D ***E ***F)

    ; Arithmetic right shift brings the top byte down and sign-extends it
    ; (a shift by 24, assuming DWORD_BIT=32 and BYTE_BIT=8 as the names
    ; suggest - defined in the include files).
    psrad   xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2 = int32 (0 1 2 3)
    psrad   xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0 = int32 (4 5 6 7)
    psrad   xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3 = int32 (8 9 A B)
    psrad   xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1 = int32 (C D E F)

    ; Integer -> single-precision float.
    cvtdq2ps xmm2, xmm2                 ; xmm2 = float (0 1 2 3)
    cvtdq2ps xmm0, xmm0                 ; xmm0 = float (4 5 6 7)
    cvtdq2ps xmm3, xmm3                 ; xmm3 = float (8 9 A B)
    cvtdq2ps xmm1, xmm1                 ; xmm1 = float (C D E F)

    ; Write both converted rows to the workspace (aligned stores).
    movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

    ; Step to the next pair of rows.  dec must stay last: jnz tests its ZF.
    add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    add     esi, byte 2*SIZEOF_JSAMPROW
    dec     ecx
    jnz     short .rowpair

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    ret


; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                            FAST_FLOAT * workspace);
;
; Every pass of the loop below takes 16 floats from workspace, multiplies
; each by the float at the same position in divisors, converts the products
; to 32-bit integers with cvtps2dq (which rounds according to the current
; MXCSR rounding mode), narrows them to 16 bits with signed saturation and
; stores the 16 results into coef_block.  The loop runs DCTSIZE2/16 times.
;
; NOTE(review): since the table is multiplied rather than divided, it
; presumably holds reciprocal/scaled values - confirm against the code that
; fills it.
;
; All three buffers are accessed with aligned instructions (movaps, mulps
; with a memory operand, movdqa) and therefore must be 16-byte aligned.
;
; Arguments are read from the stack relative to ebp.
; Saved and restored here : ebp, esi, edi
; Overwritten             : eax, edx, xmm0-xmm3, flags
; Not used                : ebx, ecx
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT * divisors
%define workspace       ebp+16          ; FAST_FLOAT * workspace

    align   16
    global  EXTN(jsimd_quantize_float_sse2) PRIVATE

EXTN(jsimd_quantize_float_sse2):
    push    ebp
    mov     ebp, esp
    push    esi
    push    edi
    ; edx is used below without being saved.

    ; Fetch the three arguments.
    mov     edi, JCOEFPTR [coef_block]          ; edi = output coefficients
    mov     edx, POINTER [divisors]             ; edx = multiplier table
    mov     esi, POINTER [workspace]            ; esi = input floats

    mov     eax, DCTSIZE2/16            ; eax = passes left (16 values each)
    alignx  16,7
.quantize16:
    ; Load 16 workspace values.
    movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]

    ; Scale each value by the matching table entry.
    mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

    ; Float -> int32, rounding per MXCSR.
    cvtps2dq xmm0, xmm0
    cvtps2dq xmm1, xmm1
    cvtps2dq xmm2, xmm2
    cvtps2dq xmm3, xmm3

    ; int32 -> int16 with signed saturation; two registers of 8 words each.
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3

    ; Store the 16 quantized coefficients (aligned stores).
    movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
    movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

    ; Step all three pointers past the 16 values just handled.
    ; dec must stay last: jnz tests its ZF.
    add     edi, byte 16*SIZEOF_JCOEF
    add     edx, byte 16*SIZEOF_FAST_FLOAT
    add     esi, byte 16*SIZEOF_FAST_FLOAT
    dec     eax
    jnz     short .quantize16

    pop     edi
    pop     esi
    pop     ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align   16