;
; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64

; --------------------------------------------------------------------------
; Sample conversion: copy a block of samples into the float workspace,
; re-centering each unsigned sample to a signed value on the way.
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;
; Arguments as seen after "collect_args 3":
;   r10  = JSAMPARRAY sample_data   (array of row pointers)
;   r11d = JDIMENSION start_col     (column offset applied to every row)
;   r12  = FAST_FLOAT *workspace    (destination)
;
; Working registers inside the loop:
;   rsi = cursor into the row-pointer array (advances two rows per pass)
;   rdi = cursor into the workspace         (advances two rows per pass)
;   rax = start_col, zero-extended to 64 bits
;   rcx = passes remaining (DCTSIZE/2 passes, two rows each)
;   rbx, rdx = sample pointers of the two rows handled in this pass
;   xmm7 = 0x80 replicated in every byte
;
; The workspace is written with movaps, so it has to be 16-byte aligned.
; rbx is saved and restored here; the remaining register preservation is
; left to the collect_args / uncollect_args macros from the include files.

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
    push        rbp
    mov         rbp, rsp
    mov         rax, rsp
    collect_args 3
    push        rbx

    ; Move the arguments into the loop registers.
    mov         rdi, r12                ; rdi = workspace
    mov         rsi, r10                ; rsi = sample_data
    mov         eax, r11d               ; rax = start_col (upper half zeroed)
    mov         rcx, DCTSIZE/2          ; rcx = number of row pairs

    ; Build the per-byte centering constant without touching memory:
    ; all-ones words -> 0xFF80 words -> saturating pack to 0x80 bytes.
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7
    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

.next_row_pair:
    ; Fetch eight samples from each of the two current rows.
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; rbx = row 0 samples
    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; rdx = row 1 samples
    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

    ; ---- Row 0: samples 0..7 --------------------------------------------
    ; Each sample is steered into the top byte of its own dword; whatever
    ; ends up in the lower bytes (including stale xmm2 contents) is thrown
    ; away by the arithmetic right shift, which also sign-extends.
    psubb       xmm0, xmm7              ; xmm0 = centered bytes (01234567)
    punpcklbw   xmm0, xmm0              ; xmm0 = words, sample in high byte
    punpcklwd   xmm2, xmm0              ; xmm2 = dwords for samples 0..3
    punpckhwd   xmm0, xmm0              ; xmm0 = dwords for samples 4..7
    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2 = int32 (0123)
    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0 = int32 (4567)
    cvtdq2ps    xmm2, xmm2              ; xmm2 = float (0123)
    cvtdq2ps    xmm0, xmm0              ; xmm0 = float (4567)

    ; ---- Row 1: samples 8..F, same treatment ----------------------------
    psubb       xmm1, xmm7              ; xmm1 = centered bytes (89ABCDEF)
    punpcklbw   xmm1, xmm1              ; xmm1 = words, sample in high byte
    punpcklwd   xmm3, xmm1              ; xmm3 = dwords for samples 8..B
    punpckhwd   xmm1, xmm1              ; xmm1 = dwords for samples C..F
    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3 = int32 (89AB)
    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1 = int32 (CDEF)
    cvtdq2ps    xmm3, xmm3              ; xmm3 = float (89AB)
    cvtdq2ps    xmm1, xmm1              ; xmm1 = float (CDEF)

    ; Write both converted rows to the workspace.
    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

    ; Step both cursors past the two rows just handled.
    add         rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    add         rsi, byte 2*SIZEOF_JSAMPROW
    dec         rcx
    jnz         short .next_row_pair

    pop         rbx
    uncollect_args 3
    pop         rbp
    ret

; --------------------------------------------------------------------------
; Quantization: scale the float workspace by the divisor table, convert to
; integers and store the results as 16-bit coefficients.
;
; GLOBAL(void)
; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;
; Arguments as seen after "collect_args 3":
;   r10 = JCOEFPTR coef_block       (destination)
;   r11 = FAST_FLOAT *divisors      (per-coefficient multipliers)
;   r12 = FAST_FLOAT *workspace     (source)
;
; Working registers inside the loop:
;   rsi = workspace cursor, rdx = divisors cursor, rdi = coef_block cursor
;   rax = passes remaining (DCTSIZE2/16 passes, 16 coefficients each)
;
; Notes:
;   - The workspace values are *multiplied* by the divisors entries, so
;     the table presumably holds reciprocals prepared by the caller
;     (not visible in this file).
;   - cvtps2dq rounds according to the current MXCSR rounding mode.
;   - packssdw narrows to 16 bits with signed saturation.
;   - All three buffers are accessed with aligned moves/operands and must
;     therefore be 16-byte aligned.

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
    push        rbp
    mov         rbp, rsp
    mov         rax, rsp
    collect_args 3

    ; Move the arguments into the loop registers.
    mov         rdi, r10                ; rdi = coef_block
    mov         rdx, r11                ; rdx = divisors
    mov         rsi, r12                ; rsi = workspace
    mov         rax, DCTSIZE2/16        ; rax = number of 16-value groups

.next_coef_group:
    ; ---- First eight values of the group --------------------------------
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
    cvtps2dq    xmm0, xmm0              ; float -> int32
    cvtps2dq    xmm1, xmm1
    packssdw    xmm0, xmm1              ; xmm0 = eight int16 results

    ; ---- Second eight values of the group -------------------------------
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
    cvtps2dq    xmm2, xmm2              ; float -> int32
    cvtps2dq    xmm3, xmm3
    packssdw    xmm2, xmm3              ; xmm2 = eight int16 results

    ; Write the sixteen coefficients.
    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

    ; Step all three cursors past the sixteen values just handled.
    add         rdi, byte 16*SIZEOF_JCOEF
    add         rdx, byte 16*SIZEOF_FAST_FLOAT
    add         rsi, byte 16*SIZEOF_FAST_FLOAT
    dec         rax
    jnz         short .next_coef_group

    uncollect_args 3
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32