;
; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;

; r10 = JSAMPARRAY sample_data
; r11d = JDIMENSION start_col
; r12 = FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3
    push        rbx

    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7
    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

    mov         rsi, r10
    mov         eax, r11d
    mov         rdi, r12
    mov         rcx, DCTSIZE/2
.convloop:
    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

    psubb       xmm0, xmm7              ; xmm0=(01234567)
    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)

    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)

    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)

    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
    cvtdq2ps    xmm2, xmm2              ; xmm2=(0123)
    cvtdq2ps    xmm0, xmm0              ; xmm0=(4567)
    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
    cvtdq2ps    xmm3, xmm3              ; xmm3=(89AB)
    cvtdq2ps    xmm1, xmm1              ; xmm1=(CDEF)

    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

    add         rsi, byte 2*SIZEOF_JSAMPROW
    add         rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    dec         rcx
    jnz         short .convloop

    pop         rbx
    uncollect_args 3
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;

; r10 = JCOEFPTR coef_block
; r11 = FAST_FLOAT *divisors
; r12 = FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3

    mov         rsi, r12
    mov         rdx, r11
    mov         rdi, r10
    mov         rax, DCTSIZE2/16
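; Each iteration of the loop below quantizes 16 coefficients (one quarter of
; the 8x8 block).  A minimal C sketch of what one coefficient goes through,
; assuming the usual libjpeg-turbo float divisor layout in which divisors[]
; holds reciprocals of the quantization steps:
;
;   temp = workspace[i] * divisors[i];   /* mulps: the multiply performs the
;                                           quantization divide */
;   coef_block[i] = (JCOEF)temp;         /* cvtps2dq rounds per MXCSR
;                                           (round-to-nearest by default);
;                                           packssdw saturates to 16 bits */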
.quantloop:
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

    cvtps2dq    xmm0, xmm0
    cvtps2dq    xmm1, xmm1
    cvtps2dq    xmm2, xmm2
    cvtps2dq    xmm3, xmm3

    packssdw    xmm0, xmm1
    packssdw    xmm2, xmm3

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

    add         rsi, byte 16*SIZEOF_FAST_FLOAT
    add         rdx, byte 16*SIZEOF_FAST_FLOAT
    add         rdi, byte 16*SIZEOF_JCOEF
    dec         rax
    jnz         short .quantloop

    uncollect_args 3
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32