• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*!
2;* \copy
3;*     Copyright (c)  2010-2013, Cisco Systems
4;*     All rights reserved.
5;*
6;*     Redistribution and use in source and binary forms, with or without
7;*     modification, are permitted provided that the following conditions
8;*     are met:
9;*
10;*        * Redistributions of source code must retain the above copyright
11;*          notice, this list of conditions and the following disclaimer.
12;*
13;*        * Redistributions in binary form must reproduce the above copyright
14;*          notice, this list of conditions and the following disclaimer in
15;*          the documentation and/or other materials provided with the
16;*          distribution.
17;*
18;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29;*     POSSIBILITY OF SUCH DAMAGE.
30;*
31;*
32;*  memzero.asm
33;*
34;*  Abstract
35;*     cavlc
36;*
37;*  History
38;*      09/08/2010 Created
39;*
40;*
41;*************************************************************************/
42
43%include "asm_inc.asm"
44
; The constant tables below go to .rodata, except in 32-bit PIC builds
; (X86_32_PICASM) where they are emitted into .text instead.
%ifndef X86_32_PICASM
SECTION .rodata align=16
%else
SECTION .text align=16
%endif

align 16

; pshufb control vector: destination byte i takes source byte 15 - i, i.e. the
; byte order of a 16-byte register is reversed.
wels_shufb_rev:
    db 15, 14, 13, 12, 11, 10,  9,  8
    db  7,  6,  5,  4,  3,  2,  1,  0
55
; Zero-run lookup table, indexed by a 4-bit mask, 4 bytes per entry.
; Walking the mask from bit 0 upwards, byte k of an entry is the number of
; clear bits in front of the k-th set bit; the byte after the last set bit is
; the number of clear bits left over, which count towards whatever set bit
; comes next. A mask with all four bits set would need a fifth (zero) byte.
; That byte is not stored: it is either picked up from whatever follows the
; final entry or implied by zero-initializing the location being read into.
wels_cavlc_param_cal_run_lut:
    db 4, 0, 0, 0,   0, 3, 0, 0     ; masks 0000b, 0001b
    db 1, 2, 0, 0,   0, 0, 2, 0     ; masks 0010b, 0011b
    db 2, 1, 0, 0,   0, 1, 1, 0     ; masks 0100b, 0101b
    db 1, 0, 1, 0,   0, 0, 0, 1     ; masks 0110b, 0111b
    db 3, 0, 0, 0,   0, 2, 0, 0     ; masks 1000b, 1001b
    db 1, 1, 0, 0,   0, 0, 1, 0     ; masks 1010b, 1011b
    db 2, 0, 0, 0,   0, 1, 0, 0     ; masks 1100b, 1101b
    db 1, 0, 0, 0,   0, 0, 0, 0     ; masks 1110b, 1111b
; pshufb control lookup table, indexed by a 4-bit mask, 8 bytes per entry.
; Each entry compacts a vector of four 16-bit words: for every set mask bit
; k (taken from bit 0 upwards) the two byte indices of word 3 - k are emitted,
; so the selected words end up packed at the front in reverse order. Unused
; trailing slots are filled with byte index 0.
%define CPC_SEL_W0   0, 1
%define CPC_SEL_W1   2, 3
%define CPC_SEL_W2   4, 5
%define CPC_SEL_W3   6, 7
%define CPC_SEL_PAD  0, 0
wels_cavlc_param_cal_shufb_lut:
    db CPC_SEL_PAD, CPC_SEL_PAD, CPC_SEL_PAD, CPC_SEL_PAD   ; 0000b
    db CPC_SEL_W3,  CPC_SEL_PAD, CPC_SEL_PAD, CPC_SEL_PAD   ; 0001b
    db CPC_SEL_W2,  CPC_SEL_PAD, CPC_SEL_PAD, CPC_SEL_PAD   ; 0010b
    db CPC_SEL_W3,  CPC_SEL_W2,  CPC_SEL_PAD, CPC_SEL_PAD   ; 0011b
    db CPC_SEL_W1,  CPC_SEL_PAD, CPC_SEL_PAD, CPC_SEL_PAD   ; 0100b
    db CPC_SEL_W3,  CPC_SEL_W1,  CPC_SEL_PAD, CPC_SEL_PAD   ; 0101b
    db CPC_SEL_W2,  CPC_SEL_W1,  CPC_SEL_PAD, CPC_SEL_PAD   ; 0110b
    db CPC_SEL_W3,  CPC_SEL_W2,  CPC_SEL_W1,  CPC_SEL_PAD   ; 0111b
    db CPC_SEL_W0,  CPC_SEL_PAD, CPC_SEL_PAD, CPC_SEL_PAD   ; 1000b
    db CPC_SEL_W3,  CPC_SEL_W0,  CPC_SEL_PAD, CPC_SEL_PAD   ; 1001b
    db CPC_SEL_W2,  CPC_SEL_W0,  CPC_SEL_PAD, CPC_SEL_PAD   ; 1010b
    db CPC_SEL_W3,  CPC_SEL_W2,  CPC_SEL_W0,  CPC_SEL_PAD   ; 1011b
    db CPC_SEL_W1,  CPC_SEL_W0,  CPC_SEL_PAD, CPC_SEL_PAD   ; 1100b
    db CPC_SEL_W3,  CPC_SEL_W1,  CPC_SEL_W0,  CPC_SEL_PAD   ; 1101b
    db CPC_SEL_W2,  CPC_SEL_W1,  CPC_SEL_W0,  CPC_SEL_PAD   ; 1110b
    db CPC_SEL_W3,  CPC_SEL_W2,  CPC_SEL_W1,  CPC_SEL_W0    ; 1111b
%undef CPC_SEL_W0
%undef CPC_SEL_W1
%undef CPC_SEL_W2
%undef CPC_SEL_W3
%undef CPC_SEL_PAD
97
98
%ifdef X86_32

; Eight bytes of value 8; CavlcParamCal_sse2 loads them with movq and adds
; them to a vector of byte positions.
align 16
sse2_b8:
    times 8 db 8

; 16-byte AND mask that keeps every byte except byte 7, which is cleared.
align 16
sse2_b_1:
    times 7 db -1
    db 0
    times 8 db -1
106
; byte_1pos_table: 256 entries of 8 bytes each, indexed by an 8-bit mask.
;   bytes 0..6 : bit positions (7 down to 0) of the set bits of the index,
;                highest position first, padded with zeros
;   byte  7    : number of set bits in the index
; Only seven position bytes fit. For index 255 the eighth position (bit 0) is
; therefore not stored and byte 7 holds the count 8.
; The table is generated by the preprocessor instead of being spelled out.
align 16
byte_1pos_table:
%assign b1p_mask 0
%rep 256
    %assign b1p_count 0
    %assign b1p_bit 7
    %rep 8
        %if (b1p_mask >> b1p_bit) & 1
            %if b1p_count < 7
    db b1p_bit
            %endif
            %assign b1p_count b1p_count+1
        %endif
        %assign b1p_bit b1p_bit-1
    %endrep
    %if b1p_count < 7
    times (7 - b1p_count) db 0
    %endif
    db b1p_count
    %assign b1p_mask b1p_mask+1
%endrep
%undef b1p_mask
%undef b1p_count
%undef b1p_bit

%endif ; X86_32
367
;***********************************************************************
; Code
;***********************************************************************
SECTION .text


%ifdef X86_32

;***********************************************************************
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;
; Collects the non-zero coefficients of coffLevel, highest index first.
;   coffLevel    : 16-bit coefficients. If endIdx == 3 only the first 8 bytes
;                  are read (movq); otherwise 32 bytes are read with movdqa,
;                  so the buffer must be 16-byte aligned in that case.
;   run          : receives 16 bytes with an aligned store (movdqa); byte i is
;                  pos[i] - pos[i+1] - 1, where pos[] is the list of non-zero
;                  positions in descending order. Not written when every
;                  coefficient is zero.
;   Level        : receives the non-zero coefficients, highest index first.
;   total_coeffs : receives the number of non-zero coefficients (32-bit store).
; Returns (eax)  : highest non-zero index + 1 - *total_coeffs, or 0 when every
;                  coefficient is zero.
; Uses xmm0-xmm7; ebx, edi and esi are saved and restored.
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
    push ebx
    push edi
    push esi
    %assign push_num 3
    INIT_X86_32_PIC ebp

    mov         eax,    arg1    ;coffLevel
    mov         edi,    arg3    ;Level
    mov         ebx,    arg5    ;endIdx
    cmp         ebx,    3
    jne         .Level16
    pxor        xmm1,   xmm1
    movq        xmm0,   [eax]
    jmp         .Cal_begin
.Level16:
    movdqa      xmm0,   [eax]
    movdqa      xmm1,   [eax+16]
.Cal_begin:
    ; Build a 16-bit mask with one bit per coefficient, set when it is non-zero.
    ; packsswb saturates, so a non-zero word always yields a non-zero byte.
    packsswb    xmm0,   xmm1
    movdqa      xmm4,   xmm0
    pxor        xmm3,   xmm3
    pcmpgtb     xmm0,   xmm3    ; bytes > 0
    pcmpgtb     xmm3,   xmm4    ; bytes < 0
    por         xmm0,   xmm3
    pmovmskb    edx,    xmm0
    test        edx,    edx
    jnz         .hasCoeffs
    ; Every coefficient is zero: report zero coefficients and zero total
    ; zeros instead of leaving eax and *total_coeffs unset.
    mov         edx,    arg4    ;total_coeffs
    mov         dword [edx], 0
    xor         eax,    eax
    jmp near    .return

.hasCoeffs:
    movdqa      xmm6,   [pic(sse2_b_1)] ; mask that clears byte 7 (the count byte)
    pcmpeqw     xmm7,   xmm7            ;generate -1

    ; High half: coefficients 8..15, mask bits in dh.
    movzx     ebx,  dh
    lea       ebx,  [pic(byte_1pos_table+8*ebx)]
    movq      xmm0, [ebx]
    pextrw    ecx,  xmm0, 3
    shr       ecx,  8                   ; ecx = non-zero count of the high half
    mov       dh,   cl                  ; keep the count across the loop
    cmp       ecx,  8
    jne       .loopHighFind0
    dec       ecx                       ; the table stores only 7 positions; the
                                        ; 8th (position 0) is handled after the loop
.loopHighFind0:
    test      ecx,  ecx
    jz        .loopHighFind0End
    movzx     esi,  byte [ebx]          ; position within the high half
    movzx     esi,  word [eax+2*esi+16]
    mov       [edi], si
    add       edi,  2
    inc       ebx
    dec       ecx
    jmp       .loopHighFind0
.loopHighFind0End:
    mov       cl,   dh                  ; ecx = high count again (ecx was 0)
    pand      xmm0, xmm6                ; drop the count byte from the positions
    cmp       cl,   8
    jne       .LowByteFind0
    movzx     esi,  word [eax+16]       ; 8th entry of a full half: coefficient 8
    mov       [edi], si
    add       edi,  2

    ; Low half: coefficients 0..7, mask bits in dl.
.LowByteFind0:
    and       edx,  0xff
    lea       ebx,  [pic(byte_1pos_table+8*edx)]
    movq      xmm1, [ebx]
    pextrw    esi,  xmm1, 3
    or        esi,  0xff
    or        ecx,  0xff00
    and       ecx,  esi                 ; ch = low count, cl = high count
    shr       esi,  8                   ; esi = low count
    pand      xmm1, xmm6
    cmp       esi,  8
    jne       .loopLowFind0
    dec       esi                       ; as above: position 0 is stored after the loop
.loopLowFind0:
    test      esi,  esi
    jz        .loopLowFind0End
    movzx     edx,  byte [ebx]
    movzx     edx,  word [eax+2*edx]
    mov       [edi], dx
    add       edi,  2
    inc       ebx
    dec       esi
    jmp       .loopLowFind0
.loopLowFind0End:
    cmp       ch,   8
    jne       .getLevelEnd
    movzx     edx,  word [eax]          ; 8th entry of a full half: coefficient 0
    mov       [edi], dx
.getLevelEnd:
    mov      edx,   arg4  ;total_coeffs
    movzx    ebx,   cl                  ; ebx = high count
    add      cl,    ch                  ; cl = total non-zero count
    movzx    esi,   cl
    mov      [edx], esi                 ; store all 32 bits of *total_coeffs
;getRun
    ; Merge both position lists into one descending list in xmm0:
    ; high-half positions (+8) first, low-half positions right behind them.
    movq     xmm5, [pic(sse2_b8)]
    paddb    xmm0, xmm5
    pxor     xmm2, xmm2
    pxor     xmm3, xmm3
    mov      eax,  8
    sub      eax,  ebx
    shl      eax,  3                    ; eax = (8 - high count) * 8 bits
    shl      ebx,  3                    ; ebx = high count * 8 bits
    pinsrw   xmm2, ebx, 0
    pinsrw   xmm3, eax, 0
    psllq    xmm0, xmm3                 ; clear the padding bytes above the
    psrlq    xmm0, xmm3                 ; valid high-half positions
    movdqa   xmm4, xmm1
    psllq    xmm1, xmm2                 ; low positions that still fit in qword 0
    psrlq    xmm4, xmm3                 ; low positions spilling into qword 1
    punpcklqdq xmm1, xmm4
    por      xmm0,  xmm1

    pextrw   eax,   xmm0, 0
    and      eax,   0xff                ; highest non-zero position
    inc      eax
    sub      al,    cl                  ; return value: position + 1 - total count
    movdqa   xmm1,  xmm0
    paddb    xmm1,  xmm7                ; pos[i] - 1
    psrldq   xmm0,  1                   ; pos[i+1]
    psubb    xmm1,  xmm0                ; run[i] = pos[i] - 1 - pos[i+1]
    mov      ecx,   arg2 ;run
    movdqa   [ecx], xmm1
;getRunEnd
.return:
    DEINIT_X86_32_PIC
    pop esi
    pop edi
    pop ebx
    ret
%endif ;%ifdef X86_32
517
;***********************************************************************
;int32_t CavlcParamCal_sse42(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;
; Builds a bitmask of the non-zero coefficients in reversed order (bit 0 is
; coefficient 15), then consumes it four bits at a time. For each group the
; two lookup tables give a pshufb vector that packs the non-zero words and
; the zero runs belonging to them.
;   coffLevel    : 8 bytes are read when endIdx <= 3, otherwise 32 bytes with
;                  aligned loads.
;   run          : 16 bytes are zeroed up front with an aligned store, then
;                  filled with the zero runs.
;   Level        : receives the non-zero coefficients, highest index first.
;   total_coeffs : receives the number of non-zero coefficients.
; Returns 16 - (number of non-zero coefficients) - (index of the lowest set
; mask bit), or 0 when the mask is empty.
;***********************************************************************

WELS_EXTERN CavlcParamCal_sse42
%define i_endidxd      dword arg5d

%ifdef X86_32
    push            r3
    push            r4
    push            r5
    push            r6
    %assign push_num 4
    %define p_coeff_level r3
    %define p_run r6
    %define p_level r2
  %ifndef X86_32_PICASM
    %define p_total_coeffs r0
  %else
    %define p_total_coeffs r1           ; r0 is handed to the PIC setup below
  %endif
    %define r_mask  r5
    %define r_maskd r5d
    %define r_scr  r1
    %define r_scrd r1d
    %define r_scrb r1b
    %define p_shufb_lut pic(wels_cavlc_param_cal_shufb_lut)
    %define p_run_lut   pic(wels_cavlc_param_cal_run_lut)
    mov             p_coeff_level, arg1
    mov             p_run, arg2
    mov             p_level, arg3
    mov             p_total_coeffs, arg4
%elifdef WIN64
    push            rbx
    %assign push_num 1
    ; rcx/ecx has to be free because only cl is accepted as a variable shift
    ; amount, so the coefficient pointer is moved from r0 to r6 and r0 becomes
    ; the scratch register.
    %define p_coeff_level r6
    %define p_run r1
    %define p_level r2
    %define p_total_coeffs r3
    %define r_mask  rbx
    %define r_maskd ebx
    %define r_scr  r0
    %define r_scrd r0d
    %define r_scrb r0b
    %define p_shufb_lut r5
    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
    mov             p_coeff_level, r0
    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
%else
    %assign push_num 0
    %define p_coeff_level r0
    %define p_run r1
    %define p_level r2
    %define p_total_coeffs r3
    %define i_total_zeros r6
    %define r_mask  rax
    %define r_maskd eax
    %define p_shufb_lut r5
    %define p_run_lut (p_shufb_lut + (wels_cavlc_param_cal_run_lut - wels_cavlc_param_cal_shufb_lut))
    lea             p_shufb_lut, [wels_cavlc_param_cal_shufb_lut]
%endif
    INIT_X86_32_PIC_NOPRESERVE r0

    ; Pack the coefficients to bytes so that one mask bit per coefficient can
    ; be extracted. The input is assumed to be 16-byte aligned and at least
    ; 32 bytes long when endIdx > 3, and at least 8 bytes long otherwise;
    ; anything beyond endIdx is assumed to be zero. These assumptions are
    ; inherited from earlier implementations.
    pxor            xmm1, xmm1
    cmp             i_endidxd, 3
    jle             .load_quad
    movdqa          xmm0, [p_coeff_level]
    packsswb        xmm0, [p_coeff_level + 16]
    jmp             .loaded
.load_quad:
    movq            xmm0, [p_coeff_level]
    packsswb        xmm0, xmm1
.loaded:
    movdqa          [p_run], xmm1                           ; Runs start out as zero; implied zeros may be read back later.
    pcmpeqb         xmm0, xmm1                              ; 0xFF where the coefficient is zero.
    pshufb          xmm0, [pic(wels_shufb_rev)]             ; Reverse so that bit 0 is the highest coefficient.
    pmovmskb        r_maskd, xmm0
    xor             r_maskd, 0FFFFh                         ; Invert: set bits now mark non-zero coefficients.
%undef i_endidxd
%define r_aux  r4
%define r_auxd r4d
    popcnt          r_auxd, r_maskd                         ; Sets ZF when the mask is empty (tested by jz below).
    mov             [p_total_coeffs], r_auxd
    ; p_total_coeffs is no longer needed; reuse its register.
%ifidni p_total_coeffs, rcx
    %define r_scr  rcx
    %define r_scrd ecx
    %define r_scrb cl
%else
    %xdefine i_total_zeros p_total_coeffs
%endif
%undef p_total_coeffs
%ifdef X86_32_PICASM
    push            r_aux
    %undef i_total_zeros
    %define i_total_zeros dword [esp]
%else
    mov             i_total_zeros, r_aux
%endif
    jz              .finish                                 ; ZF still comes from popcnt; mov/push leave flags alone.
    bsf             r_scrd, r_maskd                         ; Index of the lowest set bit.
    lea             r_aux, [r_aux + r_scr - 16]
    neg             r_aux                                   ; 16 - count - lowest set bit index.
    mov             i_total_zeros, r_aux
    ; Drop the leading all-zero groups, in whole groups of 4 so that the
    ; loads stay aligned and no store lands out of bounds.
    and             r_scrd, -4
    shr             r_maskd, r_scrb
    shl             r_scrd, 1                               ; Coefficients skipped -> bytes skipped.
    sub             p_coeff_level, r_scr
    ; First group of four that contains a non-zero coefficient.
    mov             r_scr, r_mask
    and             r_scrd, 0Fh
    movq            xmm0, [p_coeff_level + 24]
    movq            xmm1, [p_shufb_lut + 8 * r_scr]
    pshufb          xmm0, xmm1
    mov             r_auxd, [p_run_lut + 4 * r_scr]
    shr             r_auxd, 8                               ; Discard the zero run in front of the first coefficient.
    movq            [p_level], xmm0                         ; Levels of the first group.
    mov             [p_run], r_auxd                         ; Zero runs collected so far.
    shr             r_maskd, 4
    jz              .finish
.next_quad:
    ; Advance the output pointers by the number of levels just written.
    popcnt          r_scrd, r_scrd
    add             p_run, r_scr
    lea             p_level, [p_level + 2 * r_scr]
    ; Next group of four.
    mov             r_scr, r_mask
    and             r_scrd, 0Fh
    sub             p_coeff_level, 8
    movq            xmm0, [p_coeff_level + 24]
    movq            xmm1, [p_shufb_lut + 8 * r_scr]
    pshufb          xmm0, xmm1
    movzx           r_auxd, byte [p_run - 1]
    add             r_auxd, [p_run_lut + 4 * r_scr]         ; Extend the previous run and append any new runs.
    movq            [p_level], xmm0                         ; Levels of this group (possibly none).
    mov             [p_run - 1], r_auxd                     ; Write back the previous run together with the new ones.
    shr             r_maskd, 4
    jnz             .next_quad
.finish:
%ifnidni retrq, i_total_zeros
  %ifdef X86_32_PICASM
    pop             retrq
  %else
    mov             retrq, i_total_zeros
  %endif
%endif
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop             r6
    pop             r5
    pop             r4
    pop             r3
%elifdef WIN64
    pop             rbx
%endif
    ret
%undef p_coeff_level
%undef p_run
%undef p_level
%undef i_total_zeros
%undef r_mask
%undef r_maskd
%undef r_scr
%undef r_scrd
%undef r_scrb
%undef r_aux
%undef r_auxd
%undef p_shufb_lut
%undef p_run_lut
696