• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
2; 2021-07-21: Igor Pavlov : Public domain
3;
4
5ifndef x64
6; x64=1
7; .err <x64_IS_REQUIRED>
8endif
9
10include 7zAsm.asm
11
12MY_ASM_START
13
14_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'
15
16MY_ALIGN macro num:req
17        align  num
18endm
19
20MY_ALIGN_32 macro
21        MY_ALIGN 32
22endm
23
24MY_ALIGN_64 macro
25        MY_ALIGN 64
26endm
27
28
29t0_L    equ x0_L
30t0_x    equ x0
31t0      equ r0
32t1_x    equ x3
33t1      equ r3
34
35cp_x    equ t1_x
36cp_r    equ t1
37m       equ x5
38m_r     equ r5
39len_x   equ x6
40len     equ r6
41diff_x  equ x7
42diff    equ r7
43len0    equ r10
44len1_x  equ x11
45len1    equ r11
46maxLen_x equ x12
47maxLen  equ r12
48d       equ r13
49ptr0    equ r14
50ptr1    equ r15
51
52d_lim       equ m_r
53cycSize     equ len_x
54hash_lim    equ len0
55delta1_x    equ len1_x
56delta1_r    equ len1
57delta_x     equ maxLen_x
58delta_r     equ maxLen
59hash        equ ptr0
60src         equ ptr1
61
62
63
64if (IS_LINUX gt 0)
65
66; r1 r2  r8 r9        : win32
67; r7 r6  r2 r1  r8 r9 : linux
68
69lenLimit        equ r8
70lenLimit_x      equ x8
71; pos_r           equ r2
72pos             equ x2
73cur             equ r1
74son             equ r9
75
76else
77
78lenLimit        equ REG_ABI_PARAM_2
79lenLimit_x      equ REG_ABI_PARAM_2_x
80pos             equ REG_ABI_PARAM_1_x
81cur             equ REG_ABI_PARAM_0
82son             equ REG_ABI_PARAM_3
83
84endif
85
86
87if (IS_LINUX gt 0)
88    maxLen_OFFS         equ  (REG_SIZE * (6 + 1))
89else
90    cutValue_OFFS       equ  (REG_SIZE * (8 + 1 + 4))
91    d_OFFS              equ  (REG_SIZE + cutValue_OFFS)
92    maxLen_OFFS         equ  (REG_SIZE + d_OFFS)
93endif
94    hash_OFFS           equ  (REG_SIZE + maxLen_OFFS)
95    limit_OFFS          equ  (REG_SIZE + hash_OFFS)
96    size_OFFS           equ  (REG_SIZE + limit_OFFS)
97    cycPos_OFFS         equ  (REG_SIZE + size_OFFS)
98    cycSize_OFFS        equ  (REG_SIZE + cycPos_OFFS)
99    posRes_OFFS         equ  (REG_SIZE + cycSize_OFFS)
100
101if (IS_LINUX gt 0)
102else
103    cutValue_PAR        equ  [r0 + cutValue_OFFS]
104    d_PAR               equ  [r0 + d_OFFS]
105endif
106    maxLen_PAR          equ  [r0 + maxLen_OFFS]
107    hash_PAR            equ  [r0 + hash_OFFS]
108    limit_PAR           equ  [r0 + limit_OFFS]
109    size_PAR            equ  [r0 + size_OFFS]
110    cycPos_PAR          equ  [r0 + cycPos_OFFS]
111    cycSize_PAR         equ  [r0 + cycSize_OFFS]
112    posRes_PAR          equ  [r0 + posRes_OFFS]
113
114
115    cutValue_VAR        equ  DWORD PTR [r4 + 8 * 0]
116    cutValueCur_VAR     equ  DWORD PTR [r4 + 8 * 0 + 4]
117    cycPos_VAR          equ  DWORD PTR [r4 + 8 * 1 + 0]
118    cycSize_VAR         equ  DWORD PTR [r4 + 8 * 1 + 4]
119    hash_VAR            equ  QWORD PTR [r4 + 8 * 2]
120    limit_VAR           equ  QWORD PTR [r4 + 8 * 3]
121    size_VAR            equ  QWORD PTR [r4 + 8 * 4]
122    distances           equ  QWORD PTR [r4 + 8 * 5]
123    maxLen_VAR          equ  QWORD PTR [r4 + 8 * 6]
124
125    Old_RSP             equ  QWORD PTR [r4 + 8 * 7]
126    LOCAL_SIZE          equ  8 * 8
127
128COPY_VAR_32 macro dest_var, src_var
129        mov     x3, src_var
130        mov     dest_var, x3
131endm
132
133COPY_VAR_64 macro dest_var, src_var
134        mov     r3, src_var
135        mov     dest_var, r3
136endm
137
138
139; MY_ALIGN_64
140MY_PROC GetMatchesSpecN_2, 13
141MY_PUSH_PRESERVED_ABI_REGS
142        mov     r0, RSP
143        lea     r3, [r0 - LOCAL_SIZE]
144        and     r3, -64
145        mov     RSP, r3
146        mov     Old_RSP, r0
147
148if (IS_LINUX gt 0)
149        mov     d,            REG_ABI_PARAM_5       ; r13 = r9
150        mov     cutValue_VAR, REG_ABI_PARAM_4_x     ;     = r8
151        mov     son,          REG_ABI_PARAM_3       ;  r9 = r1
152        mov     r8,           REG_ABI_PARAM_2       ;  r8 = r2
153        mov     pos,          REG_ABI_PARAM_1_x     ;  r2 = x6
154        mov     r1,           REG_ABI_PARAM_0       ;  r1 = r7
155else
156        COPY_VAR_32 cutValue_VAR, cutValue_PAR
157        mov     d, d_PAR
158endif
159
160        COPY_VAR_64 limit_VAR, limit_PAR
161
162        mov     hash_lim, size_PAR
163        mov     size_VAR, hash_lim
164
165        mov     cp_x, cycPos_PAR
166        mov     hash, hash_PAR
167
168        mov     cycSize, cycSize_PAR
169        mov     cycSize_VAR, cycSize
170
171        ; we want cur in (rcx). So we change the cur and lenLimit variables
172        sub     lenLimit, cur
173        neg     lenLimit_x
174        inc     lenLimit_x
175
176        mov     t0_x, maxLen_PAR
177        sub     t0, lenLimit
178        mov     maxLen_VAR, t0
179
180        jmp     main_loop
181
182MY_ALIGN_64
183fill_empty:
184        ; ptr0 = *ptr1 = kEmptyHashValue;
185        mov     QWORD PTR [ptr1], 0
186        inc     pos
187        inc     cp_x
188        mov     DWORD PTR [d - 4], 0
189        cmp     d, limit_VAR
190        jae     fin
191        cmp     hash, hash_lim
192        je      fin
193
194; MY_ALIGN_64
195main_loop:
196        ; UInt32 delta = *hash++;
197        mov     diff_x, [hash]  ; delta
198        add     hash, 4
199        ; mov     cycPos_VAR, cp_x
200
201        inc     cur
202        add     d, 4
203        mov     m, pos
204        sub     m, diff_x;      ; matchPos
205
206        ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;
207        lea     ptr1, [son + 8 * cp_r]
208        ; mov     cycSize, cycSize_VAR
209        cmp     pos, cycSize
210        jb      directMode      ; if (pos < cycSize_VAR)
211
212        ; CYC MODE
213
214        cmp     diff_x, cycSize
215        jae     fill_empty      ; if (delta >= cycSize_VAR)
216
217        xor     t0_x, t0_x
218        mov     cycPos_VAR, cp_x
219        sub     cp_x, diff_x
220        ; jae     prepare_for_tree_loop
221        ; add     cp_x, cycSize
222        cmovb   t0_x, cycSize
223        add     cp_x, t0_x      ; cp_x +=  (cycPos < delta ? cycSize : 0)
224        jmp     prepare_for_tree_loop
225
226
227directMode:
228        cmp     diff_x,  pos
229        je      fill_empty      ; if (delta == pos)
230        jae     fin_error       ; if (delta >= pos)
231
232        mov     cycPos_VAR, cp_x
233        mov     cp_x, m
234
235prepare_for_tree_loop:
236        mov     len0, lenLimit
237        mov     hash_VAR, hash
238        ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;
239        lea     ptr0, [ptr1 + 4]
240        ; UInt32 *_distances = ++d;
241        mov     distances, d
242
243        neg     len0
244        mov     len1, len0
245
246        mov     t0_x, cutValue_VAR
247        mov     maxLen, maxLen_VAR
248        mov     cutValueCur_VAR, t0_x
249
250MY_ALIGN_32
251tree_loop:
252        neg     diff
253        mov     len, len0
254        cmp     len1, len0
255        cmovb   len, len1       ; len = (len1 < len0 ? len1 : len0);
256        add     diff, cur
257
258        mov     t0_x, [son + cp_r * 8]  ; prefetch
259        movzx   t0_x, BYTE PTR [diff + 1 * len]
260        lea     cp_r, [son + cp_r * 8]
261        cmp     [cur + 1 * len], t0_L
262        je      matched_1
263
264        jb      left_0
265
266        mov     [ptr1], m
267        mov        m, [cp_r + 4]
268        lea     ptr1, [cp_r + 4]
269        sub     diff, cur ; FIX32
270        jmp     next_node
271
272MY_ALIGN_32
273left_0:
274        mov     [ptr0], m
275        mov        m, [cp_r]
276        mov     ptr0, cp_r
277        sub     diff, cur ; FIX32
278        ; jmp     next_node
279
280; ------------ NEXT NODE ------------
281; MY_ALIGN_32
282next_node:
283        mov     cycSize, cycSize_VAR
284        dec     cutValueCur_VAR
285        je      finish_tree
286
287        add     diff_x, pos     ; prev_match = pos + diff
288        cmp     m, diff_x
289        jae     fin_error       ; if (new_match >= prev_match)
290
291        mov     diff_x, pos
292        sub     diff_x, m       ; delta = pos - new_match
293        cmp     pos, cycSize
294        jae     cyc_mode_2      ; if (pos >= cycSize)
295
296        mov     cp_x, m
297        test    m, m
298        jne     tree_loop       ; if (m != 0)
299
300finish_tree:
301        ; ptr0 = *ptr1 = kEmptyHashValue;
302        mov     DWORD PTR [ptr0], 0
303        mov     DWORD PTR [ptr1], 0
304
305        inc     pos
306
307        ; _distances[-1] = (UInt32)(d - _distances);
308        mov     t0, distances
309        mov     t1, d
310        sub     t1, t0
311        shr     t1_x, 2
312        mov     [t0 - 4], t1_x
313
314        cmp     d, limit_VAR
315        jae     fin             ; if (d >= limit)
316
317        mov     cp_x, cycPos_VAR
318        mov     hash, hash_VAR
319        mov     hash_lim, size_VAR
320        inc     cp_x
321        cmp     hash, hash_lim
322        jne     main_loop       ; if (hash != size)
323        jmp     fin
324
325
326MY_ALIGN_32
327cyc_mode_2:
328        cmp     diff_x, cycSize
329        jae     finish_tree     ; if (delta >= cycSize)
330
331        mov     cp_x, cycPos_VAR
332        xor     t0_x, t0_x
333        sub     cp_x, diff_x    ; cp_x = cycPos - delta
334        cmovb   t0_x, cycSize
335        add     cp_x, t0_x      ; cp_x += (cycPos < delta ? cycSize : 0)
336        jmp     tree_loop
337
338
339MY_ALIGN_32
340matched_1:
341
342        inc     len
343        ; cmp     len_x, lenLimit_x
344        je      short lenLimit_reach
345        movzx   t0_x, BYTE PTR [diff + 1 * len]
346        cmp     [cur + 1 * len], t0_L
347        jne     mismatch
348
349
350MY_ALIGN_32
351match_loop:
352        ;  while (++len != lenLimit)  (len[diff] != len[0]) ;
353
354        inc     len
355        ; cmp     len_x, lenLimit_x
356        je      short lenLimit_reach
357        movzx   t0_x, BYTE PTR [diff + 1 * len]
358        cmp     BYTE PTR [cur + 1 * len], t0_L
359        je      match_loop
360
361mismatch:
362        jb      left_2
363
364        mov     [ptr1], m
365        mov        m, [cp_r + 4]
366        lea     ptr1, [cp_r + 4]
367        mov     len1, len
368
369        jmp     max_update
370
371MY_ALIGN_32
372left_2:
373        mov     [ptr0], m
374        mov        m, [cp_r]
375        mov     ptr0, cp_r
376        mov     len0, len
377
378max_update:
379        sub     diff, cur       ; restore diff
380
381        cmp     maxLen, len
382        jae     next_node
383
384        mov     maxLen, len
385        add     len, lenLimit
386        mov     [d], len_x
387        mov     t0_x, diff_x
388        not     t0_x
389        mov     [d + 4], t0_x
390        add     d, 8
391
392        jmp     next_node
393
394
395
396MY_ALIGN_32
397lenLimit_reach:
398
399        mov     delta_r, cur
400        sub     delta_r, diff
401        lea     delta1_r, [delta_r - 1]
402
403        mov     t0_x, [cp_r]
404        mov     [ptr1], t0_x
405        mov     t0_x, [cp_r + 4]
406        mov     [ptr0], t0_x
407
408        mov     [d], lenLimit_x
409        mov     [d + 4], delta1_x
410        add     d, 8
411
412        ; _distances[-1] = (UInt32)(d - _distances);
413        mov     t0, distances
414        mov     t1, d
415        sub     t1, t0
416        shr     t1_x, 2
417        mov     [t0 - 4], t1_x
418
419        mov     hash, hash_VAR
420        mov     hash_lim, size_VAR
421
422        inc     pos
423        mov     cp_x, cycPos_VAR
424        inc     cp_x
425
426        mov     d_lim, limit_VAR
427        mov     cycSize, cycSize_VAR
428        ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)
429        ;    break;
430        cmp     hash, hash_lim
431        je      fin
432        cmp     d, d_lim
433        jae     fin
434        cmp     delta_x, [hash]
435        jne     main_loop
436        movzx   t0_x, BYTE PTR [diff]
437        cmp     [cur], t0_L
438        jne     main_loop
439
440        ; jmp     main_loop     ; bypass for debug
441
442        mov     cycPos_VAR, cp_x
443        shl     len, 3          ; cycSize * 8
444        sub     diff, cur       ; restore diff
445        xor     t0_x, t0_x
446        cmp     cp_x, delta_x   ; cmp (cycPos_VAR, delta)
447        lea     cp_r, [son + 8 * cp_r]  ; dest
448        lea     src, [cp_r + 8 * diff]
449        cmovb   t0, len         ; t0 =  (cycPos_VAR < delta ? cycSize * 8 : 0)
450        add     src, t0
451        add     len, son        ; len = son + cycSize * 8
452
453
454MY_ALIGN_32
455long_loop:
456        add     hash, 4
457
458        ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];
459
460        mov     t0, [src]
461        add     src, 8
462        mov     [cp_r], t0
463        add     cp_r, 8
464        cmp     src, len
465        cmove   src, son       ; if end of (son) buffer is reached, we wrap to begin
466
467        mov     DWORD PTR [d], 2
468        mov     [d + 4], lenLimit_x
469        mov     [d + 8], delta1_x
470        add     d, 12
471
472        inc     cur
473
474        cmp     hash, hash_lim
475        je      long_footer
476        cmp     delta_x, [hash]
477        jne     long_footer
478        movzx   t0_x, BYTE PTR [diff + 1 * cur]
479        cmp     [cur], t0_L
480        jne     long_footer
481        cmp     d, d_lim
482        jb      long_loop
483
484long_footer:
485        sub     cp_r, son
486        shr     cp_r, 3
487        add     pos, cp_x
488        sub     pos, cycPos_VAR
489        mov     cycSize, cycSize_VAR
490
491        cmp     d, d_lim
492        jae     fin
493        cmp     hash, hash_lim
494        jne     main_loop
495        jmp     fin
496
497
498
499fin_error:
500        xor     d, d
501
502fin:
503        mov     RSP, Old_RSP
504        mov     t0, [r4 + posRes_OFFS]
505        mov     [t0], pos
506        mov     r0, d
507
508MY_POP_PRESERVED_ABI_REGS
509MY_ENDP
510
511_TEXT$LZFINDOPT ENDS
512
513end
514