• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
2; 2021-02-23: Igor Pavlov : Public domain
3;
4; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
5; function for check at link time.
6; This code is tightly coupled with LzmaDec_TryDummy()
7; and with other functions in the LzmaDec.c file.
8; CLzmaDec structure, (probs) array layout, input and output of
9; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM).
10
; Guard for 64-bit builds. NOTE: both the fallback define and the .err
; directive inside it are commented out, so this block currently assembles
; to nothing whether or not x64 is defined.
11ifndef x64
12; x64=1
13; .err <x64_IS_REQUIRED>
14endif
15
; Shared project macros. MY_ASM_START, MY_PROC, REG_ABI_PARAM_* and the
; x0..x14 / r0..r15 register names are not defined in this file, so they
; presumably come from this include.
16include 7zAsm.asm
17
18MY_ASM_START
19
; The decoder lives in its own 64-byte aligned code segment.
20_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
21
; MY_ALIGN num
; Aligns the next instruction or label to a num-byte boundary.
; Single point of indirection used by the MY_ALIGN_16/32/64 helpers.
22MY_ALIGN macro num:req
23        align  num
24endm
25
; MY_ALIGN_16
; Pads so that whatever follows starts on a 16-byte boundary.
MY_ALIGN_16 macro
        align   16
endm
29
; MY_ALIGN_32
; Pads so that whatever follows starts on a 32-byte boundary.
MY_ALIGN_32 macro
        align   32
endm
33
; MY_ALIGN_64
; Pads so that whatever follows starts on a 64-byte boundary.
MY_ALIGN_64 macro
        align   64
endm
37
38
39; _LZMA_SIZE_OPT  equ 1
40
41; _LZMA_PROB32 equ 1
42
; Probability element width selection.
;   _LZMA_PROB32 defined : 32-bit probabilities, PSHIFT = 2 (4 bytes each)
;   otherwise            : 16-bit probabilities, PSHIFT = 1 (2 bytes each)
; PLOAD  dest, mem : load one probability from [mem] into dest
;                    (zero-extended with movzx in 16-bit mode).
; PSTORE src, mem  : store one probability from src to [mem]. In 16-bit mode
;                    the register name is suffixed with _W via @CatStr
;                    (e.g. x2 -> x2_W) to select its 16-bit form.
43ifdef _LZMA_PROB32
44        PSHIFT  equ 2
45        PLOAD macro dest, mem
46                mov     dest, dword ptr [mem]
47        endm
48        PSTORE  macro src, mem
49                mov     dword ptr [mem], src
50        endm
51else
52        PSHIFT  equ 1
53        PLOAD macro dest, mem
54                movzx   dest, word ptr [mem]
55        endm
56        PSTORE macro src, mem
57                mov     word ptr [mem], @CatStr(src, _W)
58        endm
59endif
60
; Byte scaling factors derived from PSHIFT:
;   PMULT      = size of one probability in bytes
;   PMULT_HALF = half of that (used with an index that was already doubled)
;   PMULT_2    = twice that  (used to address element 2 * index)
61PMULT           equ (1 SHL PSHIFT)
62PMULT_HALF      equ (1 SHL (PSHIFT - 1))
63PMULT_2         equ (1 SHL (PSHIFT + 1))
64
; Added to the pending length and stored in remainLen on the
; fin_ERROR_MATCH_DIST path (bad match distance).
65kMatchSpecLen_Error_Data equ (1 SHL 9)
66
67;       x0      range
68;       x1      pbPos / (prob) TREE
69;       x2      probBranch / prm (MATCHED) / cnt
70;       x3      sym
71;====== r4 ===  RSP
72;       x5      cod
73;       x6      t1 NORM_CALC / probs_state / dist
74;       x7      t0 NORM_CALC / prob2 IF_BIT_1
75;       x8      state
76;       x9      match (MATCHED) / sym2 / dist2 / lpMask_reg
77;       x10     kBitModelTotal_reg
78;       r11     probs
79;       x12     offs (MATCHED) / dic / len_temp
80;       x13     processedPos
81;       x14     bit (MATCHED) / dicPos
82;       r15     buf
83
84
; Symbolic names for the registers used by the decoder.
; As used in this file: xN names are used for 32-bit values, rN for
; pointers / 64-bit values, the _W suffix for 16-bit stores (see PSTORE) and
; the _L suffix for byte accesses.
; Several names deliberately share one register (for example x9 is
; match / sym2 / dist2 / lpMask_reg, and r12 is dic / offs / len_temp), so
; only one role per register is live at any point in the code.

; ---- range coder state and stream position ----
85cod     equ x5
86cod_L   equ x5_L
87range   equ x0
88state   equ x8
89state_R equ r8
90buf     equ r15
91processedPos equ x13
92kBitModelTotal_reg equ x10
93
; ---- probability being decoded in the branch macros ----
94probBranch   equ x2
95probBranch_R equ r2
96probBranch_W equ x2_W
97
; ---- position-state offset ----
98pbPos   equ x1
99pbPos_R equ r1
100
; ---- match copy counter (shares x2 with probBranch) ----
101cnt     equ x2
102cnt_R   equ r2
103
104lpMask_reg equ x9
105dicPos  equ r14
106
; ---- symbol / tree index accumulator ----
107sym     equ x3
108sym_R   equ r3
109sym_L   equ x3_L
110
111probs   equ r11
112dic     equ r12
113
; ---- scratch registers ----
114t0      equ x7
115t0_W    equ x7_W
116t0_R    equ r7
117
118prob2   equ t0
119prob2_W equ t0_W
120
121t1      equ x6
122t1_R    equ r6
123
124probs_state     equ t1
125probs_state_R   equ t1_R
126
; ---- matched-literal decoding (LITM_* macros) ----
127prm     equ r2
128match   equ x9
129match_R equ r9
130offs    equ x12
131offs_R  equ r12
132bit     equ x14
133bit_R   equ r14
134
; ---- reverse bit-tree decoding (REV_* macros) ----
135sym2    equ x9
136sym2_R  equ r9
137
138len_temp equ x12
139
; ---- distance handling; dist is the same register as sym ----
140dist    equ sym
141dist2   equ x9
142
143
144
; Range coder / probability model parameters.
;   kBitModelTotal  = 2^11, the probability scale.
;   kNumMoveBits    = 5, the adaptation shift.
;   kBitModelOffset = 31; used by the PUP callers as the "bit = 1" target,
;                     where ((31 - prob) sar 5) equals -(prob >> 5).
;   kTopValue       = 2^24; NORM refills when range drops below it.
145kNumBitModelTotalBits   equ 11
146kBitModelTotal          equ (1 SHL kNumBitModelTotalBits)
147kNumMoveBits            equ 5
148kBitModelOffset         equ ((1 SHL kNumMoveBits) - 1)
149kTopValue               equ (1 SHL 24)
150
; NORM_2
; Unconditional range-coder refill: shifts cod and range left by 8 bits,
; places the next input byte from [buf] into the (now zero) low byte of cod
; and advances buf by one.
; The commented-out movzx/or pair is an alternative that went through t0.
151NORM_2 macro
152        ; movzx   t0, BYTE PTR [buf]
153        shl     cod, 8
154        mov     cod_L, BYTE PTR [buf]
155        shl     range, 8
156        ; or      cod, t0
157        inc     buf
158endm
159
160
; NORM
; Refills with NORM_2 only when range < kTopValue.
; Clobbers flags. Every expansion defines an anonymous @@ label, so an
; @F / @B reference elsewhere must not span a NORM expansion.
161NORM macro
162        cmp     range, kTopValue
163        jae     SHORT @F
164        NORM_2
165@@:
166endm
167
168
169; ---------- Branch MACROS ----------
170
; UPDATE_0 probsArray, probOffset, probDisp
; Adapts a probability after a decoded 0 bit and stores it back:
;   prob += (kBitModelTotal - prob) >> kNumMoveBits
; In:  probBranch = current probability value (as loaded by CMP_COD).
; The store address is [probOffset + probsArray + probDisp * PMULT].
; Clobbers prob2 (t0), probBranch and flags. range and cod are untouched:
; after CMP_COD, range already holds the bound that becomes the new range.
171UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
172        mov     prob2, kBitModelTotal_reg
173        sub     prob2, probBranch
174        shr     prob2, kNumMoveBits
175        add     probBranch, prob2
176        PSTORE  probBranch, probOffset * 1 + probsArray + probDisp * PMULT
177endm
178
179
; UPDATE_1 probsArray, probOffset, probDisp
; Completes a decoded 1 bit. Expects the register state left by CMP_COD:
;   prob2 = range before the split, range = bound, probBranch = probability.
; Effect:
;   cod   -= bound
;   range  = old range - bound
;   prob  -= prob >> kNumMoveBits, stored to
;            [probOffset + probsArray + probDisp * PMULT]
; Clobbers prob2 (t0), probBranch and flags.
180UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
181        sub     prob2, range
182        sub     cod, range
183        mov     range, prob2
184        mov     prob2, probBranch
185        shr     probBranch, kNumMoveBits
186        sub     prob2, probBranch
187        PSTORE  prob2, probOffset * 1 + probsArray + probDisp * PMULT
188endm
189
190
; CMP_COD probsArray, probOffset, probDisp
; First half of a branch-style bit decode.
;   probBranch = probability at [probOffset + probsArray + probDisp * PMULT]
;   NORM (refill if needed)
;   prob2      = range (saved for UPDATE_1)
;   range      = (range >> kNumBitModelTotalBits) * probBranch   (the bound)
;   flags      = cmp cod, bound  -> below means bit 0, above-or-equal bit 1
; Neither cod nor the stored probability is modified here; the caller
; follows with UPDATE_0 or UPDATE_1.
191CMP_COD macro probsArray:req, probOffset:req, probDisp:req
192        PLOAD   probBranch, probOffset * 1 + probsArray + probDisp * PMULT
193        NORM
194        mov     prob2, range
195        shr     range, kNumBitModelTotalBits
196        imul    range, probBranch
197        cmp     cod, range
198endm
199
200
; IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel
; Runs CMP_COD and jumps to toLabel when the bit is 1 (cod >= bound).
; "NOUP": no update is performed on either path; the code at toLabel and
; the fall-through code must apply UPDATE_1 / UPDATE_0 themselves.
201IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
202        CMP_COD probsArray, probOffset, probDisp
203        jae     toLabel
204endm
205
206
; IF_BIT_1 probsArray, probOffset, probDisp, toLabel
; Decodes one bit with the probability at
; [probOffset + probsArray + probDisp * PMULT].
;   bit == 1 : jumps to toLabel. Nothing has been updated on that path yet
;              (the targets used in this file start with UPDATE_1).
;   bit == 0 : falls through after UPDATE_0 has adapted and stored the
;              probability.
IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD  probsArray, probOffset, probDisp
        jae      toLabel
        UPDATE_0 probsArray, probOffset, probDisp
endm
211
212
; IF_BIT_0_NOUP probsArray, probOffset, probDisp, toLabel
; Runs CMP_COD and jumps to toLabel when the bit is 0 (cod < bound).
; No update is performed on either path; both continuations must apply
; UPDATE_0 / UPDATE_1 themselves.
213IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
214        CMP_COD probsArray, probOffset, probDisp
215        jb      toLabel
216endm
217
218
219; ---------- CMOV MACROS ----------
220
; NORM_CALC prob
; Branch-free bit decode core, used by the cmov-based macros below.
; After NORM:
;   t0    = old range - bound      (range to use if the bit is 1)
;   range = bound                  (range to use if the bit is 0)
;   t1    = old cod                (value to restore if the bit is 0)
;   cod   = old cod - bound        (value to keep if the bit is 1)
; where bound = (range >> kNumBitModelTotalBits) * prob.
; Flags come from the final "sub cod, range": CF = 1 means old cod < bound,
; i.e. bit 0. Callers keep CF alive across mov / cmov / lea until it is
; consumed by cmovb / cmovae / sbb.
221NORM_CALC macro prob:req
222        NORM
223        mov     t0, range
224        shr     range, kNumBitModelTotalBits
225        imul    range, prob
226        sub     t0, range
227        mov     t1, cod
228        sub     cod, range
229endm
230
231
; PUP prob, probPtr
; Probability update and store for the cmov-based macros.
; In:  t0 = adaptation target chosen by the caller
;           (kBitModelTotal_reg for bit 0, kBitModelOffset for bit 1).
; Does: t0 = prob + ((t0 - prob) sar kNumMoveBits); store t0 at [probPtr].
; The difference can be negative, hence the arithmetic shift.
; Clobbers t0 and flags.
232PUP macro prob:req, probPtr:req
233        sub     t0, prob
234       ; only sar works for both 16/32 bit prob modes
235        sar     t0, kNumMoveBits
236        add     t0, prob
237        PSTORE  t0, probPtr
238endm
239
240
; PUP_SUB prob, probPtr, symSub
; Folds the decoded bit into sym, then updates the probability.
; "sbb sym, symSub" computes sym = sym - symSub - CF, where CF is still the
; borrow produced by NORM_CALC (CF = 1 for bit 0). With symSub = -1 this is
; sym + 1 - CF, i.e. sym plus the decoded bit.
; Must be reached with CF unchanged since NORM_CALC.
241PUP_SUB macro prob:req, probPtr:req, symSub:req
242        sbb     sym, symSub
243        PUP prob, probPtr
244endm
245
246
; PUP_COD prob, probPtr, symSub
; Tail shared by the bit-tree macros, entered with CF from NORM_CALC:
;   cod = t1 (old cod) when the bit is 0
;   t1  = sym  (copied before sym is modified; callers use t1_R inside
;               probPtr to address the probability that was just decoded)
;   t0  = kBitModelTotal_reg for bit 0, kBitModelOffset for bit 1
; then PUP_SUB folds the bit into sym and stores the updated probability.
; Only mov / cmov are used before the sbb so that CF stays intact.
247PUP_COD macro prob:req, probPtr:req, symSub:req
248        mov     t0, kBitModelOffset
249        cmovb   cod, t1
250        mov     t1, sym
251        cmovb   t0, kBitModelTotal_reg
252        PUP_SUB prob, probPtr, symSub
253endm
254
255
; BIT_0 prob, probNext
; Decodes the first bit of a bit tree rooted at probs (node index 1).
;   prob     <- probs[1]
;   probNext <- probs[2], replaced by probs[3] when the decoded bit is 1,
;               so it is already the probability for the next level
;   sym      <- 2 + bit
; probs[1] is updated and stored. Clobbers t0, t1 and flags.
256BIT_0 macro prob:req, probNext:req
257        PLOAD   prob, probs + 1 * PMULT
258        PLOAD   probNext, probs + 1 * PMULT_2
259
260        NORM_CALC prob
261
262        cmovae  range, t0
263        PLOAD   t0, probs + 1 * PMULT_2 + PMULT
264        cmovae  probNext, t0
265        mov     t0, kBitModelOffset
266        cmovb   cod, t1
267        cmovb   t0, kBitModelTotal_reg
268        mov     sym, 2
269        PUP_SUB prob, probs + 1 * PMULT, 0 - 1
270endm
271
272
; BIT_1 prob, probNext
; Decodes one inner bit of a bit tree.
; In:  sym = current node index, prob = value of probs[sym].
;   probNext <- probs[2 * sym], replaced by probs[2 * sym + 1] when the
;               decoded bit is 1
;   sym      <- 2 * sym + bit
; The probability of the node just decoded is stored back through
; probs + t1_R * PMULT_HALF, where t1 is the already doubled index.
; Clobbers t0, t1 and flags.
273BIT_1 macro prob:req, probNext:req
274        PLOAD   probNext, probs + sym_R * PMULT_2
275        add     sym, sym
276
277        NORM_CALC prob
278
279        cmovae  range, t0
280        PLOAD   t0, probs + sym_R * PMULT + PMULT
281        cmovae  probNext, t0
282        PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
283endm
284
285
; BIT_2 prob, symSub
; Decodes the last bit of a bit tree; no next probability is preloaded.
; In:  sym = current node index, prob = value of probs[sym].
;   sym <- 2 * sym - symSub - CF   (CF = 1 for bit 0)
; so a caller passing (N - 1) obtains 2 * sym + bit - N, which strips the
; tree's leading marker bit or applies a length offset.
; Clobbers t0, t1 and flags.
286BIT_2 macro prob:req, symSub:req
287        add     sym, sym
288
289        NORM_CALC prob
290
291        cmovae  range, t0
292        PUP_COD prob, probs + t1_R * PMULT_HALF, symSub
293endm
294
295
296; ---------- MATCHED LITERAL ----------
297
; LITM_0
; First bit of a matched literal.
; In:  match = match byte (zero-extended), probs = literal probability base.
;   offs  = 256 * PMULT
;   match is shifted left by PSHIFT + 1 so that its top bit lines up with offs
;   bit   = offs & match
;   x1    = probability at probs + 256 * PMULT + bit + 1 * PMULT, prm = its address
;   offs ^= bit is computed speculatively; if the decoded bit is 1, offs is
;           replaced by bit instead (cmovae)
;   match is doubled and copied to bit for the next step
;   sym   = 0 - (-3) - CF = 2 + decoded bit
; "mov sym, 0" is used rather than xor because CF from NORM_CALC must
; survive until the sbb inside PUP_SUB.
298LITM_0 macro
299        mov     offs, 256 * PMULT
300        shl     match, (PSHIFT + 1)
301        mov     bit, offs
302        and     bit, match
303        PLOAD   x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT
304        lea     prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]
305        ; lea     prm, [probs + 256 * PMULT + 1 * PMULT]
306        ; add     prm, bit_R
307        xor     offs, bit
308        add     match, match
309
310        NORM_CALC x1
311
312        cmovae  offs, bit
313        mov     bit, match
314        cmovae  range, t0
315        mov     t0, kBitModelOffset
316        cmovb   cod, t1
317        cmovb   t0, kBitModelTotal_reg
318        mov     sym, 0
319        PUP_SUB x1, prm, -2-1
320endm
321
322
; LITM
; One inner bit of a matched literal.
; In:  bit = shifted match value, offs = current offset mask, sym = node index.
;   bit &= offs
;   prm  = probs + offs + bit
;   x1   = probability at prm + sym * PMULT
;   offs ^= bit speculatively; replaced by bit when the decoded bit is 1
;   match is doubled and copied to bit for the next step
;   sym  = 2 * sym + decoded bit
; The probability is stored back at prm + t1_R * PMULT_HALF (t1 holds the
; doubled index). Clobbers x1, t0, t1, prm and flags.
323LITM macro
324        and     bit, offs
325        lea     prm, [probs + offs_R * 1]
326        add     prm, bit_R
327        PLOAD   x1, prm + sym_R * PMULT
328        xor     offs, bit
329        add     sym, sym
330        add     match, match
331
332        NORM_CALC x1
333
334        cmovae  offs, bit
335        mov     bit, match
336        cmovae  range, t0
337        PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
338endm
339
340
; LITM_2
; Last bit of a matched literal. Same addressing as LITM, but offs / match
; are no longer advanced and 256 is removed from the result:
;   sym = 2 * sym + decoded bit - 256
; Clobbers x1, t0, t1, prm, bit and flags.
341LITM_2 macro
342        and     bit, offs
343        lea     prm, [probs + offs_R * 1]
344        add     prm, bit_R
345        PLOAD   x1, prm + sym_R * PMULT
346        add     sym, sym
347
348        NORM_CALC x1
349
350        cmovae  range, t0
351        PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
352endm
353
354
355; ---------- REVERSE BITS ----------
356
; REV_0 prob, probNext
; First bit of a reverse bit tree rooted at probs.
; In:  prob = value of probs[1]; sym2_R = address of the bit-0 child
;      (the caller in this file sets sym2_R = probs + 2 * PMULT).
;   probNext <- [sym2_R], replaced by probs[3] when the decoded bit is 1
;   sym2_R   <- probs + 3 * PMULT when the decoded bit is 1
; probs[1] is updated and stored. Clobbers t0, t1 and flags.
357REV_0 macro prob:req, probNext:req
358        ; PLOAD   prob, probs + 1 * PMULT
359        ; lea     sym2_R, [probs + 2 * PMULT]
360        ; PLOAD   probNext, probs + 2 * PMULT
361        PLOAD   probNext, sym2_R
362
363        NORM_CALC prob
364
365        cmovae  range, t0
366        PLOAD   t0, probs + 3 * PMULT
367        cmovae  probNext, t0
368        cmovb   cod, t1
369        mov     t0, kBitModelOffset
370        cmovb   t0, kBitModelTotal_reg
371        lea     t1_R, [probs + 3 * PMULT]
372        cmovae  sym2_R, t1_R
373        PUP prob, probs + 1 * PMULT
374endm
375
376
; REV_1 prob, probNext, step
; Inner bit of a reverse bit tree.
; In:  sym2_R = address of the current node, prob = its value.
;   sym2_R  += step * PMULT          (child for bit 0)
;   probNext <- [sym2_R], replaced by [sym2_R + step * PMULT] for bit 1
;   sym2_R   <- sym2_R + step * PMULT for bit 1
; t1_R - step * PMULT_2 is the address sym2_R had on entry, i.e. the node
; that was just decoded; its probability is updated and stored there.
; Clobbers t0, t1 and flags.
377REV_1 macro prob:req, probNext:req, step:req
378        add     sym2_R, step * PMULT
379        PLOAD   probNext, sym2_R
380
381        NORM_CALC prob
382
383        cmovae  range, t0
384        PLOAD   t0, sym2_R + step * PMULT
385        cmovae  probNext, t0
386        cmovb   cod, t1
387        mov     t0, kBitModelOffset
388        cmovb   t0, kBitModelTotal_reg
389        lea     t1_R, [sym2_R + step * PMULT]
390        cmovae  sym2_R, t1_R
391        PUP prob, t1_R - step * PMULT_2
392endm
393
394
; REV_2 prob, step
; Last bit of a reverse bit tree.
;   sym2 = (sym2_R - probs) >> PSHIFT   (node address turned into an index)
;   sym |= sym2
;   sym -= step when the decoded bit is 0
; The node's probability is updated and stored at probs + sym2_R * PMULT.
; Clobbers t0, t1, sym2 and flags.
395REV_2 macro prob:req, step:req
396        sub     sym2_R, probs
397        shr     sym2, PSHIFT
398        or      sym, sym2
399
400        NORM_CALC prob
401
402        cmovae  range, t0
403        lea     t0, [sym - step]
404        cmovb   sym, t0
405        cmovb   cod, t1
406        mov     t0, kBitModelOffset
407        cmovb   t0, kBitModelTotal_reg
408        PUP prob, probs + sym2_R * PMULT
409endm
410
411
; REV_1_VAR prob
; One step of a reverse bit tree with a variable number of levels
; (used in the spec_loop of the short-distance path).
; In:  sym_R = address of the current probability, sym2 = current step in bytes.
;   prob  <- [sym_R]
;   probs <- sym_R  (probs is reused as the store pointer, so the caller has
;                    to reload probs afterwards)
;   sym_R += sym2_R, and by a further sym2_R when the decoded bit is 1
;   sym2  doubles for the next level
; Clobbers t0, t1, probs and flags.
412REV_1_VAR macro prob:req
413        PLOAD   prob, sym_R
414        mov     probs, sym_R
415        add     sym_R, sym2_R
416
417        NORM_CALC prob
418
419        cmovae  range, t0
420        lea     t0_R, [sym_R + 1 * sym2_R]
421        cmovae  sym_R, t0_R
422        mov     t0, kBitModelOffset
423        cmovb   cod, t1
424        ; mov     t1, kBitModelTotal
425        ; cmovb   t0, t1
426        cmovb   t0, kBitModelTotal_reg
427        add     sym2, sym2
428        PUP prob, probs
429endm
430
431
432
433
; LIT_PROBS lpMaskParam
; Selects the literal probability table and finishes the IsMatch = 0 decision.
; In:  sym is expected to hold the previous output byte (it is combined with
;      processedPos << 8 below); lpMaskParam is the lpMask value, either a
;      register or a memory operand.
;   sym           = ((processedPos << 8) + sym) & lpMask
;   probs_state_R += pbPos_R, so that UPDATE_0 can address the IsMatch
;                    probability with a zero offset
;   x1            = lc2, whose low byte is used as the shift count
;   probs        += Literal * PMULT + ((3 * sym) << lc2)
;   UPDATE_0 on the IsMatch probability (the bit that was decoded as 0)
;   processedPos += 1
; Clobbers t0, x1, probBranch and flags.
434LIT_PROBS macro lpMaskParam:req
435        ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
436        mov     t0, processedPos
437        shl     t0, 8
438        add     sym, t0
439        and     sym, lpMaskParam
440        add     probs_state_R, pbPos_R
441        mov     x1, LOC lc2
442        lea     sym, dword ptr[sym_R + 2 * sym_R]
443        add     probs, Literal * PMULT
444        shl     sym, x1_L
445        add     probs, sym_R
446        UPDATE_0 probs_state_R, 0, IsMatch
447        inc     processedPos
448endm
449
450
451
; ---------- Probability model layout ----------
; All table offsets below are expressed in probability elements (multiply
; by PMULT for bytes) and are relative to the probs_1664 pointer, i.e. a
; pointer kStartOffset elements into the array. That is why SpecPos is
; negative and kAlign is required to be exactly 0.

452kNumPosBitsMax          equ 4
453kNumPosStatesMax        equ (1 SHL kNumPosBitsMax)
454
; Length coder geometry.
455kLenNumLowBits          equ 3
456kLenNumLowSymbols       equ (1 SHL kLenNumLowBits)
457kLenNumHighBits         equ 8
458kLenNumHighSymbols      equ (1 SHL kLenNumHighBits)
459kNumLenProbs            equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)
460
; Offsets inside one length coder.
461LenLow                  equ 0
462LenChoice               equ LenLow
463LenChoice2              equ (LenLow + kLenNumLowSymbols)
464LenHigh                 equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
465
; State machine sizes; kNumStates2 is the padded row count used for the
; (state << kNumPosBitsMax) indexed tables.
466kNumStates              equ 12
467kNumStates2             equ 16
468kNumLitStates           equ 7
469
470kStartPosModelIndex     equ 4
471kEndPosModelIndex       equ 14
472kNumFullDistances       equ (1 SHL (kEndPosModelIndex SHR 1))
473
474kNumPosSlotBits         equ 6
475kNumLenToPosStates      equ 4
476
477kNumAlignBits           equ 4
478kAlignTableSize         equ (1 SHL kNumAlignBits)
479
480kMatchMinLen            equ 2
; Value stored in remainLen when the end-of-payload marker is decoded.
481kMatchSpecLenStart      equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
482
; Table start offsets, laid out back to back starting at -kStartOffset.
483kStartOffset    equ 1664
484SpecPos         equ (-kStartOffset)
485IsRep0Long      equ (SpecPos + kNumFullDistances)
486RepLenCoder     equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
487LenCoder        equ (RepLenCoder + kNumLenProbs)
488IsMatch         equ (LenCoder + kNumLenProbs)
489kAlign          equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
490IsRep           equ (kAlign + kAlignTableSize)
491IsRepG0         equ (IsRep + kNumStates)
492IsRepG1         equ (IsRepG0 + kNumStates)
493IsRepG2         equ (IsRepG1 + kNumStates)
494PosSlot         equ (IsRepG2 + kNumStates)
495Literal         equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))
496NUM_BASE_PROBS  equ (Literal + kStartOffset)
497
; Assembly-time checks: the code relies on kAlign being at offset 0 and on
; the total number of base probabilities being 1984.
498if kAlign ne 0
499  .err <Stop_Compiling_Bad_LZMA_kAlign>
500endif
501
502if NUM_BASE_PROBS ne 1984
503  .err <Stop_Compiling_Bad_LZMA_PROBS>
504endif
505
506
; Declaration used for pointer-sized (64-bit) structure fields.
507PTR_FIELD equ dq ?
508
; CLzmaDec_Asm
; Assembly-side view of the decoder state passed in as the first argument.
; Per the header of this file, the layout must match the C CLzmaDec
; structure. Fields are read through GLOB_2 in the prologue and written back
; through GLOB in the epilogue.
509CLzmaDec_Asm struct
        ; lc / lp / pb properties, padding and dictionary size
510        lc      db ?
511        lp      db ?
512        pb      db ?
513        _pad_   db ?
514        dicSize dd ?
515
        ; probs_1664 is the pointer this code actually loads as its probs base
516        probs_Spec      PTR_FIELD
517        probs_1664      PTR_FIELD
518        dic_Spec        PTR_FIELD
519        dicBufSize      PTR_FIELD
        ; dicPos_Spec is an offset into dic (dic is added on load, subtracted on store)
520        dicPos_Spec     PTR_FIELD
521        buf_Spec        PTR_FIELD
522
523        range_Spec      dd ?
524        code_Spec       dd ?
525        processedPos_Spec  dd ?
526        checkDicSize    dd ?
527        rep0    dd ?
528        rep1    dd ?
529        rep2    dd ?
530        rep3    dd ?
        ; state_Spec holds the unscaled state; the code keeps it shifted by PSHIFT
531        state_Spec      dd ?
532        remainLen dd ?
533CLzmaDec_Asm ends
534
535
; CLzmaDec_Asm_Loc
; Stack frame of LzmaDec_DecodeReal_3, addressed through LOC_0 ([r0]) while
; it is being set up and through LOC ([RSP]) afterwards.
;
; The first field is spelled Old_RSP, exactly as the prologue and epilogue
; reference it, so the file also assembles when identifiers are
; case-sensitive. The layout (field order and sizes) is unchanged.
CLzmaDec_Asm_Loc struct
        Old_RSP    PTR_FIELD            ; caller RSP, restored in the epilogue
        lzmaPtr    PTR_FIELD            ; pointer to the CLzmaDec_Asm argument
        _pad0_     PTR_FIELD
        _pad1_     PTR_FIELD
        _pad2_     PTR_FIELD
        dicBufSize PTR_FIELD            ; copy of CLzmaDec_Asm.dicBufSize
        probs_Spec PTR_FIELD            ; copy of CLzmaDec_Asm.probs_1664
        dic_Spec   PTR_FIELD            ; copy of CLzmaDec_Asm.dic_Spec

        limit      PTR_FIELD            ; dic + limit argument (output stop address)
        bufLimit   PTR_FIELD            ; input stop address
        lc2       dd ?                  ; lc + PSHIFT; the low byte is used as a shift count
        lpMask    dd ?                  ; (0x100 << lp) - (0x100 >> lc)
        pbMask    dd ?                  ; (1 << pb) - 1
        checkDicSize   dd ?

        _pad_     dd ?
        remainLen dd ?                  ; written back to CLzmaDec_Asm.remainLen on exit
        dicPos_Spec     PTR_FIELD       ; dicPos spill slot (absolute address)
        rep0      dd ?
        rep1      dd ?
        rep2      dd ?
        rep3      dd ?
CLzmaDec_Asm_Loc ends
561
562
; Structure accessor prefixes; each is completed by a field name.
;   GLOB_2 : CLzmaDec_Asm via sym_R (used while loading state in the prologue)
;   GLOB   : CLzmaDec_Asm via r1    (used while storing state in the epilogue)
;   LOC_0  : CLzmaDec_Asm_Loc via r0 (frame set-up, before/at the RSP switch)
;   LOC    : CLzmaDec_Asm_Loc via RSP (everywhere else)
563GLOB_2  equ [sym_R].CLzmaDec_Asm.
564GLOB    equ [r1].CLzmaDec_Asm.
565LOC_0   equ [r0].CLzmaDec_Asm_Loc.
566LOC     equ [RSP].CLzmaDec_Asm_Loc.
567
568
; COPY_VAR name
; Copies the 32-bit field <name> from the decoder structure (GLOB_2, via
; sym_R) into the field of the same name in the stack frame (LOC_0, via r0).
; Clobbers t0.
569COPY_VAR macro name
570        mov     t0, GLOB_2 name
571        mov     LOC_0 name, t0
572endm
573
574
; RESTORE_VAR name
; Copies the 32-bit field <name> from the stack frame (LOC, via RSP) back
; into the decoder structure (GLOB, via r1). Clobbers t0.
575RESTORE_VAR macro name
576        mov     t0, LOC name
577        mov     GLOB name, t0
578endm
579
580
581
; IsMatchBranch_Pre
; Prepares the operands for the next IsMatch decision:
;   pbPos         = (processedPos & pbMask) << (kLenNumLowBits + 1 + PSHIFT)
;   probs_state_R = probs + state_R      (state is kept pre-scaled by PMULT)
; The same pbPos value is later used both as the posState offset for
; IsMatch / IsRep0Long and as the length-coder offset.
; The reg parameter is not used by the body. Clobbers pbPos (x1),
; probs_state (t1) and flags.
582IsMatchBranch_Pre macro reg
583        ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
584        mov     pbPos, LOC pbMask
585        and     pbPos, processedPos
586        shl     pbPos, (kLenNumLowBits + 1 + PSHIFT)
587        lea     probs_state_R, [probs + 1 * state_R]
588endm
589
590
; IsMatchBranch
; IsMatchBranch_Pre followed by the IsMatch bit decode: jumps to
; IsMatch_label on bit 1, otherwise falls through after UPDATE_0.
; The reg parameter is not used by the body.
591IsMatchBranch macro reg
592        IsMatchBranch_Pre
593        IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
594endm
595
596
; CheckLimits
; Leaves the decode loop through fin_OK once the input pointer has reached
; bufLimit or the output pointer has reached limit (unsigned compares).
; The reg parameter is not used by the body. Clobbers flags.
597CheckLimits macro reg
598        cmp     buf, LOC bufLimit
599        jae     fin_OK
600        cmp     dicPos, LOC limit
601        jae     fin_OK
602endm
603
604
605
606; RSP is (16x + 8) bytes aligned in WIN64-x64
607; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
608
; ---------------------------------------------------------------------------
; LzmaDec_DecodeReal_3
; In:  PARAM_lzma     (ABI argument 0) - pointer to the CLzmaDec_Asm state
;      PARAM_limit    (ABI argument 1) - output limit as an offset into dic
;                                        (dic is added before it is stored)
;      PARAM_bufLimit (ABI argument 2) - input stop address
; Out: x0 = 0 on the fin_OK path, 1 on the fin_ERROR_MATCH_DIST path.
;      The decoder state is written back to *PARAM_lzma before returning.
; The routine switches RSP to a private, 128-byte aligned CLzmaDec_Asm_Loc
; frame and restores the original RSP on exit.
; ---------------------------------------------------------------------------
609PARAM_lzma      equ REG_ABI_PARAM_0
610PARAM_limit     equ REG_ABI_PARAM_1
611PARAM_bufLimit  equ REG_ABI_PARAM_2
612
613; MY_ALIGN_64
614MY_PROC LzmaDec_DecodeReal_3, 3
615MY_PUSH_PRESERVED_ABI_REGS
616
        ; carve out the local frame below RSP and align it down to 128 bytes
617        lea     r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
618        and     r0, -128
619        mov     r5, RSP
620        mov     RSP, r0
621        mov     LOC_0 Old_RSP, r5
622        mov     LOC_0 lzmaPtr, PARAM_lzma
623
624        mov     LOC_0 remainLen, 0  ; remainLen must be ZERO
625
626        mov     LOC_0 bufLimit, PARAM_bufLimit
627        mov     sym_R, PARAM_lzma  ;  CLzmaDec_Asm pointer for GLOB_2
628        mov     dic, GLOB_2 dic_Spec
        ; limit becomes an absolute address inside the dictionary buffer
629        add     PARAM_limit, dic
630        mov     LOC_0 limit, PARAM_limit
631
632        COPY_VAR(rep0)
633        COPY_VAR(rep1)
634        COPY_VAR(rep2)
635        COPY_VAR(rep3)
636
        ; dicPos is kept as an absolute address (dic + offset) while decoding
637        mov     dicPos, GLOB_2 dicPos_Spec
638        add     dicPos, dic
639        mov     LOC_0 dicPos_Spec, dicPos
640        mov     LOC_0 dic_Spec, dic
641
        ; pbMask = (1 << pb) - 1
642        mov     x1_L, GLOB_2 pb
643        mov     t0, 1
644        shl     t0, x1_L
645        dec     t0
646        mov     LOC_0 pbMask, t0
647
648        ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
649        ; unsigned lc = p->prop.lc;
650        ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);
651
        ; x2 = 0x100 >> lc, t0 = 0x100 << lp, lc2 = lc + PSHIFT.
        ; Only the low byte of lc2 is meaningful: it is later consumed as a
        ; shift count through x1_L.
652        mov     x1_L, GLOB_2 lc
653        mov     x2, 100h
654        mov     t0, x2
655        shr     x2, x1_L
656        ; inc     x1
657        add     x1_L, PSHIFT
658        mov     LOC_0 lc2, x1
659        mov     x1_L, GLOB_2 lp
660        shl     t0, x1_L
661        sub     t0, x2
662        mov     LOC_0 lpMask, t0
663        mov     lpMask_reg, t0
664
        ; probs base = the pre-offset probs_1664 pointer (see the layout constants)
665        ; mov     probs, GLOB_2 probs_Spec
666        ; add     probs, kStartOffset SHL PSHIFT
667        mov     probs, GLOB_2 probs_1664
668        mov     LOC_0 probs_Spec, probs
669
670        mov     t0_R, GLOB_2 dicBufSize
671        mov     LOC_0 dicBufSize, t0_R
672
673        mov     x1, GLOB_2 checkDicSize
674        mov     LOC_0 checkDicSize, x1
675
676        mov     processedPos, GLOB_2 processedPos_Spec
677
        ; state is kept multiplied by PMULT so it can be used as a byte offset
678        mov     state, GLOB_2 state_Spec
679        shl     state, PSHIFT
680
681        mov     buf,   GLOB_2 buf_Spec
682        mov     range, GLOB_2 range_Spec
683        mov     cod,   GLOB_2 code_Spec
684        mov     kBitModelTotal_reg, kBitModelTotal
        ; sym = 0 is the "previous byte" used when nothing has been decoded yet
685        xor     sym, sym
686
687        ; if (processedPos != 0 || checkDicSize != 0)
688        or      x1, processedPos
689        jz      @f
690
        ; previous byte = dic[(dicPos == 0 ? dicBufSize : dicPos) - 1];
        ; t0_R still holds dicBufSize here
691        add     t0_R, dic
692        cmp     dicPos, dic
693        cmovnz  t0_R, dicPos
694        movzx   sym, byte ptr[t0_R - 1]
695
696@@:
        ; Enter the main loop at the IsMatch test that matches the saved state:
        ;   state < 4             -> lit_end
        ;   state < kNumLitStates -> lit_matched_end
        ;   otherwise             -> lz_end
697        IsMatchBranch_Pre
698        cmp     state, 4 * PMULT
699        jb      lit_end
700        cmp     state, kNumLitStates * PMULT
701        jb      lit_matched_end
702        jmp     lz_end
703
704
705
706
707; ---------- LITERAL ----------
; Plain (non-matched) literal. lit_start is reached from lit_end when the
; IsMatch bit decodes as 0 and resets state to 0. lit_start_2 is the entry
; used by lit_matched_end, which has already adjusted state and reloaded
; lpMask_reg.
708MY_ALIGN_64
709lit_start:
710        xor     state, state
711lit_start_2:
712        LIT_PROBS lpMask_reg
713
        ; Decode 8 bits through the literal bit tree. The size-optimized
        ; variant loops over BIT_1; the default variant is fully unrolled and
        ; alternates x1 / x2 as current / next probability.
714    ifdef _LZMA_SIZE_OPT
715
716        PLOAD   x1, probs + 1 * PMULT
717        mov     sym, 1
718MY_ALIGN_16
719lit_loop:
720        BIT_1   x1, x2
721        mov     x1, x2
722        cmp     sym, 127
723        jbe     lit_loop
724
725    else
726
727        BIT_0   x1, x2
728        BIT_1   x2, x1
729        BIT_1   x1, x2
730        BIT_1   x2, x1
731        BIT_1   x1, x2
732        BIT_1   x2, x1
733        BIT_1   x1, x2
734
735    endif
736
        ; last bit; subtracting 256 - 1 drops the leading marker bit so that
        ; sym is the decoded byte
737        BIT_2   x2, 256 - 1
738
        ; restore the probs base (LIT_PROBS moved it to the literal table),
        ; prepare the next IsMatch operands and emit the byte
739        ; mov     dic, LOC dic_Spec
740        mov     probs, LOC probs_Spec
741        IsMatchBranch_Pre
742        mov     byte ptr[dicPos], sym_L
743        inc     dicPos
744
745        CheckLimits
746lit_end:
        ; IsMatch == 0 -> another literal; otherwise fall into IsMatch_label
747        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start
748
749        ; jmp     IsMatch_label
750
751; ---------- MATCHES ----------
; Reached when the IsMatch bit decoded as 1 (by fall-through from lit_end or
; by a jump). UPDATE_1 completes that bit, then IsRep selects between a new
; match (fall through) and a repeated match (IsRep_label).
752; MY_ALIGN_32
753IsMatch_label:
754        UPDATE_1 probs_state_R, pbPos_R, IsMatch
755        IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label
756
        ; New match: use LenCoder and mark it by adding kNumStates to state;
        ; the "cmp state, kNumStates * PMULT" below tells the two cases apart.
757        add     probs, LenCoder * PMULT
758        add     state, kNumStates * PMULT
759
760; ---------- LEN DECODE ----------
; In:  probs = base of the selected length coder, pbPos = posState offset.
; Out: sym = decoded length value consumed by copy_match.
; len_temp carries the constant that BIT_2 subtracts for the chosen range.
761len_decode:
762        mov     len_temp, 8 - 1 - kMatchMinLen
        ; first choice bit == 0 -> low range
763        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
764        UPDATE_1 probs, 0, 0
        ; second choice bit lives kLenNumLowSymbols elements further
765        add     probs, (1 SHL (kLenNumLowBits + PSHIFT))
766        mov     len_temp, -1 - kMatchMinLen
767        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
768        UPDATE_1 probs, 0, 0
        ; both choice bits == 1 -> high range: 8-bit tree at LenHigh
769        add     probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))
770        mov     sym, 1
771        PLOAD   x1, probs + 1 * PMULT
772
        ; first 6 bits of the high tree; the last two are shared with the
        ; low / mid path through len_mid_2
773MY_ALIGN_32
774len8_loop:
775        BIT_1   x1, x2
776        mov     x1, x2
777        cmp     sym, 64
778        jb      len8_loop
779
780        mov     len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
781        jmp     short len_mid_2 ; "short" is forced here because MASM, unlike some other assemblers, does not shrink this jump by itself
782
783MY_ALIGN_32
784len_mid_0:
        ; low / mid range: 3-bit tree selected by posState
785        UPDATE_0 probs, 0, 0
786        add     probs, pbPos_R
787        BIT_0   x2, x1
788len_mid_2:
789        BIT_1   x1, x2
790        BIT_2   x2, len_temp
791        mov     probs, LOC probs_Spec
        ; state below kNumStates means a rep match: the distance is already
        ; in rep0, so go straight to the copy
792        cmp     state, kNumStates * PMULT
793        jb      copy_match
794
795
796; ---------- DECODE DISTANCE ----------
; In:  sym = decoded length value. The length is parked in len_temp while
;      sym is reused for the slot / distance.
797        ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
798
        ; t0 = min(sym, 3 + kMatchMinLen); the kMatchMinLen bias is removed
        ; through the constant folded into the add below
799        mov     t0, 3 + kMatchMinLen
800        cmp     sym, 3 + kMatchMinLen
801        cmovb   t0, sym
802        add     probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))
803        shl     t0, (kNumPosSlotBits + PSHIFT)
804        add     probs, t0_R
805
806        ; sym = Len
807        ; mov     LOC remainLen, sym
808        mov     len_temp, sym
809
        ; 6-bit slot tree: 5 bits here, the last bit in BIT_2 below
810    ifdef _LZMA_SIZE_OPT
811
812        PLOAD   x1, probs + 1 * PMULT
813        mov     sym, 1
814MY_ALIGN_16
815slot_loop:
816        BIT_1   x1, x2
817        mov     x1, x2
818        cmp     sym, 32
819        jb      slot_loop
820
821    else
822
823        BIT_0   x1, x2
824        BIT_1   x2, x1
825        BIT_1   x1, x2
826        BIT_1   x2, x1
827        BIT_1   x1, x2
828
829    endif
830
        ; x1 = 32 + (slot >> 1), captured before the last bit is decoded
831        mov     x1, sym
832        BIT_2   x2, 64-1
833
834        and     sym, 3
835        mov     probs, LOC probs_Spec
836        cmp     x1, 32 + kEndPosModelIndex / 2
837        jb      short_dist
838
        ; Large slot: x1 becomes the number of direct bits to read in
        ; direct_loop, the low kNumAlignBits bits come from the kAlign tree.
839        ;  unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
840        sub     x1, (32 + 1 + kNumAlignBits)
841        ;  distance = (2 | (distance & 1));
842        or      sym, 2
        ; preload the first align probability and the child pointer for REV_0
843        PLOAD   x2, probs + 1 * PMULT
844        shl     sym, kNumAlignBits + 1
845        lea     sym2_R, [probs + 2 * PMULT]
846
847        jmp     direct_norm
848        ; lea     t1, [sym_R + (1 SHL kNumAlignBits)]
849        ; cmp     range, kTopValue
850        ; jb      direct_norm
851
852; ---------- DIRECT DISTANCE ----------
; Each iteration reads one bit without a probability model: range is halved,
; and if cod >= range the bit is 1 (cod -= range, sym takes the value t1
; prepared in direct_norm); otherwise cod is restored and sym is unchanged.
853MY_ALIGN_32
854direct_loop:
855        shr     range, 1
856        mov     t0, cod
857        sub     cod, range
858        cmovs   cod, t0
859        cmovns  sym, t1
860
861        comment ~
862        sub     cod, range
863        mov     x2, cod
864        sar     x2, 31
865        lea     sym, dword ptr [r2 + sym_R * 2 + 1]
866        and     x2, range
867        add     cod, x2
868        ~
869        dec     x1
870        je      direct_end
871
872        add     sym, sym
873direct_norm:
        ; t1 = sym with the current direct bit set (used on the "bit 1" path)
874        lea     t1, [sym_R + (1 SHL kNumAlignBits)]
875        cmp     range, kTopValue
876        jae     near ptr direct_loop
877        ; the "near ptr" form of the jump above is what provides the 32-byte alignment here
878        NORM_2
879        jmp     direct_loop
880
881MY_ALIGN_32
882direct_end:
883        ;  prob =  + kAlign;
884        ;  distance <<= kNumAlignBits;
        ; 4 align bits through the reverse bit tree at kAlign (offset 0 from probs)
885        REV_0   x2, x1
886        REV_1   x1, x2, 2
887        REV_1   x2, x1, 4
888        REV_2   x1, 8
889
; In:  sym = decoded distance, len_temp = pending length.
; Validates the distance, rotates the rep history and sets the new state.
890decode_dist_end:
891
892        ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
893
        ; old rep0..rep2 are loaded first; they are also needed by
        ; fin_ERROR_MATCH_DIST, which is reached via end_of_payload
894        mov     t1, LOC rep0
895        mov     x1, LOC rep1
896        mov     x2, LOC rep2
897
898        mov     t0, LOC checkDicSize
899        test    t0, t0
900        cmove   t0, processedPos
901        cmp     sym, t0
902        jae     end_of_payload
903        ; jmp     end_of_payload ; for debug
904
905        ; rep3 = rep2;
906        ; rep2 = rep1;
907        ; rep1 = rep0;
908        ; rep0 = distance + 1;
909
910        inc     sym
911        mov     LOC rep0, sym
912        ; mov     sym, LOC remainLen
        ; sym is the length again from here on
913        mov     sym, len_temp
914        mov     LOC rep1, t1
915        mov     LOC rep2, x1
916        mov     LOC rep3, x2
917
918        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
        ; flags of the cmp survive the two mov instructions until the cmovae
919        cmp     state, (kNumStates + kNumLitStates) * PMULT
920        mov     state, kNumLitStates * PMULT
921        mov     t0, (kNumLitStates + 3) * PMULT
922        cmovae  state, t0
923
924
925; ---------- COPY MATCH ----------
; In:  sym = match length value, LOC rep0 = distance, dicPos = output address.
; Copies min(limit - dicPos, sym) bytes; the number of bytes that did not
; fit is stored in LOC remainLen.
926copy_match:
927
928        ; len += kMatchMinLen;
929        ; add     sym, kMatchMinLen
930
931        ; if ((rem = limit - dicPos) == 0)
932        ; {
933        ;   p->dicPos = dicPos;
934        ;   return SZ_ERROR_DATA;
935        ; }
936        mov     cnt_R, LOC limit
937        sub     cnt_R, dicPos
938        jz      fin_dicPos_LIMIT
939
940        ; curLen = ((rem < len) ? (unsigned)rem : len);
941        cmp     cnt_R, sym_R
942        ; cmovae  cnt_R, sym_R ; 64-bit
943        cmovae  cnt, sym ; 32-bit
944
945        mov     dic, LOC dic_Spec
946        mov     x1, LOC rep0
947
        ; t0_R = destination start; dicPos is advanced to the destination end
948        mov     t0_R, dicPos
949        add     dicPos, cnt_R
950        ; processedPos += curLen;
951        add     processedPos, cnt
952        ; len -= curLen;
953        sub     sym, cnt
954        mov     LOC remainLen, sym
955
        ; t0_R becomes an offset inside the dictionary buffer
956        sub     t0_R, dic
957
958        ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
959        sub     t0_R, r1
960        jae     @f
961
        ; the source wrapped: r1 = bytes available before the buffer end;
        ; if the copy needs more than that, take the cross-boundary path
962        mov     r1, LOC dicBufSize
963        add     t0_R, r1
964        sub     r1, t0_R
965        cmp     cnt_R, r1
966        ja      copy_match_cross
967@@:
968        ; if (curLen <= dicBufSize - pos)
969
970; ---------- COPY MATCH FAST ----------
971        ; Byte *dest = dic + dicPos;
972        ; mov     r1, dic
973        ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
974        ; sub   t0_R, dicPos
975        ; dicPos += curLen;
976
977        ; const Byte *lim = dest + curLen;
        ; t0_R = source end address, cnt_R = -count, sym = first source byte
978        add     t0_R, dic
979        movzx   sym, byte ptr[t0_R]
980        add     t0_R, cnt_R
981        neg     cnt_R
982        ; lea     r1, [dicPos - 1]
; copy_common is also the tail of copy_match_cross.
; Invariant on entry: sym = next byte to store, cnt_R = -(bytes left),
; t0_R = source end, dicPos = destination end.
983copy_common:
984        dec     dicPos
985        ; cmp   LOC rep0, 1
986        ; je    rep0Label
987
988        ; t0_R - src_lim
989        ; r1 - dest_lim - 1
990        ; cnt_R - (-cnt)
991
        ; x1 and t1 are free again here, so the next IsMatch operands are
        ; prepared before the loop
992        IsMatchBranch_Pre
993        inc     cnt_R
994        jz      copy_end
995MY_ALIGN_16
996@@:
        ; store the byte read in the previous iteration, then fetch the next
997        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
998        movzx   sym, byte ptr[cnt_R * 1 + t0_R]
999        inc     cnt_R
1000        jnz     @b
1001
1002copy_end:
; lz_end_match is also the join point for the rep0-short path, which arrives
; with the single byte to store in sym.
1003lz_end_match:
1004        mov     byte ptr[dicPos], sym_L
1005        inc     dicPos
1006
1007        ; IsMatchBranch_Pre
1008        CheckLimits
1009lz_end:
        ; IsMatch == 1 -> next match; otherwise fall through to the matched literal
1010        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1011
1012
1013
1014; ---------- LITERAL MATCHED ----------
; Literal that directly follows a match (reached by fall-through from
; lz_end). It is decoded with the byte at distance rep0 as context.
1015
        ; lpMask is taken from memory here; x9 is used as "match" below
1016        LIT_PROBS LOC lpMask
1017
1018        ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1019        mov     x1, LOC rep0
1020        ; mov     dic, LOC dic_Spec
        ; dicPos (r14) is spilled because the LITM macros use x14 as "bit"
1021        mov     LOC dicPos_Spec, dicPos
1022
1023        ; state -= (state < 10) ? 3 : 6;
1024        lea     t0, [state_R - 6 * PMULT]
1025        sub     state, 3 * PMULT
1026        cmp     state, 7 * PMULT
1027        cmovae  state, t0
1028
        ; dicPos becomes the buffer offset of the match byte (with wrap-around)
1029        sub     dicPos, dic
1030        sub     dicPos, r1
1031        jae     @f
1032        add     dicPos, LOC dicBufSize
1033@@:
1034        comment ~
1035        xor     t0, t0
1036        sub     dicPos, r1
1037        cmovb   t0_R, LOC dicBufSize
1038        ~
1039
1040        movzx   match, byte ptr[dic + dicPos * 1]
1041
1042    ifdef _LZMA_SIZE_OPT
1043
1044        mov     offs, 256 * PMULT
1045        shl     match, (PSHIFT + 1)
1046        mov     bit, match
1047        mov     sym, 1
1048MY_ALIGN_16
1049litm_loop:
1050        LITM
1051        cmp     sym, 256
1052        jb      litm_loop
1053        sub     sym, 256
1054
1055    else
1056
        ; 8 bits: first, six inner, last
1057        LITM_0
1058        LITM
1059        LITM
1060        LITM
1061        LITM
1062        LITM
1063        LITM
1064        LITM_2
1065
1066    endif
1067
        ; restore probs and dicPos, prepare the next IsMatch operands, emit the byte
1068        mov     probs, LOC probs_Spec
1069        IsMatchBranch_Pre
1070        ; mov     dic, LOC dic_Spec
1071        mov     dicPos, LOC dicPos_Spec
1072        mov     byte ptr[dicPos], sym_L
1073        inc     dicPos
1074
1075        CheckLimits
1076lit_matched_end:
1077        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1078        ; IsMatchBranch
        ; Next symbol is a plain literal: reload lpMask_reg (x9 was used as
        ; match), apply the literal state transition and join lit_start_2.
1079        mov     lpMask_reg, LOC lpMask
1080        sub     state, 3 * PMULT
1081        jmp     lit_start_2
1082
1083
1084
1085; ---------- REP 0 LITERAL ----------
; Short rep: a single byte copied from distance rep0. Reached from
; IsRep_label when the IsRep0Long bit decodes as 0.
1086MY_ALIGN_32
1087IsRep0Short_label:
1088        UPDATE_0 probs_state_R, pbPos_R, IsRep0Long
1089
1090        ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1091        mov     dic, LOC dic_Spec
1092        mov     t0_R, dicPos
        ; probBranch (x2) is only used as a scratch register for rep0 here
1093        mov     probBranch, LOC rep0
1094        sub     t0_R, dic
1095
        ; undo the "add probs, RepLenCoder * PMULT" done in IsRep_label
1096        sub     probs, RepLenCoder * PMULT
1097
1098        ; state = state < kNumLitStates ? 9 : 11;
        ; IsRep_label left state at 8 or 11 (scaled); setting the low bit gives 9 or 11
1099        or      state, 1 * PMULT
1100
1101        ; the caller doesn't allow (dicPos >= limit) case for REP_SHORT
1102        ; so we don't need the following (dicPos == limit) check here:
1103        ; cmp     dicPos, LOC limit
1104        ; jae     fin_dicPos_LIMIT_REP_SHORT
1105
1106        inc     processedPos
1107
1108        IsMatchBranch_Pre
1109
1110;        xor     sym, sym
1111;        sub     t0_R, probBranch_R
1112;        cmovb   sym_R, LOC dicBufSize
1113;        add     t0_R, sym_R
        ; source offset = dicPos - rep0, wrapped by dicBufSize on borrow
1114        sub     t0_R, probBranch_R
1115        jae     @f
1116        add     t0_R, LOC dicBufSize
1117@@:
1118        movzx   sym, byte ptr[dic + t0_R * 1]
        ; lz_end_match stores sym and advances dicPos
1119        jmp     lz_end_match
1120
1121
; ---------- REP MATCHES ----------
; Reached when the IsRep bit decoded as 1. Selects which of rep0..rep3 is
; used, moves it to rep0 (shifting the others down) and continues at
; len_decode with RepLenCoder.
1122MY_ALIGN_32
1123IsRep_label:
1124        UPDATE_1 probs_state_R, 0, IsRep
1125
1126        ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
1127        ; So we don't check it here.
1128
1129        ; mov     t0, processedPos
1130        ; or      t0, LOC checkDicSize
1131        ; jz      fin_ERROR_2
1132
1133        ; state = state < kNumLitStates ? 8 : 11;
        ; probBranch (x2) is used as a scratch register for the constant
1134        cmp     state, kNumLitStates * PMULT
1135        mov     state, 8 * PMULT
1136        mov     probBranch, 11 * PMULT
1137        cmovae  state, probBranch
1138
1139        ; prob = probs + RepLenCoder;
1140        add     probs, RepLenCoder * PMULT
1141
        ; probs_state_R still holds the address derived from the previous
        ; state, so IsRepG0..G2 and IsRep0Long are indexed by that state
1142        IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
        ; IsRepG0 == 0: rep0 is reused; IsRep0Long separates short rep from long rep
1143        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
1144        UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
1145        jmp     len_decode
1146
1147MY_ALIGN_32
1148IsRepG0_label:
1149        UPDATE_1 probs_state_R, 0, IsRepG0
        ; dist2 = old rep0, dist = old rep1; rep1 = old rep0
1150        mov     dist2, LOC rep0
1151        mov     dist, LOC rep1
1152        mov     LOC rep1, dist2
1153
1154        IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
        ; IsRepG1 == 0: rep0 = old rep1
1155        mov     LOC rep0, dist
1156        jmp     len_decode
1157
1158; MY_ALIGN_32
1159IsRepG1_label:
1160        UPDATE_1 probs_state_R, 0, IsRepG1
        ; dist2 = old rep2; rep2 = old rep1
1161        mov     dist2, LOC rep2
1162        mov     LOC rep2, dist
1163
1164        IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
        ; IsRepG2 == 0: rep0 = old rep2
1165        mov     LOC rep0, dist2
1166        jmp     len_decode
1167
1168; MY_ALIGN_32
1169IsRepG2_label:
1170        UPDATE_1 probs_state_R, 0, IsRepG2
        ; rep0 = old rep3; rep3 = old rep2
1171        mov     dist, LOC rep3
1172        mov     LOC rep3, dist2
1173        mov     LOC rep0, dist
1174        jmp     len_decode
1175
1176
1177
1178; ---------- SPEC SHORT DISTANCE ----------
; Reached from the distance decoder for the small slots.
; In:  x1 = 32 + (slot >> 1), sym = slot & 3, probs = probs base.
1179
1180MY_ALIGN_32
1181short_dist:
        ; x1 = number of extra bits; none (x1 <= 0) means sym already is the distance
1182        sub     x1, 32 + 1
1183        jbe     decode_dist_end
1184        or      sym, 2
1185        shl     sym, x1_L
        ; sym_R = address of the first probability of this slot's reverse tree
        ; inside SpecPos; sym2 = byte step, doubled by REV_1_VAR on every level
1186        lea     sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]
1187        mov     sym2, PMULT ; step
1188MY_ALIGN_32
1189spec_loop:
1190        REV_1_VAR x2
1191        dec     x1
1192        jnz     spec_loop
1193
        ; REV_1_VAR overwrote probs; reload it, then turn the final address
        ; back into a distance: remove the last step, the SpecPos bias and the
        ; probs base, and scale from bytes to elements
1194        mov     probs, LOC probs_Spec
1195        sub     sym, sym2
1196        sub     sym, SpecPos * PMULT
1197        sub     sym_R, probs
1198        shr     sym, PSHIFT
1199
1200        jmp     decode_dist_end
1201
1202
1203; ---------- COPY MATCH CROSS ----------
; Match copy whose source range runs past the end of the dictionary buffer.
; The first loop copies up to the buffer end, then the source restarts at
; dic and the remainder is handled by copy_common.
1204copy_match_cross:
1205        ; t0_R - src pos
1206        ; r1 - len to dicBufSize
1207        ; cnt_R - total copy len
1208
        ; t1_R = source offset, t0_R = dic base, r1 = dicBufSize, cnt_R = -count
1209        mov     t1_R, t0_R         ; srcPos
1210        mov     t0_R, dic
1211        mov     r1, LOC dicBufSize   ;
1212        neg     cnt_R
1213@@:
1214        movzx   sym, byte ptr[t1_R * 1 + t0_R]
1215        inc     t1_R
1216        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
1217        inc     cnt_R
1218        cmp     t1_R, r1
1219        jne     @b
1220
        ; wrap: the next source byte is dic[0]; t0_R - cnt_R (cnt_R is
        ; negative) is the source end address that copy_common expects
1221        movzx   sym, byte ptr[t0_R]
1222        sub     t0_R, cnt_R
1223        jmp     copy_common
1224
1225
1226
1227
1228; fin_dicPos_LIMIT_REP_SHORT:
1229        ; mov     sym, 1
1230
; ---------- EXIT PATHS ----------
; fin_dicPos_LIMIT: copy_match found no room left (dicPos == limit); the
; whole pending length in sym is saved as remainLen and the routine returns 0.
1231fin_dicPos_LIMIT:
1232        mov     LOC remainLen, sym
1233        jmp     fin_OK
1234        ; For more strict mode we can stop decoding with error
1235        ; mov     sym, 1
1236        ; jmp     fin
1237
1238
; fin_ERROR_MATCH_DIST: the decoded distance is out of range and is not the
; end marker. Entered from end_of_payload with sym = distance + 1 and
; t1 / x1 / x2 = old rep0 / rep1 / rep2. The state is still updated as for
; a normal match and the routine returns 1.
1239fin_ERROR_MATCH_DIST:
1240
1241        ; rep3 = rep2;
1242        ; rep2 = rep1;
1243        ; rep1 = rep0;
1244        ; rep0 = distance + 1;
1245
        ; remainLen = pending length + kMatchSpecLen_Error_Data
1246        add     len_temp, kMatchSpecLen_Error_Data
1247        mov     LOC remainLen, len_temp
1248
1249        mov     LOC rep0, sym
1250        mov     LOC rep1, t1
1251        mov     LOC rep2, x1
1252        mov     LOC rep3, x2
1253
1254        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
1255        cmp     state, (kNumStates + kNumLitStates) * PMULT
1256        mov     state, kNumLitStates * PMULT
1257        mov     t0, (kNumLitStates + 3) * PMULT
1258        cmovae  state, t0
1259
1260        ; jmp     fin_OK
1261        mov     sym, 1
1262        jmp     fin
1263
; end_of_payload: distance >= allowed maximum. Only distance 0xFFFFFFFF
; (inc wraps to 0) is the end marker; anything else is an error.
1264end_of_payload:
1265        inc     sym
1266        jnz     fin_ERROR_MATCH_DIST
1267
1268        mov     LOC remainLen, kMatchSpecLenStart
        ; undo the "add state, kNumStates * PMULT" applied for a new match
1269        sub     state, kNumStates * PMULT
1270
1271fin_OK:
1272        xor     sym, sym
1273
; fin: common epilogue. sym holds the return value (0 or 1).
1274fin:
1275        NORM
1276
        ; r1 is the base register of the GLOB accessor
1277        mov     r1, LOC lzmaPtr
1278
        ; dicPos goes back to the caller as an offset into dic
1279        sub     dicPos, LOC dic_Spec
1280        mov     GLOB dicPos_Spec, dicPos
1281        mov     GLOB buf_Spec, buf
1282        mov     GLOB range_Spec, range
1283        mov     GLOB code_Spec, cod
        ; state was kept multiplied by PMULT; store the unscaled value
1284        shr     state, PSHIFT
1285        mov     GLOB state_Spec, state
1286        mov     GLOB processedPos_Spec, processedPos
1287
1288        RESTORE_VAR(remainLen)
1289        RESTORE_VAR(rep0)
1290        RESTORE_VAR(rep1)
1291        RESTORE_VAR(rep2)
1292        RESTORE_VAR(rep3)
1293
        ; return value
1294        mov     x0, sym
1295
        ; back to the caller's stack
1296        mov     RSP, LOC Old_RSP
1297
1298MY_POP_PRESERVED_ABI_REGS
1299MY_ENDP
1300
1301_TEXT$LZMADECOPT ENDS
1302
1303end
1304