• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
2 2023-04-02 : Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 /* #define SHOW_STAT */
7 #ifdef SHOW_STAT
8 #include <stdio.h>
9 #define PRF2(s) printf("%s ip=%8x  tempPos=%d  src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));
10 #else
11 #define PRF2(s)
12 #endif
13 
14 #include "Bcj2.h"
15 #include "CpuArch.h"
16 
17 #define kTopValue ((UInt32)1 << 24)
18 #define kNumBitModelTotalBits 11
19 #define kBitModelTotal (1 << kNumBitModelTotalBits)
20 #define kNumMoveBits 5
21 
Bcj2Enc_Init(CBcj2Enc * p)22 void Bcj2Enc_Init(CBcj2Enc *p)
23 {
24   unsigned i;
25   p->state = BCJ2_ENC_STATE_ORIG;
26   p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
27   p->context = 0;
28   p->flushRem = 5;
29   p->isFlushState = 0;
30   p->cache = 0;
31   p->range = 0xffffffff;
32   p->low = 0;
33   p->cacheSize = 1;
34   p->ip64 = 0;
35   p->fileIp64 = 0;
36   p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
37   p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
38   // p->relatExcludeBits = 0;
39   p->tempPos = 0;
40   for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++)
41     p->probs[i] = kBitModelTotal >> 1;
42 }
43 
44 // Z7_NO_INLINE
45 Z7_FORCE_INLINE
Bcj2_RangeEnc_ShiftLow(CBcj2Enc * p)46 static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
47 {
48   const UInt32 low = (UInt32)p->low;
49   const unsigned high = (unsigned)
50     #if defined(Z7_MSC_VER_ORIGINAL) \
51         && defined(MY_CPU_X86) \
52         && defined(MY_CPU_LE) \
53         && !defined(MY_CPU_64BIT)
54       // we try to rid of __aullshr() call in MSVS-x86
55       (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
56     #else
57       (p->low >> 32);
58     #endif
59   if (low < (UInt32)0xff000000 || high != 0)
60   {
61     Byte *buf = p->bufs[BCJ2_STREAM_RC];
62     do
63     {
64       if (buf == p->lims[BCJ2_STREAM_RC])
65       {
66         p->state = BCJ2_STREAM_RC;
67         p->bufs[BCJ2_STREAM_RC] = buf;
68         return True;
69       }
70       *buf++ = (Byte)(p->cache + high);
71       p->cache = 0xff;
72     }
73     while (--p->cacheSize);
74     p->bufs[BCJ2_STREAM_RC] = buf;
75     p->cache = (Byte)(low >> 24);
76   }
77   p->cacheSize++;
78   p->low = low << 8;
79   return False;
80 }
81 
82 
83 /*
84 We can use 2 alternative versions of code:
85 1) non-marker version:
86   Byte CBcj2Enc::context
87   Byte temp[8];
88   Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
89   Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
90   with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.
91 
92 2) marker version:
93   UInt32 CBcj2Enc::context
94   Byte CBcj2Enc::temp[4];
95   MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains finded marker.
96   it's allowed that
97     one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
98     and another call of Bcj2Enc_Encode_2() does offset conversion.
99     So different values of (fileIp) and (fileSize) are possible
100     in these different Bcj2Enc_Encode_2() calls.
101 
102 Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
103 So we use non-marker version.
104 */
105 
106 /*
107   Corner cases with overlap in multi-block.
108   before v23: there was one corner case, where converted instruction
109     could start in one sub-stream and finish in next sub-stream.
110   If multi-block (solid) encoding is used,
111     and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
112     and (0f) is last byte of previous sub-stream
113     and (8x) is first byte of current sub-stream
114   then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
115   BCJ2 encoder can converts 32-bit offset for that (0f 8x) cortage,
116   if that offset meets limit requirements.
117   If encoder allows 32-bit offset conversion for such overlap case,
118   then the data in 3 uncompressed BCJ2 streams for some sub-stream
119   can depend from data of previous sub-stream.
120   That corner case is not big problem, and it's rare case.
121   Since v23.00 we do additional check to prevent conversions in such overlap cases.
122 */
123 
124 /*
125   Bcj2Enc_Encode_2() output variables at exit:
126   {
127     if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
128     {
129       it means that encoder needs more input data.
130       if (p->srcLim == p->src) at exit, then
131       {
132         (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
133         all input data were read and processed, and we are ready for
134         new input data.
135       }
136       else
137       {
138         (p->srcLim != p->src)
139         (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
140           The encoder have found e8/e9/0f_8x marker,
141           and p->src points to last byte of that marker,
142           Bcj2Enc_Encode_2() needs more input data to get totally
143           5 bytes (last byte of marker and 32-bit branch offset)
144           as continuous array starting from p->src.
145         (p->srcLim - p->src < 5) requirement is met after exit.
146           So non-processed resedue from p->src to p->srcLim is always less than 5 bytes.
147       }
148     }
149   }
150 */
151 
152 Z7_NO_INLINE
Bcj2Enc_Encode_2(CBcj2Enc * p)153 static void Bcj2Enc_Encode_2(CBcj2Enc *p)
154 {
155   if (!p->isFlushState)
156   {
157     const Byte *src;
158     UInt32 v;
159     {
160       const unsigned state = p->state;
161       if (BCJ2_IS_32BIT_STREAM(state))
162       {
163         Byte *cur = p->bufs[state];
164         if (cur == p->lims[state])
165           return;
166         SetBe32a(cur, p->tempTarget)
167         p->bufs[state] = cur + 4;
168       }
169     }
170     p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
171     src = p->src;
172     v = p->context;
173 
174     // #define WRITE_CONTEXT  p->context = v; // for marker version
175     #define WRITE_CONTEXT           p->context = (Byte)v;
176     #define WRITE_CONTEXT_AND_SRC   p->src = src;  WRITE_CONTEXT
177 
178     for (;;)
179     {
180       // const Byte *src;
181       // UInt32 v;
182       CBcj2Enc_ip_unsigned ip;
183       if (p->range < kTopValue)
184       {
185         // to reduce register pressure and code size: we save and restore local variables.
186         WRITE_CONTEXT_AND_SRC
187         if (Bcj2_RangeEnc_ShiftLow(p))
188           return;
189         p->range <<= 8;
190         src = p->src;
191         v = p->context;
192       }
193       // src = p->src;
194       // #define MARKER_FLAG  ((UInt32)1 << 17)
195       // if ((v & MARKER_FLAG) == 0) // for marker version
196       {
197         const Byte *srcLim;
198         Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
199         {
200           const SizeT remSrc = (SizeT)(p->srcLim - src);
201           SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
202           if (rem >= remSrc)
203             rem = remSrc;
204           srcLim = src + rem;
205         }
206         /* p->context contains context of previous byte:
207            bits [0 : 7]  : src[-1], if (src) was changed in this call
208            bits [8 : 31] : are undefined for non-marker version
209         */
210         // v = p->context;
211         #define NUM_SHIFT_BITS  24
212         #define CONV_FLAG  ((UInt32)1 << 16)
213         #define ONE_ITER { \
214           b = src[0]; \
215           *dest++ = (Byte)b; \
216           v = (v << NUM_SHIFT_BITS) | b; \
217           if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
218           if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
219               ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
220           src++; if (src == srcLim) { break; } }
221 
222         if (src != srcLim)
223         for (;;)
224         {
225           /* clang can generate ineffective code with setne instead of two jcc instructions.
226              we can use 2 iterations and external (unsigned b) to avoid that ineffective code genaration. */
227           unsigned b;
228           ONE_ITER
229           ONE_ITER
230         }
231 
232         ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
233         p->bufs[BCJ2_STREAM_MAIN] = dest;
234         p->ip64 = ip;
235 
236         if (src == srcLim)
237         {
238           WRITE_CONTEXT_AND_SRC
239           if (src != p->srcLim)
240           {
241             p->state = BCJ2_STREAM_MAIN;
242             return;
243           }
244           /* (p->src == p->srcLim)
245           (p->state == BCJ2_ENC_STATE_ORIG) */
246           if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
247             return;
248           /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */
249           // (p->flushRem == 5);
250           p->isFlushState = 1;
251           break;
252         }
253         src++;
254         // p->src = src;
255       }
256       // ip = p->ip; // for marker version
257       /* marker was found */
258       /* (v) contains marker that was found:
259            bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
260                          : value of src[-2] : xx/xx/0f
261            bits [0 : 7]  : value of src[-1] : e8/e9/8x
262       */
263       {
264         {
265         #if NUM_SHIFT_BITS != 24
266           v &= ~(UInt32)CONV_FLAG;
267         #endif
268           // UInt32 relat = 0;
269           if ((SizeT)(p->srcLim - src) >= 4)
270           {
271             /*
272             if (relat != 0 || (Byte)v != 0xe8)
273             BoolInt isBigOffset = True;
274             */
275             const UInt32 relat = GetUi32(src);
276             /*
277             #define EXCLUDE_FLAG  ((UInt32)1 << 4)
278             #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
279             if (p->relatExcludeBits != 0)
280             {
281               const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
282               isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
283             }
284             // isBigOffset = False; // for debug
285             */
286             ip -= p->fileIp64;
287             // Use the following if check, if (ip) is 64-bit:
288             if (ip > (((v + 0x20) >> 5) & 1))  // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9)
289             if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
290             if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
291               v |= CONV_FLAG;
292           }
293           else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
294           {
295             // (p->srcLim - src < 4)
296             // /*
297             // for non-marker version
298             p->ip64--; // p->ip = ip - 1;
299             p->bufs[BCJ2_STREAM_MAIN]--;
300             src--;
301             v >>= NUM_SHIFT_BITS;
302             // (0 < p->srcLim - p->src <= 4)
303             // */
304             // v |= MARKER_FLAG; // for marker version
305             /* (p->state == BCJ2_ENC_STATE_ORIG) */
306             WRITE_CONTEXT_AND_SRC
307             return;
308           }
309           {
310             const unsigned c = ((v + 0x17) >> 6) & 1;
311             CBcj2Prob *prob = p->probs + (unsigned)
312                 (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
313             /*
314                 ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
315                 ((Byte)v < 0xe8 ? 0 : 1));  // ((v >> 5) & 1));
316             */
317             const unsigned ttt = *prob;
318             const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
319             if ((v & CONV_FLAG) == 0)
320             {
321               // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
322               // v = (Byte)v; // for marker version
323               p->range = bound;
324               *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
325               // WRITE_CONTEXT_AND_SRC
326               continue;
327             }
328             p->low += bound;
329             p->range -= bound;
330             *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
331           }
332           // p->context = src[3];
333           {
334             // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
335             const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
336             ip = p->ip64;
337             v = GetUi32(src); // relat
338             ip += 4;
339             p->ip64 = ip;
340             src += 4;
341             // p->src = src;
342             {
343               const UInt32 absol = (UInt32)ip + v;
344               Byte *cur = p->bufs[cj];
345               v >>= 24;
346               // WRITE_CONTEXT
347               if (cur == p->lims[cj])
348               {
349                 p->state = cj;
350                 p->tempTarget = absol;
351                 WRITE_CONTEXT_AND_SRC
352                 return;
353               }
354               SetBe32a(cur, absol)
355               p->bufs[cj] = cur + 4;
356             }
357           }
358         }
359       }
360     } // end of loop
361   }
362 
363   for (; p->flushRem != 0; p->flushRem--)
364     if (Bcj2_RangeEnc_ShiftLow(p))
365       return;
366   p->state = BCJ2_ENC_STATE_FINISHED;
367 }
368 
369 
370 /*
371 BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.
372 So base function Bcj2Enc_Encode_2()
373   in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
374   (p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)
375 Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.
376   so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
377     then (p->src == p->srcLim).
378   And the caller's code is simpler with Bcj2Enc_Encode().
379 */
380 
381 Z7_NO_INLINE
Bcj2Enc_Encode(CBcj2Enc * p)382 void Bcj2Enc_Encode(CBcj2Enc *p)
383 {
384   PRF2("\n----")
385   if (p->tempPos != 0)
386   {
387     /* extra: number of bytes that were copied from (src) to (temp) buffer in this call */
388     unsigned extra = 0;
389     /* We will touch only minimal required number of bytes in input (src) stream.
390        So we will add input bytes from (src) stream to temp[] with step of 1 byte.
391        We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call
392          in first loop iteration because
393          - previous call of Bcj2Enc_Encode() could use another (finishMode),
394          - previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG).
395        the case with full temp[] buffer (p->tempPos == 4) is possible here.
396     */
397     for (;;)
398     {
399       // (0 < p->tempPos <= 5) // in non-marker version
400       /* p->src : the current src data position including extra bytes
401                   that were copied to temp[] buffer in this call */
402       const Byte *src = p->src;
403       const Byte *srcLim = p->srcLim;
404       const EBcj2Enc_FinishMode finishMode = p->finishMode;
405       if (src != srcLim)
406       {
407         /* if there are some src data after the data copied to temp[],
408            then we use MODE_CONTINUE for temp data */
409         p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
410       }
411       p->src = p->temp;
412       p->srcLim = p->temp + p->tempPos;
413       PRF2("    ")
414       Bcj2Enc_Encode_2(p);
415       {
416         const unsigned num = (unsigned)(p->src - p->temp);
417         const unsigned tempPos = p->tempPos - num;
418         unsigned i;
419         p->tempPos = tempPos;
420         for (i = 0; i < tempPos; i++)
421           p->temp[i] = p->temp[(SizeT)i + num];
422         // tempPos : number of bytes in temp buffer
423         p->src = src;
424         p->srcLim = srcLim;
425         p->finishMode = finishMode;
426         if (p->state != BCJ2_ENC_STATE_ORIG)
427         {
428           // (p->tempPos <= 4) // in non-marker version
429           /* if (the reason of exit from Bcj2Enc_Encode_2()
430                  is not BCJ2_ENC_STATE_ORIG),
431              then we exit from Bcj2Enc_Encode() with same reason */
432           // optional code begin : we rollback (src) and tempPos, if it's possible:
433           if (extra >= tempPos)
434             extra = tempPos;
435           p->src = src - extra;
436           p->tempPos = tempPos - extra;
437           // optional code end : rollback of (src) and tempPos
438           return;
439         }
440         /* (p->tempPos <= 4)
441            (p->state == BCJ2_ENC_STATE_ORIG)
442              so encoder needs more data than in temp[] */
443         if (src == srcLim)
444           return; // src buffer has no more input data.
445         /* (src != srcLim)
446            so we can provide more input data from src for Bcj2Enc_Encode_2() */
447         if (extra >= tempPos)
448         {
449           /* (extra >= tempPos) means that temp buffer contains
450              only data from src buffer of this call.
451              So now we can encode without temp buffer */
452           p->src = src - tempPos; // rollback (src)
453           p->tempPos = 0;
454           break;
455         }
456         // we append one additional extra byte from (src) to temp[] buffer:
457         p->temp[tempPos] = *src;
458         p->tempPos = tempPos + 1;
459         // (0 < p->tempPos <= 5) // in non-marker version
460         p->src = src + 1;
461         extra++;
462       }
463     }
464   }
465 
466   PRF2("++++")
467   // (p->tempPos == 0)
468   Bcj2Enc_Encode_2(p);
469   PRF2("====")
470 
471   if (p->state == BCJ2_ENC_STATE_ORIG)
472   {
473     const Byte *src = p->src;
474     const Byte *srcLim = p->srcLim;
475     const unsigned rem = (unsigned)(srcLim - src);
476     /* (rem <= 4) here.
477        if (p->src != p->srcLim), then
478          - we copy non-processed bytes from (p->src) to temp[] buffer,
479          - we set p->src equal to p->srcLim.
480     */
481     if (rem)
482     {
483       unsigned i = 0;
484       p->src = srcLim;
485       p->tempPos = rem;
486       // (0 < p->tempPos <= 4)
487       do
488         p->temp[i] = src[i];
489       while (++i != rem);
490     }
491     // (p->tempPos <= 4)
492     // (p->src == p->srcLim)
493   }
494 }
495 
496 #undef PRF2
497 #undef CONV_FLAG
498 #undef MARKER_FLAG
499 #undef WRITE_CONTEXT
500 #undef WRITE_CONTEXT_AND_SRC
501 #undef ONE_ITER
502 #undef NUM_SHIFT_BITS
503 #undef kTopValue
504 #undef kNumBitModelTotalBits
505 #undef kBitModelTotal
506 #undef kNumMoveBits
507