1 /* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
2 2023-04-02 : Igor Pavlov : Public domain */
3
4 #include "Precomp.h"
5
6 /* #define SHOW_STAT */
7 #ifdef SHOW_STAT
8 #include <stdio.h>
9 #define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));
10 #else
11 #define PRF2(s)
12 #endif
13
14 #include "Bcj2.h"
15 #include "CpuArch.h"
16
17 #define kTopValue ((UInt32)1 << 24)
18 #define kNumBitModelTotalBits 11
19 #define kBitModelTotal (1 << kNumBitModelTotalBits)
20 #define kNumMoveBits 5
21
Bcj2Enc_Init(CBcj2Enc * p)22 void Bcj2Enc_Init(CBcj2Enc *p)
23 {
24 unsigned i;
25 p->state = BCJ2_ENC_STATE_ORIG;
26 p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
27 p->context = 0;
28 p->flushRem = 5;
29 p->isFlushState = 0;
30 p->cache = 0;
31 p->range = 0xffffffff;
32 p->low = 0;
33 p->cacheSize = 1;
34 p->ip64 = 0;
35 p->fileIp64 = 0;
36 p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
37 p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
38 // p->relatExcludeBits = 0;
39 p->tempPos = 0;
40 for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++)
41 p->probs[i] = kBitModelTotal >> 1;
42 }
43
44 // Z7_NO_INLINE
45 Z7_FORCE_INLINE
Bcj2_RangeEnc_ShiftLow(CBcj2Enc * p)46 static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
47 {
48 const UInt32 low = (UInt32)p->low;
49 const unsigned high = (unsigned)
50 #if defined(Z7_MSC_VER_ORIGINAL) \
51 && defined(MY_CPU_X86) \
52 && defined(MY_CPU_LE) \
53 && !defined(MY_CPU_64BIT)
54 // we try to rid of __aullshr() call in MSVS-x86
55 (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
56 #else
57 (p->low >> 32);
58 #endif
59 if (low < (UInt32)0xff000000 || high != 0)
60 {
61 Byte *buf = p->bufs[BCJ2_STREAM_RC];
62 do
63 {
64 if (buf == p->lims[BCJ2_STREAM_RC])
65 {
66 p->state = BCJ2_STREAM_RC;
67 p->bufs[BCJ2_STREAM_RC] = buf;
68 return True;
69 }
70 *buf++ = (Byte)(p->cache + high);
71 p->cache = 0xff;
72 }
73 while (--p->cacheSize);
74 p->bufs[BCJ2_STREAM_RC] = buf;
75 p->cache = (Byte)(low >> 24);
76 }
77 p->cacheSize++;
78 p->low = low << 8;
79 return False;
80 }
81
82
83 /*
84 We can use 2 alternative versions of code:
85 1) non-marker version:
86 Byte CBcj2Enc::context
87 Byte temp[8];
88 Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
89 Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
90 with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.
91
92 2) marker version:
93 UInt32 CBcj2Enc::context
94 Byte CBcj2Enc::temp[4];
95 MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains finded marker.
96 it's allowed that
97 one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
98 and another call of Bcj2Enc_Encode_2() does offset conversion.
99 So different values of (fileIp) and (fileSize) are possible
100 in these different Bcj2Enc_Encode_2() calls.
101
102 Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
103 So we use non-marker version.
104 */
105
106 /*
107 Corner cases with overlap in multi-block.
108 before v23: there was one corner case, where converted instruction
109 could start in one sub-stream and finish in next sub-stream.
110 If multi-block (solid) encoding is used,
111 and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
112 and (0f) is last byte of previous sub-stream
113 and (8x) is first byte of current sub-stream
114 then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
115 BCJ2 encoder can converts 32-bit offset for that (0f 8x) cortage,
116 if that offset meets limit requirements.
117 If encoder allows 32-bit offset conversion for such overlap case,
118 then the data in 3 uncompressed BCJ2 streams for some sub-stream
119 can depend from data of previous sub-stream.
120 That corner case is not big problem, and it's rare case.
121 Since v23.00 we do additional check to prevent conversions in such overlap cases.
122 */
123
124 /*
125 Bcj2Enc_Encode_2() output variables at exit:
126 {
127 if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
128 {
129 it means that encoder needs more input data.
130 if (p->srcLim == p->src) at exit, then
131 {
132 (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
133 all input data were read and processed, and we are ready for
134 new input data.
135 }
136 else
137 {
138 (p->srcLim != p->src)
139 (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
140 The encoder have found e8/e9/0f_8x marker,
141 and p->src points to last byte of that marker,
142 Bcj2Enc_Encode_2() needs more input data to get totally
143 5 bytes (last byte of marker and 32-bit branch offset)
144 as continuous array starting from p->src.
145 (p->srcLim - p->src < 5) requirement is met after exit.
146 So non-processed resedue from p->src to p->srcLim is always less than 5 bytes.
147 }
148 }
149 }
150 */
151
152 Z7_NO_INLINE
Bcj2Enc_Encode_2(CBcj2Enc * p)153 static void Bcj2Enc_Encode_2(CBcj2Enc *p)
154 {
155 if (!p->isFlushState)
156 {
157 const Byte *src;
158 UInt32 v;
159 {
160 const unsigned state = p->state;
161 if (BCJ2_IS_32BIT_STREAM(state))
162 {
163 Byte *cur = p->bufs[state];
164 if (cur == p->lims[state])
165 return;
166 SetBe32a(cur, p->tempTarget)
167 p->bufs[state] = cur + 4;
168 }
169 }
170 p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
171 src = p->src;
172 v = p->context;
173
174 // #define WRITE_CONTEXT p->context = v; // for marker version
175 #define WRITE_CONTEXT p->context = (Byte)v;
176 #define WRITE_CONTEXT_AND_SRC p->src = src; WRITE_CONTEXT
177
178 for (;;)
179 {
180 // const Byte *src;
181 // UInt32 v;
182 CBcj2Enc_ip_unsigned ip;
183 if (p->range < kTopValue)
184 {
185 // to reduce register pressure and code size: we save and restore local variables.
186 WRITE_CONTEXT_AND_SRC
187 if (Bcj2_RangeEnc_ShiftLow(p))
188 return;
189 p->range <<= 8;
190 src = p->src;
191 v = p->context;
192 }
193 // src = p->src;
194 // #define MARKER_FLAG ((UInt32)1 << 17)
195 // if ((v & MARKER_FLAG) == 0) // for marker version
196 {
197 const Byte *srcLim;
198 Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
199 {
200 const SizeT remSrc = (SizeT)(p->srcLim - src);
201 SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
202 if (rem >= remSrc)
203 rem = remSrc;
204 srcLim = src + rem;
205 }
206 /* p->context contains context of previous byte:
207 bits [0 : 7] : src[-1], if (src) was changed in this call
208 bits [8 : 31] : are undefined for non-marker version
209 */
210 // v = p->context;
211 #define NUM_SHIFT_BITS 24
212 #define CONV_FLAG ((UInt32)1 << 16)
213 #define ONE_ITER { \
214 b = src[0]; \
215 *dest++ = (Byte)b; \
216 v = (v << NUM_SHIFT_BITS) | b; \
217 if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
218 if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
219 ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
220 src++; if (src == srcLim) { break; } }
221
222 if (src != srcLim)
223 for (;;)
224 {
225 /* clang can generate ineffective code with setne instead of two jcc instructions.
226 we can use 2 iterations and external (unsigned b) to avoid that ineffective code genaration. */
227 unsigned b;
228 ONE_ITER
229 ONE_ITER
230 }
231
232 ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
233 p->bufs[BCJ2_STREAM_MAIN] = dest;
234 p->ip64 = ip;
235
236 if (src == srcLim)
237 {
238 WRITE_CONTEXT_AND_SRC
239 if (src != p->srcLim)
240 {
241 p->state = BCJ2_STREAM_MAIN;
242 return;
243 }
244 /* (p->src == p->srcLim)
245 (p->state == BCJ2_ENC_STATE_ORIG) */
246 if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
247 return;
248 /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */
249 // (p->flushRem == 5);
250 p->isFlushState = 1;
251 break;
252 }
253 src++;
254 // p->src = src;
255 }
256 // ip = p->ip; // for marker version
257 /* marker was found */
258 /* (v) contains marker that was found:
259 bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
260 : value of src[-2] : xx/xx/0f
261 bits [0 : 7] : value of src[-1] : e8/e9/8x
262 */
263 {
264 {
265 #if NUM_SHIFT_BITS != 24
266 v &= ~(UInt32)CONV_FLAG;
267 #endif
268 // UInt32 relat = 0;
269 if ((SizeT)(p->srcLim - src) >= 4)
270 {
271 /*
272 if (relat != 0 || (Byte)v != 0xe8)
273 BoolInt isBigOffset = True;
274 */
275 const UInt32 relat = GetUi32(src);
276 /*
277 #define EXCLUDE_FLAG ((UInt32)1 << 4)
278 #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
279 if (p->relatExcludeBits != 0)
280 {
281 const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
282 isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
283 }
284 // isBigOffset = False; // for debug
285 */
286 ip -= p->fileIp64;
287 // Use the following if check, if (ip) is 64-bit:
288 if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9)
289 if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
290 if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
291 v |= CONV_FLAG;
292 }
293 else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
294 {
295 // (p->srcLim - src < 4)
296 // /*
297 // for non-marker version
298 p->ip64--; // p->ip = ip - 1;
299 p->bufs[BCJ2_STREAM_MAIN]--;
300 src--;
301 v >>= NUM_SHIFT_BITS;
302 // (0 < p->srcLim - p->src <= 4)
303 // */
304 // v |= MARKER_FLAG; // for marker version
305 /* (p->state == BCJ2_ENC_STATE_ORIG) */
306 WRITE_CONTEXT_AND_SRC
307 return;
308 }
309 {
310 const unsigned c = ((v + 0x17) >> 6) & 1;
311 CBcj2Prob *prob = p->probs + (unsigned)
312 (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
313 /*
314 ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
315 ((Byte)v < 0xe8 ? 0 : 1)); // ((v >> 5) & 1));
316 */
317 const unsigned ttt = *prob;
318 const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
319 if ((v & CONV_FLAG) == 0)
320 {
321 // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
322 // v = (Byte)v; // for marker version
323 p->range = bound;
324 *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
325 // WRITE_CONTEXT_AND_SRC
326 continue;
327 }
328 p->low += bound;
329 p->range -= bound;
330 *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
331 }
332 // p->context = src[3];
333 {
334 // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
335 const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
336 ip = p->ip64;
337 v = GetUi32(src); // relat
338 ip += 4;
339 p->ip64 = ip;
340 src += 4;
341 // p->src = src;
342 {
343 const UInt32 absol = (UInt32)ip + v;
344 Byte *cur = p->bufs[cj];
345 v >>= 24;
346 // WRITE_CONTEXT
347 if (cur == p->lims[cj])
348 {
349 p->state = cj;
350 p->tempTarget = absol;
351 WRITE_CONTEXT_AND_SRC
352 return;
353 }
354 SetBe32a(cur, absol)
355 p->bufs[cj] = cur + 4;
356 }
357 }
358 }
359 }
360 } // end of loop
361 }
362
363 for (; p->flushRem != 0; p->flushRem--)
364 if (Bcj2_RangeEnc_ShiftLow(p))
365 return;
366 p->state = BCJ2_ENC_STATE_FINISHED;
367 }
368
369
370 /*
371 BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.
372 So base function Bcj2Enc_Encode_2()
373 in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
374 (p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)
375 Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.
376 so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
377 then (p->src == p->srcLim).
378 And the caller's code is simpler with Bcj2Enc_Encode().
379 */
380
381 Z7_NO_INLINE
Bcj2Enc_Encode(CBcj2Enc * p)382 void Bcj2Enc_Encode(CBcj2Enc *p)
383 {
384 PRF2("\n----")
385 if (p->tempPos != 0)
386 {
387 /* extra: number of bytes that were copied from (src) to (temp) buffer in this call */
388 unsigned extra = 0;
389 /* We will touch only minimal required number of bytes in input (src) stream.
390 So we will add input bytes from (src) stream to temp[] with step of 1 byte.
391 We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call
392 in first loop iteration because
393 - previous call of Bcj2Enc_Encode() could use another (finishMode),
394 - previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG).
395 the case with full temp[] buffer (p->tempPos == 4) is possible here.
396 */
397 for (;;)
398 {
399 // (0 < p->tempPos <= 5) // in non-marker version
400 /* p->src : the current src data position including extra bytes
401 that were copied to temp[] buffer in this call */
402 const Byte *src = p->src;
403 const Byte *srcLim = p->srcLim;
404 const EBcj2Enc_FinishMode finishMode = p->finishMode;
405 if (src != srcLim)
406 {
407 /* if there are some src data after the data copied to temp[],
408 then we use MODE_CONTINUE for temp data */
409 p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
410 }
411 p->src = p->temp;
412 p->srcLim = p->temp + p->tempPos;
413 PRF2(" ")
414 Bcj2Enc_Encode_2(p);
415 {
416 const unsigned num = (unsigned)(p->src - p->temp);
417 const unsigned tempPos = p->tempPos - num;
418 unsigned i;
419 p->tempPos = tempPos;
420 for (i = 0; i < tempPos; i++)
421 p->temp[i] = p->temp[(SizeT)i + num];
422 // tempPos : number of bytes in temp buffer
423 p->src = src;
424 p->srcLim = srcLim;
425 p->finishMode = finishMode;
426 if (p->state != BCJ2_ENC_STATE_ORIG)
427 {
428 // (p->tempPos <= 4) // in non-marker version
429 /* if (the reason of exit from Bcj2Enc_Encode_2()
430 is not BCJ2_ENC_STATE_ORIG),
431 then we exit from Bcj2Enc_Encode() with same reason */
432 // optional code begin : we rollback (src) and tempPos, if it's possible:
433 if (extra >= tempPos)
434 extra = tempPos;
435 p->src = src - extra;
436 p->tempPos = tempPos - extra;
437 // optional code end : rollback of (src) and tempPos
438 return;
439 }
440 /* (p->tempPos <= 4)
441 (p->state == BCJ2_ENC_STATE_ORIG)
442 so encoder needs more data than in temp[] */
443 if (src == srcLim)
444 return; // src buffer has no more input data.
445 /* (src != srcLim)
446 so we can provide more input data from src for Bcj2Enc_Encode_2() */
447 if (extra >= tempPos)
448 {
449 /* (extra >= tempPos) means that temp buffer contains
450 only data from src buffer of this call.
451 So now we can encode without temp buffer */
452 p->src = src - tempPos; // rollback (src)
453 p->tempPos = 0;
454 break;
455 }
456 // we append one additional extra byte from (src) to temp[] buffer:
457 p->temp[tempPos] = *src;
458 p->tempPos = tempPos + 1;
459 // (0 < p->tempPos <= 5) // in non-marker version
460 p->src = src + 1;
461 extra++;
462 }
463 }
464 }
465
466 PRF2("++++")
467 // (p->tempPos == 0)
468 Bcj2Enc_Encode_2(p);
469 PRF2("====")
470
471 if (p->state == BCJ2_ENC_STATE_ORIG)
472 {
473 const Byte *src = p->src;
474 const Byte *srcLim = p->srcLim;
475 const unsigned rem = (unsigned)(srcLim - src);
476 /* (rem <= 4) here.
477 if (p->src != p->srcLim), then
478 - we copy non-processed bytes from (p->src) to temp[] buffer,
479 - we set p->src equal to p->srcLim.
480 */
481 if (rem)
482 {
483 unsigned i = 0;
484 p->src = srcLim;
485 p->tempPos = rem;
486 // (0 < p->tempPos <= 4)
487 do
488 p->temp[i] = src[i];
489 while (++i != rem);
490 }
491 // (p->tempPos <= 4)
492 // (p->src == p->srcLim)
493 }
494 }
495
496 #undef PRF2
497 #undef CONV_FLAG
498 #undef MARKER_FLAG
499 #undef WRITE_CONTEXT
500 #undef WRITE_CONTEXT_AND_SRC
501 #undef ONE_ITER
502 #undef NUM_SHIFT_BITS
503 #undef kTopValue
504 #undef kNumBitModelTotalBits
505 #undef kBitModelTotal
506 #undef kNumMoveBits
507