1 // UTFConvert.cpp
2
3 #include "StdAfx.h"
4
5 // #include <stdio.h>
6
7 #include "MyTypes.h"
8 #include "UTFConvert.h"
9
10
// _UTF8_RAW_NON_UTF8_SUPPORTED is defined only when _WCHART_IS_16BIT is not defined
// and the target is not Apple. Later in this file it selects which default
// conversion flags are compiled in.
#ifndef _WCHART_IS_16BIT
#ifndef __APPLE__
// we define it if the system supports files with non-utf8 symbols:
#define _UTF8_RAW_NON_UTF8_SUPPORTED
#endif
#endif
17
18 /*
19 _UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte
20
21 n : _UTF8_START(n) : Bits of code point
22
23 0 : 0x80 : : unused
24 1 : 0xC0 : 11 :
25 2 : 0xE0 : 16 : Basic Multilingual Plane
26 3 : 0xF0 : 21 : Unicode space
27 4 : 0xF8 : 26 :
28 5 : 0xFC : 31 : UCS-4 : wcstombs() in ubuntu is limited to that value
29 6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value
30 7 : 0xFF :
31 */
32
// Smallest head-byte value that announces (n) tail bytes: the top (n + 1) bits are set.
#define _UTF8_START(n) (0x100 - (1 << (7 - (n))))

// One step of head-byte classification. If (c) is below the head base for
// (n + 1) tail bytes, it stores (n) into (numBytes) and removes the head marker
// bits from (val). The names (c), (numBytes) and (val) must exist at the
// expansion site. Intended to be chained: PARSE2(1) else PARSE2(2) else ...
#define _UTF8_HEAD_PARSE2(n) \
    if (c < _UTF8_START((n) + 1)) \
    { numBytes = (n); val -= _UTF8_START(n); }
38
#ifndef _WCHART_IS_16BIT

/*
  if (wchar_t is 32-bit), we can support large points in long UTF-8 sequence,
  when we convert wchar_t strings to UTF-8:
    (_UTF8_NUM_TAIL_BYTES_MAX == 3) : (21-bits points) - Unicode
    (_UTF8_NUM_TAIL_BYTES_MAX == 5) : (31-bits points) - UCS-4
    (_UTF8_NUM_TAIL_BYTES_MAX == 6) : (36-bit hack)
*/

// Selected limit is 5 tail bytes. The "#if _UTF8_NUM_TAIL_BYTES_MAX >= 6"
// branches in this file are therefore compiled out.
#define _UTF8_NUM_TAIL_BYTES_MAX 5
#endif
51
52 /*
53 #define _UTF8_HEAD_PARSE \
54 UInt32 val = c; \
55 _UTF8_HEAD_PARSE2(1) \
56 else _UTF8_HEAD_PARSE2(2) \
57 else _UTF8_HEAD_PARSE2(3) \
58 else _UTF8_HEAD_PARSE2(4) \
59 else _UTF8_HEAD_PARSE2(5) \
60 #if _UTF8_NUM_TAIL_BYTES_MAX >= 6
61 else _UTF8_HEAD_PARSE2(6)
62 #endif
63 */
64
// Declares (val) from the head byte (c) and classifies it as a head for
// 1, 2 or 3 tail bytes. The last branch assumes 3 tail bytes without testing (c),
// so the expansion site must already have rejected larger head bytes.
// Requires (c) and (numBytes) at the expansion site.
#define _UTF8_HEAD_PARSE_MAX_3_BYTES \
    UInt32 val = c; \
         _UTF8_HEAD_PARSE2(1) \
    else _UTF8_HEAD_PARSE2(2) \
    else { numBytes = 3; val -= _UTF8_START(3); }
70
71
// Exclusive upper bound of the points that fit into a sequence with (n) tail bytes:
// n = 1 : 1 << 11,  n = 2 : 1 << 16,  n = 3 : 1 << 21, ...
#define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6))


// Points at or above this value are written as a UTF-16 surrogate pair by Utf8_To_Utf16().
#define START_POINT_FOR_SURROGATE 0x10000
76
77
/* We use a 128-point block in the 16-bit BMP plane to encode non-UTF-8 bytes (Escapes).
   We can also use an additional HIGH-PLANE (21-bit points above 0x1f0000)
   to simplify the internal intermediate conversion in Linux:
     RAW-UTF-8 <-> internal wchar_t utf-16 strings <-> RAW-UTF-8
*/
83
84
// UTF_ESCAPE_PLANE is the offset added to UTF_ESCAPE_BASE when an escape point
// is produced. Both configurations below currently select plane 0 (the BMP).
#if defined(_WCHART_IS_16BIT)

#define UTF_ESCAPE_PLANE 0

#else

/*
  we can place 128 ESCAPE chars to
     ef 80 -    ee be 80 (3-bytes utf-8) : similar to WSL
     ef ff -    ee bf bf

  1f ef 80 - f7 be be 80 (4-bytes utf-8) : last 4-bytes utf-8 plane (out of Unicode)
  1f ef ff - f7 be bf bf (4-bytes utf-8) : last 4-bytes utf-8 plane (out of Unicode)
*/

// #define UTF_ESCAPE_PLANE_HIGH  (0x1f << 16)
// #define UTF_ESCAPE_PLANE        UTF_ESCAPE_PLANE_HIGH
#define UTF_ESCAPE_PLANE 0

/*
  if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is set)
  {
    if (UTF_ESCAPE_PLANE is UTF_ESCAPE_PLANE_HIGH)
    {
      we can restore any 8-bit Escape from ESCAPE-PLANE-21 plane.
      But ESCAPE-PLANE-21 point cannot be stored to utf-16 (7z archive)
      So we still need a way to extract 8-bit Escapes and BMP-Escapes-8
      from same BMP-Escapes-16 stored in 7z.
      And if we want to restore any 8-bit from 7z archive,
      we still must use UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT for (utf-8 -> utf-16)
      Also we need additional Conversions to transform from utf-16 to utf-16-With-Escapes-21
    }
    else (UTF_ESCAPE_PLANE == 0)
    {
      we must convert original 3-bytes utf-8 BMP-Escape point to sequence
      of 3 BMP-Escape-16 points with UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
      so we can extract original RAW-UTF-8 from UTF-16 later.
    }
  }
*/

#endif
127
128
129
// Base of the escape area: an invalid byte (c) is stored as UTF_ESCAPE_BASE + (c).
#define UTF_ESCAPE_BASE 0xef00


#ifdef UTF_ESCAPE_BASE
// True if (v) lies in the 128-point escape block
// [(plane) + UTF_ESCAPE_BASE + 0x80, (plane) + UTF_ESCAPE_BASE + 0xff].
#define IS_ESCAPE_POINT(v, plane) (((v) & (UInt32)0xffffff80) == (plane) + UTF_ESCAPE_BASE + 0x80)
#endif

// True if (v) is any surrogate code unit (0xd800 - 0xdfff).
#define IS_SURROGATE_POINT(v) (((v) & (UInt32)0xfffff800) == 0xd800)
// True if (v) is a low (trailing) surrogate (0xdc00 - 0xdfff).
#define IS_LOW_SURROGATE_POINT(v) (((v) & (UInt32)0xfffffC00) == 0xdc00)
139
140
// Error exit used inside the scanning loop of CUtf8Check::Check_Buf():
// sets the NonUtf flag and proceeds with the next loop iteration.
// Must be expanded inside a loop, in a scope where (NonUtf) is accessible.
#define _ERROR_UTF8_CHECK \
    { NonUtf = true; continue; }
143
Check_Buf(const char * src,size_t size)144 void CUtf8Check::Check_Buf(const char *src, size_t size) throw()
145 {
146 Clear();
147 // Byte maxByte = 0;
148
149 for (;;)
150 {
151 if (size == 0)
152 break;
153
154 const Byte c = (Byte)(*src++);
155 size--;
156
157 if (c == 0)
158 {
159 ZeroChar = true;
160 continue;
161 }
162
163 /*
164 if (c > maxByte)
165 maxByte = c;
166 */
167
168 if (c < 0x80)
169 continue;
170
171 if (c < 0xc0 + 2)// it's limit for 0x140000 unicode codes : win32 compatibility
172 _ERROR_UTF8_CHECK
173
174 unsigned numBytes;
175
176 UInt32 val = c;
177 _UTF8_HEAD_PARSE2(1)
178 else _UTF8_HEAD_PARSE2(2)
179 else _UTF8_HEAD_PARSE2(4)
180 else _UTF8_HEAD_PARSE2(5)
181 else
182 {
183 _ERROR_UTF8_CHECK
184 }
185
186 unsigned pos = 0;
187 do
188 {
189 if (pos == size)
190 break;
191 unsigned c2 = (Byte)src[pos];
192 c2 -= 0x80;
193 if (c2 >= 0x40)
194 break;
195 val <<= 6;
196 val |= c2;
197 if (pos == 0)
198 if (val < (((unsigned)1 << 7) >> numBytes))
199 break;
200 pos++;
201 }
202 while (--numBytes);
203
204 if (numBytes != 0)
205 {
206 if (pos == size)
207 Truncated = true;
208 else
209 _ERROR_UTF8_CHECK
210 }
211
212 #ifdef UTF_ESCAPE_BASE
213 if (IS_ESCAPE_POINT(val, 0))
214 Escape = true;
215 #endif
216
217 if (MaxHighPoint < val)
218 MaxHighPoint = val;
219
220 if (IS_SURROGATE_POINT(val))
221 SingleSurrogate = true;
222
223 src += pos;
224 size -= pos;
225 }
226
227 // MaxByte = maxByte;
228 }
229
Check_UTF8_Buf(const char * src,size_t size,bool allowReduced)230 bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw()
231 {
232 CUtf8Check check;
233 check.Check_Buf(src, size);
234 return check.IsOK(allowReduced);
235 }
236
237 /*
238 bool CheckUTF8_chars(const char *src, bool allowReduced) throw()
239 {
240 CUtf8Check check;
241 check.CheckBuf(src, strlen(src));
242 return check.IsOK(allowReduced);
243 }
244 */
245
CheckUTF8_AString(const AString & s)246 bool CheckUTF8_AString(const AString &s) throw()
247 {
248 CUtf8Check check;
249 check.Check_AString(s);
250 return check.IsOK();
251 }
252
253
254 /*
255 bool CheckUTF8(const char *src, bool allowReduced) throw()
256 {
257 // return Check_UTF8_Buf(src, strlen(src), allowReduced);
258
259 for (;;)
260 {
261 const Byte c = (Byte)(*src++);
262 if (c == 0)
263 return true;
264
265 if (c < 0x80)
266 continue;
267 if (c < 0xC0 + 2 || c >= 0xf5)
268 return false;
269
270 unsigned numBytes;
271 _UTF8_HEAD_PARSE
272 else
273 return false;
274
275 unsigned pos = 0;
276
277 do
278 {
279 Byte c2 = (Byte)(*src++);
280 if (c2 < 0x80 || c2 >= 0xC0)
281 return allowReduced && c2 == 0;
282 val <<= 6;
283 val |= (c2 - 0x80);
284 pos++;
285 }
286 while (--numBytes);
287
288 if (val < _UTF8_RANGE(pos - 1))
289 return false;
290
291 if (val >= 0x110000)
292 return false;
293 }
294 }
295 */
296
297 // in case of UTF-8 error we have two ways:
298 // 21.01- : old : 0xfffd: REPLACEMENT CHARACTER : old version
299 // 21.02+ : new : 0xef00 + (c) : similar to WSL scheme for low symbols
300
// Point that replaces invalid input when escapes are not used.
#define UTF_REPLACEMENT_CHAR 0xfffd



// Point emitted for the invalid source byte (c):
//   escape point (UTF_ESCAPE_PLANE + UTF_ESCAPE_BASE + c), if UTF_FLAG__FROM_UTF8__USE_ESCAPE is set in (flags),
//   UTF_REPLACEMENT_CHAR otherwise.
// Requires (flags) at the expansion site.
#define UTF_ESCAPE(c) \
   ((flags & UTF_FLAG__FROM_UTF8__USE_ESCAPE) ? \
    UTF_ESCAPE_PLANE + UTF_ESCAPE_BASE + (c) : UTF_REPLACEMENT_CHAR)

/*
#define _HARD_ERROR_UTF8
  { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
    destPos++; ok = false; continue; }
*/

// we ignore utf errors, and don't change (ok) variable!

// Error exit used inside the loop of Utf8_To_Utf16(): stores (or only counts, if (dest) is NULL)
// one UTF_ESCAPE(c) unit and proceeds with the next loop iteration.
// Requires (dest), (destPos), (c) and (flags) at the expansion site.
#define _ERROR_UTF8 \
  { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
    destPos++; continue; }
320
321 // we store UTF-16 in wchar_t strings. So we use surrogates for big unicode points:
322
// for debug purposes only we can store UTF-32 in wchar_t:
324 // #define START_POINT_FOR_SURROGATE ((UInt32)0 - 1)
325
326
/*
  WIN32 MultiByteToWideChar(CP_UTF8) emits 0xfffd point, if utf-8 error was found.
  And it can emit single 0xfffd from 2 src bytes.
  It doesn't emit single 0xfffd from 3-4 src bytes.
  We can
    1) emit Escape point for each incorrect byte. So we can recover the data later
    2) emit 0xfffd for each incorrect byte.
       That scheme is similar to Escape scheme, but we emit 0xfffd
       instead of each Escape point.
    3) emit single 0xfffd from 1-2 incorrect bytes, as WIN32 MultiByteToWideChar scheme
*/
338
/*
  Converts the bytes [src, srcLim) from UTF-8 to UTF-16 code units.
    dest    : output buffer, or NULL to only count the units that would be written
    destLen : receives the number of produced (or counted) units
    flags   : UTF_FLAG__FROM_UTF8__* bits tested below
  Returns (ok). No statement in this function changes (ok), so the result is always true:
  invalid input is not reported, it is replaced via _ERROR_UTF8 instead.
*/
static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim, unsigned flags) throw()
{
  size_t destPos = 0;
  bool ok = true;

  for (;;)
  {
    if (src == srcLim)
    {
      *destLen = destPos;
      return ok;
    }

    const Byte c = (Byte)(*src++);

    // ASCII byte: copied as is
    if (c < 0x80)
    {
      if (dest)
        dest[destPos] = (wchar_t)c;
      destPos++;
      continue;
    }

    // tail bytes, overlong 2-byte heads (0xc0, 0xc1) and too large heads are errors
    if (c < 0xc0 + 2
        || c >= 0xf5) // it's limit for 0x140000 unicode codes : win32 compatibility
    {
      _ERROR_UTF8
    }

    unsigned numBytes;

    // declares (val) and sets (numBytes) to 1, 2 or 3 tail bytes
    _UTF8_HEAD_PARSE_MAX_3_BYTES

    // (pos) counts the accepted tail bytes; (src) itself is not moved inside this loop
    unsigned pos = 0;
    do
    {
      if (src + pos == srcLim)
        break;
      unsigned c2 = (Byte)src[pos];
      c2 -= 0x80;
      if (c2 >= 0x40)
        break; // not a tail byte (0x80 - 0xbf)
      val <<= 6;
      val |= c2;
      pos++;
      if (pos == 1)
      {
        // checks that are possible right after the first tail byte:
        // overlong (non-shortest) form
        if (val < (((unsigned)1 << 7) >> numBytes))
          break;
        if (numBytes == 2)
        {
          // 3-byte sequence that would decode to a surrogate point (0xd800 - 0xdfff)
          if (flags & UTF_FLAG__FROM_UTF8__SURROGATE_ERROR)
            if ((val & (0xF800 >> 6)) == (0xd800 >> 6))
              break;
        }
        // 4-byte sequence that would decode to a point >= 0x110000
        else if (numBytes == 3 && val >= (0x110000 >> 12))
          break;
      }
    }
    while (--numBytes);

    if (numBytes != 0)
    {
      // the sequence was not completed
      if ((flags & UTF_FLAG__FROM_UTF8__USE_ESCAPE) == 0)
      {
        // the following code to emit the 0xfffd chars as win32 Utf8 function.
        // disable the following line, if you need 0xfffd for each incorrect byte as in Escape mode
        src += pos;
      }
      _ERROR_UTF8
    }

    /*
    if (val < _UTF8_RANGE(pos - 1))
      _ERROR_UTF8
    */

    #ifdef UTF_ESCAPE_BASE

      if ((flags & UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT)
          && IS_ESCAPE_POINT(val, 0))
      {
        // We will emit 3 utf16-Escape-16-21 points from one Escape-16 point (3 bytes)
        // (src) was not advanced, so the tail bytes are processed again one by one.
        _ERROR_UTF8
      }

    #endif

    /*
       We don't expect virtual Escape-21 points in UTF-8 stream.
       And we don't check for Escape-21.
       So utf8-Escape-21 will be converted to another 3 utf16-Escape-21 points.
       Maybe we could convert virtual utf8-Escape-21 to one utf16-Escape-21 point in some cases?
    */

    if (val < START_POINT_FOR_SURROGATE)
    {
      /*
      if ((flags & UTF_FLAG__FROM_UTF8__SURROGATE_ERROR)
          && IS_SURROGATE_POINT(val))
      {
        // We will emit 3 utf16-Escape-16-21 points from one Surrogate-16 point (3 bytes)
        _ERROR_UTF8
      }
      */
      if (dest)
        dest[destPos] = (wchar_t)val;
      destPos++;
    }
    else
    {
      /*
      if (val >= 0x110000)
      {
        // We will emit utf16-Escape-16-21 point from each source byte
        _ERROR_UTF8
      }
      */
      // the point needs a surrogate pair: high unit first, then low unit
      if (dest)
      {
        dest[destPos + 0] = (wchar_t)(0xd800 - (0x10000 >> 10) + (val >> 10));
        dest[destPos + 1] = (wchar_t)(0xdc00 + (val & 0x3ff));
      }
      destPos += 2;
    }
    // the sequence was accepted: skip its tail bytes
    src += pos;
  }
}
467
468
469
// Head byte of a sequence with (n) tail bytes for the point (val):
// the head base _UTF8_START(n) plus the bits of (val) above the (n) 6-bit tail groups.
// (val) is parenthesized so that expression arguments are shifted as a whole.
#define _UTF8_HEAD(n, val) ((char)(_UTF8_START(n) + ((val) >> (6 * (n)))))
// Tail byte number (n) of the point (val), where tail 0 holds the lowest 6 bits.
#define _UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))
472
Utf16_To_Utf8_Calc(const wchar_t * src,const wchar_t * srcLim,unsigned flags)473 static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim, unsigned flags)
474 {
475 size_t size = (size_t)(srcLim - src);
476 for (;;)
477 {
478 if (src == srcLim)
479 return size;
480
481 UInt32 val = (UInt32)(*src++);
482
483 if (val < 0x80)
484 continue;
485
486 if (val < _UTF8_RANGE(1))
487 {
488 size++;
489 continue;
490 }
491
492 #ifdef UTF_ESCAPE_BASE
493
494 #if UTF_ESCAPE_PLANE != 0
495 if (flags & UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE)
496 if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
497 continue;
498 #endif
499
500 if (flags & UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE)
501 if (IS_ESCAPE_POINT(val, 0))
502 continue;
503
504 #endif
505
506 if (IS_SURROGATE_POINT(val))
507 {
508 // it's hack to UTF-8 encoding
509
510 if (val < 0xdc00 && src != srcLim)
511 {
512 const UInt32 c2 = (UInt32)*src;
513 if (c2 >= 0xdc00 && c2 < 0xe000)
514 src++;
515 }
516 size += 2;
517 continue;
518 }
519
520 #ifdef _WCHART_IS_16BIT
521
522 size += 2;
523
524 #else
525
526 if (val < _UTF8_RANGE(2)) size += 2;
527 else if (val < _UTF8_RANGE(3)) size += 3;
528 else if (val < _UTF8_RANGE(4)) size += 4;
529 else if (val < _UTF8_RANGE(5)) size += 5;
530 else
531 #if _UTF8_NUM_TAIL_BYTES_MAX >= 6
532 size += 6;
533 #else
534 size += 3;
535 #endif
536
537 #endif
538 }
539 }
540
541
Utf16_To_Utf8(char * dest,const wchar_t * src,const wchar_t * srcLim,unsigned flags)542 static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim, unsigned flags)
543 {
544 for (;;)
545 {
546 if (src == srcLim)
547 return dest;
548
549 UInt32 val = (UInt32)*src++;
550
551 if (val < 0x80)
552 {
553 *dest++ = (char)val;
554 continue;
555 }
556
557 if (val < _UTF8_RANGE(1))
558 {
559 dest[0] = _UTF8_HEAD(1, val);
560 dest[1] = _UTF8_CHAR(0, val);
561 dest += 2;
562 continue;
563 }
564
565 #ifdef UTF_ESCAPE_BASE
566
567 #if UTF_ESCAPE_PLANE != 0
568 /*
569 if (wchar_t is 32-bit)
570 && (UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE is set)
571 && (point is virtual escape plane)
572 we extract 8-bit byte from virtual HIGH-ESCAPE PLANE.
573 */
574 if (flags & UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE)
575 if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
576 {
577 *dest++ = (char)(val);
578 continue;
579 }
580 #endif // UTF_ESCAPE_PLANE != 0
581
582 /* if (UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE is defined)
583 we extract 8-bit byte from BMP-ESCAPE PLANE. */
584
585 if (flags & UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE)
586 if (IS_ESCAPE_POINT(val, 0))
587 {
588 *dest++ = (char)(val);
589 continue;
590 }
591
592 #endif // UTF_ESCAPE_BASE
593
594 if (IS_SURROGATE_POINT(val))
595 {
596 // it's hack to UTF-8 encoding
597 if (val < 0xdc00 && src != srcLim)
598 {
599 const UInt32 c2 = (UInt32)*src;
600 if (IS_LOW_SURROGATE_POINT(c2))
601 {
602 src++;
603 val = (((val - 0xd800) << 10) | (c2 - 0xdc00)) + 0x10000;
604 dest[0] = _UTF8_HEAD(3, val);
605 dest[1] = _UTF8_CHAR(2, val);
606 dest[2] = _UTF8_CHAR(1, val);
607 dest[3] = _UTF8_CHAR(0, val);
608 dest += 4;
609 continue;
610 }
611 }
612 if (flags & UTF_FLAG__TO_UTF8__SURROGATE_ERROR)
613 val = UTF_REPLACEMENT_CHAR; // WIN32 function does it
614 }
615
616 #ifndef _WCHART_IS_16BIT
617 if (val < _UTF8_RANGE(2))
618 #endif
619 {
620 dest[0] = _UTF8_HEAD(2, val);
621 dest[1] = _UTF8_CHAR(1, val);
622 dest[2] = _UTF8_CHAR(0, val);
623 dest += 3;
624 continue;
625 }
626
627 #ifndef _WCHART_IS_16BIT
628
629 // we don't expect this case. so we can throw exception
630 // throw 20210407;
631
632 char b;
633 unsigned numBits;
634 if (val < _UTF8_RANGE(3)) { numBits = 6 * 3; b = _UTF8_HEAD(3, val); }
635 else if (val < _UTF8_RANGE(4)) { numBits = 6 * 4; b = _UTF8_HEAD(4, val); }
636 else if (val < _UTF8_RANGE(5)) { numBits = 6 * 5; b = _UTF8_HEAD(5, val); }
637 #if _UTF8_NUM_TAIL_BYTES_MAX >= 6
638 else { numBits = 6 * 6; b = (char)_UTF8_START(6); }
639 #else
640 else
641 {
642 val = UTF_REPLACEMENT_CHAR;
643 { numBits = 6 * 3; b = _UTF8_HEAD(3, val); }
644 }
645 #endif
646
647 *dest++ = b;
648
649 do
650 {
651 numBits -= 6;
652 *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F));
653 }
654 while (numBits != 0);
655
656 #endif
657 }
658 }
659
Convert_UTF8_Buf_To_Unicode(const char * src,size_t srcSize,UString & dest,unsigned flags)660 bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags)
661 {
662 dest.Empty();
663 size_t destLen = 0;
664 Utf8_To_Utf16(NULL, &destLen, src, src + srcSize, flags);
665 bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src + srcSize, flags);
666 dest.ReleaseBuf_SetEnd((unsigned)destLen);
667 return res;
668 }
669
ConvertUTF8ToUnicode_Flags(const AString & src,UString & dest,unsigned flags)670 bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags)
671 {
672 return Convert_UTF8_Buf_To_Unicode(src, src.Len(), dest, flags);
673 }
674
675
// Flags that ConvertUTF8ToUnicode() passes to the converter.
// USE_ESCAPE is always included; SURROGATE_ERROR and BMP_ESCAPE_CONVERT
// are added only under the preprocessor conditions below.
static
unsigned g_UTF8_To_Unicode_Flags =
    UTF_FLAG__FROM_UTF8__USE_ESCAPE
  #ifndef _WCHART_IS_16BIT
    | UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
  #ifdef _UTF8_RAW_NON_UTF8_SUPPORTED
    | UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
  #endif
  #endif
    ;
686
687
688 /*
689 bool ConvertUTF8ToUnicode_boolRes(const AString &src, UString &dest)
690 {
691 return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags);
692 }
693 */
694
ConvertUTF8ToUnicode(const AString & src,UString & dest)695 bool ConvertUTF8ToUnicode(const AString &src, UString &dest)
696 {
697 return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags);
698 }
699
700 void Print_UString(const UString &a);
701
ConvertUnicodeToUTF8_Flags(const UString & src,AString & dest,unsigned flags)702 void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags)
703 {
704 /*
705 if (src.Len()== 24)
706 throw "202104";
707 */
708 dest.Empty();
709 const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags);
710 char *destStart = dest.GetBuf((unsigned)destLen);
711 const char *destEnd = Utf16_To_Utf8(destStart, src, src.Ptr(src.Len()), flags);
712 dest.ReleaseBuf_SetEnd((unsigned)destLen);
713 // printf("\nlen = %d\n", src.Len());
714 if (destLen != (size_t)(destEnd - destStart))
715 {
716 /*
717 // dest.ReleaseBuf_SetEnd((unsigned)(destEnd - destStart));
718 printf("\nlen = %d\n", (unsigned)destLen);
719 printf("\n(destEnd - destStart) = %d\n", (unsigned)(destEnd - destStart));
720 printf("\n");
721 // Print_UString(src);
722 printf("\n");
723 // printf("\nlen = %d\n", destLen);
724 */
725 throw 20210406;
726 }
727 }
728
729
730
731 unsigned g_Unicode_To_UTF8_Flags =
732 // UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE
733 0
734 #ifndef _WIN32
735 #ifdef _UTF8_RAW_NON_UTF8_SUPPORTED
736 | UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE
737 #else
738 | UTF_FLAG__TO_UTF8__SURROGATE_ERROR;
739 #endif
740 #endif
741 ;
742
ConvertUnicodeToUTF8(const UString & src,AString & dest)743 void ConvertUnicodeToUTF8(const UString &src, AString &dest)
744 {
745 ConvertUnicodeToUTF8_Flags(src, dest, g_Unicode_To_UTF8_Flags);
746 }
747
Convert_Unicode_To_UTF8_Buf(const UString & src,CByteBuffer & dest)748 void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest)
749 {
750 const unsigned flags = g_Unicode_To_UTF8_Flags;
751 dest.Free();
752 const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags);
753 dest.Alloc(destLen);
754 const char *destEnd = Utf16_To_Utf8((char *)(void *)(Byte *)dest, src, src.Ptr(src.Len()), flags);
755 if (destLen != (size_t)(destEnd - (char *)(void *)(Byte *)dest))
756 throw 202104;
757 }
758
759 /*
760
761 #ifndef _WIN32
762 void Convert_UTF16_To_UTF32(const UString &src, UString &dest)
763 {
764 dest.Empty();
765 for (size_t i = 0; i < src.Len();)
766 {
767 wchar_t c = src[i++];
768 if (c >= 0xd800 && c < 0xdc00 && i < src.Len())
769 {
770 const wchar_t c2 = src[i];
771 if (c2 >= 0xdc00 && c2 < 0x10000)
772 {
773 // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
774 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
775 // printf("%4x\n", (int)c);
776 i++;
777 }
778 }
779 dest += c;
780 }
781 }
782
783 void Convert_UTF32_To_UTF16(const UString &src, UString &dest)
784 {
785 dest.Empty();
786 for (size_t i = 0; i < src.Len();)
787 {
788 wchar_t w = src[i++];
789 if (w >= 0x10000 && w < 0x110000)
790 {
791 w -= 0x10000;
792 dest += (wchar_t)((unsigned)0xd800 + (((unsigned)w >> 10) & 0x3ff));
793 w = 0xdc00 + (w & 0x3ff);
794 }
795 dest += w;
796 }
797 }
798
799 bool UTF32_IsThere_BigPoint(const UString &src)
800 {
801 for (size_t i = 0; i < src.Len();)
802 {
803 const UInt32 c = (UInt32)src[i++];
804 if (c >= 0x110000)
805 return true;
806 }
807 return false;
808 }
809
810 bool Unicode_IsThere_BmpEscape(const UString &src)
811 {
812 for (size_t i = 0; i < src.Len();)
813 {
814 const UInt32 c = (UInt32)src[i++];
815 if (IS_ESCAPE_POINT(c, 0))
816 return true;
817 }
818 return false;
819 }
820
821
822 #endif
823
824 bool Unicode_IsThere_Utf16SurrogateError(const UString &src)
825 {
826 for (size_t i = 0; i < src.Len();)
827 {
828 const UInt32 val = (UInt32)src[i++];
829 if (IS_SURROGATE_POINT(val))
830 {
831 // it's hack to UTF-8 encoding
832 if (val >= 0xdc00 || i == src.Len())
833 return true;
834 const UInt32 c2 = (UInt32)*src;
835 if (!IS_LOW_SURROGATE_POINT(c2))
836 return true;
837 }
838 }
839 return false;
840 }
841 */
842
#ifndef _WCHART_IS_16BIT

// Moves each BMP escape point of the string into the plane UTF_ESCAPE_PLANE (in place).
// The function exists only if _WCHART_IS_16BIT is not defined.
// If UTF_ESCAPE_PLANE == 0, the body is empty and the string is left unchanged.
void Convert_UnicodeEsc16_To_UnicodeEscHigh
#if UTF_ESCAPE_PLANE == 0
    (UString &) {}
#else
    (UString &s)
{
  const unsigned len = s.Len();
  for (unsigned i = 0; i < len; i++)
  {
    wchar_t c = s[i];
    if (IS_ESCAPE_POINT(c, 0))
    {
      // same offset inside the escape block, but in the high plane
      c += UTF_ESCAPE_PLANE;
      s.ReplaceOneCharAtPos(i, c);
    }
  }
}
#endif
#endif
864