• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // UTFConvert.cpp
2 
3 #include "StdAfx.h"
4 
5 // #include <stdio.h>
6 
7 #include "MyTypes.h"
8 #include "UTFConvert.h"
9 
10 
// _UTF8_RAW_NON_UTF8_SUPPORTED is defined only if _WCHART_IS_16BIT and
// __APPLE__ are both undefined.
#if !defined(_WCHART_IS_16BIT) && !defined(__APPLE__)
  // we define it if the system supports files with non-utf8 symbols:
  #define _UTF8_RAW_NON_UTF8_SUPPORTED
#endif
17 
18 /*
19   _UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte
20 
21   n : _UTF8_START(n) : Bits of code point
22 
23   0 : 0x80 :    : unused
24   1 : 0xC0 : 11 :
25   2 : 0xE0 : 16 : Basic Multilingual Plane
26   3 : 0xF0 : 21 : Unicode space
27   4 : 0xF8 : 26 :
28   5 : 0xFC : 31 : UCS-4 : wcstombs() in ubuntu is limited to that value
29   6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value
30   7 : 0xFF :
31 */
32 
// Base value of the start (head) byte that is followed by (n) tail bytes:
// the (n + 1) highest bits of the byte are set. Valid for (n) in [0, 7].
#define _UTF8_START(n) (0x100 - (0x80 >> (n)))

// One link of an "if / else if" chain that classifies the head byte (c):
// if (c) is below the base of heads with (n + 1) tail bytes, it is taken as
// a head with (n) tail bytes, and the head base is removed from (val).
// It expects (c), (val) and (numBytes) in scope.
#define _UTF8_HEAD_PARSE2(n) \
    if (c < _UTF8_START((n) + 1)) \
    { \
      numBytes = (n); \
      val -= _UTF8_START(n); \
    }
38 
#ifndef _WCHART_IS_16BIT

/*
   if (wchar_t is 32-bit), we can support large points in long UTF-8 sequence,
   when we convert wchar_t strings to UTF-8:
     (_UTF8_NUM_TAIL_BYTES_MAX == 3) : (21-bits points) - Unicode
     (_UTF8_NUM_TAIL_BYTES_MAX == 5) : (31-bits points) - UCS-4
     (_UTF8_NUM_TAIL_BYTES_MAX == 6) : (36-bit hack)
*/

// The value is defined only if _WCHART_IS_16BIT is not defined.
// The code in this file tests only for (_UTF8_NUM_TAIL_BYTES_MAX >= 6).
#define _UTF8_NUM_TAIL_BYTES_MAX 5
#endif
51 
52 /*
53 #define _UTF8_HEAD_PARSE \
54     UInt32 val = c; \
55          _UTF8_HEAD_PARSE2(1) \
56     else _UTF8_HEAD_PARSE2(2) \
57     else _UTF8_HEAD_PARSE2(3) \
58     else _UTF8_HEAD_PARSE2(4) \
59     else _UTF8_HEAD_PARSE2(5) \
60   #if _UTF8_NUM_TAIL_BYTES_MAX >= 6
61     else _UTF8_HEAD_PARSE2(6)
62   #endif
63 */
64 
// Head byte parsing for sequences that are limited to 3 tail bytes.
// It declares (val) from the head byte (c) and sets (numBytes).
// Any head that is not a head with 1 or 2 tail bytes is taken as a head with
// 3 tail bytes, so the range of (c) must be checked before this macro.
#define _UTF8_HEAD_PARSE_MAX_3_BYTES \
    UInt32 val = c; \
         _UTF8_HEAD_PARSE2(1) \
    else _UTF8_HEAD_PARSE2(2) \
    else \
    { \
      numBytes = 3; \
      val -= _UTF8_START(3); \
    }


// First point that does not fit into a sequence with (n) tail bytes:
// such a sequence holds (n * 5 + 6) bits of the point.
#define _UTF8_RANGE(n) ((UInt32)1 << (5 * (n) + 6))


// Utf8_To_Utf16() writes points from this value as two surrogate units.
#define START_POINT_FOR_SURROGATE 0x10000
76 
77 
/* we use 128 bytes block in 16-bit BMP-PLANE to encode non-UTF-8 Escapes
   Also we can use additional HIGH-PLANE (we use 21-bit points above 0x1f0000)
   to simplify internal intermediate conversion in Linux:
   RAW-UTF-8 <-> internal wchar_t utf-16 strings <-> RAW-UTF-8
*/
83 
84 
// UTF_ESCAPE_PLANE is the offset that is added to UTF_ESCAPE_BASE for escape points.
// Both branches define it as 0 now; the HIGH plane variant is kept disabled below.
#if defined(_WCHART_IS_16BIT)

#define UTF_ESCAPE_PLANE 0

#else

/*
we can place 128 ESCAPE chars to
   ef 80 -    ee be 80 (3-bytes utf-8) : similar to WSL
   ef ff -    ee bf bf

1f ef 80 - f7 be be 80 (4-bytes utf-8) : last  4-bytes utf-8 plane (out of Unicode)
1f ef ff - f7 be bf bf (4-bytes utf-8) : last  4-bytes utf-8 plane (out of Unicode)
*/

// #define UTF_ESCAPE_PLANE_HIGH  (0x1f << 16)
// #define UTF_ESCAPE_PLANE        UTF_ESCAPE_PLANE_HIGH
#define UTF_ESCAPE_PLANE 0

/*
  if (UTF_FLAG__FROM_UTF8__USE_ESCAPE is set)
  {
    if (UTF_ESCAPE_PLANE is UTF_ESCAPE_PLANE_HIGH)
    {
      we can restore any 8-bit Escape from ESCAPE-PLANE-21 plane.
      But ESCAPE-PLANE-21 point cannot be stored to utf-16 (7z archive)
      So we still need a way to extract 8-bit Escapes and BMP-Escapes-8
      from same BMP-Escapes-16 stored in 7z.
      And if we want to restore any 8-bit from 7z archive,
      we still must use UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT for (utf-8 -> utf-16)
      Also we need additional Conversions to transform from utf-16 to utf-16-With-Escapes-21
    }
    else (UTF_ESCAPE_PLANE == 0)
    {
      we must convert original 3-bytes utf-8 BMP-Escape point to sequence
      of 3 BMP-Escape-16 points with UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
      so we can extract original RAW-UTF-8 from UTF-16 later.
    }
  }
*/

#endif
127 
128 
129 
// Base of the escape points: UTF_ESCAPE(c) maps the byte (c) to the point
// (UTF_ESCAPE_PLANE + UTF_ESCAPE_BASE + c).
#define UTF_ESCAPE_BASE 0xef00


#ifdef UTF_ESCAPE_BASE
// true, if (v) is in the 128-point block [base + 0x80, base + 0xff],
// where base = (plane) + UTF_ESCAPE_BASE
#define IS_ESCAPE_POINT(v, plane) (((v) & (UInt32)0xffffff80) == (plane) + UTF_ESCAPE_BASE + 0x80)
#endif

// true, if (v) is in [0xd800, 0xdfff]
#define IS_SURROGATE_POINT(v)     (((v) & (UInt32)0xfffff800) == 0xd800)
// true, if (v) is in [0xdc00, 0xdfff]
#define IS_LOW_SURROGATE_POINT(v) (((v) & (UInt32)0xfffffC00) == 0xdc00)
139 
140 
// Sets the (NonUtf) flag and continues the enclosing loop.
// It can be used only inside a loop, where (NonUtf) is in scope.
#define _ERROR_UTF8_CHECK \
  { NonUtf = true; continue; }
143 
Check_Buf(const char * src,size_t size)144 void CUtf8Check::Check_Buf(const char *src, size_t size) throw()
145 {
146   Clear();
147   // Byte maxByte = 0;
148 
149   for (;;)
150   {
151     if (size == 0)
152       break;
153 
154     const Byte c = (Byte)(*src++);
155     size--;
156 
157     if (c == 0)
158     {
159       ZeroChar = true;
160       continue;
161     }
162 
163     /*
164     if (c > maxByte)
165       maxByte = c;
166     */
167 
168     if (c < 0x80)
169       continue;
170 
171     if (c < 0xc0 + 2)// it's limit for 0x140000 unicode codes : win32 compatibility
172       _ERROR_UTF8_CHECK
173 
174     unsigned numBytes;
175 
176     UInt32 val = c;
177          _UTF8_HEAD_PARSE2(1)
178     else _UTF8_HEAD_PARSE2(2)
179     else _UTF8_HEAD_PARSE2(4)
180     else _UTF8_HEAD_PARSE2(5)
181     else
182     {
183       _ERROR_UTF8_CHECK
184     }
185 
186     unsigned pos = 0;
187     do
188     {
189       if (pos == size)
190         break;
191       unsigned c2 = (Byte)src[pos];
192       c2 -= 0x80;
193       if (c2 >= 0x40)
194         break;
195       val <<= 6;
196       val |= c2;
197       if (pos == 0)
198         if (val < (((unsigned)1 << 7) >> numBytes))
199           break;
200       pos++;
201     }
202     while (--numBytes);
203 
204     if (numBytes != 0)
205     {
206       if (pos == size)
207         Truncated = true;
208       else
209         _ERROR_UTF8_CHECK
210     }
211 
212     #ifdef UTF_ESCAPE_BASE
213       if (IS_ESCAPE_POINT(val, 0))
214         Escape = true;
215     #endif
216 
217     if (MaxHighPoint < val)
218       MaxHighPoint = val;
219 
220     if (IS_SURROGATE_POINT(val))
221       SingleSurrogate = true;
222 
223     src += pos;
224     size -= pos;
225   }
226 
227   // MaxByte = maxByte;
228 }
229 
Check_UTF8_Buf(const char * src,size_t size,bool allowReduced)230 bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw()
231 {
232   CUtf8Check check;
233   check.Check_Buf(src, size);
234   return check.IsOK(allowReduced);
235 }
236 
237 /*
238 bool CheckUTF8_chars(const char *src, bool allowReduced) throw()
239 {
240   CUtf8Check check;
241   check.CheckBuf(src, strlen(src));
242   return check.IsOK(allowReduced);
243 }
244 */
245 
CheckUTF8_AString(const AString & s)246 bool CheckUTF8_AString(const AString &s) throw()
247 {
248   CUtf8Check check;
249   check.Check_AString(s);
250   return check.IsOK();
251 }
252 
253 
254 /*
255 bool CheckUTF8(const char *src, bool allowReduced) throw()
256 {
257   // return Check_UTF8_Buf(src, strlen(src), allowReduced);
258 
259   for (;;)
260   {
261     const Byte c = (Byte)(*src++);
262     if (c == 0)
263       return true;
264 
265     if (c < 0x80)
266       continue;
267     if (c < 0xC0 + 2 || c >= 0xf5)
268       return false;
269 
270     unsigned numBytes;
271     _UTF8_HEAD_PARSE
272     else
273       return false;
274 
275     unsigned pos = 0;
276 
277     do
278     {
279       Byte c2 = (Byte)(*src++);
280       if (c2 < 0x80 || c2 >= 0xC0)
281         return allowReduced && c2 == 0;
282       val <<= 6;
283       val |= (c2 - 0x80);
284       pos++;
285     }
286     while (--numBytes);
287 
288     if (val < _UTF8_RANGE(pos - 1))
289       return false;
290 
291     if (val >= 0x110000)
292       return false;
293   }
294 }
295 */
296 
297 // in case of UTF-8 error we have two ways:
298 // 21.01- : old : 0xfffd: REPLACEMENT CHARACTER : old version
299 // 21.02+ : new : 0xef00 + (c) : similar to WSL scheme for low symbols
300 
#define UTF_REPLACEMENT_CHAR  0xfffd



// Point that is emitted for the incorrect source byte (c):
//   escape point (UTF_ESCAPE_PLANE + UTF_ESCAPE_BASE + c), if UTF_FLAG__FROM_UTF8__USE_ESCAPE is set in (flags),
//   UTF_REPLACEMENT_CHAR otherwise.
// It expects (flags) in scope.
#define UTF_ESCAPE(c) \
   ((flags & UTF_FLAG__FROM_UTF8__USE_ESCAPE) ? \
    UTF_ESCAPE_PLANE + UTF_ESCAPE_BASE + (c) : UTF_REPLACEMENT_CHAR)
308 
309 /*
310 #define _HARD_ERROR_UTF8
311   { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
312     destPos++; ok = false; continue; }
313 */
314 
315 // we ignore utf errors, and don't change (ok) variable!
316 
// Emits one UTF_ESCAPE(c) point for the byte (c) and continues the enclosing loop.
// If (dest) is NULL, the point is only counted in (destPos).
// It expects (dest), (destPos), (c) and (flags) in scope.
#define _ERROR_UTF8 \
  { if (dest) dest[destPos] = (wchar_t)UTF_ESCAPE(c); \
    destPos++; continue; }
320 
321 // we store UTF-16 in wchar_t strings. So we use surrogates for big unicode points:
322 
// for debug purposes only we can store UTF-32 in wchar_t:
324 // #define START_POINT_FOR_SURROGATE ((UInt32)0 - 1)
325 
326 
/*
  WIN32 MultiByteToWideChar(CP_UTF8) emits 0xfffd point, if utf-8 error was found.
  And it can emit single 0xfffd from 2 src bytes.
  It doesn't emit single 0xfffd from 3-4 src bytes.
  We can
    1) emit Escape point for each incorrect byte. So we can recover data later
    2) emit 0xfffd for each incorrect byte.
       That scheme is similar to Escape scheme, but we emit 0xfffd
       instead of each Escape point.
    3) emit single 0xfffd from 1-2 incorrect bytes, as WIN32 MultiByteToWideChar scheme
*/
338 
/*
  Converts UTF-8 bytes in [src, srcLim) to UTF-16 units that are stored in wchar_t.
    dest    : output buffer, or NULL, if only the number of output units is required
    destLen : receives the number of output units
    flags   : UTF_FLAG__FROM_UTF8__* bits
  Each incorrect byte or sequence is replaced with a point from _ERROR_UTF8.
  Points from START_POINT_FOR_SURROGATE are written as 2 surrogate units.
  returns: (ok), that is never changed from (true) in this function.
*/
static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim, unsigned flags) throw()
{
  size_t destPos = 0;
  bool ok = true;

  for (;;)
  {
    if (src == srcLim)
    {
      *destLen = destPos;
      return ok;
    }

    const Byte c = (Byte)(*src++);

    // 1-byte sequence
    if (c < 0x80)
    {
      if (dest)
        dest[destPos] = (wchar_t)c;
      destPos++;
      continue;
    }

    // only heads [0xc2, 0xf4] are accepted: so there are 1-3 tail bytes
    if (c < 0xc0 + 2
      || c >= 0xf5) // it's limit for 0x140000 unicode codes : win32 compatibility
    {
      _ERROR_UTF8
    }

    unsigned numBytes;

    // it declares (val) and sets (numBytes) to the number of tail bytes
    _UTF8_HEAD_PARSE_MAX_3_BYTES

    unsigned pos = 0;
    do
    {
      if (src + pos == srcLim)
        break;
      unsigned c2 = (Byte)src[pos];
      // after subtraction only tail bytes [0x80, 0xbf] are in [0, 0x3f]
      c2 -= 0x80;
      if (c2 >= 0x40)
        break;
      val <<= 6;
      val |= c2;
      pos++;
      // the checks that are possible after the first tail byte:
      if (pos == 1)
      {
        // non-shortest form
        if (val < (((unsigned)1 << 7) >> numBytes))
          break;
        if (numBytes == 2)
        {
          // 3-byte form of a point from the surrogate range [0xd800, 0xdfff]
          if (flags & UTF_FLAG__FROM_UTF8__SURROGATE_ERROR)
            if ((val & (0xF800 >> 6)) == (0xd800 >> 6))
              break;
        }
        // 4-byte form of a point from 0x110000
        else if (numBytes == 3 && val >= (0x110000 >> 12))
          break;
      }
    }
    while (--numBytes);

    // (numBytes != 0) means that the sequence was not completed
    if (numBytes != 0)
    {
      if ((flags & UTF_FLAG__FROM_UTF8__USE_ESCAPE) == 0)
      {
        // the following code to emit the 0xfffd chars as win32 Utf8 function.
        // disable the following line, if you need 0xfffd for each incorrect byte as in Escape mode
        src += pos;
      }
      // in Escape mode (src) was not moved, so the accepted tail bytes
      // will be processed again as separate bytes
      _ERROR_UTF8
    }

    /*
    if (val < _UTF8_RANGE(pos - 1))
      _ERROR_UTF8
    */

    #ifdef UTF_ESCAPE_BASE

      if ((flags & UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT)
          && IS_ESCAPE_POINT(val, 0))
      {
        // We will emit 3 utf16-Escape-16-21 points from one Escape-16 point (3 bytes)
        _ERROR_UTF8
      }

    #endif

    /*
       We don't expect virtual Escape-21 points in UTF-8 stream.
       And we don't check for Escape-21.
       So utf8-Escape-21 will be converted to another 3 utf16-Escape-21 points.
       Maybe we could convert virtual utf8-Escape-21 to one utf16-Escape-21 point in some cases?
    */

    if (val < START_POINT_FOR_SURROGATE)
    {
      /*
      if ((flags & UTF_FLAG__FROM_UTF8__SURROGATE_ERROR)
          && IS_SURROGATE_POINT(val))
      {
        // We will emit 3 utf16-Escape-16-21 points from one Surrogate-16 point (3 bytes)
        _ERROR_UTF8
      }
      */
      if (dest)
        dest[destPos] = (wchar_t)val;
      destPos++;
    }
    else
    {
      /*
      if (val >= 0x110000)
      {
        // We will emit utf16-Escape-16-21 point from each source byte
        _ERROR_UTF8
      }
      */
      // surrogate pair: high unit contains (val - 0x10000) >> 10, low unit contains 10 low bits
      if (dest)
      {
        dest[destPos + 0] = (wchar_t)(0xd800 - (0x10000 >> 10) + (val >> 10));
        dest[destPos + 1] = (wchar_t)(0xdc00 + (val & 0x3ff));
      }
      destPos += 2;
    }
    // skip the tail bytes of the accepted sequence
    src += pos;
  }
}
467 
468 
469 
// _UTF8_HEAD(n, val) : head byte for the point (val) that is encoded with (n) tail bytes
// _UTF8_CHAR(n, val) : tail byte that contains the bits [6 * n, 6 * n + 5] of (val)
// (val) is parenthesized in both macros, so an expression argument is shifted as a whole.
#define _UTF8_HEAD(n, val) ((char)(_UTF8_START(n) + ((val) >> (6 * (n)))))
#define _UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))
472 
Utf16_To_Utf8_Calc(const wchar_t * src,const wchar_t * srcLim,unsigned flags)473 static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim, unsigned flags)
474 {
475   size_t size = (size_t)(srcLim - src);
476   for (;;)
477   {
478     if (src == srcLim)
479       return size;
480 
481     UInt32 val = (UInt32)(*src++);
482 
483     if (val < 0x80)
484       continue;
485 
486     if (val < _UTF8_RANGE(1))
487     {
488       size++;
489       continue;
490     }
491 
492     #ifdef UTF_ESCAPE_BASE
493 
494     #if UTF_ESCAPE_PLANE != 0
495     if (flags & UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE)
496       if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
497         continue;
498     #endif
499 
500     if (flags & UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE)
501       if (IS_ESCAPE_POINT(val, 0))
502         continue;
503 
504     #endif
505 
506     if (IS_SURROGATE_POINT(val))
507     {
508       // it's hack to UTF-8 encoding
509 
510       if (val < 0xdc00 && src != srcLim)
511       {
512         const UInt32 c2 = (UInt32)*src;
513         if (c2 >= 0xdc00 && c2 < 0xe000)
514           src++;
515       }
516       size += 2;
517       continue;
518     }
519 
520     #ifdef _WCHART_IS_16BIT
521 
522     size += 2;
523 
524     #else
525 
526          if (val < _UTF8_RANGE(2)) size += 2;
527     else if (val < _UTF8_RANGE(3)) size += 3;
528     else if (val < _UTF8_RANGE(4)) size += 4;
529     else if (val < _UTF8_RANGE(5)) size += 5;
530     else
531     #if _UTF8_NUM_TAIL_BYTES_MAX >= 6
532       size += 6;
533     #else
534       size += 3;
535     #endif
536 
537     #endif
538   }
539 }
540 
541 
/*
  Encodes the wchar_t units in [src, srcLim) as UTF-8 bytes to (dest).
  The size of (dest) is not checked here: the callers in this file allocate
  the size that was returned by Utf16_To_Utf8_Calc() for the same arguments.
    flags : UTF_FLAG__TO_UTF8__* bits
  returns: the pointer to the byte after the last written byte.
*/
static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim, unsigned flags)
{
  for (;;)
  {
    if (src == srcLim)
      return dest;

    UInt32 val = (UInt32)*src++;

    // 1 byte
    if (val < 0x80)
    {
      *dest++ = (char)val;
      continue;
    }

    // 2 bytes
    if (val < _UTF8_RANGE(1))
    {
      dest[0] = _UTF8_HEAD(1, val);
      dest[1] = _UTF8_CHAR(0, val);
      dest += 2;
      continue;
    }

    #ifdef UTF_ESCAPE_BASE

    #if UTF_ESCAPE_PLANE != 0
    /*
       if (wchar_t is 32-bit)
            && (UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE is set)
            && (point is virtual escape plane)
          we extract 8-bit byte from virtual HIGH-ESCAPE PLANE.
    */
    if (flags & UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE)
      if (IS_ESCAPE_POINT(val, UTF_ESCAPE_PLANE))
      {
        *dest++ = (char)(val);
        continue;
      }
    #endif // UTF_ESCAPE_PLANE != 0

    /* if (UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE is defined)
          we extract 8-bit byte from BMP-ESCAPE PLANE. */

    if (flags & UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE)
      if (IS_ESCAPE_POINT(val, 0))
      {
        // the low 8 bits of the escape point are the original byte
        *dest++ = (char)(val);
        continue;
      }

    #endif // UTF_ESCAPE_BASE

    if (IS_SURROGATE_POINT(val))
    {
      // it's hack to UTF-8 encoding
      if (val < 0xdc00 && src != srcLim)
      {
        const UInt32 c2 = (UInt32)*src;
        if (IS_LOW_SURROGATE_POINT(c2))
        {
          // correct pair: 2 units are written as one 4-byte sequence
          src++;
          val = (((val - 0xd800) << 10) | (c2 - 0xdc00)) + 0x10000;
          dest[0] = _UTF8_HEAD(3, val);
          dest[1] = _UTF8_CHAR(2, val);
          dest[2] = _UTF8_CHAR(1, val);
          dest[3] = _UTF8_CHAR(0, val);
          dest += 4;
          continue;
        }
      }
      // single surrogate: it is written as 3 bytes below (as is, or as 0xfffd)
      if (flags & UTF_FLAG__TO_UTF8__SURROGATE_ERROR)
        val = UTF_REPLACEMENT_CHAR; // WIN32 function does it
    }

    // 3 bytes. If wchar_t is 16-bit, there are no other cases
    #ifndef _WCHART_IS_16BIT
    if (val < _UTF8_RANGE(2))
    #endif
    {
      dest[0] = _UTF8_HEAD(2, val);
      dest[1] = _UTF8_CHAR(1, val);
      dest[2] = _UTF8_CHAR(0, val);
      dest += 3;
      continue;
    }

    #ifndef _WCHART_IS_16BIT

    // we don't expect this case. so we can throw exception
    // throw 20210407;

    // (numBits) is the number of point bits in the tail bytes, (b) is the head byte
    char b;
    unsigned numBits;
         if (val < _UTF8_RANGE(3)) { numBits = 6 * 3; b = _UTF8_HEAD(3, val); }
    else if (val < _UTF8_RANGE(4)) { numBits = 6 * 4; b = _UTF8_HEAD(4, val); }
    else if (val < _UTF8_RANGE(5)) { numBits = 6 * 5; b = _UTF8_HEAD(5, val); }
    #if _UTF8_NUM_TAIL_BYTES_MAX >= 6
    else                           { numBits = 6 * 6; b = (char)_UTF8_START(6); }
    #else
    else
    {
      // NOTE(review): 0xfffd is written here with 3 tail bytes (4 bytes total).
      // That size is the same as (size += 3) in Utf16_To_Utf8_Calc(), but it's
      // longer than the 3-byte form of 0xfffd - confirm that it is intended.
      val = UTF_REPLACEMENT_CHAR;
                                   { numBits = 6 * 3; b = _UTF8_HEAD(3, val); }
    }
    #endif

    *dest++ = b;

    // tail bytes: 6 bits per byte, from high bits to low bits
    do
    {
      numBits -= 6;
      *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F));
    }
    while (numBits != 0);

    #endif
  }
}
659 
Convert_UTF8_Buf_To_Unicode(const char * src,size_t srcSize,UString & dest,unsigned flags)660 bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags)
661 {
662   dest.Empty();
663   size_t destLen = 0;
664   Utf8_To_Utf16(NULL, &destLen, src, src + srcSize, flags);
665   bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src + srcSize, flags);
666   dest.ReleaseBuf_SetEnd((unsigned)destLen);
667   return res;
668 }
669 
ConvertUTF8ToUnicode_Flags(const AString & src,UString & dest,unsigned flags)670 bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags)
671 {
672   return Convert_UTF8_Buf_To_Unicode(src, src.Len(), dest,  flags);
673 }
674 
675 
676 static
677 unsigned g_UTF8_To_Unicode_Flags =
678     UTF_FLAG__FROM_UTF8__USE_ESCAPE
679   #ifndef _WCHART_IS_16BIT
680     | UTF_FLAG__FROM_UTF8__SURROGATE_ERROR
681   #ifdef _UTF8_RAW_NON_UTF8_SUPPORTED
682     | UTF_FLAG__FROM_UTF8__BMP_ESCAPE_CONVERT
683   #endif
684   #endif
685     ;
686 
687 
688 /*
689 bool ConvertUTF8ToUnicode_boolRes(const AString &src, UString &dest)
690 {
691   return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags);
692 }
693 */
694 
ConvertUTF8ToUnicode(const AString & src,UString & dest)695 bool ConvertUTF8ToUnicode(const AString &src, UString &dest)
696 {
697   return ConvertUTF8ToUnicode_Flags(src, dest, g_UTF8_To_Unicode_Flags);
698 }
699 
700 void Print_UString(const UString &a);
701 
ConvertUnicodeToUTF8_Flags(const UString & src,AString & dest,unsigned flags)702 void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags)
703 {
704   /*
705   if (src.Len()== 24)
706     throw "202104";
707   */
708   dest.Empty();
709   const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags);
710   char *destStart = dest.GetBuf((unsigned)destLen);
711   const char *destEnd = Utf16_To_Utf8(destStart, src, src.Ptr(src.Len()), flags);
712   dest.ReleaseBuf_SetEnd((unsigned)destLen);
713   // printf("\nlen = %d\n", src.Len());
714   if (destLen != (size_t)(destEnd - destStart))
715   {
716     /*
717     // dest.ReleaseBuf_SetEnd((unsigned)(destEnd - destStart));
718     printf("\nlen = %d\n", (unsigned)destLen);
719     printf("\n(destEnd - destStart) = %d\n", (unsigned)(destEnd - destStart));
720     printf("\n");
721     // Print_UString(src);
722     printf("\n");
723     // printf("\nlen = %d\n", destLen);
724     */
725     throw 20210406;
726   }
727 }
728 
729 
730 
731 unsigned g_Unicode_To_UTF8_Flags =
732       // UTF_FLAG__TO_UTF8__PARSE_HIGH_ESCAPE
733       0
734   #ifndef _WIN32
735     #ifdef _UTF8_RAW_NON_UTF8_SUPPORTED
736       | UTF_FLAG__TO_UTF8__EXTRACT_BMP_ESCAPE
737     #else
738       | UTF_FLAG__TO_UTF8__SURROGATE_ERROR;
739     #endif
740   #endif
741     ;
742 
ConvertUnicodeToUTF8(const UString & src,AString & dest)743 void ConvertUnicodeToUTF8(const UString &src, AString &dest)
744 {
745   ConvertUnicodeToUTF8_Flags(src, dest, g_Unicode_To_UTF8_Flags);
746 }
747 
Convert_Unicode_To_UTF8_Buf(const UString & src,CByteBuffer & dest)748 void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest)
749 {
750   const unsigned flags = g_Unicode_To_UTF8_Flags;
751   dest.Free();
752   const size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()), flags);
753   dest.Alloc(destLen);
754   const char *destEnd = Utf16_To_Utf8((char *)(void *)(Byte *)dest, src, src.Ptr(src.Len()), flags);
755   if (destLen != (size_t)(destEnd - (char *)(void *)(Byte *)dest))
756     throw 202104;
757 }
758 
759 /*
760 
761 #ifndef _WIN32
762 void Convert_UTF16_To_UTF32(const UString &src, UString &dest)
763 {
764   dest.Empty();
765   for (size_t i = 0; i < src.Len();)
766   {
767     wchar_t c = src[i++];
768     if (c >= 0xd800 && c < 0xdc00 && i < src.Len())
769     {
770       const wchar_t c2 = src[i];
771       if (c2 >= 0xdc00 && c2 < 0x10000)
772       {
773         // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
774         c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
775         // printf("%4x\n", (int)c);
776         i++;
777       }
778     }
779     dest += c;
780   }
781 }
782 
783 void Convert_UTF32_To_UTF16(const UString &src, UString &dest)
784 {
785   dest.Empty();
786   for (size_t i = 0; i < src.Len();)
787   {
788     wchar_t w = src[i++];
789     if (w >= 0x10000 && w < 0x110000)
790     {
791       w -= 0x10000;
792       dest += (wchar_t)((unsigned)0xd800 + (((unsigned)w >> 10) & 0x3ff));
793       w = 0xdc00 + (w & 0x3ff);
794     }
795     dest += w;
796   }
797 }
798 
799 bool UTF32_IsThere_BigPoint(const UString &src)
800 {
801   for (size_t i = 0; i < src.Len();)
802   {
803     const UInt32 c = (UInt32)src[i++];
804     if (c >= 0x110000)
805       return true;
806   }
807   return false;
808 }
809 
810 bool Unicode_IsThere_BmpEscape(const UString &src)
811 {
812   for (size_t i = 0; i < src.Len();)
813   {
814     const UInt32 c = (UInt32)src[i++];
815     if (IS_ESCAPE_POINT(c, 0))
816       return true;
817   }
818   return false;
819 }
820 
821 
822 #endif
823 
824 bool Unicode_IsThere_Utf16SurrogateError(const UString &src)
825 {
826   for (size_t i = 0; i < src.Len();)
827   {
828     const UInt32 val = (UInt32)src[i++];
829     if (IS_SURROGATE_POINT(val))
830     {
831       // it's hack to UTF-8 encoding
832       if (val >= 0xdc00 || i == src.Len())
833         return true;
834       const UInt32 c2 = (UInt32)*src;
835       if (!IS_LOW_SURROGATE_POINT(c2))
836         return true;
837     }
838   }
839   return false;
840 }
841 */
842 
843 #ifndef _WCHART_IS_16BIT
844 
Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &)845 void Convert_UnicodeEsc16_To_UnicodeEscHigh
846 #if UTF_ESCAPE_PLANE == 0
847     (UString &) {}
848 #else
849     (UString &s)
850 {
851   const unsigned len = s.Len();
852   for (unsigned i = 0; i < len; i++)
853   {
854     wchar_t c = s[i];
855     if (IS_ESCAPE_POINT(c, 0))
856     {
857       c += UTF_ESCAPE_PLANE;
858       s.ReplaceOneCharAtPos(i, c);
859     }
860   }
861 }
862 #endif
863 #endif
864