• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Common/UTFConvert.h
2 
3 #ifndef ZIP7_INC_COMMON_UTF_CONVERT_H
4 #define ZIP7_INC_COMMON_UTF_CONVERT_H
5 
6 #include "MyBuffer.h"
7 #include "MyString.h"
8 
9 struct CUtf8Check
10 {
11   // Byte MaxByte;     // in original src stream
12   bool NonUtf;
13   bool ZeroChar;
14   bool SingleSurrogate;
15   bool Escape;
16   bool Truncated;
17   UInt32 MaxHighPoint;  // only for points >= 0x80
18 
CUtf8CheckCUtf8Check19   CUtf8Check() { Clear(); }
20 
ClearCUtf8Check21   void Clear()
22   {
23     // MaxByte = 0;
24     NonUtf = false;
25     ZeroChar = false;
26     SingleSurrogate = false;
27     Escape = false;
28     Truncated = false;
29     MaxHighPoint = 0;
30   }
31 
UpdateCUtf8Check32   void Update(const CUtf8Check &c)
33   {
34     if (c.NonUtf) NonUtf = true;
35     if (c.ZeroChar) ZeroChar = true;
36     if (c.SingleSurrogate) SingleSurrogate = true;
37     if (c.Escape) Escape = true;
38     if (c.Truncated) Truncated = true;
39     if (MaxHighPoint < c.MaxHighPoint) MaxHighPoint = c.MaxHighPoint;
40   }
41 
PrintStatusCUtf8Check42   void PrintStatus(AString &s) const
43   {
44     s.Empty();
45 
46     // s.Add_OptSpaced("MaxByte=");
47     // s.Add_UInt32(MaxByte);
48 
49     if (NonUtf)          s.Add_OptSpaced("non-UTF8");
50     if (ZeroChar)        s.Add_OptSpaced("ZeroChar");
51     if (SingleSurrogate) s.Add_OptSpaced("SingleSurrogate");
52     if (Escape)          s.Add_OptSpaced("Escape");
53     if (Truncated)       s.Add_OptSpaced("Truncated");
54 
55     if (MaxHighPoint != 0)
56     {
57       s.Add_OptSpaced("MaxUnicode=");
58       s.Add_UInt32(MaxHighPoint);
59     }
60   }
61 
62 
63   bool IsOK(bool allowReduced = false) const
64   {
65     if (NonUtf || SingleSurrogate || ZeroChar)
66       return false;
67     if (MaxHighPoint >= 0x110000)
68       return false;
69     if (Truncated && !allowReduced)
70       return false;
71     return true;
72   }
73 
74   // it checks full buffer as specified in (size) and it doesn't stop on zero char
75   void Check_Buf(const char *src, size_t size) throw();
76 
Check_AStringCUtf8Check77   void Check_AString(const AString &s) throw()
78   {
79     Check_Buf(s.Ptr(), s.Len());
80   }
81 };
82 
83 /*
84 if (allowReduced == false) - all UTF-8 character sequences must be finished.
85 if (allowReduced == true)  - it allows truncated last character-Utf8-sequence
86 */
87 
88 bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw();
89 bool CheckUTF8_AString(const AString &s) throw();
90 
91 #define Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR    (1 << 0)
92 #define Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE         (1 << 1)
93 #define Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT (1 << 2)
94 
95 /*
96 Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
97 
98    if (flag is NOT set)
99    {
100      it processes SINGLE-SURROGATE-8 as valid Unicode point.
101      it converts  SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16
102      Note: some sequences of two SINGLE-SURROGATE-8 points
103            will generate correct SURROGATE-16-PAIR, and
104            that SURROGATE-16-PAIR later will be converted to correct
105            UTF8-SURROGATE-21 point. So we don't restore original
106            STR-8 sequence in that case.
107    }
108 
109    if (flag is set)
110    {
111      if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is set)
112         it generates ESCAPE for SINGLE-SURROGATE-8,
113      if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is NOT set)
114         it generates U+fffd for SINGLE-SURROGATE-8.
115    }
116 
117 
118 Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
119 
120    if (flag is NOT set)
121      it generates (U+fffd) code for non-UTF-8 (invalid) characters
122 
123    if (flag is set)
124    {
125      It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters.
126      And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes.
127    }
128 
129 Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
130 
131    if (flag is NOT set)
132    {
133      it processes ESCAPE-8 points like any other Unicode points.
134      In Linux: ESCAPE-16 will mean two different ESCAPE-8 sequences,
135        so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW
136    }
137 
138    if (flag is set)
139    {
140      it generates ESCAPE-16-21 for ESCAPE-8 points
141      so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21.
142    }
143 
144 
145 Main USE CASES with UTF-8 <-> UTF-16 conversions:
146 
147  WIN32:   UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW
148    {
149             set Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
150      Do NOT set Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
151      Do NOT set Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
152 
153      So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8.
154    }
155 
156  Linux:   UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
157    {
158      we want to restore the original UTF-8-RAW sequence later from that ESCAPE-16.
159      Set the flags:
160        Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
161        Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
162        Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT
163    }
164 
165  MacOS:   UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW
166    {
167      we want to restore correct UTF-8 without any BMP processing:
168      Set the flags:
169        Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR
170        Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE
171    }
172 
173 */
174 
175 // zero char is not allowed in (src) buf
176 bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0);
177 
178 bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0);
179 bool ConvertUTF8ToUnicode(const AString &src, UString &dest);
180 
181 #define Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR    (1 << 8)
182 #define Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE (1 << 9)
183 // #define Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE  (1 << 10)
184 
185 /*
186 Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
187 
188   if (flag is NOT set)
189   {
190      we extract SINGLE-SURROGATE as normal UTF-8
191 
192      In Windows : used for the UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW round trip.
193 
194      In Linux :
195        use-case-1: UTF-8 -> UTF-16 -> UTF-8  doesn't generate UTF-16 SINGLE-SURROGATE,
196                    if (Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) is used.
197        use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux)
198                    will generate SINGLE-SURROGATE-UTF-8 here.
199   }
200 
201   if (flag is set)
202   {
203      we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE
204      it can be used for compatibility mode with WIN32 UTF function
205      or if we want UTF-8 stream without any errors
206   }
207 
208 
209 Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE
210 
211   if (flag is NOT set) it doesn't extract  raw 8-bit symbol from Escape-Plane-16
212   if (flag is set)     it         extracts raw 8-bit symbol from Escape-Plane-16
213 
214   in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive):
215   if (we       use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane.
216   if (we don't use High-Escape-Plane), we must use Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
217 
218 
219 Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE
220   // that flag affects the code only if (wchar_t is 32-bit)
221   // that mode with high-escape can be disabled now in UTFConvert.cpp
222   if (flag is NOT set)
223      it doesn't extract raw 8-bit symbol from High-Escape-Plane
224   if (flag is set)
225      it        extracts raw 8-bit symbol from High-Escape-Plane
226 
227 Main use cases:
228 
229 WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW
230    {
231      Do NOT set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE.
232      Do NOT set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR.
233      So we restore original UTF-16-RAW.
234    }
235 
236 Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes
237      set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive
238      set Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE for intermediate UTF-16.
239      Note: high escape mode can be ignored now in UTFConvert.cpp
240 
241 macOS:
242      the system doesn't support incorrect UTF-8 in file names.
243      set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR
244 */
245 
246 extern unsigned g_Unicode_To_UTF8_Flags;
247 
248 void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0);
249 void ConvertUnicodeToUTF8(const UString &src, AString &dest);
250 
251 void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest);
252 
253 /*
254 #ifndef _WIN32
255 void Convert_UTF16_To_UTF32(const UString &src, UString &dest);
256 void Convert_UTF32_To_UTF16(const UString &src, UString &dest);
257 bool UTF32_IsThere_BigPoint(const UString &src);
258 bool Unicode_IsThere_BmpEscape(const UString &src);
259 #endif
260 
261 bool Unicode_IsThere_Utf16SurrogateError(const UString &src);
262 */
263 
264 #ifdef Z7_WCHART_IS_16BIT
265 #define Convert_UnicodeEsc16_To_UnicodeEscHigh(s)
266 #else
267 void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s);
268 #endif
269 
270 /*
271 // #include "../../C/CpuArch.h"
272 
273 // ---------- Utf16 Little endian functions ----------
274 
275 // We store 16-bit surrogates even in 32-bit WCHARs in Linux.
276 // So now we don't use the following code:
277 
278 #if WCHAR_MAX > 0xffff
279 
280 // void *p     : pointer to src bytes stream
281 // size_t len  : num Utf16 characters : it can include or not include NULL character
282 
283 inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len)
284 {
285   #if WCHAR_MAX > 0xffff
286   size_t num_wchars = 0;
287   for (size_t i = 0; i < len; i++)
288   {
289     wchar_t c = GetUi16(p);
290     p = (const void *)((const Byte *)p + 2);
291     if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
292     {
293       wchar_t c2 = GetUi16(p);
294       if (c2 >= 0xdc00 && c2 < 0xe000)
295       {
296         c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
297         p = (const void *)((const Byte *)p + 2);
298         i++;
299       }
300     }
301     num_wchars++;
302   }
303   return num_wchars;
304   #else
305   UNUSED_VAR(p)
306   return len;
307   #endif
308 }
309 
310 // #include <stdio.h>
311 
312 inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest)
313 {
314   for (size_t i = 0; i < len; i++)
315   {
316     wchar_t c = GetUi16(p);
317     p = (const void *)((const Byte *)p + 2);
318 
319     #if WCHAR_PATH_SEPARATOR != L'/'
320     if (c == L'/')
321       c = WCHAR_PATH_SEPARATOR;
322     #endif
323 
324     #if WCHAR_MAX > 0xffff
325 
326     if (c >= 0xd800 && c < 0xdc00 && i + 1 != len)
327     {
328       wchar_t c2 = GetUi16(p);
329       if (c2 >= 0xdc00 && c2 < 0xe000)
330       {
331         // printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2);
332         c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
333         p = (const void *)((const Byte *)p + 2);
334         i++;
335         // printf("%4x\n", (int)c);
336       }
337     }
338 
339     #endif
340 
341     *dest++ = c;
342   }
343   return dest;
344 }
345 
346 
347 inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p)
348 {
349   size_t num = 0;
350   for (;;)
351   {
352     wchar_t c = *p++;
353     if (c == 0)
354       return num;
355     num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1);
356   }
357   return num;
358 }
359 
360 inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest)
361 {
362   for (;;)
363   {
364     wchar_t c = *p++;
365     if (c == 0)
366       return dest;
367     if (c >= 0x10000 && c < 0x110000)
368     {
369       SetUi16(dest    , (UInt16)(0xd800 + ((c >> 10) & 0x3FF)));
370       SetUi16(dest + 2, (UInt16)(0xdc00 + ( c        & 0x3FF)));
371       dest += 4;
372     }
373     else
374     {
375       SetUi16(dest, c);
376       dest += 2;
377     }
378   }
379 }
380 
381 #endif
382 */
383 
384 #endif
385