1 // UTFConvert.cpp
2
3 #include "StdAfx.h"
4
5 #include "MyTypes.h"
6 #include "UTFConvert.h"
7
// _WCHART_IS_16BIT is defined only for _WIN32 builds. The UTF-16 -> UTF-8
// routines in this file test it to leave out the branches that handle
// wchar_t values which need more than 3 UTF-8 bytes.
#if defined(_WIN32)
  #define _WCHART_IS_16BIT 1
#endif
11
/*
  _UTF8_START(n) - is the base value of the start byte (head), if there are (n) additional bytes after the start byte

  n : _UTF8_START(n) : Bits of code point

  0 : 0x80 :    : unused
  1 : 0xC0 : 11 :
  2 : 0xE0 : 16 : Basic Multilingual Plane
  3 : 0xF0 : 21 : Unicode space
  4 : 0xF8 : 26 :
  5 : 0xFC : 31 : UCS-4
  6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value
  7 : 0xFF :
*/
26
// _UTF8_START(n): value of a UTF-8 head byte that announces (n) continuation
// bytes, with all payload bits cleared.
//   n = 0..7  ->  0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF
#define _UTF8_START(n) (0x100 - (1 << (7 - (n))))

// _UTF8_HEAD_PARSE2(n): one link of the head-byte classification chain.
// It expects two variables to exist where it is expanded:
//   c        - the head byte (modified in place)
//   numBytes - receives the number of continuation bytes
// If c is below the base value of class (n + 1), numBytes is set to (n) and
// the class marker _UTF8_START(n) is subtracted from c, so that c keeps only
// the payload bits of the head byte.
#define _UTF8_HEAD_PARSE2(n) \
    if (c < _UTF8_START((n) + 1)) \
    { \
      numBytes = (n); \
      c -= _UTF8_START(n); \
    }

// _UTF8_HEAD_PARSE: if / else-if chain that classifies head bytes with 1..5
// continuation bytes. The chain is left open on purpose: the code that expands
// it appends its own final `else` for head bytes that match no class
// (0xFE and 0xFF). The callers reject c < 0xC0 before expanding the chain.
// The definition ends on the last link (no trailing line continuation), so
// the line that follows the macro can never become part of it.
#define _UTF8_HEAD_PARSE \
         _UTF8_HEAD_PARSE2(1) \
    else _UTF8_HEAD_PARSE2(2) \
    else _UTF8_HEAD_PARSE2(3) \
    else _UTF8_HEAD_PARSE2(4) \
    else _UTF8_HEAD_PARSE2(5)

// Head bytes that announce 6 continuation bytes are not accepted:
// else _UTF8_HEAD_PARSE2(6)
39
CheckUTF8(const char * src,bool allowReduced)40 bool CheckUTF8(const char *src, bool allowReduced) throw()
41 {
42 for (;;)
43 {
44 Byte c = *src++;
45 if (c == 0)
46 return true;
47
48 if (c < 0x80)
49 continue;
50 if (c < 0xC0) // (c < 0xC0 + 2) // if we support only optimal encoding chars
51 return false;
52
53 unsigned numBytes;
54 _UTF8_HEAD_PARSE
55 else
56 return false;
57
58 UInt32 val = c;
59
60 do
61 {
62 Byte c2 = *src++;
63 if (c2 < 0x80 || c2 >= 0xC0)
64 return allowReduced && c2 == 0;
65 val <<= 6;
66 val |= (c2 - 0x80);
67 }
68 while (--numBytes);
69
70 if (val >= 0x110000)
71 return false;
72 }
73 }
74
75
76 #define _ERROR_UTF8 \
77 { if (dest) dest[destPos] = (wchar_t)0xFFFD; destPos++; ok = false; continue; }
78
Utf8_To_Utf16(wchar_t * dest,size_t * destLen,const char * src,const char * srcLim)79 static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim) throw()
80 {
81 size_t destPos = 0;
82 bool ok = true;
83
84 for (;;)
85 {
86 Byte c;
87 if (src == srcLim)
88 {
89 *destLen = destPos;
90 return ok;
91 }
92 c = *src++;
93
94 if (c < 0x80)
95 {
96 if (dest)
97 dest[destPos] = (wchar_t)c;
98 destPos++;
99 continue;
100 }
101 if (c < 0xC0)
102 _ERROR_UTF8
103
104 unsigned numBytes;
105 _UTF8_HEAD_PARSE
106 else
107 _ERROR_UTF8
108
109 UInt32 val = c;
110
111 do
112 {
113 Byte c2;
114 if (src == srcLim)
115 break;
116 c2 = *src;
117 if (c2 < 0x80 || c2 >= 0xC0)
118 break;
119 src++;
120 val <<= 6;
121 val |= (c2 - 0x80);
122 }
123 while (--numBytes);
124
125 if (numBytes != 0)
126 _ERROR_UTF8
127
128 if (val < 0x10000)
129 {
130 if (dest)
131 dest[destPos] = (wchar_t)val;
132 destPos++;
133 }
134 else
135 {
136 val -= 0x10000;
137 if (val >= 0x100000)
138 _ERROR_UTF8
139 if (dest)
140 {
141 dest[destPos + 0] = (wchar_t)(0xD800 + (val >> 10));
142 dest[destPos + 1] = (wchar_t)(0xDC00 + (val & 0x3FF));
143 }
144 destPos += 2;
145 }
146 }
147 }
148
// _UTF8_RANGE(n): the smallest value that does NOT fit into a UTF-8 sequence
// with (n) continuation bytes:
//   n = 1..5  ->  1 << 11, 1 << 16, 1 << 21, 1 << 26, 1 << 31
#define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6))

// _UTF8_HEAD(n, val): head byte of the sequence with (n) continuation bytes
// that encodes (val). It is _UTF8_START(n) plus the bits of (val) above the
// (6 * n) bits carried by the continuation bytes.
// (val) is parenthesized so that an expression argument (for example a | b)
// is shifted as a whole.
#define _UTF8_HEAD(n, val) ((char)(_UTF8_START(n) + ((val) >> (6 * (n)))))

// _UTF8_CHAR(n, val): continuation byte that carries the 6 bits of (val)
// starting at bit (6 * n); n = 0 gives the last byte of the sequence.
#define _UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))
153
Utf16_To_Utf8_Calc(const wchar_t * src,const wchar_t * srcLim)154 static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim)
155 {
156 size_t size = srcLim - src;
157 for (;;)
158 {
159 if (src == srcLim)
160 return size;
161
162 UInt32 val = *src++;
163
164 if (val < 0x80)
165 continue;
166
167 if (val < _UTF8_RANGE(1))
168 {
169 size++;
170 continue;
171 }
172
173 if (val >= 0xD800 && val < 0xDC00 && src != srcLim)
174 {
175 UInt32 c2 = *src;
176 if (c2 >= 0xDC00 && c2 < 0xE000)
177 {
178 src++;
179 size += 2;
180 continue;
181 }
182 }
183
184 #ifdef _WCHART_IS_16BIT
185
186 size += 2;
187
188 #else
189
190 if (val < _UTF8_RANGE(2)) size += 2;
191 else if (val < _UTF8_RANGE(3)) size += 3;
192 else if (val < _UTF8_RANGE(4)) size += 4;
193 else if (val < _UTF8_RANGE(5)) size += 5;
194 else size += 6;
195
196 #endif
197 }
198 }
199
Utf16_To_Utf8(char * dest,const wchar_t * src,const wchar_t * srcLim)200 static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim)
201 {
202 for (;;)
203 {
204 if (src == srcLim)
205 return dest;
206
207 UInt32 val = *src++;
208
209 if (val < 0x80)
210 {
211 *dest++ = (char)val;
212 continue;
213 }
214
215 if (val < _UTF8_RANGE(1))
216 {
217 dest[0] = _UTF8_HEAD(1, val);
218 dest[1] = _UTF8_CHAR(0, val);
219 dest += 2;
220 continue;
221 }
222
223 if (val >= 0xD800 && val < 0xDC00 && src != srcLim)
224 {
225 UInt32 c2 = *src;
226 if (c2 >= 0xDC00 && c2 < 0xE000)
227 {
228 src++;
229 val = (((val - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000;
230 dest[0] = _UTF8_HEAD(3, val);
231 dest[1] = _UTF8_CHAR(2, val);
232 dest[2] = _UTF8_CHAR(1, val);
233 dest[3] = _UTF8_CHAR(0, val);
234 dest += 4;
235 continue;
236 }
237 }
238
239 #ifndef _WCHART_IS_16BIT
240 if (val < _UTF8_RANGE(2))
241 #endif
242 {
243 dest[0] = _UTF8_HEAD(2, val);
244 dest[1] = _UTF8_CHAR(1, val);
245 dest[2] = _UTF8_CHAR(0, val);
246 dest += 3;
247 continue;
248 }
249
250 #ifndef _WCHART_IS_16BIT
251
252 UInt32 b;
253 unsigned numBits;
254 if (val < _UTF8_RANGE(3)) { numBits = 6 * 3; b = _UTF8_HEAD(3, val); }
255 else if (val < _UTF8_RANGE(4)) { numBits = 6 * 4; b = _UTF8_HEAD(4, val); }
256 else if (val < _UTF8_RANGE(5)) { numBits = 6 * 5; b = _UTF8_HEAD(5, val); }
257 else { numBits = 6 * 6; b = _UTF8_START(6); }
258
259 *dest++ = (Byte)b;
260
261 do
262 {
263 numBits -= 6;
264 *dest++ = (char)(0x80 + ((val >> numBits) & 0x3F));
265 }
266 while (numBits != 0);
267
268 #endif
269 }
270 }
271
ConvertUTF8ToUnicode(const AString & src,UString & dest)272 bool ConvertUTF8ToUnicode(const AString &src, UString &dest)
273 {
274 dest.Empty();
275 size_t destLen = 0;
276 Utf8_To_Utf16(NULL, &destLen, src, src.Ptr(src.Len()));
277 bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src.Ptr(src.Len()));
278 dest.ReleaseBuf_SetEnd((unsigned)destLen);
279 return res;
280 }
281
ConvertUnicodeToUTF8(const UString & src,AString & dest)282 void ConvertUnicodeToUTF8(const UString &src, AString &dest)
283 {
284 dest.Empty();
285 size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()));
286 Utf16_To_Utf8(dest.GetBuf((unsigned)destLen), src, src.Ptr(src.Len()));
287 dest.ReleaseBuf_SetEnd((unsigned)destLen);
288 }
289