• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2001-2004 Unicode, Inc.
3  *
4  * Disclaimer
5  *
6  * This source code is provided as is by Unicode, Inc. No claims are
7  * made as to fitness for any particular purpose. No warranties of any
8  * kind are expressed or implied. The recipient agrees to determine
9  * applicability of information provided. If this file has been
10  * purchased on magnetic or optical media from Unicode, Inc., the
11  * sole remedy for any claim will be exchange of defective media
12  * within 90 days of receipt.
13  *
14  * Limitations on Rights to Redistribute This Code
15  *
16  * Unicode, Inc. hereby grants the right to freely use the information
17  * supplied in this file in the creation of products supporting the
18  * Unicode Standard, and to make copies of this file in any form
19  * for internal or external distribution as long as this notice
20  * remains attached.
21  */
22 
23 /* ---------------------------------------------------------------------
24 
25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26     Author: Mark E. Davis, 1994.
27     Rev History: Rick McGowan, fixes & updates May 2001.
28     Sept 2001: fixed const & error conditions per
29 	mods suggested by S. Parent & A. Lillich.
30     June 2002: Tim Dodd added detection and handling of incomplete
31 	source sequences, enhanced error detection, added casts
32 	to eliminate compiler warnings.
33     July 2003: slight mods to back out aggressive FFFE detection.
34     Jan 2004: updated switches in from-UTF8 conversions.
35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36 
37     See the header file "ConvertUTF.h" for complete documentation.
38 
39 ------------------------------------------------------------------------ */
40 
41 
42 #include "antlr3convertutf.h"
43 
44 #ifdef CVTUTF_DEBUG
45 #include <stdio.h>
46 #endif
47 
48 
49 
50 /* --------------------------------------------------------------------- */
51 
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)52 ConversionResult ConvertUTF32toUTF16 (
53 	const UTF32** sourceStart, const UTF32* sourceEnd,
54 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
55     ConversionResult result = conversionOK;
56     const UTF32* source = *sourceStart;
57     UTF16* target = *targetStart;
58     while (source < sourceEnd) {
59 	UTF32 ch;
60 	if (target >= targetEnd) {
61 	    result = targetExhausted; break;
62 	}
63 	ch = *source++;
64 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
65 	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
66 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
67 		if (flags == strictConversion) {
68 		    --source; /* return to the illegal value itself */
69 		    result = sourceIllegal;
70 		    break;
71 		} else {
72 		    *target++ = UNI_REPLACEMENT_CHAR;
73 		}
74 	    } else {
75 		*target++ = (UTF16)ch; /* normal case */
76 	    }
77 	} else if (ch > UNI_MAX_LEGAL_UTF32) {
78 	    if (flags == strictConversion) {
79 		result = sourceIllegal;
80 	    } else {
81 		*target++ = UNI_REPLACEMENT_CHAR;
82 	    }
83 	} else {
84 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
85 	    if (target + 1 >= targetEnd) {
86 		--source; /* Back up source pointer! */
87 		result = targetExhausted; break;
88 	    }
89 	    ch -= halfBase;
90 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
91 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
92 	}
93     }
94     *sourceStart = source;
95     *targetStart = target;
96     return result;
97 }
98 
99 /* --------------------------------------------------------------------- */
100 
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)101 ConversionResult ConvertUTF16toUTF32 (
102 	const UTF16** sourceStart, const UTF16* sourceEnd,
103 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
104     ConversionResult result = conversionOK;
105     const UTF16* source = *sourceStart;
106     UTF32* target = *targetStart;
107     UTF32 ch, ch2;
108     while (source < sourceEnd) {
109 	const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
110 	ch = *source++;
111 	/* If we have a surrogate pair, convert to UTF32 first. */
112 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
113 	    /* If the 16 bits following the high surrogate are in the source buffer... */
114 	    if (source < sourceEnd) {
115 		ch2 = *source;
116 		/* If it's a low surrogate, convert to UTF32. */
117 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
118 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
119 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
120 		    ++source;
121 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
122 		    --source; /* return to the illegal value itself */
123 		    result = sourceIllegal;
124 		    break;
125 		}
126 	    } else { /* We don't have the 16 bits following the high surrogate. */
127 		--source; /* return to the high surrogate */
128 		result = sourceExhausted;
129 		break;
130 	    }
131 	} else if (flags == strictConversion) {
132 	    /* UTF-16 surrogate values are illegal in UTF-32 */
133 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
134 		--source; /* return to the illegal value itself */
135 		result = sourceIllegal;
136 		break;
137 	    }
138 	}
139 	if (target >= targetEnd) {
140 	    source = oldSource; /* Back up source pointer! */
141 	    result = targetExhausted; break;
142 	}
143 	*target++ = ch;
144     }
145     *sourceStart = source;
146     *targetStart = target;
147 #ifdef CVTUTF_DEBUG
148 if (result == sourceIllegal) {
149     ANTLR3_FPRINTF(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
150     fflush(stderr);
151 }
152 #endif
153     return result;
154 }
155 
156 /* --------------------------------------------------------------------- */
157 
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
 * the number of trailing bytes that are supposed to follow that byte.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes. Those entries
 * are kept for anyone who wants to handle the longer forms that earlier
 * definitions of UTF-8 allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
175 
176 /*
177  * Magic values subtracted from a buffer value during UTF8 conversion.
178  * This table contains as many values as there might be trailing bytes
179  * in a UTF-8 sequence.
180  */
181 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
182 		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
183 
184 /*
185  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
186  * into the first byte, depending on how many bytes follow.  There are
187  * as many entries in this table as there are UTF-8 sequence types.
188  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
189  * for *legal* UTF-8 will be 4 or fewer bytes total.
190  */
191 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
192 
193 /* --------------------------------------------------------------------- */
194 
195 /* The interface converts a whole buffer to avoid function-call overhead.
196  * Constants have been gathered. Loops & conditionals have been removed as
197  * much as possible for efficiency, in favor of drop-through switches.
198  * (See "Note A" at the bottom of the file for equivalent code.)
199  * If your compiler supports it, the "isLegalUTF8" call can be turned
200  * into an inline function.
201  */
202 
203 /* --------------------------------------------------------------------- */
204 
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)205 ConversionResult ConvertUTF16toUTF8 (
206 	const UTF16** sourceStart, const UTF16* sourceEnd,
207 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
208     ConversionResult result = conversionOK;
209     const UTF16* source = *sourceStart;
210     UTF8* target = *targetStart;
211     while (source < sourceEnd) {
212 	UTF32 ch;
213 	unsigned short bytesToWrite = 0;
214 	const UTF32 byteMask = 0xBF;
215 	const UTF32 byteMark = 0x80;
216 	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
217 	ch = *source++;
218 	/* If we have a surrogate pair, convert to UTF32 first. */
219 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
220 	    /* If the 16 bits following the high surrogate are in the source buffer... */
221 	    if (source < sourceEnd) {
222 		UTF32 ch2 = *source;
223 		/* If it's a low surrogate, convert to UTF32. */
224 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
225 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
226 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
227 		    ++source;
228 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
229 		    --source; /* return to the illegal value itself */
230 		    result = sourceIllegal;
231 		    break;
232 		}
233 	    } else { /* We don't have the 16 bits following the high surrogate. */
234 		--source; /* return to the high surrogate */
235 		result = sourceExhausted;
236 		break;
237 	    }
238         } else if (flags == strictConversion) {
239 	    /* UTF-16 surrogate values are illegal in UTF-32 */
240 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
241 		--source; /* return to the illegal value itself */
242 		result = sourceIllegal;
243 		break;
244 	    }
245 	}
246 	/* Figure out how many bytes the result will require */
247 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
248 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
249 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
250 	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
251 	} else {			    bytesToWrite = 3;
252 					    ch = UNI_REPLACEMENT_CHAR;
253 	}
254 
255 	target += bytesToWrite;
256 	if (target > targetEnd) {
257 	    source = oldSource; /* Back up source pointer! */
258 	    target -= bytesToWrite; result = targetExhausted; break;
259 	}
260 	switch (bytesToWrite) { /* note: everything falls through. */
261 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
262 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
263 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
264 	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
265 	}
266 	target += bytesToWrite;
267     }
268     *sourceStart = source;
269     *targetStart = target;
270     return result;
271 }
272 
273 /* --------------------------------------------------------------------- */
274 
275 /*
276  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
277  * This must be called with the length pre-determined by the first byte.
278  * If not calling this from ConvertUTF8to*, then the length can be set by:
279  *  length = trailingBytesForUTF8[*source]+1;
280  * and the sequence is illegal right away if there aren't that many bytes
281  * available.
282  * If presented with a length > 4, this returns false.  The Unicode
283  * definition of UTF-8 goes up to 4-byte sequences.
284  */
285 
286 static ANTLR3_BOOLEAN
isLegalUTF8(const UTF8 * source,int length)287 isLegalUTF8(const UTF8 *source, int length) {
288     UTF8 a;
289     const UTF8 *srcptr = source+length;
290     switch (length) {
291     default: return false;
292 	/* Everything else falls through when "true"... */
293     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
294     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
295     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
296 
297 	switch (*source) {
298 	    /* no fall-through in this inner switch */
299 	    case 0xE0: if (a < 0xA0) return false; break;
300 	    case 0xED: if (a > 0x9F) return false; break;
301 	    case 0xF0: if (a < 0x90) return false; break;
302 	    case 0xF4: if (a > 0x8F) return false; break;
303 	    default:   if (a < 0x80) return false;
304 	}
305 
306     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
307     }
308     if (*source > 0xF4) return false;
309     return true;
310 }
311 
312 /* --------------------------------------------------------------------- */
313 
314 /*
315  * Exported function to return whether a UTF-8 sequence is legal or not.
316  * This is not used here; it's just exported.
317  */
318 ANTLR3_BOOLEAN
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)319 isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
320     int length = trailingBytesForUTF8[*source]+1;
321     if (source+length > sourceEnd) {
322 	return false;
323     }
324     return isLegalUTF8(source, length);
325 }
326 
327 /* --------------------------------------------------------------------- */
328 
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)329 ConversionResult ConvertUTF8toUTF16 (
330 	const UTF8** sourceStart, const UTF8* sourceEnd,
331 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
332     ConversionResult result = conversionOK;
333     const UTF8* source = *sourceStart;
334     UTF16* target = *targetStart;
335     while (source < sourceEnd) {
336 	UTF32 ch = 0;
337 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
338 	if (source + extraBytesToRead >= sourceEnd) {
339 	    result = sourceExhausted; break;
340 	}
341 	/* Do this check whether lenient or strict */
342 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
343 	    result = sourceIllegal;
344 	    break;
345 	}
346 	/*
347 	 * The cases all fall through. See "Note A" below.
348 	 */
349 	switch (extraBytesToRead) {
350 	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
351 	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
352 	    case 3: ch += *source++; ch <<= 6;
353 	    case 2: ch += *source++; ch <<= 6;
354 	    case 1: ch += *source++; ch <<= 6;
355 	    case 0: ch += *source++;
356 	}
357 	ch -= offsetsFromUTF8[extraBytesToRead];
358 
359 	if (target >= targetEnd) {
360 	    source -= (extraBytesToRead+1); /* Back up source pointer! */
361 	    result = targetExhausted; break;
362 	}
363 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
364 	    /* UTF-16 surrogate values are illegal in UTF-32 */
365 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
366 		if (flags == strictConversion) {
367 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
368 		    result = sourceIllegal;
369 		    break;
370 		} else {
371 		    *target++ = UNI_REPLACEMENT_CHAR;
372 		}
373 	    } else {
374 		*target++ = (UTF16)ch; /* normal case */
375 	    }
376 	} else if (ch > UNI_MAX_UTF16) {
377 	    if (flags == strictConversion) {
378 		result = sourceIllegal;
379 		source -= (extraBytesToRead+1); /* return to the start */
380 		break; /* Bail out; shouldn't continue */
381 	    } else {
382 		*target++ = UNI_REPLACEMENT_CHAR;
383 	    }
384 	} else {
385 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
386 	    if (target + 1 >= targetEnd) {
387 		source -= (extraBytesToRead+1); /* Back up source pointer! */
388 		result = targetExhausted; break;
389 	    }
390 	    ch -= halfBase;
391 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
392 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
393 	}
394     }
395     *sourceStart = source;
396     *targetStart = target;
397     return result;
398 }
399 
400 /* --------------------------------------------------------------------- */
401 
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)402 ConversionResult ConvertUTF32toUTF8 (
403 	const UTF32** sourceStart, const UTF32* sourceEnd,
404 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
405     ConversionResult result = conversionOK;
406     const UTF32* source = *sourceStart;
407     UTF8* target = *targetStart;
408     while (source < sourceEnd) {
409 	UTF32 ch;
410 	unsigned short bytesToWrite = 0;
411 	const UTF32 byteMask = 0xBF;
412 	const UTF32 byteMark = 0x80;
413 	ch = *source++;
414 	if (flags == strictConversion ) {
415 	    /* UTF-16 surrogate values are illegal in UTF-32 */
416 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
417 		--source; /* return to the illegal value itself */
418 		result = sourceIllegal;
419 		break;
420 	    }
421 	}
422 	/*
423 	 * Figure out how many bytes the result will require. Turn any
424 	 * illegally large UTF32 things (> Plane 17) into replacement chars.
425 	 */
426 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
427 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
428 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
429 	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
430 	} else {			    bytesToWrite = 3;
431 					    ch = UNI_REPLACEMENT_CHAR;
432 					    result = sourceIllegal;
433 	}
434 
435 	target += bytesToWrite;
436 	if (target > targetEnd) {
437 	    --source; /* Back up source pointer! */
438 	    target -= bytesToWrite; result = targetExhausted; break;
439 	}
440 	switch (bytesToWrite) { /* note: everything falls through. */
441 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
442 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
443 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
444 	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
445 	}
446 	target += bytesToWrite;
447     }
448     *sourceStart = source;
449     *targetStart = target;
450     return result;
451 }
452 
453 /* --------------------------------------------------------------------- */
454 
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)455 ConversionResult ConvertUTF8toUTF32 (
456 	const UTF8** sourceStart, const UTF8* sourceEnd,
457 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
458     ConversionResult result = conversionOK;
459     const UTF8* source = *sourceStart;
460     UTF32* target = *targetStart;
461     while (source < sourceEnd) {
462 	UTF32 ch = 0;
463 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
464 	if (source + extraBytesToRead >= sourceEnd) {
465 	    result = sourceExhausted; break;
466 	}
467 	/* Do this check whether lenient or strict */
468 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
469 	    result = sourceIllegal;
470 	    break;
471 	}
472 	/*
473 	 * The cases all fall through. See "Note A" below.
474 	 */
475 	switch (extraBytesToRead) {
476 	    case 5: ch += *source++; ch <<= 6;
477 	    case 4: ch += *source++; ch <<= 6;
478 	    case 3: ch += *source++; ch <<= 6;
479 	    case 2: ch += *source++; ch <<= 6;
480 	    case 1: ch += *source++; ch <<= 6;
481 	    case 0: ch += *source++;
482 	}
483 	ch -= offsetsFromUTF8[extraBytesToRead];
484 
485 	if (target >= targetEnd) {
486 	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
487 	    result = targetExhausted; break;
488 	}
489 	if (ch <= UNI_MAX_LEGAL_UTF32) {
490 	    /*
491 	     * UTF-16 surrogate values are illegal in UTF-32, and anything
492 	     * over Plane 17 (> 0x10FFFF) is illegal.
493 	     */
494 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
495 		if (flags == strictConversion) {
496 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
497 		    result = sourceIllegal;
498 		    break;
499 		} else {
500 		    *target++ = UNI_REPLACEMENT_CHAR;
501 		}
502 	    } else {
503 		*target++ = ch;
504 	    }
505 	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
506 	    result = sourceIllegal;
507 	    *target++ = UNI_REPLACEMENT_CHAR;
508 	}
509     }
510     *sourceStart = source;
511     *targetStart = target;
512     return result;
513 }
514 
515 /* ---------------------------------------------------------------------
516 
517     Note A.
518     The fall-through switches in UTF-8 reading code save a
519     temp variable, some decrements & conditionals.  The switches
520     are equivalent to the following loop:
521 	{
522 	    int tmpBytesToRead = extraBytesToRead+1;
523 	    do {
524 		ch += *source++;
525 		--tmpBytesToRead;
526 		if (tmpBytesToRead) ch <<= 6;
527 	    } while (tmpBytesToRead > 0);
528 	}
529     In UTF-8 writing code, the switches on "bytesToWrite" are
530     similarly unrolled loops.
531 
532    --------------------------------------------------------------------- */
533