1 /* gutf8.c - Operations on UTF-8 strings.
2 *
3 * Copyright (C) 1999 Tom Tromey
4 * Copyright (C) 2000 Red Hat, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22 #include "config.h"
23
24 #include <stdlib.h>
25 #ifndef ANDROID_STUB
26 #ifdef HAVE_CODESET
27 #include <langinfo.h>
28 #endif
29 #endif
30 #include <string.h>
31
32 #include "glib.h"
33
34 #ifdef G_PLATFORM_WIN32
35 #include <stdio.h>
36 #define STRICT
37 #include <windows.h>
38 #undef STRICT
39 #endif
40
41 #ifndef ANDROID_STUB
42 #include "libcharset/libcharset.h"
43 #endif
44
45 #include "glibintl.h"
46 #include "galias.h"
47
/* Classify Char, the lead byte of a UTF-8 sequence.
 *
 * On return Len holds the total length of the sequence in bytes (1-6)
 * and Mask holds the bit mask that extracts the payload bits from the
 * lead byte.  For a byte that cannot start a sequence (a continuation
 * byte, 0xfe or 0xff) Len is set to -1 and Mask is left untouched.
 *
 * Char is evaluated several times and must be free of side effects.
 * All arguments are parenthesized and the body is wrapped in
 * do { } while (0), so the macro expands to exactly one statement and
 * is safe with compound argument expressions and in unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
  do                                                                          \
    {                                                                         \
      if ((Char) < 128)                                                       \
        {                                                                     \
          (Len) = 1;                                                          \
          (Mask) = 0x7f;                                                      \
        }                                                                     \
      else if (((Char) & 0xe0) == 0xc0)                                       \
        {                                                                     \
          (Len) = 2;                                                          \
          (Mask) = 0x1f;                                                      \
        }                                                                     \
      else if (((Char) & 0xf0) == 0xe0)                                       \
        {                                                                     \
          (Len) = 3;                                                          \
          (Mask) = 0x0f;                                                      \
        }                                                                     \
      else if (((Char) & 0xf8) == 0xf0)                                       \
        {                                                                     \
          (Len) = 4;                                                          \
          (Mask) = 0x07;                                                      \
        }                                                                     \
      else if (((Char) & 0xfc) == 0xf8)                                       \
        {                                                                     \
          (Len) = 5;                                                          \
          (Mask) = 0x03;                                                      \
        }                                                                     \
      else if (((Char) & 0xfe) == 0xfc)                                       \
        {                                                                     \
          (Len) = 6;                                                          \
          (Mask) = 0x01;                                                      \
        }                                                                     \
      else                                                                    \
        (Len) = -1;                                                           \
    }                                                                         \
  while (0)
81
/* Number of bytes (1-6) needed to encode the code point Char in UTF-8.
 * Char is evaluated more than once and must have no side effects.
 */
#define UTF8_LENGTH(Char)              \
  ((Char) >= 0x4000000 ? 6 :           \
   (Char) >= 0x200000  ? 5 :           \
   (Char) >= 0x10000   ? 4 :           \
   (Char) >= 0x800     ? 3 :           \
   (Char) >= 0x80      ? 2 : 1)
88
89
/* Decode the Len-byte UTF-8 sequence starting at Chars into Result.
 *
 * Mask selects the payload bits of the lead byte; Count is a scratch
 * loop counter supplied by the caller.  Each following byte must be a
 * continuation byte (10xxxxxx); if one is not, Result is set to -1 and
 * decoding stops.
 *
 * Wrapped in do { } while (0) so the assignment and the loop form one
 * statement and the macro is safe in unbraced if/else.  The inner
 * "break" leaves only the for loop.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  do                                                                          \
    {                                                                         \
      (Result) = (Chars)[0] & (Mask);                                         \
      for ((Count) = 1; (Count) < (Len); ++(Count))                           \
        {                                                                     \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                              \
            {                                                                 \
              (Result) = -1;                                                  \
              break;                                                          \
            }                                                                 \
          (Result) <<= 6;                                                     \
          (Result) |= ((Chars)[(Count)] & 0x3f);                              \
        }                                                                     \
    }                                                                         \
  while (0)
102
/* Non-zero when Char is acceptable: below 0x110000, not a surrogate
 * (0xD800-0xDFFF), not in 0xFDD0-0xFDEF, and not ending in 0xFFFE or
 * 0xFFFF.  Char is evaluated more than once.
 */
#define UNICODE_VALID(Char)                       \
  (!((Char) >= 0x110000 ||                        \
     (((Char) & 0xFFFFF800) == 0xD800) ||         \
     ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) ||    \
     ((Char) & 0xFFFE) == 0xFFFE))
108
109
110 static const gchar utf8_skip_data[256] = {
111 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
112 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
113 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
114 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
115 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
116 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
117 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
118 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
119 };
120
121 const gchar * const g_utf8_skip = utf8_skip_data;
122
123 /**
124 * g_utf8_find_prev_char:
125 * @str: pointer to the beginning of a UTF-8 encoded string
126 * @p: pointer to some position within @str
127 *
128 * Given a position @p with a UTF-8 encoded string @str, find the start
129 * of the previous UTF-8 character starting before @p. Returns %NULL if no
130 * UTF-8 characters are present in @str before @p.
131 *
132 * @p does not have to be at the beginning of a UTF-8 character. No check
133 * is made to see if the character found is actually valid other than
134 * it starts with an appropriate byte.
135 *
136 * Return value: a pointer to the found character or %NULL.
137 **/
138 gchar *
g_utf8_find_prev_char(const char * str,const char * p)139 g_utf8_find_prev_char (const char *str,
140 const char *p)
141 {
142 for (--p; p >= str; --p)
143 {
144 if ((*p & 0xc0) != 0x80)
145 return (gchar *)p;
146 }
147 return NULL;
148 }
149
150 /**
151 * g_utf8_find_next_char:
152 * @p: a pointer to a position within a UTF-8 encoded string
153 * @end: a pointer to the byte following the end of the string,
154 * or %NULL to indicate that the string is nul-terminated.
155 *
156 * Finds the start of the next UTF-8 character in the string after @p.
157 *
158 * @p does not have to be at the beginning of a UTF-8 character. No check
159 * is made to see if the character found is actually valid other than
160 * it starts with an appropriate byte.
161 *
162 * Return value: a pointer to the found character or %NULL
163 **/
164 gchar *
g_utf8_find_next_char(const gchar * p,const gchar * end)165 g_utf8_find_next_char (const gchar *p,
166 const gchar *end)
167 {
168 if (*p)
169 {
170 if (end)
171 for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
172 ;
173 else
174 for (++p; (*p & 0xc0) == 0x80; ++p)
175 ;
176 }
177 return (p == end) ? NULL : (gchar *)p;
178 }
179
180 /**
181 * g_utf8_prev_char:
182 * @p: a pointer to a position within a UTF-8 encoded string
183 *
184 * Finds the previous UTF-8 character in the string before @p.
185 *
186 * @p does not have to be at the beginning of a UTF-8 character. No check
187 * is made to see if the character found is actually valid other than
188 * it starts with an appropriate byte. If @p might be the first
189 * character of the string, you must use g_utf8_find_prev_char() instead.
190 *
191 * Return value: a pointer to the found character.
192 **/
193 gchar *
g_utf8_prev_char(const gchar * p)194 g_utf8_prev_char (const gchar *p)
195 {
196 while (TRUE)
197 {
198 p--;
199 if ((*p & 0xc0) != 0x80)
200 return (gchar *)p;
201 }
202 }
203
204 /**
205 * g_utf8_strlen:
206 * @p: pointer to the start of a UTF-8 encoded string.
207 * @max: the maximum number of bytes to examine. If @max
208 * is less than 0, then the string is assumed to be
209 * nul-terminated. If @max is 0, @p will not be examined and
210 * may be %NULL.
211 *
212 * Returns the length of the string in characters.
213 *
214 * Return value: the length of the string in characters
215 **/
216 glong
g_utf8_strlen(const gchar * p,gssize max)217 g_utf8_strlen (const gchar *p,
218 gssize max)
219 {
220 glong len = 0;
221 const gchar *start = p;
222 g_return_val_if_fail (p != NULL || max == 0, 0);
223
224 if (max < 0)
225 {
226 while (*p)
227 {
228 p = g_utf8_next_char (p);
229 ++len;
230 }
231 }
232 else
233 {
234 if (max == 0 || !*p)
235 return 0;
236
237 p = g_utf8_next_char (p);
238
239 while (p - start < max && *p)
240 {
241 ++len;
242 p = g_utf8_next_char (p);
243 }
244
245 /* only do the last len increment if we got a complete
246 * char (don't count partial chars)
247 */
248 if (p - start <= max)
249 ++len;
250 }
251
252 return len;
253 }
254
255 /**
256 * g_utf8_get_char:
257 * @p: a pointer to Unicode character encoded as UTF-8
258 *
259 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
260 * If @p does not point to a valid UTF-8 encoded character, results are
261 * undefined. If you are not sure that the bytes are complete
262 * valid Unicode characters, you should use g_utf8_get_char_validated()
263 * instead.
264 *
265 * Return value: the resulting character
266 **/
267 gunichar
g_utf8_get_char(const gchar * p)268 g_utf8_get_char (const gchar *p)
269 {
270 int i, mask = 0, len;
271 gunichar result;
272 unsigned char c = (unsigned char) *p;
273
274 UTF8_COMPUTE (c, mask, len);
275 if (len == -1)
276 return (gunichar)-1;
277 UTF8_GET (result, p, i, mask, len);
278
279 return result;
280 }
281
282 /**
283 * g_utf8_offset_to_pointer:
284 * @str: a UTF-8 encoded string
285 * @offset: a character offset within @str
286 *
287 * Converts from an integer character offset to a pointer to a position
288 * within the string.
289 *
290 * Since 2.10, this function allows to pass a negative @offset to
291 * step backwards. It is usually worth stepping backwards from the end
292 * instead of forwards if @offset is in the last fourth of the string,
293 * since moving forward is about 3 times faster than moving backward.
294 *
295 * <note><para>
296 * This function doesn't abort when reaching the end of @str. Therefore
297 * you should be sure that @offset is within string boundaries before
298 * calling that function. Call g_utf8_strlen() when unsure.
299 *
300 * This limitation exists as this function is called frequently during
301 * text rendering and therefore has to be as fast as possible.
302 * </para></note>
303 *
304 * Return value: the resulting pointer
305 **/
306 gchar *
g_utf8_offset_to_pointer(const gchar * str,glong offset)307 g_utf8_offset_to_pointer (const gchar *str,
308 glong offset)
309 {
310 const gchar *s = str;
311
312 if (offset > 0)
313 while (offset--)
314 s = g_utf8_next_char (s);
315 else
316 {
317 const char *s1;
318
319 /* This nice technique for fast backwards stepping
320 * through a UTF-8 string was dubbed "stutter stepping"
321 * by its inventor, Larry Ewing.
322 */
323 while (offset)
324 {
325 s1 = s;
326 s += offset;
327 while ((*s & 0xc0) == 0x80)
328 s--;
329
330 offset += g_utf8_pointer_to_offset (s, s1);
331 }
332 }
333
334 return (gchar *)s;
335 }
336
337 /**
338 * g_utf8_pointer_to_offset:
339 * @str: a UTF-8 encoded string
340 * @pos: a pointer to a position within @str
341 *
342 * Converts from a pointer to position within a string to a integer
343 * character offset.
344 *
345 * Since 2.10, this function allows @pos to be before @str, and returns
346 * a negative offset in this case.
347 *
348 * Return value: the resulting character offset
349 **/
350 glong
g_utf8_pointer_to_offset(const gchar * str,const gchar * pos)351 g_utf8_pointer_to_offset (const gchar *str,
352 const gchar *pos)
353 {
354 const gchar *s = str;
355 glong offset = 0;
356
357 if (pos < str)
358 offset = - g_utf8_pointer_to_offset (pos, str);
359 else
360 while (s < pos)
361 {
362 s = g_utf8_next_char (s);
363 offset++;
364 }
365
366 return offset;
367 }
368
369
370 /**
371 * g_utf8_strncpy:
372 * @dest: buffer to fill with characters from @src
373 * @src: UTF-8 encoded string
374 * @n: character count
375 *
376 * Like the standard C strncpy() function, but
377 * copies a given number of characters instead of a given number of
378 * bytes. The @src string must be valid UTF-8 encoded text.
379 * (Use g_utf8_validate() on all text before trying to use UTF-8
380 * utility functions with it.)
381 *
382 * Return value: @dest
383 **/
384 gchar *
g_utf8_strncpy(gchar * dest,const gchar * src,gsize n)385 g_utf8_strncpy (gchar *dest,
386 const gchar *src,
387 gsize n)
388 {
389 const gchar *s = src;
390 while (n && *s)
391 {
392 s = g_utf8_next_char(s);
393 n--;
394 }
395 strncpy(dest, src, s - src);
396 dest[s - src] = 0;
397 return dest;
398 }
399
400 G_LOCK_DEFINE_STATIC (aliases);
401
402 static GHashTable *
get_alias_hash(void)403 get_alias_hash (void)
404 {
405 static GHashTable *alias_hash = NULL;
406 const char *aliases;
407
408 G_LOCK (aliases);
409
410 if (!alias_hash)
411 {
412 alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
413
414 aliases = _g_locale_get_charset_aliases ();
415 while (*aliases != '\0')
416 {
417 const char *canonical;
418 const char *alias;
419 const char **alias_array;
420 int count = 0;
421
422 alias = aliases;
423 aliases += strlen (aliases) + 1;
424 canonical = aliases;
425 aliases += strlen (aliases) + 1;
426
427 alias_array = g_hash_table_lookup (alias_hash, canonical);
428 if (alias_array)
429 {
430 while (alias_array[count])
431 count++;
432 }
433
434 alias_array = g_renew (const char *, alias_array, count + 2);
435 alias_array[count] = alias;
436 alias_array[count + 1] = NULL;
437
438 g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
439 }
440 }
441
442 G_UNLOCK (aliases);
443
444 return alias_hash;
445 }
446
447 /* As an abuse of the alias table, the following routines gets
448 * the charsets that are aliases for the canonical name.
449 */
450 #ifndef ANDROID_STUB
451 G_GNUC_INTERNAL const char **
_g_charset_get_aliases(const char * canonical_name)452 _g_charset_get_aliases (const char *canonical_name)
453 {
454 GHashTable *alias_hash = get_alias_hash ();
455
456 return g_hash_table_lookup (alias_hash, canonical_name);
457 }
458 #endif
459
460 static gboolean
g_utf8_get_charset_internal(const char * raw_data,const char ** a)461 g_utf8_get_charset_internal (const char *raw_data,
462 const char **a)
463 {
464 const char *charset = getenv("CHARSET");
465
466 if (charset && *charset)
467 {
468 *a = charset;
469
470 if (charset && strstr (charset, "UTF-8"))
471 return TRUE;
472 else
473 return FALSE;
474 }
475
476 /* The libcharset code tries to be thread-safe without
477 * a lock, but has a memory leak and a missing memory
478 * barrier, so we lock for it
479 */
480 #ifndef ANDROID_STUB
481 G_LOCK (aliases);
482 charset = _g_locale_charset_unalias (raw_data);
483 G_UNLOCK (aliases);
484
485 if (charset && *charset)
486 {
487 *a = charset;
488
489 if (charset && strstr (charset, "UTF-8"))
490 return TRUE;
491 else
492 return FALSE;
493 }
494 #endif
495
496 /* Assume this for compatibility at present. */
497 *a = "US-ASCII";
498
499 return FALSE;
500 }
501
502 typedef struct _GCharsetCache GCharsetCache;
503
504 struct _GCharsetCache {
505 gboolean is_utf8;
506 gchar *raw;
507 gchar *charset;
508 };
509
510 static void
charset_cache_free(gpointer data)511 charset_cache_free (gpointer data)
512 {
513 GCharsetCache *cache = data;
514 g_free (cache->raw);
515 g_free (cache->charset);
516 g_free (cache);
517 }
518
519 /**
520 * g_get_charset:
521 * @charset: return location for character set name
522 *
523 * Obtains the character set for the <link linkend="setlocale">current
524 * locale</link>; you might use this character set as an argument to
525 * g_convert(), to convert from the current locale's encoding to some
526 * other encoding. (Frequently g_locale_to_utf8() and g_locale_from_utf8()
527 * are nice shortcuts, though.)
528 *
529 * On Windows the character set returned by this function is the
530 * so-called system default ANSI code-page. That is the character set
531 * used by the "narrow" versions of C library and Win32 functions that
532 * handle file names. It might be different from the character set
533 * used by the C library's current locale.
534 *
535 * The return value is %TRUE if the locale's encoding is UTF-8, in that
536 * case you can perhaps avoid calling g_convert().
537 *
538 * The string returned in @charset is not allocated, and should not be
539 * freed.
540 *
541 * Return value: %TRUE if the returned charset is UTF-8
542 **/
543 gboolean
g_get_charset(G_CONST_RETURN char ** charset)544 g_get_charset (G_CONST_RETURN char **charset)
545 {
546 static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
547 GCharsetCache *cache = g_static_private_get (&cache_private);
548 const gchar *raw;
549
550 if (!cache)
551 {
552 cache = g_new0 (GCharsetCache, 1);
553 g_static_private_set (&cache_private, cache, charset_cache_free);
554 }
555
556 #ifndef ANDROID_STUB
557 raw = _g_locale_charset_raw ();
558
559 if (!(cache->raw && strcmp (cache->raw, raw) == 0))
560 {
561 const gchar *new_charset;
562
563 g_free (cache->raw);
564 g_free (cache->charset);
565 cache->raw = g_strdup (raw);
566 cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
567 cache->charset = g_strdup (new_charset);
568 }
569 #else
570 cache->charset = g_strdup("UTF-8");
571 cache->is_utf8 = TRUE;
572 #endif
573 if (charset)
574 *charset = cache->charset;
575
576 return cache->is_utf8;
577 }
578
579 /* unicode_strchr */
580
581 /**
582 * g_unichar_to_utf8:
583 * @c: a Unicode character code
584 * @outbuf: output buffer, must have at least 6 bytes of space.
585 * If %NULL, the length will be computed and returned
586 * and nothing will be written to @outbuf.
587 *
588 * Converts a single character to UTF-8.
589 *
590 * Return value: number of bytes written
591 **/
592 int
g_unichar_to_utf8(gunichar c,gchar * outbuf)593 g_unichar_to_utf8 (gunichar c,
594 gchar *outbuf)
595 {
596 /* If this gets modified, also update the copy in g_string_insert_unichar() */
597 guint len = 0;
598 int first;
599 int i;
600
601 if (c < 0x80)
602 {
603 first = 0;
604 len = 1;
605 }
606 else if (c < 0x800)
607 {
608 first = 0xc0;
609 len = 2;
610 }
611 else if (c < 0x10000)
612 {
613 first = 0xe0;
614 len = 3;
615 }
616 else if (c < 0x200000)
617 {
618 first = 0xf0;
619 len = 4;
620 }
621 else if (c < 0x4000000)
622 {
623 first = 0xf8;
624 len = 5;
625 }
626 else
627 {
628 first = 0xfc;
629 len = 6;
630 }
631
632 if (outbuf)
633 {
634 for (i = len - 1; i > 0; --i)
635 {
636 outbuf[i] = (c & 0x3f) | 0x80;
637 c >>= 6;
638 }
639 outbuf[0] = c | first;
640 }
641
642 return len;
643 }
644
645 /**
646 * g_utf8_strchr:
647 * @p: a nul-terminated UTF-8 encoded string
648 * @len: the maximum length of @p
649 * @c: a Unicode character
650 *
651 * Finds the leftmost occurrence of the given Unicode character
652 * in a UTF-8 encoded string, while limiting the search to @len bytes.
653 * If @len is -1, allow unbounded search.
654 *
655 * Return value: %NULL if the string does not contain the character,
656 * otherwise, a pointer to the start of the leftmost occurrence of
657 * the character in the string.
658 **/
659 gchar *
g_utf8_strchr(const char * p,gssize len,gunichar c)660 g_utf8_strchr (const char *p,
661 gssize len,
662 gunichar c)
663 {
664 gchar ch[10];
665
666 gint charlen = g_unichar_to_utf8 (c, ch);
667 ch[charlen] = '\0';
668
669 return g_strstr_len (p, len, ch);
670 }
671
672
673 /**
674 * g_utf8_strrchr:
675 * @p: a nul-terminated UTF-8 encoded string
676 * @len: the maximum length of @p
677 * @c: a Unicode character
678 *
679 * Find the rightmost occurrence of the given Unicode character
680 * in a UTF-8 encoded string, while limiting the search to @len bytes.
681 * If @len is -1, allow unbounded search.
682 *
683 * Return value: %NULL if the string does not contain the character,
684 * otherwise, a pointer to the start of the rightmost occurrence of the
685 * character in the string.
686 **/
687 gchar *
g_utf8_strrchr(const char * p,gssize len,gunichar c)688 g_utf8_strrchr (const char *p,
689 gssize len,
690 gunichar c)
691 {
692 gchar ch[10];
693
694 gint charlen = g_unichar_to_utf8 (c, ch);
695 ch[charlen] = '\0';
696
697 return g_strrstr_len (p, len, ch);
698 }
699
700
701 /* Like g_utf8_get_char, but take a maximum length
702 * and return (gunichar)-2 on incomplete trailing character
703 */
704 static inline gunichar
g_utf8_get_char_extended(const gchar * p,gssize max_len)705 g_utf8_get_char_extended (const gchar *p,
706 gssize max_len)
707 {
708 guint i, len;
709 gunichar wc = (guchar) *p;
710
711 if (wc < 0x80)
712 {
713 return wc;
714 }
715 else if (wc < 0xc0)
716 {
717 return (gunichar)-1;
718 }
719 else if (wc < 0xe0)
720 {
721 len = 2;
722 wc &= 0x1f;
723 }
724 else if (wc < 0xf0)
725 {
726 len = 3;
727 wc &= 0x0f;
728 }
729 else if (wc < 0xf8)
730 {
731 len = 4;
732 wc &= 0x07;
733 }
734 else if (wc < 0xfc)
735 {
736 len = 5;
737 wc &= 0x03;
738 }
739 else if (wc < 0xfe)
740 {
741 len = 6;
742 wc &= 0x01;
743 }
744 else
745 {
746 return (gunichar)-1;
747 }
748
749 if (max_len >= 0 && len > max_len)
750 {
751 for (i = 1; i < max_len; i++)
752 {
753 if ((((guchar *)p)[i] & 0xc0) != 0x80)
754 return (gunichar)-1;
755 }
756 return (gunichar)-2;
757 }
758
759 for (i = 1; i < len; ++i)
760 {
761 gunichar ch = ((guchar *)p)[i];
762
763 if ((ch & 0xc0) != 0x80)
764 {
765 if (ch)
766 return (gunichar)-1;
767 else
768 return (gunichar)-2;
769 }
770
771 wc <<= 6;
772 wc |= (ch & 0x3f);
773 }
774
775 if (UTF8_LENGTH(wc) != len)
776 return (gunichar)-1;
777
778 return wc;
779 }
780
781 /**
782 * g_utf8_get_char_validated:
783 * @p: a pointer to Unicode character encoded as UTF-8
784 * @max_len: the maximum number of bytes to read, or -1, for no maximum or
785 * if @p is nul-terminated
786 *
787 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
788 * This function checks for incomplete characters, for invalid characters
789 * such as characters that are out of the range of Unicode, and for
790 * overlong encodings of valid characters.
791 *
792 * Return value: the resulting character. If @p points to a partial
793 * sequence at the end of a string that could begin a valid
794 * character (or if @max_len is zero), returns (gunichar)-2;
795 * otherwise, if @p does not point to a valid UTF-8 encoded
796 * Unicode character, returns (gunichar)-1.
797 **/
798 gunichar
g_utf8_get_char_validated(const gchar * p,gssize max_len)799 g_utf8_get_char_validated (const gchar *p,
800 gssize max_len)
801 {
802 gunichar result;
803
804 if (max_len == 0)
805 return (gunichar)-2;
806
807 result = g_utf8_get_char_extended (p, max_len);
808
809 if (result & 0x80000000)
810 return result;
811 else if (!UNICODE_VALID (result))
812 return (gunichar)-1;
813 else
814 return result;
815 }
816
817 /**
818 * g_utf8_to_ucs4_fast:
819 * @str: a UTF-8 encoded string
820 * @len: the maximum length of @str to use, in bytes. If @len < 0,
821 * then the string is nul-terminated.
822 * @items_written: location to store the number of characters in the
823 * result, or %NULL.
824 *
825 * Convert a string from UTF-8 to a 32-bit fixed width
826 * representation as UCS-4, assuming valid UTF-8 input.
827 * This function is roughly twice as fast as g_utf8_to_ucs4()
828 * but does no error checking on the input.
829 *
830 * Return value: a pointer to a newly allocated UCS-4 string.
831 * This value must be freed with g_free().
832 **/
833 gunichar *
g_utf8_to_ucs4_fast(const gchar * str,glong len,glong * items_written)834 g_utf8_to_ucs4_fast (const gchar *str,
835 glong len,
836 glong *items_written)
837 {
838 gint j, charlen;
839 gunichar *result;
840 gint n_chars, i;
841 const gchar *p;
842
843 g_return_val_if_fail (str != NULL, NULL);
844
845 p = str;
846 n_chars = 0;
847 if (len < 0)
848 {
849 while (*p)
850 {
851 p = g_utf8_next_char (p);
852 ++n_chars;
853 }
854 }
855 else
856 {
857 while (p < str + len && *p)
858 {
859 p = g_utf8_next_char (p);
860 ++n_chars;
861 }
862 }
863
864 result = g_new (gunichar, n_chars + 1);
865
866 p = str;
867 for (i=0; i < n_chars; i++)
868 {
869 gunichar wc = ((unsigned char *)p)[0];
870
871 if (wc < 0x80)
872 {
873 result[i] = wc;
874 p++;
875 }
876 else
877 {
878 if (wc < 0xe0)
879 {
880 charlen = 2;
881 wc &= 0x1f;
882 }
883 else if (wc < 0xf0)
884 {
885 charlen = 3;
886 wc &= 0x0f;
887 }
888 else if (wc < 0xf8)
889 {
890 charlen = 4;
891 wc &= 0x07;
892 }
893 else if (wc < 0xfc)
894 {
895 charlen = 5;
896 wc &= 0x03;
897 }
898 else
899 {
900 charlen = 6;
901 wc &= 0x01;
902 }
903
904 for (j = 1; j < charlen; j++)
905 {
906 wc <<= 6;
907 wc |= ((unsigned char *)p)[j] & 0x3f;
908 }
909
910 result[i] = wc;
911 p += charlen;
912 }
913 }
914 result[i] = 0;
915
916 if (items_written)
917 *items_written = i;
918
919 return result;
920 }
921
922 /**
923 * g_utf8_to_ucs4:
924 * @str: a UTF-8 encoded string
925 * @len: the maximum length of @str to use, in bytes. If @len < 0,
926 * then the string is nul-terminated.
927 * @items_read: location to store number of bytes read, or %NULL.
928 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
929 * returned in case @str contains a trailing partial
930 * character. If an error occurs then the index of the
931 * invalid input is stored here.
932 * @items_written: location to store number of characters written or %NULL.
933 * The value here stored does not include the trailing 0
934 * character.
935 * @error: location to store the error occuring, or %NULL to ignore
936 * errors. Any of the errors in #GConvertError other than
937 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
938 *
939 * Convert a string from UTF-8 to a 32-bit fixed width
940 * representation as UCS-4. A trailing 0 will be added to the
941 * string after the converted text.
942 *
943 * Return value: a pointer to a newly allocated UCS-4 string.
944 * This value must be freed with g_free(). If an
945 * error occurs, %NULL will be returned and
946 * @error set.
947 **/
948 gunichar *
g_utf8_to_ucs4(const gchar * str,glong len,glong * items_read,glong * items_written,GError ** error)949 g_utf8_to_ucs4 (const gchar *str,
950 glong len,
951 glong *items_read,
952 glong *items_written,
953 GError **error)
954 {
955 gunichar *result = NULL;
956 gint n_chars, i;
957 const gchar *in;
958
959 in = str;
960 n_chars = 0;
961 while ((len < 0 || str + len - in > 0) && *in)
962 {
963 gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
964 if (wc & 0x80000000)
965 {
966 if (wc == (gunichar)-2)
967 {
968 if (items_read)
969 break;
970 else
971 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
972 _("Partial character sequence at end of input"));
973 }
974 else
975 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
976 _("Invalid byte sequence in conversion input"));
977
978 goto err_out;
979 }
980
981 n_chars++;
982
983 in = g_utf8_next_char (in);
984 }
985
986 result = g_new (gunichar, n_chars + 1);
987
988 in = str;
989 for (i=0; i < n_chars; i++)
990 {
991 result[i] = g_utf8_get_char (in);
992 in = g_utf8_next_char (in);
993 }
994 result[i] = 0;
995
996 if (items_written)
997 *items_written = n_chars;
998
999 err_out:
1000 if (items_read)
1001 *items_read = in - str;
1002
1003 return result;
1004 }
1005
1006 /**
1007 * g_ucs4_to_utf8:
1008 * @str: a UCS-4 encoded string
1009 * @len: the maximum length (number of characters) of @str to use.
1010 * If @len < 0, then the string is nul-terminated.
1011 * @items_read: location to store number of characters read, or %NULL.
1012 * @items_written: location to store number of bytes written or %NULL.
1013 * The value here stored does not include the trailing 0
1014 * byte.
1015 * @error: location to store the error occuring, or %NULL to ignore
1016 * errors. Any of the errors in #GConvertError other than
1017 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
1018 *
1019 * Convert a string from a 32-bit fixed width representation as UCS-4.
1020 * to UTF-8. The result will be terminated with a 0 byte.
1021 *
1022 * Return value: a pointer to a newly allocated UTF-8 string.
1023 * This value must be freed with g_free(). If an
1024 * error occurs, %NULL will be returned and
1025 * @error set. In that case, @items_read will be
1026 * set to the position of the first invalid input
1027 * character.
1028 **/
1029 gchar *
g_ucs4_to_utf8(const gunichar * str,glong len,glong * items_read,glong * items_written,GError ** error)1030 g_ucs4_to_utf8 (const gunichar *str,
1031 glong len,
1032 glong *items_read,
1033 glong *items_written,
1034 GError **error)
1035 {
1036 gint result_length;
1037 gchar *result = NULL;
1038 gchar *p;
1039 gint i;
1040
1041 result_length = 0;
1042 for (i = 0; len < 0 || i < len ; i++)
1043 {
1044 if (!str[i])
1045 break;
1046
1047 if (str[i] >= 0x80000000)
1048 {
1049 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1050 _("Character out of range for UTF-8"));
1051 goto err_out;
1052 }
1053
1054 result_length += UTF8_LENGTH (str[i]);
1055 }
1056
1057 result = g_malloc (result_length + 1);
1058 p = result;
1059
1060 i = 0;
1061 while (p < result + result_length)
1062 p += g_unichar_to_utf8 (str[i++], p);
1063
1064 *p = '\0';
1065
1066 if (items_written)
1067 *items_written = p - result;
1068
1069 err_out:
1070 if (items_read)
1071 *items_read = i;
1072
1073 return result;
1074 }
1075
/* Combine a UTF-16 high surrogate h and low surrogate l into the code
 * point they encode.
 */
#define SURROGATE_VALUE(h,l) \
  (0x10000 + ((h) - 0xd800) * 0x400 + ((l) - 0xdc00))
1077
1078 /**
1079 * g_utf16_to_utf8:
1080 * @str: a UTF-16 encoded string
1081 * @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
1082 * If @len < 0, then the string is nul-terminated.
1083 * @items_read: location to store number of words read, or %NULL.
1084 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1085 * returned in case @str contains a trailing partial
1086 * character. If an error occurs then the index of the
1087 * invalid input is stored here.
1088 * @items_written: location to store number of bytes written, or %NULL.
1089 * The value stored here does not include the trailing
1090 * 0 byte.
1091 * @error: location to store the error occuring, or %NULL to ignore
1092 * errors. Any of the errors in #GConvertError other than
1093 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
1094 *
1095 * Convert a string from UTF-16 to UTF-8. The result will be
1096 * terminated with a 0 byte.
1097 *
1098 * Note that the input is expected to be already in native endianness,
1099 * an initial byte-order-mark character is not handled specially.
1100 * g_convert() can be used to convert a byte buffer of UTF-16 data of
1101 * ambiguous endianess.
1102 *
1103 * Return value: a pointer to a newly allocated UTF-8 string.
1104 * This value must be freed with g_free(). If an
1105 * error occurs, %NULL will be returned and
1106 * @error set.
1107 **/
1108 gchar *
g_utf16_to_utf8(const gunichar2 * str,glong len,glong * items_read,glong * items_written,GError ** error)1109 g_utf16_to_utf8 (const gunichar2 *str,
1110 glong len,
1111 glong *items_read,
1112 glong *items_written,
1113 GError **error)
1114 {
1115 /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
1116 * are marked.
1117 */
1118 const gunichar2 *in;
1119 gchar *out;
1120 gchar *result = NULL;
1121 gint n_bytes;
1122 gunichar high_surrogate;
1123
1124 g_return_val_if_fail (str != NULL, NULL);
1125
1126 n_bytes = 0;
1127 in = str;
1128 high_surrogate = 0;
1129 while ((len < 0 || in - str < len) && *in)
1130 {
1131 gunichar2 c = *in;
1132 gunichar wc;
1133
1134 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1135 {
1136 if (high_surrogate)
1137 {
1138 wc = SURROGATE_VALUE (high_surrogate, c);
1139 high_surrogate = 0;
1140 }
1141 else
1142 {
1143 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1144 _("Invalid sequence in conversion input"));
1145 goto err_out;
1146 }
1147 }
1148 else
1149 {
1150 if (high_surrogate)
1151 {
1152 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1153 _("Invalid sequence in conversion input"));
1154 goto err_out;
1155 }
1156
1157 if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1158 {
1159 high_surrogate = c;
1160 goto next1;
1161 }
1162 else
1163 wc = c;
1164 }
1165
1166 /********** DIFFERENT for UTF8/UCS4 **********/
1167 n_bytes += UTF8_LENGTH (wc);
1168
1169 next1:
1170 in++;
1171 }
1172
1173 if (high_surrogate && !items_read)
1174 {
1175 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1176 _("Partial character sequence at end of input"));
1177 goto err_out;
1178 }
1179
1180 /* At this point, everything is valid, and we just need to convert
1181 */
1182 /********** DIFFERENT for UTF8/UCS4 **********/
1183 result = g_malloc (n_bytes + 1);
1184
1185 high_surrogate = 0;
1186 out = result;
1187 in = str;
1188 while (out < result + n_bytes)
1189 {
1190 gunichar2 c = *in;
1191 gunichar wc;
1192
1193 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1194 {
1195 wc = SURROGATE_VALUE (high_surrogate, c);
1196 high_surrogate = 0;
1197 }
1198 else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1199 {
1200 high_surrogate = c;
1201 goto next2;
1202 }
1203 else
1204 wc = c;
1205
1206 /********** DIFFERENT for UTF8/UCS4 **********/
1207 out += g_unichar_to_utf8 (wc, out);
1208
1209 next2:
1210 in++;
1211 }
1212
1213 /********** DIFFERENT for UTF8/UCS4 **********/
1214 *out = '\0';
1215
1216 if (items_written)
1217 /********** DIFFERENT for UTF8/UCS4 **********/
1218 *items_written = out - result;
1219
1220 err_out:
1221 if (items_read)
1222 *items_read = in - str;
1223
1224 return result;
1225 }
1226
1227 /**
1228 * g_utf16_to_ucs4:
1229 * @str: a UTF-16 encoded string
1230 * @len: the maximum length (number of <type>gunichar2</type>) of @str to use.
1231 * If @len < 0, then the string is nul-terminated.
1232 * @items_read: location to store number of words read, or %NULL.
1233 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1234 * returned in case @str contains a trailing partial
1235 * character. If an error occurs then the index of the
1236 * invalid input is stored here.
1237 * @items_written: location to store number of characters written, or %NULL.
1238 * The value stored here does not include the trailing
1239 * 0 character.
1240 * @error: location to store the error occuring, or %NULL to ignore
1241 * errors. Any of the errors in #GConvertError other than
1242 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
1243 *
1244 * Convert a string from UTF-16 to UCS-4. The result will be
1245 * nul-terminated.
1246 *
1247 * Return value: a pointer to a newly allocated UCS-4 string.
1248 * This value must be freed with g_free(). If an
1249 * error occurs, %NULL will be returned and
1250 * @error set.
1251 **/
1252 gunichar *
g_utf16_to_ucs4(const gunichar2 * str,glong len,glong * items_read,glong * items_written,GError ** error)1253 g_utf16_to_ucs4 (const gunichar2 *str,
1254 glong len,
1255 glong *items_read,
1256 glong *items_written,
1257 GError **error)
1258 {
1259 const gunichar2 *in;
1260 gchar *out;
1261 gchar *result = NULL;
1262 gint n_bytes;
1263 gunichar high_surrogate;
1264
1265 g_return_val_if_fail (str != NULL, NULL);
1266
1267 n_bytes = 0;
1268 in = str;
1269 high_surrogate = 0;
1270 while ((len < 0 || in - str < len) && *in)
1271 {
1272 gunichar2 c = *in;
1273 gunichar wc;
1274
1275 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1276 {
1277 if (high_surrogate)
1278 {
1279 wc = SURROGATE_VALUE (high_surrogate, c);
1280 high_surrogate = 0;
1281 }
1282 else
1283 {
1284 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1285 _("Invalid sequence in conversion input"));
1286 goto err_out;
1287 }
1288 }
1289 else
1290 {
1291 if (high_surrogate)
1292 {
1293 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1294 _("Invalid sequence in conversion input"));
1295 goto err_out;
1296 }
1297
1298 if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1299 {
1300 high_surrogate = c;
1301 goto next1;
1302 }
1303 else
1304 wc = c;
1305 }
1306
1307 /********** DIFFERENT for UTF8/UCS4 **********/
1308 n_bytes += sizeof (gunichar);
1309
1310 next1:
1311 in++;
1312 }
1313
1314 if (high_surrogate && !items_read)
1315 {
1316 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1317 _("Partial character sequence at end of input"));
1318 goto err_out;
1319 }
1320
1321 /* At this point, everything is valid, and we just need to convert
1322 */
1323 /********** DIFFERENT for UTF8/UCS4 **********/
1324 result = g_malloc (n_bytes + 4);
1325
1326 high_surrogate = 0;
1327 out = result;
1328 in = str;
1329 while (out < result + n_bytes)
1330 {
1331 gunichar2 c = *in;
1332 gunichar wc;
1333
1334 if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1335 {
1336 wc = SURROGATE_VALUE (high_surrogate, c);
1337 high_surrogate = 0;
1338 }
1339 else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1340 {
1341 high_surrogate = c;
1342 goto next2;
1343 }
1344 else
1345 wc = c;
1346
1347 /********** DIFFERENT for UTF8/UCS4 **********/
1348 *(gunichar *)out = wc;
1349 out += sizeof (gunichar);
1350
1351 next2:
1352 in++;
1353 }
1354
1355 /********** DIFFERENT for UTF8/UCS4 **********/
1356 *(gunichar *)out = 0;
1357
1358 if (items_written)
1359 /********** DIFFERENT for UTF8/UCS4 **********/
1360 *items_written = (out - result) / sizeof (gunichar);
1361
1362 err_out:
1363 if (items_read)
1364 *items_read = in - str;
1365
1366 return (gunichar *)result;
1367 }
1368
1369 /**
1370 * g_utf8_to_utf16:
1371 * @str: a UTF-8 encoded string
1372 * @len: the maximum length (number of characters) of @str to use.
1373 * If @len < 0, then the string is nul-terminated.
1374 * @items_read: location to store number of bytes read, or %NULL.
1375 * If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1376 * returned in case @str contains a trailing partial
1377 * character. If an error occurs then the index of the
1378 * invalid input is stored here.
1379 * @items_written: location to store number of <type>gunichar2</type> written,
1380 * or %NULL.
1381 * The value stored here does not include the trailing 0.
1382 * @error: location to store the error occuring, or %NULL to ignore
1383 * errors. Any of the errors in #GConvertError other than
1384 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
1385 *
1386 * Convert a string from UTF-8 to UTF-16. A 0 character will be
1387 * added to the result after the converted text.
1388 *
1389 * Return value: a pointer to a newly allocated UTF-16 string.
1390 * This value must be freed with g_free(). If an
1391 * error occurs, %NULL will be returned and
1392 * @error set.
1393 **/
1394 gunichar2 *
g_utf8_to_utf16(const gchar * str,glong len,glong * items_read,glong * items_written,GError ** error)1395 g_utf8_to_utf16 (const gchar *str,
1396 glong len,
1397 glong *items_read,
1398 glong *items_written,
1399 GError **error)
1400 {
1401 gunichar2 *result = NULL;
1402 gint n16;
1403 const gchar *in;
1404 gint i;
1405
1406 g_return_val_if_fail (str != NULL, NULL);
1407
1408 in = str;
1409 n16 = 0;
1410 while ((len < 0 || str + len - in > 0) && *in)
1411 {
1412 gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);
1413 if (wc & 0x80000000)
1414 {
1415 if (wc == (gunichar)-2)
1416 {
1417 if (items_read)
1418 break;
1419 else
1420 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1421 _("Partial character sequence at end of input"));
1422 }
1423 else
1424 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1425 _("Invalid byte sequence in conversion input"));
1426
1427 goto err_out;
1428 }
1429
1430 if (wc < 0xd800)
1431 n16 += 1;
1432 else if (wc < 0xe000)
1433 {
1434 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1435 _("Invalid sequence in conversion input"));
1436
1437 goto err_out;
1438 }
1439 else if (wc < 0x10000)
1440 n16 += 1;
1441 else if (wc < 0x110000)
1442 n16 += 2;
1443 else
1444 {
1445 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1446 _("Character out of range for UTF-16"));
1447
1448 goto err_out;
1449 }
1450
1451 in = g_utf8_next_char (in);
1452 }
1453
1454 result = g_new (gunichar2, n16 + 1);
1455
1456 in = str;
1457 for (i = 0; i < n16;)
1458 {
1459 gunichar wc = g_utf8_get_char (in);
1460
1461 if (wc < 0x10000)
1462 {
1463 result[i++] = wc;
1464 }
1465 else
1466 {
1467 result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1468 result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1469 }
1470
1471 in = g_utf8_next_char (in);
1472 }
1473
1474 result[i] = 0;
1475
1476 if (items_written)
1477 *items_written = n16;
1478
1479 err_out:
1480 if (items_read)
1481 *items_read = in - str;
1482
1483 return result;
1484 }
1485
1486 /**
1487 * g_ucs4_to_utf16:
1488 * @str: a UCS-4 encoded string
1489 * @len: the maximum length (number of characters) of @str to use.
1490 * If @len < 0, then the string is nul-terminated.
1491 * @items_read: location to store number of bytes read, or %NULL.
1492 * If an error occurs then the index of the invalid input
1493 * is stored here.
1494 * @items_written: location to store number of <type>gunichar2</type>
1495 * written, or %NULL. The value stored here does not
1496 * include the trailing 0.
1497 * @error: location to store the error occuring, or %NULL to ignore
1498 * errors. Any of the errors in #GConvertError other than
1499 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
1500 *
1501 * Convert a string from UCS-4 to UTF-16. A 0 character will be
1502 * added to the result after the converted text.
1503 *
1504 * Return value: a pointer to a newly allocated UTF-16 string.
1505 * This value must be freed with g_free(). If an
1506 * error occurs, %NULL will be returned and
1507 * @error set.
1508 **/
1509 gunichar2 *
g_ucs4_to_utf16(const gunichar * str,glong len,glong * items_read,glong * items_written,GError ** error)1510 g_ucs4_to_utf16 (const gunichar *str,
1511 glong len,
1512 glong *items_read,
1513 glong *items_written,
1514 GError **error)
1515 {
1516 gunichar2 *result = NULL;
1517 gint n16;
1518 gint i, j;
1519
1520 n16 = 0;
1521 i = 0;
1522 while ((len < 0 || i < len) && str[i])
1523 {
1524 gunichar wc = str[i];
1525
1526 if (wc < 0xd800)
1527 n16 += 1;
1528 else if (wc < 0xe000)
1529 {
1530 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1531 _("Invalid sequence in conversion input"));
1532
1533 goto err_out;
1534 }
1535 else if (wc < 0x10000)
1536 n16 += 1;
1537 else if (wc < 0x110000)
1538 n16 += 2;
1539 else
1540 {
1541 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1542 _("Character out of range for UTF-16"));
1543
1544 goto err_out;
1545 }
1546
1547 i++;
1548 }
1549
1550 result = g_new (gunichar2, n16 + 1);
1551
1552 for (i = 0, j = 0; j < n16; i++)
1553 {
1554 gunichar wc = str[i];
1555
1556 if (wc < 0x10000)
1557 {
1558 result[j++] = wc;
1559 }
1560 else
1561 {
1562 result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1563 result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1564 }
1565 }
1566 result[j] = 0;
1567
1568 if (items_written)
1569 *items_written = n16;
1570
1571 err_out:
1572 if (items_read)
1573 *items_read = i;
1574
1575 return result;
1576 }
1577
/* Checks that the byte at p is a UTF-8 continuation byte (10xxxxxx).
 * If so, its low six bits are shifted into val; otherwise control
 * jumps to the label "error".  The expanding scope must provide
 * p, val and that label.  p itself is not advanced.
 */
#define CONTINUATION_CHAR                                     \
 G_STMT_START {                                               \
  if ((*(guchar *)p & 0xc0) == 0x80) /* 10xxxxxx */           \
    val = (val << 6) | ((*(guchar *)p) & 0x3f);               \
  else                                                        \
    goto error;                                               \
 } G_STMT_END
1585
1586 static const gchar *
fast_validate(const char * str)1587 fast_validate (const char *str)
1588
1589 {
1590 gunichar val = 0;
1591 gunichar min = 0;
1592 const gchar *p;
1593
1594 for (p = str; *p; p++)
1595 {
1596 if (*(guchar *)p < 128)
1597 /* done */;
1598 else
1599 {
1600 const gchar *last;
1601
1602 last = p;
1603 if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
1604 {
1605 if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
1606 goto error;
1607 p++;
1608 if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
1609 goto error;
1610 }
1611 else
1612 {
1613 if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
1614 {
1615 min = (1 << 11);
1616 val = *(guchar *)p & 0x0f;
1617 goto TWO_REMAINING;
1618 }
1619 else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
1620 {
1621 min = (1 << 16);
1622 val = *(guchar *)p & 0x07;
1623 }
1624 else
1625 goto error;
1626
1627 p++;
1628 CONTINUATION_CHAR;
1629 TWO_REMAINING:
1630 p++;
1631 CONTINUATION_CHAR;
1632 p++;
1633 CONTINUATION_CHAR;
1634
1635 if (G_UNLIKELY (val < min))
1636 goto error;
1637
1638 if (G_UNLIKELY (!UNICODE_VALID(val)))
1639 goto error;
1640 }
1641
1642 continue;
1643
1644 error:
1645 return last;
1646 }
1647 }
1648
1649 return p;
1650 }
1651
1652 static const gchar *
fast_validate_len(const char * str,gssize max_len)1653 fast_validate_len (const char *str,
1654 gssize max_len)
1655
1656 {
1657 gunichar val = 0;
1658 gunichar min = 0;
1659 const gchar *p;
1660
1661 g_assert (max_len >= 0);
1662
1663 for (p = str; ((p - str) < max_len) && *p; p++)
1664 {
1665 if (*(guchar *)p < 128)
1666 /* done */;
1667 else
1668 {
1669 const gchar *last;
1670
1671 last = p;
1672 if ((*(guchar *)p & 0xe0) == 0xc0) /* 110xxxxx */
1673 {
1674 if (G_UNLIKELY (max_len - (p - str) < 2))
1675 goto error;
1676
1677 if (G_UNLIKELY ((*(guchar *)p & 0x1e) == 0))
1678 goto error;
1679 p++;
1680 if (G_UNLIKELY ((*(guchar *)p & 0xc0) != 0x80)) /* 10xxxxxx */
1681 goto error;
1682 }
1683 else
1684 {
1685 if ((*(guchar *)p & 0xf0) == 0xe0) /* 1110xxxx */
1686 {
1687 if (G_UNLIKELY (max_len - (p - str) < 3))
1688 goto error;
1689
1690 min = (1 << 11);
1691 val = *(guchar *)p & 0x0f;
1692 goto TWO_REMAINING;
1693 }
1694 else if ((*(guchar *)p & 0xf8) == 0xf0) /* 11110xxx */
1695 {
1696 if (G_UNLIKELY (max_len - (p - str) < 4))
1697 goto error;
1698
1699 min = (1 << 16);
1700 val = *(guchar *)p & 0x07;
1701 }
1702 else
1703 goto error;
1704
1705 p++;
1706 CONTINUATION_CHAR;
1707 TWO_REMAINING:
1708 p++;
1709 CONTINUATION_CHAR;
1710 p++;
1711 CONTINUATION_CHAR;
1712
1713 if (G_UNLIKELY (val < min))
1714 goto error;
1715 if (G_UNLIKELY (!UNICODE_VALID(val)))
1716 goto error;
1717 }
1718
1719 continue;
1720
1721 error:
1722 return last;
1723 }
1724 }
1725
1726 return p;
1727 }
1728
1729 /**
1730 * g_utf8_validate:
1731 * @str: a pointer to character data
1732 * @max_len: max bytes to validate, or -1 to go until NUL
1733 * @end: return location for end of valid data
1734 *
1735 * Validates UTF-8 encoded text. @str is the text to validate;
1736 * if @str is nul-terminated, then @max_len can be -1, otherwise
1737 * @max_len should be the number of bytes to validate.
1738 * If @end is non-%NULL, then the end of the valid range
1739 * will be stored there (i.e. the start of the first invalid
1740 * character if some bytes were invalid, or the end of the text
1741 * being validated otherwise).
1742 *
1743 * Note that g_utf8_validate() returns %FALSE if @max_len is
1744 * positive and NUL is met before @max_len bytes have been read.
1745 *
1746 * Returns %TRUE if all of @str was valid. Many GLib and GTK+
1747 * routines <emphasis>require</emphasis> valid UTF-8 as input;
1748 * so data read from a file or the network should be checked
1749 * with g_utf8_validate() before doing anything else with it.
1750 *
1751 * Return value: %TRUE if the text was valid UTF-8
1752 **/
1753 gboolean
g_utf8_validate(const char * str,gssize max_len,const gchar ** end)1754 g_utf8_validate (const char *str,
1755 gssize max_len,
1756 const gchar **end)
1757
1758 {
1759 const gchar *p;
1760
1761 if (max_len < 0)
1762 p = fast_validate (str);
1763 else
1764 p = fast_validate_len (str, max_len);
1765
1766 if (end)
1767 *end = p;
1768
1769 if ((max_len >= 0 && p != str + max_len) ||
1770 (max_len < 0 && *p != '\0'))
1771 return FALSE;
1772 else
1773 return TRUE;
1774 }
1775
1776 /**
1777 * g_unichar_validate:
1778 * @ch: a Unicode character
1779 *
1780 * Checks whether @ch is a valid Unicode character. Some possible
1781 * integer values of @ch will not be valid. 0 is considered a valid
1782 * character, though it's normally a string terminator.
1783 *
1784 * Return value: %TRUE if @ch is a valid Unicode character
1785 **/
1786 gboolean
g_unichar_validate(gunichar ch)1787 g_unichar_validate (gunichar ch)
1788 {
1789 return UNICODE_VALID (ch);
1790 }
1791
1792 /**
1793 * g_utf8_strreverse:
1794 * @str: a UTF-8 encoded string
1795 * @len: the maximum length of @str to use, in bytes. If @len < 0,
1796 * then the string is nul-terminated.
1797 *
1798 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
1799 * (Use g_utf8_validate() on all text before trying to use UTF-8
1800 * utility functions with it.)
1801 *
1802 * This function is intended for programmatic uses of reversed strings.
1803 * It pays no attention to decomposed characters, combining marks, byte
1804 * order marks, directional indicators (LRM, LRO, etc) and similar
1805 * characters which might need special handling when reversing a string
1806 * for display purposes.
1807 *
1808 * Note that unlike g_strreverse(), this function returns
1809 * newly-allocated memory, which should be freed with g_free() when
1810 * no longer needed.
1811 *
1812 * Returns: a newly-allocated string which is the reverse of @str.
1813 *
1814 * Since: 2.2
1815 */
1816 gchar *
g_utf8_strreverse(const gchar * str,gssize len)1817 g_utf8_strreverse (const gchar *str,
1818 gssize len)
1819 {
1820 gchar *r, *result;
1821 const gchar *p;
1822
1823 if (len < 0)
1824 len = strlen (str);
1825
1826 result = g_new (gchar, len + 1);
1827 r = result + len;
1828 p = str;
1829 while (r > result)
1830 {
1831 gchar *m, skip = g_utf8_skip[*(guchar*) p];
1832 r -= skip;
1833 for (m = r; skip; skip--)
1834 *m++ = *p++;
1835 }
1836 result[len] = 0;
1837
1838 return result;
1839 }
1840
1841
1842 gchar *
_g_utf8_make_valid(const gchar * name)1843 _g_utf8_make_valid (const gchar *name)
1844 {
1845 GString *string;
1846 const gchar *remainder, *invalid;
1847 gint remaining_bytes, valid_bytes;
1848
1849 g_return_val_if_fail (name != NULL, NULL);
1850
1851 string = NULL;
1852 remainder = name;
1853 remaining_bytes = strlen (name);
1854
1855 while (remaining_bytes != 0)
1856 {
1857 if (g_utf8_validate (remainder, remaining_bytes, &invalid))
1858 break;
1859 valid_bytes = invalid - remainder;
1860
1861 if (string == NULL)
1862 string = g_string_sized_new (remaining_bytes);
1863
1864 g_string_append_len (string, remainder, valid_bytes);
1865 /* append U+FFFD REPLACEMENT CHARACTER */
1866 g_string_append (string, "\357\277\275");
1867
1868 remaining_bytes -= valid_bytes + 1;
1869 remainder = invalid + 1;
1870 }
1871
1872 if (string == NULL)
1873 return g_strdup (name);
1874
1875 g_string_append (string, remainder);
1876
1877 g_assert (g_utf8_validate (string->str, -1, NULL));
1878
1879 return g_string_free (string, FALSE);
1880 }
1881
1882
1883 #define __G_UTF8_C__
1884 #include "galiasdef.c"
1885