1 /* GLIB - Library of useful routines for C programming
2 *
3 * gconvert.c: Convert between character sets using iconv
4 * Copyright Red Hat Inc., 2000
5 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "config.h"
22 #include "glibconfig.h"
23
24 #ifndef G_OS_WIN32
25 #include <iconv.h>
26 #endif
27 #include <errno.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdlib.h>
31
32 #ifdef G_OS_WIN32
33 #include "win_iconv.c"
34 #endif
35
36 #ifdef G_PLATFORM_WIN32
37 #define STRICT
38 #include <windows.h>
39 #undef STRICT
40 #endif
41
42 #include "gconvert.h"
43
44 #include "gcharsetprivate.h"
45 #include "gslist.h"
46 #include "gstrfuncs.h"
47 #include "gtestutils.h"
48 #include "gthread.h"
49 #include "gthreadprivate.h"
50 #include "gunicode.h"
51 #include "gfileutils.h"
52 #include "genviron.h"
53
54 #include "glibintl.h"
55
56
57 /**
58 * SECTION:conversions
59 * @title: Character Set Conversion
60 * @short_description: convert strings between different character sets
61 *
 * The g_convert() family of functions wrap the functionality of iconv().
63 * In addition to pure character set conversions, GLib has functions to
64 * deal with the extra complications of encodings for file names.
65 *
66 * ## File Name Encodings
67 *
68 * Historically, UNIX has not had a defined encoding for file names:
69 * a file name is valid as long as it does not have path separators
70 * in it ("/"). However, displaying file names may require conversion:
71 * from the character set in which they were created, to the character
72 * set in which the application operates. Consider the Spanish file name
73 * "Presentación.sxi". If the application which created it uses
74 * ISO-8859-1 for its encoding,
75 * |[
76 * Character: P r e s e n t a c i ó n . s x i
77 * Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69
78 * ]|
 * However, if the application uses UTF-8, the actual file name on
80 * disk would look like this:
81 * |[
82 * Character: P r e s e n t a c i ó n . s x i
83 * Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69
84 * ]|
 * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use
86 * GLib do the same thing. If you get a file name from the file system,
87 * for example, from readdir() or from g_dir_read_name(), and you wish
88 * to display the file name to the user, you will need to convert it
89 * into UTF-8. The opposite case is when the user types the name of a
90 * file they wish to save: the toolkit will give you that string in
91 * UTF-8 encoding, and you will need to convert it to the character
92 * set used for file names before you can create the file with open()
93 * or fopen().
94 *
95 * By default, GLib assumes that file names on disk are in UTF-8
96 * encoding. This is a valid assumption for file systems which
97 * were created relatively recently: most applications use UTF-8
98 * encoding for their strings, and that is also what they use for
99 * the file names they create. However, older file systems may
100 * still contain file names created in "older" encodings, such as
101 * ISO-8859-1. In this case, for compatibility reasons, you may want
102 * to instruct GLib to use that particular encoding for file names
103 * rather than UTF-8. You can do this by specifying the encoding for
104 * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING]
105 * environment variable. For example, if your installation uses
106 * ISO-8859-1 for file names, you can put this in your `~/.profile`:
107 * |[
108 * export G_FILENAME_ENCODING=ISO-8859-1
109 * ]|
110 * GLib provides the functions g_filename_to_utf8() and
111 * g_filename_from_utf8() to perform the necessary conversions.
112 * These functions convert file names from the encoding specified
113 * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This
114 * [diagram][file-name-encodings-diagram] illustrates how
115 * these functions are used to convert between UTF-8 and the
116 * encoding for file names in the file system.
117 *
 * ## Conversion between file name encodings # {#file-name-encodings-diagram}
119 *
120 * ![](file-name-encodings.png)
121 *
122 * ## Checklist for Application Writers
123 *
124 * This section is a practical summary of the detailed
125 * things to do to make sure your applications process file
126 * name encodings correctly.
127 *
128 * 1. If you get a file name from the file system from a function
129 * such as readdir() or gtk_file_chooser_get_filename(), you do
130 * not need to do any conversion to pass that file name to
131 * functions like open(), rename(), or fopen() -- those are "raw"
132 * file names which the file system understands.
133 *
134 * 2. If you need to display a file name, convert it to UTF-8 first
135 * by using g_filename_to_utf8(). If conversion fails, display a
136 * string like "Unknown file name". Do not convert this string back
137 * into the encoding used for file names if you wish to pass it to
138 * the file system; use the original file name instead.
139 *
140 * For example, the document window of a word processor could display
141 * "Unknown file name" in its title bar but still let the user save
142 * the file, as it would keep the raw file name internally. This
143 * can happen if the user has not set the `G_FILENAME_ENCODING`
144 * environment variable even though he has files whose names are
145 * not encoded in UTF-8.
146 *
147 * 3. If your user interface lets the user type a file name for saving
148 * or renaming, convert it to the encoding used for file names in
149 * the file system by using g_filename_from_utf8(). Pass the converted
150 * file name to functions like fopen(). If conversion fails, ask the
151 * user to enter a different file name. This can happen if the user
152 * types Japanese characters when `G_FILENAME_ENCODING` is set to
153 * `ISO-8859-1`, for example.
154 */
155
156 /* We try to terminate strings in unknown charsets with this many zero bytes
157 * to ensure that multibyte strings really are nul-terminated when we return
158 * them from g_convert() and friends.
159 */
160 #define NUL_TERMINATOR_LENGTH 4
161
G_DEFINE_QUARK(g_convert_error,g_convert_error)162 G_DEFINE_QUARK (g_convert_error, g_convert_error)
163
164 static gboolean
165 try_conversion (const char *to_codeset,
166 const char *from_codeset,
167 iconv_t *cd)
168 {
169 *cd = iconv_open (to_codeset, from_codeset);
170
171 if (*cd == (iconv_t)-1 && errno == EINVAL)
172 return FALSE;
173 else
174 return TRUE;
175 }
176
177 static gboolean
try_to_aliases(const char ** to_aliases,const char * from_codeset,iconv_t * cd)178 try_to_aliases (const char **to_aliases,
179 const char *from_codeset,
180 iconv_t *cd)
181 {
182 if (to_aliases)
183 {
184 const char **p = to_aliases;
185 while (*p)
186 {
187 if (try_conversion (*p, from_codeset, cd))
188 return TRUE;
189
190 p++;
191 }
192 }
193
194 return FALSE;
195 }
196
197 /**
198 * g_iconv_open: (skip)
199 * @to_codeset: destination codeset
200 * @from_codeset: source codeset
201 *
202 * Same as the standard UNIX routine iconv_open(), but
203 * may be implemented via libiconv on UNIX flavors that lack
204 * a native implementation.
205 *
206 * GLib provides g_convert() and g_locale_to_utf8() which are likely
207 * more convenient than the raw iconv wrappers.
208 *
209 * Returns: a "conversion descriptor", or (GIConv)-1 if
210 * opening the converter failed.
211 **/
212 GIConv
g_iconv_open(const gchar * to_codeset,const gchar * from_codeset)213 g_iconv_open (const gchar *to_codeset,
214 const gchar *from_codeset)
215 {
216 iconv_t cd;
217
218 if (!try_conversion (to_codeset, from_codeset, &cd))
219 {
220 const char **to_aliases = _g_charset_get_aliases (to_codeset);
221 const char **from_aliases = _g_charset_get_aliases (from_codeset);
222
223 if (from_aliases)
224 {
225 const char **p = from_aliases;
226 while (*p)
227 {
228 if (try_conversion (to_codeset, *p, &cd))
229 goto out;
230
231 if (try_to_aliases (to_aliases, *p, &cd))
232 goto out;
233
234 p++;
235 }
236 }
237
238 if (try_to_aliases (to_aliases, from_codeset, &cd))
239 goto out;
240 }
241
242 out:
243 return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
244 }
245
246 /**
247 * g_iconv: (skip)
248 * @converter: conversion descriptor from g_iconv_open()
249 * @inbuf: bytes to convert
250 * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
251 * @outbuf: converted output bytes
252 * @outbytes_left: inout parameter, bytes available to fill in @outbuf
253 *
254 * Same as the standard UNIX routine iconv(), but
255 * may be implemented via libiconv on UNIX flavors that lack
256 * a native implementation.
257 *
258 * GLib provides g_convert() and g_locale_to_utf8() which are likely
259 * more convenient than the raw iconv wrappers.
260 *
261 * Note that the behaviour of iconv() for characters which are valid in the
262 * input character set, but which have no representation in the output character
263 * set, is implementation defined. This function may return success (with a
264 * positive number of non-reversible conversions as replacement characters were
265 * used), or it may return -1 and set an error such as %EILSEQ, in such a
266 * situation.
267 *
268 * Returns: count of non-reversible conversions, or -1 on error
269 **/
270 gsize
g_iconv(GIConv converter,gchar ** inbuf,gsize * inbytes_left,gchar ** outbuf,gsize * outbytes_left)271 g_iconv (GIConv converter,
272 gchar **inbuf,
273 gsize *inbytes_left,
274 gchar **outbuf,
275 gsize *outbytes_left)
276 {
277 iconv_t cd = (iconv_t)converter;
278
279 return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
280 }
281
282 /**
283 * g_iconv_close: (skip)
284 * @converter: a conversion descriptor from g_iconv_open()
285 *
286 * Same as the standard UNIX routine iconv_close(), but
287 * may be implemented via libiconv on UNIX flavors that lack
288 * a native implementation. Should be called to clean up
289 * the conversion descriptor from g_iconv_open() when
290 * you are done converting things.
291 *
292 * GLib provides g_convert() and g_locale_to_utf8() which are likely
293 * more convenient than the raw iconv wrappers.
294 *
295 * Returns: -1 on error, 0 on success
296 **/
297 gint
g_iconv_close(GIConv converter)298 g_iconv_close (GIConv converter)
299 {
300 iconv_t cd = (iconv_t)converter;
301
302 return iconv_close (cd);
303 }
304
305 static GIConv
open_converter(const gchar * to_codeset,const gchar * from_codeset,GError ** error)306 open_converter (const gchar *to_codeset,
307 const gchar *from_codeset,
308 GError **error)
309 {
310 GIConv cd;
311
312 cd = g_iconv_open (to_codeset, from_codeset);
313
314 if (cd == (GIConv) -1)
315 {
316 /* Something went wrong. */
317 if (error)
318 {
319 if (errno == EINVAL)
320 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
321 _("Conversion from character set “%s” to “%s” is not supported"),
322 from_codeset, to_codeset);
323 else
324 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
325 _("Could not open converter from “%s” to “%s”"),
326 from_codeset, to_codeset);
327 }
328 }
329
330 return cd;
331 }
332
333 static int
close_converter(GIConv cd)334 close_converter (GIConv cd)
335 {
336 if (cd == (GIConv) -1)
337 return 0;
338
339 return g_iconv_close (cd);
340 }
341
342 /**
343 * g_convert_with_iconv: (skip)
344 * @str: (array length=len) (element-type guint8):
345 * the string to convert.
346 * @len: the length of the string in bytes, or -1 if the string is
347 * nul-terminated (Note that some encodings may allow nul
348 * bytes to occur inside strings. In that case, using -1
349 * for the @len parameter is unsafe)
350 * @converter: conversion descriptor from g_iconv_open()
351 * @bytes_read: (out) (optional): location to store the number of bytes in
352 * the input string that were successfully converted, or %NULL.
353 * Even if the conversion was successful, this may be
354 * less than @len if there were partial characters
355 * at the end of the input. If the error
356 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
357 * stored will be the byte offset after the last valid
358 * input sequence.
359 * @bytes_written: (out) (optional): the number of bytes stored in
360 * the output buffer (not including the terminating nul).
361 * @error: location to store the error occurring, or %NULL to ignore
362 * errors. Any of the errors in #GConvertError may occur.
363 *
364 * Converts a string from one character set to another.
365 *
366 * Note that you should use g_iconv() for streaming conversions.
367 * Despite the fact that @bytes_read can return information about partial
368 * characters, the g_convert_... functions are not generally suitable
369 * for streaming. If the underlying converter maintains internal state,
370 * then this won't be preserved across successive calls to g_convert(),
371 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
372 * this is the GNU C converter for CP1255 which does not emit a base
373 * character until it knows that the next character is not a mark that
374 * could combine with the base character.)
375 *
376 * Characters which are valid in the input character set, but which have no
377 * representation in the output character set will result in a
378 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv()
379 * specification, which leaves this behaviour implementation defined. Note that
380 * this is the same error code as is returned for an invalid byte sequence in
381 * the input character set. To get defined behaviour for conversion of
382 * unrepresentable characters, use g_convert_with_fallback().
383 *
384 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
385 * If the conversion was successful, a newly allocated buffer
386 * containing the converted string, which must be freed with
387 * g_free(). Otherwise %NULL and @error will be set.
388 **/
389 gchar*
g_convert_with_iconv(const gchar * str,gssize len,GIConv converter,gsize * bytes_read,gsize * bytes_written,GError ** error)390 g_convert_with_iconv (const gchar *str,
391 gssize len,
392 GIConv converter,
393 gsize *bytes_read,
394 gsize *bytes_written,
395 GError **error)
396 {
397 gchar *dest;
398 gchar *outp;
399 const gchar *p;
400 gsize inbytes_remaining;
401 gsize outbytes_remaining;
402 gsize err;
403 gsize outbuf_size;
404 gboolean have_error = FALSE;
405 gboolean done = FALSE;
406 gboolean reset = FALSE;
407
408 g_return_val_if_fail (converter != (GIConv) -1, NULL);
409
410 if (len < 0)
411 len = strlen (str);
412
413 p = str;
414 inbytes_remaining = len;
415 outbuf_size = len + NUL_TERMINATOR_LENGTH;
416
417 outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
418 outp = dest = g_malloc (outbuf_size);
419
420 while (!done && !have_error)
421 {
422 if (reset)
423 err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
424 else
425 err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
426
427 if (err == (gsize) -1)
428 {
429 switch (errno)
430 {
431 case EINVAL:
432 /* Incomplete text, do not report an error */
433 done = TRUE;
434 break;
435 case E2BIG:
436 {
437 gsize used = outp - dest;
438
439 outbuf_size *= 2;
440 dest = g_realloc (dest, outbuf_size);
441
442 outp = dest + used;
443 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
444 }
445 break;
446 case EILSEQ:
447 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
448 _("Invalid byte sequence in conversion input"));
449 have_error = TRUE;
450 break;
451 default:
452 {
453 int errsv = errno;
454
455 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
456 _("Error during conversion: %s"),
457 g_strerror (errsv));
458 }
459 have_error = TRUE;
460 break;
461 }
462 }
463 else if (err > 0)
464 {
465 /* @err gives the number of replacement characters used. */
466 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
467 _("Unrepresentable character in conversion input"));
468 have_error = TRUE;
469 }
470 else
471 {
472 if (!reset)
473 {
474 /* call g_iconv with NULL inbuf to cleanup shift state */
475 reset = TRUE;
476 inbytes_remaining = 0;
477 }
478 else
479 done = TRUE;
480 }
481 }
482
483 memset (outp, 0, NUL_TERMINATOR_LENGTH);
484
485 if (bytes_read)
486 *bytes_read = p - str;
487 else
488 {
489 if ((p - str) != len)
490 {
491 if (!have_error)
492 {
493 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
494 _("Partial character sequence at end of input"));
495 have_error = TRUE;
496 }
497 }
498 }
499
500 if (bytes_written)
501 *bytes_written = outp - dest; /* Doesn't include '\0' */
502
503 if (have_error)
504 {
505 g_free (dest);
506 return NULL;
507 }
508 else
509 return dest;
510 }
511
512 /**
513 * g_convert:
514 * @str: (array length=len) (element-type guint8):
515 * the string to convert.
516 * @len: the length of the string in bytes, or -1 if the string is
517 * nul-terminated (Note that some encodings may allow nul
518 * bytes to occur inside strings. In that case, using -1
519 * for the @len parameter is unsafe)
520 * @to_codeset: name of character set into which to convert @str
521 * @from_codeset: character set of @str.
522 * @bytes_read: (out) (optional): location to store the number of bytes in
523 * the input string that were successfully converted, or %NULL.
524 * Even if the conversion was successful, this may be
525 * less than @len if there were partial characters
526 * at the end of the input. If the error
527 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
528 * stored will be the byte offset after the last valid
529 * input sequence.
530 * @bytes_written: (out) (optional): the number of bytes stored in
531 * the output buffer (not including the terminating nul).
532 * @error: location to store the error occurring, or %NULL to ignore
533 * errors. Any of the errors in #GConvertError may occur.
534 *
535 * Converts a string from one character set to another.
536 *
537 * Note that you should use g_iconv() for streaming conversions.
538 * Despite the fact that @bytes_read can return information about partial
539 * characters, the g_convert_... functions are not generally suitable
540 * for streaming. If the underlying converter maintains internal state,
541 * then this won't be preserved across successive calls to g_convert(),
542 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
543 * this is the GNU C converter for CP1255 which does not emit a base
544 * character until it knows that the next character is not a mark that
545 * could combine with the base character.)
546 *
547 * Using extensions such as "//TRANSLIT" may not work (or may not work
548 * well) on many platforms. Consider using g_str_to_ascii() instead.
549 *
550 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
551 * If the conversion was successful, a newly allocated buffer
552 * containing the converted string, which must be freed with g_free().
553 * Otherwise %NULL and @error will be set.
554 **/
555 gchar*
g_convert(const gchar * str,gssize len,const gchar * to_codeset,const gchar * from_codeset,gsize * bytes_read,gsize * bytes_written,GError ** error)556 g_convert (const gchar *str,
557 gssize len,
558 const gchar *to_codeset,
559 const gchar *from_codeset,
560 gsize *bytes_read,
561 gsize *bytes_written,
562 GError **error)
563 {
564 gchar *res;
565 GIConv cd;
566
567 g_return_val_if_fail (str != NULL, NULL);
568 g_return_val_if_fail (to_codeset != NULL, NULL);
569 g_return_val_if_fail (from_codeset != NULL, NULL);
570
571 cd = open_converter (to_codeset, from_codeset, error);
572
573 if (cd == (GIConv) -1)
574 {
575 if (bytes_read)
576 *bytes_read = 0;
577
578 if (bytes_written)
579 *bytes_written = 0;
580
581 return NULL;
582 }
583
584 res = g_convert_with_iconv (str, len, cd,
585 bytes_read, bytes_written,
586 error);
587
588 close_converter (cd);
589
590 return res;
591 }
592
593 /**
594 * g_convert_with_fallback:
595 * @str: (array length=len) (element-type guint8):
596 * the string to convert.
597 * @len: the length of the string in bytes, or -1 if the string is
598 * nul-terminated (Note that some encodings may allow nul
599 * bytes to occur inside strings. In that case, using -1
600 * for the @len parameter is unsafe)
601 * @to_codeset: name of character set into which to convert @str
602 * @from_codeset: character set of @str.
603 * @fallback: UTF-8 string to use in place of characters not
604 * present in the target encoding. (The string must be
605 * representable in the target encoding).
606 * If %NULL, characters not in the target encoding will
607 * be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
608 * @bytes_read: (out) (optional): location to store the number of bytes in
609 * the input string that were successfully converted, or %NULL.
610 * Even if the conversion was successful, this may be
611 * less than @len if there were partial characters
612 * at the end of the input.
613 * @bytes_written: (out) (optional): the number of bytes stored in
614 * the output buffer (not including the terminating nul).
615 * @error: location to store the error occurring, or %NULL to ignore
616 * errors. Any of the errors in #GConvertError may occur.
617 *
618 * Converts a string from one character set to another, possibly
619 * including fallback sequences for characters not representable
620 * in the output. Note that it is not guaranteed that the specification
621 * for the fallback sequences in @fallback will be honored. Some
622 * systems may do an approximate conversion from @from_codeset
623 * to @to_codeset in their iconv() functions,
624 * in which case GLib will simply return that approximate conversion.
625 *
626 * Note that you should use g_iconv() for streaming conversions.
627 * Despite the fact that @bytes_read can return information about partial
628 * characters, the g_convert_... functions are not generally suitable
629 * for streaming. If the underlying converter maintains internal state,
630 * then this won't be preserved across successive calls to g_convert(),
631 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
632 * this is the GNU C converter for CP1255 which does not emit a base
633 * character until it knows that the next character is not a mark that
634 * could combine with the base character.)
635 *
636 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
637 * If the conversion was successful, a newly allocated buffer
638 * containing the converted string, which must be freed with g_free().
639 * Otherwise %NULL and @error will be set.
640 **/
641 gchar*
g_convert_with_fallback(const gchar * str,gssize len,const gchar * to_codeset,const gchar * from_codeset,const gchar * fallback,gsize * bytes_read,gsize * bytes_written,GError ** error)642 g_convert_with_fallback (const gchar *str,
643 gssize len,
644 const gchar *to_codeset,
645 const gchar *from_codeset,
646 const gchar *fallback,
647 gsize *bytes_read,
648 gsize *bytes_written,
649 GError **error)
650 {
651 gchar *utf8;
652 gchar *dest;
653 gchar *outp;
654 const gchar *insert_str = NULL;
655 const gchar *p;
656 gsize inbytes_remaining;
657 const gchar *save_p = NULL;
658 gsize save_inbytes = 0;
659 gsize outbytes_remaining;
660 gsize err;
661 GIConv cd;
662 gsize outbuf_size;
663 gboolean have_error = FALSE;
664 gboolean done = FALSE;
665
666 GError *local_error = NULL;
667
668 g_return_val_if_fail (str != NULL, NULL);
669 g_return_val_if_fail (to_codeset != NULL, NULL);
670 g_return_val_if_fail (from_codeset != NULL, NULL);
671
672 if (len < 0)
673 len = strlen (str);
674
675 /* Try an exact conversion; we only proceed if this fails
676 * due to an illegal sequence in the input string.
677 */
678 dest = g_convert (str, len, to_codeset, from_codeset,
679 bytes_read, bytes_written, &local_error);
680 if (!local_error)
681 return dest;
682
683 if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
684 {
685 g_propagate_error (error, local_error);
686 return NULL;
687 }
688 else
689 g_error_free (local_error);
690
691 local_error = NULL;
692
693 /* No go; to proceed, we need a converter from "UTF-8" to
694 * to_codeset, and the string as UTF-8.
695 */
696 cd = open_converter (to_codeset, "UTF-8", error);
697 if (cd == (GIConv) -1)
698 {
699 if (bytes_read)
700 *bytes_read = 0;
701
702 if (bytes_written)
703 *bytes_written = 0;
704
705 return NULL;
706 }
707
708 utf8 = g_convert (str, len, "UTF-8", from_codeset,
709 bytes_read, &inbytes_remaining, error);
710 if (!utf8)
711 {
712 close_converter (cd);
713 if (bytes_written)
714 *bytes_written = 0;
715 return NULL;
716 }
717
718 /* Now the heart of the code. We loop through the UTF-8 string, and
719 * whenever we hit an offending character, we form fallback, convert
720 * the fallback to the target codeset, and then go back to
721 * converting the original string after finishing with the fallback.
722 *
723 * The variables save_p and save_inbytes store the input state
724 * for the original string while we are converting the fallback
725 */
726 p = utf8;
727
728 outbuf_size = len + NUL_TERMINATOR_LENGTH;
729 outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
730 outp = dest = g_malloc (outbuf_size);
731
732 while (!done && !have_error)
733 {
734 gsize inbytes_tmp = inbytes_remaining;
735 err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
736 inbytes_remaining = inbytes_tmp;
737
738 if (err == (gsize) -1)
739 {
740 switch (errno)
741 {
742 case EINVAL:
743 g_assert_not_reached();
744 break;
745 case E2BIG:
746 {
747 gsize used = outp - dest;
748
749 outbuf_size *= 2;
750 dest = g_realloc (dest, outbuf_size);
751
752 outp = dest + used;
753 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
754
755 break;
756 }
757 case EILSEQ:
758 if (save_p)
759 {
760 /* Error converting fallback string - fatal
761 */
762 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
763 _("Cannot convert fallback “%s” to codeset “%s”"),
764 insert_str, to_codeset);
765 have_error = TRUE;
766 break;
767 }
768 else if (p)
769 {
770 if (!fallback)
771 {
772 gunichar ch = g_utf8_get_char (p);
773 insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
774 ch);
775 }
776 else
777 insert_str = fallback;
778
779 save_p = g_utf8_next_char (p);
780 save_inbytes = inbytes_remaining - (save_p - p);
781 p = insert_str;
782 inbytes_remaining = strlen (p);
783 break;
784 }
785 /* if p is null */
786 G_GNUC_FALLTHROUGH;
787 default:
788 {
789 int errsv = errno;
790
791 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
792 _("Error during conversion: %s"),
793 g_strerror (errsv));
794 }
795
796 have_error = TRUE;
797 break;
798 }
799 }
800 else
801 {
802 if (save_p)
803 {
804 if (!fallback)
805 g_free ((gchar *)insert_str);
806 p = save_p;
807 inbytes_remaining = save_inbytes;
808 save_p = NULL;
809 }
810 else if (p)
811 {
812 /* call g_iconv with NULL inbuf to cleanup shift state */
813 p = NULL;
814 inbytes_remaining = 0;
815 }
816 else
817 done = TRUE;
818 }
819 }
820
821 /* Cleanup
822 */
823 memset (outp, 0, NUL_TERMINATOR_LENGTH);
824
825 close_converter (cd);
826
827 if (bytes_written)
828 *bytes_written = outp - dest; /* Doesn't include '\0' */
829
830 g_free (utf8);
831
832 if (have_error)
833 {
834 if (save_p && !fallback)
835 g_free ((gchar *)insert_str);
836 g_free (dest);
837 return NULL;
838 }
839 else
840 return dest;
841 }
842
843 /*
844 * g_locale_to_utf8
845 *
846 *
847 */
848
849 /*
850 * Validate @string as UTF-8. @len can be negative if @string is
851 * nul-terminated, or a non-negative value in bytes. If @string ends in an
852 * incomplete sequence, or contains any illegal sequences or nul codepoints,
853 * %NULL will be returned and the error set to
854 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
855 * On success, @bytes_read and @bytes_written, if provided, will be set to
856 * the number of bytes in @string up to @len or the terminating nul byte.
857 * On error, @bytes_read will be set to the byte offset after the last valid
858 * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
859 */
860 static gchar *
strdup_len(const gchar * string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)861 strdup_len (const gchar *string,
862 gssize len,
863 gsize *bytes_read,
864 gsize *bytes_written,
865 GError **error)
866 {
867 gsize real_len;
868 const gchar *end_valid;
869
870 if (!g_utf8_validate (string, len, &end_valid))
871 {
872 if (bytes_read)
873 *bytes_read = end_valid - string;
874 if (bytes_written)
875 *bytes_written = 0;
876
877 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
878 _("Invalid byte sequence in conversion input"));
879 return NULL;
880 }
881
882 real_len = end_valid - string;
883
884 if (bytes_read)
885 *bytes_read = real_len;
886 if (bytes_written)
887 *bytes_written = real_len;
888
889 return g_strndup (string, real_len);
890 }
891
/* Bit flags selecting which side of a conversion convert_checked()
 * inspects for embedded nul bytes. The values may be OR-ed together.
 */
typedef enum
{
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 0x1,
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 0x2
} ConvertCheckFlags;
897
898 /*
899 * Convert from @string in the encoding identified by @from_codeset,
 * returning a string in the encoding identified by @to_codeset.
901 * @len can be negative if @string is nul-terminated, or a non-negative
902 * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
903 * to check the input, the output, or both, for embedded nul bytes.
904 * On success, @bytes_read, if provided, will be set to the number of bytes
905 * in @string up to @len or the terminating nul byte, and @bytes_written, if
906 * provided, will be set to the number of output bytes written into the
907 * returned buffer, excluding the terminating nul sequence.
908 * On error, @bytes_read will be set to the byte offset after the last valid
909 * sequence in @string, and @bytes_written will be set to 0.
910 */
911 static gchar *
convert_checked(const gchar * string,gssize len,const gchar * to_codeset,const gchar * from_codeset,ConvertCheckFlags flags,gsize * bytes_read,gsize * bytes_written,GError ** error)912 convert_checked (const gchar *string,
913 gssize len,
914 const gchar *to_codeset,
915 const gchar *from_codeset,
916 ConvertCheckFlags flags,
917 gsize *bytes_read,
918 gsize *bytes_written,
919 GError **error)
920 {
921 gchar *out;
922 gsize outbytes;
923
924 if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0)
925 {
926 const gchar *early_nul = memchr (string, '\0', len);
927 if (early_nul != NULL)
928 {
929 if (bytes_read)
930 *bytes_read = early_nul - string;
931 if (bytes_written)
932 *bytes_written = 0;
933
934 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
935 _("Embedded NUL byte in conversion input"));
936 return NULL;
937 }
938 }
939
940 out = g_convert (string, len, to_codeset, from_codeset,
941 bytes_read, &outbytes, error);
942 if (out == NULL)
943 {
944 if (bytes_written)
945 *bytes_written = 0;
946 return NULL;
947 }
948
949 if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
950 && memchr (out, '\0', outbytes) != NULL)
951 {
952 g_free (out);
953 if (bytes_written)
954 *bytes_written = 0;
955 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
956 _("Embedded NUL byte in conversion output"));
957 return NULL;
958 }
959
960 if (bytes_written)
961 *bytes_written = outbytes;
962 return out;
963 }
964
965 /**
966 * g_locale_to_utf8:
967 * @opsysstring: (array length=len) (element-type guint8): a string in the
968 * encoding of the current locale. On Windows
969 * this means the system codepage.
970 * @len: the length of the string, or -1 if the string is
971 * nul-terminated (Note that some encodings may allow nul
972 * bytes to occur inside strings. In that case, using -1
973 * for the @len parameter is unsafe)
974 * @bytes_read: (out) (optional): location to store the number of bytes in the
975 * input string that were successfully converted, or %NULL.
976 * Even if the conversion was successful, this may be
977 * less than @len if there were partial characters
978 * at the end of the input. If the error
979 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
980 * stored will be the byte offset after the last valid
981 * input sequence.
982 * @bytes_written: (out) (optional): the number of bytes stored in the output
983 * buffer (not including the terminating nul).
984 * @error: location to store the error occurring, or %NULL to ignore
985 * errors. Any of the errors in #GConvertError may occur.
986 *
987 * Converts a string which is in the encoding used for strings by
988 * the C runtime (usually the same as that used by the operating
989 * system) in the [current locale][setlocale] into a UTF-8 string.
990 *
991 * If the source encoding is not UTF-8 and the conversion output contains a
992 * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
993 * function returns %NULL.
994 * If the source encoding is UTF-8, an embedded nul character is treated with
995 * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
996 * earlier versions of this library. Use g_convert() to produce output that
997 * may contain embedded nul characters.
998 *
999 * Returns: (type utf8): The converted string, or %NULL on an error.
1000 **/
1001 gchar *
g_locale_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1002 g_locale_to_utf8 (const gchar *opsysstring,
1003 gssize len,
1004 gsize *bytes_read,
1005 gsize *bytes_written,
1006 GError **error)
1007 {
1008 const char *charset;
1009
1010 if (g_get_charset (&charset))
1011 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1012 else
1013 return convert_checked (opsysstring, len, "UTF-8", charset,
1014 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1015 bytes_read, bytes_written, error);
1016 }
1017
1018 /**
1019 * g_locale_from_utf8:
1020 * @utf8string: a UTF-8 encoded string
1021 * @len: the length of the string, or -1 if the string is
1022 * nul-terminated.
1023 * @bytes_read: (out) (optional): location to store the number of bytes in the
1024 * input string that were successfully converted, or %NULL.
1025 * Even if the conversion was successful, this may be
1026 * less than @len if there were partial characters
1027 * at the end of the input. If the error
1028 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1029 * stored will be the byte offset after the last valid
1030 * input sequence.
1031 * @bytes_written: (out) (optional): the number of bytes stored in the output
1032 * buffer (not including the terminating nul).
1033 * @error: location to store the error occurring, or %NULL to ignore
1034 * errors. Any of the errors in #GConvertError may occur.
1035 *
1036 * Converts a string from UTF-8 to the encoding used for strings by
1037 * the C runtime (usually the same as that used by the operating
1038 * system) in the [current locale][setlocale]. On Windows this means
1039 * the system codepage.
1040 *
1041 * The input string shall not contain nul characters even if the @len
1042 * argument is positive. A nul character found inside the string will result
1043 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
1044 * input that may contain embedded nul characters.
1045 *
1046 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
1047 * A newly-allocated buffer containing the converted string,
1048 * or %NULL on an error, and error will be set.
1049 **/
1050 gchar *
g_locale_from_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1051 g_locale_from_utf8 (const gchar *utf8string,
1052 gssize len,
1053 gsize *bytes_read,
1054 gsize *bytes_written,
1055 GError **error)
1056 {
1057 const gchar *charset;
1058
1059 if (g_get_charset (&charset))
1060 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1061 else
1062 return convert_checked (utf8string, len, charset, "UTF-8",
1063 CONVERT_CHECK_NO_NULS_IN_INPUT,
1064 bytes_read, bytes_written, error);
1065 }
1066
1067 #ifndef G_PLATFORM_WIN32
1068
1069 typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
1070
1071 struct _GFilenameCharsetCache {
1072 gboolean is_utf8;
1073 gchar *charset;
1074 gchar **filename_charsets;
1075 };
1076
1077 static void
filename_charset_cache_free(gpointer data)1078 filename_charset_cache_free (gpointer data)
1079 {
1080 GFilenameCharsetCache *cache = data;
1081 g_free (cache->charset);
1082 g_strfreev (cache->filename_charsets);
1083 g_free (cache);
1084 }
1085
1086 /**
1087 * g_get_filename_charsets:
1088 * @filename_charsets: (out) (transfer none) (array zero-terminated=1):
1089 * return location for the %NULL-terminated list of encoding names
1090 *
1091 * Determines the preferred character sets used for filenames.
 * The first character set from the @filename_charsets is the filename encoding, the
1093 * subsequent character sets are used when trying to generate a displayable
1094 * representation of a filename, see g_filename_display_name().
1095 *
1096 * On Unix, the character sets are determined by consulting the
1097 * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.
1098 * On Windows, the character set used in the GLib API is always UTF-8
1099 * and said environment variables have no effect.
1100 *
1101 * `G_FILENAME_ENCODING` may be set to a comma-separated list of
1102 * character set names. The special token "\@locale" is taken
1103 * to mean the character set for the [current locale][setlocale].
1104 * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,
1105 * the character set of the current locale is taken as the filename
1106 * encoding. If neither environment variable is set, UTF-8 is taken
1107 * as the filename encoding, but the character set of the current locale
1108 * is also put in the list of encodings.
1109 *
 * The returned @filename_charsets belong to GLib and must not be freed.
1111 *
1112 * Note that on Unix, regardless of the locale character set or
1113 * `G_FILENAME_ENCODING` value, the actual file names present
1114 * on a system might be in any random encoding or just gibberish.
1115 *
1116 * Returns: %TRUE if the filename encoding is UTF-8.
1117 *
1118 * Since: 2.6
1119 */
1120 gboolean
g_get_filename_charsets(const gchar *** filename_charsets)1121 g_get_filename_charsets (const gchar ***filename_charsets)
1122 {
1123 static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);
1124 GFilenameCharsetCache *cache = g_private_get (&cache_private);
1125 const gchar *charset;
1126
1127 if (!cache)
1128 cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache));
1129
1130 g_get_charset (&charset);
1131
1132 if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1133 {
1134 const gchar *new_charset;
1135 const gchar *p;
1136 gint i;
1137
1138 g_free (cache->charset);
1139 g_strfreev (cache->filename_charsets);
1140 cache->charset = g_strdup (charset);
1141
1142 p = g_getenv ("G_FILENAME_ENCODING");
1143 if (p != NULL && p[0] != '\0')
1144 {
1145 cache->filename_charsets = g_strsplit (p, ",", 0);
1146 cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1147
1148 for (i = 0; cache->filename_charsets[i]; i++)
1149 {
1150 if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1151 {
1152 g_get_charset (&new_charset);
1153 g_free (cache->filename_charsets[i]);
1154 cache->filename_charsets[i] = g_strdup (new_charset);
1155 }
1156 }
1157 }
1158 else if (g_getenv ("G_BROKEN_FILENAMES") != NULL)
1159 {
1160 cache->filename_charsets = g_new0 (gchar *, 2);
1161 cache->is_utf8 = g_get_charset (&new_charset);
1162 cache->filename_charsets[0] = g_strdup (new_charset);
1163 }
1164 else
1165 {
1166 cache->filename_charsets = g_new0 (gchar *, 3);
1167 cache->is_utf8 = TRUE;
1168 cache->filename_charsets[0] = g_strdup ("UTF-8");
1169 if (!g_get_charset (&new_charset))
1170 cache->filename_charsets[1] = g_strdup (new_charset);
1171 }
1172 }
1173
1174 if (filename_charsets)
1175 *filename_charsets = (const gchar **)cache->filename_charsets;
1176
1177 return cache->is_utf8;
1178 }
1179
1180 #else /* G_PLATFORM_WIN32 */
1181
1182 gboolean
g_get_filename_charsets(const gchar *** filename_charsets)1183 g_get_filename_charsets (const gchar ***filename_charsets)
1184 {
1185 static const gchar *charsets[] = {
1186 "UTF-8",
1187 NULL
1188 };
1189
1190 #ifdef G_OS_WIN32
1191 /* On Windows GLib pretends that the filename charset is UTF-8 */
1192 if (filename_charsets)
1193 *filename_charsets = charsets;
1194
1195 return TRUE;
1196 #else
1197 gboolean result;
1198
1199 /* Cygwin works like before */
1200 result = g_get_charset (&(charsets[0]));
1201
1202 if (filename_charsets)
1203 *filename_charsets = charsets;
1204
1205 return result;
1206 #endif
1207 }
1208
1209 #endif /* G_PLATFORM_WIN32 */
1210
1211 static gboolean
get_filename_charset(const gchar ** filename_charset)1212 get_filename_charset (const gchar **filename_charset)
1213 {
1214 const gchar **charsets;
1215 gboolean is_utf8;
1216
1217 is_utf8 = g_get_filename_charsets (&charsets);
1218
1219 if (filename_charset)
1220 *filename_charset = charsets[0];
1221
1222 return is_utf8;
1223 }
1224
1225 /**
1226 * g_filename_to_utf8:
1227 * @opsysstring: (type filename): a string in the encoding for filenames
1228 * @len: the length of the string, or -1 if the string is
1229 * nul-terminated (Note that some encodings may allow nul
1230 * bytes to occur inside strings. In that case, using -1
1231 * for the @len parameter is unsafe)
1232 * @bytes_read: (out) (optional): location to store the number of bytes in the
1233 * input string that were successfully converted, or %NULL.
1234 * Even if the conversion was successful, this may be
1235 * less than @len if there were partial characters
1236 * at the end of the input. If the error
1237 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1238 * stored will be the byte offset after the last valid
1239 * input sequence.
1240 * @bytes_written: (out) (optional): the number of bytes stored in the output
1241 * buffer (not including the terminating nul).
1242 * @error: location to store the error occurring, or %NULL to ignore
1243 * errors. Any of the errors in #GConvertError may occur.
1244 *
1245 * Converts a string which is in the encoding used by GLib for
1246 * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1247 * for filenames; on other platforms, this function indirectly depends on
1248 * the [current locale][setlocale].
1249 *
1250 * The input string shall not contain nul characters even if the @len
1251 * argument is positive. A nul character found inside the string will result
1252 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
1253 * If the source encoding is not UTF-8 and the conversion output contains a
1254 * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
1255 * function returns %NULL. Use g_convert() to produce output that
1256 * may contain embedded nul characters.
1257 *
1258 * Returns: (type utf8): The converted string, or %NULL on an error.
1259 **/
1260 gchar*
g_filename_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1261 g_filename_to_utf8 (const gchar *opsysstring,
1262 gssize len,
1263 gsize *bytes_read,
1264 gsize *bytes_written,
1265 GError **error)
1266 {
1267 const gchar *charset;
1268
1269 g_return_val_if_fail (opsysstring != NULL, NULL);
1270
1271 if (get_filename_charset (&charset))
1272 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1273 else
1274 return convert_checked (opsysstring, len, "UTF-8", charset,
1275 CONVERT_CHECK_NO_NULS_IN_INPUT |
1276 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1277 bytes_read, bytes_written, error);
1278 }
1279
1280 /**
1281 * g_filename_from_utf8:
1282 * @utf8string: (type utf8): a UTF-8 encoded string.
1283 * @len: the length of the string, or -1 if the string is
1284 * nul-terminated.
1285 * @bytes_read: (out) (optional): location to store the number of bytes in
1286 * the input string that were successfully converted, or %NULL.
1287 * Even if the conversion was successful, this may be
1288 * less than @len if there were partial characters
1289 * at the end of the input. If the error
1290 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1291 * stored will be the byte offset after the last valid
1292 * input sequence.
1293 * @bytes_written: (out) (optional): the number of bytes stored in
1294 * the output buffer (not including the terminating nul).
1295 * @error: location to store the error occurring, or %NULL to ignore
1296 * errors. Any of the errors in #GConvertError may occur.
1297 *
1298 * Converts a string from UTF-8 to the encoding GLib uses for
1299 * filenames. Note that on Windows GLib uses UTF-8 for filenames;
1300 * on other platforms, this function indirectly depends on the
1301 * [current locale][setlocale].
1302 *
1303 * The input string shall not contain nul characters even if the @len
1304 * argument is positive. A nul character found inside the string will result
1305 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is
1306 * not UTF-8 and the conversion output contains a nul character, the error
1307 * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL.
1308 *
1309 * Returns: (type filename):
1310 * The converted string, or %NULL on an error.
1311 **/
1312 gchar*
g_filename_from_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1313 g_filename_from_utf8 (const gchar *utf8string,
1314 gssize len,
1315 gsize *bytes_read,
1316 gsize *bytes_written,
1317 GError **error)
1318 {
1319 const gchar *charset;
1320
1321 if (get_filename_charset (&charset))
1322 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1323 else
1324 return convert_checked (utf8string, len, charset, "UTF-8",
1325 CONVERT_CHECK_NO_NULS_IN_INPUT |
1326 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1327 bytes_read, bytes_written, error);
1328 }
1329
1330 /* Test of haystack has the needle prefix, comparing case
1331 * insensitive. haystack may be UTF-8, but needle must
1332 * contain only ascii. */
1333 static gboolean
has_case_prefix(const gchar * haystack,const gchar * needle)1334 has_case_prefix (const gchar *haystack, const gchar *needle)
1335 {
1336 const gchar *h, *n;
1337
1338 /* Eat one character at a time. */
1339 h = haystack;
1340 n = needle;
1341
1342 while (*n && *h &&
1343 g_ascii_tolower (*n) == g_ascii_tolower (*h))
1344 {
1345 n++;
1346 h++;
1347 }
1348
1349 return *n == '\0';
1350 }
1351
/* Escaping policies for g_escape_uri_string(). Each value is one bit that
 * is tested against the per-character entries of the acceptable[] table. */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+' */
  UNSAFE_PATH       = 1 << 3,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1359
1360 static const guchar acceptable[96] = {
1361 /* A table of the ASCII chars from space (32) to DEL (127) */
1362 /* ! " # $ % & ' ( ) * + , - . / */
1363 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
1364 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1365 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
1366 /* @ A B C D E F G H I J K L M N O */
1367 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1368 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1369 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
1370 /* ` a b c d e f g h i j k l m n o */
1371 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1372 /* p q r s t u v w x y z { | } ~ DEL */
1373 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
1374 };
1375
1376 static const gchar hex[16] = "0123456789ABCDEF";
1377
1378 /* Note: This escape function works on file: URIs, but if you want to
1379 * escape something else, please read RFC-2396 */
1380 static gchar *
g_escape_uri_string(const gchar * string,UnsafeCharacterSet mask)1381 g_escape_uri_string (const gchar *string,
1382 UnsafeCharacterSet mask)
1383 {
1384 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1385
1386 const gchar *p;
1387 gchar *q;
1388 gchar *result;
1389 int c;
1390 gint unacceptable;
1391 UnsafeCharacterSet use_mask;
1392
1393 g_return_val_if_fail (mask == UNSAFE_ALL
1394 || mask == UNSAFE_ALLOW_PLUS
1395 || mask == UNSAFE_PATH
1396 || mask == UNSAFE_HOST
1397 || mask == UNSAFE_SLASHES, NULL);
1398
1399 unacceptable = 0;
1400 use_mask = mask;
1401 for (p = string; *p != '\0'; p++)
1402 {
1403 c = (guchar) *p;
1404 if (!ACCEPTABLE (c))
1405 unacceptable++;
1406 }
1407
1408 result = g_malloc (p - string + unacceptable * 2 + 1);
1409
1410 use_mask = mask;
1411 for (q = result, p = string; *p != '\0'; p++)
1412 {
1413 c = (guchar) *p;
1414
1415 if (!ACCEPTABLE (c))
1416 {
1417 *q++ = '%'; /* means hex coming */
1418 *q++ = hex[c >> 4];
1419 *q++ = hex[c & 15];
1420 }
1421 else
1422 *q++ = *p;
1423 }
1424
1425 *q = '\0';
1426
1427 return result;
1428 }
1429
1430
1431 static gchar *
g_escape_file_uri(const gchar * hostname,const gchar * pathname)1432 g_escape_file_uri (const gchar *hostname,
1433 const gchar *pathname)
1434 {
1435 char *escaped_hostname = NULL;
1436 char *escaped_path;
1437 char *res;
1438
1439 #ifdef G_OS_WIN32
1440 char *p, *backslash;
1441
1442 /* Turn backslashes into forward slashes. That's what Netscape
1443 * does, and they are actually more or less equivalent in Windows.
1444 */
1445
1446 pathname = g_strdup (pathname);
1447 p = (char *) pathname;
1448
1449 while ((backslash = strchr (p, '\\')) != NULL)
1450 {
1451 *backslash = '/';
1452 p = backslash + 1;
1453 }
1454 #endif
1455
1456 if (hostname && *hostname != '\0')
1457 {
1458 escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1459 }
1460
1461 escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1462
1463 res = g_strconcat ("file://",
1464 (escaped_hostname) ? escaped_hostname : "",
1465 (*escaped_path != '/') ? "/" : "",
1466 escaped_path,
1467 NULL);
1468
1469 #ifdef G_OS_WIN32
1470 g_free ((char *) pathname);
1471 #endif
1472
1473 g_free (escaped_hostname);
1474 g_free (escaped_path);
1475
1476 return res;
1477 }
1478
/* Decodes the two hexadecimal digits at scanner[0] and scanner[1].
 * Returns the byte value (0-255), or -1 if either is not a hex digit.
 * scanner[1] is not read when scanner[0] is invalid. */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return high_nibble * 16 + low_nibble;
}
1495
1496 static gchar *
g_unescape_uri_string(const char * escaped,int len,const char * illegal_escaped_characters,gboolean ascii_must_not_be_escaped)1497 g_unescape_uri_string (const char *escaped,
1498 int len,
1499 const char *illegal_escaped_characters,
1500 gboolean ascii_must_not_be_escaped)
1501 {
1502 const gchar *in, *in_end;
1503 gchar *out, *result;
1504 int c;
1505
1506 if (escaped == NULL)
1507 return NULL;
1508
1509 if (len < 0)
1510 len = strlen (escaped);
1511
1512 result = g_malloc (len + 1);
1513
1514 out = result;
1515 for (in = escaped, in_end = escaped + len; in < in_end; in++)
1516 {
1517 c = *in;
1518
1519 if (c == '%')
1520 {
1521 /* catch partial escape sequences past the end of the substring */
1522 if (in + 3 > in_end)
1523 break;
1524
1525 c = unescape_character (in + 1);
1526
1527 /* catch bad escape sequences and NUL characters */
1528 if (c <= 0)
1529 break;
1530
1531 /* catch escaped ASCII */
1532 if (ascii_must_not_be_escaped && c <= 0x7F)
1533 break;
1534
1535 /* catch other illegal escaped characters */
1536 if (strchr (illegal_escaped_characters, c) != NULL)
1537 break;
1538
1539 in += 2;
1540 }
1541
1542 *out++ = c;
1543 }
1544
1545 g_assert (out - result <= len);
1546 *out = '\0';
1547
1548 if (in != in_end)
1549 {
1550 g_free (result);
1551 return NULL;
1552 }
1553
1554 return result;
1555 }
1556
1557 static gboolean
is_asciialphanum(gunichar c)1558 is_asciialphanum (gunichar c)
1559 {
1560 return c <= 0x7F && g_ascii_isalnum (c);
1561 }
1562
1563 static gboolean
is_asciialpha(gunichar c)1564 is_asciialpha (gunichar c)
1565 {
1566 return c <= 0x7F && g_ascii_isalpha (c);
1567 }
1568
1569 /* allows an empty string */
1570 static gboolean
hostname_validate(const char * hostname)1571 hostname_validate (const char *hostname)
1572 {
1573 const char *p;
1574 gunichar c, first_char, last_char;
1575
1576 p = hostname;
1577 if (*p == '\0')
1578 return TRUE;
1579 do
1580 {
1581 /* read in a label */
1582 c = g_utf8_get_char (p);
1583 p = g_utf8_next_char (p);
1584 if (!is_asciialphanum (c))
1585 return FALSE;
1586 first_char = c;
1587 do
1588 {
1589 last_char = c;
1590 c = g_utf8_get_char (p);
1591 p = g_utf8_next_char (p);
1592 }
1593 while (is_asciialphanum (c) || c == '-');
1594 if (last_char == '-')
1595 return FALSE;
1596
1597 /* if that was the last label, check that it was a toplabel */
1598 if (c == '\0' || (c == '.' && *p == '\0'))
1599 return is_asciialpha (first_char);
1600 }
1601 while (c == '.');
1602 return FALSE;
1603 }
1604
1605 /**
1606 * g_filename_from_uri:
1607 * @uri: a uri describing a filename (escaped, encoded in ASCII).
1608 * @hostname: (out) (optional) (nullable): Location to store hostname for the URI.
1609 * If there is no hostname in the URI, %NULL will be
1610 * stored in this location.
1611 * @error: location to store the error occurring, or %NULL to ignore
1612 * errors. Any of the errors in #GConvertError may occur.
1613 *
1614 * Converts an escaped ASCII-encoded URI to a local filename in the
1615 * encoding used for filenames.
1616 *
1617 * Returns: (type filename): a newly-allocated string holding
1618 * the resulting filename, or %NULL on an error.
1619 **/
1620 gchar *
g_filename_from_uri(const gchar * uri,gchar ** hostname,GError ** error)1621 g_filename_from_uri (const gchar *uri,
1622 gchar **hostname,
1623 GError **error)
1624 {
1625 const char *path_part;
1626 const char *host_part;
1627 char *unescaped_hostname;
1628 char *result;
1629 char *filename;
1630 int offs;
1631 #ifdef G_OS_WIN32
1632 char *p, *slash;
1633 #endif
1634
1635 if (hostname)
1636 *hostname = NULL;
1637
1638 if (!has_case_prefix (uri, "file:/"))
1639 {
1640 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1641 _("The URI “%s” is not an absolute URI using the “file” scheme"),
1642 uri);
1643 return NULL;
1644 }
1645
1646 path_part = uri + strlen ("file:");
1647
1648 if (strchr (path_part, '#') != NULL)
1649 {
1650 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1651 _("The local file URI “%s” may not include a “#”"),
1652 uri);
1653 return NULL;
1654 }
1655
1656 if (has_case_prefix (path_part, "///"))
1657 path_part += 2;
1658 else if (has_case_prefix (path_part, "//"))
1659 {
1660 path_part += 2;
1661 host_part = path_part;
1662
1663 path_part = strchr (path_part, '/');
1664
1665 if (path_part == NULL)
1666 {
1667 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1668 _("The URI “%s” is invalid"),
1669 uri);
1670 return NULL;
1671 }
1672
1673 unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1674
1675 if (unescaped_hostname == NULL ||
1676 !hostname_validate (unescaped_hostname))
1677 {
1678 g_free (unescaped_hostname);
1679 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1680 _("The hostname of the URI “%s” is invalid"),
1681 uri);
1682 return NULL;
1683 }
1684
1685 if (hostname)
1686 *hostname = unescaped_hostname;
1687 else
1688 g_free (unescaped_hostname);
1689 }
1690
1691 filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1692
1693 if (filename == NULL)
1694 {
1695 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1696 _("The URI “%s” contains invalidly escaped characters"),
1697 uri);
1698 return NULL;
1699 }
1700
1701 offs = 0;
1702 #ifdef G_OS_WIN32
1703 /* Drop localhost */
1704 if (hostname && *hostname != NULL &&
1705 g_ascii_strcasecmp (*hostname, "localhost") == 0)
1706 {
1707 g_free (*hostname);
1708 *hostname = NULL;
1709 }
1710
1711 /* Turn slashes into backslashes, because that's the canonical spelling */
1712 p = filename;
1713 while ((slash = strchr (p, '/')) != NULL)
1714 {
1715 *slash = '\\';
1716 p = slash + 1;
1717 }
1718
1719 /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1720 * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1721 * the filename from the drive letter.
1722 */
1723 if (g_ascii_isalpha (filename[1]))
1724 {
1725 if (filename[2] == ':')
1726 offs = 1;
1727 else if (filename[2] == '|')
1728 {
1729 filename[2] = ':';
1730 offs = 1;
1731 }
1732 }
1733 #endif
1734
1735 result = g_strdup (filename + offs);
1736 g_free (filename);
1737
1738 return result;
1739 }
1740
1741 /**
1742 * g_filename_to_uri:
1743 * @filename: (type filename): an absolute filename specified in the GLib file
1744 * name encoding, which is the on-disk file name bytes on Unix, and UTF-8
1745 * on Windows
1746 * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none.
1747 * @error: location to store the error occurring, or %NULL to ignore
1748 * errors. Any of the errors in #GConvertError may occur.
1749 *
1750 * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1751 * component following Section 3.3. of RFC 2396.
1752 *
1753 * Returns: a newly-allocated string holding the resulting
1754 * URI, or %NULL on an error.
1755 **/
1756 gchar *
g_filename_to_uri(const gchar * filename,const gchar * hostname,GError ** error)1757 g_filename_to_uri (const gchar *filename,
1758 const gchar *hostname,
1759 GError **error)
1760 {
1761 char *escaped_uri;
1762
1763 g_return_val_if_fail (filename != NULL, NULL);
1764
1765 if (!g_path_is_absolute (filename))
1766 {
1767 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1768 _("The pathname “%s” is not an absolute path"),
1769 filename);
1770 return NULL;
1771 }
1772
1773 if (hostname &&
1774 !(g_utf8_validate (hostname, -1, NULL)
1775 && hostname_validate (hostname)))
1776 {
1777 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1778 _("Invalid hostname"));
1779 return NULL;
1780 }
1781
1782 #ifdef G_OS_WIN32
1783 /* Don't use localhost unnecessarily */
1784 if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
1785 hostname = NULL;
1786 #endif
1787
1788 escaped_uri = g_escape_file_uri (hostname, filename);
1789
1790 return escaped_uri;
1791 }
1792
1793 /**
1794 * g_uri_list_extract_uris:
1795 * @uri_list: an URI list
1796 *
1797 * Splits an URI list conforming to the text/uri-list
1798 * mime type defined in RFC 2483 into individual URIs,
1799 * discarding any comments. The URIs are not validated.
1800 *
1801 * Returns: (transfer full): a newly allocated %NULL-terminated list
1802 * of strings holding the individual URIs. The array should be freed
1803 * with g_strfreev().
1804 *
1805 * Since: 2.6
1806 */
1807 gchar **
g_uri_list_extract_uris(const gchar * uri_list)1808 g_uri_list_extract_uris (const gchar *uri_list)
1809 {
1810 GPtrArray *uris;
1811 const gchar *p, *q;
1812
1813 uris = g_ptr_array_new ();
1814
1815 p = uri_list;
1816
1817 /* We don't actually try to validate the URI according to RFC
1818 * 2396, or even check for allowed characters - we just ignore
1819 * comments and trim whitespace off the ends. We also
1820 * allow LF delimination as well as the specified CRLF.
1821 *
1822 * We do allow comments like specified in RFC 2483.
1823 */
1824 while (p)
1825 {
1826 if (*p != '#')
1827 {
1828 while (g_ascii_isspace (*p))
1829 p++;
1830
1831 q = p;
1832 while (*q && (*q != '\n') && (*q != '\r'))
1833 q++;
1834
1835 if (q > p)
1836 {
1837 q--;
1838 while (q > p && g_ascii_isspace (*q))
1839 q--;
1840
1841 if (q > p)
1842 g_ptr_array_add (uris, g_strndup (p, q - p + 1));
1843 }
1844 }
1845 p = strchr (p, '\n');
1846 if (p)
1847 p++;
1848 }
1849
1850 g_ptr_array_add (uris, NULL);
1851
1852 return (gchar **) g_ptr_array_free (uris, FALSE);
1853 }
1854
1855 /**
1856 * g_filename_display_basename:
1857 * @filename: (type filename): an absolute pathname in the
1858 * GLib file name encoding
1859 *
1860 * Returns the display basename for the particular filename, guaranteed
1861 * to be valid UTF-8. The display name might not be identical to the filename,
1862 * for instance there might be problems converting it to UTF-8, and some files
1863 * can be translated in the display.
1864 *
1865 * If GLib cannot make sense of the encoding of @filename, as a last resort it
1866 * replaces unknown characters with U+FFFD, the Unicode replacement character.
1867 * You can search the result for the UTF-8 encoding of this character (which is
1868 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1869 * encoding.
1870 *
 * You must pass the whole absolute pathname to this function so that
1872 * translation of well known locations can be done.
1873 *
1874 * This function is preferred over g_filename_display_name() if you know the
1875 * whole path, as it allows translation.
1876 *
1877 * Returns: a newly allocated string containing
1878 * a rendition of the basename of the filename in valid UTF-8
1879 *
1880 * Since: 2.6
1881 **/
1882 gchar *
g_filename_display_basename(const gchar * filename)1883 g_filename_display_basename (const gchar *filename)
1884 {
1885 char *basename;
1886 char *display_name;
1887
1888 g_return_val_if_fail (filename != NULL, NULL);
1889
1890 basename = g_path_get_basename (filename);
1891 display_name = g_filename_display_name (basename);
1892 g_free (basename);
1893 return display_name;
1894 }
1895
1896 /**
1897 * g_filename_display_name:
1898 * @filename: (type filename): a pathname hopefully in the
1899 * GLib file name encoding
1900 *
1901 * Converts a filename into a valid UTF-8 string. The conversion is
1902 * not necessarily reversible, so you should keep the original around
1903 * and use the return value of this function only for display purposes.
1904 * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
1905 * even if the filename actually isn't in the GLib file name encoding.
1906 *
1907 * If GLib cannot make sense of the encoding of @filename, as a last resort it
1908 * replaces unknown characters with U+FFFD, the Unicode replacement character.
1909 * You can search the result for the UTF-8 encoding of this character (which is
1910 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1911 * encoding.
1912 *
1913 * If you know the whole pathname of the file you should use
1914 * g_filename_display_basename(), since that allows location-based
1915 * translation of filenames.
1916 *
1917 * Returns: a newly allocated string containing
1918 * a rendition of the filename in valid UTF-8
1919 *
1920 * Since: 2.6
1921 **/
1922 gchar *
g_filename_display_name(const gchar * filename)1923 g_filename_display_name (const gchar *filename)
1924 {
1925 gint i;
1926 const gchar **charsets;
1927 gchar *display_name = NULL;
1928 gboolean is_utf8;
1929
1930 is_utf8 = g_get_filename_charsets (&charsets);
1931
1932 if (is_utf8)
1933 {
1934 if (g_utf8_validate (filename, -1, NULL))
1935 display_name = g_strdup (filename);
1936 }
1937
1938 if (!display_name)
1939 {
1940 /* Try to convert from the filename charsets to UTF-8.
1941 * Skip the first charset if it is UTF-8.
1942 */
1943 for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
1944 {
1945 display_name = g_convert (filename, -1, "UTF-8", charsets[i],
1946 NULL, NULL, NULL);
1947
1948 if (display_name)
1949 break;
1950 }
1951 }
1952
1953 /* if all conversions failed, we replace invalid UTF-8
1954 * by a question mark
1955 */
1956 if (!display_name)
1957 display_name = g_utf8_make_valid (filename, -1);
1958
1959 return display_name;
1960 }
1961
1962 #ifdef G_OS_WIN32
1963
/* Binary compatibility versions. Not for newly compiled code.
 *
 * These declarations are only compiled when G_OS_WIN32 is defined.
 * Each *_utf8 symbol declared here is defined further down in this
 * file as a plain forwarder to the function of the same name without
 * the suffix (for example g_filename_to_utf8_utf8() simply calls
 * g_filename_to_utf8()).
 *
 * NOTE(review): the prototypes are presumably spelled out here because
 * no public header declares these symbols any more -- confirm.
 */

_GLIB_EXTERN gchar *g_filename_to_utf8_utf8   (const gchar  *opsysstring,
                                               gssize        len,
                                               gsize        *bytes_read,
                                               gsize        *bytes_written,
                                               GError      **error) G_GNUC_MALLOC;
_GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar  *utf8string,
                                               gssize        len,
                                               gsize        *bytes_read,
                                               gsize        *bytes_written,
                                               GError      **error) G_GNUC_MALLOC;
_GLIB_EXTERN gchar *g_filename_from_uri_utf8  (const gchar  *uri,
                                               gchar       **hostname,
                                               GError      **error) G_GNUC_MALLOC;
_GLIB_EXTERN gchar *g_filename_to_uri_utf8    (const gchar  *filename,
                                               const gchar  *hostname,
                                               GError      **error) G_GNUC_MALLOC;
1982
1983 gchar *
g_filename_to_utf8_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1984 g_filename_to_utf8_utf8 (const gchar *opsysstring,
1985 gssize len,
1986 gsize *bytes_read,
1987 gsize *bytes_written,
1988 GError **error)
1989 {
1990 return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);
1991 }
1992
1993 gchar *
g_filename_from_utf8_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1994 g_filename_from_utf8_utf8 (const gchar *utf8string,
1995 gssize len,
1996 gsize *bytes_read,
1997 gsize *bytes_written,
1998 GError **error)
1999 {
2000 return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);
2001 }
2002
2003 gchar *
g_filename_from_uri_utf8(const gchar * uri,gchar ** hostname,GError ** error)2004 g_filename_from_uri_utf8 (const gchar *uri,
2005 gchar **hostname,
2006 GError **error)
2007 {
2008 return g_filename_from_uri (uri, hostname, error);
2009 }
2010
2011 gchar *
g_filename_to_uri_utf8(const gchar * filename,const gchar * hostname,GError ** error)2012 g_filename_to_uri_utf8 (const gchar *filename,
2013 const gchar *hostname,
2014 GError **error)
2015 {
2016 return g_filename_to_uri (filename, hostname, error);
2017 }
2018
2019 #endif
2020