• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GLIB - Library of useful routines for C programming
2  *
3  * gconvert.c: Convert between character sets using iconv
4  * Copyright Red Hat Inc., 2000
5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "config.h"
22 #include "glibconfig.h"
23 
24 #ifndef G_OS_WIN32
25 #include <iconv.h>
26 #endif
27 #include <errno.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdlib.h>
31 
32 #ifdef G_OS_WIN32
33 #include "win_iconv.c"
34 #endif
35 
36 #ifdef G_PLATFORM_WIN32
37 #define STRICT
38 #include <windows.h>
39 #undef STRICT
40 #endif
41 
42 #include "gconvert.h"
43 
44 #include "gcharsetprivate.h"
45 #include "gslist.h"
46 #include "gstrfuncs.h"
47 #include "gtestutils.h"
48 #include "gthread.h"
49 #include "gthreadprivate.h"
50 #include "gunicode.h"
51 #include "gfileutils.h"
52 
53 #include "glibintl.h"
54 
55 
56 /**
57  * SECTION:conversions
58  * @title: Character Set Conversion
59  * @short_description: convert strings between different character sets
60  *
61  * The g_convert() family of functions wraps the functionality of iconv().
62  * In addition to pure character set conversions, GLib has functions to
63  * deal with the extra complications of encodings for file names.
64  *
65  * ## File Name Encodings
66  *
67  * Historically, UNIX has not had a defined encoding for file names:
68  * a file name is valid as long as it does not have path separators
69  * in it ("/"). However, displaying file names may require conversion:
70  * from the character set in which they were created, to the character
71  * set in which the application operates. Consider the Spanish file name
72  * "Presentación.sxi". If the application which created it uses
73  * ISO-8859-1 for its encoding,
74  * |[
75  * Character:  P  r  e  s  e  n  t  a  c  i  ó  n  .  s  x  i
76  * Hex code:   50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69
77  * ]|
78  * However, if the application uses UTF-8, the actual file name on
79  * disk would look like this:
80  * |[
81  * Character:  P  r  e  s  e  n  t  a  c  i  ó     n  .  s  x  i
82  * Hex code:   50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69
83  * ]|
84  * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use
85  * GLib do the same thing. If you get a file name from the file system,
86  * for example, from readdir() or from g_dir_read_name(), and you wish
87  * to display the file name to the user, you will need to convert it
88  * into UTF-8. The opposite case is when the user types the name of a
89  * file they wish to save: the toolkit will give you that string in
90  * UTF-8 encoding, and you will need to convert it to the character
91  * set used for file names before you can create the file with open()
92  * or fopen().
93  *
94  * By default, GLib assumes that file names on disk are in UTF-8
95  * encoding. This is a valid assumption for file systems which
96  * were created relatively recently: most applications use UTF-8
97  * encoding for their strings, and that is also what they use for
98  * the file names they create. However, older file systems may
99  * still contain file names created in "older" encodings, such as
100  * ISO-8859-1. In this case, for compatibility reasons, you may want
101  * to instruct GLib to use that particular encoding for file names
102  * rather than UTF-8. You can do this by specifying the encoding for
103  * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING]
104  * environment variable. For example, if your installation uses
105  * ISO-8859-1 for file names, you can put this in your `~/.profile`:
106  * |[
107  * export G_FILENAME_ENCODING=ISO-8859-1
108  * ]|
109  * GLib provides the functions g_filename_to_utf8() and
110  * g_filename_from_utf8() to perform the necessary conversions.
111  * These functions convert file names from the encoding specified
112  * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This
113  * [diagram][file-name-encodings-diagram] illustrates how
114  * these functions are used to convert between UTF-8 and the
115  * encoding for file names in the file system.
116  *
117  * ## Conversion between file name encodings # {#file-name-encodings-diagram}
118  *
119  * ![](file-name-encodings.png)
120  *
121  * ## Checklist for Application Writers
122  *
123  * This section is a practical summary of the detailed
124  * things to do to make sure your applications process file
125  * name encodings correctly.
126  *
127  * 1. If you get a file name from the file system from a function
128  *    such as readdir() or gtk_file_chooser_get_filename(), you do
129  *    not need to do any conversion to pass that file name to
130  *    functions like open(), rename(), or fopen() -- those are "raw"
131  *    file names which the file system understands.
132  *
133  * 2. If you need to display a file name, convert it to UTF-8 first
134  *    by using g_filename_to_utf8(). If conversion fails, display a
135  *    string like "Unknown file name". Do not convert this string back
136  *    into the encoding used for file names if you wish to pass it to
137  *    the file system; use the original file name instead.
138  *
139  *    For example, the document window of a word processor could display
140  *    "Unknown file name" in its title bar but still let the user save
141  *    the file, as it would keep the raw file name internally. This
142  *    can happen if the user has not set the `G_FILENAME_ENCODING`
143  *    environment variable even though he has files whose names are
144  *    not encoded in UTF-8.
145  *
146  * 3. If your user interface lets the user type a file name for saving
147  *    or renaming, convert it to the encoding used for file names in
148  *    the file system by using g_filename_from_utf8(). Pass the converted
149  *    file name to functions like fopen(). If conversion fails, ask the
150  *    user to enter a different file name. This can happen if the user
151  *    types Japanese characters when `G_FILENAME_ENCODING` is set to
152  *    `ISO-8859-1`, for example.
153  */
154 
155 /* We try to terminate strings in unknown charsets with this many zero bytes
156  * to ensure that multibyte strings really are nul-terminated when we return
157  * them from g_convert() and friends.
158  */
159 #define NUL_TERMINATOR_LENGTH 4
160 
/* Error-domain quark used by the G_CONVERT_ERROR errors raised in this file.
 * NOTE(review): presumably expands to the g_convert_error_quark() accessor;
 * confirm against the G_DEFINE_QUARK definition in gquark.h.
 */
G_DEFINE_QUARK(g_convert_error,g_convert_error)161 G_DEFINE_QUARK (g_convert_error, g_convert_error)
162 
163 static gboolean
164 try_conversion (const char *to_codeset,
165 		const char *from_codeset,
166 		iconv_t    *cd)
167 {
168   *cd = iconv_open (to_codeset, from_codeset);
169 
170   if (*cd == (iconv_t)-1 && errno == EINVAL)
171     return FALSE;
172   else
173     return TRUE;
174 }
175 
176 static gboolean
try_to_aliases(const char ** to_aliases,const char * from_codeset,iconv_t * cd)177 try_to_aliases (const char **to_aliases,
178 		const char  *from_codeset,
179 		iconv_t     *cd)
180 {
181   if (to_aliases)
182     {
183       const char **p = to_aliases;
184       while (*p)
185 	{
186 	  if (try_conversion (*p, from_codeset, cd))
187 	    return TRUE;
188 
189 	  p++;
190 	}
191     }
192 
193   return FALSE;
194 }
195 
196 /**
197  * g_iconv_open: (skip)
198  * @to_codeset: destination codeset
199  * @from_codeset: source codeset
200  *
201  * Same as the standard UNIX routine iconv_open(), but
202  * may be implemented via libiconv on UNIX flavors that lack
203  * a native implementation.
204  *
205  * GLib provides g_convert() and g_locale_to_utf8() which are likely
206  * more convenient than the raw iconv wrappers.
207  *
208  * Returns: a "conversion descriptor", or (GIConv)-1 if
209  *  opening the converter failed.
210  **/
211 GIConv
g_iconv_open(const gchar * to_codeset,const gchar * from_codeset)212 g_iconv_open (const gchar  *to_codeset,
213 	      const gchar  *from_codeset)
214 {
215   iconv_t cd;
216 
217   if (!try_conversion (to_codeset, from_codeset, &cd))
218     {
219       const char **to_aliases = _g_charset_get_aliases (to_codeset);
220       const char **from_aliases = _g_charset_get_aliases (from_codeset);
221 
222       if (from_aliases)
223 	{
224 	  const char **p = from_aliases;
225 	  while (*p)
226 	    {
227 	      if (try_conversion (to_codeset, *p, &cd))
228 		goto out;
229 
230 	      if (try_to_aliases (to_aliases, *p, &cd))
231 		goto out;
232 
233 	      p++;
234 	    }
235 	}
236 
237       if (try_to_aliases (to_aliases, from_codeset, &cd))
238 	goto out;
239     }
240 
241  out:
242   return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
243 }
244 
245 /**
246  * g_iconv: (skip)
247  * @converter: conversion descriptor from g_iconv_open()
248  * @inbuf: bytes to convert
249  * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
250  * @outbuf: converted output bytes
251  * @outbytes_left: inout parameter, bytes available to fill in @outbuf
252  *
253  * Same as the standard UNIX routine iconv(), but
254  * may be implemented via libiconv on UNIX flavors that lack
255  * a native implementation.
256  *
257  * GLib provides g_convert() and g_locale_to_utf8() which are likely
258  * more convenient than the raw iconv wrappers.
259  *
260  * Note that the behaviour of iconv() for characters which are valid in the
261  * input character set, but which have no representation in the output character
262  * set, is implementation defined. This function may return success (with a
263  * positive number of non-reversible conversions as replacement characters were
264  * used), or it may return -1 and set an error such as %EILSEQ, in such a
265  * situation.
266  *
267  * Returns: count of non-reversible conversions, or -1 on error
268  **/
269 gsize
g_iconv(GIConv converter,gchar ** inbuf,gsize * inbytes_left,gchar ** outbuf,gsize * outbytes_left)270 g_iconv (GIConv   converter,
271 	 gchar  **inbuf,
272 	 gsize   *inbytes_left,
273 	 gchar  **outbuf,
274 	 gsize   *outbytes_left)
275 {
276   iconv_t cd = (iconv_t)converter;
277 
278   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
279 }
280 
281 /**
282  * g_iconv_close: (skip)
283  * @converter: a conversion descriptor from g_iconv_open()
284  *
285  * Same as the standard UNIX routine iconv_close(), but
286  * may be implemented via libiconv on UNIX flavors that lack
287  * a native implementation. Should be called to clean up
288  * the conversion descriptor from g_iconv_open() when
289  * you are done converting things.
290  *
291  * GLib provides g_convert() and g_locale_to_utf8() which are likely
292  * more convenient than the raw iconv wrappers.
293  *
294  * Returns: -1 on error, 0 on success
295  **/
296 gint
g_iconv_close(GIConv converter)297 g_iconv_close (GIConv converter)
298 {
299   iconv_t cd = (iconv_t)converter;
300 
301   return iconv_close (cd);
302 }
303 
304 static GIConv
open_converter(const gchar * to_codeset,const gchar * from_codeset,GError ** error)305 open_converter (const gchar *to_codeset,
306 		const gchar *from_codeset,
307 		GError     **error)
308 {
309   GIConv cd;
310 
311   cd = g_iconv_open (to_codeset, from_codeset);
312 
313   if (cd == (GIConv) -1)
314     {
315       /* Something went wrong.  */
316       if (error)
317 	{
318 	  if (errno == EINVAL)
319 	    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
320 			 _("Conversion from character set “%s” to “%s” is not supported"),
321 			 from_codeset, to_codeset);
322 	  else
323 	    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
324 			 _("Could not open converter from “%s” to “%s”"),
325 			 from_codeset, to_codeset);
326 	}
327     }
328 
329   return cd;
330 }
331 
332 static int
close_converter(GIConv cd)333 close_converter (GIConv cd)
334 {
335   if (cd == (GIConv) -1)
336     return 0;
337 
338   return g_iconv_close (cd);
339 }
340 
341 /**
342  * g_convert_with_iconv: (skip)
343  * @str:           (array length=len) (element-type guint8):
344  *                 the string to convert.
345  * @len:           the length of the string in bytes, or -1 if the string is
346  *                 nul-terminated (Note that some encodings may allow nul
347  *                 bytes to occur inside strings. In that case, using -1
348  *                 for the @len parameter is unsafe)
349  * @converter:     conversion descriptor from g_iconv_open()
350  * @bytes_read:    (out) (optional): location to store the number of bytes in
351  *                 the input string that were successfully converted, or %NULL.
352  *                 Even if the conversion was successful, this may be
353  *                 less than @len if there were partial characters
354  *                 at the end of the input. If the error
355  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
356  *                 stored will be the byte offset after the last valid
357  *                 input sequence.
358  * @bytes_written: (out) (optional): the number of bytes stored in
359  *                 the output buffer (not including the terminating nul).
360  * @error:         location to store the error occurring, or %NULL to ignore
361  *                 errors. Any of the errors in #GConvertError may occur.
362  *
363  * Converts a string from one character set to another.
364  *
365  * Note that you should use g_iconv() for streaming conversions.
366  * Despite the fact that @bytes_read can return information about partial
367  * characters, the g_convert_... functions are not generally suitable
368  * for streaming. If the underlying converter maintains internal state,
369  * then this won't be preserved across successive calls to g_convert(),
370  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
371  * this is the GNU C converter for CP1255 which does not emit a base
372  * character until it knows that the next character is not a mark that
373  * could combine with the base character.)
374  *
375  * Characters which are valid in the input character set, but which have no
376  * representation in the output character set will result in a
377  * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv()
378  * specification, which leaves this behaviour implementation defined. Note that
379  * this is the same error code as is returned for an invalid byte sequence in
380  * the input character set. To get defined behaviour for conversion of
381  * unrepresentable characters, use g_convert_with_fallback().
382  *
383  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
384  *               If the conversion was successful, a newly allocated buffer
385  *               containing the converted string, which must be freed with
386  *               g_free(). Otherwise %NULL and @error will be set.
387  **/
388 gchar*
g_convert_with_iconv(const gchar * str,gssize len,GIConv converter,gsize * bytes_read,gsize * bytes_written,GError ** error)389 g_convert_with_iconv (const gchar *str,
390 		      gssize       len,
391 		      GIConv       converter,
392 		      gsize       *bytes_read,
393 		      gsize       *bytes_written,
394 		      GError     **error)
395 {
396   gchar *dest;
397   gchar *outp;
398   const gchar *p;
399   gsize inbytes_remaining;
400   gsize outbytes_remaining;
401   gsize err;
402   gsize outbuf_size;
403   gboolean have_error = FALSE;
404   gboolean done = FALSE;
405   gboolean reset = FALSE;
406 
407   g_return_val_if_fail (converter != (GIConv) -1, NULL);
408 
409   if (len < 0)
410     len = strlen (str);
411 
412   p = str;
413   inbytes_remaining = len;
414   outbuf_size = len + NUL_TERMINATOR_LENGTH;
415 
416   outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
417   outp = dest = g_malloc (outbuf_size);
418 
419   while (!done && !have_error)
420     {
421       if (reset)
422         err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
423       else
424         err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
425 
426       if (err == (gsize) -1)
427 	{
428 	  switch (errno)
429 	    {
430 	    case EINVAL:
431 	      /* Incomplete text, do not report an error */
432 	      done = TRUE;
433 	      break;
434 	    case E2BIG:
435 	      {
436 		gsize used = outp - dest;
437 
438 		outbuf_size *= 2;
439 		dest = g_realloc (dest, outbuf_size);
440 
441 		outp = dest + used;
442 		outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
443 	      }
444 	      break;
445 	    case EILSEQ:
446               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
447                                    _("Invalid byte sequence in conversion input"));
448 	      have_error = TRUE;
449 	      break;
450 	    default:
451               {
452                 int errsv = errno;
453 
454                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
455                              _("Error during conversion: %s"),
456                              g_strerror (errsv));
457               }
458 	      have_error = TRUE;
459 	      break;
460 	    }
461 	}
462       else if (err > 0)
463         {
464           /* @err gives the number of replacement characters used. */
465           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
466                                _("Unrepresentable character in conversion input"));
467           have_error = TRUE;
468         }
469       else
470 	{
471 	  if (!reset)
472 	    {
473 	      /* call g_iconv with NULL inbuf to cleanup shift state */
474 	      reset = TRUE;
475 	      inbytes_remaining = 0;
476 	    }
477 	  else
478 	    done = TRUE;
479 	}
480     }
481 
482   memset (outp, 0, NUL_TERMINATOR_LENGTH);
483 
484   if (bytes_read)
485     *bytes_read = p - str;
486   else
487     {
488       if ((p - str) != len)
489 	{
490           if (!have_error)
491             {
492               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
493                                    _("Partial character sequence at end of input"));
494               have_error = TRUE;
495             }
496 	}
497     }
498 
499   if (bytes_written)
500     *bytes_written = outp - dest;	/* Doesn't include '\0' */
501 
502   if (have_error)
503     {
504       g_free (dest);
505       return NULL;
506     }
507   else
508     return dest;
509 }
510 
511 /**
512  * g_convert:
513  * @str:           (array length=len) (element-type guint8):
514  *                 the string to convert.
515  * @len:           the length of the string in bytes, or -1 if the string is
516  *                 nul-terminated (Note that some encodings may allow nul
517  *                 bytes to occur inside strings. In that case, using -1
518  *                 for the @len parameter is unsafe)
519  * @to_codeset:    name of character set into which to convert @str
520  * @from_codeset:  character set of @str.
521  * @bytes_read:    (out) (optional): location to store the number of bytes in
522  *                 the input string that were successfully converted, or %NULL.
523  *                 Even if the conversion was successful, this may be
524  *                 less than @len if there were partial characters
525  *                 at the end of the input. If the error
526  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
527  *                 stored will be the byte offset after the last valid
528  *                 input sequence.
529  * @bytes_written: (out) (optional): the number of bytes stored in
530  *                 the output buffer (not including the terminating nul).
531  * @error:         location to store the error occurring, or %NULL to ignore
532  *                 errors. Any of the errors in #GConvertError may occur.
533  *
534  * Converts a string from one character set to another.
535  *
536  * Note that you should use g_iconv() for streaming conversions.
537  * Despite the fact that @bytes_read can return information about partial
538  * characters, the g_convert_... functions are not generally suitable
539  * for streaming. If the underlying converter maintains internal state,
540  * then this won't be preserved across successive calls to g_convert(),
541  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
542  * this is the GNU C converter for CP1255 which does not emit a base
543  * character until it knows that the next character is not a mark that
544  * could combine with the base character.)
545  *
546  * Using extensions such as "//TRANSLIT" may not work (or may not work
547  * well) on many platforms.  Consider using g_str_to_ascii() instead.
548  *
549  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
550  *          If the conversion was successful, a newly allocated buffer
551  *          containing the converted string, which must be freed with g_free().
552  *          Otherwise %NULL and @error will be set.
553  **/
554 gchar*
g_convert(const gchar * str,gssize len,const gchar * to_codeset,const gchar * from_codeset,gsize * bytes_read,gsize * bytes_written,GError ** error)555 g_convert (const gchar *str,
556            gssize       len,
557            const gchar *to_codeset,
558            const gchar *from_codeset,
559            gsize       *bytes_read,
560 	   gsize       *bytes_written,
561 	   GError     **error)
562 {
563   gchar *res;
564   GIConv cd;
565 
566   g_return_val_if_fail (str != NULL, NULL);
567   g_return_val_if_fail (to_codeset != NULL, NULL);
568   g_return_val_if_fail (from_codeset != NULL, NULL);
569 
570   cd = open_converter (to_codeset, from_codeset, error);
571 
572   if (cd == (GIConv) -1)
573     {
574       if (bytes_read)
575         *bytes_read = 0;
576 
577       if (bytes_written)
578         *bytes_written = 0;
579 
580       return NULL;
581     }
582 
583   res = g_convert_with_iconv (str, len, cd,
584 			      bytes_read, bytes_written,
585 			      error);
586 
587   close_converter (cd);
588 
589   return res;
590 }
591 
592 /**
593  * g_convert_with_fallback:
594  * @str:          (array length=len) (element-type guint8):
595  *                the string to convert.
596  * @len:          the length of the string in bytes, or -1 if the string is
597  *                 nul-terminated (Note that some encodings may allow nul
598  *                 bytes to occur inside strings. In that case, using -1
599  *                 for the @len parameter is unsafe)
600  * @to_codeset:   name of character set into which to convert @str
601  * @from_codeset: character set of @str.
602  * @fallback:     UTF-8 string to use in place of characters not
603  *                present in the target encoding. (The string must be
604  *                representable in the target encoding).
605  *                If %NULL, characters not in the target encoding will
606  *                be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
607  * @bytes_read:   (out) (optional): location to store the number of bytes in
608  *                the input string that were successfully converted, or %NULL.
609  *                Even if the conversion was successful, this may be
610  *                less than @len if there were partial characters
611  *                at the end of the input.
612  * @bytes_written: (out) (optional): the number of bytes stored in
613  *                 the output buffer (not including the terminating nul).
614  * @error:        location to store the error occurring, or %NULL to ignore
615  *                errors. Any of the errors in #GConvertError may occur.
616  *
617  * Converts a string from one character set to another, possibly
618  * including fallback sequences for characters not representable
619  * in the output. Note that it is not guaranteed that the specification
620  * for the fallback sequences in @fallback will be honored. Some
621  * systems may do an approximate conversion from @from_codeset
622  * to @to_codeset in their iconv() functions,
623  * in which case GLib will simply return that approximate conversion.
624  *
625  * Note that you should use g_iconv() for streaming conversions.
626  * Despite the fact that @bytes_read can return information about partial
627  * characters, the g_convert_... functions are not generally suitable
628  * for streaming. If the underlying converter maintains internal state,
629  * then this won't be preserved across successive calls to g_convert(),
630  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
631  * this is the GNU C converter for CP1255 which does not emit a base
632  * character until it knows that the next character is not a mark that
633  * could combine with the base character.)
634  *
635  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
636  *          If the conversion was successful, a newly allocated buffer
637  *          containing the converted string, which must be freed with g_free().
638  *          Otherwise %NULL and @error will be set.
639  **/
640 gchar*
g_convert_with_fallback(const gchar * str,gssize len,const gchar * to_codeset,const gchar * from_codeset,const gchar * fallback,gsize * bytes_read,gsize * bytes_written,GError ** error)641 g_convert_with_fallback (const gchar *str,
642 			 gssize       len,
643 			 const gchar *to_codeset,
644 			 const gchar *from_codeset,
645 			 const gchar *fallback,
646 			 gsize       *bytes_read,
647 			 gsize       *bytes_written,
648 			 GError     **error)
649 {
650   gchar *utf8;
651   gchar *dest;
652   gchar *outp;
653   const gchar *insert_str = NULL;
654   const gchar *p;
655   gsize inbytes_remaining;
656   const gchar *save_p = NULL;
657   gsize save_inbytes = 0;
658   gsize outbytes_remaining;
659   gsize err;
660   GIConv cd;
661   gsize outbuf_size;
662   gboolean have_error = FALSE;
663   gboolean done = FALSE;
664 
665   GError *local_error = NULL;
666 
667   g_return_val_if_fail (str != NULL, NULL);
668   g_return_val_if_fail (to_codeset != NULL, NULL);
669   g_return_val_if_fail (from_codeset != NULL, NULL);
670 
671   if (len < 0)
672     len = strlen (str);
673 
674   /* Try an exact conversion; we only proceed if this fails
675    * due to an illegal sequence in the input string.
676    */
677   dest = g_convert (str, len, to_codeset, from_codeset,
678 		    bytes_read, bytes_written, &local_error);
679   if (!local_error)
680     return dest;
681 
682   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
683     {
684       g_propagate_error (error, local_error);
685       return NULL;
686     }
687   else
688     g_error_free (local_error);
689 
690   local_error = NULL;
691 
692   /* No go; to proceed, we need a converter from "UTF-8" to
693    * to_codeset, and the string as UTF-8.
694    */
695   cd = open_converter (to_codeset, "UTF-8", error);
696   if (cd == (GIConv) -1)
697     {
698       if (bytes_read)
699         *bytes_read = 0;
700 
701       if (bytes_written)
702         *bytes_written = 0;
703 
704       return NULL;
705     }
706 
707   utf8 = g_convert (str, len, "UTF-8", from_codeset,
708 		    bytes_read, &inbytes_remaining, error);
709   if (!utf8)
710     {
711       close_converter (cd);
712       if (bytes_written)
713         *bytes_written = 0;
714       return NULL;
715     }
716 
717   /* Now the heart of the code. We loop through the UTF-8 string, and
718    * whenever we hit an offending character, we form fallback, convert
719    * the fallback to the target codeset, and then go back to
720    * converting the original string after finishing with the fallback.
721    *
722    * The variables save_p and save_inbytes store the input state
723    * for the original string while we are converting the fallback
724    */
725   p = utf8;
726 
727   outbuf_size = len + NUL_TERMINATOR_LENGTH;
728   outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
729   outp = dest = g_malloc (outbuf_size);
730 
731   while (!done && !have_error)
732     {
733       gsize inbytes_tmp = inbytes_remaining;
734       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
735       inbytes_remaining = inbytes_tmp;
736 
737       if (err == (gsize) -1)
738 	{
739 	  switch (errno)
740 	    {
741 	    case EINVAL:
742 	      g_assert_not_reached();
743 	      break;
744 	    case E2BIG:
745 	      {
746 		gsize used = outp - dest;
747 
748 		outbuf_size *= 2;
749 		dest = g_realloc (dest, outbuf_size);
750 
751 		outp = dest + used;
752 		outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
753 
754 		break;
755 	      }
756 	    case EILSEQ:
757 	      if (save_p)
758 		{
759 		  /* Error converting fallback string - fatal
760 		   */
761 		  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
762 			       _("Cannot convert fallback “%s” to codeset “%s”"),
763 			       insert_str, to_codeset);
764 		  have_error = TRUE;
765 		  break;
766 		}
767 	      else if (p)
768 		{
769 		  if (!fallback)
770 		    {
771 		      gunichar ch = g_utf8_get_char (p);
772 		      insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
773 						    ch);
774 		    }
775 		  else
776 		    insert_str = fallback;
777 
778 		  save_p = g_utf8_next_char (p);
779 		  save_inbytes = inbytes_remaining - (save_p - p);
780 		  p = insert_str;
781 		  inbytes_remaining = strlen (p);
782 		  break;
783 		}
784               /* if p is null */
785               G_GNUC_FALLTHROUGH;
786 	    default:
787               {
788                 int errsv = errno;
789 
790                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
791                              _("Error during conversion: %s"),
792                              g_strerror (errsv));
793               }
794 
795 	      have_error = TRUE;
796 	      break;
797 	    }
798 	}
799       else
800 	{
801 	  if (save_p)
802 	    {
803 	      if (!fallback)
804 		g_free ((gchar *)insert_str);
805 	      p = save_p;
806 	      inbytes_remaining = save_inbytes;
807 	      save_p = NULL;
808 	    }
809 	  else if (p)
810 	    {
811 	      /* call g_iconv with NULL inbuf to cleanup shift state */
812 	      p = NULL;
813 	      inbytes_remaining = 0;
814 	    }
815 	  else
816 	    done = TRUE;
817 	}
818     }
819 
820   /* Cleanup
821    */
822   memset (outp, 0, NUL_TERMINATOR_LENGTH);
823 
824   close_converter (cd);
825 
826   if (bytes_written)
827     *bytes_written = outp - dest;	/* Doesn't include '\0' */
828 
829   g_free (utf8);
830 
831   if (have_error)
832     {
833       if (save_p && !fallback)
834 	g_free ((gchar *)insert_str);
835       g_free (dest);
836       return NULL;
837     }
838   else
839     return dest;
840 }
841 
842 /*
843  * g_locale_to_utf8
844  *
845  *
846  */
847 
848 /*
849  * Validate @string as UTF-8. @len can be negative if @string is
850  * nul-terminated, or a non-negative value in bytes. If @string ends in an
851  * incomplete sequence, or contains any illegal sequences or nul codepoints,
852  * %NULL will be returned and the error set to
853  * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
854  * On success, @bytes_read and @bytes_written, if provided, will be set to
855  * the number of bytes in @string up to @len or the terminating nul byte.
856  * On error, @bytes_read will be set to the byte offset after the last valid
857  * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
858  */
859 static gchar *
strdup_len(const gchar * string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)860 strdup_len (const gchar *string,
861 	    gssize       len,
862 	    gsize       *bytes_read,
863 	    gsize       *bytes_written,
864 	    GError     **error)
865 {
866   gsize real_len;
867   const gchar *end_valid;
868 
869   if (!g_utf8_validate (string, len, &end_valid))
870     {
871       if (bytes_read)
872 	*bytes_read = end_valid - string;
873       if (bytes_written)
874 	*bytes_written = 0;
875 
876       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
877                            _("Invalid byte sequence in conversion input"));
878       return NULL;
879     }
880 
881   real_len = end_valid - string;
882 
883   if (bytes_read)
884     *bytes_read = real_len;
885   if (bytes_written)
886     *bytes_written = real_len;
887 
888   return g_strndup (string, real_len);
889 }
890 
/* Bit flags selecting which embedded-nul checks convert_checked() performs */
typedef enum
{
  /* Fail if the input contains a nul byte within the given length */
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 0x1,
  /* Fail if the converted output contains a nul byte */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 0x2
} ConvertCheckFlags;
896 
897 /*
898  * Convert from @string in the encoding identified by @from_codeset,
899  * returning a string in the encoding identifed by @to_codeset.
900  * @len can be negative if @string is nul-terminated, or a non-negative
901  * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
902  * to check the input, the output, or both, for embedded nul bytes.
903  * On success, @bytes_read, if provided, will be set to the number of bytes
904  * in @string up to @len or the terminating nul byte, and @bytes_written, if
905  * provided, will be set to the number of output bytes written into the
906  * returned buffer, excluding the terminating nul sequence.
907  * On error, @bytes_read will be set to the byte offset after the last valid
908  * sequence in @string, and @bytes_written will be set to 0.
909  */
910 static gchar *
convert_checked(const gchar * string,gssize len,const gchar * to_codeset,const gchar * from_codeset,ConvertCheckFlags flags,gsize * bytes_read,gsize * bytes_written,GError ** error)911 convert_checked (const gchar      *string,
912                  gssize            len,
913                  const gchar      *to_codeset,
914                  const gchar      *from_codeset,
915                  ConvertCheckFlags flags,
916                  gsize            *bytes_read,
917                  gsize            *bytes_written,
918                  GError          **error)
919 {
920   gchar *out;
921   gsize outbytes;
922 
923   if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0)
924     {
925       const gchar *early_nul = memchr (string, '\0', len);
926       if (early_nul != NULL)
927         {
928           if (bytes_read)
929             *bytes_read = early_nul - string;
930           if (bytes_written)
931             *bytes_written = 0;
932 
933           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
934                                _("Embedded NUL byte in conversion input"));
935           return NULL;
936         }
937     }
938 
939   out = g_convert (string, len, to_codeset, from_codeset,
940                    bytes_read, &outbytes, error);
941   if (out == NULL)
942     {
943       if (bytes_written)
944         *bytes_written = 0;
945       return NULL;
946     }
947 
948   if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
949       && memchr (out, '\0', outbytes) != NULL)
950     {
951       g_free (out);
952       if (bytes_written)
953         *bytes_written = 0;
954       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
955                            _("Embedded NUL byte in conversion output"));
956       return NULL;
957     }
958 
959   if (bytes_written)
960     *bytes_written = outbytes;
961   return out;
962 }
963 
964 /**
965  * g_locale_to_utf8:
966  * @opsysstring:   (array length=len) (element-type guint8): a string in the
967  *                 encoding of the current locale. On Windows
968  *                 this means the system codepage.
969  * @len:           the length of the string, or -1 if the string is
970  *                 nul-terminated (Note that some encodings may allow nul
971  *                 bytes to occur inside strings. In that case, using -1
972  *                 for the @len parameter is unsafe)
973  * @bytes_read: (out) (optional): location to store the number of bytes in the
974  *                 input string that were successfully converted, or %NULL.
975  *                 Even if the conversion was successful, this may be
976  *                 less than @len if there were partial characters
977  *                 at the end of the input. If the error
978  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
979  *                 stored will be the byte offset after the last valid
980  *                 input sequence.
981  * @bytes_written: (out) (optional): the number of bytes stored in the output
982  *                 buffer (not including the terminating nul).
983  * @error:         location to store the error occurring, or %NULL to ignore
984  *                 errors. Any of the errors in #GConvertError may occur.
985  *
986  * Converts a string which is in the encoding used for strings by
987  * the C runtime (usually the same as that used by the operating
988  * system) in the [current locale][setlocale] into a UTF-8 string.
989  *
990  * If the source encoding is not UTF-8 and the conversion output contains a
991  * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
992  * function returns %NULL.
993  * If the source encoding is UTF-8, an embedded nul character is treated with
994  * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
995  * earlier versions of this library. Use g_convert() to produce output that
996  * may contain embedded nul characters.
997  *
998  * Returns: (type utf8): The converted string, or %NULL on an error.
999  **/
1000 gchar *
g_locale_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1001 g_locale_to_utf8 (const gchar  *opsysstring,
1002 		  gssize        len,
1003 		  gsize        *bytes_read,
1004 		  gsize        *bytes_written,
1005 		  GError      **error)
1006 {
1007   const char *charset;
1008 
1009   if (g_get_charset (&charset))
1010     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1011   else
1012     return convert_checked (opsysstring, len, "UTF-8", charset,
1013                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1014                             bytes_read, bytes_written, error);
1015 }
1016 
1017 /**
1018  * g_locale_from_utf8:
1019  * @utf8string:    a UTF-8 encoded string
1020  * @len:           the length of the string, or -1 if the string is
1021  *                 nul-terminated.
1022  * @bytes_read: (out) (optional): location to store the number of bytes in the
1023  *                 input string that were successfully converted, or %NULL.
1024  *                 Even if the conversion was successful, this may be
1025  *                 less than @len if there were partial characters
1026  *                 at the end of the input. If the error
1027  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1028  *                 stored will be the byte offset after the last valid
1029  *                 input sequence.
1030  * @bytes_written: (out) (optional): the number of bytes stored in the output
1031  *                 buffer (not including the terminating nul).
1032  * @error:         location to store the error occurring, or %NULL to ignore
1033  *                 errors. Any of the errors in #GConvertError may occur.
1034  *
1035  * Converts a string from UTF-8 to the encoding used for strings by
1036  * the C runtime (usually the same as that used by the operating
1037  * system) in the [current locale][setlocale]. On Windows this means
1038  * the system codepage.
1039  *
1040  * The input string shall not contain nul characters even if the @len
1041  * argument is positive. A nul character found inside the string will result
1042  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
1043  * input that may contain embedded nul characters.
1044  *
1045  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
1046  *          A newly-allocated buffer containing the converted string,
1047  *          or %NULL on an error, and error will be set.
1048  **/
1049 gchar *
g_locale_from_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1050 g_locale_from_utf8 (const gchar *utf8string,
1051 		    gssize       len,
1052 		    gsize       *bytes_read,
1053 		    gsize       *bytes_written,
1054 		    GError     **error)
1055 {
1056   const gchar *charset;
1057 
1058   if (g_get_charset (&charset))
1059     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1060   else
1061     return convert_checked (utf8string, len, charset, "UTF-8",
1062                             CONVERT_CHECK_NO_NULS_IN_INPUT,
1063                             bytes_read, bytes_written, error);
1064 }
1065 
1066 #ifndef G_PLATFORM_WIN32
1067 
1068 typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
1069 
1070 struct _GFilenameCharsetCache {
1071   gboolean is_utf8;
1072   gchar *charset;
1073   gchar **filename_charsets;
1074 };
1075 
1076 static void
filename_charset_cache_free(gpointer data)1077 filename_charset_cache_free (gpointer data)
1078 {
1079   GFilenameCharsetCache *cache = data;
1080   g_free (cache->charset);
1081   g_strfreev (cache->filename_charsets);
1082   g_free (cache);
1083 }
1084 
1085 /**
1086  * g_get_filename_charsets:
1087  * @filename_charsets: (out) (transfer none) (array zero-terminated=1):
1088  *    return location for the %NULL-terminated list of encoding names
1089  *
1090  * Determines the preferred character sets used for filenames.
 * The first character set in @filename_charsets is the filename encoding;
 * the subsequent character sets are used when trying to generate a displayable
 * representation of a filename, see g_filename_display_name().
1094  *
1095  * On Unix, the character sets are determined by consulting the
1096  * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.
1097  * On Windows, the character set used in the GLib API is always UTF-8
1098  * and said environment variables have no effect.
1099  *
 * `G_FILENAME_ENCODING` may be set to a comma-separated list of
 * character set names. The special token "\@locale" is taken
 * to mean the character set for the [current locale][setlocale].
 * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,
 * the character set of the current locale is taken as the filename
 * encoding. If neither environment variable is set, UTF-8 is taken
 * as the filename encoding, but the character set of the current locale
 * is also put in the list of encodings.
1108  *
 * The returned @filename_charsets belong to GLib and must not be freed.
1110  *
1111  * Note that on Unix, regardless of the locale character set or
1112  * `G_FILENAME_ENCODING` value, the actual file names present
1113  * on a system might be in any random encoding or just gibberish.
1114  *
1115  * Returns: %TRUE if the filename encoding is UTF-8.
1116  *
1117  * Since: 2.6
1118  */
1119 gboolean
g_get_filename_charsets(const gchar *** filename_charsets)1120 g_get_filename_charsets (const gchar ***filename_charsets)
1121 {
1122   static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);
1123   GFilenameCharsetCache *cache = g_private_get (&cache_private);
1124   const gchar *charset;
1125 
1126   if (!cache)
1127     cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache));
1128 
1129   g_get_charset (&charset);
1130 
1131   if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1132     {
1133       const gchar *new_charset;
1134       gchar *p;
1135       gint i;
1136 
1137       g_free (cache->charset);
1138       g_strfreev (cache->filename_charsets);
1139       cache->charset = g_strdup (charset);
1140 
1141       p = getenv ("G_FILENAME_ENCODING");
1142       if (p != NULL && p[0] != '\0')
1143 	{
1144 	  cache->filename_charsets = g_strsplit (p, ",", 0);
1145 	  cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1146 
1147 	  for (i = 0; cache->filename_charsets[i]; i++)
1148 	    {
1149 	      if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1150 		{
1151 		  g_get_charset (&new_charset);
1152 		  g_free (cache->filename_charsets[i]);
1153 		  cache->filename_charsets[i] = g_strdup (new_charset);
1154 		}
1155 	    }
1156 	}
1157       else if (getenv ("G_BROKEN_FILENAMES") != NULL)
1158 	{
1159 	  cache->filename_charsets = g_new0 (gchar *, 2);
1160 	  cache->is_utf8 = g_get_charset (&new_charset);
1161 	  cache->filename_charsets[0] = g_strdup (new_charset);
1162 	}
1163       else
1164 	{
1165 	  cache->filename_charsets = g_new0 (gchar *, 3);
1166 	  cache->is_utf8 = TRUE;
1167 	  cache->filename_charsets[0] = g_strdup ("UTF-8");
1168 	  if (!g_get_charset (&new_charset))
1169 	    cache->filename_charsets[1] = g_strdup (new_charset);
1170 	}
1171     }
1172 
1173   if (filename_charsets)
1174     *filename_charsets = (const gchar **)cache->filename_charsets;
1175 
1176   return cache->is_utf8;
1177 }
1178 
1179 #else /* G_PLATFORM_WIN32 */
1180 
1181 gboolean
g_get_filename_charsets(const gchar *** filename_charsets)1182 g_get_filename_charsets (const gchar ***filename_charsets)
1183 {
1184   static const gchar *charsets[] = {
1185     "UTF-8",
1186     NULL
1187   };
1188 
1189 #ifdef G_OS_WIN32
1190   /* On Windows GLib pretends that the filename charset is UTF-8 */
1191   if (filename_charsets)
1192     *filename_charsets = charsets;
1193 
1194   return TRUE;
1195 #else
1196   gboolean result;
1197 
1198   /* Cygwin works like before */
1199   result = g_get_charset (&(charsets[0]));
1200 
1201   if (filename_charsets)
1202     *filename_charsets = charsets;
1203 
1204   return result;
1205 #endif
1206 }
1207 
1208 #endif /* G_PLATFORM_WIN32 */
1209 
1210 static gboolean
get_filename_charset(const gchar ** filename_charset)1211 get_filename_charset (const gchar **filename_charset)
1212 {
1213   const gchar **charsets;
1214   gboolean is_utf8;
1215 
1216   is_utf8 = g_get_filename_charsets (&charsets);
1217 
1218   if (filename_charset)
1219     *filename_charset = charsets[0];
1220 
1221   return is_utf8;
1222 }
1223 
1224 /**
1225  * g_filename_to_utf8:
1226  * @opsysstring: (type filename): a string in the encoding for filenames
1227  * @len:           the length of the string, or -1 if the string is
1228  *                 nul-terminated (Note that some encodings may allow nul
1229  *                 bytes to occur inside strings. In that case, using -1
1230  *                 for the @len parameter is unsafe)
1231  * @bytes_read: (out) (optional): location to store the number of bytes in the
1232  *                 input string that were successfully converted, or %NULL.
1233  *                 Even if the conversion was successful, this may be
1234  *                 less than @len if there were partial characters
1235  *                 at the end of the input. If the error
1236  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1237  *                 stored will be the byte offset after the last valid
1238  *                 input sequence.
1239  * @bytes_written: (out) (optional): the number of bytes stored in the output
1240  *                 buffer (not including the terminating nul).
1241  * @error:         location to store the error occurring, or %NULL to ignore
1242  *                 errors. Any of the errors in #GConvertError may occur.
1243  *
1244  * Converts a string which is in the encoding used by GLib for
1245  * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1246  * for filenames; on other platforms, this function indirectly depends on
1247  * the [current locale][setlocale].
1248  *
1249  * The input string shall not contain nul characters even if the @len
1250  * argument is positive. A nul character found inside the string will result
1251  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
1252  * If the source encoding is not UTF-8 and the conversion output contains a
1253  * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
1254  * function returns %NULL. Use g_convert() to produce output that
1255  * may contain embedded nul characters.
1256  *
1257  * Returns: (type utf8): The converted string, or %NULL on an error.
1258  **/
1259 gchar*
g_filename_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1260 g_filename_to_utf8 (const gchar *opsysstring,
1261 		    gssize       len,
1262 		    gsize       *bytes_read,
1263 		    gsize       *bytes_written,
1264 		    GError     **error)
1265 {
1266   const gchar *charset;
1267 
1268   g_return_val_if_fail (opsysstring != NULL, NULL);
1269 
1270   if (get_filename_charset (&charset))
1271     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1272   else
1273     return convert_checked (opsysstring, len, "UTF-8", charset,
1274                             CONVERT_CHECK_NO_NULS_IN_INPUT |
1275                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1276                             bytes_read, bytes_written, error);
1277 }
1278 
1279 /**
1280  * g_filename_from_utf8:
1281  * @utf8string:    (type utf8): a UTF-8 encoded string.
1282  * @len:           the length of the string, or -1 if the string is
1283  *                 nul-terminated.
1284  * @bytes_read:    (out) (optional): location to store the number of bytes in
1285  *                 the input string that were successfully converted, or %NULL.
1286  *                 Even if the conversion was successful, this may be
1287  *                 less than @len if there were partial characters
1288  *                 at the end of the input. If the error
1289  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1290  *                 stored will be the byte offset after the last valid
1291  *                 input sequence.
1292  * @bytes_written: (out) (optional): the number of bytes stored in
1293  *                 the output buffer (not including the terminating nul).
1294  * @error:         location to store the error occurring, or %NULL to ignore
1295  *                 errors. Any of the errors in #GConvertError may occur.
1296  *
1297  * Converts a string from UTF-8 to the encoding GLib uses for
1298  * filenames. Note that on Windows GLib uses UTF-8 for filenames;
1299  * on other platforms, this function indirectly depends on the
1300  * [current locale][setlocale].
1301  *
1302  * The input string shall not contain nul characters even if the @len
1303  * argument is positive. A nul character found inside the string will result
1304  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is
1305  * not UTF-8 and the conversion output contains a nul character, the error
1306  * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL.
1307  *
1308  * Returns: (type filename):
1309  *               The converted string, or %NULL on an error.
1310  **/
1311 gchar*
g_filename_from_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1312 g_filename_from_utf8 (const gchar *utf8string,
1313 		      gssize       len,
1314 		      gsize       *bytes_read,
1315 		      gsize       *bytes_written,
1316 		      GError     **error)
1317 {
1318   const gchar *charset;
1319 
1320   if (get_filename_charset (&charset))
1321     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1322   else
1323     return convert_checked (utf8string, len, charset, "UTF-8",
1324                             CONVERT_CHECK_NO_NULS_IN_INPUT |
1325                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1326                             bytes_read, bytes_written, error);
1327 }
1328 
1329 /* Test of haystack has the needle prefix, comparing case
1330  * insensitive. haystack may be UTF-8, but needle must
1331  * contain only ascii. */
1332 static gboolean
has_case_prefix(const gchar * haystack,const gchar * needle)1333 has_case_prefix (const gchar *haystack, const gchar *needle)
1334 {
1335   const gchar *h, *n;
1336 
1337   /* Eat one character at a time. */
1338   h = haystack;
1339   n = needle;
1340 
1341   while (*n && *h &&
1342 	 g_ascii_tolower (*n) == g_ascii_tolower (*h))
1343     {
1344       n++;
1345       h++;
1346     }
1347 
1348   return *n == '\0';
1349 }
1350 
/* Escaping policies for g_escape_uri_string(). Each value is one bit that
 * is tested against the per-character entries of the acceptable[] table. */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+'  */
  UNSAFE_PATH       = 1 << 3,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1358 
1359 static const guchar acceptable[96] = {
1360   /* A table of the ASCII chars from space (32) to DEL (127) */
1361   /*      !    "    #    $    %    &    '    (    )    *    +    ,    -    .    / */
1362   0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
1363   /* 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ? */
1364   0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
1365   /* @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O */
1366   0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1367   /* P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _ */
1368   0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
1369   /* `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o */
1370   0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1371   /* p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL */
1372   0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
1373 };
1374 
1375 static const gchar hex[16] = "0123456789ABCDEF";
1376 
1377 /* Note: This escape function works on file: URIs, but if you want to
1378  * escape something else, please read RFC-2396 */
1379 static gchar *
g_escape_uri_string(const gchar * string,UnsafeCharacterSet mask)1380 g_escape_uri_string (const gchar *string,
1381 		     UnsafeCharacterSet mask)
1382 {
1383 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1384 
1385   const gchar *p;
1386   gchar *q;
1387   gchar *result;
1388   int c;
1389   gint unacceptable;
1390   UnsafeCharacterSet use_mask;
1391 
1392   g_return_val_if_fail (mask == UNSAFE_ALL
1393 			|| mask == UNSAFE_ALLOW_PLUS
1394 			|| mask == UNSAFE_PATH
1395 			|| mask == UNSAFE_HOST
1396 			|| mask == UNSAFE_SLASHES, NULL);
1397 
1398   unacceptable = 0;
1399   use_mask = mask;
1400   for (p = string; *p != '\0'; p++)
1401     {
1402       c = (guchar) *p;
1403       if (!ACCEPTABLE (c))
1404 	unacceptable++;
1405     }
1406 
1407   result = g_malloc (p - string + unacceptable * 2 + 1);
1408 
1409   use_mask = mask;
1410   for (q = result, p = string; *p != '\0'; p++)
1411     {
1412       c = (guchar) *p;
1413 
1414       if (!ACCEPTABLE (c))
1415 	{
1416 	  *q++ = '%'; /* means hex coming */
1417 	  *q++ = hex[c >> 4];
1418 	  *q++ = hex[c & 15];
1419 	}
1420       else
1421 	*q++ = *p;
1422     }
1423 
1424   *q = '\0';
1425 
1426   return result;
1427 }
1428 
1429 
1430 static gchar *
g_escape_file_uri(const gchar * hostname,const gchar * pathname)1431 g_escape_file_uri (const gchar *hostname,
1432 		   const gchar *pathname)
1433 {
1434   char *escaped_hostname = NULL;
1435   char *escaped_path;
1436   char *res;
1437 
1438 #ifdef G_OS_WIN32
1439   char *p, *backslash;
1440 
1441   /* Turn backslashes into forward slashes. That's what Netscape
1442    * does, and they are actually more or less equivalent in Windows.
1443    */
1444 
1445   pathname = g_strdup (pathname);
1446   p = (char *) pathname;
1447 
1448   while ((backslash = strchr (p, '\\')) != NULL)
1449     {
1450       *backslash = '/';
1451       p = backslash + 1;
1452     }
1453 #endif
1454 
1455   if (hostname && *hostname != '\0')
1456     {
1457       escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1458     }
1459 
1460   escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1461 
1462   res = g_strconcat ("file://",
1463 		     (escaped_hostname) ? escaped_hostname : "",
1464 		     (*escaped_path != '/') ? "/" : "",
1465 		     escaped_path,
1466 		     NULL);
1467 
1468 #ifdef G_OS_WIN32
1469   g_free ((char *) pathname);
1470 #endif
1471 
1472   g_free (escaped_hostname);
1473   g_free (escaped_path);
1474 
1475   return res;
1476 }
1477 
/* Decode the two hexadecimal digits at scanner[0] and scanner[1] into a
 * byte value. Returns -1 if either character is not a hexadecimal digit;
 * scanner[1] is only examined when scanner[0] is valid. */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return (high_nibble << 4) | low_nibble;
}
1494 
1495 static gchar *
g_unescape_uri_string(const char * escaped,int len,const char * illegal_escaped_characters,gboolean ascii_must_not_be_escaped)1496 g_unescape_uri_string (const char *escaped,
1497 		       int         len,
1498 		       const char *illegal_escaped_characters,
1499 		       gboolean    ascii_must_not_be_escaped)
1500 {
1501   const gchar *in, *in_end;
1502   gchar *out, *result;
1503   int c;
1504 
1505   if (escaped == NULL)
1506     return NULL;
1507 
1508   if (len < 0)
1509     len = strlen (escaped);
1510 
1511   result = g_malloc (len + 1);
1512 
1513   out = result;
1514   for (in = escaped, in_end = escaped + len; in < in_end; in++)
1515     {
1516       c = *in;
1517 
1518       if (c == '%')
1519 	{
1520 	  /* catch partial escape sequences past the end of the substring */
1521 	  if (in + 3 > in_end)
1522 	    break;
1523 
1524 	  c = unescape_character (in + 1);
1525 
1526 	  /* catch bad escape sequences and NUL characters */
1527 	  if (c <= 0)
1528 	    break;
1529 
1530 	  /* catch escaped ASCII */
1531 	  if (ascii_must_not_be_escaped && c <= 0x7F)
1532 	    break;
1533 
1534 	  /* catch other illegal escaped characters */
1535 	  if (strchr (illegal_escaped_characters, c) != NULL)
1536 	    break;
1537 
1538 	  in += 2;
1539 	}
1540 
1541       *out++ = c;
1542     }
1543 
1544   g_assert (out - result <= len);
1545   *out = '\0';
1546 
1547   if (in != in_end)
1548     {
1549       g_free (result);
1550       return NULL;
1551     }
1552 
1553   return result;
1554 }
1555 
1556 static gboolean
is_asciialphanum(gunichar c)1557 is_asciialphanum (gunichar c)
1558 {
1559   return c <= 0x7F && g_ascii_isalnum (c);
1560 }
1561 
1562 static gboolean
is_asciialpha(gunichar c)1563 is_asciialpha (gunichar c)
1564 {
1565   return c <= 0x7F && g_ascii_isalpha (c);
1566 }
1567 
1568 /* allows an empty string */
1569 static gboolean
hostname_validate(const char * hostname)1570 hostname_validate (const char *hostname)
1571 {
1572   const char *p;
1573   gunichar c, first_char, last_char;
1574 
1575   p = hostname;
1576   if (*p == '\0')
1577     return TRUE;
1578   do
1579     {
1580       /* read in a label */
1581       c = g_utf8_get_char (p);
1582       p = g_utf8_next_char (p);
1583       if (!is_asciialphanum (c))
1584 	return FALSE;
1585       first_char = c;
1586       do
1587 	{
1588 	  last_char = c;
1589 	  c = g_utf8_get_char (p);
1590 	  p = g_utf8_next_char (p);
1591 	}
1592       while (is_asciialphanum (c) || c == '-');
1593       if (last_char == '-')
1594 	return FALSE;
1595 
1596       /* if that was the last label, check that it was a toplabel */
1597       if (c == '\0' || (c == '.' && *p == '\0'))
1598 	return is_asciialpha (first_char);
1599     }
1600   while (c == '.');
1601   return FALSE;
1602 }
1603 
1604 /**
1605  * g_filename_from_uri:
1606  * @uri: a uri describing a filename (escaped, encoded in ASCII).
1607  * @hostname: (out) (optional) (nullable): Location to store hostname for the URI.
1608  *            If there is no hostname in the URI, %NULL will be
1609  *            stored in this location.
1610  * @error: location to store the error occurring, or %NULL to ignore
1611  *         errors. Any of the errors in #GConvertError may occur.
1612  *
1613  * Converts an escaped ASCII-encoded URI to a local filename in the
1614  * encoding used for filenames.
1615  *
1616  * Returns: (type filename): a newly-allocated string holding
1617  *               the resulting filename, or %NULL on an error.
1618  **/
1619 gchar *
g_filename_from_uri(const gchar * uri,gchar ** hostname,GError ** error)1620 g_filename_from_uri (const gchar *uri,
1621 		     gchar      **hostname,
1622 		     GError     **error)
1623 {
1624   const char *path_part;
1625   const char *host_part;
1626   char *unescaped_hostname;
1627   char *result;
1628   char *filename;
1629   int offs;
1630 #ifdef G_OS_WIN32
1631   char *p, *slash;
1632 #endif
1633 
1634   if (hostname)
1635     *hostname = NULL;
1636 
1637   if (!has_case_prefix (uri, "file:/"))
1638     {
1639       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1640 		   _("The URI “%s” is not an absolute URI using the “file” scheme"),
1641 		   uri);
1642       return NULL;
1643     }
1644 
1645   path_part = uri + strlen ("file:");
1646 
1647   if (strchr (path_part, '#') != NULL)
1648     {
1649       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1650 		   _("The local file URI “%s” may not include a “#”"),
1651 		   uri);
1652       return NULL;
1653     }
1654 
1655   if (has_case_prefix (path_part, "///"))
1656     path_part += 2;
1657   else if (has_case_prefix (path_part, "//"))
1658     {
1659       path_part += 2;
1660       host_part = path_part;
1661 
1662       path_part = strchr (path_part, '/');
1663 
1664       if (path_part == NULL)
1665 	{
1666 	  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1667 		       _("The URI “%s” is invalid"),
1668 		       uri);
1669 	  return NULL;
1670 	}
1671 
1672       unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1673 
1674       if (unescaped_hostname == NULL ||
1675 	  !hostname_validate (unescaped_hostname))
1676 	{
1677 	  g_free (unescaped_hostname);
1678 	  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1679 		       _("The hostname of the URI “%s” is invalid"),
1680 		       uri);
1681 	  return NULL;
1682 	}
1683 
1684       if (hostname)
1685 	*hostname = unescaped_hostname;
1686       else
1687 	g_free (unescaped_hostname);
1688     }
1689 
1690   filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1691 
1692   if (filename == NULL)
1693     {
1694       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1695 		   _("The URI “%s” contains invalidly escaped characters"),
1696 		   uri);
1697       return NULL;
1698     }
1699 
1700   offs = 0;
1701 #ifdef G_OS_WIN32
1702   /* Drop localhost */
1703   if (hostname && *hostname != NULL &&
1704       g_ascii_strcasecmp (*hostname, "localhost") == 0)
1705     {
1706       g_free (*hostname);
1707       *hostname = NULL;
1708     }
1709 
1710   /* Turn slashes into backslashes, because that's the canonical spelling */
1711   p = filename;
1712   while ((slash = strchr (p, '/')) != NULL)
1713     {
1714       *slash = '\\';
1715       p = slash + 1;
1716     }
1717 
1718   /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1719    * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1720    * the filename from the drive letter.
1721    */
1722   if (g_ascii_isalpha (filename[1]))
1723     {
1724       if (filename[2] == ':')
1725 	offs = 1;
1726       else if (filename[2] == '|')
1727 	{
1728 	  filename[2] = ':';
1729 	  offs = 1;
1730 	}
1731     }
1732 #endif
1733 
1734   result = g_strdup (filename + offs);
1735   g_free (filename);
1736 
1737   return result;
1738 }
1739 
1740 /**
1741  * g_filename_to_uri:
1742  * @filename: (type filename): an absolute filename specified in the GLib file
1743  *     name encoding, which is the on-disk file name bytes on Unix, and UTF-8
1744  *     on Windows
1745  * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none.
1746  * @error: location to store the error occurring, or %NULL to ignore
1747  *         errors. Any of the errors in #GConvertError may occur.
1748  *
1749  * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1750  * component following Section 3.3. of RFC 2396.
1751  *
1752  * Returns: a newly-allocated string holding the resulting
1753  *               URI, or %NULL on an error.
1754  **/
1755 gchar *
g_filename_to_uri(const gchar * filename,const gchar * hostname,GError ** error)1756 g_filename_to_uri (const gchar *filename,
1757 		   const gchar *hostname,
1758 		   GError     **error)
1759 {
1760   char *escaped_uri;
1761 
1762   g_return_val_if_fail (filename != NULL, NULL);
1763 
1764   if (!g_path_is_absolute (filename))
1765     {
1766       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1767 		   _("The pathname “%s” is not an absolute path"),
1768 		   filename);
1769       return NULL;
1770     }
1771 
1772   if (hostname &&
1773       !(g_utf8_validate (hostname, -1, NULL)
1774 	&& hostname_validate (hostname)))
1775     {
1776       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1777                            _("Invalid hostname"));
1778       return NULL;
1779     }
1780 
1781 #ifdef G_OS_WIN32
1782   /* Don't use localhost unnecessarily */
1783   if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
1784     hostname = NULL;
1785 #endif
1786 
1787   escaped_uri = g_escape_file_uri (hostname, filename);
1788 
1789   return escaped_uri;
1790 }
1791 
1792 /**
1793  * g_uri_list_extract_uris:
1794  * @uri_list: an URI list
1795  *
1796  * Splits an URI list conforming to the text/uri-list
1797  * mime type defined in RFC 2483 into individual URIs,
1798  * discarding any comments. The URIs are not validated.
1799  *
1800  * Returns: (transfer full): a newly allocated %NULL-terminated list
1801  *   of strings holding the individual URIs. The array should be freed
1802  *   with g_strfreev().
1803  *
1804  * Since: 2.6
1805  */
1806 gchar **
g_uri_list_extract_uris(const gchar * uri_list)1807 g_uri_list_extract_uris (const gchar *uri_list)
1808 {
1809   GSList *uris, *u;
1810   const gchar *p, *q;
1811   gchar **result;
1812   gint n_uris = 0;
1813 
1814   uris = NULL;
1815 
1816   p = uri_list;
1817 
1818   /* We don't actually try to validate the URI according to RFC
1819    * 2396, or even check for allowed characters - we just ignore
1820    * comments and trim whitespace off the ends.  We also
1821    * allow LF delimination as well as the specified CRLF.
1822    *
1823    * We do allow comments like specified in RFC 2483.
1824    */
1825   while (p)
1826     {
1827       if (*p != '#')
1828 	{
1829 	  while (g_ascii_isspace (*p))
1830 	    p++;
1831 
1832 	  q = p;
1833 	  while (*q && (*q != '\n') && (*q != '\r'))
1834 	    q++;
1835 
1836 	  if (q > p)
1837 	    {
1838 	      q--;
1839 	      while (q > p && g_ascii_isspace (*q))
1840 		q--;
1841 
1842 	      if (q > p)
1843 		{
1844 		  uris = g_slist_prepend (uris, g_strndup (p, q - p + 1));
1845 		  n_uris++;
1846 		}
1847 	    }
1848 	}
1849       p = strchr (p, '\n');
1850       if (p)
1851 	p++;
1852     }
1853 
1854   result = g_new (gchar *, n_uris + 1);
1855 
1856   result[n_uris--] = NULL;
1857   for (u = uris; u; u = u->next)
1858     result[n_uris--] = u->data;
1859 
1860   g_slist_free (uris);
1861 
1862   return result;
1863 }
1864 
1865 /**
1866  * g_filename_display_basename:
1867  * @filename: (type filename): an absolute pathname in the
1868  *     GLib file name encoding
1869  *
1870  * Returns the display basename for the particular filename, guaranteed
1871  * to be valid UTF-8. The display name might not be identical to the filename,
1872  * for instance there might be problems converting it to UTF-8, and some files
1873  * can be translated in the display.
1874  *
1875  * If GLib cannot make sense of the encoding of @filename, as a last resort it
1876  * replaces unknown characters with U+FFFD, the Unicode replacement character.
1877  * You can search the result for the UTF-8 encoding of this character (which is
1878  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1879  * encoding.
1880  *
1881  * You must pass the whole absolute pathname to this functions so that
1882  * translation of well known locations can be done.
1883  *
1884  * This function is preferred over g_filename_display_name() if you know the
1885  * whole path, as it allows translation.
1886  *
1887  * Returns: a newly allocated string containing
1888  *   a rendition of the basename of the filename in valid UTF-8
1889  *
1890  * Since: 2.6
1891  **/
1892 gchar *
g_filename_display_basename(const gchar * filename)1893 g_filename_display_basename (const gchar *filename)
1894 {
1895   char *basename;
1896   char *display_name;
1897 
1898   g_return_val_if_fail (filename != NULL, NULL);
1899 
1900   basename = g_path_get_basename (filename);
1901   display_name = g_filename_display_name (basename);
1902   g_free (basename);
1903   return display_name;
1904 }
1905 
1906 /**
1907  * g_filename_display_name:
1908  * @filename: (type filename): a pathname hopefully in the
1909  *     GLib file name encoding
1910  *
1911  * Converts a filename into a valid UTF-8 string. The conversion is
1912  * not necessarily reversible, so you should keep the original around
1913  * and use the return value of this function only for display purposes.
1914  * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
1915  * even if the filename actually isn't in the GLib file name encoding.
1916  *
1917  * If GLib cannot make sense of the encoding of @filename, as a last resort it
1918  * replaces unknown characters with U+FFFD, the Unicode replacement character.
1919  * You can search the result for the UTF-8 encoding of this character (which is
1920  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1921  * encoding.
1922  *
1923  * If you know the whole pathname of the file you should use
1924  * g_filename_display_basename(), since that allows location-based
1925  * translation of filenames.
1926  *
1927  * Returns: a newly allocated string containing
1928  *   a rendition of the filename in valid UTF-8
1929  *
1930  * Since: 2.6
1931  **/
1932 gchar *
g_filename_display_name(const gchar * filename)1933 g_filename_display_name (const gchar *filename)
1934 {
1935   gint i;
1936   const gchar **charsets;
1937   gchar *display_name = NULL;
1938   gboolean is_utf8;
1939 
1940   is_utf8 = g_get_filename_charsets (&charsets);
1941 
1942   if (is_utf8)
1943     {
1944       if (g_utf8_validate (filename, -1, NULL))
1945 	display_name = g_strdup (filename);
1946     }
1947 
1948   if (!display_name)
1949     {
1950       /* Try to convert from the filename charsets to UTF-8.
1951        * Skip the first charset if it is UTF-8.
1952        */
1953       for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
1954 	{
1955 	  display_name = g_convert (filename, -1, "UTF-8", charsets[i],
1956 				    NULL, NULL, NULL);
1957 
1958 	  if (display_name)
1959 	    break;
1960 	}
1961     }
1962 
1963   /* if all conversions failed, we replace invalid UTF-8
1964    * by a question mark
1965    */
1966   if (!display_name)
1967     display_name = g_utf8_make_valid (filename, -1);
1968 
1969   return display_name;
1970 }
1971 
1972 #ifdef G_OS_WIN32
1973 
1974 /* Binary compatibility versions. Not for newly compiled code. */
1975 
1976 _GLIB_EXTERN gchar *g_filename_to_utf8_utf8   (const gchar  *opsysstring,
1977                                                gssize        len,
1978                                                gsize        *bytes_read,
1979                                                gsize        *bytes_written,
1980                                                GError      **error) G_GNUC_MALLOC;
1981 _GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar  *utf8string,
1982                                                gssize        len,
1983                                                gsize        *bytes_read,
1984                                                gsize        *bytes_written,
1985                                                GError      **error) G_GNUC_MALLOC;
1986 _GLIB_EXTERN gchar *g_filename_from_uri_utf8  (const gchar  *uri,
1987                                                gchar       **hostname,
1988                                                GError      **error) G_GNUC_MALLOC;
1989 _GLIB_EXTERN gchar *g_filename_to_uri_utf8    (const gchar  *filename,
1990                                                const gchar  *hostname,
1991                                                GError      **error) G_GNUC_MALLOC;
1992 
1993 gchar *
g_filename_to_utf8_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1994 g_filename_to_utf8_utf8 (const gchar *opsysstring,
1995                          gssize       len,
1996                          gsize       *bytes_read,
1997                          gsize       *bytes_written,
1998                          GError     **error)
1999 {
2000   return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);
2001 }
2002 
2003 gchar *
g_filename_from_utf8_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)2004 g_filename_from_utf8_utf8 (const gchar *utf8string,
2005                            gssize       len,
2006                            gsize       *bytes_read,
2007                            gsize       *bytes_written,
2008                            GError     **error)
2009 {
2010   return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);
2011 }
2012 
2013 gchar *
g_filename_from_uri_utf8(const gchar * uri,gchar ** hostname,GError ** error)2014 g_filename_from_uri_utf8 (const gchar *uri,
2015                           gchar      **hostname,
2016                           GError     **error)
2017 {
2018   return g_filename_from_uri (uri, hostname, error);
2019 }
2020 
2021 gchar *
g_filename_to_uri_utf8(const gchar * filename,const gchar * hostname,GError ** error)2022 g_filename_to_uri_utf8 (const gchar *filename,
2023                         const gchar *hostname,
2024                         GError     **error)
2025 {
2026   return g_filename_to_uri (filename, hostname, error);
2027 }
2028 
2029 #endif
2030