• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GLIB - Library of useful routines for C programming
2  *
3  * gconvert.c: Convert between character sets using iconv
4  * Copyright Red Hat Inc., 2000
5  * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "config.h"
22 #include "glibconfig.h"
23 
24 #ifndef G_OS_WIN32
25 #include <iconv.h>
26 #endif
27 #include <errno.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdlib.h>
31 
32 #ifdef G_OS_WIN32
33 #include "win_iconv.c"
34 #endif
35 
36 #ifdef G_PLATFORM_WIN32
37 #define STRICT
38 #include <windows.h>
39 #undef STRICT
40 #endif
41 
42 #include "gconvert.h"
43 
44 #include "gcharsetprivate.h"
45 #include "gslist.h"
46 #include "gstrfuncs.h"
47 #include "gtestutils.h"
48 #include "gthread.h"
49 #include "gthreadprivate.h"
50 #include "gunicode.h"
51 #include "gfileutils.h"
52 #include "genviron.h"
53 
54 #include "glibintl.h"
55 
56 
57 /**
58  * SECTION:conversions
59  * @title: Character Set Conversion
60  * @short_description: convert strings between different character sets
61  *
62  * The g_convert() family of functions wraps the functionality of iconv().
63  * In addition to pure character set conversions, GLib has functions to
64  * deal with the extra complications of encodings for file names.
65  *
66  * ## File Name Encodings
67  *
68  * Historically, UNIX has not had a defined encoding for file names:
69  * a file name is valid as long as it does not have path separators
70  * in it ("/"). However, displaying file names may require conversion:
71  * from the character set in which they were created, to the character
72  * set in which the application operates. Consider the Spanish file name
73  * "Presentación.sxi". If the application which created it uses
74  * ISO-8859-1 for its encoding,
75  * |[
76  * Character:  P  r  e  s  e  n  t  a  c  i  ó  n  .  s  x  i
77  * Hex code:   50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69
78  * ]|
79  * However, if the application uses UTF-8, the actual file name on
80  * disk would look like this:
81  * |[
82  * Character:  P  r  e  s  e  n  t  a  c  i  ó     n  .  s  x  i
83  * Hex code:   50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69
84  * ]|
85  * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use
86  * GLib do the same thing. If you get a file name from the file system,
87  * for example, from readdir() or from g_dir_read_name(), and you wish
88  * to display the file name to the user, you will need to convert it
89  * into UTF-8. The opposite case is when the user types the name of a
90  * file they wish to save: the toolkit will give you that string in
91  * UTF-8 encoding, and you will need to convert it to the character
92  * set used for file names before you can create the file with open()
93  * or fopen().
94  *
95  * By default, GLib assumes that file names on disk are in UTF-8
96  * encoding. This is a valid assumption for file systems which
97  * were created relatively recently: most applications use UTF-8
98  * encoding for their strings, and that is also what they use for
99  * the file names they create. However, older file systems may
100  * still contain file names created in "older" encodings, such as
101  * ISO-8859-1. In this case, for compatibility reasons, you may want
102  * to instruct GLib to use that particular encoding for file names
103  * rather than UTF-8. You can do this by specifying the encoding for
104  * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING]
105  * environment variable. For example, if your installation uses
106  * ISO-8859-1 for file names, you can put this in your `~/.profile`:
107  * |[
108  * export G_FILENAME_ENCODING=ISO-8859-1
109  * ]|
110  * GLib provides the functions g_filename_to_utf8() and
111  * g_filename_from_utf8() to perform the necessary conversions.
112  * These functions convert file names from the encoding specified
113  * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This
114  * [diagram][file-name-encodings-diagram] illustrates how
115  * these functions are used to convert between UTF-8 and the
116  * encoding for file names in the file system.
117  *
118  * ## Conversion between file name encodings # {#file-name-encodings-diagram}
119  *
120  * ![](file-name-encodings.png)
121  *
122  * ## Checklist for Application Writers
123  *
124  * This section is a practical summary of the detailed
125  * things to do to make sure your applications process file
126  * name encodings correctly.
127  *
128  * 1. If you get a file name from the file system from a function
129  *    such as readdir() or gtk_file_chooser_get_filename(), you do
130  *    not need to do any conversion to pass that file name to
131  *    functions like open(), rename(), or fopen() -- those are "raw"
132  *    file names which the file system understands.
133  *
134  * 2. If you need to display a file name, convert it to UTF-8 first
135  *    by using g_filename_to_utf8(). If conversion fails, display a
136  *    string like "Unknown file name". Do not convert this string back
137  *    into the encoding used for file names if you wish to pass it to
138  *    the file system; use the original file name instead.
139  *
140  *    For example, the document window of a word processor could display
141  *    "Unknown file name" in its title bar but still let the user save
142  *    the file, as it would keep the raw file name internally. This
143  *    can happen if the user has not set the `G_FILENAME_ENCODING`
144  *    environment variable even though he has files whose names are
145  *    not encoded in UTF-8.
146  *
147  * 3. If your user interface lets the user type a file name for saving
148  *    or renaming, convert it to the encoding used for file names in
149  *    the file system by using g_filename_from_utf8(). Pass the converted
150  *    file name to functions like fopen(). If conversion fails, ask the
151  *    user to enter a different file name. This can happen if the user
152  *    types Japanese characters when `G_FILENAME_ENCODING` is set to
153  *    `ISO-8859-1`, for example.
154  */
155 
156 /* We try to terminate strings in unknown charsets with this many zero bytes
157  * to ensure that multibyte strings really are nul-terminated when we return
158  * them from g_convert() and friends.
159  */
160 #define NUL_TERMINATOR_LENGTH 4
161 
G_DEFINE_QUARK(g_convert_error,g_convert_error)162 G_DEFINE_QUARK (g_convert_error, g_convert_error)
163 
164 static gboolean
165 try_conversion (const char *to_codeset,
166 		const char *from_codeset,
167 		iconv_t    *cd)
168 {
169   *cd = iconv_open (to_codeset, from_codeset);
170 
171   if (*cd == (iconv_t)-1 && errno == EINVAL)
172     return FALSE;
173   else
174     return TRUE;
175 }
176 
177 static gboolean
try_to_aliases(const char ** to_aliases,const char * from_codeset,iconv_t * cd)178 try_to_aliases (const char **to_aliases,
179 		const char  *from_codeset,
180 		iconv_t     *cd)
181 {
182   if (to_aliases)
183     {
184       const char **p = to_aliases;
185       while (*p)
186 	{
187 	  if (try_conversion (*p, from_codeset, cd))
188 	    return TRUE;
189 
190 	  p++;
191 	}
192     }
193 
194   return FALSE;
195 }
196 
197 /**
198  * g_iconv_open: (skip)
199  * @to_codeset: destination codeset
200  * @from_codeset: source codeset
201  *
202  * Same as the standard UNIX routine iconv_open(), but
203  * may be implemented via libiconv on UNIX flavors that lack
204  * a native implementation.
205  *
206  * GLib provides g_convert() and g_locale_to_utf8() which are likely
207  * more convenient than the raw iconv wrappers.
208  *
209  * Returns: a "conversion descriptor", or (GIConv)-1 if
210  *  opening the converter failed.
211  **/
212 GIConv
g_iconv_open(const gchar * to_codeset,const gchar * from_codeset)213 g_iconv_open (const gchar  *to_codeset,
214 	      const gchar  *from_codeset)
215 {
216   iconv_t cd;
217 
218   if (!try_conversion (to_codeset, from_codeset, &cd))
219     {
220       const char **to_aliases = _g_charset_get_aliases (to_codeset);
221       const char **from_aliases = _g_charset_get_aliases (from_codeset);
222 
223       if (from_aliases)
224 	{
225 	  const char **p = from_aliases;
226 	  while (*p)
227 	    {
228 	      if (try_conversion (to_codeset, *p, &cd))
229 		goto out;
230 
231 	      if (try_to_aliases (to_aliases, *p, &cd))
232 		goto out;
233 
234 	      p++;
235 	    }
236 	}
237 
238       if (try_to_aliases (to_aliases, from_codeset, &cd))
239 	goto out;
240     }
241 
242  out:
243   return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
244 }
245 
246 /**
247  * g_iconv: (skip)
248  * @converter: conversion descriptor from g_iconv_open()
249  * @inbuf: bytes to convert
250  * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
251  * @outbuf: converted output bytes
252  * @outbytes_left: inout parameter, bytes available to fill in @outbuf
253  *
254  * Same as the standard UNIX routine iconv(), but
255  * may be implemented via libiconv on UNIX flavors that lack
256  * a native implementation.
257  *
258  * GLib provides g_convert() and g_locale_to_utf8() which are likely
259  * more convenient than the raw iconv wrappers.
260  *
261  * Note that the behaviour of iconv() for characters which are valid in the
262  * input character set, but which have no representation in the output character
263  * set, is implementation defined. This function may return success (with a
264  * positive number of non-reversible conversions as replacement characters were
265  * used), or it may return -1 and set an error such as %EILSEQ, in such a
266  * situation.
267  *
268  * Returns: count of non-reversible conversions, or -1 on error
269  **/
270 gsize
g_iconv(GIConv converter,gchar ** inbuf,gsize * inbytes_left,gchar ** outbuf,gsize * outbytes_left)271 g_iconv (GIConv   converter,
272 	 gchar  **inbuf,
273 	 gsize   *inbytes_left,
274 	 gchar  **outbuf,
275 	 gsize   *outbytes_left)
276 {
277   iconv_t cd = (iconv_t)converter;
278 
279   return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
280 }
281 
282 /**
283  * g_iconv_close: (skip)
284  * @converter: a conversion descriptor from g_iconv_open()
285  *
286  * Same as the standard UNIX routine iconv_close(), but
287  * may be implemented via libiconv on UNIX flavors that lack
288  * a native implementation. Should be called to clean up
289  * the conversion descriptor from g_iconv_open() when
290  * you are done converting things.
291  *
292  * GLib provides g_convert() and g_locale_to_utf8() which are likely
293  * more convenient than the raw iconv wrappers.
294  *
295  * Returns: -1 on error, 0 on success
296  **/
297 gint
g_iconv_close(GIConv converter)298 g_iconv_close (GIConv converter)
299 {
300   iconv_t cd = (iconv_t)converter;
301 
302   return iconv_close (cd);
303 }
304 
305 static GIConv
open_converter(const gchar * to_codeset,const gchar * from_codeset,GError ** error)306 open_converter (const gchar *to_codeset,
307 		const gchar *from_codeset,
308 		GError     **error)
309 {
310   GIConv cd;
311 
312   cd = g_iconv_open (to_codeset, from_codeset);
313 
314   if (cd == (GIConv) -1)
315     {
316       /* Something went wrong.  */
317       if (error)
318 	{
319 	  if (errno == EINVAL)
320 	    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
321 			 _("Conversion from character set “%s” to “%s” is not supported"),
322 			 from_codeset, to_codeset);
323 	  else
324 	    g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
325 			 _("Could not open converter from “%s” to “%s”"),
326 			 from_codeset, to_codeset);
327 	}
328     }
329 
330   return cd;
331 }
332 
333 static int
close_converter(GIConv cd)334 close_converter (GIConv cd)
335 {
336   if (cd == (GIConv) -1)
337     return 0;
338 
339   return g_iconv_close (cd);
340 }
341 
342 /**
343  * g_convert_with_iconv: (skip)
344  * @str:           (array length=len) (element-type guint8):
345  *                 the string to convert.
346  * @len:           the length of the string in bytes, or -1 if the string is
347  *                 nul-terminated (Note that some encodings may allow nul
348  *                 bytes to occur inside strings. In that case, using -1
349  *                 for the @len parameter is unsafe)
350  * @converter:     conversion descriptor from g_iconv_open()
351  * @bytes_read:    (out) (optional): location to store the number of bytes in
352  *                 the input string that were successfully converted, or %NULL.
353  *                 Even if the conversion was successful, this may be
354  *                 less than @len if there were partial characters
355  *                 at the end of the input. If the error
356  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
357  *                 stored will be the byte offset after the last valid
358  *                 input sequence.
359  * @bytes_written: (out) (optional): the number of bytes stored in
360  *                 the output buffer (not including the terminating nul).
361  * @error:         location to store the error occurring, or %NULL to ignore
362  *                 errors. Any of the errors in #GConvertError may occur.
363  *
364  * Converts a string from one character set to another.
365  *
366  * Note that you should use g_iconv() for streaming conversions.
367  * Despite the fact that @bytes_read can return information about partial
368  * characters, the g_convert_... functions are not generally suitable
369  * for streaming. If the underlying converter maintains internal state,
370  * then this won't be preserved across successive calls to g_convert(),
371  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
372  * this is the GNU C converter for CP1255 which does not emit a base
373  * character until it knows that the next character is not a mark that
374  * could combine with the base character.)
375  *
376  * Characters which are valid in the input character set, but which have no
377  * representation in the output character set will result in a
378  * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv()
379  * specification, which leaves this behaviour implementation defined. Note that
380  * this is the same error code as is returned for an invalid byte sequence in
381  * the input character set. To get defined behaviour for conversion of
382  * unrepresentable characters, use g_convert_with_fallback().
383  *
384  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
385  *               If the conversion was successful, a newly allocated buffer
386  *               containing the converted string, which must be freed with
387  *               g_free(). Otherwise %NULL and @error will be set.
388  **/
389 gchar*
g_convert_with_iconv(const gchar * str,gssize len,GIConv converter,gsize * bytes_read,gsize * bytes_written,GError ** error)390 g_convert_with_iconv (const gchar *str,
391 		      gssize       len,
392 		      GIConv       converter,
393 		      gsize       *bytes_read,
394 		      gsize       *bytes_written,
395 		      GError     **error)
396 {
397   gchar *dest;
398   gchar *outp;
399   const gchar *p;
400   gsize inbytes_remaining;
401   gsize outbytes_remaining;
402   gsize err;
403   gsize outbuf_size;
404   gboolean have_error = FALSE;
405   gboolean done = FALSE;
406   gboolean reset = FALSE;
407 
408   g_return_val_if_fail (converter != (GIConv) -1, NULL);
409 
410   if (len < 0)
411     len = strlen (str);
412 
413   p = str;
414   inbytes_remaining = len;
415   outbuf_size = len + NUL_TERMINATOR_LENGTH;
416 
417   outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
418   outp = dest = g_malloc (outbuf_size);
419 
420   while (!done && !have_error)
421     {
422       if (reset)
423         err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
424       else
425         err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
426 
427       if (err == (gsize) -1)
428 	{
429 	  switch (errno)
430 	    {
431 	    case EINVAL:
432 	      /* Incomplete text, do not report an error */
433 	      done = TRUE;
434 	      break;
435 	    case E2BIG:
436 	      {
437 		gsize used = outp - dest;
438 
439 		outbuf_size *= 2;
440 		dest = g_realloc (dest, outbuf_size);
441 
442 		outp = dest + used;
443 		outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
444 	      }
445 	      break;
446 	    case EILSEQ:
447               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
448                                    _("Invalid byte sequence in conversion input"));
449 	      have_error = TRUE;
450 	      break;
451 	    default:
452               {
453                 int errsv = errno;
454 
455                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
456                              _("Error during conversion: %s"),
457                              g_strerror (errsv));
458               }
459 	      have_error = TRUE;
460 	      break;
461 	    }
462 	}
463       else if (err > 0)
464         {
465           /* @err gives the number of replacement characters used. */
466           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
467                                _("Unrepresentable character in conversion input"));
468           have_error = TRUE;
469         }
470       else
471 	{
472 	  if (!reset)
473 	    {
474 	      /* call g_iconv with NULL inbuf to cleanup shift state */
475 	      reset = TRUE;
476 	      inbytes_remaining = 0;
477 	    }
478 	  else
479 	    done = TRUE;
480 	}
481     }
482 
483   memset (outp, 0, NUL_TERMINATOR_LENGTH);
484 
485   if (bytes_read)
486     *bytes_read = p - str;
487   else
488     {
489       if ((p - str) != len)
490 	{
491           if (!have_error)
492             {
493               g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
494                                    _("Partial character sequence at end of input"));
495               have_error = TRUE;
496             }
497 	}
498     }
499 
500   if (bytes_written)
501     *bytes_written = outp - dest;	/* Doesn't include '\0' */
502 
503   if (have_error)
504     {
505       g_free (dest);
506       return NULL;
507     }
508   else
509     return dest;
510 }
511 
512 /**
513  * g_convert:
514  * @str:           (array length=len) (element-type guint8):
515  *                 the string to convert.
516  * @len:           the length of the string in bytes, or -1 if the string is
517  *                 nul-terminated (Note that some encodings may allow nul
518  *                 bytes to occur inside strings. In that case, using -1
519  *                 for the @len parameter is unsafe)
520  * @to_codeset:    name of character set into which to convert @str
521  * @from_codeset:  character set of @str.
522  * @bytes_read:    (out) (optional): location to store the number of bytes in
523  *                 the input string that were successfully converted, or %NULL.
524  *                 Even if the conversion was successful, this may be
525  *                 less than @len if there were partial characters
526  *                 at the end of the input. If the error
527  *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
528  *                 stored will be the byte offset after the last valid
529  *                 input sequence.
530  * @bytes_written: (out) (optional): the number of bytes stored in
531  *                 the output buffer (not including the terminating nul).
532  * @error:         location to store the error occurring, or %NULL to ignore
533  *                 errors. Any of the errors in #GConvertError may occur.
534  *
535  * Converts a string from one character set to another.
536  *
537  * Note that you should use g_iconv() for streaming conversions.
538  * Despite the fact that @bytes_read can return information about partial
539  * characters, the g_convert_... functions are not generally suitable
540  * for streaming. If the underlying converter maintains internal state,
541  * then this won't be preserved across successive calls to g_convert(),
542  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
543  * this is the GNU C converter for CP1255 which does not emit a base
544  * character until it knows that the next character is not a mark that
545  * could combine with the base character.)
546  *
547  * Using extensions such as "//TRANSLIT" may not work (or may not work
548  * well) on many platforms.  Consider using g_str_to_ascii() instead.
549  *
550  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
551  *          If the conversion was successful, a newly allocated buffer
552  *          containing the converted string, which must be freed with g_free().
553  *          Otherwise %NULL and @error will be set.
554  **/
555 gchar*
g_convert(const gchar * str,gssize len,const gchar * to_codeset,const gchar * from_codeset,gsize * bytes_read,gsize * bytes_written,GError ** error)556 g_convert (const gchar *str,
557            gssize       len,
558            const gchar *to_codeset,
559            const gchar *from_codeset,
560            gsize       *bytes_read,
561 	   gsize       *bytes_written,
562 	   GError     **error)
563 {
564   gchar *res;
565   GIConv cd;
566 
567   g_return_val_if_fail (str != NULL, NULL);
568   g_return_val_if_fail (to_codeset != NULL, NULL);
569   g_return_val_if_fail (from_codeset != NULL, NULL);
570 
571   cd = open_converter (to_codeset, from_codeset, error);
572 
573   if (cd == (GIConv) -1)
574     {
575       if (bytes_read)
576         *bytes_read = 0;
577 
578       if (bytes_written)
579         *bytes_written = 0;
580 
581       return NULL;
582     }
583 
584   res = g_convert_with_iconv (str, len, cd,
585 			      bytes_read, bytes_written,
586 			      error);
587 
588   close_converter (cd);
589 
590   return res;
591 }
592 
593 /**
594  * g_convert_with_fallback:
595  * @str:          (array length=len) (element-type guint8):
596  *                the string to convert.
597  * @len:          the length of the string in bytes, or -1 if the string is
598  *                 nul-terminated (Note that some encodings may allow nul
599  *                 bytes to occur inside strings. In that case, using -1
600  *                 for the @len parameter is unsafe)
601  * @to_codeset:   name of character set into which to convert @str
602  * @from_codeset: character set of @str.
603  * @fallback:     UTF-8 string to use in place of characters not
604  *                present in the target encoding. (The string must be
605  *                representable in the target encoding).
606  *                If %NULL, characters not in the target encoding will
607  *                be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
608  * @bytes_read:   (out) (optional): location to store the number of bytes in
609  *                the input string that were successfully converted, or %NULL.
610  *                Even if the conversion was successful, this may be
611  *                less than @len if there were partial characters
612  *                at the end of the input.
613  * @bytes_written: (out) (optional): the number of bytes stored in
614  *                 the output buffer (not including the terminating nul).
615  * @error:        location to store the error occurring, or %NULL to ignore
616  *                errors. Any of the errors in #GConvertError may occur.
617  *
618  * Converts a string from one character set to another, possibly
619  * including fallback sequences for characters not representable
620  * in the output. Note that it is not guaranteed that the specification
621  * for the fallback sequences in @fallback will be honored. Some
622  * systems may do an approximate conversion from @from_codeset
623  * to @to_codeset in their iconv() functions,
624  * in which case GLib will simply return that approximate conversion.
625  *
626  * Note that you should use g_iconv() for streaming conversions.
627  * Despite the fact that @bytes_read can return information about partial
628  * characters, the g_convert_... functions are not generally suitable
629  * for streaming. If the underlying converter maintains internal state,
630  * then this won't be preserved across successive calls to g_convert(),
631  * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
632  * this is the GNU C converter for CP1255 which does not emit a base
633  * character until it knows that the next character is not a mark that
634  * could combine with the base character.)
635  *
636  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
637  *          If the conversion was successful, a newly allocated buffer
638  *          containing the converted string, which must be freed with g_free().
639  *          Otherwise %NULL and @error will be set.
640  **/
641 gchar*
g_convert_with_fallback(const gchar * str,gssize len,const gchar * to_codeset,const gchar * from_codeset,const gchar * fallback,gsize * bytes_read,gsize * bytes_written,GError ** error)642 g_convert_with_fallback (const gchar *str,
643 			 gssize       len,
644 			 const gchar *to_codeset,
645 			 const gchar *from_codeset,
646 			 const gchar *fallback,
647 			 gsize       *bytes_read,
648 			 gsize       *bytes_written,
649 			 GError     **error)
650 {
651   gchar *utf8;
652   gchar *dest;
653   gchar *outp;
654   const gchar *insert_str = NULL;
655   const gchar *p;
656   gsize inbytes_remaining;
657   const gchar *save_p = NULL;
658   gsize save_inbytes = 0;
659   gsize outbytes_remaining;
660   gsize err;
661   GIConv cd;
662   gsize outbuf_size;
663   gboolean have_error = FALSE;
664   gboolean done = FALSE;
665 
666   GError *local_error = NULL;
667 
668   g_return_val_if_fail (str != NULL, NULL);
669   g_return_val_if_fail (to_codeset != NULL, NULL);
670   g_return_val_if_fail (from_codeset != NULL, NULL);
671 
672   if (len < 0)
673     len = strlen (str);
674 
675   /* Try an exact conversion; we only proceed if this fails
676    * due to an illegal sequence in the input string.
677    */
678   dest = g_convert (str, len, to_codeset, from_codeset,
679 		    bytes_read, bytes_written, &local_error);
680   if (!local_error)
681     return dest;
682 
683   if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
684     {
685       g_propagate_error (error, local_error);
686       return NULL;
687     }
688   else
689     g_error_free (local_error);
690 
691   local_error = NULL;
692 
693   /* No go; to proceed, we need a converter from "UTF-8" to
694    * to_codeset, and the string as UTF-8.
695    */
696   cd = open_converter (to_codeset, "UTF-8", error);
697   if (cd == (GIConv) -1)
698     {
699       if (bytes_read)
700         *bytes_read = 0;
701 
702       if (bytes_written)
703         *bytes_written = 0;
704 
705       return NULL;
706     }
707 
708   utf8 = g_convert (str, len, "UTF-8", from_codeset,
709 		    bytes_read, &inbytes_remaining, error);
710   if (!utf8)
711     {
712       close_converter (cd);
713       if (bytes_written)
714         *bytes_written = 0;
715       return NULL;
716     }
717 
718   /* Now the heart of the code. We loop through the UTF-8 string, and
719    * whenever we hit an offending character, we form fallback, convert
720    * the fallback to the target codeset, and then go back to
721    * converting the original string after finishing with the fallback.
722    *
723    * The variables save_p and save_inbytes store the input state
724    * for the original string while we are converting the fallback
725    */
726   p = utf8;
727 
728   outbuf_size = len + NUL_TERMINATOR_LENGTH;
729   outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
730   outp = dest = g_malloc (outbuf_size);
731 
732   while (!done && !have_error)
733     {
734       gsize inbytes_tmp = inbytes_remaining;
735       err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
736       inbytes_remaining = inbytes_tmp;
737 
738       if (err == (gsize) -1)
739 	{
740 	  switch (errno)
741 	    {
742 	    case EINVAL:
743 	      g_assert_not_reached();
744 	      break;
745 	    case E2BIG:
746 	      {
747 		gsize used = outp - dest;
748 
749 		outbuf_size *= 2;
750 		dest = g_realloc (dest, outbuf_size);
751 
752 		outp = dest + used;
753 		outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
754 
755 		break;
756 	      }
757 	    case EILSEQ:
758 	      if (save_p)
759 		{
760 		  /* Error converting fallback string - fatal
761 		   */
762 		  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
763 			       _("Cannot convert fallback “%s” to codeset “%s”"),
764 			       insert_str, to_codeset);
765 		  have_error = TRUE;
766 		  break;
767 		}
768 	      else if (p)
769 		{
770 		  if (!fallback)
771 		    {
772 		      gunichar ch = g_utf8_get_char (p);
773 		      insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
774 						    ch);
775 		    }
776 		  else
777 		    insert_str = fallback;
778 
779 		  save_p = g_utf8_next_char (p);
780 		  save_inbytes = inbytes_remaining - (save_p - p);
781 		  p = insert_str;
782 		  inbytes_remaining = strlen (p);
783 		  break;
784 		}
785               /* if p is null */
786               G_GNUC_FALLTHROUGH;
787 	    default:
788               {
789                 int errsv = errno;
790 
791                 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
792                              _("Error during conversion: %s"),
793                              g_strerror (errsv));
794               }
795 
796 	      have_error = TRUE;
797 	      break;
798 	    }
799 	}
800       else
801 	{
802 	  if (save_p)
803 	    {
804 	      if (!fallback)
805 		g_free ((gchar *)insert_str);
806 	      p = save_p;
807 	      inbytes_remaining = save_inbytes;
808 	      save_p = NULL;
809 	    }
810 	  else if (p)
811 	    {
812 	      /* call g_iconv with NULL inbuf to cleanup shift state */
813 	      p = NULL;
814 	      inbytes_remaining = 0;
815 	    }
816 	  else
817 	    done = TRUE;
818 	}
819     }
820 
821   /* Cleanup
822    */
823   memset (outp, 0, NUL_TERMINATOR_LENGTH);
824 
825   close_converter (cd);
826 
827   if (bytes_written)
828     *bytes_written = outp - dest;	/* Doesn't include '\0' */
829 
830   g_free (utf8);
831 
832   if (have_error)
833     {
834       if (save_p && !fallback)
835 	g_free ((gchar *)insert_str);
836       g_free (dest);
837       return NULL;
838     }
839   else
840     return dest;
841 }
842 
843 /*
844  * g_locale_to_utf8
845  *
846  *
847  */
848 
849 /*
850  * Validate @string as UTF-8. @len can be negative if @string is
851  * nul-terminated, or a non-negative value in bytes. If @string ends in an
852  * incomplete sequence, or contains any illegal sequences or nul codepoints,
853  * %NULL will be returned and the error set to
854  * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
855  * On success, @bytes_read and @bytes_written, if provided, will be set to
856  * the number of bytes in @string up to @len or the terminating nul byte.
857  * On error, @bytes_read will be set to the byte offset after the last valid
858  * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
859  */
860 static gchar *
strdup_len(const gchar * string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)861 strdup_len (const gchar *string,
862 	    gssize       len,
863 	    gsize       *bytes_read,
864 	    gsize       *bytes_written,
865 	    GError     **error)
866 {
867   gsize real_len;
868   const gchar *end_valid;
869 
870   if (!g_utf8_validate (string, len, &end_valid))
871     {
872       if (bytes_read)
873 	*bytes_read = end_valid - string;
874       if (bytes_written)
875 	*bytes_written = 0;
876 
877       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
878                            _("Invalid byte sequence in conversion input"));
879       return NULL;
880     }
881 
882   real_len = end_valid - string;
883 
884   if (bytes_read)
885     *bytes_read = real_len;
886   if (bytes_written)
887     *bytes_written = real_len;
888 
889   return g_strndup (string, real_len);
890 }
891 
/* Bit flags selecting which embedded-nul checks convert_checked() performs. */
typedef enum
{
  /* Reject input that contains a nul byte within the given length */
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 0x1,
  /* Reject conversion results that contain a nul byte */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 0x2
} ConvertCheckFlags;
897 
898 /*
899  * Convert from @string in the encoding identified by @from_codeset,
900  * returning a string in the encoding identifed by @to_codeset.
901  * @len can be negative if @string is nul-terminated, or a non-negative
902  * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
903  * to check the input, the output, or both, for embedded nul bytes.
904  * On success, @bytes_read, if provided, will be set to the number of bytes
905  * in @string up to @len or the terminating nul byte, and @bytes_written, if
906  * provided, will be set to the number of output bytes written into the
907  * returned buffer, excluding the terminating nul sequence.
908  * On error, @bytes_read will be set to the byte offset after the last valid
909  * sequence in @string, and @bytes_written will be set to 0.
910  */
911 static gchar *
convert_checked(const gchar * string,gssize len,const gchar * to_codeset,const gchar * from_codeset,ConvertCheckFlags flags,gsize * bytes_read,gsize * bytes_written,GError ** error)912 convert_checked (const gchar      *string,
913                  gssize            len,
914                  const gchar      *to_codeset,
915                  const gchar      *from_codeset,
916                  ConvertCheckFlags flags,
917                  gsize            *bytes_read,
918                  gsize            *bytes_written,
919                  GError          **error)
920 {
921   gchar *out;
922   gsize outbytes;
923 
924   if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0)
925     {
926       const gchar *early_nul = memchr (string, '\0', len);
927       if (early_nul != NULL)
928         {
929           if (bytes_read)
930             *bytes_read = early_nul - string;
931           if (bytes_written)
932             *bytes_written = 0;
933 
934           g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
935                                _("Embedded NUL byte in conversion input"));
936           return NULL;
937         }
938     }
939 
940   out = g_convert (string, len, to_codeset, from_codeset,
941                    bytes_read, &outbytes, error);
942   if (out == NULL)
943     {
944       if (bytes_written)
945         *bytes_written = 0;
946       return NULL;
947     }
948 
949   if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
950       && memchr (out, '\0', outbytes) != NULL)
951     {
952       g_free (out);
953       if (bytes_written)
954         *bytes_written = 0;
955       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
956                            _("Embedded NUL byte in conversion output"));
957       return NULL;
958     }
959 
960   if (bytes_written)
961     *bytes_written = outbytes;
962   return out;
963 }
964 
965 /**
966  * g_locale_to_utf8:
967  * @opsysstring:   (array length=len) (element-type guint8): a string in the
968  *                 encoding of the current locale. On Windows
969  *                 this means the system codepage.
970  * @len:           the length of the string, or -1 if the string is
971  *                 nul-terminated (Note that some encodings may allow nul
972  *                 bytes to occur inside strings. In that case, using -1
973  *                 for the @len parameter is unsafe)
974  * @bytes_read: (out) (optional): location to store the number of bytes in the
975  *                 input string that were successfully converted, or %NULL.
976  *                 Even if the conversion was successful, this may be
977  *                 less than @len if there were partial characters
978  *                 at the end of the input. If the error
979  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
980  *                 stored will be the byte offset after the last valid
981  *                 input sequence.
982  * @bytes_written: (out) (optional): the number of bytes stored in the output
983  *                 buffer (not including the terminating nul).
984  * @error:         location to store the error occurring, or %NULL to ignore
985  *                 errors. Any of the errors in #GConvertError may occur.
986  *
987  * Converts a string which is in the encoding used for strings by
988  * the C runtime (usually the same as that used by the operating
989  * system) in the [current locale][setlocale] into a UTF-8 string.
990  *
991  * If the source encoding is not UTF-8 and the conversion output contains a
992  * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
993  * function returns %NULL.
994  * If the source encoding is UTF-8, an embedded nul character is treated with
995  * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
996  * earlier versions of this library. Use g_convert() to produce output that
997  * may contain embedded nul characters.
998  *
999  * Returns: (type utf8): The converted string, or %NULL on an error.
1000  **/
1001 gchar *
g_locale_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1002 g_locale_to_utf8 (const gchar  *opsysstring,
1003 		  gssize        len,
1004 		  gsize        *bytes_read,
1005 		  gsize        *bytes_written,
1006 		  GError      **error)
1007 {
1008   const char *charset;
1009 
1010   if (g_get_charset (&charset))
1011     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1012   else
1013     return convert_checked (opsysstring, len, "UTF-8", charset,
1014                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1015                             bytes_read, bytes_written, error);
1016 }
1017 
1018 /**
1019  * g_locale_from_utf8:
1020  * @utf8string:    a UTF-8 encoded string
1021  * @len:           the length of the string, or -1 if the string is
1022  *                 nul-terminated.
1023  * @bytes_read: (out) (optional): location to store the number of bytes in the
1024  *                 input string that were successfully converted, or %NULL.
1025  *                 Even if the conversion was successful, this may be
1026  *                 less than @len if there were partial characters
1027  *                 at the end of the input. If the error
1028  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1029  *                 stored will be the byte offset after the last valid
1030  *                 input sequence.
1031  * @bytes_written: (out) (optional): the number of bytes stored in the output
1032  *                 buffer (not including the terminating nul).
1033  * @error:         location to store the error occurring, or %NULL to ignore
1034  *                 errors. Any of the errors in #GConvertError may occur.
1035  *
1036  * Converts a string from UTF-8 to the encoding used for strings by
1037  * the C runtime (usually the same as that used by the operating
1038  * system) in the [current locale][setlocale]. On Windows this means
1039  * the system codepage.
1040  *
1041  * The input string shall not contain nul characters even if the @len
1042  * argument is positive. A nul character found inside the string will result
1043  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
1044  * input that may contain embedded nul characters.
1045  *
1046  * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
1047  *          A newly-allocated buffer containing the converted string,
1048  *          or %NULL on an error, and error will be set.
1049  **/
1050 gchar *
g_locale_from_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1051 g_locale_from_utf8 (const gchar *utf8string,
1052 		    gssize       len,
1053 		    gsize       *bytes_read,
1054 		    gsize       *bytes_written,
1055 		    GError     **error)
1056 {
1057   const gchar *charset;
1058 
1059   if (g_get_charset (&charset))
1060     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1061   else
1062     return convert_checked (utf8string, len, charset, "UTF-8",
1063                             CONVERT_CHECK_NO_NULS_IN_INPUT,
1064                             bytes_read, bytes_written, error);
1065 }
1066 
1067 #ifndef G_PLATFORM_WIN32
1068 
1069 typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
1070 
1071 struct _GFilenameCharsetCache {
1072   gboolean is_utf8;
1073   gchar *charset;
1074   gchar **filename_charsets;
1075 };
1076 
1077 static void
filename_charset_cache_free(gpointer data)1078 filename_charset_cache_free (gpointer data)
1079 {
1080   GFilenameCharsetCache *cache = data;
1081   g_free (cache->charset);
1082   g_strfreev (cache->filename_charsets);
1083   g_free (cache);
1084 }
1085 
1086 /**
1087  * g_get_filename_charsets:
1088  * @filename_charsets: (out) (transfer none) (array zero-terminated=1):
1089  *    return location for the %NULL-terminated list of encoding names
1090  *
1091  * Determines the preferred character sets used for filenames.
1092  * The first character set from @filename_charsets is the filename encoding, the
1093  * subsequent character sets are used when trying to generate a displayable
1094  * representation of a filename, see g_filename_display_name().
1095  *
1096  * On Unix, the character sets are determined by consulting the
1097  * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.
1098  * On Windows, the character set used in the GLib API is always UTF-8
1099  * and said environment variables have no effect.
1100  *
1101  * `G_FILENAME_ENCODING` may be set to a comma-separated list of
1102  * character set names. The special token "\@locale" is taken
1103  * to mean the character set for the [current locale][setlocale].
1104  * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,
1105  * the character set of the current locale is taken as the filename
1106  * encoding. If neither environment variable is set, UTF-8 is taken
1107  * as the filename encoding, but the character set of the current locale
1108  * is also put in the list of encodings.
1109  *
1110  * The returned @filename_charsets belong to GLib and must not be freed.
1111  *
1112  * Note that on Unix, regardless of the locale character set or
1113  * `G_FILENAME_ENCODING` value, the actual file names present
1114  * on a system might be in any random encoding or just gibberish.
1115  *
1116  * Returns: %TRUE if the filename encoding is UTF-8.
1117  *
1118  * Since: 2.6
1119  */
1120 gboolean
g_get_filename_charsets(const gchar *** filename_charsets)1121 g_get_filename_charsets (const gchar ***filename_charsets)
1122 {
1123   static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);
1124   GFilenameCharsetCache *cache = g_private_get (&cache_private);
1125   const gchar *charset;
1126 
1127   if (!cache)
1128     cache = g_private_set_alloc0 (&cache_private, sizeof (GFilenameCharsetCache));
1129 
1130   g_get_charset (&charset);
1131 
1132   if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1133     {
1134       const gchar *new_charset;
1135       const gchar *p;
1136       gint i;
1137 
1138       g_free (cache->charset);
1139       g_strfreev (cache->filename_charsets);
1140       cache->charset = g_strdup (charset);
1141 
1142       p = g_getenv ("G_FILENAME_ENCODING");
1143       if (p != NULL && p[0] != '\0')
1144 	{
1145 	  cache->filename_charsets = g_strsplit (p, ",", 0);
1146 	  cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1147 
1148 	  for (i = 0; cache->filename_charsets[i]; i++)
1149 	    {
1150 	      if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1151 		{
1152 		  g_get_charset (&new_charset);
1153 		  g_free (cache->filename_charsets[i]);
1154 		  cache->filename_charsets[i] = g_strdup (new_charset);
1155 		}
1156 	    }
1157 	}
1158       else if (g_getenv ("G_BROKEN_FILENAMES") != NULL)
1159 	{
1160 	  cache->filename_charsets = g_new0 (gchar *, 2);
1161 	  cache->is_utf8 = g_get_charset (&new_charset);
1162 	  cache->filename_charsets[0] = g_strdup (new_charset);
1163 	}
1164       else
1165 	{
1166 	  cache->filename_charsets = g_new0 (gchar *, 3);
1167 	  cache->is_utf8 = TRUE;
1168 	  cache->filename_charsets[0] = g_strdup ("UTF-8");
1169 	  if (!g_get_charset (&new_charset))
1170 	    cache->filename_charsets[1] = g_strdup (new_charset);
1171 	}
1172     }
1173 
1174   if (filename_charsets)
1175     *filename_charsets = (const gchar **)cache->filename_charsets;
1176 
1177   return cache->is_utf8;
1178 }
1179 
1180 #else /* G_PLATFORM_WIN32 */
1181 
1182 gboolean
g_get_filename_charsets(const gchar *** filename_charsets)1183 g_get_filename_charsets (const gchar ***filename_charsets)
1184 {
1185   static const gchar *charsets[] = {
1186     "UTF-8",
1187     NULL
1188   };
1189 
1190 #ifdef G_OS_WIN32
1191   /* On Windows GLib pretends that the filename charset is UTF-8 */
1192   if (filename_charsets)
1193     *filename_charsets = charsets;
1194 
1195   return TRUE;
1196 #else
1197   gboolean result;
1198 
1199   /* Cygwin works like before */
1200   result = g_get_charset (&(charsets[0]));
1201 
1202   if (filename_charsets)
1203     *filename_charsets = charsets;
1204 
1205   return result;
1206 #endif
1207 }
1208 
1209 #endif /* G_PLATFORM_WIN32 */
1210 
1211 static gboolean
get_filename_charset(const gchar ** filename_charset)1212 get_filename_charset (const gchar **filename_charset)
1213 {
1214   const gchar **charsets;
1215   gboolean is_utf8;
1216 
1217   is_utf8 = g_get_filename_charsets (&charsets);
1218 
1219   if (filename_charset)
1220     *filename_charset = charsets[0];
1221 
1222   return is_utf8;
1223 }
1224 
1225 /**
1226  * g_filename_to_utf8:
1227  * @opsysstring: (type filename): a string in the encoding for filenames
1228  * @len:           the length of the string, or -1 if the string is
1229  *                 nul-terminated (Note that some encodings may allow nul
1230  *                 bytes to occur inside strings. In that case, using -1
1231  *                 for the @len parameter is unsafe)
1232  * @bytes_read: (out) (optional): location to store the number of bytes in the
1233  *                 input string that were successfully converted, or %NULL.
1234  *                 Even if the conversion was successful, this may be
1235  *                 less than @len if there were partial characters
1236  *                 at the end of the input. If the error
1237  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1238  *                 stored will be the byte offset after the last valid
1239  *                 input sequence.
1240  * @bytes_written: (out) (optional): the number of bytes stored in the output
1241  *                 buffer (not including the terminating nul).
1242  * @error:         location to store the error occurring, or %NULL to ignore
1243  *                 errors. Any of the errors in #GConvertError may occur.
1244  *
1245  * Converts a string which is in the encoding used by GLib for
1246  * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1247  * for filenames; on other platforms, this function indirectly depends on
1248  * the [current locale][setlocale].
1249  *
1250  * The input string shall not contain nul characters even if the @len
1251  * argument is positive. A nul character found inside the string will result
1252  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
1253  * If the source encoding is not UTF-8 and the conversion output contains a
1254  * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
1255  * function returns %NULL. Use g_convert() to produce output that
1256  * may contain embedded nul characters.
1257  *
1258  * Returns: (type utf8): The converted string, or %NULL on an error.
1259  **/
1260 gchar*
g_filename_to_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1261 g_filename_to_utf8 (const gchar *opsysstring,
1262 		    gssize       len,
1263 		    gsize       *bytes_read,
1264 		    gsize       *bytes_written,
1265 		    GError     **error)
1266 {
1267   const gchar *charset;
1268 
1269   g_return_val_if_fail (opsysstring != NULL, NULL);
1270 
1271   if (get_filename_charset (&charset))
1272     return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1273   else
1274     return convert_checked (opsysstring, len, "UTF-8", charset,
1275                             CONVERT_CHECK_NO_NULS_IN_INPUT |
1276                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1277                             bytes_read, bytes_written, error);
1278 }
1279 
1280 /**
1281  * g_filename_from_utf8:
1282  * @utf8string:    (type utf8): a UTF-8 encoded string.
1283  * @len:           the length of the string, or -1 if the string is
1284  *                 nul-terminated.
1285  * @bytes_read:    (out) (optional): location to store the number of bytes in
1286  *                 the input string that were successfully converted, or %NULL.
1287  *                 Even if the conversion was successful, this may be
1288  *                 less than @len if there were partial characters
1289  *                 at the end of the input. If the error
1290  *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1291  *                 stored will be the byte offset after the last valid
1292  *                 input sequence.
1293  * @bytes_written: (out) (optional): the number of bytes stored in
1294  *                 the output buffer (not including the terminating nul).
1295  * @error:         location to store the error occurring, or %NULL to ignore
1296  *                 errors. Any of the errors in #GConvertError may occur.
1297  *
1298  * Converts a string from UTF-8 to the encoding GLib uses for
1299  * filenames. Note that on Windows GLib uses UTF-8 for filenames;
1300  * on other platforms, this function indirectly depends on the
1301  * [current locale][setlocale].
1302  *
1303  * The input string shall not contain nul characters even if the @len
1304  * argument is positive. A nul character found inside the string will result
1305  * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is
1306  * not UTF-8 and the conversion output contains a nul character, the error
1307  * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL.
1308  *
1309  * Returns: (type filename):
1310  *               The converted string, or %NULL on an error.
1311  **/
1312 gchar*
g_filename_from_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1313 g_filename_from_utf8 (const gchar *utf8string,
1314 		      gssize       len,
1315 		      gsize       *bytes_read,
1316 		      gsize       *bytes_written,
1317 		      GError     **error)
1318 {
1319   const gchar *charset;
1320 
1321   if (get_filename_charset (&charset))
1322     return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1323   else
1324     return convert_checked (utf8string, len, charset, "UTF-8",
1325                             CONVERT_CHECK_NO_NULS_IN_INPUT |
1326                             CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1327                             bytes_read, bytes_written, error);
1328 }
1329 
1330 /* Test of haystack has the needle prefix, comparing case
1331  * insensitive. haystack may be UTF-8, but needle must
1332  * contain only ascii. */
1333 static gboolean
has_case_prefix(const gchar * haystack,const gchar * needle)1334 has_case_prefix (const gchar *haystack, const gchar *needle)
1335 {
1336   const gchar *h, *n;
1337 
1338   /* Eat one character at a time. */
1339   h = haystack;
1340   n = needle;
1341 
1342   while (*n && *h &&
1343 	 g_ascii_tolower (*n) == g_ascii_tolower (*h))
1344     {
1345       n++;
1346       h++;
1347     }
1348 
1349   return *n == '\0';
1350 }
1351 
/* Escaping policies understood by g_escape_uri_string(). Each value is a
 * single bit that is tested against the entries of the acceptable[]
 * table. */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+'  */
  UNSAFE_PATH       = 1 << 3,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1359 
1360 static const guchar acceptable[96] = {
1361   /* A table of the ASCII chars from space (32) to DEL (127) */
1362   /*      !    "    #    $    %    &    '    (    )    *    +    ,    -    .    / */
1363   0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
1364   /* 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ? */
1365   0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
1366   /* @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O */
1367   0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1368   /* P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _ */
1369   0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
1370   /* `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o */
1371   0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1372   /* p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL */
1373   0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
1374 };
1375 
1376 static const gchar hex[16] = "0123456789ABCDEF";
1377 
1378 /* Note: This escape function works on file: URIs, but if you want to
1379  * escape something else, please read RFC-2396 */
1380 static gchar *
g_escape_uri_string(const gchar * string,UnsafeCharacterSet mask)1381 g_escape_uri_string (const gchar *string,
1382 		     UnsafeCharacterSet mask)
1383 {
1384 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1385 
1386   const gchar *p;
1387   gchar *q;
1388   gchar *result;
1389   int c;
1390   gint unacceptable;
1391   UnsafeCharacterSet use_mask;
1392 
1393   g_return_val_if_fail (mask == UNSAFE_ALL
1394 			|| mask == UNSAFE_ALLOW_PLUS
1395 			|| mask == UNSAFE_PATH
1396 			|| mask == UNSAFE_HOST
1397 			|| mask == UNSAFE_SLASHES, NULL);
1398 
1399   unacceptable = 0;
1400   use_mask = mask;
1401   for (p = string; *p != '\0'; p++)
1402     {
1403       c = (guchar) *p;
1404       if (!ACCEPTABLE (c))
1405 	unacceptable++;
1406     }
1407 
1408   result = g_malloc (p - string + unacceptable * 2 + 1);
1409 
1410   use_mask = mask;
1411   for (q = result, p = string; *p != '\0'; p++)
1412     {
1413       c = (guchar) *p;
1414 
1415       if (!ACCEPTABLE (c))
1416 	{
1417 	  *q++ = '%'; /* means hex coming */
1418 	  *q++ = hex[c >> 4];
1419 	  *q++ = hex[c & 15];
1420 	}
1421       else
1422 	*q++ = *p;
1423     }
1424 
1425   *q = '\0';
1426 
1427   return result;
1428 }
1429 
1430 
1431 static gchar *
g_escape_file_uri(const gchar * hostname,const gchar * pathname)1432 g_escape_file_uri (const gchar *hostname,
1433 		   const gchar *pathname)
1434 {
1435   char *escaped_hostname = NULL;
1436   char *escaped_path;
1437   char *res;
1438 
1439 #ifdef G_OS_WIN32
1440   char *p, *backslash;
1441 
1442   /* Turn backslashes into forward slashes. That's what Netscape
1443    * does, and they are actually more or less equivalent in Windows.
1444    */
1445 
1446   pathname = g_strdup (pathname);
1447   p = (char *) pathname;
1448 
1449   while ((backslash = strchr (p, '\\')) != NULL)
1450     {
1451       *backslash = '/';
1452       p = backslash + 1;
1453     }
1454 #endif
1455 
1456   if (hostname && *hostname != '\0')
1457     {
1458       escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1459     }
1460 
1461   escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1462 
1463   res = g_strconcat ("file://",
1464 		     (escaped_hostname) ? escaped_hostname : "",
1465 		     (*escaped_path != '/') ? "/" : "",
1466 		     escaped_path,
1467 		     NULL);
1468 
1469 #ifdef G_OS_WIN32
1470   g_free ((char *) pathname);
1471 #endif
1472 
1473   g_free (escaped_hostname);
1474   g_free (escaped_path);
1475 
1476   return res;
1477 }
1478 
/* Decode the two hexadecimal digits at @scanner into a byte value.
 * Returns -1 if either character is not a hex digit; the second
 * character is only examined when the first one is valid. */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return (high_nibble << 4) | low_nibble;
}
1495 
1496 static gchar *
g_unescape_uri_string(const char * escaped,int len,const char * illegal_escaped_characters,gboolean ascii_must_not_be_escaped)1497 g_unescape_uri_string (const char *escaped,
1498 		       int         len,
1499 		       const char *illegal_escaped_characters,
1500 		       gboolean    ascii_must_not_be_escaped)
1501 {
1502   const gchar *in, *in_end;
1503   gchar *out, *result;
1504   int c;
1505 
1506   if (escaped == NULL)
1507     return NULL;
1508 
1509   if (len < 0)
1510     len = strlen (escaped);
1511 
1512   result = g_malloc (len + 1);
1513 
1514   out = result;
1515   for (in = escaped, in_end = escaped + len; in < in_end; in++)
1516     {
1517       c = *in;
1518 
1519       if (c == '%')
1520 	{
1521 	  /* catch partial escape sequences past the end of the substring */
1522 	  if (in + 3 > in_end)
1523 	    break;
1524 
1525 	  c = unescape_character (in + 1);
1526 
1527 	  /* catch bad escape sequences and NUL characters */
1528 	  if (c <= 0)
1529 	    break;
1530 
1531 	  /* catch escaped ASCII */
1532 	  if (ascii_must_not_be_escaped && c <= 0x7F)
1533 	    break;
1534 
1535 	  /* catch other illegal escaped characters */
1536 	  if (strchr (illegal_escaped_characters, c) != NULL)
1537 	    break;
1538 
1539 	  in += 2;
1540 	}
1541 
1542       *out++ = c;
1543     }
1544 
1545   g_assert (out - result <= len);
1546   *out = '\0';
1547 
1548   if (in != in_end)
1549     {
1550       g_free (result);
1551       return NULL;
1552     }
1553 
1554   return result;
1555 }
1556 
1557 static gboolean
is_asciialphanum(gunichar c)1558 is_asciialphanum (gunichar c)
1559 {
1560   return c <= 0x7F && g_ascii_isalnum (c);
1561 }
1562 
1563 static gboolean
is_asciialpha(gunichar c)1564 is_asciialpha (gunichar c)
1565 {
1566   return c <= 0x7F && g_ascii_isalpha (c);
1567 }
1568 
1569 /* allows an empty string */
1570 static gboolean
hostname_validate(const char * hostname)1571 hostname_validate (const char *hostname)
1572 {
1573   const char *p;
1574   gunichar c, first_char, last_char;
1575 
1576   p = hostname;
1577   if (*p == '\0')
1578     return TRUE;
1579   do
1580     {
1581       /* read in a label */
1582       c = g_utf8_get_char (p);
1583       p = g_utf8_next_char (p);
1584       if (!is_asciialphanum (c))
1585 	return FALSE;
1586       first_char = c;
1587       do
1588 	{
1589 	  last_char = c;
1590 	  c = g_utf8_get_char (p);
1591 	  p = g_utf8_next_char (p);
1592 	}
1593       while (is_asciialphanum (c) || c == '-');
1594       if (last_char == '-')
1595 	return FALSE;
1596 
1597       /* if that was the last label, check that it was a toplabel */
1598       if (c == '\0' || (c == '.' && *p == '\0'))
1599 	return is_asciialpha (first_char);
1600     }
1601   while (c == '.');
1602   return FALSE;
1603 }
1604 
1605 /**
1606  * g_filename_from_uri:
1607  * @uri: a uri describing a filename (escaped, encoded in ASCII).
1608  * @hostname: (out) (optional) (nullable): Location to store hostname for the URI.
1609  *            If there is no hostname in the URI, %NULL will be
1610  *            stored in this location.
1611  * @error: location to store the error occurring, or %NULL to ignore
1612  *         errors. Any of the errors in #GConvertError may occur.
1613  *
1614  * Converts an escaped ASCII-encoded URI to a local filename in the
1615  * encoding used for filenames.
1616  *
1617  * Returns: (type filename): a newly-allocated string holding
1618  *               the resulting filename, or %NULL on an error.
1619  **/
1620 gchar *
g_filename_from_uri(const gchar * uri,gchar ** hostname,GError ** error)1621 g_filename_from_uri (const gchar *uri,
1622 		     gchar      **hostname,
1623 		     GError     **error)
1624 {
1625   const char *path_part;
1626   const char *host_part;
1627   char *unescaped_hostname;
1628   char *result;
1629   char *filename;
1630   int offs;
1631 #ifdef G_OS_WIN32
1632   char *p, *slash;
1633 #endif
1634 
1635   if (hostname)
1636     *hostname = NULL;
1637 
1638   if (!has_case_prefix (uri, "file:/"))
1639     {
1640       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1641 		   _("The URI “%s” is not an absolute URI using the “file” scheme"),
1642 		   uri);
1643       return NULL;
1644     }
1645 
1646   path_part = uri + strlen ("file:");
1647 
1648   if (strchr (path_part, '#') != NULL)
1649     {
1650       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1651 		   _("The local file URI “%s” may not include a “#”"),
1652 		   uri);
1653       return NULL;
1654     }
1655 
1656   if (has_case_prefix (path_part, "///"))
1657     path_part += 2;
1658   else if (has_case_prefix (path_part, "//"))
1659     {
1660       path_part += 2;
1661       host_part = path_part;
1662 
1663       path_part = strchr (path_part, '/');
1664 
1665       if (path_part == NULL)
1666 	{
1667 	  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1668 		       _("The URI “%s” is invalid"),
1669 		       uri);
1670 	  return NULL;
1671 	}
1672 
1673       unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1674 
1675       if (unescaped_hostname == NULL ||
1676 	  !hostname_validate (unescaped_hostname))
1677 	{
1678 	  g_free (unescaped_hostname);
1679 	  g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1680 		       _("The hostname of the URI “%s” is invalid"),
1681 		       uri);
1682 	  return NULL;
1683 	}
1684 
1685       if (hostname)
1686 	*hostname = unescaped_hostname;
1687       else
1688 	g_free (unescaped_hostname);
1689     }
1690 
1691   filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1692 
1693   if (filename == NULL)
1694     {
1695       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1696 		   _("The URI “%s” contains invalidly escaped characters"),
1697 		   uri);
1698       return NULL;
1699     }
1700 
1701   offs = 0;
1702 #ifdef G_OS_WIN32
1703   /* Drop localhost */
1704   if (hostname && *hostname != NULL &&
1705       g_ascii_strcasecmp (*hostname, "localhost") == 0)
1706     {
1707       g_free (*hostname);
1708       *hostname = NULL;
1709     }
1710 
1711   /* Turn slashes into backslashes, because that's the canonical spelling */
1712   p = filename;
1713   while ((slash = strchr (p, '/')) != NULL)
1714     {
1715       *slash = '\\';
1716       p = slash + 1;
1717     }
1718 
1719   /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1720    * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1721    * the filename from the drive letter.
1722    */
1723   if (g_ascii_isalpha (filename[1]))
1724     {
1725       if (filename[2] == ':')
1726 	offs = 1;
1727       else if (filename[2] == '|')
1728 	{
1729 	  filename[2] = ':';
1730 	  offs = 1;
1731 	}
1732     }
1733 #endif
1734 
1735   result = g_strdup (filename + offs);
1736   g_free (filename);
1737 
1738   return result;
1739 }
1740 
1741 /**
1742  * g_filename_to_uri:
1743  * @filename: (type filename): an absolute filename specified in the GLib file
1744  *     name encoding, which is the on-disk file name bytes on Unix, and UTF-8
1745  *     on Windows
1746  * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none.
1747  * @error: location to store the error occurring, or %NULL to ignore
1748  *         errors. Any of the errors in #GConvertError may occur.
1749  *
1750  * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1751  * component following Section 3.3. of RFC 2396.
1752  *
1753  * Returns: a newly-allocated string holding the resulting
1754  *               URI, or %NULL on an error.
1755  **/
1756 gchar *
g_filename_to_uri(const gchar * filename,const gchar * hostname,GError ** error)1757 g_filename_to_uri (const gchar *filename,
1758 		   const gchar *hostname,
1759 		   GError     **error)
1760 {
1761   char *escaped_uri;
1762 
1763   g_return_val_if_fail (filename != NULL, NULL);
1764 
1765   if (!g_path_is_absolute (filename))
1766     {
1767       g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1768 		   _("The pathname “%s” is not an absolute path"),
1769 		   filename);
1770       return NULL;
1771     }
1772 
1773   if (hostname &&
1774       !(g_utf8_validate (hostname, -1, NULL)
1775 	&& hostname_validate (hostname)))
1776     {
1777       g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1778                            _("Invalid hostname"));
1779       return NULL;
1780     }
1781 
1782 #ifdef G_OS_WIN32
1783   /* Don't use localhost unnecessarily */
1784   if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
1785     hostname = NULL;
1786 #endif
1787 
1788   escaped_uri = g_escape_file_uri (hostname, filename);
1789 
1790   return escaped_uri;
1791 }
1792 
1793 /**
1794  * g_uri_list_extract_uris:
1795  * @uri_list: an URI list
1796  *
1797  * Splits an URI list conforming to the text/uri-list
1798  * mime type defined in RFC 2483 into individual URIs,
1799  * discarding any comments. The URIs are not validated.
1800  *
1801  * Returns: (transfer full): a newly allocated %NULL-terminated list
1802  *   of strings holding the individual URIs. The array should be freed
1803  *   with g_strfreev().
1804  *
1805  * Since: 2.6
1806  */
1807 gchar **
g_uri_list_extract_uris(const gchar * uri_list)1808 g_uri_list_extract_uris (const gchar *uri_list)
1809 {
1810   GPtrArray *uris;
1811   const gchar *p, *q;
1812 
1813   uris = g_ptr_array_new ();
1814 
1815   p = uri_list;
1816 
1817   /* We don't actually try to validate the URI according to RFC
1818    * 2396, or even check for allowed characters - we just ignore
1819    * comments and trim whitespace off the ends.  We also
1820    * allow LF delimination as well as the specified CRLF.
1821    *
1822    * We do allow comments like specified in RFC 2483.
1823    */
1824   while (p)
1825     {
1826       if (*p != '#')
1827 	{
1828 	  while (g_ascii_isspace (*p))
1829 	    p++;
1830 
1831 	  q = p;
1832 	  while (*q && (*q != '\n') && (*q != '\r'))
1833 	    q++;
1834 
1835 	  if (q > p)
1836 	    {
1837 	      q--;
1838 	      while (q > p && g_ascii_isspace (*q))
1839 		q--;
1840 
1841 	      if (q > p)
1842                 g_ptr_array_add (uris, g_strndup (p, q - p + 1));
1843             }
1844         }
1845       p = strchr (p, '\n');
1846       if (p)
1847 	p++;
1848     }
1849 
1850   g_ptr_array_add (uris, NULL);
1851 
1852   return (gchar **) g_ptr_array_free (uris, FALSE);
1853 }
1854 
1855 /**
1856  * g_filename_display_basename:
1857  * @filename: (type filename): an absolute pathname in the
1858  *     GLib file name encoding
1859  *
1860  * Returns the display basename for the particular filename, guaranteed
1861  * to be valid UTF-8. The display name might not be identical to the filename,
1862  * for instance there might be problems converting it to UTF-8, and some files
1863  * can be translated in the display.
1864  *
1865  * If GLib cannot make sense of the encoding of @filename, as a last resort it
1866  * replaces unknown characters with U+FFFD, the Unicode replacement character.
1867  * You can search the result for the UTF-8 encoding of this character (which is
1868  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1869  * encoding.
1870  *
1871  * You must pass the whole absolute pathname to this functions so that
1872  * translation of well known locations can be done.
1873  *
1874  * This function is preferred over g_filename_display_name() if you know the
1875  * whole path, as it allows translation.
1876  *
1877  * Returns: a newly allocated string containing
1878  *   a rendition of the basename of the filename in valid UTF-8
1879  *
1880  * Since: 2.6
1881  **/
1882 gchar *
g_filename_display_basename(const gchar * filename)1883 g_filename_display_basename (const gchar *filename)
1884 {
1885   char *basename;
1886   char *display_name;
1887 
1888   g_return_val_if_fail (filename != NULL, NULL);
1889 
1890   basename = g_path_get_basename (filename);
1891   display_name = g_filename_display_name (basename);
1892   g_free (basename);
1893   return display_name;
1894 }
1895 
1896 /**
1897  * g_filename_display_name:
1898  * @filename: (type filename): a pathname hopefully in the
1899  *     GLib file name encoding
1900  *
1901  * Converts a filename into a valid UTF-8 string. The conversion is
1902  * not necessarily reversible, so you should keep the original around
1903  * and use the return value of this function only for display purposes.
1904  * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
1905  * even if the filename actually isn't in the GLib file name encoding.
1906  *
1907  * If GLib cannot make sense of the encoding of @filename, as a last resort it
1908  * replaces unknown characters with U+FFFD, the Unicode replacement character.
1909  * You can search the result for the UTF-8 encoding of this character (which is
1910  * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1911  * encoding.
1912  *
1913  * If you know the whole pathname of the file you should use
1914  * g_filename_display_basename(), since that allows location-based
1915  * translation of filenames.
1916  *
1917  * Returns: a newly allocated string containing
1918  *   a rendition of the filename in valid UTF-8
1919  *
1920  * Since: 2.6
1921  **/
1922 gchar *
g_filename_display_name(const gchar * filename)1923 g_filename_display_name (const gchar *filename)
1924 {
1925   gint i;
1926   const gchar **charsets;
1927   gchar *display_name = NULL;
1928   gboolean is_utf8;
1929 
1930   is_utf8 = g_get_filename_charsets (&charsets);
1931 
1932   if (is_utf8)
1933     {
1934       if (g_utf8_validate (filename, -1, NULL))
1935 	display_name = g_strdup (filename);
1936     }
1937 
1938   if (!display_name)
1939     {
1940       /* Try to convert from the filename charsets to UTF-8.
1941        * Skip the first charset if it is UTF-8.
1942        */
1943       for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
1944 	{
1945 	  display_name = g_convert (filename, -1, "UTF-8", charsets[i],
1946 				    NULL, NULL, NULL);
1947 
1948 	  if (display_name)
1949 	    break;
1950 	}
1951     }
1952 
1953   /* if all conversions failed, we replace invalid UTF-8
1954    * by a question mark
1955    */
1956   if (!display_name)
1957     display_name = g_utf8_make_valid (filename, -1);
1958 
1959   return display_name;
1960 }
1961 
1962 #ifdef G_OS_WIN32
1963 
1964 /* Binary compatibility versions. Not for newly compiled code. */
1965 
1966 _GLIB_EXTERN gchar *g_filename_to_utf8_utf8   (const gchar  *opsysstring,
1967                                                gssize        len,
1968                                                gsize        *bytes_read,
1969                                                gsize        *bytes_written,
1970                                                GError      **error) G_GNUC_MALLOC;
1971 _GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar  *utf8string,
1972                                                gssize        len,
1973                                                gsize        *bytes_read,
1974                                                gsize        *bytes_written,
1975                                                GError      **error) G_GNUC_MALLOC;
1976 _GLIB_EXTERN gchar *g_filename_from_uri_utf8  (const gchar  *uri,
1977                                                gchar       **hostname,
1978                                                GError      **error) G_GNUC_MALLOC;
1979 _GLIB_EXTERN gchar *g_filename_to_uri_utf8    (const gchar  *filename,
1980                                                const gchar  *hostname,
1981                                                GError      **error) G_GNUC_MALLOC;
1982 
1983 gchar *
g_filename_to_utf8_utf8(const gchar * opsysstring,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1984 g_filename_to_utf8_utf8 (const gchar *opsysstring,
1985                          gssize       len,
1986                          gsize       *bytes_read,
1987                          gsize       *bytes_written,
1988                          GError     **error)
1989 {
1990   return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);
1991 }
1992 
1993 gchar *
g_filename_from_utf8_utf8(const gchar * utf8string,gssize len,gsize * bytes_read,gsize * bytes_written,GError ** error)1994 g_filename_from_utf8_utf8 (const gchar *utf8string,
1995                            gssize       len,
1996                            gsize       *bytes_read,
1997                            gsize       *bytes_written,
1998                            GError     **error)
1999 {
2000   return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);
2001 }
2002 
2003 gchar *
g_filename_from_uri_utf8(const gchar * uri,gchar ** hostname,GError ** error)2004 g_filename_from_uri_utf8 (const gchar *uri,
2005                           gchar      **hostname,
2006                           GError     **error)
2007 {
2008   return g_filename_from_uri (uri, hostname, error);
2009 }
2010 
2011 gchar *
g_filename_to_uri_utf8(const gchar * filename,const gchar * hostname,GError ** error)2012 g_filename_to_uri_utf8 (const gchar *filename,
2013                         const gchar *hostname,
2014                         GError     **error)
2015 {
2016   return g_filename_to_uri (filename, hostname, error);
2017 }
2018 
2019 #endif
2020