• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Transcoding support for CUPS.
3  *
4  * Copyright © 2020-2024 by OpenPrinting.
5  * Copyright 2007-2014 by Apple Inc.
6  * Copyright 1997-2007 by Easy Software Products.
7  *
8  * Licensed under Apache License v2.0.  See the file "LICENSE" for more information.
9  */
10 
11 /*
12  * Include necessary headers...
13  */
14 
15 #include "cups-private.h"
16 #include "debug-internal.h"
17 #include <limits.h>
18 #include <time.h>
19 #ifdef HAVE_ICONV_H
20 #  include <iconv.h>
21 #endif /* HAVE_ICONV_H */
22 
23 
/*
 * Local globals...
 *
 * These only exist when iconv support is compiled in.  The two iconv
 * descriptors act as a one-entry cache for the character set recorded in
 * "map_encoding"; a descriptor value of (iconv_t)-1 means "not open".
 * _cupsCharmapFlush() resets all three values together.
 */

#ifdef HAVE_ICONV_H
static _cups_mutex_t	map_mutex = _CUPS_MUTEX_INITIALIZER;
					/* Mutex to control access to maps */
static iconv_t		map_from_utf8 = (iconv_t)-1;
					/* Convert from UTF-8 to charset */
static iconv_t		map_to_utf8 = (iconv_t)-1;
					/* Convert from charset to UTF-8 */
static cups_encoding_t	map_encoding = CUPS_AUTO_ENCODING;
					/* Which charset is cached */
#endif /* HAVE_ICONV_H */
38 
39 
/*
 * '_cupsCharmapFlush()' - Release the cached iconv descriptors.
 *
 * Closes whichever of the two cached conversion descriptors are open, marks
 * them as closed, and resets the cached encoding to CUPS_AUTO_ENCODING.
 * Without iconv support this function does nothing.
 *
 * Note: This function does not lock "map_mutex" itself.
 */

void
_cupsCharmapFlush(void)
{
#ifdef HAVE_ICONV_H
  size_t	idx;			/* Looping var */
  iconv_t	*cached[2];		/* Descriptors to release, in order */


  cached[0] = &map_from_utf8;
  cached[1] = &map_to_utf8;

  for (idx = 0; idx < (sizeof(cached) / sizeof(cached[0])); idx ++)
  {
    if (*cached[idx] == (iconv_t)-1)
      continue;				/* Nothing open in this slot */

    iconv_close(*cached[idx]);
    *cached[idx] = (iconv_t)-1;
  }

  map_encoding = CUPS_AUTO_ENCODING;
#endif /* HAVE_ICONV_H */
}
63 
64 
65 /*
66  * 'cupsCharsetToUTF8()' - Convert legacy character set to UTF-8.
67  */
68 
69 int					/* O - Count or -1 on error */
cupsCharsetToUTF8(cups_utf8_t * dest,const char * src,const int maxout,const cups_encoding_t encoding)70 cupsCharsetToUTF8(
71     cups_utf8_t           *dest,	/* O - Target string */
72     const char            *src,		/* I - Source string */
73     const int             maxout,	/* I - Max output */
74     const cups_encoding_t encoding)	/* I - Encoding */
75 {
76   cups_utf8_t	*destptr;		/* Pointer into UTF-8 buffer */
77 #ifdef HAVE_ICONV_H
78   size_t	srclen,			/* Length of source string */
79 		outBytesLeft;		/* Bytes remaining in output buffer */
80 #endif /* HAVE_ICONV_H */
81 
82 
83  /*
84   * Check for valid arguments...
85   */
86 
87   DEBUG_printf(("2cupsCharsetToUTF8(dest=%p, src=\"%s\", maxout=%d, encoding=%d)", (void *)dest, src, maxout, encoding));
88 
89   if (!dest || !src || maxout < 1)
90   {
91     if (dest)
92       *dest = '\0';
93 
94     DEBUG_puts("3cupsCharsetToUTF8: Bad arguments, returning -1");
95     return (-1);
96   }
97 
98  /*
99   * Handle identity conversions...
100   */
101 
102   if (encoding == CUPS_UTF8 || encoding <= CUPS_US_ASCII ||
103       encoding >= CUPS_ENCODING_VBCS_END)
104   {
105     strlcpy((char *)dest, src, (size_t)maxout);
106     return ((int)strlen((char *)dest));
107   }
108 
109  /*
110   * Handle ISO-8859-1 to UTF-8 directly...
111   */
112 
113   destptr = dest;
114 
115   if (encoding == CUPS_ISO8859_1)
116   {
117     int		ch;			/* Character from string */
118     cups_utf8_t	*destend;		/* End of UTF-8 buffer */
119 
120 
121     destend = dest + maxout - 2;
122 
123     while (*src && destptr < destend)
124     {
125       ch = *src++ & 255;
126 
127       if (ch & 128)
128       {
129 	*destptr++ = (cups_utf8_t)(0xc0 | (ch >> 6));
130 	*destptr++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
131       }
132       else
133 	*destptr++ = (cups_utf8_t)ch;
134     }
135 
136     *destptr = '\0';
137 
138     return ((int)(destptr - dest));
139   }
140 
141  /*
142   * Convert input legacy charset to UTF-8...
143   */
144 
145 #ifdef HAVE_ICONV_H
146   _cupsMutexLock(&map_mutex);
147 
148   if (map_encoding != encoding)
149   {
150     char	toset[1024];		/* Destination character set */
151 
152     _cupsCharmapFlush();
153 
154     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
155 
156     map_encoding  = encoding;
157     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
158     map_to_utf8   = iconv_open("UTF-8", toset);
159   }
160 
161   if (map_to_utf8 != (iconv_t)-1)
162   {
163     char *altdestptr = (char *)dest;	/* Silence bogus GCC type-punned */
164 
165     srclen       = strlen(src);
166     outBytesLeft = (size_t)maxout - 1;
167 
168     iconv(map_to_utf8, (char **)&src, &srclen, &altdestptr, &outBytesLeft);
169     *altdestptr = '\0';
170 
171     _cupsMutexUnlock(&map_mutex);
172 
173     return ((int)(altdestptr - (char *)dest));
174   }
175 
176   _cupsMutexUnlock(&map_mutex);
177 #endif /* HAVE_ICONV_H */
178 
179  /*
180   * No iconv() support, so error out...
181   */
182 
183   *destptr = '\0';
184 
185   return (-1);
186 }
187 
188 
189 /*
190  * 'cupsUTF8ToCharset()' - Convert UTF-8 to legacy character set.
191  */
192 
193 int					/* O - Count or -1 on error */
cupsUTF8ToCharset(char * dest,const cups_utf8_t * src,const int maxout,const cups_encoding_t encoding)194 cupsUTF8ToCharset(
195     char		  *dest,	/* O - Target string */
196     const cups_utf8_t	  *src,		/* I - Source string */
197     const int		  maxout,	/* I - Max output */
198     const cups_encoding_t encoding)	/* I - Encoding */
199 {
200   char		*destptr;		/* Pointer into destination */
201 #ifdef HAVE_ICONV_H
202   size_t	srclen,			/* Length of source string */
203 		outBytesLeft;		/* Bytes remaining in output buffer */
204 #endif /* HAVE_ICONV_H */
205 
206 
207  /*
208   * Check for valid arguments...
209   */
210 
211   if (!dest || !src || maxout < 1)
212   {
213     if (dest)
214       *dest = '\0';
215 
216     return (-1);
217   }
218 
219  /*
220   * Handle identity conversions...
221   */
222 
223   if (encoding == CUPS_UTF8 ||
224       encoding >= CUPS_ENCODING_VBCS_END)
225   {
226     strlcpy(dest, (char *)src, (size_t)maxout);
227     return ((int)strlen(dest));
228   }
229 
230  /*
231   * Handle UTF-8 to ISO-8859-1 directly...
232   */
233 
234   destptr = dest;
235 
236   if (encoding == CUPS_ISO8859_1 || encoding <= CUPS_US_ASCII)
237   {
238     int		ch,			/* Character from string */
239 		maxch;			/* Maximum character for charset */
240     char	*destend;		/* End of ISO-8859-1 buffer */
241 
242     maxch   = encoding == CUPS_ISO8859_1 ? 256 : 128;
243     destend = dest + maxout - 1;
244 
245     while (*src && destptr < destend)
246     {
247       ch = *src++;
248 
249       if ((ch & 0xe0) == 0xc0)
250       {
251 	ch = ((ch & 0x1f) << 6) | (*src++ & 0x3f);
252 
253 	if (ch < maxch)
254           *destptr++ = (char)ch;
255 	else
256           *destptr++ = '?';
257       }
258       else if ((ch & 0xf0) == 0xe0 ||
259                (ch & 0xf8) == 0xf0)
260         *destptr++ = '?';
261       else if (!(ch & 0x80))
262 	*destptr++ = (char)ch;
263     }
264 
265     *destptr = '\0';
266 
267     return ((int)(destptr - dest));
268   }
269 
270 #ifdef HAVE_ICONV_H
271  /*
272   * Convert input UTF-8 to legacy charset...
273   */
274 
275   _cupsMutexLock(&map_mutex);
276 
277   if (map_encoding != encoding)
278   {
279     char	toset[1024];		/* Destination character set */
280 
281     _cupsCharmapFlush();
282 
283     snprintf(toset, sizeof(toset), "%s//IGNORE", _cupsEncodingName(encoding));
284 
285     map_encoding  = encoding;
286     map_from_utf8 = iconv_open(_cupsEncodingName(encoding), "UTF-8");
287     map_to_utf8   = iconv_open("UTF-8", toset);
288   }
289 
290   if (map_from_utf8 != (iconv_t)-1)
291   {
292     char *altsrc = (char *)src;		/* Silence bogus GCC type-punned */
293 
294     srclen       = strlen((char *)src);
295     outBytesLeft = (size_t)maxout - 1;
296 
297     iconv(map_from_utf8, &altsrc, &srclen, &destptr, &outBytesLeft);
298     *destptr = '\0';
299 
300     _cupsMutexUnlock(&map_mutex);
301 
302     return ((int)(destptr - dest));
303   }
304 
305   _cupsMutexUnlock(&map_mutex);
306 #endif /* HAVE_ICONV_H */
307 
308  /*
309   * No iconv() support, so error out...
310   */
311 
312   *destptr = '\0';
313 
314   return (-1);
315 }
316 
317 
318 /*
319  * 'cupsUTF8ToUTF32()' - Convert UTF-8 to UTF-32.
320  *
321  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
322  *
323  *   UTF-32 char     UTF-8 char(s)
324  *   --------------------------------------------------
325  *	  0 to 127 = 0xxxxxxx (US-ASCII)
326  *     128 to 2047 = 110xxxxx 10yyyyyy
327  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
328  *	   > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
329  *
330  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
331  * which would convert to five- or six-octet UTF-8 sequences...
332  */
333 
334 int					/* O - Count or -1 on error */
cupsUTF8ToUTF32(cups_utf32_t * dest,const cups_utf8_t * src,const int maxout)335 cupsUTF8ToUTF32(
336     cups_utf32_t      *dest,		/* O - Target string */
337     const cups_utf8_t *src,		/* I - Source string */
338     const int         maxout)		/* I - Max output */
339 {
340   int		i;			/* Looping variable */
341   cups_utf8_t	ch;			/* Character value */
342   cups_utf8_t	next;			/* Next character value */
343   cups_utf32_t	ch32;			/* UTF-32 character value */
344 
345 
346  /*
347   * Check for valid arguments and clear output...
348   */
349 
350   DEBUG_printf(("2cupsUTF8ToUTF32(dest=%p, src=\"%s\", maxout=%d)", (void *)dest, src, maxout));
351 
352   if (dest)
353     *dest = 0;
354 
355   if (!dest || !src || maxout < 1 || maxout > CUPS_MAX_USTRING)
356   {
357     DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad arguments)");
358 
359     return (-1);
360   }
361 
362  /*
363   * Convert input UTF-8 to output UTF-32...
364   */
365 
366   for (i = maxout - 1; *src && i > 0; i --)
367   {
368     ch = *src++;
369 
370    /*
371     * Convert UTF-8 character(s) to UTF-32 character...
372     */
373 
374     if (!(ch & 0x80))
375     {
376      /*
377       * One-octet UTF-8 <= 127 (US-ASCII)...
378       */
379 
380       *dest++ = ch;
381 
382       DEBUG_printf(("4cupsUTF8ToUTF32: %02x => %08X", src[-1], ch));
383       continue;
384     }
385     else if ((ch & 0xe0) == 0xc0)
386     {
387      /*
388       * Two-octet UTF-8 <= 2047 (Latin-x)...
389       */
390 
391       next = *src++;
392       if ((next & 0xc0) != 0x80)
393       {
394         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
395 
396 	return (-1);
397       }
398 
399       ch32 = (cups_utf32_t)((ch & 0x1f) << 6) | (cups_utf32_t)(next & 0x3f);
400 
401      /*
402       * Check for non-shortest form (invalid UTF-8)...
403       */
404 
405       if (ch32 < 0x80)
406       {
407         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
408 
409 	return (-1);
410       }
411 
412       *dest++ = ch32;
413 
414       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x => %08X",
415                     src[-2], src[-1], (unsigned)ch32));
416     }
417     else if ((ch & 0xf0) == 0xe0)
418     {
419      /*
420       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
421       */
422 
423       next = *src++;
424       if ((next & 0xc0) != 0x80)
425       {
426         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
427 
428 	return (-1);
429       }
430 
431       ch32 = (cups_utf32_t)((ch & 0x0f) << 6) | (cups_utf32_t)(next & 0x3f);
432 
433       next = *src++;
434       if ((next & 0xc0) != 0x80)
435       {
436         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
437 
438 	return (-1);
439       }
440 
441       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
442 
443      /*
444       * Check for non-shortest form (invalid UTF-8)...
445       */
446 
447       if (ch32 < 0x800)
448       {
449         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
450 
451 	return (-1);
452       }
453 
454       *dest++ = ch32;
455 
456       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x => %08X",
457                     src[-3], src[-2], src[-1], (unsigned)ch32));
458     }
459     else if ((ch & 0xf8) == 0xf0)
460     {
461      /*
462       * Four-octet UTF-8...
463       */
464 
465       next = *src++;
466       if ((next & 0xc0) != 0x80)
467       {
468         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
469 
470 	return (-1);
471       }
472 
473       ch32 = (cups_utf32_t)((ch & 0x07) << 6) | (cups_utf32_t)(next & 0x3f);
474 
475       next = *src++;
476       if ((next & 0xc0) != 0x80)
477       {
478         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
479 
480 	return (-1);
481       }
482 
483       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
484 
485       next = *src++;
486       if ((next & 0xc0) != 0x80)
487       {
488         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
489 
490 	return (-1);
491       }
492 
493       ch32 = (ch32 << 6) | (cups_utf32_t)(next & 0x3f);
494 
495      /*
496       * Check for non-shortest form (invalid UTF-8)...
497       */
498 
499       if (ch32 < 0x10000)
500       {
501         DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
502 
503 	return (-1);
504       }
505 
506       *dest++ = ch32;
507 
508       DEBUG_printf(("4cupsUTF8ToUTF32: %02x %02x %02x %02x => %08X",
509                     src[-4], src[-3], src[-2], src[-1], (unsigned)ch32));
510     }
511     else
512     {
513      /*
514       * More than 4-octet (invalid UTF-8 sequence)...
515       */
516 
517       DEBUG_puts("3cupsUTF8ToUTF32: Returning -1 (bad UTF-8 sequence)");
518 
519       return (-1);
520     }
521 
522    /*
523     * Check for UTF-16 surrogate (illegal UTF-8)...
524     */
525 
526     if (ch32 >= 0xd800 && ch32 <= 0xdfff)
527       return (-1);
528   }
529 
530   *dest = 0;
531 
532   DEBUG_printf(("3cupsUTF8ToUTF32: Returning %d characters", maxout - 1 - i));
533 
534   return (maxout - 1 - i);
535 }
536 
537 
538 /*
539  * 'cupsUTF32ToUTF8()' - Convert UTF-32 to UTF-8.
540  *
541  * 32-bit UTF-32 (actually 21-bit) maps to UTF-8 as follows...
542  *
543  *   UTF-32 char     UTF-8 char(s)
544  *   --------------------------------------------------
545  *	  0 to 127 = 0xxxxxxx (US-ASCII)
546  *     128 to 2047 = 110xxxxx 10yyyyyy
547  *   2048 to 65535 = 1110xxxx 10yyyyyy 10zzzzzz
548  *	   > 65535 = 11110xxx 10yyyyyy 10zzzzzz 10xxxxxx
549  *
550  * UTF-32 prohibits chars beyond Plane 16 (> 0x10ffff) in UCS-4,
551  * which would convert to five- or six-octet UTF-8 sequences...
552  */
553 
554 int					/* O - Count or -1 on error */
cupsUTF32ToUTF8(cups_utf8_t * dest,const cups_utf32_t * src,const int maxout)555 cupsUTF32ToUTF8(
556     cups_utf8_t        *dest,		/* O - Target string */
557     const cups_utf32_t *src,		/* I - Source string */
558     const int          maxout)		/* I - Max output */
559 {
560   cups_utf8_t	*start;			/* Start of destination string */
561   int		i;			/* Looping variable */
562   int		swap;			/* Byte-swap input to output */
563   cups_utf32_t	ch;			/* Character value */
564 
565 
566  /*
567   * Check for valid arguments and clear output...
568   */
569 
570   DEBUG_printf(("2cupsUTF32ToUTF8(dest=%p, src=%p, maxout=%d)", (void *)dest, (void *)src, maxout));
571 
572   if (dest)
573     *dest = '\0';
574 
575   if (!dest || !src || maxout < 1)
576   {
577     DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (bad args)");
578 
579     return (-1);
580   }
581 
582  /*
583   * Check for leading BOM in UTF-32 and inverted BOM...
584   */
585 
586   start = dest;
587   swap  = *src == 0xfffe0000;
588 
589   DEBUG_printf(("4cupsUTF32ToUTF8: swap=%d", swap));
590 
591   if (*src == 0xfffe0000 || *src == 0xfeff)
592     src ++;
593 
594  /*
595   * Convert input UTF-32 to output UTF-8...
596   */
597 
598   for (i = maxout - 1; *src && i > 0;)
599   {
600     ch = *src++;
601 
602    /*
603     * Byte swap input UTF-32, if necessary...
604     * (only byte-swapping 24 of 32 bits)
605     */
606 
607     if (swap)
608       ch = ((ch >> 24) | ((ch >> 8) & 0xff00) | ((ch << 8) & 0xff0000));
609 
610    /*
611     * Check for beyond Plane 16 (invalid UTF-32)...
612     */
613 
614     if (ch > 0x10ffff)
615     {
616       DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (character out of range)");
617 
618       return (-1);
619     }
620 
621    /*
622     * Convert UTF-32 character to UTF-8 character(s)...
623     */
624 
625     if (ch < 0x80)
626     {
627      /*
628       * One-octet UTF-8 <= 127 (US-ASCII)...
629       */
630 
631       *dest++ = (cups_utf8_t)ch;
632       i --;
633 
634       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x", (unsigned)ch, dest[-1]));
635     }
636     else if (ch < 0x800)
637     {
638      /*
639       * Two-octet UTF-8 <= 2047 (Latin-x)...
640       */
641 
642       if (i < 2)
643       {
644         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 2)");
645 
646         return (-1);
647       }
648 
649       *dest++ = (cups_utf8_t)(0xc0 | ((ch >> 6) & 0x1f));
650       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
651       i -= 2;
652 
653       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x", (unsigned)ch,
654                     dest[-2], dest[-1]));
655     }
656     else if (ch < 0x10000)
657     {
658      /*
659       * Three-octet UTF-8 <= 65535 (Plane 0 - BMP)...
660       */
661 
662       if (i < 3)
663       {
664         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 3)");
665 
666         return (-1);
667       }
668 
669       *dest++ = (cups_utf8_t)(0xe0 | ((ch >> 12) & 0x0f));
670       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
671       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
672       i -= 3;
673 
674       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x", (unsigned)ch,
675                     dest[-3], dest[-2], dest[-1]));
676     }
677     else
678     {
679      /*
680       * Four-octet UTF-8...
681       */
682 
683       if (i < 4)
684       {
685         DEBUG_puts("3cupsUTF32ToUTF8: Returning -1 (too long 4)");
686 
687         return (-1);
688       }
689 
690       *dest++ = (cups_utf8_t)(0xf0 | ((ch >> 18) & 0x07));
691       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 12) & 0x3f));
692       *dest++ = (cups_utf8_t)(0x80 | ((ch >> 6) & 0x3f));
693       *dest++ = (cups_utf8_t)(0x80 | (ch & 0x3f));
694       i -= 4;
695 
696       DEBUG_printf(("4cupsUTF32ToUTF8: %08x => %02x %02x %02x %02x",
697                     (unsigned)ch, dest[-4], dest[-3], dest[-2], dest[-1]));
698     }
699   }
700 
701   *dest = '\0';
702 
703   DEBUG_printf(("3cupsUTF32ToUTF8: Returning %d", (int)(dest - start)));
704 
705   return ((int)(dest - start));
706 }
707