1 /***
2 This file is part of PulseAudio.
3
4 Copyright 2006 Lennart Poettering
5 Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
6
7 PulseAudio is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as
9 published by the Free Software Foundation; either version 2.1 of the
10 License, or (at your option) any later version.
11
12 PulseAudio is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with PulseAudio; if not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 /* This file is based on the GLIB utf8 validation functions. The
22 * original license text follows. */
23
24 /* gutf8.c - Operations on UTF-8 strings.
25 *
26 * Copyright (C) 1999 Tom Tromey
27 * Copyright (C) 2000 Red Hat, Inc.
28 *
29 * This library is free software; you can redistribute it and/or
30 * modify it under the terms of the GNU Lesser General Public
31 * License as published by the Free Software Foundation; either
32 * version 2 of the License, or (at your option) any later version.
33 *
34 * This library is distributed in the hope that it will be useful,
35 * but WITHOUT ANY WARRANTY; without even the implied warranty of
36 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
37 * Lesser General Public License for more details.
38 *
39 * You should have received a copy of the GNU Lesser General Public
40 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
41 */
42
43 #ifdef HAVE_CONFIG_H
44 #include <config.h>
45 #endif
46
47 #include <errno.h>
48 #include <stdlib.h>
49 #include <inttypes.h>
50 #include <string.h>
51
52 #ifdef HAVE_ICONV
53 #include <iconv.h>
54 #endif
55
56 #include <pulse/xmalloc.h>
57 #include <pulsecore/macro.h>
58
59 #include "utf8.h"
60
/* Byte written to the output buffer by utf8_validate() in place of every
 * input byte that does not begin a valid UTF-8 sequence (filter mode only). */
#define FILTER_CHAR '_'
62
/* Decide whether a decoded code point is acceptable in validated text.
 *
 * Rejected are:
 *   - anything at or above 0x110000 (past the end of the Unicode space),
 *   - the UTF-16 surrogate range 0xD800..0xDFFF,
 *   - the reserved block 0xFDD0..0xFDEF,
 *   - every code point whose low 16 bits are 0xFFFE or 0xFFFF.
 *
 * Note that the last rule does NOT reject the byte order mark U+FEFF; it
 * rejects the two code points at the end of each plane.
 *
 * Returns true when none of the rules above match. */
static inline bool is_unicode_valid(uint32_t ch) {
    const bool beyond_unicode = ch >= 0x110000;
    const bool utf16_surrogate = (ch & 0xFFFFF800) == 0xD800;
    const bool reserved_block = ch >= 0xFDD0 && ch <= 0xFDEF;
    const bool plane_tail = (ch & 0xFFFE) == 0xFFFE;

    return !(beyond_unicode || utf16_surrogate || reserved_block || plane_tail);
}
76
/* Returns true when ch has the bit pattern 10xxxxxx, i.e. it is a UTF-8
 * continuation byte. A NUL byte is therefore never a continuation byte. */
static inline bool is_continuation_char(uint8_t ch) {
    return (ch & 0xc0) == 0x80;
}
82
/* Append the six payload bits of the continuation byte ch to the code point
 * being assembled in *u_ch: the accumulator is shifted left by six bits and
 * the low six bits of ch are OR-ed in. The top two bits of ch are ignored. */
static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
    const uint32_t payload = (uint32_t) (ch & 0x3f);

    *u_ch = (*u_ch << 6) | payload;
}
87
utf8_validate(const char * str,char * output)88 static char* utf8_validate(const char *str, char *output) {
89 uint32_t val = 0;
90 uint32_t min = 0;
91 const uint8_t *p, *last;
92 int size;
93 uint8_t *o;
94
95 pa_assert(str);
96
97 o = (uint8_t*) output;
98 for (p = (const uint8_t*) str; *p; p++) {
99 if (*p < 128) {
100 if (o)
101 *o = *p;
102 } else {
103 last = p;
104
105 if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
106 size = 2;
107 min = 128;
108 val = (uint32_t) (*p & 0x1e);
109 goto ONE_REMAINING;
110 } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
111 size = 3;
112 min = (1 << 11);
113 val = (uint32_t) (*p & 0x0f);
114 goto TWO_REMAINING;
115 } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
116 size = 4;
117 min = (1 << 16);
118 val = (uint32_t) (*p & 0x07);
119 } else
120 goto error;
121
122 p++;
123 if (!is_continuation_char(*p))
124 goto error;
125 merge_continuation_char(&val, *p);
126
127 TWO_REMAINING:
128 p++;
129 if (!is_continuation_char(*p))
130 goto error;
131 merge_continuation_char(&val, *p);
132
133 ONE_REMAINING:
134 p++;
135 if (!is_continuation_char(*p))
136 goto error;
137 merge_continuation_char(&val, *p);
138
139 if (val < min)
140 goto error;
141
142 if (!is_unicode_valid(val))
143 goto error;
144
145 if (o) {
146 memcpy(o, last, (size_t) size);
147 o += size;
148 }
149
150 continue;
151
152 error:
153 if (o) {
154 *o = FILTER_CHAR;
155 p = last; /* We retry at the next character */
156 } else
157 goto failure;
158 }
159
160 if (o)
161 o++;
162 }
163
164 if (o) {
165 *o = '\0';
166 return output;
167 }
168
169 return (char*) str;
170
171 failure:
172 return NULL;
173 }
174
/* Check whether str is valid UTF-8 by running utf8_validate() without an
 * output buffer. Returns str itself when it is valid, NULL otherwise. No
 * memory is allocated. */
char* pa_utf8_valid (const char *str) {
    char *verdict = utf8_validate(str, NULL);

    return verdict;
}
178
/* Produce a sanitised copy of str in which every byte that does not start
 * a valid UTF-8 sequence has been replaced by utf8_validate().
 *
 * The copy is allocated with pa_xmalloc() using strlen(str) + 1 bytes and is
 * returned to the caller. str must not be NULL. */
char* pa_utf8_filter (const char *str) {
    size_t capacity;
    char *filtered;

    pa_assert(str);

    capacity = strlen(str) + 1;
    filtered = pa_xmalloc(capacity);

    return utf8_validate(str, filtered);
}
186
187 #ifdef HAVE_ICONV
188
iconv_simple(const char * str,const char * to,const char * from)189 static char* iconv_simple(const char *str, const char *to, const char *from) {
190 char *new_str;
191 size_t len, inlen;
192 iconv_t cd;
193 ICONV_CONST char *inbuf;
194 char *outbuf;
195 size_t res, inbytes, outbytes;
196
197 pa_assert(str);
198 pa_assert(to);
199 pa_assert(from);
200
201 cd = iconv_open(to, from);
202 if (cd == (iconv_t)-1)
203 return NULL;
204
205 inlen = len = strlen(str) + 1;
206 new_str = pa_xmalloc(len);
207
208 for (;;) {
209 inbuf = (ICONV_CONST char*) str; /* Brain dead prototype for iconv() */
210 inbytes = inlen;
211 outbuf = new_str;
212 outbytes = len;
213
214 res = iconv(cd, &inbuf, &inbytes, &outbuf, &outbytes);
215
216 if (res != (size_t)-1)
217 break;
218
219 if (errno != E2BIG) {
220 pa_xfree(new_str);
221 new_str = NULL;
222 break;
223 }
224
225 pa_assert(inbytes != 0);
226
227 len += inbytes;
228 new_str = pa_xrealloc(new_str, len);
229 }
230
231 iconv_close(cd);
232
233 return new_str;
234 }
235
/* Convert a UTF-8 string via iconv_simple(), using "UTF-8" as the source
 * codeset and the empty string as the target codeset.
 * NOTE(review): the empty codeset name is presumably resolved by
 * iconv_open() to the current locale's encoding - confirm for the iconv
 * implementations in use.
 *
 * Returns the buffer produced by iconv_simple(), or NULL on failure. */
char* pa_utf8_to_locale (const char *str) {
    static const char source_codeset[] = "UTF-8";
    static const char target_codeset[] = "";

    return iconv_simple(str, target_codeset, source_codeset);
}
239
/* Convert a string to UTF-8 via iconv_simple(), using the empty string as
 * the source codeset and "UTF-8" as the target codeset.
 * NOTE(review): the empty codeset name is presumably resolved by
 * iconv_open() to the current locale's encoding - confirm for the iconv
 * implementations in use.
 *
 * Returns the buffer produced by iconv_simple(), or NULL on failure. */
char* pa_locale_to_utf8 (const char *str) {
    static const char source_codeset[] = "";
    static const char target_codeset[] = "UTF-8";

    return iconv_simple(str, target_codeset, source_codeset);
}
243
244 #else
245
/* Fallback used when iconv is not available: no real conversion is done;
 * the result is whatever pa_ascii_filter() returns for str, i.e. a copy
 * with all non-ASCII bytes removed. str must not be NULL. */
char* pa_utf8_to_locale (const char *str) {
    char *ascii_only;

    pa_assert(str);

    ascii_only = pa_ascii_filter(str);
    return ascii_only;
}
251
/* Fallback used when iconv is not available: no real conversion is done.
 * If str already passes pa_utf8_valid() a duplicate made with pa_xstrdup()
 * is returned; otherwise the result is NULL. str must not be NULL. */
char* pa_locale_to_utf8 (const char *str) {
    pa_assert(str);

    return pa_utf8_valid(str) ? pa_xstrdup(str) : NULL;
}
260
261 #endif
262
/* Check that str consists solely of 7-bit ASCII bytes (values below 128).
 * Returns str itself (const cast away) if so, NULL as soon as a byte with
 * the high bit set is found. No memory is allocated. str must not be NULL. */
char *pa_ascii_valid(const char *str) {
    const unsigned char *cursor;

    pa_assert(str);

    cursor = (const unsigned char*) str;
    while (*cursor != '\0') {
        if (*cursor > 127)
            return NULL;
        cursor++;
    }

    return (char*) str;
}
273
/* Return a copy of str, made with pa_xstrdup(), from which every byte with
 * a value of 128 or above has been removed. Offending bytes are dropped,
 * not replaced, so the result may be shorter than the input. str must not
 * be NULL. */
char *pa_ascii_filter(const char *str) {
    char *result;
    size_t in, out = 0;

    pa_assert(str);

    result = pa_xstrdup(str);

    /* Compact in place: the write index never overtakes the read index. */
    for (in = 0; result[in] != '\0'; in++) {
        const unsigned char byte = (unsigned char) result[in];

        if (byte < 128)
            result[out++] = result[in];
    }

    result[out] = '\0';

    return result;
}
288