• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/***
  This file is part of udev, forked from systemd.

  Copyright 2008-2011 Kay Sievers
  Copyright 2012 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
20 
/* Parts of this file are based on the GLIB utf8 validation functions. The
 * original license text follows. */

/* gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
43 
44 #include <errno.h>
45 #include <stdlib.h>
46 #include <inttypes.h>
47 #include <string.h>
48 #include <stdbool.h>
49 
50 #include "utf8.h"
51 #include "util.h"
52 
/* Tell whether ch is a code point this module accepts.
 *
 * Returns false for values at or above 0x110000, for the UTF-16 surrogate
 * range 0xD800..0xDFFF, for the reserved range 0xFDD0..0xFDEF and for any
 * value whose low 16 bits are 0xFFFE or 0xFFFF; returns true otherwise. */
bool unichar_is_valid(uint32_t ch) {
        /* Past the end of the Unicode code space. */
        const bool out_of_range = ch >= 0x110000;
        /* 0xD800..0xDFFF: reserved for UTF-16 surrogates. */
        const bool surrogate = (ch & 0xFFFFF800) == 0xD800;
        /* 0xFDD0..0xFDEF: reserved block. */
        const bool reserved = ch >= 0xFDD0 && ch <= 0xFDEF;
        /* Low 16 bits are 0xFFFE or 0xFFFF (covers the byte-swapped BOM 0xFFFE). */
        const bool plane_tail = (ch & 0xFFFE) == 0xFFFE;

        return !(out_of_range || surrogate || reserved || plane_tail);
}
66 
/* Tell whether ch is treated as a control character.
 *
 * Everything below ' ' (the C0 range) counts, except '\t' and '\n', which are
 * let through. DEL (0x7F) and the C1 range up to 0x9F count as well. */
static bool unichar_is_control(uint32_t ch) {

        /* Tab and newline sit in C0 but are explicitly tolerated. */
        if (ch == '\t' || ch == '\n')
                return false;

        /* Remaining C0 codes. */
        if (ch < ' ')
                return true;

        /* DEL and C1. */
        return ch >= 0x7F && ch <= 0x9F;
}
78 
/* Number of bytes the sequence starting at str claims to occupy, judged from
 * its first byte only: 1 for ASCII, 2..6 for a multi-byte lead byte, and 0 if
 * the byte cannot start a sequence (continuation byte, 0xFE or 0xFF). */
static int utf8_encoded_expected_len(const char *str) {
        /* Entry k describes the lead byte of a (k + 1)-byte sequence. */
        static const struct {
                unsigned char mask;
                unsigned char pattern;
        } lead_forms[] = {
                { 0x80, 0x00 }, /* 0xxxxxxx */
                { 0xe0, 0xc0 }, /* 110xxxxx */
                { 0xf0, 0xe0 }, /* 1110xxxx */
                { 0xf8, 0xf0 }, /* 11110xxx */
                { 0xfc, 0xf8 }, /* 111110xx */
                { 0xfe, 0xfc }, /* 1111110x */
        };
        unsigned char first;
        size_t k;

        assert(str);

        first = (unsigned char) str[0];

        for (k = 0; k < sizeof(lead_forms) / sizeof(lead_forms[0]); k++)
                if ((first & lead_forms[k].mask) == lead_forms[k].pattern)
                        return (int) k + 1;

        return 0;
}
101 
/* Decode the single UTF-8 sequence that starts at str.
 *
 * Returns the decoded value, or -EINVAL when the first byte cannot start a
 * sequence or when one of the following bytes is not a continuation byte
 * (10xxxxxx). No range or shortest-form checks are done here. */
int utf8_encoded_to_unichar(const char *str) {
        int seq_len, code, k;

        assert(str);

        seq_len = utf8_encoded_expected_len(str);
        if (seq_len < 1 || seq_len > 6)
                return -EINVAL;

        /* Plain ASCII decodes to itself. */
        if (seq_len == 1)
                return (int) str[0];

        /* The lead byte of an n-byte sequence carries its payload in the low
         * 7 - n bits: 0x1f, 0x0f, 0x07, 0x03, 0x01 for n = 2..6. */
        code = (int) str[0] & (0x7f >> seq_len);

        for (k = 1; k < seq_len; k++) {
                const int byte = (int) str[k];

                if ((byte & 0xc0) != 0x80)
                        return -EINVAL;

                /* Each continuation byte contributes six more bits. */
                code = (code << 6) | (byte & 0x3f);
        }

        return code;
}
141 
/* Check that the first length bytes of str are valid UTF-8 without control
 * characters.
 *
 * @str: buffer to examine (must not be NULL)
 * @length: number of bytes to examine
 * @newline: when false, '\n' is rejected as well
 *
 * Returns true only if every sequence validates, lies completely inside the
 * length bytes, and decodes to something unichar_is_control() does not flag. */
bool utf8_is_printable_newline(const char* str, size_t length, bool newline) {
        const uint8_t *p;

        assert(str);

        for (p = (const uint8_t*) str; length;) {
                int encoded_len, val;

                /* Look at the lead byte first: if it announces more bytes than
                 * are left, bail out before the validator below walks past the
                 * end of a buffer that is not NUL-terminated. */
                encoded_len = utf8_encoded_expected_len((const char *) p);
                if (encoded_len <= 0 ||
                    (size_t) encoded_len > length)
                        return false;

                encoded_len = utf8_encoded_valid_unichar((const char *) p);
                if (encoded_len < 0 ||
                    (size_t) encoded_len > length)
                        return false;

                val = utf8_encoded_to_unichar((const char*) p);
                if (val < 0 ||
                    unichar_is_control(val) ||
                    (!newline && val == '\n'))
                        return false;

                length -= encoded_len;
                p += encoded_len;
        }

        return true;
}
167 
/* Check that the NUL-terminated string str contains only 7-bit bytes.
 *
 * Returns str itself (with const cast away) when every byte is below 128,
 * or NULL as soon as a byte with the high bit set is found. */
char *ascii_is_valid(const char *str) {
        size_t i;

        assert(str);

        for (i = 0; str[i] != '\0'; i++)
                if ((unsigned char) str[i] & 0x80)
                        return NULL;

        return (char*) str;
}
179 
/**
 * utf8_encode_unichar() - Encode single UCS-4 character as UTF-8
 * @out_utf8: output buffer of at least 4 bytes or NULL
 * @g: UCS-4 character to encode
 *
 * Writes the UTF-8 form of @g into @out_utf8 without appending a terminating
 * zero byte. When @out_utf8 is NULL nothing is written and only the length is
 * computed.
 *
 * Returns: The number of bytes (1 to 4) the UTF-8 representation does or
 *          would occupy, or 0 if @g is 0x200000 or larger and therefore does
 *          not fit into four bytes.
 */
size_t utf8_encode_unichar(char *out_utf8, uint32_t g) {
        /* Marker bits of the first byte, indexed by sequence length. */
        static const uint32_t lead_marker[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
        size_t n, k;

        if (g < 0x80)
                n = 1;
        else if (g < 0x800)
                n = 2;
        else if (g < 0x10000)
                n = 3;
        else if (g < 0x200000)
                n = 4;
        else
                return 0;

        if (!out_utf8)
                return n;

        /* Fill the continuation bytes back to front, six bits at a time. */
        for (k = n - 1; k > 0; k--) {
                out_utf8[k] = 0x80 | (g & 0x3f);
                g >>= 6;
        }

        /* Whatever is left fits into the payload bits of the lead byte. */
        out_utf8[0] = lead_marker[n] | g;

        return n;
}
223 
utf16_to_utf8(const void * s,size_t length)224 char *utf16_to_utf8(const void *s, size_t length) {
225         const uint8_t *f;
226         char *r, *t;
227 
228         r = new(char, (length * 4 + 1) / 2 + 1);
229         if (!r)
230                 return NULL;
231 
232         f = s;
233         t = r;
234 
235         while (f < (const uint8_t*) s + length) {
236                 uint16_t w1, w2;
237 
238                 /* see RFC 2781 section 2.2 */
239 
240                 w1 = f[1] << 8 | f[0];
241                 f += 2;
242 
243                 if (!utf16_is_surrogate(w1)) {
244                         t += utf8_encode_unichar(t, w1);
245 
246                         continue;
247                 }
248 
249                 if (utf16_is_trailing_surrogate(w1))
250                         continue;
251                 else if (f >= (const uint8_t*) s + length)
252                         break;
253 
254                 w2 = f[1] << 8 | f[0];
255                 f += 2;
256 
257                 if (!utf16_is_trailing_surrogate(w2)) {
258                         f -= 2;
259                         continue;
260                 }
261 
262                 t += utf8_encode_unichar(t, utf16_surrogate_pair_to_unichar(w1, w2));
263         }
264 
265         *t = 0;
266         return r;
267 }
268 
/* Number of bytes (1..6) needed to encode unichar, based purely on its
 * magnitude. Values below 0x80, including negative ones, yield 1. */
static int utf8_unichar_to_encoded_len(int unichar) {
        /* upper_bound[k] is the first value that no longer fits in k + 1 bytes. */
        static const int upper_bound[] = {
                0x80, 0x800, 0x10000, 0x200000, 0x4000000,
        };
        int len = 1;

        while (len < 6 && unichar >= upper_bound[len - 1])
                len++;

        return len;
}
285 
/* Validate the single UTF-8 sequence that starts at str.
 *
 * Returns the length of the sequence in bytes on success, or -EINVAL if the
 * lead byte is invalid, a byte of the sequence lacks the high bit, the value
 * is not encoded with the length its magnitude calls for, or
 * unichar_is_valid() rejects the decoded value. */
int utf8_encoded_valid_unichar(const char *str) {
        int expected, code, k;

        assert(str);

        expected = utf8_encoded_expected_len(str);
        switch (expected) {
        case 0:
                /* not a possible lead byte */
                return -EINVAL;
        case 1:
                /* ascii is always fine */
                return 1;
        default:
                break;
        }

        /* Every byte of a multi-byte sequence has its high bit set; checking
         * that first also stops us at a terminating zero byte. */
        for (k = 0; k < expected; k++)
                if (!(str[k] & 0x80))
                        return -EINVAL;

        code = utf8_encoded_to_unichar(str);

        /* The length implied by the value must equal the length used (this
         * also catches a negative error code from the decoder, which maps to
         * length 1), and the value must be in the accepted range. */
        if (utf8_unichar_to_encoded_len(code) != expected ||
            !unichar_is_valid(code))
                return -EINVAL;

        return expected;
}
317