1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3 * charset conversion utils
4 *
5 * Copyright (c) 2017 Rob Clark
6 */
7
8 #include <common.h>
9 #include <charset.h>
10 #include <capitalization.h>
11 #include <malloc.h>
12
13 static struct capitalization_table capitalization_table[] =
14 #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
15 UNICODE_CAPITALIZATION_TABLE;
16 #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
17 CP1250_CAPITALIZATION_TABLE;
18 #else
19 CP437_CAPITALIZATION_TABLE;
20 #endif
21
/**
 * get_code() - read Unicode code point from UTF-8 stream
 *
 * @read_u8:	stream reader, called once for every byte consumed
 * @data:	argument handed through to the stream reader, optional
 * Return:	Unicode code point, 0 if the reader delivers a 0 byte first,
 *		'?' for an ill-formed sequence
 *
 * Ill-formed sequences comprise stray continuation bytes, the lead bytes
 * 0xc0, 0xc1 and 0xf5 - 0xff, missing continuation bytes, overlong encodings,
 * encoded surrogates (U+D800 - U+DFFF) and values above U+10FFFF.
 *
 * A byte that fails the continuation byte check has already been taken from
 * the stream when '?' is returned.
 */
static int get_code(u8 (*read_u8)(void *data), void *data)
{
	s32 ch = 0;

	ch = read_u8(data);
	if (!ch)
		return 0;
	if (ch >= 0xc2 && ch <= 0xf4) {
		int code = 0;
		/* Smallest code point that really needs this sequence length */
		int min = 0x80;

		if (ch >= 0xe0) {
			if (ch >= 0xf0) {
				/* 0xf0 - 0xf4 */
				min = 0x10000;
				ch &= 0x07;
				code = ch << 18;
				ch = read_u8(data);
				if (ch < 0x80 || ch > 0xbf)
					goto error;
				ch &= 0x3f;
			} else {
				/* 0xe0 - 0xef */
				min = 0x800;
				ch &= 0x0f;
			}
			code += ch << 12;
			/* Only a four byte sequence can exceed U+10FFFF here */
			if (code >= 0x110000)
				goto error;
			ch = read_u8(data);
			if (ch < 0x80 || ch > 0xbf)
				goto error;
		}
		/* 0xc2 - 0xdf or continuation byte (0x80 - 0xbf) */
		ch &= 0x3f;
		code += ch << 6;
		ch = read_u8(data);
		if (ch < 0x80 || ch > 0xbf)
			goto error;
		ch &= 0x3f;
		ch += code;
		/*
		 * The surrogate range can only be recognized once the low
		 * bits are known, so validate the fully assembled value.
		 */
		if (ch < min || (ch >= 0xD800 && ch <= 0xDFFF))
			goto error;
	} else if (ch >= 0x80) {
		goto error;
	}
	return ch;
error:
	return '?';
}
75
/**
 * read_string() - read byte from character string
 *
 * @data:	address of the pointer into the string
 * Return:	byte read, 0 at the end of the string or if a pointer is NULL
 *
 * The string pointer only moves forward when a non-zero byte was read.
 */
static u8 read_string(void *data)
{
	const char **cursor = (const char **)data;

	if (cursor && *cursor && **cursor)
		return *(*cursor)++;
	return 0;
}
96
/**
 * read_console() - read byte from console
 *
 * @data:	not used, needed to match the stream reader interface
 * Return:	byte read, or 0 if getc() returned a negative value
 */
static u8 read_console(void *data)
{
	int ch = getc();

	return ch < 0 ? 0 : ch;
}
112
console_read_unicode(s32 * code)113 int console_read_unicode(s32 *code)
114 {
115 if (!tstc()) {
116 /* No input available */
117 return 1;
118 }
119
120 /* Read Unicode code */
121 *code = get_code(read_console, NULL);
122 return 0;
123 }
124
utf8_get(const char ** src)125 s32 utf8_get(const char **src)
126 {
127 return get_code(read_string, src);
128 }
129
/**
 * utf8_put() - write a Unicode code point as UTF-8
 *
 * @code:	code point to encode
 * @dst:	address of the output pointer; on success it is advanced past
 *		the one to four bytes written
 * Return:	0 on success, -1 if @dst or *@dst is NULL or if @code is
 *		negative, a surrogate (U+D800 - U+DFFF) or above U+10FFFF
 *
 * Nothing is written and *@dst is left untouched when -1 is returned.
 * No terminating 0 is appended.
 */
int utf8_put(s32 code, char **dst)
{
	char *out;

	if (!dst || !*dst)
		return -1;
	/* A negative value is no code point; do not emit its low byte */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
		return -1;

	out = *dst;
	if (code <= 0x007F) {
		*out++ = code;
	} else if (code <= 0x07FF) {
		*out++ = code >> 6 | 0xC0;
		*out++ = (code & 0x3F) | 0x80;
	} else if (code < 0x10000) {
		*out++ = code >> 12 | 0xE0;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	} else {
		*out++ = code >> 18 | 0xF0;
		*out++ = (code >> 12 & 0x3F) | 0x80;
		*out++ = (code >> 6 & 0x3F) | 0x80;
		*out++ = (code & 0x3F) | 0x80;
	}
	*dst = out;
	return 0;
}
158
/**
 * utf8_utf16_strnlen() - count UTF-16 units needed for a UTF-8 string
 *
 * @src:	UTF-8 string
 * @count:	maximum number of code points to examine
 * Return:	number of u16 units, not counting a terminating 0
 */
size_t utf8_utf16_strnlen(const char *src, size_t count)
{
	size_t units = 0;

	while (*src && count) {
		s32 cp = utf8_get(&src);

		if (!cp)
			break;
		/*
		 * Code points from U+10000 upwards take a surrogate pair.
		 * Everything else, a negative value included, takes one unit.
		 */
		units += (cp >= 0x10000) ? 2 : 1;
		--count;
	}
	return units;
}
179
/**
 * utf8_utf16_strncpy() - convert a UTF-8 string to UTF-16
 *
 * @dst:	address of the output pointer; it is advanced by utf16_put()
 *		and ends up pointing at the terminating 0 written here
 * @src:	UTF-8 string
 * @count:	maximum number of code points to convert
 * Return:	0 on success, -1 if @src, @dst or *@dst is NULL
 *
 * A negative value from utf8_get() is replaced by '?'.
 */
int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
{
	if (!src || !dst || !*dst)
		return -1;

	while (count && *src) {
		s32 cp = utf8_get(&src);

		utf16_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
195
/**
 * utf16_get() - get next code point from a UTF-16 string
 *
 * @src:	address of the pointer into the string; it is advanced past
 *		every u16 unit consumed
 * Return:	code point, 0 at the end of the string (pointer not moved),
 *		-1 if @src or *@src is NULL or the sequence is ill-formed
 *
 * Ill-formed sequences are a low surrogate without a preceding high
 * surrogate and a high surrogate that is not followed by a low surrogate.
 * In the latter case the following unit is consumed as well unless it is the
 * terminating 0.
 */
s32 utf16_get(const u16 **src)
{
	s32 code, code2;

	if (!src || !*src)
		return -1;
	if (!**src)
		return 0;
	code = **src;
	++*src;
	/* A low surrogate must not come first */
	if (code >= 0xDC00 && code <= 0xDFFF)
		return -1;
	if (code >= 0xD800 && code <= 0xDBFF) {
		if (!**src)
			return -1;
		code2 = **src;
		++*src;
		/* 0xDC00 and 0xDFFF themselves are valid low surrogates */
		if (code2 < 0xDC00 || code2 > 0xDFFF)
			return -1;
		code = 0x10000 + ((code & 0x3ff) << 10) + (code2 & 0x3ff);
	}
	return code;
}
223
/**
 * utf16_put() - write a Unicode code point as UTF-16
 *
 * @code:	code point to encode
 * @dst:	address of the output pointer; on success it is advanced past
 *		the one or two u16 units written
 * Return:	0 on success, -1 if @dst or *@dst is NULL or if @code is
 *		negative, a surrogate (U+D800 - U+DFFF) or above U+10FFFF
 *
 * Nothing is written and *@dst is left untouched when -1 is returned.
 * No terminating 0 is appended.
 */
int utf16_put(s32 code, u16 **dst)
{
	u16 *out;

	if (!dst || !*dst)
		return -1;
	/* A negative value is no code point; do not store its low 16 bits */
	if (code < 0 || (code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
		return -1;

	out = *dst;
	if (code < 0x10000) {
		*out++ = code;
	} else {
		/* Split the 20 bit offset into a surrogate pair */
		code -= 0x10000;
		*out++ = code >> 10 | 0xD800;
		*out++ = (code & 0x3ff) | 0xDC00;
	}
	*dst = out;
	return 0;
}
241
/**
 * utf16_strnlen() - count code points in a UTF-16 string
 *
 * @src:	UTF-16 string
 * @count:	maximum number of code points to examine
 * Return:	number of code points; an ill-formed sequence reported by
 *		utf16_get() is counted as one so that room for a replacement
 *		character is reserved
 */
size_t utf16_strnlen(const u16 *src, size_t count)
{
	size_t points = 0;

	while (*src && count) {
		if (!utf16_get(&src))
			break;
		++points;
		--count;
	}
	return points;
}
259
/**
 * utf16_utf8_strnlen() - count UTF-8 bytes needed for a UTF-16 string
 *
 * @src:	UTF-16 string
 * @count:	maximum number of code points to examine
 * Return:	number of bytes, not counting a terminating 0
 */
size_t utf16_utf8_strnlen(const u16 *src, size_t count)
{
	size_t bytes = 0;

	while (*src && count) {
		s32 cp = utf16_get(&src);

		if (!cp)
			break;
		if (cp < 0x80) {
			/*
			 * ASCII, or a negative error value for which one
			 * byte is reserved for a replacement character
			 */
			bytes += 1;
		} else if (cp < 0x800) {
			bytes += 2;
		} else if (cp < 0x10000) {
			bytes += 3;
		} else {
			bytes += 4;
		}
		--count;
	}
	return bytes;
}
283
/**
 * utf16_utf8_strncpy() - convert a UTF-16 string to UTF-8
 *
 * @dst:	address of the output pointer; it is advanced by utf8_put()
 *		and ends up pointing at the terminating 0 written here
 * @src:	UTF-16 string
 * @count:	maximum number of code points to convert
 * Return:	0 on success, -1 if @src, @dst or *@dst is NULL
 *
 * A negative value from utf16_get() is replaced by '?'.
 */
int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
{
	if (!src || !dst || !*dst)
		return -1;

	while (count && *src) {
		s32 cp = utf16_get(&src);

		utf8_put(cp < 0 ? '?' : cp, dst);
		--count;
	}
	**dst = 0;
	return 0;
}
299
utf_to_lower(const s32 code)300 s32 utf_to_lower(const s32 code)
301 {
302 struct capitalization_table *pos = capitalization_table;
303 s32 ret = code;
304
305 if (code <= 0x7f) {
306 if (code >= 'A' && code <= 'Z')
307 ret += 0x20;
308 return ret;
309 }
310 for (; pos->upper; ++pos) {
311 if (pos->upper == code) {
312 ret = pos->lower;
313 break;
314 }
315 }
316 return ret;
317 }
318
utf_to_upper(const s32 code)319 s32 utf_to_upper(const s32 code)
320 {
321 struct capitalization_table *pos = capitalization_table;
322 s32 ret = code;
323
324 if (code <= 0x7f) {
325 if (code >= 'a' && code <= 'z')
326 ret -= 0x20;
327 return ret;
328 }
329 for (; pos->lower; ++pos) {
330 if (pos->lower == code) {
331 ret = pos->upper;
332 break;
333 }
334 }
335 return ret;
336 }
337
/**
 * u16_strncmp() - compare two u16 strings
 *
 * @s1:		first string to compare
 * @s2:		second string to compare
 * @n:		maximum number of u16 to compare
 * Return:	0  if the first n u16 are the same in s1 and s2
 *		< 0 if the first different u16 in s1 is less than the
 *		corresponding u16 in s2
 *		> 0 if the first different u16 in s1 is greater than the
 *		corresponding u16 in s2
 *
 * The comparison also stops at the terminating 0 of @s1.
 */
int u16_strncmp(const u16 *s1, const u16 *s2, size_t n)
{
	while (n) {
		int diff = *s1 - *s2;

		if (diff || !*s1)
			return diff;
		++s1;
		++s2;
		--n;
	}
	return 0;
}
362
/**
 * u16_strlen() - count the 16-bit units of a zero-terminated string
 *
 * @in:		string, read byte by byte
 * Return:	number of 2-byte units preceding the first unit whose two
 *		bytes are both 0
 */
size_t u16_strlen(const void *in)
{
	const char *bytes = in;
	size_t units = 0;

	while (bytes[2 * units] || bytes[2 * units + 1])
		++units;
	return units;
}
374
/**
 * u16_strnlen() - count u16 units of a string up to a limit
 *
 * @in:		string
 * @count:	maximum number of units to examine
 * Return:	number of units before the terminating 0, at most @count
 */
size_t u16_strnlen(const u16 *in, size_t count)
{
	size_t len = 0;

	while (len < count && in[len])
		++len;
	return len;
}
381
/**
 * u16_strcpy() - copy a zero-terminated u16 string
 *
 * @dest:	destination buffer
 * @src:	source string
 * Return:	@dest
 *
 * Units are copied up to and including the terminating 0. The size of the
 * destination is not checked.
 */
u16 *u16_strcpy(u16 *dest, const u16 *src)
{
	u16 *out = dest;

	do {
		*out++ = *src;
	} while (*src++);

	return dest;
}
394
/**
 * u16_strdup() - duplicate a zero-terminated u16 string
 *
 * @src:	string to copy
 * Return:	copy obtained from malloc() including the terminating 0,
 *		NULL if @src is NULL or the allocation fails
 */
u16 *u16_strdup(const void *src)
{
	size_t bytes;
	u16 *copy;

	if (!src)
		return NULL;

	/* Size in bytes including the terminating unit */
	bytes = sizeof(u16) * (u16_strlen(src) + 1);
	copy = malloc(bytes);
	if (copy)
		memcpy(copy, src, bytes);

	return copy;
}
410
/**
 * utf16_to_utf8() - convert a UTF-16 buffer to UTF-8
 *
 * @dest:	output buffer; its size is not checked
 * @src:	UTF-16 input
 * @size:	number of 16-bit units to read from @src
 * Return:	pointer just past the last byte written
 *
 * No terminating 0 is written and a 0 unit in the input does not stop the
 * conversion. A low surrogate without a preceding high surrogate and a high
 * surrogate that is not followed by a low surrogate are each replaced by
 * '?'. A high surrogate in the very last unit of the input produces no
 * output.
 */
uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
{
	uint32_t code_high = 0;

	while (size--) {
		uint32_t code = *src++;

		if (code_high) {
			if (code >= 0xDC00 && code <= 0xDFFF) {
				/* Surrogate pair. */
				code = ((code_high - 0xD800) << 10) +
				       (code - 0xDC00) + 0x10000;

				*dest++ = (code >> 18) | 0xF0;
				*dest++ = ((code >> 12) & 0x3F) | 0x80;
				*dest++ = ((code >> 6) & 0x3F) | 0x80;
				*dest++ = (code & 0x3F) | 0x80;
			} else {
				/* Unpaired high surrogate. */
				*dest++ = '?';
				/*
				 * The current unit may be valid on its own.
				 * Step back and restore the budget so that it
				 * is converted in the next iteration instead
				 * of costing the last unit of the input.
				 */
				src--;
				size++;
			}
			code_high = 0;
		} else if (code <= 0x007F) {
			*dest++ = code;
		} else if (code <= 0x07FF) {
			*dest++ = (code >> 6) | 0xC0;
			*dest++ = (code & 0x3F) | 0x80;
		} else if (code >= 0xD800 && code <= 0xDBFF) {
			/* Remember it and wait for the low surrogate */
			code_high = code;
		} else if (code >= 0xDC00 && code <= 0xDFFF) {
			/* Low surrogate without a high surrogate. */
			*dest++ = '?';
		} else {
			/* Rest of the BMP; a 16-bit unit cannot exceed 0xFFFF */
			*dest++ = (code >> 12) | 0xE0;
			*dest++ = ((code >> 6) & 0x3F) | 0x80;
			*dest++ = (code & 0x3F) | 0x80;
		}
	}

	return dest;
}
463