1 /*
2 * Copyright (c) 2014 SGI.
3 * Copyright (c) 2018 Collabora Ltd.
4 * All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 */
16
17 /*
18 * This code is adapted from the Linux Kernel. We have a
19 * userspace version here such that the hashes will match that
20 * implementation.
21 */
22
23 #include <stdint.h>
24 #include <unistd.h>
25 #include <string.h>
26 #include <limits.h>
27 #include <errno.h>
28
29 #include <f2fs_fs.h>
30
31 /* Encoding a unicode version number as a single unsigned int. */
32 #define UNICODE_MAJ_SHIFT (16)
33 #define UNICODE_MIN_SHIFT (8)
34
35 #define UNICODE_AGE(MAJ, MIN, REV) \
36 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
37 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
38 ((unsigned int)(REV)))
39
40 /* Needed in struct utf8cursor below. */
41 #define UTF8HANGULLEAF (12)
42
43 /*
44 * Cursor structure used by the normalizer.
45 */
46 struct utf8cursor {
47 const struct utf8data *data;
48 const char *s;
49 const char *p;
50 const char *ss;
51 const char *sp;
52 unsigned int len;
53 unsigned int slen;
54 short int ccc;
55 short int nccc;
56 unsigned char hangul[UTF8HANGULLEAF];
57 };
58
59 /*
60 * Initialize a utf8cursor to normalize a string.
61 * Returns 0 on success.
62 * Returns -1 on failure.
63 */
64 // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
65 // const char *s);
66 // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
67 // const char *s, size_t len);
68
69 /*
70 * Get the next byte in the normalization.
71 * Returns a value > 0 && < 256 on success.
72 * Returns 0 when the end of the normalization is reached.
73 * Returns -1 if the string being normalized is not valid UTF-8.
74 */
75 // extern int utf8byte(struct utf8cursor *u8c);
76
77
78 struct utf8data {
79 unsigned int maxage;
80 unsigned int offset;
81 };
82
83 #define __INCLUDED_FROM_UTF8NORM_C__
84 #include "utf8data.h"
85 #undef __INCLUDED_FROM_UTF8NORM_C__
86
87 #define ARRAY_SIZE(array) \
88 (sizeof(array) / sizeof(array[0]))
89
90 #if 0
91 /* Highest unicode version supported by the data tables. */
92 static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
93 {
94 int i = ARRAY_SIZE(utf8agetab) - 1;
95 unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
96
97 while (i >= 0 && utf8agetab[i] != 0) {
98 if (sb_utf8version == utf8agetab[i])
99 return 1;
100 i--;
101 }
102 return 0;
103 }
104 #endif
105
106 #if 0
107 static int utf8version_latest(void)
108 {
109 return utf8vers;
110 }
111 #endif
112
113 /*
114 * UTF-8 valid ranges.
115 *
116 * The UTF-8 encoding spreads the bits of a 32bit word over several
117 * bytes. This table gives the ranges that can be held and how they'd
118 * be represented.
119 *
120 * 0x00000000 0x0000007F: 0xxxxxxx
121 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
122 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
123 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
124 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
125 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
126 *
127 * There is an additional requirement on UTF-8, in that only the
128 * shortest representation of a 32bit value is to be used. A decoder
129 * must not decode sequences that do not satisfy this requirement.
130 * Thus the allowed ranges have a lower bound.
131 *
132 * 0x00000000 0x0000007F: 0xxxxxxx
133 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
134 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
135 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
136 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
137 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
138 *
139 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
140 * 17 planes of 65536 values. This limits the sequences actually seen
141 * even more, to just the following.
142 *
143 * 0 - 0x7F: 0 - 0x7F
144 * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
145 * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
146 * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
147 *
148 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
149 *
150 * Note that the longest sequence seen with valid usage is 4 bytes,
151 * the same a single UTF-32 character. This makes the UTF-8
152 * representation of Unicode strictly smaller than UTF-32.
153 *
154 * The shortest sequence requirement was introduced by:
155 * Corrigendum #1: UTF-8 Shortest Form
156 * It can be found here:
157 * http://www.unicode.org/versions/corrigendum1.html
158 *
159 */
160
161 /*
162 * Return the number of bytes used by the current UTF-8 sequence.
163 * Assumes the input points to the first byte of a valid UTF-8
164 * sequence.
165 */
utf8clen(const char * s)166 static inline int utf8clen(const char *s)
167 {
168 unsigned char c = *s;
169
170 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
171 }
172
173 /*
174 * Decode a 3-byte UTF-8 sequence.
175 */
176 static unsigned int
utf8decode3(const char * str)177 utf8decode3(const char *str)
178 {
179 unsigned int uc;
180
181 uc = *str++ & 0x0F;
182 uc <<= 6;
183 uc |= *str++ & 0x3F;
184 uc <<= 6;
185 uc |= *str++ & 0x3F;
186
187 return uc;
188 }
189
190 /*
191 * Encode a 3-byte UTF-8 sequence.
192 */
193 static int
utf8encode3(char * str,unsigned int val)194 utf8encode3(char *str, unsigned int val)
195 {
196 str[2] = (val & 0x3F) | 0x80;
197 val >>= 6;
198 str[1] = (val & 0x3F) | 0x80;
199 val >>= 6;
200 str[0] = val | 0xE0;
201
202 return 3;
203 }
204
205 /*
206 * utf8trie_t
207 *
208 * A compact binary tree, used to decode UTF-8 characters.
209 *
210 * Internal nodes are one byte for the node itself, and up to three
211 * bytes for an offset into the tree. The first byte contains the
212 * following information:
213 * NEXTBYTE - flag - advance to next byte if set
214 * BITNUM - 3 bit field - the bit number to tested
215 * OFFLEN - 2 bit field - number of bytes in the offset
216 * if offlen == 0 (non-branching node)
217 * RIGHTPATH - 1 bit field - set if the following node is for the
218 * right-hand path (tested bit is set)
219 * TRIENODE - 1 bit field - set if the following node is an internal
220 * node, otherwise it is a leaf node
221 * if offlen != 0 (branching node)
222 * LEFTNODE - 1 bit field - set if the left-hand node is internal
223 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
224 *
225 * Due to the way utf8 works, there cannot be branching nodes with
226 * NEXTBYTE set, and moreover those nodes always have a righthand
227 * descendant.
228 */
229 typedef const unsigned char utf8trie_t;
230 #define BITNUM 0x07
231 #define NEXTBYTE 0x08
232 #define OFFLEN 0x30
233 #define OFFLEN_SHIFT 4
234 #define RIGHTPATH 0x40
235 #define TRIENODE 0x80
236 #define RIGHTNODE 0x40
237 #define LEFTNODE 0x80
238
239 /*
240 * utf8leaf_t
241 *
242 * The leaves of the trie are embedded in the trie, and so the same
243 * underlying datatype: unsigned char.
244 *
245 * leaf[0]: The unicode version, stored as a generation number that is
246 * an index into utf8agetab[]. With this we can filter code
247 * points based on the unicode version in which they were
248 * defined. The CCC of a non-defined code point is 0.
249 * leaf[1]: Canonical Combining Class. During normalization, we need
250 * to do a stable sort into ascending order of all characters
251 * with a non-zero CCC that occur between two characters with
252 * a CCC of 0, or at the begin or end of a string.
253 * The unicode standard guarantees that all CCC values are
254 * between 0 and 254 inclusive, which leaves 255 available as
255 * a special value.
256 * Code points with CCC 0 are known as stoppers.
257 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
258 * start of a NUL-terminated string that is the decomposition
259 * of the character.
260 * The CCC of a decomposable character is the same as the CCC
261 * of the first character of its decomposition.
262 * Some characters decompose as the empty string: these are
263 * characters with the Default_Ignorable_Code_Point property.
264 * These do affect normalization, as they all have CCC 0.
265 *
266 * The decompositions in the trie have been fully expanded, with the
267 * exception of Hangul syllables, which are decomposed algorithmically.
268 *
269 * Casefolding, if applicable, is also done using decompositions.
270 *
271 * The trie is constructed in such a way that leaves exist for all
272 * UTF-8 sequences that match the criteria from the "UTF-8 valid
273 * ranges" comment above, and only for those sequences. Therefore a
274 * lookup in the trie can be used to validate the UTF-8 input.
275 */
276 typedef const unsigned char utf8leaf_t;
277
278 #define LEAF_GEN(LEAF) ((LEAF)[0])
279 #define LEAF_CCC(LEAF) ((LEAF)[1])
280 #define LEAF_STR(LEAF) ((const char *)((LEAF) + 2))
281
282 #define MINCCC (0)
283 #define MAXCCC (254)
284 #define STOPPER (0)
285 #define DECOMPOSE (255)
286
287 /* Marker for hangul syllable decomposition. */
288 #define HANGUL ((char)(255))
289 /* Size of the synthesized leaf used for Hangul syllable decomposition. */
290 #define UTF8HANGULLEAF (12)
291
292 /*
293 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
294 *
295 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
296 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
297 *
298 * SBase = 0xAC00
299 * LBase = 0x1100
300 * VBase = 0x1161
301 * TBase = 0x11A7
302 * LCount = 19
303 * VCount = 21
304 * TCount = 28
305 * NCount = 588 (VCount * TCount)
306 * SCount = 11172 (LCount * NCount)
307 *
308 * Decomposition:
309 * SIndex = s - SBase
310 *
311 * LV (Canonical/Full)
312 * LIndex = SIndex / NCount
313 * VIndex = (Sindex % NCount) / TCount
314 * LPart = LBase + LIndex
315 * VPart = VBase + VIndex
316 *
317 * LVT (Canonical)
318 * LVIndex = (SIndex / TCount) * TCount
319 * TIndex = (Sindex % TCount)
320 * LVPart = SBase + LVIndex
321 * TPart = TBase + TIndex
322 *
323 * LVT (Full)
324 * LIndex = SIndex / NCount
325 * VIndex = (Sindex % NCount) / TCount
326 * TIndex = (Sindex % TCount)
327 * LPart = LBase + LIndex
328 * VPart = VBase + VIndex
329 * if (TIndex == 0) {
330 * d = <LPart, VPart>
331 * } else {
332 * TPart = TBase + TIndex
333 * d = <LPart, TPart, VPart>
334 * }
335 */
336
337 /* Constants */
338 #define SB (0xAC00)
339 #define LB (0x1100)
340 #define VB (0x1161)
341 #define TB (0x11A7)
342 #define LC (19)
343 #define VC (21)
344 #define TC (28)
345 #define NC (VC * TC)
346 #define SC (LC * NC)
347
348 /* Algorithmic decomposition of hangul syllable. */
349 static utf8leaf_t *
utf8hangul(const char * str,unsigned char * hangul)350 utf8hangul(const char *str, unsigned char *hangul)
351 {
352 unsigned int si;
353 unsigned int li;
354 unsigned int vi;
355 unsigned int ti;
356 unsigned char *h;
357
358 /* Calculate the SI, LI, VI, and TI values. */
359 si = utf8decode3(str) - SB;
360 li = si / NC;
361 vi = (si % NC) / TC;
362 ti = si % TC;
363
364 /* Fill in base of leaf. */
365 h = hangul;
366 LEAF_GEN(h) = 2;
367 LEAF_CCC(h) = DECOMPOSE;
368 h += 2;
369
370 /* Add LPart, a 3-byte UTF-8 sequence. */
371 h += utf8encode3((char *)h, li + LB);
372
373 /* Add VPart, a 3-byte UTF-8 sequence. */
374 h += utf8encode3((char *)h, vi + VB);
375
376 /* Add TPart if required, also a 3-byte UTF-8 sequence. */
377 if (ti)
378 h += utf8encode3((char *)h, ti + TB);
379
380 /* Terminate string. */
381 h[0] = '\0';
382
383 return hangul;
384 }
385
386 /*
387 * Use trie to scan s, touching at most len bytes.
388 * Returns the leaf if one exists, NULL otherwise.
389 *
390 * A non-NULL return guarantees that the UTF-8 sequence starting at s
391 * is well-formed and corresponds to a known unicode code point. The
392 * shorthand for this will be "is valid UTF-8 unicode".
393 */
utf8nlookup(const struct utf8data * data,unsigned char * hangul,const char * s,size_t len)394 static utf8leaf_t *utf8nlookup(const struct utf8data *data,
395 unsigned char *hangul, const char *s, size_t len)
396 {
397 utf8trie_t *trie;
398 int offlen;
399 int offset;
400 int mask;
401 int node;
402
403 if (!data)
404 return NULL;
405 if (len == 0)
406 return NULL;
407
408 trie = utf8data + data->offset;
409 node = 1;
410 while (node) {
411 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
412 if (*trie & NEXTBYTE) {
413 if (--len == 0)
414 return NULL;
415 s++;
416 }
417 mask = 1 << (*trie & BITNUM);
418 if (*s & mask) {
419 /* Right leg */
420 if (offlen) {
421 /* Right node at offset of trie */
422 node = (*trie & RIGHTNODE);
423 offset = trie[offlen];
424 while (--offlen) {
425 offset <<= 8;
426 offset |= trie[offlen];
427 }
428 trie += offset;
429 } else if (*trie & RIGHTPATH) {
430 /* Right node after this node */
431 node = (*trie & TRIENODE);
432 trie++;
433 } else {
434 /* No right node. */
435 return NULL;
436 }
437 } else {
438 /* Left leg */
439 if (offlen) {
440 /* Left node after this node. */
441 node = (*trie & LEFTNODE);
442 trie += offlen + 1;
443 } else if (*trie & RIGHTPATH) {
444 /* No left node. */
445 return NULL;
446 } else {
447 /* Left node after this node */
448 node = (*trie & TRIENODE);
449 trie++;
450 }
451 }
452 }
453 /*
454 * Hangul decomposition is done algorithmically. These are the
455 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
456 * always 3 bytes long, so s has been advanced twice, and the
457 * start of the sequence is at s-2.
458 */
459 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
460 trie = utf8hangul(s - 2, hangul);
461 return trie;
462 }
463
464 /*
465 * Use trie to scan s.
466 * Returns the leaf if one exists, NULL otherwise.
467 *
468 * Forwards to utf8nlookup().
469 */
utf8lookup(const struct utf8data * data,unsigned char * hangul,const char * s)470 static utf8leaf_t *utf8lookup(const struct utf8data *data,
471 unsigned char *hangul, const char *s)
472 {
473 return utf8nlookup(data, hangul, s, (size_t)-1);
474 }
475
476 #if 0
477 /*
478 * Maximum age of any character in s.
479 * Return -1 if s is not valid UTF-8 unicode.
480 * Return 0 if only non-assigned code points are used.
481 */
482 static int utf8agemax(const struct utf8data *data, const char *s)
483 {
484 utf8leaf_t *leaf;
485 int age = 0;
486 int leaf_age;
487 unsigned char hangul[UTF8HANGULLEAF];
488
489 if (!data)
490 return -1;
491
492 while (*s) {
493 leaf = utf8lookup(data, hangul, s);
494 if (!leaf)
495 return -1;
496
497 leaf_age = utf8agetab[LEAF_GEN(leaf)];
498 if (leaf_age <= data->maxage && leaf_age > age)
499 age = leaf_age;
500 s += utf8clen(s);
501 }
502 return age;
503 }
504 #endif
505
506 #if 0
507 /*
508 * Minimum age of any character in s.
509 * Return -1 if s is not valid UTF-8 unicode.
510 * Return 0 if non-assigned code points are used.
511 */
512 static int utf8agemin(const struct utf8data *data, const char *s)
513 {
514 utf8leaf_t *leaf;
515 int age;
516 int leaf_age;
517 unsigned char hangul[UTF8HANGULLEAF];
518
519 if (!data)
520 return -1;
521 age = data->maxage;
522 while (*s) {
523 leaf = utf8lookup(data, hangul, s);
524 if (!leaf)
525 return -1;
526 leaf_age = utf8agetab[LEAF_GEN(leaf)];
527 if (leaf_age <= data->maxage && leaf_age < age)
528 age = leaf_age;
529 s += utf8clen(s);
530 }
531 return age;
532 }
533 #endif
534
535 #if 0
536 /*
537 * Maximum age of any character in s, touch at most len bytes.
538 * Return -1 if s is not valid UTF-8 unicode.
539 */
540 static int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
541 {
542 utf8leaf_t *leaf;
543 int age = 0;
544 int leaf_age;
545 unsigned char hangul[UTF8HANGULLEAF];
546
547 if (!data)
548 return -1;
549
550 while (len && *s) {
551 leaf = utf8nlookup(data, hangul, s, len);
552 if (!leaf)
553 return -1;
554 leaf_age = utf8agetab[LEAF_GEN(leaf)];
555 if (leaf_age <= data->maxage && leaf_age > age)
556 age = leaf_age;
557 len -= utf8clen(s);
558 s += utf8clen(s);
559 }
560 return age;
561 }
562 #endif
563
564 #if 0
565 /*
566 * Maximum age of any character in s, touch at most len bytes.
567 * Return -1 if s is not valid UTF-8 unicode.
568 */
569 static int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
570 {
571 utf8leaf_t *leaf;
572 int leaf_age;
573 int age;
574 unsigned char hangul[UTF8HANGULLEAF];
575
576 if (!data)
577 return -1;
578 age = data->maxage;
579 while (len && *s) {
580 leaf = utf8nlookup(data, hangul, s, len);
581 if (!leaf)
582 return -1;
583 leaf_age = utf8agetab[LEAF_GEN(leaf)];
584 if (leaf_age <= data->maxage && leaf_age < age)
585 age = leaf_age;
586 len -= utf8clen(s);
587 s += utf8clen(s);
588 }
589 return age;
590 }
591 #endif
592
593 #if 0
594 /*
595 * Length of the normalization of s.
596 * Return -1 if s is not valid UTF-8 unicode.
597 *
598 * A string of Default_Ignorable_Code_Point has length 0.
599 */
600 static ssize_t utf8len(const struct utf8data *data, const char *s)
601 {
602 utf8leaf_t *leaf;
603 size_t ret = 0;
604 unsigned char hangul[UTF8HANGULLEAF];
605
606 if (!data)
607 return -1;
608 while (*s) {
609 leaf = utf8lookup(data, hangul, s);
610 if (!leaf)
611 return -1;
612 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
613 ret += utf8clen(s);
614 else if (LEAF_CCC(leaf) == DECOMPOSE)
615 ret += strlen(LEAF_STR(leaf));
616 else
617 ret += utf8clen(s);
618 s += utf8clen(s);
619 }
620 return ret;
621 }
622 #endif
623
624 #if 0
625 /*
626 * Length of the normalization of s, touch at most len bytes.
627 * Return -1 if s is not valid UTF-8 unicode.
628 */
629 static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
630 {
631 utf8leaf_t *leaf;
632 size_t ret = 0;
633 unsigned char hangul[UTF8HANGULLEAF];
634
635 if (!data)
636 return -1;
637 while (len && *s) {
638 leaf = utf8nlookup(data, hangul, s, len);
639 if (!leaf)
640 return -1;
641 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
642 ret += utf8clen(s);
643 else if (LEAF_CCC(leaf) == DECOMPOSE)
644 ret += strlen(LEAF_STR(leaf));
645 else
646 ret += utf8clen(s);
647 len -= utf8clen(s);
648 s += utf8clen(s);
649 }
650 return ret;
651 }
652 #endif
653
654 /*
655 * Set up an utf8cursor for use by utf8byte().
656 *
657 * u8c : pointer to cursor.
658 * data : const struct utf8data to use for normalization.
659 * s : string.
660 * len : length of s.
661 *
662 * Returns -1 on error, 0 on success.
663 */
utf8ncursor(struct utf8cursor * u8c,const struct utf8data * data,const char * s,size_t len)664 static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
665 const char *s, size_t len)
666 {
667 if (!data)
668 return -1;
669 if (!s)
670 return -1;
671 u8c->data = data;
672 u8c->s = s;
673 u8c->p = NULL;
674 u8c->ss = NULL;
675 u8c->sp = NULL;
676 u8c->len = len;
677 u8c->slen = 0;
678 u8c->ccc = STOPPER;
679 u8c->nccc = STOPPER;
680 /* Check we didn't clobber the maximum length. */
681 if (u8c->len != len)
682 return -1;
683 /* The first byte of s may not be an utf8 continuation. */
684 if (len > 0 && (*s & 0xC0) == 0x80)
685 return -1;
686 return 0;
687 }
688
689 #if 0
690 /*
691 * Set up an utf8cursor for use by utf8byte().
692 *
693 * u8c : pointer to cursor.
694 * data : const struct utf8data to use for normalization.
695 * s : NUL-terminated string.
696 *
697 * Returns -1 on error, 0 on success.
698 */
699 static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
700 const char *s)
701 {
702 return utf8ncursor(u8c, data, s, (unsigned int)-1);
703 }
704 #endif
705
706 /*
707 * Get one byte from the normalized form of the string described by u8c.
708 *
709 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
710 *
711 * The cursor keeps track of the location in the string in u8c->s.
712 * When a character is decomposed, the current location is stored in
713 * u8c->p, and u8c->s is set to the start of the decomposition. Note
714 * that bytes from a decomposition do not count against u8c->len.
715 *
716 * Characters are emitted if they match the current CCC in u8c->ccc.
717 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
718 * and the function returns 0 in that case.
719 *
720 * Sorting by CCC is done by repeatedly scanning the string. The
721 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
722 * the start of the scan. The first pass finds the lowest CCC to be
723 * emitted and stores it in u8c->nccc, the second pass emits the
724 * characters with this CCC and finds the next lowest CCC. This limits
725 * the number of passes to 1 + the number of different CCCs in the
726 * sequence being scanned.
727 *
728 * Therefore:
729 * u8c->p != NULL -> a decomposition is being scanned.
730 * u8c->ss != NULL -> this is a repeating scan.
731 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
732 */
utf8byte(struct utf8cursor * u8c)733 static int utf8byte(struct utf8cursor *u8c)
734 {
735 utf8leaf_t *leaf;
736 int ccc;
737
738 for (;;) {
739 /* Check for the end of a decomposed character. */
740 if (u8c->p && *u8c->s == '\0') {
741 u8c->s = u8c->p;
742 u8c->p = NULL;
743 }
744
745 /* Check for end-of-string. */
746 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
747 /* There is no next byte. */
748 if (u8c->ccc == STOPPER)
749 return 0;
750 /* End-of-string during a scan counts as a stopper. */
751 ccc = STOPPER;
752 goto ccc_mismatch;
753 } else if ((*u8c->s & 0xC0) == 0x80) {
754 /* This is a continuation of the current character. */
755 if (!u8c->p)
756 u8c->len--;
757 return (unsigned char)*u8c->s++;
758 }
759
760 /* Look up the data for the current character. */
761 if (u8c->p) {
762 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
763 } else {
764 leaf = utf8nlookup(u8c->data, u8c->hangul,
765 u8c->s, u8c->len);
766 }
767
768 /* No leaf found implies that the input is a binary blob. */
769 if (!leaf)
770 return -1;
771
772 ccc = LEAF_CCC(leaf);
773 /* Characters that are too new have CCC 0. */
774 if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
775 ccc = STOPPER;
776 } else if (ccc == DECOMPOSE) {
777 u8c->len -= utf8clen(u8c->s);
778 u8c->p = u8c->s + utf8clen(u8c->s);
779 u8c->s = LEAF_STR(leaf);
780 /* Empty decomposition implies CCC 0. */
781 if (*u8c->s == '\0') {
782 if (u8c->ccc == STOPPER)
783 continue;
784 ccc = STOPPER;
785 goto ccc_mismatch;
786 }
787
788 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
789 if (!leaf)
790 return -1;
791 ccc = LEAF_CCC(leaf);
792 }
793
794 /*
795 * If this is not a stopper, then see if it updates
796 * the next canonical class to be emitted.
797 */
798 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
799 u8c->nccc = ccc;
800
801 /*
802 * Return the current byte if this is the current
803 * combining class.
804 */
805 if (ccc == u8c->ccc) {
806 if (!u8c->p)
807 u8c->len--;
808 return (unsigned char)*u8c->s++;
809 }
810
811 /* Current combining class mismatch. */
812 ccc_mismatch:
813 if (u8c->nccc == STOPPER) {
814 /*
815 * Scan forward for the first canonical class
816 * to be emitted. Save the position from
817 * which to restart.
818 */
819 u8c->ccc = MINCCC - 1;
820 u8c->nccc = ccc;
821 u8c->sp = u8c->p;
822 u8c->ss = u8c->s;
823 u8c->slen = u8c->len;
824 if (!u8c->p)
825 u8c->len -= utf8clen(u8c->s);
826 u8c->s += utf8clen(u8c->s);
827 } else if (ccc != STOPPER) {
828 /* Not a stopper, and not the ccc we're emitting. */
829 if (!u8c->p)
830 u8c->len -= utf8clen(u8c->s);
831 u8c->s += utf8clen(u8c->s);
832 } else if (u8c->nccc != MAXCCC + 1) {
833 /* At a stopper, restart for next ccc. */
834 u8c->ccc = u8c->nccc;
835 u8c->nccc = MAXCCC + 1;
836 u8c->s = u8c->ss;
837 u8c->p = u8c->sp;
838 u8c->len = u8c->slen;
839 } else {
840 /* All done, proceed from here. */
841 u8c->ccc = STOPPER;
842 u8c->nccc = STOPPER;
843 u8c->sp = NULL;
844 u8c->ss = NULL;
845 u8c->slen = 0;
846 }
847 }
848 }
849
850 #if 0
851 /*
852 * Look for the correct const struct utf8data for a unicode version.
853 * Returns NULL if the version requested is too new.
854 *
855 * Two normalization forms are supported: nfdi and nfdicf.
856 *
857 * nfdi:
858 * - Apply unicode normalization form NFD.
859 * - Remove any Default_Ignorable_Code_Point.
860 *
861 * nfdicf:
862 * - Apply unicode normalization form NFD.
863 * - Remove any Default_Ignorable_Code_Point.
864 * - Apply a full casefold (C + F).
865 */
866 static const struct utf8data *utf8nfdi(unsigned int maxage)
867 {
868 int i = ARRAY_SIZE(utf8nfdidata) - 1;
869
870 while (maxage < utf8nfdidata[i].maxage)
871 i--;
872 if (maxage > utf8nfdidata[i].maxage)
873 return NULL;
874 return &utf8nfdidata[i];
875 }
876 #endif
877
utf8nfdicf(unsigned int maxage)878 static const struct utf8data *utf8nfdicf(unsigned int maxage)
879 {
880 int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
881
882 while (maxage < utf8nfdicfdata[i].maxage)
883 i--;
884 if (maxage > utf8nfdicfdata[i].maxage)
885 return NULL;
886 return &utf8nfdicfdata[i];
887 }
888
utf8_casefold(const struct f2fs_nls_table * table,const unsigned char * str,size_t len,unsigned char * dest,size_t dlen)889 static int utf8_casefold(const struct f2fs_nls_table *table,
890 const unsigned char *str, size_t len,
891 unsigned char *dest, size_t dlen)
892 {
893 const struct utf8data *data = utf8nfdicf(table->version);
894 struct utf8cursor cur;
895 size_t nlen = 0;
896
897 if (utf8ncursor(&cur, data, (const char *) str, len) < 0)
898 goto invalid_seq;
899
900 for (nlen = 0; nlen < dlen; nlen++) {
901 int c = utf8byte(&cur);
902
903 dest[nlen] = c;
904 if (!c)
905 return nlen;
906 if (c == -1)
907 break;
908 }
909
910 return -ENAMETOOLONG;
911
912 invalid_seq:
913 if (dlen < len)
914 return -ENAMETOOLONG;
915
916 /* Signal invalid sequence */
917 return -EINVAL;
918 }
919
920 static const struct f2fs_nls_ops utf8_ops = {
921 .casefold = utf8_casefold,
922 };
923
924 static const struct f2fs_nls_table nls_utf8 = {
925 .ops = &utf8_ops,
926 .version = UNICODE_AGE(12, 1, 0),
927 };
928
f2fs_load_nls_table(int encoding)929 const struct f2fs_nls_table *f2fs_load_nls_table(int encoding)
930 {
931 if (encoding == F2FS_ENC_UTF8_12_1)
932 return &nls_utf8;
933
934 return NULL;
935 }
936