• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2014 SGI.
3  * Copyright (c) 2018 Collabora Ltd.
4  * All rights reserved.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it would be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  */
16 
17 /*
18  * This code is adapted from the Linux Kernel.  We have a
19  * userspace version here such that the hashes will match that
20  * implementation.
21  */
22 
23 #include <stdint.h>
24 #include <unistd.h>
25 #include <string.h>
26 #include <limits.h>
27 #include <errno.h>
28 
29 #include <f2fs_fs.h>
30 
31 /* Encoding a unicode version number as a single unsigned int. */
32 #define UNICODE_MAJ_SHIFT		(16)
33 #define UNICODE_MIN_SHIFT		(8)
34 
35 #define UNICODE_AGE(MAJ, MIN, REV)			\
36 	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
37 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
38 	 ((unsigned int)(REV)))
39 
40 /* Needed in struct utf8cursor below. */
41 #define UTF8HANGULLEAF	(12)
42 
43 /*
44  * Cursor structure used by the normalizer.
45  */
46 struct utf8cursor {
47 	const struct utf8data	*data;
48 	const char	*s;
49 	const char	*p;
50 	const char	*ss;
51 	const char	*sp;
52 	unsigned int	len;
53 	unsigned int	slen;
54 	short int	ccc;
55 	short int	nccc;
56 	unsigned char	hangul[UTF8HANGULLEAF];
57 };
58 
59 /*
60  * Initialize a utf8cursor to normalize a string.
61  * Returns 0 on success.
62  * Returns -1 on failure.
63  */
64 // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
65 //		      const char *s);
66 // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
67 //		       const char *s, size_t len);
68 
69 /*
70  * Get the next byte in the normalization.
71  * Returns a value > 0 && < 256 on success.
72  * Returns 0 when the end of the normalization is reached.
73  * Returns -1 if the string being normalized is not valid UTF-8.
74  */
75 // extern int utf8byte(struct utf8cursor *u8c);
76 
77 
78 struct utf8data {
79 	unsigned int maxage;
80 	unsigned int offset;
81 };
82 
83 #define __INCLUDED_FROM_UTF8NORM_C__
84 #include "utf8data.h"
85 #undef __INCLUDED_FROM_UTF8NORM_C__
86 
87 #define ARRAY_SIZE(array)			\
88         (sizeof(array) / sizeof(array[0]))
89 
90 #if 0
91 /* Highest unicode version supported by the data tables. */
92 static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
93 {
94 	int i = ARRAY_SIZE(utf8agetab) - 1;
95 	unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
96 
97 	while (i >= 0 && utf8agetab[i] != 0) {
98 		if (sb_utf8version == utf8agetab[i])
99 			return 1;
100 		i--;
101 	}
102 	return 0;
103 }
104 #endif
105 
106 #if 0
107 static int utf8version_latest(void)
108 {
109 	return utf8vers;
110 }
111 #endif
112 
113 /*
114  * UTF-8 valid ranges.
115  *
116  * The UTF-8 encoding spreads the bits of a 32bit word over several
117  * bytes. This table gives the ranges that can be held and how they'd
118  * be represented.
119  *
120  * 0x00000000 0x0000007F: 0xxxxxxx
121  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
122  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
123  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
124  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
125  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
126  *
127  * There is an additional requirement on UTF-8, in that only the
128  * shortest representation of a 32bit value is to be used.  A decoder
129  * must not decode sequences that do not satisfy this requirement.
130  * Thus the allowed ranges have a lower bound.
131  *
132  * 0x00000000 0x0000007F: 0xxxxxxx
133  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
134  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
135  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
136  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
137  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
138  *
139  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
140  * 17 planes of 65536 values.  This limits the sequences actually seen
141  * even more, to just the following.
142  *
143  *          0 -     0x7F: 0                   - 0x7F
144  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
145  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
146  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
147  *
148  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
149  *
150  * Note that the longest sequence seen with valid usage is 4 bytes,
151  * the same a single UTF-32 character.  This makes the UTF-8
152  * representation of Unicode strictly smaller than UTF-32.
153  *
154  * The shortest sequence requirement was introduced by:
155  *    Corrigendum #1: UTF-8 Shortest Form
156  * It can be found here:
157  *    http://www.unicode.org/versions/corrigendum1.html
158  *
159  */
160 
161 /*
162  * Return the number of bytes used by the current UTF-8 sequence.
163  * Assumes the input points to the first byte of a valid UTF-8
164  * sequence.
165  */
utf8clen(const char * s)166 static inline int utf8clen(const char *s)
167 {
168 	unsigned char c = *s;
169 
170 	return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
171 }
172 
173 /*
174  * Decode a 3-byte UTF-8 sequence.
175  */
176 static unsigned int
utf8decode3(const char * str)177 utf8decode3(const char *str)
178 {
179 	unsigned int		uc;
180 
181 	uc = *str++ & 0x0F;
182 	uc <<= 6;
183 	uc |= *str++ & 0x3F;
184 	uc <<= 6;
185 	uc |= *str++ & 0x3F;
186 
187 	return uc;
188 }
189 
190 /*
191  * Encode a 3-byte UTF-8 sequence.
192  */
193 static int
utf8encode3(char * str,unsigned int val)194 utf8encode3(char *str, unsigned int val)
195 {
196 	str[2] = (val & 0x3F) | 0x80;
197 	val >>= 6;
198 	str[1] = (val & 0x3F) | 0x80;
199 	val >>= 6;
200 	str[0] = val | 0xE0;
201 
202 	return 3;
203 }
204 
205 /*
206  * utf8trie_t
207  *
208  * A compact binary tree, used to decode UTF-8 characters.
209  *
210  * Internal nodes are one byte for the node itself, and up to three
211  * bytes for an offset into the tree.  The first byte contains the
212  * following information:
213  *  NEXTBYTE  - flag        - advance to next byte if set
214  *  BITNUM    - 3 bit field - the bit number to tested
215  *  OFFLEN    - 2 bit field - number of bytes in the offset
216  * if offlen == 0 (non-branching node)
217  *  RIGHTPATH - 1 bit field - set if the following node is for the
218  *                            right-hand path (tested bit is set)
219  *  TRIENODE  - 1 bit field - set if the following node is an internal
220  *                            node, otherwise it is a leaf node
221  * if offlen != 0 (branching node)
222  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
223  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
224  *
225  * Due to the way utf8 works, there cannot be branching nodes with
226  * NEXTBYTE set, and moreover those nodes always have a righthand
227  * descendant.
228  */
229 typedef const unsigned char utf8trie_t;
230 #define BITNUM		0x07
231 #define NEXTBYTE	0x08
232 #define OFFLEN		0x30
233 #define OFFLEN_SHIFT	4
234 #define RIGHTPATH	0x40
235 #define TRIENODE	0x80
236 #define RIGHTNODE	0x40
237 #define LEFTNODE	0x80
238 
239 /*
240  * utf8leaf_t
241  *
242  * The leaves of the trie are embedded in the trie, and so the same
243  * underlying datatype: unsigned char.
244  *
245  * leaf[0]: The unicode version, stored as a generation number that is
246  *          an index into utf8agetab[].  With this we can filter code
247  *          points based on the unicode version in which they were
248  *          defined.  The CCC of a non-defined code point is 0.
249  * leaf[1]: Canonical Combining Class. During normalization, we need
250  *          to do a stable sort into ascending order of all characters
251  *          with a non-zero CCC that occur between two characters with
252  *          a CCC of 0, or at the begin or end of a string.
253  *          The unicode standard guarantees that all CCC values are
254  *          between 0 and 254 inclusive, which leaves 255 available as
255  *          a special value.
256  *          Code points with CCC 0 are known as stoppers.
257  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
258  *          start of a NUL-terminated string that is the decomposition
259  *          of the character.
260  *          The CCC of a decomposable character is the same as the CCC
261  *          of the first character of its decomposition.
262  *          Some characters decompose as the empty string: these are
263  *          characters with the Default_Ignorable_Code_Point property.
264  *          These do affect normalization, as they all have CCC 0.
265  *
266  * The decompositions in the trie have been fully expanded, with the
267  * exception of Hangul syllables, which are decomposed algorithmically.
268  *
269  * Casefolding, if applicable, is also done using decompositions.
270  *
271  * The trie is constructed in such a way that leaves exist for all
272  * UTF-8 sequences that match the criteria from the "UTF-8 valid
273  * ranges" comment above, and only for those sequences.  Therefore a
274  * lookup in the trie can be used to validate the UTF-8 input.
275  */
276 typedef const unsigned char utf8leaf_t;
277 
278 #define LEAF_GEN(LEAF)	((LEAF)[0])
279 #define LEAF_CCC(LEAF)	((LEAF)[1])
280 #define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))
281 
282 #define MINCCC		(0)
283 #define MAXCCC		(254)
284 #define STOPPER		(0)
285 #define	DECOMPOSE	(255)
286 
287 /* Marker for hangul syllable decomposition. */
288 #define HANGUL		((char)(255))
289 /* Size of the synthesized leaf used for Hangul syllable decomposition. */
290 #define UTF8HANGULLEAF	(12)
291 
292 /*
293  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
294  *
295  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
296  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
297  *
298  * SBase = 0xAC00
299  * LBase = 0x1100
300  * VBase = 0x1161
301  * TBase = 0x11A7
302  * LCount = 19
303  * VCount = 21
304  * TCount = 28
305  * NCount = 588 (VCount * TCount)
306  * SCount = 11172 (LCount * NCount)
307  *
308  * Decomposition:
309  *   SIndex = s - SBase
310  *
311  * LV (Canonical/Full)
312  *   LIndex = SIndex / NCount
313  *   VIndex = (Sindex % NCount) / TCount
314  *   LPart = LBase + LIndex
315  *   VPart = VBase + VIndex
316  *
317  * LVT (Canonical)
318  *   LVIndex = (SIndex / TCount) * TCount
319  *   TIndex = (Sindex % TCount)
320  *   LVPart = SBase + LVIndex
321  *   TPart = TBase + TIndex
322  *
323  * LVT (Full)
324  *   LIndex = SIndex / NCount
325  *   VIndex = (Sindex % NCount) / TCount
326  *   TIndex = (Sindex % TCount)
327  *   LPart = LBase + LIndex
328  *   VPart = VBase + VIndex
329  *   if (TIndex == 0) {
330  *          d = <LPart, VPart>
331  *   } else {
332  *          TPart = TBase + TIndex
333  *          d = <LPart, TPart, VPart>
334  *   }
335  */
336 
337 /* Constants */
338 #define SB	(0xAC00)
339 #define LB	(0x1100)
340 #define VB	(0x1161)
341 #define TB	(0x11A7)
342 #define LC	(19)
343 #define VC	(21)
344 #define TC	(28)
345 #define NC	(VC * TC)
346 #define SC	(LC * NC)
347 
348 /* Algorithmic decomposition of hangul syllable. */
349 static utf8leaf_t *
utf8hangul(const char * str,unsigned char * hangul)350 utf8hangul(const char *str, unsigned char *hangul)
351 {
352 	unsigned int	si;
353 	unsigned int	li;
354 	unsigned int	vi;
355 	unsigned int	ti;
356 	unsigned char	*h;
357 
358 	/* Calculate the SI, LI, VI, and TI values. */
359 	si = utf8decode3(str) - SB;
360 	li = si / NC;
361 	vi = (si % NC) / TC;
362 	ti = si % TC;
363 
364 	/* Fill in base of leaf. */
365 	h = hangul;
366 	LEAF_GEN(h) = 2;
367 	LEAF_CCC(h) = DECOMPOSE;
368 	h += 2;
369 
370 	/* Add LPart, a 3-byte UTF-8 sequence. */
371 	h += utf8encode3((char *)h, li + LB);
372 
373 	/* Add VPart, a 3-byte UTF-8 sequence. */
374 	h += utf8encode3((char *)h, vi + VB);
375 
376 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
377 	if (ti)
378 		h += utf8encode3((char *)h, ti + TB);
379 
380 	/* Terminate string. */
381 	h[0] = '\0';
382 
383 	return hangul;
384 }
385 
386 /*
387  * Use trie to scan s, touching at most len bytes.
388  * Returns the leaf if one exists, NULL otherwise.
389  *
390  * A non-NULL return guarantees that the UTF-8 sequence starting at s
391  * is well-formed and corresponds to a known unicode code point.  The
392  * shorthand for this will be "is valid UTF-8 unicode".
393  */
utf8nlookup(const struct utf8data * data,unsigned char * hangul,const char * s,size_t len)394 static utf8leaf_t *utf8nlookup(const struct utf8data *data,
395 			       unsigned char *hangul, const char *s, size_t len)
396 {
397 	utf8trie_t	*trie;
398 	int		offlen;
399 	int		offset;
400 	int		mask;
401 	int		node;
402 
403 	if (!data)
404 		return NULL;
405 	if (len == 0)
406 		return NULL;
407 
408 	trie = utf8data + data->offset;
409 	node = 1;
410 	while (node) {
411 		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
412 		if (*trie & NEXTBYTE) {
413 			if (--len == 0)
414 				return NULL;
415 			s++;
416 		}
417 		mask = 1 << (*trie & BITNUM);
418 		if (*s & mask) {
419 			/* Right leg */
420 			if (offlen) {
421 				/* Right node at offset of trie */
422 				node = (*trie & RIGHTNODE);
423 				offset = trie[offlen];
424 				while (--offlen) {
425 					offset <<= 8;
426 					offset |= trie[offlen];
427 				}
428 				trie += offset;
429 			} else if (*trie & RIGHTPATH) {
430 				/* Right node after this node */
431 				node = (*trie & TRIENODE);
432 				trie++;
433 			} else {
434 				/* No right node. */
435 				return NULL;
436 			}
437 		} else {
438 			/* Left leg */
439 			if (offlen) {
440 				/* Left node after this node. */
441 				node = (*trie & LEFTNODE);
442 				trie += offlen + 1;
443 			} else if (*trie & RIGHTPATH) {
444 				/* No left node. */
445 				return NULL;
446 			} else {
447 				/* Left node after this node */
448 				node = (*trie & TRIENODE);
449 				trie++;
450 			}
451 		}
452 	}
453 	/*
454 	 * Hangul decomposition is done algorithmically. These are the
455 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
456 	 * always 3 bytes long, so s has been advanced twice, and the
457 	 * start of the sequence is at s-2.
458 	 */
459 	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
460 		trie = utf8hangul(s - 2, hangul);
461 	return trie;
462 }
463 
464 /*
465  * Use trie to scan s.
466  * Returns the leaf if one exists, NULL otherwise.
467  *
468  * Forwards to utf8nlookup().
469  */
utf8lookup(const struct utf8data * data,unsigned char * hangul,const char * s)470 static utf8leaf_t *utf8lookup(const struct utf8data *data,
471 			      unsigned char *hangul, const char *s)
472 {
473 	return utf8nlookup(data, hangul, s, (size_t)-1);
474 }
475 
476 #if 0
477 /*
478  * Maximum age of any character in s.
479  * Return -1 if s is not valid UTF-8 unicode.
480  * Return 0 if only non-assigned code points are used.
481  */
482 static int utf8agemax(const struct utf8data *data, const char *s)
483 {
484 	utf8leaf_t	*leaf;
485 	int		age = 0;
486 	int		leaf_age;
487 	unsigned char	hangul[UTF8HANGULLEAF];
488 
489 	if (!data)
490 		return -1;
491 
492 	while (*s) {
493 		leaf = utf8lookup(data, hangul, s);
494 		if (!leaf)
495 			return -1;
496 
497 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
498 		if (leaf_age <= data->maxage && leaf_age > age)
499 			age = leaf_age;
500 		s += utf8clen(s);
501 	}
502 	return age;
503 }
504 #endif
505 
506 #if 0
507 /*
508  * Minimum age of any character in s.
509  * Return -1 if s is not valid UTF-8 unicode.
510  * Return 0 if non-assigned code points are used.
511  */
512 static int utf8agemin(const struct utf8data *data, const char *s)
513 {
514 	utf8leaf_t	*leaf;
515 	int		age;
516 	int		leaf_age;
517 	unsigned char	hangul[UTF8HANGULLEAF];
518 
519 	if (!data)
520 		return -1;
521 	age = data->maxage;
522 	while (*s) {
523 		leaf = utf8lookup(data, hangul, s);
524 		if (!leaf)
525 			return -1;
526 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
527 		if (leaf_age <= data->maxage && leaf_age < age)
528 			age = leaf_age;
529 		s += utf8clen(s);
530 	}
531 	return age;
532 }
533 #endif
534 
535 #if 0
536 /*
537  * Maximum age of any character in s, touch at most len bytes.
538  * Return -1 if s is not valid UTF-8 unicode.
539  */
540 static int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
541 {
542 	utf8leaf_t	*leaf;
543 	int		age = 0;
544 	int		leaf_age;
545 	unsigned char	hangul[UTF8HANGULLEAF];
546 
547 	if (!data)
548 		return -1;
549 
550 	while (len && *s) {
551 		leaf = utf8nlookup(data, hangul, s, len);
552 		if (!leaf)
553 			return -1;
554 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
555 		if (leaf_age <= data->maxage && leaf_age > age)
556 			age = leaf_age;
557 		len -= utf8clen(s);
558 		s += utf8clen(s);
559 	}
560 	return age;
561 }
562 #endif
563 
564 #if 0
565 /*
566  * Maximum age of any character in s, touch at most len bytes.
567  * Return -1 if s is not valid UTF-8 unicode.
568  */
569 static int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
570 {
571 	utf8leaf_t	*leaf;
572 	int		leaf_age;
573 	int		age;
574 	unsigned char	hangul[UTF8HANGULLEAF];
575 
576 	if (!data)
577 		return -1;
578 	age = data->maxage;
579 	while (len && *s) {
580 		leaf = utf8nlookup(data, hangul, s, len);
581 		if (!leaf)
582 			return -1;
583 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
584 		if (leaf_age <= data->maxage && leaf_age < age)
585 			age = leaf_age;
586 		len -= utf8clen(s);
587 		s += utf8clen(s);
588 	}
589 	return age;
590 }
591 #endif
592 
593 #if 0
594 /*
595  * Length of the normalization of s.
596  * Return -1 if s is not valid UTF-8 unicode.
597  *
598  * A string of Default_Ignorable_Code_Point has length 0.
599  */
600 static ssize_t utf8len(const struct utf8data *data, const char *s)
601 {
602 	utf8leaf_t	*leaf;
603 	size_t		ret = 0;
604 	unsigned char	hangul[UTF8HANGULLEAF];
605 
606 	if (!data)
607 		return -1;
608 	while (*s) {
609 		leaf = utf8lookup(data, hangul, s);
610 		if (!leaf)
611 			return -1;
612 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
613 			ret += utf8clen(s);
614 		else if (LEAF_CCC(leaf) == DECOMPOSE)
615 			ret += strlen(LEAF_STR(leaf));
616 		else
617 			ret += utf8clen(s);
618 		s += utf8clen(s);
619 	}
620 	return ret;
621 }
622 #endif
623 
624 #if 0
625 /*
626  * Length of the normalization of s, touch at most len bytes.
627  * Return -1 if s is not valid UTF-8 unicode.
628  */
629 static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
630 {
631 	utf8leaf_t	*leaf;
632 	size_t		ret = 0;
633 	unsigned char	hangul[UTF8HANGULLEAF];
634 
635 	if (!data)
636 		return -1;
637 	while (len && *s) {
638 		leaf = utf8nlookup(data, hangul, s, len);
639 		if (!leaf)
640 			return -1;
641 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
642 			ret += utf8clen(s);
643 		else if (LEAF_CCC(leaf) == DECOMPOSE)
644 			ret += strlen(LEAF_STR(leaf));
645 		else
646 			ret += utf8clen(s);
647 		len -= utf8clen(s);
648 		s += utf8clen(s);
649 	}
650 	return ret;
651 }
652 #endif
653 
654 /*
655  * Set up an utf8cursor for use by utf8byte().
656  *
657  *   u8c    : pointer to cursor.
658  *   data   : const struct utf8data to use for normalization.
659  *   s      : string.
660  *   len    : length of s.
661  *
662  * Returns -1 on error, 0 on success.
663  */
utf8ncursor(struct utf8cursor * u8c,const struct utf8data * data,const char * s,size_t len)664 static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
665 		const char *s, size_t len)
666 {
667 	if (!data)
668 		return -1;
669 	if (!s)
670 		return -1;
671 	u8c->data = data;
672 	u8c->s = s;
673 	u8c->p = NULL;
674 	u8c->ss = NULL;
675 	u8c->sp = NULL;
676 	u8c->len = len;
677 	u8c->slen = 0;
678 	u8c->ccc = STOPPER;
679 	u8c->nccc = STOPPER;
680 	/* Check we didn't clobber the maximum length. */
681 	if (u8c->len != len)
682 		return -1;
683 	/* The first byte of s may not be an utf8 continuation. */
684 	if (len > 0 && (*s & 0xC0) == 0x80)
685 		return -1;
686 	return 0;
687 }
688 
689 #if 0
690 /*
691  * Set up an utf8cursor for use by utf8byte().
692  *
693  *   u8c    : pointer to cursor.
694  *   data   : const struct utf8data to use for normalization.
695  *   s      : NUL-terminated string.
696  *
697  * Returns -1 on error, 0 on success.
698  */
699 static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
700 	       const char *s)
701 {
702 	return utf8ncursor(u8c, data, s, (unsigned int)-1);
703 }
704 #endif
705 
706 /*
707  * Get one byte from the normalized form of the string described by u8c.
708  *
709  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
710  *
711  * The cursor keeps track of the location in the string in u8c->s.
712  * When a character is decomposed, the current location is stored in
713  * u8c->p, and u8c->s is set to the start of the decomposition. Note
714  * that bytes from a decomposition do not count against u8c->len.
715  *
716  * Characters are emitted if they match the current CCC in u8c->ccc.
717  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
718  * and the function returns 0 in that case.
719  *
720  * Sorting by CCC is done by repeatedly scanning the string.  The
721  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
722  * the start of the scan.  The first pass finds the lowest CCC to be
723  * emitted and stores it in u8c->nccc, the second pass emits the
724  * characters with this CCC and finds the next lowest CCC. This limits
725  * the number of passes to 1 + the number of different CCCs in the
726  * sequence being scanned.
727  *
728  * Therefore:
729  *  u8c->p  != NULL -> a decomposition is being scanned.
730  *  u8c->ss != NULL -> this is a repeating scan.
731  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
732  */
utf8byte(struct utf8cursor * u8c)733 static int utf8byte(struct utf8cursor *u8c)
734 {
735 	utf8leaf_t *leaf;
736 	int ccc;
737 
738 	for (;;) {
739 		/* Check for the end of a decomposed character. */
740 		if (u8c->p && *u8c->s == '\0') {
741 			u8c->s = u8c->p;
742 			u8c->p = NULL;
743 		}
744 
745 		/* Check for end-of-string. */
746 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
747 			/* There is no next byte. */
748 			if (u8c->ccc == STOPPER)
749 				return 0;
750 			/* End-of-string during a scan counts as a stopper. */
751 			ccc = STOPPER;
752 			goto ccc_mismatch;
753 		} else if ((*u8c->s & 0xC0) == 0x80) {
754 			/* This is a continuation of the current character. */
755 			if (!u8c->p)
756 				u8c->len--;
757 			return (unsigned char)*u8c->s++;
758 		}
759 
760 		/* Look up the data for the current character. */
761 		if (u8c->p) {
762 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
763 		} else {
764 			leaf = utf8nlookup(u8c->data, u8c->hangul,
765 					   u8c->s, u8c->len);
766 		}
767 
768 		/* No leaf found implies that the input is a binary blob. */
769 		if (!leaf)
770 			return -1;
771 
772 		ccc = LEAF_CCC(leaf);
773 		/* Characters that are too new have CCC 0. */
774 		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
775 			ccc = STOPPER;
776 		} else if (ccc == DECOMPOSE) {
777 			u8c->len -= utf8clen(u8c->s);
778 			u8c->p = u8c->s + utf8clen(u8c->s);
779 			u8c->s = LEAF_STR(leaf);
780 			/* Empty decomposition implies CCC 0. */
781 			if (*u8c->s == '\0') {
782 				if (u8c->ccc == STOPPER)
783 					continue;
784 				ccc = STOPPER;
785 				goto ccc_mismatch;
786 			}
787 
788 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
789 			if (!leaf)
790 				return -1;
791 			ccc = LEAF_CCC(leaf);
792 		}
793 
794 		/*
795 		 * If this is not a stopper, then see if it updates
796 		 * the next canonical class to be emitted.
797 		 */
798 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
799 			u8c->nccc = ccc;
800 
801 		/*
802 		 * Return the current byte if this is the current
803 		 * combining class.
804 		 */
805 		if (ccc == u8c->ccc) {
806 			if (!u8c->p)
807 				u8c->len--;
808 			return (unsigned char)*u8c->s++;
809 		}
810 
811 		/* Current combining class mismatch. */
812 ccc_mismatch:
813 		if (u8c->nccc == STOPPER) {
814 			/*
815 			 * Scan forward for the first canonical class
816 			 * to be emitted.  Save the position from
817 			 * which to restart.
818 			 */
819 			u8c->ccc = MINCCC - 1;
820 			u8c->nccc = ccc;
821 			u8c->sp = u8c->p;
822 			u8c->ss = u8c->s;
823 			u8c->slen = u8c->len;
824 			if (!u8c->p)
825 				u8c->len -= utf8clen(u8c->s);
826 			u8c->s += utf8clen(u8c->s);
827 		} else if (ccc != STOPPER) {
828 			/* Not a stopper, and not the ccc we're emitting. */
829 			if (!u8c->p)
830 				u8c->len -= utf8clen(u8c->s);
831 			u8c->s += utf8clen(u8c->s);
832 		} else if (u8c->nccc != MAXCCC + 1) {
833 			/* At a stopper, restart for next ccc. */
834 			u8c->ccc = u8c->nccc;
835 			u8c->nccc = MAXCCC + 1;
836 			u8c->s = u8c->ss;
837 			u8c->p = u8c->sp;
838 			u8c->len = u8c->slen;
839 		} else {
840 			/* All done, proceed from here. */
841 			u8c->ccc = STOPPER;
842 			u8c->nccc = STOPPER;
843 			u8c->sp = NULL;
844 			u8c->ss = NULL;
845 			u8c->slen = 0;
846 		}
847 	}
848 }
849 
850 #if 0
851 /*
852  * Look for the correct const struct utf8data for a unicode version.
853  * Returns NULL if the version requested is too new.
854  *
855  * Two normalization forms are supported: nfdi and nfdicf.
856  *
857  * nfdi:
858  *  - Apply unicode normalization form NFD.
859  *  - Remove any Default_Ignorable_Code_Point.
860  *
861  * nfdicf:
862  *  - Apply unicode normalization form NFD.
863  *  - Remove any Default_Ignorable_Code_Point.
864  *  - Apply a full casefold (C + F).
865  */
866 static const struct utf8data *utf8nfdi(unsigned int maxage)
867 {
868 	int i = ARRAY_SIZE(utf8nfdidata) - 1;
869 
870 	while (maxage < utf8nfdidata[i].maxage)
871 		i--;
872 	if (maxage > utf8nfdidata[i].maxage)
873 		return NULL;
874 	return &utf8nfdidata[i];
875 }
876 #endif
877 
utf8nfdicf(unsigned int maxage)878 static const struct utf8data *utf8nfdicf(unsigned int maxage)
879 {
880 	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
881 
882 	while (maxage < utf8nfdicfdata[i].maxage)
883 		i--;
884 	if (maxage > utf8nfdicfdata[i].maxage)
885 		return NULL;
886 	return &utf8nfdicfdata[i];
887 }
888 
utf8_casefold(const struct f2fs_nls_table * table,const unsigned char * str,size_t len,unsigned char * dest,size_t dlen)889 static int utf8_casefold(const struct f2fs_nls_table *table,
890 			  const unsigned char *str, size_t len,
891 			  unsigned char *dest, size_t dlen)
892 {
893 	const struct utf8data *data = utf8nfdicf(table->version);
894 	struct utf8cursor cur;
895 	size_t nlen = 0;
896 
897 	if (utf8ncursor(&cur, data, (const char *) str, len) < 0)
898 		goto invalid_seq;
899 
900 	for (nlen = 0; nlen < dlen; nlen++) {
901 		int c = utf8byte(&cur);
902 
903 		dest[nlen] = c;
904 		if (!c)
905 			return nlen;
906 		if (c == -1)
907 			break;
908 	}
909 
910 	return -ENAMETOOLONG;
911 
912 invalid_seq:
913 	if (dlen < len)
914 		return -ENAMETOOLONG;
915 
916 	/* Signal invalid sequence */
917 	return -EINVAL;
918 }
919 
920 static const struct f2fs_nls_ops utf8_ops = {
921 	.casefold = utf8_casefold,
922 };
923 
924 static const struct f2fs_nls_table nls_utf8 = {
925 	.ops = &utf8_ops,
926 	.version = UNICODE_AGE(12, 1, 0),
927 };
928 
f2fs_load_nls_table(int encoding)929 const struct f2fs_nls_table *f2fs_load_nls_table(int encoding)
930 {
931 	if (encoding == F2FS_ENC_UTF8_12_1)
932 		return &nls_utf8;
933 
934 	return NULL;
935 }
936