utf8-norm.c - OpenGrok cross reference for /kernel/linux/linux-5.10/fs/unicode/utf8-norm.c

Lines Matching +full:nc +full:- +full:si
1 // SPDX-License-Identifier: GPL-2.0-only
20 	int i = ARRAY_SIZE(utf8agetab) - 1;  in utf8version_is_supported()
26 		i--;  in utf8version_is_supported()
39  * UTF-8 valid ranges.
41  * The UTF-8 encoding spreads the bits of a 32bit word over several
52  * There is an additional requirement on UTF-8, in that only the
64  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
68  *          0 -     0x7F: 0                   - 0x7F
69  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
70  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
71  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
73  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
76  * the same as a single UTF-32 character.  This makes the UTF-8
77  * representation of Unicode strictly smaller than UTF-32.
80  *    Corrigendum #1: UTF-8 Shortest Form
87  * Return the number of bytes used by the current UTF-8 sequence.
88  * Assumes the input points to the first byte of a valid UTF-8
99  * Decode a 3-byte UTF-8 sequence.
116  * Encode a 3-byte UTF-8 sequence.
133  * A compact binary tree, used to decode UTF-8 characters.
138  *  NEXTBYTE  - flag        - advance to next byte if set
139  *  BITNUM    - 3 bit field - the bit number to be tested
140  *  OFFLEN    - 2 bit field - number of bytes in the offset
141  * if offlen == 0 (non-branching node)
142  *  RIGHTPATH - 1 bit field - set if the following node is for the
143  *                            right-hand path (tested bit is set)
144  *  TRIENODE  - 1 bit field - set if the following node is an internal
147  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
148  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
173  *          defined.  The CCC of a non-defined code point is 0.
176  *          with a non-zero CCC that occur between two characters with
183  *          start of a NUL-terminated string that is the decomposition
197  * UTF-8 sequences that match the criteria from the "UTF-8 valid
199  * lookup in the trie can be used to validate the UTF-8 input.
234  *   SIndex = s - SBase
270 #define NC	(VC * TC)  macro
271 #define SC	(LC * NC)
277 	unsigned int	si;  in utf8hangul()  local
283 	/* Calculate the SI, LI, VI, and TI values. */  in utf8hangul()
284 	si = utf8decode3(str) - SB;  in utf8hangul()
285 	li = si / NC;  in utf8hangul()
286 	vi = (si % NC) / TC;  in utf8hangul()
287 	ti = si % TC;  in utf8hangul()
295 	/* Add LPart, a 3-byte UTF-8 sequence. */  in utf8hangul()
298 	/* Add VPart, a 3-byte UTF-8 sequence. */  in utf8hangul()
301 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */  in utf8hangul()
315  * A non-NULL return guarantees that the UTF-8 sequence starting at s
316  * is well-formed and corresponds to a known unicode code point.  The
317  * shorthand for this will be "is valid UTF-8 unicode".
333 	trie = utf8data + data->offset;  in utf8nlookup()
338 			if (--len == 0)  in utf8nlookup()
349 				while (--offlen) {  in utf8nlookup()
380 	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is  in utf8nlookup()
382 	 * start of the sequence is at s-2.  in utf8nlookup()
385 		trie = utf8hangul(s - 2, hangul);  in utf8nlookup()
398 	return utf8nlookup(data, hangul, s, (size_t)-1);  in utf8lookup()
403  * Return -1 if s is not valid UTF-8 unicode.
404  * Return 0 if only non-assigned code points are used.
414 		return -1;  in utf8agemax()
419 			return -1;  in utf8agemax()
422 		if (leaf_age <= data->maxage && leaf_age > age)  in utf8agemax()
432  * Return -1 if s is not valid UTF-8 unicode.
433  * Return 0 if non-assigned code points are used.
443 		return -1;  in utf8agemin()
444 	age = data->maxage;  in utf8agemin()
448 			return -1;  in utf8agemin()
450 		if (leaf_age <= data->maxage && leaf_age < age)  in utf8agemin()
460  * Return -1 if s is not valid UTF-8 unicode.
470 		return -1;  in utf8nagemax()
475 			return -1;  in utf8nagemax()
477 		if (leaf_age <= data->maxage && leaf_age > age)  in utf8nagemax()
479 		len -= utf8clen(s);  in utf8nagemax()
488  * Return -1 if s is not valid UTF-8 unicode.
498 		return -1;  in utf8nagemin()
499 	age = data->maxage;  in utf8nagemin()
503 			return -1;  in utf8nagemin()
505 		if (leaf_age <= data->maxage && leaf_age < age)  in utf8nagemin()
507 		len -= utf8clen(s);  in utf8nagemin()
516  * Return -1 if s is not valid UTF-8 unicode.
527 		return -1;  in utf8len()
531 			return -1;  in utf8len()
532 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)  in utf8len()
546  * Return -1 if s is not valid UTF-8 unicode.
555 		return -1;  in utf8nlen()
559 			return -1;  in utf8nlen()
560 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)  in utf8nlen()
566 		len -= utf8clen(s);  in utf8nlen()
581  * Returns -1 on error, 0 on success.
587 		return -1;  in utf8ncursor()
589 		return -1;  in utf8ncursor()
590 	u8c->data = data;  in utf8ncursor()
591 	u8c->s = s;  in utf8ncursor()
592 	u8c->p = NULL;  in utf8ncursor()
593 	u8c->ss = NULL;  in utf8ncursor()
594 	u8c->sp = NULL;  in utf8ncursor()
595 	u8c->len = len;  in utf8ncursor()
596 	u8c->slen = 0;  in utf8ncursor()
597 	u8c->ccc = STOPPER;  in utf8ncursor()
598 	u8c->nccc = STOPPER;  in utf8ncursor()
600 	if (u8c->len != len)  in utf8ncursor()
601 		return -1;  in utf8ncursor()
604 		return -1;  in utf8ncursor()
614  *   s      : NUL-terminated string.
616  * Returns -1 on error, 0 on success.
621 	return utf8ncursor(u8c, data, s, (unsigned int)-1);  in utf8cursor()
628  * Returns the byte cast to an unsigned char on success, and -1 on failure.
630  * The cursor keeps track of the location in the string in u8c->s.
632  * u8c->p, and u8c->s is set to the start of the decomposition. Note
633  * that bytes from a decomposition do not count against u8c->len.
635  * Characters are emitted if they match the current CCC in u8c->ccc.
636  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
640  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
642  * emitted and stores it in u8c->nccc, the second pass emits the
648  *  u8c->p  != NULL -> a decomposition is being scanned.
649  *  u8c->ss != NULL -> this is a repeating scan.
650  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
659 		if (u8c->p && *u8c->s == '\0') {  in utf8byte()
660 			u8c->s = u8c->p;  in utf8byte()
661 			u8c->p = NULL;  in utf8byte()
664 		/* Check for end-of-string. */  in utf8byte()
665 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {  in utf8byte()
667 			if (u8c->ccc == STOPPER)  in utf8byte()
669 			/* End-of-string during a scan counts as a stopper. */  in utf8byte()
672 		} else if ((*u8c->s & 0xC0) == 0x80) {  in utf8byte()
674 			if (!u8c->p)  in utf8byte()
675 				u8c->len--;  in utf8byte()
676 			return (unsigned char)*u8c->s++;  in utf8byte()
680 		if (u8c->p) {  in utf8byte()
681 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);  in utf8byte()
683 			leaf = utf8nlookup(u8c->data, u8c->hangul,  in utf8byte()
684 					   u8c->s, u8c->len);  in utf8byte()
689 			return -1;  in utf8byte()
693 		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {  in utf8byte()
696 			u8c->len -= utf8clen(u8c->s);  in utf8byte()
697 			u8c->p = u8c->s + utf8clen(u8c->s);  in utf8byte()
698 			u8c->s = LEAF_STR(leaf);  in utf8byte()
700 			if (*u8c->s == '\0') {  in utf8byte()
701 				if (u8c->ccc == STOPPER)  in utf8byte()
707 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);  in utf8byte()
709 				return -1;  in utf8byte()
717 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)  in utf8byte()
718 			u8c->nccc = ccc;  in utf8byte()
724 		if (ccc == u8c->ccc) {  in utf8byte()
725 			if (!u8c->p)  in utf8byte()
726 				u8c->len--;  in utf8byte()
727 			return (unsigned char)*u8c->s++;  in utf8byte()
732 		if (u8c->nccc == STOPPER) {  in utf8byte()
738 			u8c->ccc = MINCCC - 1;  in utf8byte()
739 			u8c->nccc = ccc;  in utf8byte()
740 			u8c->sp = u8c->p;  in utf8byte()
741 			u8c->ss = u8c->s;  in utf8byte()
742 			u8c->slen = u8c->len;  in utf8byte()
743 			if (!u8c->p)  in utf8byte()
744 				u8c->len -= utf8clen(u8c->s);  in utf8byte()
745 			u8c->s += utf8clen(u8c->s);  in utf8byte()
748 			if (!u8c->p)  in utf8byte()
749 				u8c->len -= utf8clen(u8c->s);  in utf8byte()
750 			u8c->s += utf8clen(u8c->s);  in utf8byte()
751 		} else if (u8c->nccc != MAXCCC + 1) {  in utf8byte()
753 			u8c->ccc = u8c->nccc;  in utf8byte()
754 			u8c->nccc = MAXCCC + 1;  in utf8byte()
755 			u8c->s = u8c->ss;  in utf8byte()
756 			u8c->p = u8c->sp;  in utf8byte()
757 			u8c->len = u8c->slen;  in utf8byte()
760 			u8c->ccc = STOPPER;  in utf8byte()
761 			u8c->nccc = STOPPER;  in utf8byte()
762 			u8c->sp = NULL;  in utf8byte()
763 			u8c->ss = NULL;  in utf8byte()
764 			u8c->slen = 0;  in utf8byte()
772 	int i = ARRAY_SIZE(utf8nfdidata) - 1;  in utf8nfdi()
775 		i--;  in utf8nfdi()
784 	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;  in utf8nfdicf()
787 		i--;  in utf8nfdicf()