1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 1999-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: utf_impl.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 1999sep13
16 * created by: Markus W. Scherer
17 *
18 * This file provides implementation functions for macros in the utfXX.h
19 * that would otherwise be too long as macros.
20 */
21
22 #include "base/third_party/icu/icu_utf.h"
23
24 namespace base_icu {
25
26 // source/common/utf_impl.cpp
27
28 static const UChar32
29 utf8_errorValue[6]={
30 // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
31 // but without relying on the obsolete unicode/utf_old.h.
32 0x15, 0x9f, 0xffff,
33 0x10ffff
34 };
35
36 static UChar32
errorValue(int32_t count,int8_t strict)37 errorValue(int32_t count, int8_t strict) {
38 if(strict>=0) {
39 return utf8_errorValue[count];
40 } else if(strict==-3) {
41 return 0xfffd;
42 } else {
43 return CBU_SENTINEL;
44 }
45 }
46
47 /*
48 * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
49 * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
50 *
51 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
52 *
53 * The "strict" parameter controls the error behavior:
54 * <0 "Safe" behavior of U8_NEXT():
55 * -1: All illegal byte sequences yield U_SENTINEL=-1.
56 * -2: Same as -1, except for lenient treatment of surrogate code points as legal.
57 * Some implementations use this for roundtripping of
58 * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
59 * contain unpaired surrogates.
60 * -3: All illegal byte sequences yield U+FFFD.
61 * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
62 * All illegal byte sequences yield a positive code point such that this
63 * result code point would be encoded with the same number of bytes as
64 * the illegal sequence.
65 * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
66 * Same as the obsolete "safe" behavior, but non-characters are also treated
67 * like illegal sequences.
68 *
69 * Note that a UBool is the same as an int8_t.
70 */
71 UChar32
utf8_nextCharSafeBody(const uint8_t * s,int32_t * pi,int32_t length,UChar32 c,UBool strict)72 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
73 // *pi is one after byte c.
74 int32_t i=*pi;
75 // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
76 if(i==length || c>0xf4) {
77 // end of string, or not a lead byte
78 } else if(c>=0xf0) {
79 // Test for 4-byte sequences first because
80 // U8_NEXT() handles shorter valid sequences inline.
81 uint8_t t1=s[i], t2, t3;
82 c&=7;
83 if(CBU8_IS_VALID_LEAD4_AND_T1(c, t1) &&
84 ++i!=length && (t2=s[i]-0x80)<=0x3f &&
85 ++i!=length && (t3=s[i]-0x80)<=0x3f) {
86 ++i;
87 c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
88 // strict: forbid non-characters like U+fffe
89 if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
90 *pi=i;
91 return c;
92 }
93 }
94 } else if(c>=0xe0) {
95 c&=0xf;
96 if(strict!=-2) {
97 uint8_t t1=s[i], t2;
98 if(CBU8_IS_VALID_LEAD3_AND_T1(c, t1) &&
99 ++i!=length && (t2=s[i]-0x80)<=0x3f) {
100 ++i;
101 c=(c<<12)|((t1&0x3f)<<6)|t2;
102 // strict: forbid non-characters like U+fffe
103 if(strict<=0 || !CBU_IS_UNICODE_NONCHAR(c)) {
104 *pi=i;
105 return c;
106 }
107 }
108 } else {
109 // strict=-2 -> lenient: allow surrogates
110 uint8_t t1=s[i]-0x80, t2;
111 if(t1<=0x3f && (c>0 || t1>=0x20) &&
112 ++i!=length && (t2=s[i]-0x80)<=0x3f) {
113 *pi=i+1;
114 return (c<<12)|(t1<<6)|t2;
115 }
116 }
117 } else if(c>=0xc2) {
118 uint8_t t1=s[i]-0x80;
119 if(t1<=0x3f) {
120 *pi=i+1;
121 return ((c-0xc0)<<6)|t1;
122 }
123 } // else 0x80<=c<0xc2 is not a lead byte
124
125 /* error handling */
126 c=errorValue(i-*pi, strict);
127 *pi=i;
128 return c;
129 }
130
131 } // namespace base_icu
132