1 /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2 /*
3 * Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other
4 * contributors.
5 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 */
25
26 /*
27 * This library contains derived data from a modified version of the
28 * Unicode data files.
29 *
30 * The original data files are available at
31 * http://www.unicode.org/Public/UNIDATA/
32 *
33 * Please notice the copyright statement in the file "utf8proc_data.c".
34 */
35
36
37 /*
38 * File name: utf8proc.c
39 *
40 * Description:
41 * Implementation of libutf8proc.
42 */
43
44
45 #include "utf8proc.h"
46
47 UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
48 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
61 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
62 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
63 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
64 };
65
/*
 * Constants for Hangul syllable handling.  The *BASE / *COUNT values are
 * the same numbers the Unicode Standard uses for its arithmetic Hangul
 * syllable (de)composition (SBase, LBase, VBase, TBase, LCount, VCount,
 * TCount, NCount, SCount).
 *
 * Relationships that hold between the values below:
 *   NCOUNT = VCOUNT * TCOUNT          (21 * 28 = 588)
 *   SCOUNT = LCOUNT * NCOUNT          (19 * 588 = 11172)
 *   S_END  = S_START + SCOUNT         (0xAC00 + 11172 = 0xD7A4)
 *   TBASE  = T_START - 1              (0x11A7 vs. 0x11A8)
 *   VBASE  = V_START + 1              (0x1161 vs. 0x1160)
 * NOTE(review): the users of these macros are not visible in this part of
 * the file; confirm exact usage against the (de)composition code.
 */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588
#define UTF8PROC_HANGUL_SCOUNT 11172
/* Code point ranges for the jamo / syllable classes.  Every *_END value is
   exclusive, i.e. one past the last code point of the range.  L_FILLER
   (0x115F) is a single code point lying outside [L_START, L_END). */
#define UTF8PROC_HANGUL_L_START 0x1100
#define UTF8PROC_HANGUL_L_END 0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START 0x1160
#define UTF8PROC_HANGUL_V_END 0x11A3
#define UTF8PROC_HANGUL_T_START 0x11A8
#define UTF8PROC_HANGUL_T_END 0x11FA
#define UTF8PROC_HANGUL_S_START 0xAC00
#define UTF8PROC_HANGUL_S_END 0xD7A4
85
86 /* Should follow semantic-versioning rules (semver.org) based on API
87 compatibility. (Note that the shared-library version number will
88 be different, being based on ABI compatibility.): */
89 #define STRINGIZEx(x) #x
90 #define STRINGIZE(x) STRINGIZEx(x)
91
utf8proc_version(void)92 UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
93 return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH)
94 "";
95 }
96
utf8proc_errmsg(utf8proc_ssize_t errcode)97 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
98 switch (errcode) {
99 case UTF8PROC_ERROR_NOMEM:
100 return "Memory for processing UTF-8 data could not be allocated.";
101 case UTF8PROC_ERROR_OVERFLOW:
102 return "UTF-8 string is too long to be processed.";
103 case UTF8PROC_ERROR_INVALIDUTF8:
104 return "Invalid UTF-8 string";
105 case UTF8PROC_ERROR_NOTASSIGNED:
106 return "Unassigned Unicode code point found in UTF-8 string.";
107 case UTF8PROC_ERROR_INVALIDOPTS:
108 return "Invalid options for UTF-8 processing chosen.";
109 default:
110 return "An unknown error occurred while processing UTF-8 data.";
111 }
112 }
113
114 #define utf_cont(ch) (((ch) & 0xc0) == 0x80)
115
utf8proc_iterate(const utf8proc_uint8_t * str,utf8proc_ssize_t strlen,utf8proc_int32_t * dst)116 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
117 const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
118 ) {
119 utf8proc_uint32_t uc;
120 const utf8proc_uint8_t *end;
121
122 *dst = -1;
123 if (!strlen) {
124 return 0;
125 }
126 end = str + ((strlen < 0) ? 4 : strlen);
127 uc = *str++;
128 if (uc < 0x80) {
129 *dst = uc;
130 return 1;
131 }
132 // Must be between 0xc2 and 0xf4 inclusive to be valid
133 if ((uc - 0xc2) > (0xf4 - 0xc2)) {
134 return UTF8PROC_ERROR_INVALIDUTF8;
135 }
136 if (uc < 0xe0) { // 2-byte sequence
137 // Must have valid continuation character
138 if ((str >= end) || !utf_cont(*str)) {
139 return UTF8PROC_ERROR_INVALIDUTF8;
140 }
141 *dst = ((uc & 0x1f) << 6) | (*str & 0x3f);
142 return 2;
143 }
144 if (uc < 0xf0) { // 3-byte sequence
145 if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) {
146 return UTF8PROC_ERROR_INVALIDUTF8;
147 }
148 // Check for surrogate chars
149 if ((uc == 0xed) && (*str > 0x9f) ) {
150 return UTF8PROC_ERROR_INVALIDUTF8;
151 }
152 uc = ((uc & 0xf) << 12) | ((*str & 0x3f) << 6) | (str[1] & 0x3f);
153 if (uc < 0x800) {
154 return UTF8PROC_ERROR_INVALIDUTF8;
155 }
156 *dst = uc;
157 return 3;
158 }
159 // 4-byte sequence
160 // Must have 3 valid continuation characters
161 if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) {
162 return UTF8PROC_ERROR_INVALIDUTF8;
163 }
164 // Make sure in correct range (0x10000 - 0x10ffff)
165 if (uc == 0xf0) {
166 if (*str < 0x90) {
167 return UTF8PROC_ERROR_INVALIDUTF8;
168 }
169 } else if (uc == 0xf4) {
170 if (*str > 0x8f) {
171 return UTF8PROC_ERROR_INVALIDUTF8;
172 }
173 }
174 *dst = ((uc & 7) << 18) | ((*str & 0x3f) << 12) | ((str[1] & 0x3f) << 6) | (str[2] & 0x3f);
175 return 4;
176 }
177
utf8proc_codepoint_valid(utf8proc_int32_t uc)178 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
179 return (((utf8proc_uint32_t) uc) - 0xd800 > 0x07ff) && ((utf8proc_uint32_t) uc < 0x110000);
180 }
181
utf8proc_encode_char(utf8proc_int32_t uc,utf8proc_uint8_t * dst)182 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
183 if (uc < 0x00) {
184 return 0;
185 } else if (uc < 0x80) {
186 dst[0] = (utf8proc_uint8_t) uc;
187 return 1;
188 } else if (uc < 0x800) {
189 dst[0] = (utf8proc_uint8_t) (0xC0 + (uc >> 6));
190 dst[1] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
191 return 2;
192 // Note: we allow encoding 0xd800-0xdfff here, so as not to change
193 // the API, however, these are actually invalid in UTF-8
194 } else if (uc < 0x10000) {
195 dst[0] = (utf8proc_uint8_t) (0xE0 + (uc >> 12));
196 dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
197 dst[2] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
198 return 3;
199 } else if (uc < 0x110000) {
200 dst[0] = (utf8proc_uint8_t) (0xF0 + (uc >> 18));
201 dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 12) & 0x3F));
202 dst[2] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
203 dst[3] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
204 return 4;
205 } else {
206 return 0;
207 }
208 }
209
210 /* internal "unsafe" version that does not check whether uc is in range */
unsafe_encode_char(utf8proc_int32_t uc,utf8proc_uint8_t * dst)211 static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
212 if (uc < 0x00) {
213 return 0;
214 } else if (uc < 0x80) {
215 dst[0] = (utf8proc_uint8_t) uc;
216 return 1;
217 } else if (uc < 0x800) {
218 dst[0] = (utf8proc_uint8_t) (0xC0 + (uc >> 6));
219 dst[1] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
220 return 2;
221 } else if (uc == 0xFFFF) {
222 dst[0] = (utf8proc_uint8_t) 0xFF;
223 return 1;
224 } else if (uc == 0xFFFE) {
225 dst[0] = (utf8proc_uint8_t) 0xFE;
226 return 1;
227 } else if (uc < 0x10000) {
228 dst[0] = (utf8proc_uint8_t) (0xE0 + (uc >> 12));
229 dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
230 dst[2] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
231 return 3;
232 } else if (uc < 0x110000) {
233 dst[0] = (utf8proc_uint8_t) (0xF0 + (uc >> 18));
234 dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 12) & 0x3F));
235 dst[2] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
236 dst[3] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
237 return 4;
238 } else {
239 return 0;
240 }
241 }
242
243 /* return whether there is a grapheme break between boundclasses lbc and tbc
244 (according to the definition of extended grapheme clusters)
245
246 Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
247 http://www.unicode.org/reports/tr29/tr29-29.html
248
249 CAVEATS:
250 Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
251 and GB 12/13 (regional indicator code points) require knowledge of previous characters
252 and are thus not handled by this function. This may result in an incorrect break before
253 an E_Modifier class codepoint and an incorrectly missing break between two
254 REGIONAL_INDICATOR class code points if such support does not exist in the caller.
255
256 See the special support in grapheme_break_extended, for required bookkeeping by the caller.
257 */
grapheme_break_simple(int lbc,int tbc)258 static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
259 return (lbc == UTF8PROC_BOUNDCLASS_START) ? true // GB1
260 : ( lbc == UTF8PROC_BOUNDCLASS_CR // GB3
261 && tbc == UTF8PROC_BOUNDCLASS_LF) ? false // ---
262 : (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true // GB4
263 : (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true // GB5
264 : ( lbc == UTF8PROC_BOUNDCLASS_L // GB6
265 && ( tbc == UTF8PROC_BOUNDCLASS_L // ---
266 || tbc == UTF8PROC_BOUNDCLASS_V // ---
267 || tbc == UTF8PROC_BOUNDCLASS_LV // ---
268 || tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false // ---
269 : ( ( lbc == UTF8PROC_BOUNDCLASS_LV // GB7
270 || lbc == UTF8PROC_BOUNDCLASS_V) // ---
271 && ( tbc == UTF8PROC_BOUNDCLASS_V // ---
272 || tbc == UTF8PROC_BOUNDCLASS_T)) ? false // ---
273 : ( ( lbc == UTF8PROC_BOUNDCLASS_LVT // GB8
274 || lbc == UTF8PROC_BOUNDCLASS_T) // ---
275 && tbc == UTF8PROC_BOUNDCLASS_T) ? false // ---
276 : ( tbc == UTF8PROC_BOUNDCLASS_EXTEND // GB9
277 || tbc == UTF8PROC_BOUNDCLASS_ZWJ // ---
278 || tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK // GB9a
279 || lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false // GB9b
280 : ( ( lbc == UTF8PROC_BOUNDCLASS_E_BASE // GB10 (requires additional
281 // handling below)
282 || lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) // ----
283 && tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false // ----
284 : ( lbc == UTF8PROC_BOUNDCLASS_ZWJ // GB11
285 && ( tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ // ----
286 || tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false // ----
287 : ( lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR // GB12/13 (requires additional
288 // handling below)
289 && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false // ----
290 : true; // GB999
291 }
292
grapheme_break_extended(int lbc,int tbc,utf8proc_int32_t * state)293 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state) {
294 int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
295 ? *state : lbc);
296 utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
297 if (state) {
298 // Special support for GB 12/13 made possible by GB999. After two RI
299 // class codepoints we want to force a break. Do this by resetting the
300 // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
301 // after that character according to GB999 (unless of course such a break is
302 // forbidden by a different rule such as GB9).
303 if ((*state == tbc) && (tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)) {
304 *state = UTF8PROC_BOUNDCLASS_OTHER;
305 }
306 // Special support for GB10. Fold any EXTEND codepoints into the previous
307 // boundclass if we're dealing with an emoji base boundclass.
308 else if ( ( (*state == UTF8PROC_BOUNDCLASS_E_BASE)
309 || (*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) )
310 && (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ) {
311 *state = UTF8PROC_BOUNDCLASS_E_BASE;
312 } else {
313 *state = tbc;
314 }
315 }
316 return break_permitted;
317 }
318
seqindex_decode_entry(const utf8proc_uint16_t ** entry)319 static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) {
320 utf8proc_int32_t entry_cp = **entry;
321 if ((entry_cp & 0xF800) == 0xD800) {
322 *entry = *entry + 1;
323 entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
324 entry_cp += 0x10000;
325 }
326 return entry_cp;
327 }
328