1 /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2 /*
3 * Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other
4 * contributors.
5 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 */
25
26 /*
27 * This library contains derived data from a modified version of the
28 * Unicode data files.
29 *
30 * The original data files are available at
31 * http://www.unicode.org/Public/UNIDATA/
32 *
33 * Please notice the copyright statement in the file "utf8proc_data.c".
34 */
35
36
37 /*
38 * File name: utf8proc.c
39 *
40 * Description:
41 * Implementation of libutf8proc.
42 */
43
44
45 #include "utf8proc.h"
46
47 UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
48 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
61 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
62 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
63 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
64 };
65
/* Hangul syllable arithmetic: base code points of the precomposed syllable
   block (S) and of the leading (L), vowel (V) and trailing (T) jamo. */
#define UTF8PROC_HANGUL_SBASE  0xAC00
#define UTF8PROC_HANGUL_LBASE  0x1100
#define UTF8PROC_HANGUL_VBASE  0x1161
#define UTF8PROC_HANGUL_TBASE  0x11A7

/* Number of L, V and T values; NCOUNT (588) and SCOUNT (11172) are the
   products derived from them. */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
#define UTF8PROC_HANGUL_SCOUNT (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/* Code point ranges; each START is inclusive, each END is exclusive. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
85
/* The library version (reported by utf8proc_version below) should follow
   semantic-versioning rules (semver.org) based on API compatibility.
   (Note that the shared-library version number will be different, being
   based on ABI compatibility.) */
/* STRINGIZEx turns its argument into a string literal exactly as written.
   STRINGIZE adds one level of indirection so that a macro argument is
   expanded first and its value, not its name, is stringized. */
#define STRINGIZEx(token) #token
#define STRINGIZE(token)  STRINGIZEx(token)
91
utf8proc_version(void)92 UTF8PROC_DLLEXPORT const char* utf8proc_version(void) {
93 return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH)
94 "";
95 }
96
utf8proc_errmsg(utf8proc_ssize_t errcode)97 UTF8PROC_DLLEXPORT const char* utf8proc_errmsg(utf8proc_ssize_t errcode) {
98 switch (errcode) {
99 case UTF8PROC_ERROR_NOMEM:
100 return "Memory for processing UTF-8 data could not be allocated.";
101 case UTF8PROC_ERROR_OVERFLOW:
102 return "UTF-8 string is too long to be processed.";
103 case UTF8PROC_ERROR_INVALIDUTF8:
104 return "Invalid UTF-8 string";
105 case UTF8PROC_ERROR_NOTASSIGNED:
106 return "Unassigned Unicode code point found in UTF-8 string.";
107 case UTF8PROC_ERROR_INVALIDOPTS:
108 return "Invalid options for UTF-8 processing chosen.";
109 default:
110 return "An unknown error occurred while processing UTF-8 data.";
111 }
112 }
113
114 #define utf_cont(ch) (((ch) & 0xc0) == 0x80)
115
utf8proc_iterate(const utf8proc_uint8_t * str,utf8proc_ssize_t strlen,utf8proc_int32_t * dst)116 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
117 const utf8proc_uint8_t *str,
118 utf8proc_ssize_t strlen,
119 utf8proc_int32_t *dst
120 ) {
121 utf8proc_uint32_t uc;
122 const utf8proc_uint8_t *end;
123
124 *dst = -1;
125 if (!strlen) {
126 return 0;
127 }
128 end = str + ((strlen < 0) ? 4 : strlen);
129 uc = *str++;
130 if (uc < 0x80) {
131 *dst = uc;
132 return 1;
133 }
134 // Must be between 0xc2 and 0xf4 inclusive to be valid
135 if ((uc - 0xc2) > (0xf4 - 0xc2)) {
136 return UTF8PROC_ERROR_INVALIDUTF8;
137 }
138 if (uc < 0xe0) { // 2-byte sequence
139 // Must have valid continuation character
140 if ((str >= end) || !utf_cont(*str)) {
141 return UTF8PROC_ERROR_INVALIDUTF8;
142 }
143 *dst = ((uc & 0x1f) << 6) | (*str & 0x3f);
144 return 2;
145 }
146 if (uc < 0xf0) { // 3-byte sequence
147 if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) {
148 return UTF8PROC_ERROR_INVALIDUTF8;
149 }
150 // Check for surrogate chars
151 if ((uc == 0xed) && (*str > 0x9f)) {
152 return UTF8PROC_ERROR_INVALIDUTF8;
153 }
154 uc = ((uc & 0xf) << 12) | ((*str & 0x3f) << 6) | (str[1] & 0x3f);
155 if (uc < 0x800) {
156 return UTF8PROC_ERROR_INVALIDUTF8;
157 }
158 *dst = uc;
159 return 3;
160 }
161 // 4-byte sequence
162 // Must have 3 valid continuation characters
163 if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) {
164 return UTF8PROC_ERROR_INVALIDUTF8;
165 }
166 // Make sure in correct range (0x10000 - 0x10ffff)
167 if (uc == 0xf0) {
168 if (*str < 0x90) {
169 return UTF8PROC_ERROR_INVALIDUTF8;
170 }
171 } else if (uc == 0xf4) {
172 if (*str > 0x8f) {
173 return UTF8PROC_ERROR_INVALIDUTF8;
174 }
175 }
176 *dst = ((uc & 7) << 18) | ((*str & 0x3f) << 12) | ((str[1] & 0x3f) << 6) | (str[2] & 0x3f);
177 return 4;
178 }
179
utf8proc_codepoint_valid(utf8proc_int32_t uc)180 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
181 return (((utf8proc_uint32_t) uc) - 0xd800 > 0x07ff) && ((utf8proc_uint32_t) uc < 0x110000);
182 }
183
utf8proc_encode_char(utf8proc_int32_t uc,utf8proc_uint8_t * dst)184 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
185 if (uc < 0x00) {
186 return 0;
187 } else if (uc < 0x80) {
188 dst[0] = (utf8proc_uint8_t) uc;
189 return 1;
190 } else if (uc < 0x800) {
191 dst[0] = (utf8proc_uint8_t) (0xC0 + (uc >> 6));
192 dst[1] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
193 return 2;
194 // Note: we allow encoding 0xd800-0xdfff here, so as not to change
195 // the API, however, these are actually invalid in UTF-8
196 } else if (uc < 0x10000) {
197 dst[0] = (utf8proc_uint8_t) (0xE0 + (uc >> 12));
198 dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
199 dst[2] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
200 return 3;
201 } else if (uc < 0x110000) {
202 dst[0] = (utf8proc_uint8_t) (0xF0 + (uc >> 18));
203 dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 12) & 0x3F));
204 dst[2] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
205 dst[3] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
206 return 4;
207 } else {
208 return 0;
209 }
210 }
211
212 /* internal "unsafe" version that does not check whether uc is in range */
unsafe_encode_char(utf8proc_int32_t uc,utf8proc_uint8_t * dst)213 static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
214 if (uc < 0x00) {
215 return 0;
216 } else if (uc < 0x80) {
217 dst[0] = (utf8proc_uint8_t) uc;
218 return 1;
219 } else if (uc < 0x800) {
220 dst[0] = (utf8proc_uint8_t) (0xC0 + (uc >> 6));
221 dst[1] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
222 return 2;
223 } else if (uc == 0xFFFF) {
224 dst[0] = (utf8proc_uint8_t) 0xFF;
225 return 1;
226 } else if (uc == 0xFFFE) {
227 dst[0] = (utf8proc_uint8_t) 0xFE;
228 return 1;
229 } else if (uc < 0x10000) {
230 dst[0] = (utf8proc_uint8_t) (0xE0 + (uc >> 12));
231 dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
232 dst[2] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
233 return 3;
234 } else if (uc < 0x110000) {
235 dst[0] = (utf8proc_uint8_t) (0xF0 + (uc >> 18));
236 dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 12) & 0x3F));
237 dst[2] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
238 dst[3] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
239 return 4;
240 } else {
241 return 0;
242 }
243 }
244
245 /* return whether there is a grapheme break between boundclasses lbc and tbc
246 (according to the definition of extended grapheme clusters)
247
248 Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
249 http://www.unicode.org/reports/tr29/tr29-29.html
250
251 CAVEATS:
252 Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
253 and GB 12/13 (regional indicator code points) require knowledge of previous characters
254 and are thus not handled by this function. This may result in an incorrect break before
255 an E_Modifier class codepoint and an incorrectly missing break between two
256 REGIONAL_INDICATOR class code points if such support does not exist in the caller.
257
258 See the special support in grapheme_break_extended, for required bookkeeping by the caller.
259 */
grapheme_break_simple(int lbc,int tbc)260 static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
261 return (lbc == UTF8PROC_BOUNDCLASS_START) ? true // GB1
262 : ( lbc == UTF8PROC_BOUNDCLASS_CR // GB3
263 && tbc == UTF8PROC_BOUNDCLASS_LF) ? false // ---
264 : (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true // GB4
265 : (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true // GB5
266 : ( lbc == UTF8PROC_BOUNDCLASS_L // GB6
267 && ( tbc == UTF8PROC_BOUNDCLASS_L // ---
268 || tbc == UTF8PROC_BOUNDCLASS_V // ---
269 || tbc == UTF8PROC_BOUNDCLASS_LV // ---
270 || tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false // ---
271 : ( ( lbc == UTF8PROC_BOUNDCLASS_LV // GB7
272 || lbc == UTF8PROC_BOUNDCLASS_V) // ---
273 && ( tbc == UTF8PROC_BOUNDCLASS_V // ---
274 || tbc == UTF8PROC_BOUNDCLASS_T)) ? false // ---
275 : ( ( lbc == UTF8PROC_BOUNDCLASS_LVT // GB8
276 || lbc == UTF8PROC_BOUNDCLASS_T) // ---
277 && tbc == UTF8PROC_BOUNDCLASS_T) ? false // ---
278 : ( tbc == UTF8PROC_BOUNDCLASS_EXTEND // GB9
279 || tbc == UTF8PROC_BOUNDCLASS_ZWJ // ---
280 || tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK // GB9a
281 || lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false // GB9b
282 : ( ( lbc == UTF8PROC_BOUNDCLASS_E_BASE // GB10 (requires additional
283 // handling below)
284 || lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) // ----
285 && tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false // ----
286 : ( lbc == UTF8PROC_BOUNDCLASS_ZWJ // GB11
287 && ( tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ // ----
288 || tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false // ----
289 : ( lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR // GB12/13 (requires additional
290 // handling below)
291 && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false // ----
292 : true; // GB999
293 }
294
grapheme_break_extended(int lbc,int tbc,utf8proc_int32_t * state)295 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state) {
296 int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
297 ? *state : lbc);
298 utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
299 if (state) {
300 // Special support for GB 12/13 made possible by GB999. After two RI
301 // class codepoints we want to force a break. Do this by resetting the
302 // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
303 // after that character according to GB999 (unless of course such a break is
304 // forbidden by a different rule such as GB9).
305 if ((*state == tbc) && (tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)) {
306 *state = UTF8PROC_BOUNDCLASS_OTHER;
307 }
308 // Special support for GB10. Fold any EXTEND codepoints into the previous
309 // boundclass if we're dealing with an emoji base boundclass.
310 else if ( ( (*state == UTF8PROC_BOUNDCLASS_E_BASE)
311 || (*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ))
312 && (tbc == UTF8PROC_BOUNDCLASS_EXTEND)) {
313 *state = UTF8PROC_BOUNDCLASS_E_BASE;
314 } else {
315 *state = tbc;
316 }
317 }
318 return break_permitted;
319 }
320
seqindex_decode_entry(const utf8proc_uint16_t ** entry)321 static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) {
322 utf8proc_int32_t entry_cp = **entry;
323 if ((entry_cp & 0xF800) == 0xD800) {
324 *entry = *entry + 1;
325 entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
326 entry_cp += 0x10000;
327 }
328 return entry_cp;
329 }
330