• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2 /*
3  *  Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other
4  * contributors.
5  *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
6  *
7  *  Permission is hereby granted, free of charge, to any person obtaining a
8  *  copy of this software and associated documentation files (the "Software"),
9  *  to deal in the Software without restriction, including without limitation
10  *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  *  and/or sell copies of the Software, and to permit persons to whom the
12  *  Software is furnished to do so, subject to the following conditions:
13  *
14  *  The above copyright notice and this permission notice shall be included in
15  *  all copies or substantial portions of the Software.
16  *
17  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20  *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23  *  DEALINGS IN THE SOFTWARE.
24  */
25 
26 /*
27  *  This library contains derived data from a modified version of the
28  *  Unicode data files.
29  *
30  *  The original data files are available at
31  *  http://www.unicode.org/Public/UNIDATA/
32  *
33  *  Please notice the copyright statement in the file "utf8proc_data.c".
34  */
35 
36 
37 /*
38  *  File name:    utf8proc.c
39  *
40  *  Description:
41  *  Implementation of libutf8proc.
42  */
43 
44 
45 #include "utf8proc.h"
46 
47 UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
48   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
61   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
62   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
63   4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
64 };
65 
/* Hangul constants.  NOTE(review): the *BASE / *COUNT names match those of
   the Unicode Standard's Hangul syllable (de)composition algorithm --
   confirm against the code that uses them. */
#define UTF8PROC_HANGUL_SBASE  0xAC00
#define UTF8PROC_HANGUL_LBASE  0x1100
#define UTF8PROC_HANGUL_VBASE  0x1161
#define UTF8PROC_HANGUL_TBASE  0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
/* 21 * 28 == 588 */
#define UTF8PROC_HANGUL_NCOUNT (UTF8PROC_HANGUL_VCOUNT * UTF8PROC_HANGUL_TCOUNT)
/* 19 * 588 == 11172 */
#define UTF8PROC_HANGUL_SCOUNT (UTF8PROC_HANGUL_LCOUNT * UTF8PROC_HANGUL_NCOUNT)

/* Code point ranges; every *_START is inclusive and every *_END is exclusive. */
#define UTF8PROC_HANGUL_L_START  UTF8PROC_HANGUL_LBASE  /* 0x1100 */
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  UTF8PROC_HANGUL_SBASE  /* 0xAC00 */
/* 0xAC00 + 11172 == 0xD7A4 */
#define UTF8PROC_HANGUL_S_END    (UTF8PROC_HANGUL_SBASE + UTF8PROC_HANGUL_SCOUNT)
85 
86 /* Should follow semantic-versioning rules (semver.org) based on API
87    compatibility.  (Note that the shared-library version number will
88    be different, being based on ABI compatibility.): */
89 #define STRINGIZEx(x) #x
90 #define STRINGIZE(x)  STRINGIZEx(x)
91 
utf8proc_version(void)92 UTF8PROC_DLLEXPORT const char* utf8proc_version(void) {
93   return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH)
94          "";
95 }
96 
utf8proc_errmsg(utf8proc_ssize_t errcode)97 UTF8PROC_DLLEXPORT const char* utf8proc_errmsg(utf8proc_ssize_t errcode) {
98   switch (errcode) {
99     case UTF8PROC_ERROR_NOMEM:
100       return "Memory for processing UTF-8 data could not be allocated.";
101     case UTF8PROC_ERROR_OVERFLOW:
102       return "UTF-8 string is too long to be processed.";
103     case UTF8PROC_ERROR_INVALIDUTF8:
104       return "Invalid UTF-8 string";
105     case UTF8PROC_ERROR_NOTASSIGNED:
106       return "Unassigned Unicode code point found in UTF-8 string.";
107     case UTF8PROC_ERROR_INVALIDOPTS:
108       return "Invalid options for UTF-8 processing chosen.";
109     default:
110       return "An unknown error occurred while processing UTF-8 data.";
111   }
112 }
113 
114 #define utf_cont(ch) (((ch) & 0xc0) == 0x80)
115 
utf8proc_iterate(const utf8proc_uint8_t * str,utf8proc_ssize_t strlen,utf8proc_int32_t * dst)116 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
117   const utf8proc_uint8_t *str,
118   utf8proc_ssize_t        strlen,
119   utf8proc_int32_t       *dst
120   ) {
121   utf8proc_uint32_t uc;
122   const utf8proc_uint8_t *end;
123 
124   *dst = -1;
125   if (!strlen) {
126     return 0;
127   }
128   end = str + ((strlen < 0) ? 4 : strlen);
129   uc = *str++;
130   if (uc < 0x80) {
131     *dst = uc;
132     return 1;
133   }
134   // Must be between 0xc2 and 0xf4 inclusive to be valid
135   if ((uc - 0xc2) > (0xf4 - 0xc2)) {
136     return UTF8PROC_ERROR_INVALIDUTF8;
137   }
138   if (uc < 0xe0) {         // 2-byte sequence
139     // Must have valid continuation character
140     if ((str >= end) || !utf_cont(*str)) {
141       return UTF8PROC_ERROR_INVALIDUTF8;
142     }
143     *dst = ((uc & 0x1f) << 6) | (*str & 0x3f);
144     return 2;
145   }
146   if (uc < 0xf0) {        // 3-byte sequence
147     if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) {
148       return UTF8PROC_ERROR_INVALIDUTF8;
149     }
150     // Check for surrogate chars
151     if ((uc == 0xed) && (*str > 0x9f)) {
152       return UTF8PROC_ERROR_INVALIDUTF8;
153     }
154     uc = ((uc & 0xf) << 12) | ((*str & 0x3f) << 6) | (str[1] & 0x3f);
155     if (uc < 0x800) {
156       return UTF8PROC_ERROR_INVALIDUTF8;
157     }
158     *dst = uc;
159     return 3;
160   }
161   // 4-byte sequence
162   // Must have 3 valid continuation characters
163   if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) {
164     return UTF8PROC_ERROR_INVALIDUTF8;
165   }
166   // Make sure in correct range (0x10000 - 0x10ffff)
167   if (uc == 0xf0) {
168     if (*str < 0x90) {
169       return UTF8PROC_ERROR_INVALIDUTF8;
170     }
171   } else if (uc == 0xf4) {
172     if (*str > 0x8f) {
173       return UTF8PROC_ERROR_INVALIDUTF8;
174     }
175   }
176   *dst = ((uc & 7) << 18) | ((*str & 0x3f) << 12) | ((str[1] & 0x3f) << 6) | (str[2] & 0x3f);
177   return 4;
178 }
179 
utf8proc_codepoint_valid(utf8proc_int32_t uc)180 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
181   return (((utf8proc_uint32_t) uc) - 0xd800 > 0x07ff) && ((utf8proc_uint32_t) uc < 0x110000);
182 }
183 
utf8proc_encode_char(utf8proc_int32_t uc,utf8proc_uint8_t * dst)184 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
185   if (uc < 0x00) {
186     return 0;
187   } else if (uc < 0x80) {
188     dst[0] = (utf8proc_uint8_t) uc;
189     return 1;
190   } else if (uc < 0x800) {
191     dst[0] = (utf8proc_uint8_t) (0xC0 + (uc >> 6));
192     dst[1] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
193     return 2;
194     // Note: we allow encoding 0xd800-0xdfff here, so as not to change
195     // the API, however, these are actually invalid in UTF-8
196   } else if (uc < 0x10000) {
197     dst[0] = (utf8proc_uint8_t) (0xE0 + (uc >> 12));
198     dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
199     dst[2] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
200     return 3;
201   } else if (uc < 0x110000) {
202     dst[0] = (utf8proc_uint8_t) (0xF0 + (uc >> 18));
203     dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 12) & 0x3F));
204     dst[2] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
205     dst[3] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
206     return 4;
207   } else {
208     return 0;
209   }
210 }
211 
212 /* internal "unsafe" version that does not check whether uc is in range */
unsafe_encode_char(utf8proc_int32_t uc,utf8proc_uint8_t * dst)213 static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
214   if (uc < 0x00) {
215     return 0;
216   } else if (uc < 0x80) {
217     dst[0] = (utf8proc_uint8_t) uc;
218     return 1;
219   } else if (uc < 0x800) {
220     dst[0] = (utf8proc_uint8_t) (0xC0 + (uc >> 6));
221     dst[1] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
222     return 2;
223   } else if (uc == 0xFFFF) {
224     dst[0] = (utf8proc_uint8_t) 0xFF;
225     return 1;
226   } else if (uc == 0xFFFE) {
227     dst[0] = (utf8proc_uint8_t) 0xFE;
228     return 1;
229   } else if (uc < 0x10000) {
230     dst[0] = (utf8proc_uint8_t) (0xE0 + (uc >> 12));
231     dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
232     dst[2] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
233     return 3;
234   } else if (uc < 0x110000) {
235     dst[0] = (utf8proc_uint8_t) (0xF0 + (uc >> 18));
236     dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 12) & 0x3F));
237     dst[2] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
238     dst[3] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
239     return 4;
240   } else {
241     return 0;
242   }
243 }
244 
245 /* return whether there is a grapheme break between boundclasses lbc and tbc
246    (according to the definition of extended grapheme clusters)
247 
248    Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
249    http://www.unicode.org/reports/tr29/tr29-29.html
250 
251    CAVEATS:
252    Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
253    and GB 12/13 (regional indicator code points) require knowledge of previous characters
254    and are thus not handled by this function. This may result in an incorrect break before
255    an E_Modifier class codepoint and an incorrectly missing break between two
256    REGIONAL_INDICATOR class code points if such support does not exist in the caller.
257 
258    See the special support in grapheme_break_extended, for required bookkeeping by the caller.
259  */
grapheme_break_simple(int lbc,int tbc)260 static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
261   return (lbc == UTF8PROC_BOUNDCLASS_START) ? true                                      // GB1
262          : (  lbc == UTF8PROC_BOUNDCLASS_CR                                             // GB3
263            && tbc == UTF8PROC_BOUNDCLASS_LF) ? false                                    // ---
264          : (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true // GB4
265          : (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true // GB5
266          : (  lbc == UTF8PROC_BOUNDCLASS_L                                              // GB6
267            && (  tbc == UTF8PROC_BOUNDCLASS_L                                           // ---
268               || tbc == UTF8PROC_BOUNDCLASS_V                                           // ---
269               || tbc == UTF8PROC_BOUNDCLASS_LV                                          // ---
270               || tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false                               // ---
271          : (  (  lbc == UTF8PROC_BOUNDCLASS_LV                                          // GB7
272               || lbc == UTF8PROC_BOUNDCLASS_V)                                          // ---
273            && (  tbc == UTF8PROC_BOUNDCLASS_V                                           // ---
274               || tbc == UTF8PROC_BOUNDCLASS_T)) ? false                                 // ---
275          : (  (  lbc == UTF8PROC_BOUNDCLASS_LVT                                         // GB8
276               || lbc == UTF8PROC_BOUNDCLASS_T)                                          // ---
277            && tbc == UTF8PROC_BOUNDCLASS_T) ? false                                     // ---
278          : (  tbc == UTF8PROC_BOUNDCLASS_EXTEND                                         // GB9
279            || tbc == UTF8PROC_BOUNDCLASS_ZWJ                                            // ---
280            || tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK                                    // GB9a
281            || lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false                               // GB9b
282          : (  (  lbc == UTF8PROC_BOUNDCLASS_E_BASE                                      // GB10 (requires additional
283                                                                                         // handling below)
284               || lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)                                 // ----
285            && tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false                            // ----
286          : (  lbc == UTF8PROC_BOUNDCLASS_ZWJ                                            // GB11
287            && (  tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ                              // ----
288               || tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false                        // ----
289          : (  lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR                             // GB12/13 (requires additional
290                                                                                         // handling below)
291            && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false                    // ----
292          : true;                                                                        // GB999
293 }
294 
grapheme_break_extended(int lbc,int tbc,utf8proc_int32_t * state)295 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state) {
296   int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
297                       ? *state : lbc);
298   utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
299   if (state) {
300     // Special support for GB 12/13 made possible by GB999. After two RI
301     // class codepoints we want to force a break. Do this by resetting the
302     // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
303     // after that character according to GB999 (unless of course such a break is
304     // forbidden by a different rule such as GB9).
305     if ((*state == tbc) && (tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)) {
306       *state = UTF8PROC_BOUNDCLASS_OTHER;
307     }
308     // Special support for GB10. Fold any EXTEND codepoints into the previous
309     // boundclass if we're dealing with an emoji base boundclass.
310     else if (  (  (*state == UTF8PROC_BOUNDCLASS_E_BASE)
311                || (*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ))
312             && (tbc == UTF8PROC_BOUNDCLASS_EXTEND)) {
313       *state = UTF8PROC_BOUNDCLASS_E_BASE;
314     } else {
315       *state = tbc;
316     }
317   }
318   return break_permitted;
319 }
320 
seqindex_decode_entry(const utf8proc_uint16_t ** entry)321 static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) {
322   utf8proc_int32_t entry_cp = **entry;
323   if ((entry_cp & 0xF800) == 0xD800) {
324     *entry = *entry + 1;
325     entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
326     entry_cp += 0x10000;
327   }
328   return entry_cp;
329 }
330