• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
2 /*
3  *  Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other
4  * contributors.
5  *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
6  *
7  *  Permission is hereby granted, free of charge, to any person obtaining a
8  *  copy of this software and associated documentation files (the "Software"),
9  *  to deal in the Software without restriction, including without limitation
10  *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  *  and/or sell copies of the Software, and to permit persons to whom the
12  *  Software is furnished to do so, subject to the following conditions:
13  *
14  *  The above copyright notice and this permission notice shall be included in
15  *  all copies or substantial portions of the Software.
16  *
17  *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18  *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20  *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23  *  DEALINGS IN THE SOFTWARE.
24  */
25 
26 /*
27  *  This library contains derived data from a modified version of the
28  *  Unicode data files.
29  *
30  *  The original data files are available at
31  *  http://www.unicode.org/Public/UNIDATA/
32  *
33  *  Please notice the copyright statement in the file "utf8proc_data.c".
34  */
35 
36 
37 /*
38  *  File name:    utf8proc.c
39  *
40  *  Description:
41  *  Implementation of libutf8proc.
42  */
43 
44 
45 #include "utf8proc.h"
46 
47 UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
48   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
49   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
50   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
52   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
53   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
54   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
55   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
56   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
57   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
58   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
59   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
60   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
61   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
62   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
63   4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
64 };
65 
/*
 *  Hangul constants.  The S/L/V/T naming follows the Unicode Hangul syllable
 *  composition algorithm (syllable, leading consonant, vowel, trailing
 *  consonant).
 */

/* Base code points.  Note that TBASE is one below UTF8PROC_HANGUL_T_START. */
#define UTF8PROC_HANGUL_SBASE  0xAC00
#define UTF8PROC_HANGUL_LBASE  0x1100
#define UTF8PROC_HANGUL_VBASE  0x1161
#define UTF8PROC_HANGUL_TBASE  0x11A7

/* Counts.  NCOUNT equals VCOUNT * TCOUNT and SCOUNT equals LCOUNT * NCOUNT. */
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588
#define UTF8PROC_HANGUL_SCOUNT 11172

/* Code point ranges, half-open: START is inclusive, END is exclusive.
   S_END - S_START equals UTF8PROC_HANGUL_SCOUNT. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
85 
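/* The version string returned by utf8proc_version() is assembled from
   UTF8PROC_VERSION_MAJOR, UTF8PROC_VERSION_MINOR and UTF8PROC_VERSION_PATCH.
   Those numbers should follow semantic-versioning rules (semver.org) based
   on API compatibility.  (Note that the shared-library version number will
   be different, being based on ABI compatibility.) */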
89 #define STRINGIZEx(x) #x
90 #define STRINGIZE(x)  STRINGIZEx(x)
91 
utf8proc_version(void)92 UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
93   return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH)
94          "";
95 }
96 
utf8proc_errmsg(utf8proc_ssize_t errcode)97 UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
98   switch (errcode) {
99     case UTF8PROC_ERROR_NOMEM:
100       return "Memory for processing UTF-8 data could not be allocated.";
101     case UTF8PROC_ERROR_OVERFLOW:
102       return "UTF-8 string is too long to be processed.";
103     case UTF8PROC_ERROR_INVALIDUTF8:
104       return "Invalid UTF-8 string";
105     case UTF8PROC_ERROR_NOTASSIGNED:
106       return "Unassigned Unicode code point found in UTF-8 string.";
107     case UTF8PROC_ERROR_INVALIDOPTS:
108       return "Invalid options for UTF-8 processing chosen.";
109     default:
110       return "An unknown error occurred while processing UTF-8 data.";
111   }
112 }
113 
114 #define utf_cont(ch) (((ch) & 0xc0) == 0x80)
115 
utf8proc_iterate(const utf8proc_uint8_t * str,utf8proc_ssize_t strlen,utf8proc_int32_t * dst)116 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
117   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
118   ) {
119   utf8proc_uint32_t uc;
120   const utf8proc_uint8_t *end;
121 
122   *dst = -1;
123   if (!strlen) {
124     return 0;
125   }
126   end = str + ((strlen < 0) ? 4 : strlen);
127   uc = *str++;
128   if (uc < 0x80) {
129     *dst = uc;
130     return 1;
131   }
132   // Must be between 0xc2 and 0xf4 inclusive to be valid
133   if ((uc - 0xc2) > (0xf4 - 0xc2)) {
134     return UTF8PROC_ERROR_INVALIDUTF8;
135   }
136   if (uc < 0xe0) {         // 2-byte sequence
137     // Must have valid continuation character
138     if ((str >= end) || !utf_cont(*str)) {
139       return UTF8PROC_ERROR_INVALIDUTF8;
140     }
141     *dst = ((uc & 0x1f) << 6) | (*str & 0x3f);
142     return 2;
143   }
144   if (uc < 0xf0) {        // 3-byte sequence
145     if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1])) {
146       return UTF8PROC_ERROR_INVALIDUTF8;
147     }
148     // Check for surrogate chars
149     if ((uc == 0xed) && (*str > 0x9f) ) {
150       return UTF8PROC_ERROR_INVALIDUTF8;
151     }
152     uc = ((uc & 0xf) << 12) | ((*str & 0x3f) << 6) | (str[1] & 0x3f);
153     if (uc < 0x800) {
154       return UTF8PROC_ERROR_INVALIDUTF8;
155     }
156     *dst = uc;
157     return 3;
158   }
159   // 4-byte sequence
160   // Must have 3 valid continuation characters
161   if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2])) {
162     return UTF8PROC_ERROR_INVALIDUTF8;
163   }
164   // Make sure in correct range (0x10000 - 0x10ffff)
165   if (uc == 0xf0) {
166     if (*str < 0x90) {
167       return UTF8PROC_ERROR_INVALIDUTF8;
168     }
169   } else if (uc == 0xf4) {
170     if (*str > 0x8f) {
171       return UTF8PROC_ERROR_INVALIDUTF8;
172     }
173   }
174   *dst = ((uc & 7) << 18) | ((*str & 0x3f) << 12) | ((str[1] & 0x3f) << 6) | (str[2] & 0x3f);
175   return 4;
176 }
177 
utf8proc_codepoint_valid(utf8proc_int32_t uc)178 UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
179   return (((utf8proc_uint32_t) uc) - 0xd800 > 0x07ff) && ((utf8proc_uint32_t) uc < 0x110000);
180 }
181 
utf8proc_encode_char(utf8proc_int32_t uc,utf8proc_uint8_t * dst)182 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
183   if (uc < 0x00) {
184     return 0;
185   } else if (uc < 0x80) {
186     dst[0] = (utf8proc_uint8_t) uc;
187     return 1;
188   } else if (uc < 0x800) {
189     dst[0] = (utf8proc_uint8_t) (0xC0 + (uc >> 6));
190     dst[1] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
191     return 2;
192     // Note: we allow encoding 0xd800-0xdfff here, so as not to change
193     // the API, however, these are actually invalid in UTF-8
194   } else if (uc < 0x10000) {
195     dst[0] = (utf8proc_uint8_t) (0xE0 + (uc >> 12));
196     dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
197     dst[2] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
198     return 3;
199   } else if (uc < 0x110000) {
200     dst[0] = (utf8proc_uint8_t) (0xF0 + (uc >> 18));
201     dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 12) & 0x3F));
202     dst[2] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
203     dst[3] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
204     return 4;
205   } else {
206     return 0;
207   }
208 }
209 
210 /* internal "unsafe" version that does not check whether uc is in range */
unsafe_encode_char(utf8proc_int32_t uc,utf8proc_uint8_t * dst)211 static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
212   if (uc < 0x00) {
213     return 0;
214   } else if (uc < 0x80) {
215     dst[0] = (utf8proc_uint8_t) uc;
216     return 1;
217   } else if (uc < 0x800) {
218     dst[0] = (utf8proc_uint8_t) (0xC0 + (uc >> 6));
219     dst[1] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
220     return 2;
221   } else if (uc == 0xFFFF) {
222     dst[0] = (utf8proc_uint8_t) 0xFF;
223     return 1;
224   } else if (uc == 0xFFFE) {
225     dst[0] = (utf8proc_uint8_t) 0xFE;
226     return 1;
227   } else if (uc < 0x10000) {
228     dst[0] = (utf8proc_uint8_t) (0xE0 + (uc >> 12));
229     dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
230     dst[2] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
231     return 3;
232   } else if (uc < 0x110000) {
233     dst[0] = (utf8proc_uint8_t) (0xF0 + (uc >> 18));
234     dst[1] = (utf8proc_uint8_t) (0x80 + ((uc >> 12) & 0x3F));
235     dst[2] = (utf8proc_uint8_t) (0x80 + ((uc >> 6) & 0x3F));
236     dst[3] = (utf8proc_uint8_t) (0x80 + (uc & 0x3F));
237     return 4;
238   } else {
239     return 0;
240   }
241 }
242 
243 /* return whether there is a grapheme break between boundclasses lbc and tbc
244    (according to the definition of extended grapheme clusters)
245 
246    Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
247    http://www.unicode.org/reports/tr29/tr29-29.html
248 
249    CAVEATS:
250    Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
251    and GB 12/13 (regional indicator code points) require knowledge of previous characters
252    and are thus not handled by this function. This may result in an incorrect break before
253    an E_Modifier class codepoint and an incorrectly missing break between two
254    REGIONAL_INDICATOR class code points if such support does not exist in the caller.
255 
256    See the special support in grapheme_break_extended, for required bookkeeping by the caller.
257  */
grapheme_break_simple(int lbc,int tbc)258 static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
259   return (lbc == UTF8PROC_BOUNDCLASS_START) ? true                                      // GB1
260          : (  lbc == UTF8PROC_BOUNDCLASS_CR                                             // GB3
261            && tbc == UTF8PROC_BOUNDCLASS_LF) ? false                                    // ---
262          : (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true // GB4
263          : (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true // GB5
264          : (  lbc == UTF8PROC_BOUNDCLASS_L                                              // GB6
265            && (  tbc == UTF8PROC_BOUNDCLASS_L                                           // ---
266               || tbc == UTF8PROC_BOUNDCLASS_V                                           // ---
267               || tbc == UTF8PROC_BOUNDCLASS_LV                                          // ---
268               || tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false                               // ---
269          : (  (  lbc == UTF8PROC_BOUNDCLASS_LV                                          // GB7
270               || lbc == UTF8PROC_BOUNDCLASS_V)                                          // ---
271            && (  tbc == UTF8PROC_BOUNDCLASS_V                                           // ---
272               || tbc == UTF8PROC_BOUNDCLASS_T)) ? false                                 // ---
273          : (  (  lbc == UTF8PROC_BOUNDCLASS_LVT                                         // GB8
274               || lbc == UTF8PROC_BOUNDCLASS_T)                                          // ---
275            && tbc == UTF8PROC_BOUNDCLASS_T) ? false                                     // ---
276          : (  tbc == UTF8PROC_BOUNDCLASS_EXTEND                                         // GB9
277            || tbc == UTF8PROC_BOUNDCLASS_ZWJ                                            // ---
278            || tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK                                    // GB9a
279            || lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false                               // GB9b
280          : (  (  lbc == UTF8PROC_BOUNDCLASS_E_BASE                                      // GB10 (requires additional
281                                                                                         // handling below)
282               || lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)                                 // ----
283            && tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false                            // ----
284          : (  lbc == UTF8PROC_BOUNDCLASS_ZWJ                                            // GB11
285            && (  tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ                              // ----
286               || tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false                        // ----
287          : (  lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR                             // GB12/13 (requires additional
288                                                                                         // handling below)
289            && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false                    // ----
290          : true;                                                                        // GB999
291 }
292 
grapheme_break_extended(int lbc,int tbc,utf8proc_int32_t * state)293 static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state) {
294   int lbc_override = ((state && *state != UTF8PROC_BOUNDCLASS_START)
295                       ? *state : lbc);
296   utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
297   if (state) {
298     // Special support for GB 12/13 made possible by GB999. After two RI
299     // class codepoints we want to force a break. Do this by resetting the
300     // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
301     // after that character according to GB999 (unless of course such a break is
302     // forbidden by a different rule such as GB9).
303     if ((*state == tbc) && (tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)) {
304       *state = UTF8PROC_BOUNDCLASS_OTHER;
305     }
306     // Special support for GB10. Fold any EXTEND codepoints into the previous
307     // boundclass if we're dealing with an emoji base boundclass.
308     else if (  (  (*state == UTF8PROC_BOUNDCLASS_E_BASE)
309                || (*state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) )
310             && (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ) {
311       *state = UTF8PROC_BOUNDCLASS_E_BASE;
312     } else {
313       *state = tbc;
314     }
315   }
316   return break_permitted;
317 }
318 
seqindex_decode_entry(const utf8proc_uint16_t ** entry)319 static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry) {
320   utf8proc_int32_t entry_cp = **entry;
321   if ((entry_cp & 0xF800) == 0xD800) {
322     *entry = *entry + 1;
323     entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
324     entry_cp += 0x10000;
325   }
326   return entry_cp;
327 }
328