• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ------------------------------------------------------------------------
2 
3    unicodedata -- Provides access to the Unicode database.
4 
5    The current version number is reported in the unidata_version constant.
6 
7    Written by Marc-Andre Lemburg (mal@lemburg.com).
8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9    Modified by Martin v. Löwis (martin@v.loewis.de)
10 
11    Copyright (c) Corporation for National Research Initiatives.
12 
13    ------------------------------------------------------------------------ */
14 
15 #ifndef Py_BUILD_CORE_BUILTIN
16 #  define Py_BUILD_CORE_MODULE 1
17 #endif
18 
19 #include "Python.h"
20 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
21 
22 #include <stdbool.h>
23 #include <stddef.h>               // offsetof()
24 
25 /*[clinic input]
26 module unicodedata
27 class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
28 [clinic start generated code]*/
29 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
30 
31 /* character properties */
32 
/* One row of per-character properties.  Rows are stored in
   _PyUnicode_Database_Records and located through _getrecord_ex().
   The field order must not change: the rows are generated data. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* Packed quickcheck answers, two bits per normalization form; the
       bit offset is (nfc ? 4 : 0) + (k ? 2 : 0), see
       is_normalized_quickcheck(). */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
44 
/* Per-character differences between the current database and an older
   database version.  For the unsigned char fields the readers in this
   file treat 0xFF as "no override"; category_changed == 0 is treated as
   "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;            /* bidirectional class index */
    const unsigned char category_changed;         /* general category index */
    const unsigned char decimal_changed;          /* decimal value */
    const unsigned char mirrored_changed;         /* mirrored flag */
    const unsigned char east_asian_width_changed; /* east asian width index */
    const double numeric_changed;                 /* numeric value */
} change_record;
54 
55 /* data file generated by Tools/unicode/makeunicodedata.py */
56 #include "unicodedata_db.h"
57 
58 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)59 _getrecord_ex(Py_UCS4 code)
60 {
61     int index;
62     if (code >= 0x110000)
63         index = 0;
64     else {
65         index = index1[(code>>SHIFT)];
66         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
67     }
68 
69     return &_PyUnicode_Database_Records[index];
70 }
71 
72 /* ------------- Previous-version API ------------------------------------- */
73 typedef struct previous_version {
74     PyObject_HEAD
75     const char *name;
76     const change_record* (*getrecord)(Py_UCS4);
77     Py_UCS4 (*normalization)(Py_UCS4);
78 } PreviousDBVersion;
79 
80 #include "clinic/unicodedata.c.h"
81 
/* Fetch the change_record for code point v from the PreviousDBVersion
   object self.  Both arguments are parenthesized so that arbitrary
   expressions can be passed safely. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
83 
84 static PyMemberDef DB_members[] = {
85         {"unidata_version", Py_T_STRING, offsetof(PreviousDBVersion, name), Py_READONLY},
86         {NULL}
87 };
88 
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
// The argument is parenthesized so that any pointer expression can be
// passed; it is evaluated twice, so it must not have side effects.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
94 
95 static PyObject*
new_previous_version(PyTypeObject * ucd_type,const char * name,const change_record * (* getrecord)(Py_UCS4),Py_UCS4 (* normalization)(Py_UCS4))96 new_previous_version(PyTypeObject *ucd_type,
97                      const char*name, const change_record* (*getrecord)(Py_UCS4),
98                      Py_UCS4 (*normalization)(Py_UCS4))
99 {
100     PreviousDBVersion *self;
101     self = PyObject_GC_New(PreviousDBVersion, ucd_type);
102     if (self == NULL)
103         return NULL;
104     self->name = name;
105     self->getrecord = getrecord;
106     self->normalization = normalization;
107     PyObject_GC_Track(self);
108     return (PyObject*)self;
109 }
110 
111 
112 /* --- Module API --------------------------------------------------------- */
113 
114 /*[clinic input]
115 unicodedata.UCD.decimal
116 
117     self: self
118     chr: int(accept={str})
119     default: object=NULL
120     /
121 
122 Converts a Unicode character into its equivalent decimal value.
123 
124 Returns the decimal value assigned to the character chr as integer.
125 If no such value is defined, default is returned, or, if not given,
126 ValueError is raised.
127 [clinic start generated code]*/
128 
129 static PyObject *
unicodedata_UCD_decimal_impl(PyObject * self,int chr,PyObject * default_value)130 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
131                              PyObject *default_value)
132 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
133 {
134     int have_old = 0;
135     long rc;
136     Py_UCS4 c = (Py_UCS4)chr;
137 
138     if (UCD_Check(self)) {
139         const change_record *old = get_old_record(self, c);
140         if (old->category_changed == 0) {
141             /* unassigned */
142             have_old = 1;
143             rc = -1;
144         }
145         else if (old->decimal_changed != 0xFF) {
146             have_old = 1;
147             rc = old->decimal_changed;
148         }
149     }
150 
151     if (!have_old)
152         rc = Py_UNICODE_TODECIMAL(c);
153     if (rc < 0) {
154         if (default_value == NULL) {
155             PyErr_SetString(PyExc_ValueError,
156                             "not a decimal");
157             return NULL;
158         }
159         else {
160             return Py_NewRef(default_value);
161         }
162     }
163     return PyLong_FromLong(rc);
164 }
165 
166 /*[clinic input]
167 unicodedata.UCD.digit
168 
169     self: self
170     chr: int(accept={str})
171     default: object=NULL
172     /
173 
174 Converts a Unicode character into its equivalent digit value.
175 
176 Returns the digit value assigned to the character chr as integer.
177 If no such value is defined, default is returned, or, if not given,
178 ValueError is raised.
179 [clinic start generated code]*/
180 
181 static PyObject *
unicodedata_UCD_digit_impl(PyObject * self,int chr,PyObject * default_value)182 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
183 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
184 {
185     long rc;
186     Py_UCS4 c = (Py_UCS4)chr;
187     rc = Py_UNICODE_TODIGIT(c);
188     if (rc < 0) {
189         if (default_value == NULL) {
190             PyErr_SetString(PyExc_ValueError, "not a digit");
191             return NULL;
192         }
193         else {
194             return Py_NewRef(default_value);
195         }
196     }
197     return PyLong_FromLong(rc);
198 }
199 
200 /*[clinic input]
201 unicodedata.UCD.numeric
202 
203     self: self
204     chr: int(accept={str})
205     default: object=NULL
206     /
207 
208 Converts a Unicode character into its equivalent numeric value.
209 
210 Returns the numeric value assigned to the character chr as float.
211 If no such value is defined, default is returned, or, if not given,
212 ValueError is raised.
213 [clinic start generated code]*/
214 
215 static PyObject *
unicodedata_UCD_numeric_impl(PyObject * self,int chr,PyObject * default_value)216 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
217                              PyObject *default_value)
218 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
219 {
220     int have_old = 0;
221     double rc;
222     Py_UCS4 c = (Py_UCS4)chr;
223 
224     if (UCD_Check(self)) {
225         const change_record *old = get_old_record(self, c);
226         if (old->category_changed == 0) {
227             /* unassigned */
228             have_old = 1;
229             rc = -1.0;
230         }
231         else if (old->decimal_changed != 0xFF) {
232             have_old = 1;
233             rc = old->decimal_changed;
234         }
235     }
236 
237     if (!have_old)
238         rc = Py_UNICODE_TONUMERIC(c);
239     if (rc == -1.0) {
240         if (default_value == NULL) {
241             PyErr_SetString(PyExc_ValueError, "not a numeric character");
242             return NULL;
243         }
244         else {
245             return Py_NewRef(default_value);
246         }
247     }
248     return PyFloat_FromDouble(rc);
249 }
250 
251 /*[clinic input]
252 unicodedata.UCD.category
253 
254     self: self
255     chr: int(accept={str})
256     /
257 
258 Returns the general category assigned to the character chr as string.
259 [clinic start generated code]*/
260 
261 static PyObject *
unicodedata_UCD_category_impl(PyObject * self,int chr)262 unicodedata_UCD_category_impl(PyObject *self, int chr)
263 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
264 {
265     int index;
266     Py_UCS4 c = (Py_UCS4)chr;
267     index = (int) _getrecord_ex(c)->category;
268     if (UCD_Check(self)) {
269         const change_record *old = get_old_record(self, c);
270         if (old->category_changed != 0xFF)
271             index = old->category_changed;
272     }
273     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
274 }
275 
276 /*[clinic input]
277 unicodedata.UCD.bidirectional
278 
279     self: self
280     chr: int(accept={str})
281     /
282 
283 Returns the bidirectional class assigned to the character chr as string.
284 
285 If no such value is defined, an empty string is returned.
286 [clinic start generated code]*/
287 
288 static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject * self,int chr)289 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
290 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
291 {
292     int index;
293     Py_UCS4 c = (Py_UCS4)chr;
294     index = (int) _getrecord_ex(c)->bidirectional;
295     if (UCD_Check(self)) {
296         const change_record *old = get_old_record(self, c);
297         if (old->category_changed == 0)
298             index = 0; /* unassigned */
299         else if (old->bidir_changed != 0xFF)
300             index = old->bidir_changed;
301     }
302     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
303 }
304 
305 /*[clinic input]
306 unicodedata.UCD.combining -> int
307 
308     self: self
309     chr: int(accept={str})
310     /
311 
312 Returns the canonical combining class assigned to the character chr as integer.
313 
314 Returns 0 if no combining class is defined.
315 [clinic start generated code]*/
316 
317 static int
unicodedata_UCD_combining_impl(PyObject * self,int chr)318 unicodedata_UCD_combining_impl(PyObject *self, int chr)
319 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
320 {
321     int index;
322     Py_UCS4 c = (Py_UCS4)chr;
323     index = (int) _getrecord_ex(c)->combining;
324     if (UCD_Check(self)) {
325         const change_record *old = get_old_record(self, c);
326         if (old->category_changed == 0)
327             index = 0; /* unassigned */
328     }
329     return index;
330 }
331 
332 /*[clinic input]
333 unicodedata.UCD.mirrored -> int
334 
335     self: self
336     chr: int(accept={str})
337     /
338 
339 Returns the mirrored property assigned to the character chr as integer.
340 
341 Returns 1 if the character has been identified as a "mirrored"
342 character in bidirectional text, 0 otherwise.
343 [clinic start generated code]*/
344 
345 static int
unicodedata_UCD_mirrored_impl(PyObject * self,int chr)346 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
347 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
348 {
349     int index;
350     Py_UCS4 c = (Py_UCS4)chr;
351     index = (int) _getrecord_ex(c)->mirrored;
352     if (UCD_Check(self)) {
353         const change_record *old = get_old_record(self, c);
354         if (old->category_changed == 0)
355             index = 0; /* unassigned */
356         else if (old->mirrored_changed != 0xFF)
357             index = old->mirrored_changed;
358     }
359     return index;
360 }
361 
362 /*[clinic input]
363 unicodedata.UCD.east_asian_width
364 
365     self: self
366     chr: int(accept={str})
367     /
368 
369 Returns the east asian width assigned to the character chr as string.
370 [clinic start generated code]*/
371 
372 static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject * self,int chr)373 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
374 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
375 {
376     int index;
377     Py_UCS4 c = (Py_UCS4)chr;
378     index = (int) _getrecord_ex(c)->east_asian_width;
379     if (UCD_Check(self)) {
380         const change_record *old = get_old_record(self, c);
381         if (old->category_changed == 0)
382             index = 0; /* unassigned */
383         else if (old->east_asian_width_changed != 0xFF)
384             index = old->east_asian_width_changed;
385     }
386     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
387 }
388 
389 /*[clinic input]
390 unicodedata.UCD.decomposition
391 
392     self: self
393     chr: int(accept={str})
394     /
395 
396 Returns the character decomposition mapping assigned to the character chr as string.
397 
398 An empty string is returned in case no such mapping is defined.
399 [clinic start generated code]*/
400 
401 static PyObject *
unicodedata_UCD_decomposition_impl(PyObject * self,int chr)402 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
403 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
404 {
405     char decomp[256];
406     int code, index, count;
407     size_t i;
408     unsigned int prefix_index;
409     Py_UCS4 c = (Py_UCS4)chr;
410 
411     code = (int)c;
412 
413     if (UCD_Check(self)) {
414         const change_record *old = get_old_record(self, c);
415         if (old->category_changed == 0)
416             return PyUnicode_FromString(""); /* unassigned */
417     }
418 
419     if (code < 0 || code >= 0x110000)
420         index = 0;
421     else {
422         index = decomp_index1[(code>>DECOMP_SHIFT)];
423         index = decomp_index2[(index<<DECOMP_SHIFT)+
424                              (code&((1<<DECOMP_SHIFT)-1))];
425     }
426 
427     /* high byte is number of hex bytes (usually one or two), low byte
428        is prefix code (from*/
429     count = decomp_data[index] >> 8;
430 
431     /* XXX: could allocate the PyString up front instead
432        (strlen(prefix) + 5 * count + 1 bytes) */
433 
434     /* Based on how index is calculated above and decomp_data is generated
435        from Tools/unicode/makeunicodedata.py, it should not be possible
436        to overflow decomp_prefix. */
437     prefix_index = decomp_data[index] & 255;
438     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
439 
440     /* copy prefix */
441     i = strlen(decomp_prefix[prefix_index]);
442     memcpy(decomp, decomp_prefix[prefix_index], i);
443 
444     while (count-- > 0) {
445         if (i)
446             decomp[i++] = ' ';
447         assert(i < sizeof(decomp));
448         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
449                       decomp_data[++index]);
450         i += strlen(decomp + i);
451     }
452     return PyUnicode_FromStringAndSize(decomp, i);
453 }
454 
455 static void
get_decomp_record(PyObject * self,Py_UCS4 code,int * index,int * prefix,int * count)456 get_decomp_record(PyObject *self, Py_UCS4 code,
457                   int *index, int *prefix, int *count)
458 {
459     if (code >= 0x110000) {
460         *index = 0;
461     }
462     else if (UCD_Check(self)
463              && get_old_record(self, code)->category_changed==0) {
464         /* unassigned in old version */
465         *index = 0;
466     }
467     else {
468         *index = decomp_index1[(code>>DECOMP_SHIFT)];
469         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
470                                (code&((1<<DECOMP_SHIFT)-1))];
471     }
472 
473     /* high byte is number of hex bytes (usually one or two), low byte
474        is prefix code (from*/
475     *count = decomp_data[*index] >> 8;
476     *prefix = decomp_data[*index] & 255;
477 
478     (*index)++;
479 }
480 
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one before the first trailing consonant */
#define LCount  19                  /* number of leading consonants */
#define VCount  21                  /* number of vowels */
#define TCount  28                  /* trailing consonants, plus "none" */
#define NCount  (VCount * TCount)   /* syllables per leading consonant */
#define SCount  (LCount * NCount)   /* total precomposed syllables */
490 
491 static PyObject*
nfd_nfkd(PyObject * self,PyObject * input,int k)492 nfd_nfkd(PyObject *self, PyObject *input, int k)
493 {
494     PyObject *result;
495     Py_UCS4 *output;
496     Py_ssize_t i, o, osize;
497     int kind;
498     const void *data;
499     /* Longest decomposition in Unicode 3.2: U+FDFA */
500     Py_UCS4 stack[20];
501     Py_ssize_t space, isize;
502     int index, prefix, count, stackptr;
503     unsigned char prev, cur;
504 
505     stackptr = 0;
506     isize = PyUnicode_GET_LENGTH(input);
507     space = isize;
508     /* Overallocate at most 10 characters. */
509     if (space > 10) {
510         if (space <= PY_SSIZE_T_MAX - 10)
511             space += 10;
512     }
513     else {
514         space *= 2;
515     }
516     osize = space;
517     output = PyMem_NEW(Py_UCS4, space);
518     if (!output) {
519         PyErr_NoMemory();
520         return NULL;
521     }
522     i = o = 0;
523     kind = PyUnicode_KIND(input);
524     data = PyUnicode_DATA(input);
525 
526     while (i < isize) {
527         stack[stackptr++] = PyUnicode_READ(kind, data, i++);
528         while(stackptr) {
529             Py_UCS4 code = stack[--stackptr];
530             /* Hangul Decomposition adds three characters in
531                a single step, so we need at least that much room. */
532             if (space < 3) {
533                 Py_UCS4 *new_output;
534                 osize += 10;
535                 space += 10;
536                 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
537                 if (new_output == NULL) {
538                     PyMem_Free(output);
539                     PyErr_NoMemory();
540                     return NULL;
541                 }
542                 output = new_output;
543             }
544             /* Hangul Decomposition. */
545             if (SBase <= code && code < (SBase+SCount)) {
546                 int SIndex = code - SBase;
547                 int L = LBase + SIndex / NCount;
548                 int V = VBase + (SIndex % NCount) / TCount;
549                 int T = TBase + SIndex % TCount;
550                 output[o++] = L;
551                 output[o++] = V;
552                 space -= 2;
553                 if (T != TBase) {
554                     output[o++] = T;
555                     space --;
556                 }
557                 continue;
558             }
559             /* normalization changes */
560             if (UCD_Check(self)) {
561                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
562                 if (value != 0) {
563                     stack[stackptr++] = value;
564                     continue;
565                 }
566             }
567 
568             /* Other decompositions. */
569             get_decomp_record(self, code, &index, &prefix, &count);
570 
571             /* Copy character if it is not decomposable, or has a
572                compatibility decomposition, but we do NFD. */
573             if (!count || (prefix && !k)) {
574                 output[o++] = code;
575                 space--;
576                 continue;
577             }
578             /* Copy decomposition onto the stack, in reverse
579                order.  */
580             while(count) {
581                 code = decomp_data[index + (--count)];
582                 stack[stackptr++] = code;
583             }
584         }
585     }
586 
587     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
588                                        output, o);
589     PyMem_Free(output);
590     if (!result)
591         return NULL;
592     /* result is guaranteed to be ready, as it is compact. */
593     kind = PyUnicode_KIND(result);
594     data = PyUnicode_DATA(result);
595 
596     /* Sort canonically. */
597     i = 0;
598     prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
599     for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
600         cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
601         if (prev == 0 || cur == 0 || prev <= cur) {
602             prev = cur;
603             continue;
604         }
605         /* Non-canonical order. Need to switch *i with previous. */
606         o = i - 1;
607         while (1) {
608             Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
609             PyUnicode_WRITE(kind, data, o+1,
610                             PyUnicode_READ(kind, data, o));
611             PyUnicode_WRITE(kind, data, o, tmp);
612             o--;
613             if (o < 0)
614                 break;
615             prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
616             if (prev == 0 || prev <= cur)
617                 break;
618         }
619         prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
620     }
621     return result;
622 }
623 
624 static int
find_nfc_index(const struct reindex * nfc,Py_UCS4 code)625 find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
626 {
627     unsigned int index;
628     for (index = 0; nfc[index].start; index++) {
629         unsigned int start = nfc[index].start;
630         if (code < start)
631             return -1;
632         if (code <= start + nfc[index].count) {
633             unsigned int delta = code - start;
634             return nfc[index].index + delta;
635         }
636     }
637     return -1;
638 }
639 
640 static PyObject*
nfc_nfkc(PyObject * self,PyObject * input,int k)641 nfc_nfkc(PyObject *self, PyObject *input, int k)
642 {
643     PyObject *result;
644     int kind;
645     const void *data;
646     Py_UCS4 *output;
647     Py_ssize_t i, i1, o, len;
648     int f,l,index,index1,comb;
649     Py_UCS4 code;
650     Py_ssize_t skipped[20];
651     int cskipped = 0;
652 
653     result = nfd_nfkd(self, input, k);
654     if (!result)
655         return NULL;
656     /* result will be "ready". */
657     kind = PyUnicode_KIND(result);
658     data = PyUnicode_DATA(result);
659     len = PyUnicode_GET_LENGTH(result);
660 
661     /* We allocate a buffer for the output.
662        If we find that we made no changes, we still return
663        the NFD result. */
664     output = PyMem_NEW(Py_UCS4, len);
665     if (!output) {
666         PyErr_NoMemory();
667         Py_DECREF(result);
668         return 0;
669     }
670     i = o = 0;
671 
672   again:
673     while (i < len) {
674       for (index = 0; index < cskipped; index++) {
675           if (skipped[index] == i) {
676               /* *i character is skipped.
677                  Remove from list. */
678               skipped[index] = skipped[cskipped-1];
679               cskipped--;
680               i++;
681               goto again; /* continue while */
682           }
683       }
684       /* Hangul Composition. We don't need to check for <LV,T>
685          pairs, since we always have decomposed data. */
686       code = PyUnicode_READ(kind, data, i);
687       if (LBase <= code && code < (LBase+LCount) &&
688           i + 1 < len &&
689           VBase <= PyUnicode_READ(kind, data, i+1) &&
690           PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
691           /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
692              and V character is a modern vowel (0x1161 ~ 0x1175). */
693           int LIndex, VIndex;
694           LIndex = code - LBase;
695           VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
696           code = SBase + (LIndex*VCount+VIndex)*TCount;
697           i+=2;
698           if (i < len &&
699               TBase < PyUnicode_READ(kind, data, i) &&
700               PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
701               /* check T character is a modern trailing consonant
702                  (0x11A8 ~ 0x11C2). */
703               code += PyUnicode_READ(kind, data, i)-TBase;
704               i++;
705           }
706           output[o++] = code;
707           continue;
708       }
709 
710       /* code is still input[i] here */
711       f = find_nfc_index(nfc_first, code);
712       if (f == -1) {
713           output[o++] = code;
714           i++;
715           continue;
716       }
717       /* Find next unblocked character. */
718       i1 = i+1;
719       comb = 0;
720       /* output base character for now; might be updated later. */
721       output[o] = PyUnicode_READ(kind, data, i);
722       while (i1 < len) {
723           Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
724           int comb1 = _getrecord_ex(code1)->combining;
725           if (comb) {
726               if (comb1 == 0)
727                   break;
728               if (comb >= comb1) {
729                   /* Character is blocked. */
730                   i1++;
731                   continue;
732               }
733           }
734           l = find_nfc_index(nfc_last, code1);
735           /* i1 cannot be combined with i. If i1
736              is a starter, we don't need to look further.
737              Otherwise, record the combining class. */
738           if (l == -1) {
739             not_combinable:
740               if (comb1 == 0)
741                   break;
742               comb = comb1;
743               i1++;
744               continue;
745           }
746           index = f*TOTAL_LAST + l;
747           index1 = comp_index[index >> COMP_SHIFT];
748           code = comp_data[(index1<<COMP_SHIFT)+
749                            (index&((1<<COMP_SHIFT)-1))];
750           if (code == 0)
751               goto not_combinable;
752 
753           /* Replace the original character. */
754           output[o] = code;
755           /* Mark the second character unused. */
756           assert(cskipped < 20);
757           skipped[cskipped++] = i1;
758           i1++;
759           f = find_nfc_index(nfc_first, output[o]);
760           if (f == -1)
761               break;
762       }
763       /* Output character was already written.
764          Just advance the indices. */
765       o++; i++;
766     }
767     if (o == len) {
768         /* No changes. Return original string. */
769         PyMem_Free(output);
770         return result;
771     }
772     Py_DECREF(result);
773     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
774                                        output, o);
775     PyMem_Free(output);
776     return result;
777 }
778 
// Possible answers of the normalization quickcheck.
// The numeric values need to match the logic in makeunicodedata.py,
// which constructs the quickcheck data.
typedef enum {
    YES = 0,
    MAYBE = 1,
    NO = 2
} QuickcheckResult;
782 
783 /* Run the Unicode normalization "quickcheck" algorithm.
784  *
785  * Return YES or NO if quickcheck determines the input is certainly
786  * normalized or certainly not, and MAYBE if quickcheck is unable to
787  * tell.
788  *
789  * If `yes_only` is true, then return MAYBE as soon as we determine
790  * the answer is not YES.
791  *
792  * For background and details on the algorithm, see UAX #15:
793  *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
794  */
795 static QuickcheckResult
is_normalized_quickcheck(PyObject * self,PyObject * input,bool nfc,bool k,bool yes_only)796 is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
797                          bool yes_only)
798 {
799     /* UCD 3.2.0 is requested, quickchecks must be disabled. */
800     if (UCD_Check(self)) {
801         return MAYBE;
802     }
803 
804     if (PyUnicode_IS_ASCII(input)) {
805         return YES;
806     }
807 
808     Py_ssize_t i, len;
809     int kind;
810     const void *data;
811     unsigned char prev_combining = 0;
812 
813     /* The two quickcheck bits at this shift have type QuickcheckResult. */
814     int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
815 
816     QuickcheckResult result = YES; /* certainly normalized, unless we find something */
817 
818     i = 0;
819     kind = PyUnicode_KIND(input);
820     data = PyUnicode_DATA(input);
821     len = PyUnicode_GET_LENGTH(input);
822     while (i < len) {
823         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
824         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
825 
826         unsigned char combining = record->combining;
827         if (combining && prev_combining > combining)
828             return NO; /* non-canonical sort order, not normalized */
829         prev_combining = combining;
830 
831         unsigned char quickcheck_whole = record->normalization_quick_check;
832         if (yes_only) {
833             if (quickcheck_whole & (3 << quickcheck_shift))
834                 return MAYBE;
835         } else {
836             switch ((quickcheck_whole >> quickcheck_shift) & 3) {
837             case NO:
838               return NO;
839             case MAYBE:
840               result = MAYBE; /* this string might need normalization */
841             }
842         }
843     }
844     return result;
845 }
846 
847 /*[clinic input]
848 unicodedata.UCD.is_normalized
849 
850     self: self
851     form: unicode
852     unistr as input: unicode
853     /
854 
855 Return whether the Unicode string unistr is in the normal form 'form'.
856 
857 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
858 [clinic start generated code]*/
859 
860 static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject * self,PyObject * form,PyObject * input)861 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
862                                    PyObject *input)
863 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
864 {
865     if (PyUnicode_GET_LENGTH(input) == 0) {
866         /* special case empty input strings. */
867         Py_RETURN_TRUE;
868     }
869 
870     PyObject *result;
871     bool nfc = false;
872     bool k = false;
873     QuickcheckResult m;
874 
875     PyObject *cmp;
876     int match = 0;
877 
878     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
879         nfc = true;
880     }
881     else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
882         nfc = true;
883         k = true;
884     }
885     else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
886         /* matches default values for `nfc` and `k` */
887     }
888     else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
889         k = true;
890     }
891     else {
892         PyErr_SetString(PyExc_ValueError, "invalid normalization form");
893         return NULL;
894     }
895 
896     m = is_normalized_quickcheck(self, input, nfc, k, false);
897 
898     if (m == MAYBE) {
899         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
900         if (cmp == NULL) {
901             return NULL;
902         }
903         match = PyUnicode_Compare(input, cmp);
904         Py_DECREF(cmp);
905         result = (match == 0) ? Py_True : Py_False;
906     }
907     else {
908         result = (m == YES) ? Py_True : Py_False;
909     }
910 
911     return Py_NewRef(result);
912 }
913 
914 
915 /*[clinic input]
916 unicodedata.UCD.normalize
917 
918     self: self
919     form: unicode
920     unistr as input: unicode
921     /
922 
923 Return the normal form 'form' for the Unicode string unistr.
924 
925 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
926 [clinic start generated code]*/
927 
928 static PyObject *
unicodedata_UCD_normalize_impl(PyObject * self,PyObject * form,PyObject * input)929 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
930                                PyObject *input)
931 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
932 {
933     if (PyUnicode_GET_LENGTH(input) == 0) {
934         /* Special case empty input strings, since resizing
935            them  later would cause internal errors. */
936         return Py_NewRef(input);
937     }
938 
939     if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
940         if (is_normalized_quickcheck(self, input,
941                                      true,  false, true) == YES) {
942             return Py_NewRef(input);
943         }
944         return nfc_nfkc(self, input, 0);
945     }
946     if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
947         if (is_normalized_quickcheck(self, input,
948                                      true,  true,  true) == YES) {
949             return Py_NewRef(input);
950         }
951         return nfc_nfkc(self, input, 1);
952     }
953     if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
954         if (is_normalized_quickcheck(self, input,
955                                      false, false, true) == YES) {
956             return Py_NewRef(input);
957         }
958         return nfd_nfkd(self, input, 0);
959     }
960     if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
961         if (is_normalized_quickcheck(self, input,
962                                      false, true,  true) == YES) {
963             return Py_NewRef(input);
964         }
965         return nfd_nfkd(self, input, 1);
966     }
967     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
968     return NULL;
969 }
970 
971 /* -------------------------------------------------------------------- */
972 /* unicode character name tables */
973 
974 /* data file generated by Tools/unicode/makeunicodedata.py */
975 #include "unicodename_db.h"
976 
977 /* -------------------------------------------------------------------- */
978 /* database code (cut and pasted from the unidb package) */
979 
/* Short names of the pieces a Hangul syllable name is assembled from.
 * The three columns are independent lists that merely share one array:
 *   column 0 is indexed by the L index (rows 0..18 are populated),
 *   column 1 is indexed by the V index (rows 0..20 are populated),
 *   column 2 is indexed by the T index (rows 0..27 are populated).
 * NULL marks a cell past the end of a column's list; "" is a real entry
 * that contributes nothing to the name. */
static const char * const hangul_syllables[][3] = {
    /*  L      V      T   */
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
1010 
1011 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1012 static int
is_unified_ideograph(Py_UCS4 code)1013 is_unified_ideograph(Py_UCS4 code)
1014 {
1015     return
1016         (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
1017         (0x4E00 <= code && code <= 0x9FFF)   || /* CJK Ideograph */
1018         (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1019         (0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
1020         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1021         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1022         (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1023         (0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
1024         (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
1025         (0x31350 <= code && code <= 0x323AF);   /* CJK Ideograph Extension H */
1026 }
1027 
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * Both ranges are half-open: [start, end).  The argument is parenthesized
 * inside the expansion so that an expression argument (for example one
 * using ?:) is compared as a whole.  `cp` is evaluated twice, so it must
 * not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1033 
1034 
1035 // DAWG decoding functions
1036 
1037 static unsigned int
_dawg_decode_varint_unsigned(unsigned int index,unsigned int * result)1038 _dawg_decode_varint_unsigned(unsigned int index, unsigned int* result)
1039 {
1040     unsigned int res = 0;
1041     unsigned int shift = 0;
1042     for (;;) {
1043         unsigned char byte = packed_name_dawg[index];
1044         res |= (byte & 0x7f) << shift;
1045         index++;
1046         shift += 7;
1047         if (!(byte & 0x80)) {
1048             *result = res;
1049             return index;
1050         }
1051     }
1052 }
1053 
1054 static int
_dawg_match_edge(const char * name,unsigned int namelen,unsigned int size,unsigned int label_offset,unsigned int namepos)1055 _dawg_match_edge(const char* name, unsigned int namelen, unsigned int size,
1056                  unsigned int label_offset, unsigned int namepos)
1057 {
1058     // This returns 1 if the edge matched, 0 if it didn't (but further edges
1059     // could match) and -1 if the name cannot match at all.
1060     if (size > 1 && namepos + size > namelen) {
1061         return 0;
1062     }
1063     for (unsigned int i = 0; i < size; i++) {
1064         if (packed_name_dawg[label_offset + i] != Py_TOUPPER(name[namepos + i])) {
1065             if (i > 0) {
1066                 return -1; // cannot match at all
1067             }
1068             return 0;
1069         }
1070     }
1071     return 1;
1072 }
1073 
1074 // reading DAWG node information:
1075 // a node is encoded by a varint. The lowest bit of that int is set if the node
1076 // is a final, accepting state. The higher bits of that int represent the
1077 // number of names that are encoded by the sub-DAWG started by this node. It's
1078 // used to compute the position of a name.
1079 //
1080 // the starting node of the DAWG is at position 0.
1081 //
1082 // the varint representing a node is followed by the node's edges, the encoding
1083 // is described below
1084 
1085 
/* Read the node header varint at `node_offset`.
 * Stores the node's "final" flag (lowest bit of the varint) in *final and
 * returns the offset just past the varint, i.e. where the node's edge list
 * starts. */
static unsigned int
_dawg_decode_node(unsigned int node_offset, bool* final)
{
    unsigned int header;
    unsigned int edges_offset = _dawg_decode_varint_unsigned(node_offset,
                                                             &header);
    *final = (header & 1) != 0;
    return edges_offset;
}
1094 
/* Return true if the node at `node_offset` is a final (accepting) node,
 * i.e. the lowest bit of its header varint is set. */
static bool
_dawg_node_is_final(unsigned int node_offset)
{
    unsigned int header;
    (void)_dawg_decode_varint_unsigned(node_offset, &header);
    return (header & 1) != 0;
}
1102 
/* Return the count stored in the upper bits of the header varint of the
 * node at `node_offset` (everything above the lowest, "final" bit). */
static unsigned int
_dawg_node_descendant_count(unsigned int node_offset)
{
    unsigned int header;
    (void)_dawg_decode_varint_unsigned(node_offset, &header);
    return header / 2;
}
1110 
1111 
1112 // reading DAWG edge information:
1113 // a DAWG edge is comprised of the following information:
1114 // (1) the size of the label of the string attached to the edge
1115 // (2) the characters of that edge
1116 // (3) the target node
1117 // (4) whether the edge is the last edge in the list of edges following a node
1118 //
1119 // this information is encoded in a compact form as follows:
1120 //
1121 // +---------+-----------------+--------------+--------------------
1122 // |  varint | size (if != 1)  | label chars  | ... next edge ...
1123 // +---------+-----------------+--------------+--------------------
1124 //
1125 // - first comes a varint
1126 //     - the lowest bit of that varint is whether the edge is final (4)
//     - the second lowest bit of that varint is true if the length of
//       the label is 1 (1)
1129 //     - the rest of the varint is an offset that can be used to compute
1130 //       the offset of the target node of that edge (3)
1131 //  - if the size is not 1, the first varint is followed by a
1132 //    character encoding the number of characters of the label (1)
1133 //    (unicode character names aren't larger than 256 bytes, therefore each
1134 //    edge label can be at most 256 chars, but is usually smaller)
1135 //  - the next size bytes are the characters of the label (2)
1136 //
1137 // the offset of the target node is computed as follows: the number in the
1138 // upper bits of the varint needs to be added to the offset of the target node
1139 // of the previous edge. For the first edge, where there is no previous target
1140 // node, the offset of the first edge is used.
1141 // The intuition here is that edges going out from a node often lead to nodes
1142 // that are close by, leading to small offsets from the current node and thus
1143 // fewer bytes.
1144 //
1145 // There is a special case: if a final node has no outgoing edges, it has to be
1146 // followed by a 0 byte to indicate that there are no edges (because the end of
1147 // the edge list is normally indicated in a bit in the edge encoding). This is
1148 // indicated by _dawg_decode_edge returning -1
1149 
1150 
1151 static int
_dawg_decode_edge(bool is_first_edge,unsigned int prev_target_node_offset,unsigned int edge_offset,unsigned int * size,unsigned int * label_offset,unsigned int * target_node_offset)1152 _dawg_decode_edge(bool is_first_edge, unsigned int prev_target_node_offset,
1153                   unsigned int edge_offset, unsigned int* size,
1154                   unsigned int* label_offset, unsigned int* target_node_offset)
1155 {
1156     unsigned int num;
1157     edge_offset = _dawg_decode_varint_unsigned(edge_offset, &num);
1158     if (num == 0 && is_first_edge) {
1159         return -1; // trying to decode past a final node without outgoing edges
1160     }
1161     bool last_edge = num & 1;
1162     num >>= 1;
1163     bool len_is_one = num & 1;
1164     num >>= 1;
1165     *target_node_offset = prev_target_node_offset + num;
1166     if (len_is_one) {
1167         *size = 1;
1168     } else {
1169         *size = packed_name_dawg[edge_offset++];
1170     }
1171     *label_offset = edge_offset;
1172     return last_edge;
1173 }
1174 
/* Look `name` (of length `namelen`) up in the packed DAWG.
 *
 * Walks from the root node (offset 0), at each node scanning the edges in
 * order until one matches the next part of the name.  While walking, a
 * running position is accumulated: the descendant counts of the edges that
 * were skipped, plus one for every final node that was passed through.
 *
 * Returns that position if the whole name was consumed and the walk ended
 * on a final node, and -1 otherwise. */
static int
_lookup_dawg_packed(const char* name, unsigned int namelen)
{
    unsigned int consumed = 0;   /* characters of name matched so far */
    unsigned int node = 0;       /* offset of the current node */
    unsigned int position = 0;   /* number of names skipped so far */

    while (consumed < namelen) {
        bool node_is_final;
        unsigned int edge = _dawg_decode_node(node, &node_is_final);
        unsigned int prev_target = edge;
        bool followed = false;

        for (bool first = true; !followed; first = false) {
            unsigned int label_len;
            unsigned int label, target;
            int is_last = _dawg_decode_edge(first, prev_target, edge,
                                            &label_len, &label, &target);
            if (is_last == -1) {
                return -1;
            }
            prev_target = target;

            int verdict = _dawg_match_edge(name, namelen, label_len, label,
                                           consumed);
            if (verdict == -1) {
                return -1;
            }
            if (verdict != 0) {
                /* Take this edge; leaving a final node skips one name. */
                if (node_is_final) {
                    position += 1;
                }
                consumed += label_len;
                node = target;
                followed = true;
            }
            else if (is_last) {
                return -1;
            }
            else {
                /* Skip everything reachable through this edge and move on
                   to the next edge, which starts right after the label. */
                position += _dawg_node_descendant_count(target);
                edge = label + label_len;
            }
        }
    }

    return _dawg_node_is_final(node) ? (int)position : -1;
}
1220 
1221 static int
_inverse_dawg_lookup(char * buffer,unsigned int buflen,unsigned int pos)1222 _inverse_dawg_lookup(char* buffer, unsigned int buflen, unsigned int pos)
1223 {
1224     unsigned int node_offset = 0;
1225     unsigned int bufpos = 0;
1226     for (;;) {
1227         bool final;
1228         unsigned int edge_offset = _dawg_decode_node(node_offset, &final);
1229 
1230         if (final) {
1231             if (pos == 0) {
1232                 if (bufpos + 1 == buflen) {
1233                     return 0;
1234                 }
1235                 buffer[bufpos] = '\0';
1236                 return 1;
1237             }
1238             pos--;
1239         }
1240         unsigned int prev_target_node_offset = edge_offset;
1241         bool is_first_edge = true;
1242         for (;;) {
1243             unsigned int size;
1244             unsigned int label_offset, target_node_offset;
1245             int last_edge = _dawg_decode_edge(
1246                     is_first_edge, prev_target_node_offset, edge_offset,
1247                     &size, &label_offset, &target_node_offset);
1248             if (last_edge == -1) {
1249                 return 0;
1250             }
1251             is_first_edge = false;
1252             prev_target_node_offset = target_node_offset;
1253 
1254             unsigned int descendant_count = _dawg_node_descendant_count(target_node_offset);
1255             if (pos < descendant_count) {
1256                 if (bufpos + size >= buflen) {
1257                     return 0; // buffer overflow
1258                 }
1259                 for (unsigned int i = 0; i < size; i++) {
1260                     buffer[bufpos++] = packed_name_dawg[label_offset++];
1261                 }
1262                 node_offset = target_node_offset;
1263                 break;
1264             } else if (!last_edge) {
1265                 pos -= descendant_count;
1266                 edge_offset = label_offset + size;
1267             } else {
1268                 return 0;
1269             }
1270         }
1271     }
1272 }
1273 
1274 
1275 static int
_getucname(PyObject * self,Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)1276 _getucname(PyObject *self,
1277            Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1278 {
1279     /* Find the name associated with the given code point.
1280      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1281      * that we are using for aliases and named sequences. */
1282     int offset;
1283 
1284     if (code >= 0x110000)
1285         return 0;
1286 
1287     /* XXX should we just skip all the code points in the PUAs here? */
1288     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1289         return 0;
1290 
1291     if (UCD_Check(self)) {
1292         /* in 3.2.0 there are no aliases and named sequences */
1293         const change_record *old;
1294         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1295             return 0;
1296         old = get_old_record(self, code);
1297         if (old->category_changed == 0) {
1298             /* unassigned */
1299             return 0;
1300         }
1301     }
1302 
1303     if (SBase <= code && code < SBase+SCount) {
1304         /* Hangul syllable. */
1305         int SIndex = code - SBase;
1306         int L = SIndex / NCount;
1307         int V = (SIndex % NCount) / TCount;
1308         int T = SIndex % TCount;
1309 
1310         if (buflen < 27)
1311             /* Worst case: HANGUL SYLLABLE <10chars>. */
1312             return 0;
1313         strcpy(buffer, "HANGUL SYLLABLE ");
1314         buffer += 16;
1315         strcpy(buffer, hangul_syllables[L][0]);
1316         buffer += strlen(hangul_syllables[L][0]);
1317         strcpy(buffer, hangul_syllables[V][1]);
1318         buffer += strlen(hangul_syllables[V][1]);
1319         strcpy(buffer, hangul_syllables[T][2]);
1320         buffer += strlen(hangul_syllables[T][2]);
1321         *buffer = '\0';
1322         return 1;
1323     }
1324 
1325     if (is_unified_ideograph(code)) {
1326         if (buflen < 28)
1327             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1328             return 0;
1329         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1330         return 1;
1331     }
1332 
1333     /* get position of codepoint in order of names in the dawg */
1334     offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
1335     offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
1336                                (code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
1337     if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
1338         return 0;
1339 
1340     assert(buflen >= 0);
1341     return _inverse_dawg_lookup(buffer, Py_SAFE_DOWNCAST(buflen, int, unsigned int), offset);
1342 }
1343 
1344 static int
capi_getucname(Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)1345 capi_getucname(Py_UCS4 code,
1346                char* buffer, int buflen,
1347                int with_alias_and_seq)
1348 {
1349     return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
1350 
1351 }
1352 
1353 static void
find_syllable(const char * str,int * len,int * pos,int count,int column)1354 find_syllable(const char *str, int *len, int *pos, int count, int column)
1355 {
1356     int i, len1;
1357     *len = -1;
1358     for (i = 0; i < count; i++) {
1359         const char *s = hangul_syllables[i][column];
1360         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1361         if (len1 <= *len)
1362             continue;
1363         if (strncmp(str, s, len1) == 0) {
1364             *len = len1;
1365             *pos = i;
1366         }
1367     }
1368     if (*len == -1) {
1369         *len = 0;
1370     }
1371 }
1372 
1373 static int
_check_alias_and_seq(Py_UCS4 * code,int with_named_seq)1374 _check_alias_and_seq(Py_UCS4* code, int with_named_seq)
1375 {
1376     /* check if named sequences are allowed */
1377     if (!with_named_seq && IS_NAMED_SEQ(*code))
1378         return 0;
1379     /* if the code point is in the PUA range that we use for aliases,
1380      * convert it to obtain the right code point */
1381     if (IS_ALIAS(*code))
1382         *code = name_aliases[*code-aliases_start];
1383     return 1;
1384 }
1385 
1386 
1387 static int
_getcode(const char * name,int namelen,Py_UCS4 * code)1388 _getcode(const char* name, int namelen, Py_UCS4* code)
1389 {
1390     /* Return the code point associated with the given name.
1391      * Named aliases are not resolved, they are returned as a code point in the
1392      * PUA */
1393 
1394     /* Check for hangul syllables. */
1395     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1396         int len, L = -1, V = -1, T = -1;
1397         const char *pos = name + 16;
1398         find_syllable(pos, &len, &L, LCount, 0);
1399         pos += len;
1400         find_syllable(pos, &len, &V, VCount, 1);
1401         pos += len;
1402         find_syllable(pos, &len, &T, TCount, 2);
1403         pos += len;
1404         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1405             *code = SBase + (L*VCount+V)*TCount + T;
1406             return 1;
1407         }
1408         /* Otherwise, it's an illegal syllable name. */
1409         return 0;
1410     }
1411 
1412     /* Check for unified ideographs. */
1413     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1414         /* Four or five hexdigits must follow. */
1415         unsigned int v;
1416         v = 0;
1417         name += 22;
1418         namelen -= 22;
1419         if (namelen != 4 && namelen != 5)
1420             return 0;
1421         while (namelen--) {
1422             v *= 16;
1423             if (*name >= '0' && *name <= '9')
1424                 v += *name - '0';
1425             else if (*name >= 'A' && *name <= 'F')
1426                 v += *name - 'A' + 10;
1427             else
1428                 return 0;
1429             name++;
1430         }
1431         if (!is_unified_ideograph(v))
1432             return 0;
1433         *code = v;
1434         return 1;
1435     }
1436 
1437     assert(namelen >= 0);
1438     int position = _lookup_dawg_packed(name, Py_SAFE_DOWNCAST(namelen, int, unsigned int));
1439     if (position < 0) {
1440         return 0;
1441     }
1442     *code = dawg_pos_to_codepoint[position];
1443     return 1;
1444 }
1445 
1446 
1447 static int
capi_getcode(const char * name,int namelen,Py_UCS4 * code,int with_named_seq)1448 capi_getcode(const char* name, int namelen, Py_UCS4* code,
1449              int with_named_seq)
1450 {
1451     if (!_getcode(name, namelen, code)) {
1452         return 0;
1453     }
1454     return _check_alias_and_seq(code, with_named_seq);
1455 }
1456 
1457 static void
unicodedata_destroy_capi(PyObject * capsule)1458 unicodedata_destroy_capi(PyObject *capsule)
1459 {
1460     void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1461     PyMem_Free(capi);
1462 }
1463 
1464 static PyObject *
unicodedata_create_capi(void)1465 unicodedata_create_capi(void)
1466 {
1467     _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1468     if (capi == NULL) {
1469         PyErr_NoMemory();
1470         return NULL;
1471     }
1472     capi->getname = capi_getucname;
1473     capi->getcode = capi_getcode;
1474 
1475     PyObject *capsule = PyCapsule_New(capi,
1476                                       PyUnicodeData_CAPSULE_NAME,
1477                                       unicodedata_destroy_capi);
1478     if (capsule == NULL) {
1479         PyMem_Free(capi);
1480     }
1481     return capsule;
1482 };
1483 
1484 
1485 /* -------------------------------------------------------------------- */
1486 /* Python bindings */
1487 
1488 /*[clinic input]
1489 unicodedata.UCD.name
1490 
1491     self: self
1492     chr: int(accept={str})
1493     default: object=NULL
1494     /
1495 
1496 Returns the name assigned to the character chr as a string.
1497 
1498 If no name is defined, default is returned, or, if not given,
1499 ValueError is raised.
1500 [clinic start generated code]*/
1501 
1502 static PyObject *
unicodedata_UCD_name_impl(PyObject * self,int chr,PyObject * default_value)1503 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1504 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1505 {
1506     char name[NAME_MAXLEN+1];
1507     Py_UCS4 c = (Py_UCS4)chr;
1508 
1509     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1510         if (default_value == NULL) {
1511             PyErr_SetString(PyExc_ValueError, "no such name");
1512             return NULL;
1513         }
1514         else {
1515             return Py_NewRef(default_value);
1516         }
1517     }
1518 
1519     return PyUnicode_FromString(name);
1520 }
1521 
1522 /*[clinic input]
1523 unicodedata.UCD.lookup
1524 
1525     self: self
1526     name: str(accept={str, robuffer}, zeroes=True)
1527     /
1528 
1529 Look up character by name.
1530 
1531 If a character with the given name is found, return the
1532 corresponding character.  If not found, KeyError is raised.
1533 [clinic start generated code]*/
1534 
1535 static PyObject *
unicodedata_UCD_lookup_impl(PyObject * self,const char * name,Py_ssize_t name_length)1536 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1537                             Py_ssize_t name_length)
1538 /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
1539 {
1540     Py_UCS4 code;
1541     unsigned int index;
1542     if (name_length > NAME_MAXLEN) {
1543         PyErr_SetString(PyExc_KeyError, "name too long");
1544         return NULL;
1545     }
1546 
1547     if (!_getcode(name, (int)name_length, &code)) {
1548         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1549         return NULL;
1550     }
1551     if (UCD_Check(self)) {
1552         /* in 3.2.0 there are no aliases and named sequences */
1553         if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) {
1554             PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1555             return 0;
1556         }
1557     }
1558     /* check if code is in the PUA range that we use for named sequences
1559        and convert it */
1560     if (IS_NAMED_SEQ(code)) {
1561         index = code-named_sequences_start;
1562         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1563                                          named_sequences[index].seq,
1564                                          named_sequences[index].seqlen);
1565     }
1566     if (IS_ALIAS(code)) {
1567         code = name_aliases[code-aliases_start];
1568     }
1569     return PyUnicode_FromOrdinal(code);
1570 }
1571 
1572 // List of functions used to define module functions *AND* unicodedata.UCD
1573 // methods. For module functions, self is the module. For UCD methods, self
1574 // is an UCD instance. The UCD_Check() macro is used to check if self is
1575 // an UCD instance.
1576 static PyMethodDef unicodedata_functions[] = {
1577     UNICODEDATA_UCD_DECIMAL_METHODDEF
1578     UNICODEDATA_UCD_DIGIT_METHODDEF
1579     UNICODEDATA_UCD_NUMERIC_METHODDEF
1580     UNICODEDATA_UCD_CATEGORY_METHODDEF
1581     UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1582     UNICODEDATA_UCD_COMBINING_METHODDEF
1583     UNICODEDATA_UCD_MIRRORED_METHODDEF
1584     UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1585     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1586     UNICODEDATA_UCD_NAME_METHODDEF
1587     UNICODEDATA_UCD_LOOKUP_METHODDEF
1588     UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1589     UNICODEDATA_UCD_NORMALIZE_METHODDEF
1590     {NULL, NULL}                /* sentinel */
1591 };
1592 
1593 static int
ucd_traverse(PreviousDBVersion * self,visitproc visit,void * arg)1594 ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
1595 {
1596     Py_VISIT(Py_TYPE(self));
1597     return 0;
1598 }
1599 
1600 static void
ucd_dealloc(PreviousDBVersion * self)1601 ucd_dealloc(PreviousDBVersion *self)
1602 {
1603     PyTypeObject *tp = Py_TYPE(self);
1604     PyObject_GC_UnTrack(self);
1605     PyObject_GC_Del(self);
1606     Py_DECREF(tp);
1607 }
1608 
1609 static PyType_Slot ucd_type_slots[] = {
1610     {Py_tp_dealloc, ucd_dealloc},
1611     {Py_tp_traverse, ucd_traverse},
1612     {Py_tp_getattro, PyObject_GenericGetAttr},
1613     {Py_tp_methods, unicodedata_functions},
1614     {Py_tp_members, DB_members},
1615     {0, 0}
1616 };
1617 
1618 static PyType_Spec ucd_type_spec = {
1619     .name = "unicodedata.UCD",
1620     .basicsize = sizeof(PreviousDBVersion),
1621     .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
1622               Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
1623     .slots = ucd_type_slots
1624 };
1625 
1626 PyDoc_STRVAR(unicodedata_docstring,
1627 "This module provides access to the Unicode Character Database which\n\
1628 defines character properties for all Unicode characters. The data in\n\
1629 this database is based on the UnicodeData.txt file version\n\
1630 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1631 \n\
1632 The module uses the same names and symbols as defined by the\n\
1633 UnicodeData File Format " UNIDATA_VERSION ".");
1634 
1635 static int
unicodedata_exec(PyObject * module)1636 unicodedata_exec(PyObject *module)
1637 {
1638     if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1639         return -1;
1640     }
1641 
1642     PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1643     if (ucd_type == NULL) {
1644         return -1;
1645     }
1646 
1647     if (PyModule_AddType(module, ucd_type) < 0) {
1648         Py_DECREF(ucd_type);
1649         return -1;
1650     }
1651 
1652     // Unicode database version 3.2.0 used by the IDNA encoding
1653     PyObject *v;
1654     v = new_previous_version(ucd_type, "3.2.0",
1655                              get_change_3_2_0, normalization_3_2_0);
1656     Py_DECREF(ucd_type);
1657     if (PyModule_Add(module, "ucd_3_2_0", v) < 0) {
1658         return -1;
1659     }
1660 
1661     /* Export C API */
1662     if (PyModule_Add(module, "_ucnhash_CAPI", unicodedata_create_capi()) < 0) {
1663         return -1;
1664     }
1665     return 0;
1666 }
1667 
1668 static PyModuleDef_Slot unicodedata_slots[] = {
1669     {Py_mod_exec, unicodedata_exec},
1670     {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
1671     {Py_mod_gil, Py_MOD_GIL_NOT_USED},
1672     {0, NULL}
1673 };
1674 
1675 static struct PyModuleDef unicodedata_module = {
1676     PyModuleDef_HEAD_INIT,
1677     .m_name = "unicodedata",
1678     .m_doc = unicodedata_docstring,
1679     .m_size = 0,
1680     .m_methods = unicodedata_functions,
1681     .m_slots = unicodedata_slots,
1682 };
1683 
1684 PyMODINIT_FUNC
PyInit_unicodedata(void)1685 PyInit_unicodedata(void)
1686 {
1687     return PyModuleDef_Init(&unicodedata_module);
1688 }
1689 
1690 
1691 /*
1692 Local variables:
1693 c-basic-offset: 4
1694 indent-tabs-mode: nil
1695 End:
1696 */
1697