• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ------------------------------------------------------------------------
2 
3    unicodedata -- Provides access to the Unicode database.
4 
5    Data was extracted from the UnicodeData.txt file.
6    The current version number is reported in the unidata_version constant.
7 
8    Written by Marc-Andre Lemburg (mal@lemburg.com).
9    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10    Modified by Martin v. Löwis (martin@v.loewis.de)
11 
12    Copyright (c) Corporation for National Research Initiatives.
13 
14    ------------------------------------------------------------------------ */
15 
16 #define PY_SSIZE_T_CLEAN
17 
18 #include "Python.h"
19 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
20 #include "structmember.h"         // PyMemberDef
21 
22 #include <stdbool.h>
23 
/* Identifiers for the four normalization form names; compared against
   the `form` argument of is_normalized() and normalize(). */
_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC);
_Py_IDENTIFIER(NFKD);
28 
29 /*[clinic input]
30 module unicodedata
31 class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
32 [clinic start generated code]*/
33 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
34 
35 /* character properties */
36 
/* One entry of the character property table.  Entries are reached
   through _getrecord_ex(), which maps a code point to a record. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
48 
/* Per-code-point differences between the current database and a previous
   database version.

   For the unsigned char fields the accessors in this file treat 0xFF as
   "no override, use the current value".  category_changed == 0 is treated
   as "code point was unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
58 
59 /* data file generated by Tools/unicode/makeunicodedata.py */
60 #include "unicodedata_db.h"
61 
62 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)63 _getrecord_ex(Py_UCS4 code)
64 {
65     int index;
66     if (code >= 0x110000)
67         index = 0;
68     else {
69         index = index1[(code>>SHIFT)];
70         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71     }
72 
73     return &_PyUnicode_Database_Records[index];
74 }
75 
76 /* ------------- Previous-version API ------------------------------------- */
77 typedef struct previous_version {
78     PyObject_HEAD
79     const char *name;
80     const change_record* (*getrecord)(Py_UCS4);
81     Py_UCS4 (*normalization)(Py_UCS4);
82 } PreviousDBVersion;
83 
84 #include "clinic/unicodedata.c.h"
85 
/* Fetch the change record of code point `v` from the PreviousDBVersion
   object `self`.  The caller must have established that `self` really is
   such an object (see UCD_Check). */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
87 
88 static PyMemberDef DB_members[] = {
89         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
90         {NULL}
91 };
92 
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
// See the unicodedata_functions comment for the rationale of this macro.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
98 
99 static PyObject*
new_previous_version(PyTypeObject * ucd_type,const char * name,const change_record * (* getrecord)(Py_UCS4),Py_UCS4 (* normalization)(Py_UCS4))100 new_previous_version(PyTypeObject *ucd_type,
101                      const char*name, const change_record* (*getrecord)(Py_UCS4),
102                      Py_UCS4 (*normalization)(Py_UCS4))
103 {
104     PreviousDBVersion *self;
105     self = PyObject_GC_New(PreviousDBVersion, ucd_type);
106     if (self == NULL)
107         return NULL;
108     self->name = name;
109     self->getrecord = getrecord;
110     self->normalization = normalization;
111     PyObject_GC_Track(self);
112     return (PyObject*)self;
113 }
114 
115 
116 /* --- Module API --------------------------------------------------------- */
117 
118 /*[clinic input]
119 unicodedata.UCD.decimal
120 
121     self: self
122     chr: int(accept={str})
123     default: object=NULL
124     /
125 
126 Converts a Unicode character into its equivalent decimal value.
127 
128 Returns the decimal value assigned to the character chr as integer.
129 If no such value is defined, default is returned, or, if not given,
130 ValueError is raised.
131 [clinic start generated code]*/
132 
133 static PyObject *
unicodedata_UCD_decimal_impl(PyObject * self,int chr,PyObject * default_value)134 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
135                              PyObject *default_value)
136 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
137 {
138     int have_old = 0;
139     long rc;
140     Py_UCS4 c = (Py_UCS4)chr;
141 
142     if (UCD_Check(self)) {
143         const change_record *old = get_old_record(self, c);
144         if (old->category_changed == 0) {
145             /* unassigned */
146             have_old = 1;
147             rc = -1;
148         }
149         else if (old->decimal_changed != 0xFF) {
150             have_old = 1;
151             rc = old->decimal_changed;
152         }
153     }
154 
155     if (!have_old)
156         rc = Py_UNICODE_TODECIMAL(c);
157     if (rc < 0) {
158         if (default_value == NULL) {
159             PyErr_SetString(PyExc_ValueError,
160                             "not a decimal");
161             return NULL;
162         }
163         else {
164             Py_INCREF(default_value);
165             return default_value;
166         }
167     }
168     return PyLong_FromLong(rc);
169 }
170 
171 /*[clinic input]
172 unicodedata.UCD.digit
173 
174     self: self
175     chr: int(accept={str})
176     default: object=NULL
177     /
178 
179 Converts a Unicode character into its equivalent digit value.
180 
181 Returns the digit value assigned to the character chr as integer.
182 If no such value is defined, default is returned, or, if not given,
183 ValueError is raised.
184 [clinic start generated code]*/
185 
186 static PyObject *
unicodedata_UCD_digit_impl(PyObject * self,int chr,PyObject * default_value)187 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
188 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
189 {
190     long rc;
191     Py_UCS4 c = (Py_UCS4)chr;
192     rc = Py_UNICODE_TODIGIT(c);
193     if (rc < 0) {
194         if (default_value == NULL) {
195             PyErr_SetString(PyExc_ValueError, "not a digit");
196             return NULL;
197         }
198         else {
199             Py_INCREF(default_value);
200             return default_value;
201         }
202     }
203     return PyLong_FromLong(rc);
204 }
205 
206 /*[clinic input]
207 unicodedata.UCD.numeric
208 
209     self: self
210     chr: int(accept={str})
211     default: object=NULL
212     /
213 
214 Converts a Unicode character into its equivalent numeric value.
215 
216 Returns the numeric value assigned to the character chr as float.
217 If no such value is defined, default is returned, or, if not given,
218 ValueError is raised.
219 [clinic start generated code]*/
220 
221 static PyObject *
unicodedata_UCD_numeric_impl(PyObject * self,int chr,PyObject * default_value)222 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
223                              PyObject *default_value)
224 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
225 {
226     int have_old = 0;
227     double rc;
228     Py_UCS4 c = (Py_UCS4)chr;
229 
230     if (UCD_Check(self)) {
231         const change_record *old = get_old_record(self, c);
232         if (old->category_changed == 0) {
233             /* unassigned */
234             have_old = 1;
235             rc = -1.0;
236         }
237         else if (old->decimal_changed != 0xFF) {
238             have_old = 1;
239             rc = old->decimal_changed;
240         }
241     }
242 
243     if (!have_old)
244         rc = Py_UNICODE_TONUMERIC(c);
245     if (rc == -1.0) {
246         if (default_value == NULL) {
247             PyErr_SetString(PyExc_ValueError, "not a numeric character");
248             return NULL;
249         }
250         else {
251             Py_INCREF(default_value);
252             return default_value;
253         }
254     }
255     return PyFloat_FromDouble(rc);
256 }
257 
258 /*[clinic input]
259 unicodedata.UCD.category
260 
261     self: self
262     chr: int(accept={str})
263     /
264 
265 Returns the general category assigned to the character chr as string.
266 [clinic start generated code]*/
267 
268 static PyObject *
unicodedata_UCD_category_impl(PyObject * self,int chr)269 unicodedata_UCD_category_impl(PyObject *self, int chr)
270 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
271 {
272     int index;
273     Py_UCS4 c = (Py_UCS4)chr;
274     index = (int) _getrecord_ex(c)->category;
275     if (UCD_Check(self)) {
276         const change_record *old = get_old_record(self, c);
277         if (old->category_changed != 0xFF)
278             index = old->category_changed;
279     }
280     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
281 }
282 
283 /*[clinic input]
284 unicodedata.UCD.bidirectional
285 
286     self: self
287     chr: int(accept={str})
288     /
289 
290 Returns the bidirectional class assigned to the character chr as string.
291 
292 If no such value is defined, an empty string is returned.
293 [clinic start generated code]*/
294 
295 static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject * self,int chr)296 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
297 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
298 {
299     int index;
300     Py_UCS4 c = (Py_UCS4)chr;
301     index = (int) _getrecord_ex(c)->bidirectional;
302     if (UCD_Check(self)) {
303         const change_record *old = get_old_record(self, c);
304         if (old->category_changed == 0)
305             index = 0; /* unassigned */
306         else if (old->bidir_changed != 0xFF)
307             index = old->bidir_changed;
308     }
309     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
310 }
311 
312 /*[clinic input]
313 unicodedata.UCD.combining -> int
314 
315     self: self
316     chr: int(accept={str})
317     /
318 
319 Returns the canonical combining class assigned to the character chr as integer.
320 
321 Returns 0 if no combining class is defined.
322 [clinic start generated code]*/
323 
324 static int
unicodedata_UCD_combining_impl(PyObject * self,int chr)325 unicodedata_UCD_combining_impl(PyObject *self, int chr)
326 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
327 {
328     int index;
329     Py_UCS4 c = (Py_UCS4)chr;
330     index = (int) _getrecord_ex(c)->combining;
331     if (UCD_Check(self)) {
332         const change_record *old = get_old_record(self, c);
333         if (old->category_changed == 0)
334             index = 0; /* unassigned */
335     }
336     return index;
337 }
338 
339 /*[clinic input]
340 unicodedata.UCD.mirrored -> int
341 
342     self: self
343     chr: int(accept={str})
344     /
345 
346 Returns the mirrored property assigned to the character chr as integer.
347 
348 Returns 1 if the character has been identified as a "mirrored"
349 character in bidirectional text, 0 otherwise.
350 [clinic start generated code]*/
351 
352 static int
unicodedata_UCD_mirrored_impl(PyObject * self,int chr)353 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
354 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
355 {
356     int index;
357     Py_UCS4 c = (Py_UCS4)chr;
358     index = (int) _getrecord_ex(c)->mirrored;
359     if (UCD_Check(self)) {
360         const change_record *old = get_old_record(self, c);
361         if (old->category_changed == 0)
362             index = 0; /* unassigned */
363         else if (old->mirrored_changed != 0xFF)
364             index = old->mirrored_changed;
365     }
366     return index;
367 }
368 
369 /*[clinic input]
370 unicodedata.UCD.east_asian_width
371 
372     self: self
373     chr: int(accept={str})
374     /
375 
376 Returns the east asian width assigned to the character chr as string.
377 [clinic start generated code]*/
378 
379 static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject * self,int chr)380 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
381 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
382 {
383     int index;
384     Py_UCS4 c = (Py_UCS4)chr;
385     index = (int) _getrecord_ex(c)->east_asian_width;
386     if (UCD_Check(self)) {
387         const change_record *old = get_old_record(self, c);
388         if (old->category_changed == 0)
389             index = 0; /* unassigned */
390         else if (old->east_asian_width_changed != 0xFF)
391             index = old->east_asian_width_changed;
392     }
393     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
394 }
395 
396 /*[clinic input]
397 unicodedata.UCD.decomposition
398 
399     self: self
400     chr: int(accept={str})
401     /
402 
403 Returns the character decomposition mapping assigned to the character chr as string.
404 
405 An empty string is returned in case no such mapping is defined.
406 [clinic start generated code]*/
407 
408 static PyObject *
unicodedata_UCD_decomposition_impl(PyObject * self,int chr)409 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
410 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
411 {
412     char decomp[256];
413     int code, index, count;
414     size_t i;
415     unsigned int prefix_index;
416     Py_UCS4 c = (Py_UCS4)chr;
417 
418     code = (int)c;
419 
420     if (UCD_Check(self)) {
421         const change_record *old = get_old_record(self, c);
422         if (old->category_changed == 0)
423             return PyUnicode_FromString(""); /* unassigned */
424     }
425 
426     if (code < 0 || code >= 0x110000)
427         index = 0;
428     else {
429         index = decomp_index1[(code>>DECOMP_SHIFT)];
430         index = decomp_index2[(index<<DECOMP_SHIFT)+
431                              (code&((1<<DECOMP_SHIFT)-1))];
432     }
433 
434     /* high byte is number of hex bytes (usually one or two), low byte
435        is prefix code (from*/
436     count = decomp_data[index] >> 8;
437 
438     /* XXX: could allocate the PyString up front instead
439        (strlen(prefix) + 5 * count + 1 bytes) */
440 
441     /* Based on how index is calculated above and decomp_data is generated
442        from Tools/unicode/makeunicodedata.py, it should not be possible
443        to overflow decomp_prefix. */
444     prefix_index = decomp_data[index] & 255;
445     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
446 
447     /* copy prefix */
448     i = strlen(decomp_prefix[prefix_index]);
449     memcpy(decomp, decomp_prefix[prefix_index], i);
450 
451     while (count-- > 0) {
452         if (i)
453             decomp[i++] = ' ';
454         assert(i < sizeof(decomp));
455         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
456                       decomp_data[++index]);
457         i += strlen(decomp + i);
458     }
459     return PyUnicode_FromStringAndSize(decomp, i);
460 }
461 
462 static void
get_decomp_record(PyObject * self,Py_UCS4 code,int * index,int * prefix,int * count)463 get_decomp_record(PyObject *self, Py_UCS4 code,
464                   int *index, int *prefix, int *count)
465 {
466     if (code >= 0x110000) {
467         *index = 0;
468     }
469     else if (UCD_Check(self)
470              && get_old_record(self, code)->category_changed==0) {
471         /* unassigned in old version */
472         *index = 0;
473     }
474     else {
475         *index = decomp_index1[(code>>DECOMP_SHIFT)];
476         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
477                                (code&((1<<DECOMP_SHIFT)-1))];
478     }
479 
480     /* high byte is number of hex bytes (usually one or two), low byte
481        is prefix code (from*/
482     *count = decomp_data[*index] >> 8;
483     *prefix = decomp_data[*index] & 255;
484 
485     (*index)++;
486 }
487 
/* Constants for algorithmic Hangul (de)composition as used by nfd_nfkd()
   and nfc_nfkc(): base code points of the syllable, leading consonant,
   vowel and trailing consonant ranges, and the size of each range. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
497 
498 static PyObject*
nfd_nfkd(PyObject * self,PyObject * input,int k)499 nfd_nfkd(PyObject *self, PyObject *input, int k)
500 {
501     PyObject *result;
502     Py_UCS4 *output;
503     Py_ssize_t i, o, osize;
504     int kind;
505     const void *data;
506     /* Longest decomposition in Unicode 3.2: U+FDFA */
507     Py_UCS4 stack[20];
508     Py_ssize_t space, isize;
509     int index, prefix, count, stackptr;
510     unsigned char prev, cur;
511 
512     stackptr = 0;
513     isize = PyUnicode_GET_LENGTH(input);
514     space = isize;
515     /* Overallocate at most 10 characters. */
516     if (space > 10) {
517         if (space <= PY_SSIZE_T_MAX - 10)
518             space += 10;
519     }
520     else {
521         space *= 2;
522     }
523     osize = space;
524     output = PyMem_NEW(Py_UCS4, space);
525     if (!output) {
526         PyErr_NoMemory();
527         return NULL;
528     }
529     i = o = 0;
530     kind = PyUnicode_KIND(input);
531     data = PyUnicode_DATA(input);
532 
533     while (i < isize) {
534         stack[stackptr++] = PyUnicode_READ(kind, data, i++);
535         while(stackptr) {
536             Py_UCS4 code = stack[--stackptr];
537             /* Hangul Decomposition adds three characters in
538                a single step, so we need at least that much room. */
539             if (space < 3) {
540                 Py_UCS4 *new_output;
541                 osize += 10;
542                 space += 10;
543                 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
544                 if (new_output == NULL) {
545                     PyMem_Free(output);
546                     PyErr_NoMemory();
547                     return NULL;
548                 }
549                 output = new_output;
550             }
551             /* Hangul Decomposition. */
552             if (SBase <= code && code < (SBase+SCount)) {
553                 int SIndex = code - SBase;
554                 int L = LBase + SIndex / NCount;
555                 int V = VBase + (SIndex % NCount) / TCount;
556                 int T = TBase + SIndex % TCount;
557                 output[o++] = L;
558                 output[o++] = V;
559                 space -= 2;
560                 if (T != TBase) {
561                     output[o++] = T;
562                     space --;
563                 }
564                 continue;
565             }
566             /* normalization changes */
567             if (UCD_Check(self)) {
568                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
569                 if (value != 0) {
570                     stack[stackptr++] = value;
571                     continue;
572                 }
573             }
574 
575             /* Other decompositions. */
576             get_decomp_record(self, code, &index, &prefix, &count);
577 
578             /* Copy character if it is not decomposable, or has a
579                compatibility decomposition, but we do NFD. */
580             if (!count || (prefix && !k)) {
581                 output[o++] = code;
582                 space--;
583                 continue;
584             }
585             /* Copy decomposition onto the stack, in reverse
586                order.  */
587             while(count) {
588                 code = decomp_data[index + (--count)];
589                 stack[stackptr++] = code;
590             }
591         }
592     }
593 
594     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
595                                        output, o);
596     PyMem_Free(output);
597     if (!result)
598         return NULL;
599     /* result is guaranteed to be ready, as it is compact. */
600     kind = PyUnicode_KIND(result);
601     data = PyUnicode_DATA(result);
602 
603     /* Sort canonically. */
604     i = 0;
605     prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
606     for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
607         cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
608         if (prev == 0 || cur == 0 || prev <= cur) {
609             prev = cur;
610             continue;
611         }
612         /* Non-canonical order. Need to switch *i with previous. */
613         o = i - 1;
614         while (1) {
615             Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
616             PyUnicode_WRITE(kind, data, o+1,
617                             PyUnicode_READ(kind, data, o));
618             PyUnicode_WRITE(kind, data, o, tmp);
619             o--;
620             if (o < 0)
621                 break;
622             prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
623             if (prev == 0 || prev <= cur)
624                 break;
625         }
626         prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
627     }
628     return result;
629 }
630 
631 static int
find_nfc_index(const struct reindex * nfc,Py_UCS4 code)632 find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
633 {
634     unsigned int index;
635     for (index = 0; nfc[index].start; index++) {
636         unsigned int start = nfc[index].start;
637         if (code < start)
638             return -1;
639         if (code <= start + nfc[index].count) {
640             unsigned int delta = code - start;
641             return nfc[index].index + delta;
642         }
643     }
644     return -1;
645 }
646 
647 static PyObject*
nfc_nfkc(PyObject * self,PyObject * input,int k)648 nfc_nfkc(PyObject *self, PyObject *input, int k)
649 {
650     PyObject *result;
651     int kind;
652     const void *data;
653     Py_UCS4 *output;
654     Py_ssize_t i, i1, o, len;
655     int f,l,index,index1,comb;
656     Py_UCS4 code;
657     Py_ssize_t skipped[20];
658     int cskipped = 0;
659 
660     result = nfd_nfkd(self, input, k);
661     if (!result)
662         return NULL;
663     /* result will be "ready". */
664     kind = PyUnicode_KIND(result);
665     data = PyUnicode_DATA(result);
666     len = PyUnicode_GET_LENGTH(result);
667 
668     /* We allocate a buffer for the output.
669        If we find that we made no changes, we still return
670        the NFD result. */
671     output = PyMem_NEW(Py_UCS4, len);
672     if (!output) {
673         PyErr_NoMemory();
674         Py_DECREF(result);
675         return 0;
676     }
677     i = o = 0;
678 
679   again:
680     while (i < len) {
681       for (index = 0; index < cskipped; index++) {
682           if (skipped[index] == i) {
683               /* *i character is skipped.
684                  Remove from list. */
685               skipped[index] = skipped[cskipped-1];
686               cskipped--;
687               i++;
688               goto again; /* continue while */
689           }
690       }
691       /* Hangul Composition. We don't need to check for <LV,T>
692          pairs, since we always have decomposed data. */
693       code = PyUnicode_READ(kind, data, i);
694       if (LBase <= code && code < (LBase+LCount) &&
695           i + 1 < len &&
696           VBase <= PyUnicode_READ(kind, data, i+1) &&
697           PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
698           /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
699              and V character is a modern vowel (0x1161 ~ 0x1175). */
700           int LIndex, VIndex;
701           LIndex = code - LBase;
702           VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
703           code = SBase + (LIndex*VCount+VIndex)*TCount;
704           i+=2;
705           if (i < len &&
706               TBase < PyUnicode_READ(kind, data, i) &&
707               PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
708               /* check T character is a modern trailing consonant
709                  (0x11A8 ~ 0x11C2). */
710               code += PyUnicode_READ(kind, data, i)-TBase;
711               i++;
712           }
713           output[o++] = code;
714           continue;
715       }
716 
717       /* code is still input[i] here */
718       f = find_nfc_index(nfc_first, code);
719       if (f == -1) {
720           output[o++] = code;
721           i++;
722           continue;
723       }
724       /* Find next unblocked character. */
725       i1 = i+1;
726       comb = 0;
727       /* output base character for now; might be updated later. */
728       output[o] = PyUnicode_READ(kind, data, i);
729       while (i1 < len) {
730           Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
731           int comb1 = _getrecord_ex(code1)->combining;
732           if (comb) {
733               if (comb1 == 0)
734                   break;
735               if (comb >= comb1) {
736                   /* Character is blocked. */
737                   i1++;
738                   continue;
739               }
740           }
741           l = find_nfc_index(nfc_last, code1);
742           /* i1 cannot be combined with i. If i1
743              is a starter, we don't need to look further.
744              Otherwise, record the combining class. */
745           if (l == -1) {
746             not_combinable:
747               if (comb1 == 0)
748                   break;
749               comb = comb1;
750               i1++;
751               continue;
752           }
753           index = f*TOTAL_LAST + l;
754           index1 = comp_index[index >> COMP_SHIFT];
755           code = comp_data[(index1<<COMP_SHIFT)+
756                            (index&((1<<COMP_SHIFT)-1))];
757           if (code == 0)
758               goto not_combinable;
759 
760           /* Replace the original character. */
761           output[o] = code;
762           /* Mark the second character unused. */
763           assert(cskipped < 20);
764           skipped[cskipped++] = i1;
765           i1++;
766           f = find_nfc_index(nfc_first, output[o]);
767           if (f == -1)
768               break;
769       }
770       /* Output character was already written.
771          Just advance the indices. */
772       o++; i++;
773     }
774     if (o == len) {
775         /* No changes. Return original string. */
776         PyMem_Free(output);
777         return result;
778     }
779     Py_DECREF(result);
780     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
781                                        output, o);
782     PyMem_Free(output);
783     return result;
784 }
785 
// This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
// The numeric values matter: is_normalized_quickcheck() compares them
// against two-bit fields read from normalization_quick_check.
typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
789 
790 /* Run the Unicode normalization "quickcheck" algorithm.
791  *
792  * Return YES or NO if quickcheck determines the input is certainly
793  * normalized or certainly not, and MAYBE if quickcheck is unable to
794  * tell.
795  *
796  * If `yes_only` is true, then return MAYBE as soon as we determine
797  * the answer is not YES.
798  *
799  * For background and details on the algorithm, see UAX #15:
800  *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
801  */
802 static QuickcheckResult
is_normalized_quickcheck(PyObject * self,PyObject * input,bool nfc,bool k,bool yes_only)803 is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
804                          bool yes_only)
805 {
806     /* UCD 3.2.0 is requested, quickchecks must be disabled. */
807     if (UCD_Check(self)) {
808         return NO;
809     }
810 
811     Py_ssize_t i, len;
812     int kind;
813     const void *data;
814     unsigned char prev_combining = 0;
815 
816     /* The two quickcheck bits at this shift have type QuickcheckResult. */
817     int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
818 
819     QuickcheckResult result = YES; /* certainly normalized, unless we find something */
820 
821     i = 0;
822     kind = PyUnicode_KIND(input);
823     data = PyUnicode_DATA(input);
824     len = PyUnicode_GET_LENGTH(input);
825     while (i < len) {
826         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
827         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
828 
829         unsigned char combining = record->combining;
830         if (combining && prev_combining > combining)
831             return NO; /* non-canonical sort order, not normalized */
832         prev_combining = combining;
833 
834         unsigned char quickcheck_whole = record->normalization_quick_check;
835         if (yes_only) {
836             if (quickcheck_whole & (3 << quickcheck_shift))
837                 return MAYBE;
838         } else {
839             switch ((quickcheck_whole >> quickcheck_shift) & 3) {
840             case NO:
841               return NO;
842             case MAYBE:
843               result = MAYBE; /* this string might need normalization */
844             }
845         }
846     }
847     return result;
848 }
849 
850 /*[clinic input]
851 unicodedata.UCD.is_normalized
852 
853     self: self
854     form: unicode
855     unistr as input: unicode
856     /
857 
858 Return whether the Unicode string unistr is in the normal form 'form'.
859 
860 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
861 [clinic start generated code]*/
862 
863 static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject * self,PyObject * form,PyObject * input)864 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
865                                    PyObject *input)
866 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
867 {
868     if (PyUnicode_READY(input) == -1) {
869         return NULL;
870     }
871 
872     if (PyUnicode_GET_LENGTH(input) == 0) {
873         /* special case empty input strings. */
874         Py_RETURN_TRUE;
875     }
876 
877     PyObject *result;
878     bool nfc = false;
879     bool k = false;
880     QuickcheckResult m;
881 
882     PyObject *cmp;
883     int match = 0;
884 
885     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
886         nfc = true;
887     }
888     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
889         nfc = true;
890         k = true;
891     }
892     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
893         /* matches default values for `nfc` and `k` */
894     }
895     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
896         k = true;
897     }
898     else {
899         PyErr_SetString(PyExc_ValueError, "invalid normalization form");
900         return NULL;
901     }
902 
903     m = is_normalized_quickcheck(self, input, nfc, k, false);
904 
905     if (m == MAYBE) {
906         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
907         if (cmp == NULL) {
908             return NULL;
909         }
910         match = PyUnicode_Compare(input, cmp);
911         Py_DECREF(cmp);
912         result = (match == 0) ? Py_True : Py_False;
913     }
914     else {
915         result = (m == YES) ? Py_True : Py_False;
916     }
917 
918     Py_INCREF(result);
919     return result;
920 }
921 
922 
923 /*[clinic input]
924 unicodedata.UCD.normalize
925 
926     self: self
927     form: unicode
928     unistr as input: unicode
929     /
930 
931 Return the normal form 'form' for the Unicode string unistr.
932 
933 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
934 [clinic start generated code]*/
935 
936 static PyObject *
unicodedata_UCD_normalize_impl(PyObject * self,PyObject * form,PyObject * input)937 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
938                                PyObject *input)
939 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
940 {
941     if (PyUnicode_GET_LENGTH(input) == 0) {
942         /* Special case empty input strings, since resizing
943            them  later would cause internal errors. */
944         Py_INCREF(input);
945         return input;
946     }
947 
948     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
949         if (is_normalized_quickcheck(self, input,
950                                      true,  false, true) == YES) {
951             Py_INCREF(input);
952             return input;
953         }
954         return nfc_nfkc(self, input, 0);
955     }
956     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
957         if (is_normalized_quickcheck(self, input,
958                                      true,  true,  true) == YES) {
959             Py_INCREF(input);
960             return input;
961         }
962         return nfc_nfkc(self, input, 1);
963     }
964     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
965         if (is_normalized_quickcheck(self, input,
966                                      false, false, true) == YES) {
967             Py_INCREF(input);
968             return input;
969         }
970         return nfd_nfkd(self, input, 0);
971     }
972     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
973         if (is_normalized_quickcheck(self, input,
974                                      false, true,  true) == YES) {
975             Py_INCREF(input);
976             return input;
977         }
978         return nfd_nfkd(self, input, 1);
979     }
980     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
981     return NULL;
982 }
983 
984 /* -------------------------------------------------------------------- */
985 /* unicode character name tables */
986 
987 /* data file generated by Tools/unicode/makeunicodedata.py */
988 #include "unicodename_db.h"
989 
990 /* -------------------------------------------------------------------- */
991 /* database code (cut and pasted from the unidb package) */
992 
static unsigned long
_gethash(const char *s, int len, int scale)
{
    /* Hash the first `len` bytes of `s`, upper-casing each byte first so
       that the result does not depend on letter case.  `scale` is the
       multiplier applied to the running hash before each byte is added. */
    unsigned long hash = 0;

    for (int pos = 0; pos < len; pos++) {
        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);

        /* Whenever bits 24-31 become set, fold them back into the low
           byte and keep only the low 24 bits. */
        const unsigned long high = hash & 0xff000000;
        if (high != 0) {
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        }
    }
    return hash;
}
1007 
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Each row holds one fragment per column; the three columns have
   different lengths, so the unused trailing cells of the shorter
   columns are NULL. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
1038 
1039 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1040 static int
is_unified_ideograph(Py_UCS4 code)1041 is_unified_ideograph(Py_UCS4 code)
1042 {
1043     return
1044         (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
1045         (0x4E00 <= code && code <= 0x9FFC)   || /* CJK Ideograph */
1046         (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
1047         (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
1048         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1049         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1050         (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1051         (0x30000 <= code && code <= 0x3134A);   /* CJK Ideograph Extension G */
1052 }
1053 
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  Both ranges are
 * half-open: the start value is included, the end value is not.
 * The argument is parenthesized so that any expression may be passed; it is
 * evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1059 
1060 static int
_getucname(PyObject * self,Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)1061 _getucname(PyObject *self,
1062            Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1063 {
1064     /* Find the name associated with the given code point.
1065      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1066      * that we are using for aliases and named sequences. */
1067     int offset;
1068     int i;
1069     int word;
1070     const unsigned char* w;
1071 
1072     if (code >= 0x110000)
1073         return 0;
1074 
1075     /* XXX should we just skip all the code points in the PUAs here? */
1076     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1077         return 0;
1078 
1079     if (UCD_Check(self)) {
1080         /* in 3.2.0 there are no aliases and named sequences */
1081         const change_record *old;
1082         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1083             return 0;
1084         old = get_old_record(self, code);
1085         if (old->category_changed == 0) {
1086             /* unassigned */
1087             return 0;
1088         }
1089     }
1090 
1091     if (SBase <= code && code < SBase+SCount) {
1092         /* Hangul syllable. */
1093         int SIndex = code - SBase;
1094         int L = SIndex / NCount;
1095         int V = (SIndex % NCount) / TCount;
1096         int T = SIndex % TCount;
1097 
1098         if (buflen < 27)
1099             /* Worst case: HANGUL SYLLABLE <10chars>. */
1100             return 0;
1101         strcpy(buffer, "HANGUL SYLLABLE ");
1102         buffer += 16;
1103         strcpy(buffer, hangul_syllables[L][0]);
1104         buffer += strlen(hangul_syllables[L][0]);
1105         strcpy(buffer, hangul_syllables[V][1]);
1106         buffer += strlen(hangul_syllables[V][1]);
1107         strcpy(buffer, hangul_syllables[T][2]);
1108         buffer += strlen(hangul_syllables[T][2]);
1109         *buffer = '\0';
1110         return 1;
1111     }
1112 
1113     if (is_unified_ideograph(code)) {
1114         if (buflen < 28)
1115             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1116             return 0;
1117         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1118         return 1;
1119     }
1120 
1121     /* get offset into phrasebook */
1122     offset = phrasebook_offset1[(code>>phrasebook_shift)];
1123     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1124                                (code&((1<<phrasebook_shift)-1))];
1125     if (!offset)
1126         return 0;
1127 
1128     i = 0;
1129 
1130     for (;;) {
1131         /* get word index */
1132         word = phrasebook[offset] - phrasebook_short;
1133         if (word >= 0) {
1134             word = (word << 8) + phrasebook[offset+1];
1135             offset += 2;
1136         } else
1137             word = phrasebook[offset++];
1138         if (i) {
1139             if (i > buflen)
1140                 return 0; /* buffer overflow */
1141             buffer[i++] = ' ';
1142         }
1143         /* copy word string from lexicon.  the last character in the
1144            word has bit 7 set.  the last word in a string ends with
1145            0x80 */
1146         w = lexicon + lexicon_offset[word];
1147         while (*w < 128) {
1148             if (i >= buflen)
1149                 return 0; /* buffer overflow */
1150             buffer[i++] = *w++;
1151         }
1152         if (i >= buflen)
1153             return 0; /* buffer overflow */
1154         buffer[i++] = *w & 127;
1155         if (*w == 128)
1156             break; /* end of word */
1157     }
1158 
1159     return 1;
1160 }
1161 
1162 static int
capi_getucname(Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)1163 capi_getucname(Py_UCS4 code,
1164                char* buffer, int buflen,
1165                int with_alias_and_seq)
1166 {
1167     return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
1168 
1169 }
1170 
1171 static int
_cmpname(PyObject * self,int code,const char * name,int namelen)1172 _cmpname(PyObject *self, int code, const char* name, int namelen)
1173 {
1174     /* check if code corresponds to the given name */
1175     int i;
1176     char buffer[NAME_MAXLEN+1];
1177     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1178         return 0;
1179     for (i = 0; i < namelen; i++) {
1180         if (Py_TOUPPER(name[i]) != buffer[i])
1181             return 0;
1182     }
1183     return buffer[namelen] == '\0';
1184 }
1185 
1186 static void
find_syllable(const char * str,int * len,int * pos,int count,int column)1187 find_syllable(const char *str, int *len, int *pos, int count, int column)
1188 {
1189     int i, len1;
1190     *len = -1;
1191     for (i = 0; i < count; i++) {
1192         const char *s = hangul_syllables[i][column];
1193         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1194         if (len1 <= *len)
1195             continue;
1196         if (strncmp(str, s, len1) == 0) {
1197             *len = len1;
1198             *pos = i;
1199         }
1200     }
1201     if (*len == -1) {
1202         *len = 0;
1203     }
1204 }
1205 
1206 static int
_check_alias_and_seq(unsigned int cp,Py_UCS4 * code,int with_named_seq)1207 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1208 {
1209     /* check if named sequences are allowed */
1210     if (!with_named_seq && IS_NAMED_SEQ(cp))
1211         return 0;
1212     /* if the code point is in the PUA range that we use for aliases,
1213      * convert it to obtain the right code point */
1214     if (IS_ALIAS(cp))
1215         *code = name_aliases[cp-aliases_start];
1216     else
1217         *code = cp;
1218     return 1;
1219 }
1220 
1221 static int
_getcode(PyObject * self,const char * name,int namelen,Py_UCS4 * code,int with_named_seq)1222 _getcode(PyObject* self,
1223          const char* name, int namelen, Py_UCS4* code, int with_named_seq)
1224 {
1225     /* Return the code point associated with the given name.
1226      * Named aliases are resolved too (unless self != NULL (i.e. we are using
1227      * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
1228      * using for the named sequence, and the caller must then convert it. */
1229     unsigned int h, v;
1230     unsigned int mask = code_size-1;
1231     unsigned int i, incr;
1232 
1233     /* Check for hangul syllables. */
1234     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1235         int len, L = -1, V = -1, T = -1;
1236         const char *pos = name + 16;
1237         find_syllable(pos, &len, &L, LCount, 0);
1238         pos += len;
1239         find_syllable(pos, &len, &V, VCount, 1);
1240         pos += len;
1241         find_syllable(pos, &len, &T, TCount, 2);
1242         pos += len;
1243         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1244             *code = SBase + (L*VCount+V)*TCount + T;
1245             return 1;
1246         }
1247         /* Otherwise, it's an illegal syllable name. */
1248         return 0;
1249     }
1250 
1251     /* Check for unified ideographs. */
1252     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1253         /* Four or five hexdigits must follow. */
1254         v = 0;
1255         name += 22;
1256         namelen -= 22;
1257         if (namelen != 4 && namelen != 5)
1258             return 0;
1259         while (namelen--) {
1260             v *= 16;
1261             if (*name >= '0' && *name <= '9')
1262                 v += *name - '0';
1263             else if (*name >= 'A' && *name <= 'F')
1264                 v += *name - 'A' + 10;
1265             else
1266                 return 0;
1267             name++;
1268         }
1269         if (!is_unified_ideograph(v))
1270             return 0;
1271         *code = v;
1272         return 1;
1273     }
1274 
1275     /* the following is the same as python's dictionary lookup, with
1276        only minor changes.  see the makeunicodedata script for more
1277        details */
1278 
1279     h = (unsigned int) _gethash(name, namelen, code_magic);
1280     i = (~h) & mask;
1281     v = code_hash[i];
1282     if (!v)
1283         return 0;
1284     if (_cmpname(self, v, name, namelen)) {
1285         return _check_alias_and_seq(v, code, with_named_seq);
1286     }
1287     incr = (h ^ (h >> 3)) & mask;
1288     if (!incr)
1289         incr = mask;
1290     for (;;) {
1291         i = (i + incr) & mask;
1292         v = code_hash[i];
1293         if (!v)
1294             return 0;
1295         if (_cmpname(self, v, name, namelen)) {
1296             return _check_alias_and_seq(v, code, with_named_seq);
1297         }
1298         incr = incr << 1;
1299         if (incr > mask)
1300             incr = incr ^ code_poly;
1301     }
1302 }
1303 
1304 static int
capi_getcode(const char * name,int namelen,Py_UCS4 * code,int with_named_seq)1305 capi_getcode(const char* name, int namelen, Py_UCS4* code,
1306              int with_named_seq)
1307 {
1308     return _getcode(NULL, name, namelen, code, with_named_seq);
1309 
1310 }
1311 
1312 static void
unicodedata_destroy_capi(PyObject * capsule)1313 unicodedata_destroy_capi(PyObject *capsule)
1314 {
1315     void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1316     PyMem_Free(capi);
1317 }
1318 
1319 static PyObject *
unicodedata_create_capi(void)1320 unicodedata_create_capi(void)
1321 {
1322     _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1323     if (capi == NULL) {
1324         PyErr_NoMemory();
1325         return NULL;
1326     }
1327     capi->getname = capi_getucname;
1328     capi->getcode = capi_getcode;
1329 
1330     PyObject *capsule = PyCapsule_New(capi,
1331                                       PyUnicodeData_CAPSULE_NAME,
1332                                       unicodedata_destroy_capi);
1333     if (capsule == NULL) {
1334         PyMem_Free(capi);
1335     }
1336     return capsule;
1337 };
1338 
1339 
1340 /* -------------------------------------------------------------------- */
1341 /* Python bindings */
1342 
1343 /*[clinic input]
1344 unicodedata.UCD.name
1345 
1346     self: self
1347     chr: int(accept={str})
1348     default: object=NULL
1349     /
1350 
1351 Returns the name assigned to the character chr as a string.
1352 
1353 If no name is defined, default is returned, or, if not given,
1354 ValueError is raised.
1355 [clinic start generated code]*/
1356 
1357 static PyObject *
unicodedata_UCD_name_impl(PyObject * self,int chr,PyObject * default_value)1358 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1359 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1360 {
1361     char name[NAME_MAXLEN+1];
1362     Py_UCS4 c = (Py_UCS4)chr;
1363 
1364     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1365         if (default_value == NULL) {
1366             PyErr_SetString(PyExc_ValueError, "no such name");
1367             return NULL;
1368         }
1369         else {
1370             Py_INCREF(default_value);
1371             return default_value;
1372         }
1373     }
1374 
1375     return PyUnicode_FromString(name);
1376 }
1377 
1378 /*[clinic input]
1379 unicodedata.UCD.lookup
1380 
1381     self: self
1382     name: str(accept={str, robuffer}, zeroes=True)
1383     /
1384 
1385 Look up character by name.
1386 
1387 If a character with the given name is found, return the
1388 corresponding character.  If not found, KeyError is raised.
1389 [clinic start generated code]*/
1390 
1391 static PyObject *
unicodedata_UCD_lookup_impl(PyObject * self,const char * name,Py_ssize_clean_t name_length)1392 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1393                             Py_ssize_clean_t name_length)
1394 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
1395 {
1396     Py_UCS4 code;
1397     unsigned int index;
1398     if (name_length > NAME_MAXLEN) {
1399         PyErr_SetString(PyExc_KeyError, "name too long");
1400         return NULL;
1401     }
1402 
1403     if (!_getcode(self, name, (int)name_length, &code, 1)) {
1404         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1405         return NULL;
1406     }
1407     /* check if code is in the PUA range that we use for named sequences
1408        and convert it */
1409     if (IS_NAMED_SEQ(code)) {
1410         index = code-named_sequences_start;
1411         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1412                                          named_sequences[index].seq,
1413                                          named_sequences[index].seqlen);
1414     }
1415     return PyUnicode_FromOrdinal(code);
1416 }
1417 
1418 // List of functions used to define module functions *AND* unicodedata.UCD
1419 // methods. For module functions, self is the module. For UCD methods, self
1420 // is an UCD instance. The UCD_Check() macro is used to check if self is
1421 // an UCD instance.
1422 static PyMethodDef unicodedata_functions[] = {
1423     UNICODEDATA_UCD_DECIMAL_METHODDEF
1424     UNICODEDATA_UCD_DIGIT_METHODDEF
1425     UNICODEDATA_UCD_NUMERIC_METHODDEF
1426     UNICODEDATA_UCD_CATEGORY_METHODDEF
1427     UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1428     UNICODEDATA_UCD_COMBINING_METHODDEF
1429     UNICODEDATA_UCD_MIRRORED_METHODDEF
1430     UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1431     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1432     UNICODEDATA_UCD_NAME_METHODDEF
1433     UNICODEDATA_UCD_LOOKUP_METHODDEF
1434     UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1435     UNICODEDATA_UCD_NORMALIZE_METHODDEF
1436     {NULL, NULL}                /* sentinel */
1437 };
1438 
1439 static int
ucd_traverse(PreviousDBVersion * self,visitproc visit,void * arg)1440 ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
1441 {
1442     Py_VISIT(Py_TYPE(self));
1443     return 0;
1444 }
1445 
1446 static void
ucd_dealloc(PreviousDBVersion * self)1447 ucd_dealloc(PreviousDBVersion *self)
1448 {
1449     PyTypeObject *tp = Py_TYPE(self);
1450     PyObject_GC_UnTrack(self);
1451     PyObject_GC_Del(self);
1452     Py_DECREF(tp);
1453 }
1454 
1455 static PyType_Slot ucd_type_slots[] = {
1456     {Py_tp_dealloc, ucd_dealloc},
1457     {Py_tp_traverse, ucd_traverse},
1458     {Py_tp_getattro, PyObject_GenericGetAttr},
1459     {Py_tp_methods, unicodedata_functions},
1460     {Py_tp_members, DB_members},
1461     {0, 0}
1462 };
1463 
1464 static PyType_Spec ucd_type_spec = {
1465     .name = "unicodedata.UCD",
1466     .basicsize = sizeof(PreviousDBVersion),
1467     .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
1468               Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
1469     .slots = ucd_type_slots
1470 };
1471 
1472 PyDoc_STRVAR(unicodedata_docstring,
1473 "This module provides access to the Unicode Character Database which\n\
1474 defines character properties for all Unicode characters. The data in\n\
1475 this database is based on the UnicodeData.txt file version\n\
1476 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1477 \n\
1478 The module uses the same names and symbols as defined by the\n\
1479 UnicodeData File Format " UNIDATA_VERSION ".");
1480 
1481 static int
unicodedata_exec(PyObject * module)1482 unicodedata_exec(PyObject *module)
1483 {
1484     if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1485         return -1;
1486     }
1487 
1488     PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1489     if (ucd_type == NULL) {
1490         return -1;
1491     }
1492 
1493     if (PyModule_AddType(module, ucd_type) < 0) {
1494         Py_DECREF(ucd_type);
1495         return -1;
1496     }
1497 
1498     // Unicode database version 3.2.0 used by the IDNA encoding
1499     PyObject *v;
1500     v = new_previous_version(ucd_type, "3.2.0",
1501                              get_change_3_2_0, normalization_3_2_0);
1502     Py_DECREF(ucd_type);
1503     if (v == NULL) {
1504         return -1;
1505     }
1506     if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1507         Py_DECREF(v);
1508         return -1;
1509     }
1510 
1511     /* Export C API */
1512     PyObject *capsule = unicodedata_create_capi();
1513     if (capsule == NULL) {
1514         return -1;
1515     }
1516     int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1517     Py_DECREF(capsule);
1518     if (rc < 0) {
1519         return -1;
1520     }
1521     return 0;
1522 }
1523 
1524 static PyModuleDef_Slot unicodedata_slots[] = {
1525     {Py_mod_exec, unicodedata_exec},
1526     {0, NULL}
1527 };
1528 
1529 static struct PyModuleDef unicodedata_module = {
1530     PyModuleDef_HEAD_INIT,
1531     .m_name = "unicodedata",
1532     .m_doc = unicodedata_docstring,
1533     .m_size = 0,
1534     .m_methods = unicodedata_functions,
1535     .m_slots = unicodedata_slots,
1536 };
1537 
1538 PyMODINIT_FUNC
PyInit_unicodedata(void)1539 PyInit_unicodedata(void)
1540 {
1541     return PyModuleDef_Init(&unicodedata_module);
1542 }
1543 
1544 
1545 /*
1546 Local variables:
1547 c-basic-offset: 4
1548 indent-tabs-mode: nil
1549 End:
1550 */
1551