• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode database.

   Data was extracted from the UnicodeData.txt file.
   The current version number is reported in the unidata_version constant.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15 
16 #define PY_SSIZE_T_CLEAN
17 
18 #include "Python.h"
19 #include "ucnhash.h"
20 #include "structmember.h"         // PyMemberDef
21 
22 #include <stdbool.h>
23 
/* Identifiers for the four normalization form names; the normalize and
   is_normalized implementations compare their `form` argument against
   these. */
_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC);
_Py_IDENTIFIER(NFKD);
28 
29 /*[clinic input]
30 module unicodedata
31 class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
32 [clinic start generated code]*/
33 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
34 
35 /* character properties */
36 
/* Property record for one class of code points.  _getrecord_ex() returns a
   pointer to an entry of _PyUnicode_Database_Records, which has this type. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* packed 2-bit quickcheck
                                                      results; see
                                                      is_normalized_quickcheck() */
} _PyUnicode_DatabaseRecord;
48 
/* Difference between the current database and an older version for one code
   point.  In the code below, 0xFF in a *_changed byte means "no change",
   and category_changed == 0 is treated as "unassigned in the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
58 
59 /* data file generated by Tools/unicode/makeunicodedata.py */
60 #include "unicodedata_db.h"
61 
62 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)63 _getrecord_ex(Py_UCS4 code)
64 {
65     int index;
66     if (code >= 0x110000)
67         index = 0;
68     else {
69         index = index1[(code>>SHIFT)];
70         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71     }
72 
73     return &_PyUnicode_Database_Records[index];
74 }
75 
76 /* ------------- Previous-version API ------------------------------------- */
77 typedef struct previous_version {
78     PyObject_HEAD
79     const char *name;
80     const change_record* (*getrecord)(Py_UCS4);
81     Py_UCS4 (*normalization)(Py_UCS4);
82 } PreviousDBVersion;
83 
84 #include "clinic/unicodedata.c.h"
85 
86 #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
87 
88 static PyMemberDef DB_members[] = {
89         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
90         {NULL}
91 };
92 
93 /* forward declaration */
94 static PyTypeObject UCD_Type;
95 #define UCD_Check(o) Py_IS_TYPE(o, &UCD_Type)
96 
97 static PyObject*
new_previous_version(const char * name,const change_record * (* getrecord)(Py_UCS4),Py_UCS4 (* normalization)(Py_UCS4))98 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
99                      Py_UCS4 (*normalization)(Py_UCS4))
100 {
101         PreviousDBVersion *self;
102         self = PyObject_New(PreviousDBVersion, &UCD_Type);
103         if (self == NULL)
104                 return NULL;
105         self->name = name;
106         self->getrecord = getrecord;
107         self->normalization = normalization;
108         return (PyObject*)self;
109 }
110 
111 
112 /* --- Module API --------------------------------------------------------- */
113 
114 /*[clinic input]
115 unicodedata.UCD.decimal
116 
117     self: self
118     chr: int(accept={str})
119     default: object=NULL
120     /
121 
122 Converts a Unicode character into its equivalent decimal value.
123 
124 Returns the decimal value assigned to the character chr as integer.
125 If no such value is defined, default is returned, or, if not given,
126 ValueError is raised.
127 [clinic start generated code]*/
128 
129 static PyObject *
unicodedata_UCD_decimal_impl(PyObject * self,int chr,PyObject * default_value)130 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
131                              PyObject *default_value)
132 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
133 {
134     int have_old = 0;
135     long rc;
136     Py_UCS4 c = (Py_UCS4)chr;
137 
138     if (self && UCD_Check(self)) {
139         const change_record *old = get_old_record(self, c);
140         if (old->category_changed == 0) {
141             /* unassigned */
142             have_old = 1;
143             rc = -1;
144         }
145         else if (old->decimal_changed != 0xFF) {
146             have_old = 1;
147             rc = old->decimal_changed;
148         }
149     }
150 
151     if (!have_old)
152         rc = Py_UNICODE_TODECIMAL(c);
153     if (rc < 0) {
154         if (default_value == NULL) {
155             PyErr_SetString(PyExc_ValueError,
156                             "not a decimal");
157             return NULL;
158         }
159         else {
160             Py_INCREF(default_value);
161             return default_value;
162         }
163     }
164     return PyLong_FromLong(rc);
165 }
166 
167 /*[clinic input]
168 unicodedata.UCD.digit
169 
170     self: self
171     chr: int(accept={str})
172     default: object=NULL
173     /
174 
175 Converts a Unicode character into its equivalent digit value.
176 
177 Returns the digit value assigned to the character chr as integer.
178 If no such value is defined, default is returned, or, if not given,
179 ValueError is raised.
180 [clinic start generated code]*/
181 
182 static PyObject *
unicodedata_UCD_digit_impl(PyObject * self,int chr,PyObject * default_value)183 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
184 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
185 {
186     long rc;
187     Py_UCS4 c = (Py_UCS4)chr;
188     rc = Py_UNICODE_TODIGIT(c);
189     if (rc < 0) {
190         if (default_value == NULL) {
191             PyErr_SetString(PyExc_ValueError, "not a digit");
192             return NULL;
193         }
194         else {
195             Py_INCREF(default_value);
196             return default_value;
197         }
198     }
199     return PyLong_FromLong(rc);
200 }
201 
202 /*[clinic input]
203 unicodedata.UCD.numeric
204 
205     self: self
206     chr: int(accept={str})
207     default: object=NULL
208     /
209 
210 Converts a Unicode character into its equivalent numeric value.
211 
212 Returns the numeric value assigned to the character chr as float.
213 If no such value is defined, default is returned, or, if not given,
214 ValueError is raised.
215 [clinic start generated code]*/
216 
217 static PyObject *
unicodedata_UCD_numeric_impl(PyObject * self,int chr,PyObject * default_value)218 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
219                              PyObject *default_value)
220 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
221 {
222     int have_old = 0;
223     double rc;
224     Py_UCS4 c = (Py_UCS4)chr;
225 
226     if (self && UCD_Check(self)) {
227         const change_record *old = get_old_record(self, c);
228         if (old->category_changed == 0) {
229             /* unassigned */
230             have_old = 1;
231             rc = -1.0;
232         }
233         else if (old->decimal_changed != 0xFF) {
234             have_old = 1;
235             rc = old->decimal_changed;
236         }
237     }
238 
239     if (!have_old)
240         rc = Py_UNICODE_TONUMERIC(c);
241     if (rc == -1.0) {
242         if (default_value == NULL) {
243             PyErr_SetString(PyExc_ValueError, "not a numeric character");
244             return NULL;
245         }
246         else {
247             Py_INCREF(default_value);
248             return default_value;
249         }
250     }
251     return PyFloat_FromDouble(rc);
252 }
253 
254 /*[clinic input]
255 unicodedata.UCD.category
256 
257     self: self
258     chr: int(accept={str})
259     /
260 
261 Returns the general category assigned to the character chr as string.
262 [clinic start generated code]*/
263 
264 static PyObject *
unicodedata_UCD_category_impl(PyObject * self,int chr)265 unicodedata_UCD_category_impl(PyObject *self, int chr)
266 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
267 {
268     int index;
269     Py_UCS4 c = (Py_UCS4)chr;
270     index = (int) _getrecord_ex(c)->category;
271     if (self && UCD_Check(self)) {
272         const change_record *old = get_old_record(self, c);
273         if (old->category_changed != 0xFF)
274             index = old->category_changed;
275     }
276     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
277 }
278 
279 /*[clinic input]
280 unicodedata.UCD.bidirectional
281 
282     self: self
283     chr: int(accept={str})
284     /
285 
286 Returns the bidirectional class assigned to the character chr as string.
287 
288 If no such value is defined, an empty string is returned.
289 [clinic start generated code]*/
290 
291 static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject * self,int chr)292 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
293 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
294 {
295     int index;
296     Py_UCS4 c = (Py_UCS4)chr;
297     index = (int) _getrecord_ex(c)->bidirectional;
298     if (self && UCD_Check(self)) {
299         const change_record *old = get_old_record(self, c);
300         if (old->category_changed == 0)
301             index = 0; /* unassigned */
302         else if (old->bidir_changed != 0xFF)
303             index = old->bidir_changed;
304     }
305     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
306 }
307 
308 /*[clinic input]
309 unicodedata.UCD.combining -> int
310 
311     self: self
312     chr: int(accept={str})
313     /
314 
315 Returns the canonical combining class assigned to the character chr as integer.
316 
317 Returns 0 if no combining class is defined.
318 [clinic start generated code]*/
319 
320 static int
unicodedata_UCD_combining_impl(PyObject * self,int chr)321 unicodedata_UCD_combining_impl(PyObject *self, int chr)
322 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
323 {
324     int index;
325     Py_UCS4 c = (Py_UCS4)chr;
326     index = (int) _getrecord_ex(c)->combining;
327     if (self && UCD_Check(self)) {
328         const change_record *old = get_old_record(self, c);
329         if (old->category_changed == 0)
330             index = 0; /* unassigned */
331     }
332     return index;
333 }
334 
335 /*[clinic input]
336 unicodedata.UCD.mirrored -> int
337 
338     self: self
339     chr: int(accept={str})
340     /
341 
342 Returns the mirrored property assigned to the character chr as integer.
343 
344 Returns 1 if the character has been identified as a "mirrored"
345 character in bidirectional text, 0 otherwise.
346 [clinic start generated code]*/
347 
348 static int
unicodedata_UCD_mirrored_impl(PyObject * self,int chr)349 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
350 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
351 {
352     int index;
353     Py_UCS4 c = (Py_UCS4)chr;
354     index = (int) _getrecord_ex(c)->mirrored;
355     if (self && UCD_Check(self)) {
356         const change_record *old = get_old_record(self, c);
357         if (old->category_changed == 0)
358             index = 0; /* unassigned */
359         else if (old->mirrored_changed != 0xFF)
360             index = old->mirrored_changed;
361     }
362     return index;
363 }
364 
365 /*[clinic input]
366 unicodedata.UCD.east_asian_width
367 
368     self: self
369     chr: int(accept={str})
370     /
371 
372 Returns the east asian width assigned to the character chr as string.
373 [clinic start generated code]*/
374 
375 static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject * self,int chr)376 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
377 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
378 {
379     int index;
380     Py_UCS4 c = (Py_UCS4)chr;
381     index = (int) _getrecord_ex(c)->east_asian_width;
382     if (self && UCD_Check(self)) {
383         const change_record *old = get_old_record(self, c);
384         if (old->category_changed == 0)
385             index = 0; /* unassigned */
386         else if (old->east_asian_width_changed != 0xFF)
387             index = old->east_asian_width_changed;
388     }
389     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
390 }
391 
392 /*[clinic input]
393 unicodedata.UCD.decomposition
394 
395     self: self
396     chr: int(accept={str})
397     /
398 
399 Returns the character decomposition mapping assigned to the character chr as string.
400 
401 An empty string is returned in case no such mapping is defined.
402 [clinic start generated code]*/
403 
404 static PyObject *
unicodedata_UCD_decomposition_impl(PyObject * self,int chr)405 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
406 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
407 {
408     char decomp[256];
409     int code, index, count;
410     size_t i;
411     unsigned int prefix_index;
412     Py_UCS4 c = (Py_UCS4)chr;
413 
414     code = (int)c;
415 
416     if (self && UCD_Check(self)) {
417         const change_record *old = get_old_record(self, c);
418         if (old->category_changed == 0)
419             return PyUnicode_FromString(""); /* unassigned */
420     }
421 
422     if (code < 0 || code >= 0x110000)
423         index = 0;
424     else {
425         index = decomp_index1[(code>>DECOMP_SHIFT)];
426         index = decomp_index2[(index<<DECOMP_SHIFT)+
427                              (code&((1<<DECOMP_SHIFT)-1))];
428     }
429 
430     /* high byte is number of hex bytes (usually one or two), low byte
431        is prefix code (from*/
432     count = decomp_data[index] >> 8;
433 
434     /* XXX: could allocate the PyString up front instead
435        (strlen(prefix) + 5 * count + 1 bytes) */
436 
437     /* Based on how index is calculated above and decomp_data is generated
438        from Tools/unicode/makeunicodedata.py, it should not be possible
439        to overflow decomp_prefix. */
440     prefix_index = decomp_data[index] & 255;
441     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
442 
443     /* copy prefix */
444     i = strlen(decomp_prefix[prefix_index]);
445     memcpy(decomp, decomp_prefix[prefix_index], i);
446 
447     while (count-- > 0) {
448         if (i)
449             decomp[i++] = ' ';
450         assert(i < sizeof(decomp));
451         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
452                       decomp_data[++index]);
453         i += strlen(decomp + i);
454     }
455     return PyUnicode_FromStringAndSize(decomp, i);
456 }
457 
458 static void
get_decomp_record(PyObject * self,Py_UCS4 code,int * index,int * prefix,int * count)459 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
460 {
461     if (code >= 0x110000) {
462         *index = 0;
463     } else if (self && UCD_Check(self) &&
464                get_old_record(self, code)->category_changed==0) {
465         /* unassigned in old version */
466         *index = 0;
467     }
468     else {
469         *index = decomp_index1[(code>>DECOMP_SHIFT)];
470         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
471                                (code&((1<<DECOMP_SHIFT)-1))];
472     }
473 
474     /* high byte is number of hex bytes (usually one or two), low byte
475        is prefix code (from*/
476     *count = decomp_data[*index] >> 8;
477     *prefix = decomp_data[*index] & 255;
478 
479     (*index)++;
480 }
481 
/* Constants for the arithmetic Hangul syllable (de)composition used by
   nfd_nfkd() and nfc_nfkc(). */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one below the first trailing consonant;
                                       T == TBase means "no trailing" */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total number of syllables */
491 
492 static PyObject*
nfd_nfkd(PyObject * self,PyObject * input,int k)493 nfd_nfkd(PyObject *self, PyObject *input, int k)
494 {
495     PyObject *result;
496     Py_UCS4 *output;
497     Py_ssize_t i, o, osize;
498     int kind;
499     const void *data;
500     /* Longest decomposition in Unicode 3.2: U+FDFA */
501     Py_UCS4 stack[20];
502     Py_ssize_t space, isize;
503     int index, prefix, count, stackptr;
504     unsigned char prev, cur;
505 
506     stackptr = 0;
507     isize = PyUnicode_GET_LENGTH(input);
508     space = isize;
509     /* Overallocate at most 10 characters. */
510     if (space > 10) {
511         if (space <= PY_SSIZE_T_MAX - 10)
512             space += 10;
513     }
514     else {
515         space *= 2;
516     }
517     osize = space;
518     output = PyMem_NEW(Py_UCS4, space);
519     if (!output) {
520         PyErr_NoMemory();
521         return NULL;
522     }
523     i = o = 0;
524     kind = PyUnicode_KIND(input);
525     data = PyUnicode_DATA(input);
526 
527     while (i < isize) {
528         stack[stackptr++] = PyUnicode_READ(kind, data, i++);
529         while(stackptr) {
530             Py_UCS4 code = stack[--stackptr];
531             /* Hangul Decomposition adds three characters in
532                a single step, so we need at least that much room. */
533             if (space < 3) {
534                 Py_UCS4 *new_output;
535                 osize += 10;
536                 space += 10;
537                 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
538                 if (new_output == NULL) {
539                     PyMem_Free(output);
540                     PyErr_NoMemory();
541                     return NULL;
542                 }
543                 output = new_output;
544             }
545             /* Hangul Decomposition. */
546             if (SBase <= code && code < (SBase+SCount)) {
547                 int SIndex = code - SBase;
548                 int L = LBase + SIndex / NCount;
549                 int V = VBase + (SIndex % NCount) / TCount;
550                 int T = TBase + SIndex % TCount;
551                 output[o++] = L;
552                 output[o++] = V;
553                 space -= 2;
554                 if (T != TBase) {
555                     output[o++] = T;
556                     space --;
557                 }
558                 continue;
559             }
560             /* normalization changes */
561             if (self && UCD_Check(self)) {
562                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
563                 if (value != 0) {
564                     stack[stackptr++] = value;
565                     continue;
566                 }
567             }
568 
569             /* Other decompositions. */
570             get_decomp_record(self, code, &index, &prefix, &count);
571 
572             /* Copy character if it is not decomposable, or has a
573                compatibility decomposition, but we do NFD. */
574             if (!count || (prefix && !k)) {
575                 output[o++] = code;
576                 space--;
577                 continue;
578             }
579             /* Copy decomposition onto the stack, in reverse
580                order.  */
581             while(count) {
582                 code = decomp_data[index + (--count)];
583                 stack[stackptr++] = code;
584             }
585         }
586     }
587 
588     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
589                                        output, o);
590     PyMem_Free(output);
591     if (!result)
592         return NULL;
593     /* result is guaranteed to be ready, as it is compact. */
594     kind = PyUnicode_KIND(result);
595     data = PyUnicode_DATA(result);
596 
597     /* Sort canonically. */
598     i = 0;
599     prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
600     for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
601         cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
602         if (prev == 0 || cur == 0 || prev <= cur) {
603             prev = cur;
604             continue;
605         }
606         /* Non-canonical order. Need to switch *i with previous. */
607         o = i - 1;
608         while (1) {
609             Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
610             PyUnicode_WRITE(kind, data, o+1,
611                             PyUnicode_READ(kind, data, o));
612             PyUnicode_WRITE(kind, data, o, tmp);
613             o--;
614             if (o < 0)
615                 break;
616             prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
617             if (prev == 0 || prev <= cur)
618                 break;
619         }
620         prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
621     }
622     return result;
623 }
624 
625 static int
find_nfc_index(const struct reindex * nfc,Py_UCS4 code)626 find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
627 {
628     unsigned int index;
629     for (index = 0; nfc[index].start; index++) {
630         unsigned int start = nfc[index].start;
631         if (code < start)
632             return -1;
633         if (code <= start + nfc[index].count) {
634             unsigned int delta = code - start;
635             return nfc[index].index + delta;
636         }
637     }
638     return -1;
639 }
640 
641 static PyObject*
nfc_nfkc(PyObject * self,PyObject * input,int k)642 nfc_nfkc(PyObject *self, PyObject *input, int k)
643 {
644     PyObject *result;
645     int kind;
646     const void *data;
647     Py_UCS4 *output;
648     Py_ssize_t i, i1, o, len;
649     int f,l,index,index1,comb;
650     Py_UCS4 code;
651     Py_ssize_t skipped[20];
652     int cskipped = 0;
653 
654     result = nfd_nfkd(self, input, k);
655     if (!result)
656         return NULL;
657     /* result will be "ready". */
658     kind = PyUnicode_KIND(result);
659     data = PyUnicode_DATA(result);
660     len = PyUnicode_GET_LENGTH(result);
661 
662     /* We allocate a buffer for the output.
663        If we find that we made no changes, we still return
664        the NFD result. */
665     output = PyMem_NEW(Py_UCS4, len);
666     if (!output) {
667         PyErr_NoMemory();
668         Py_DECREF(result);
669         return 0;
670     }
671     i = o = 0;
672 
673   again:
674     while (i < len) {
675       for (index = 0; index < cskipped; index++) {
676           if (skipped[index] == i) {
677               /* *i character is skipped.
678                  Remove from list. */
679               skipped[index] = skipped[cskipped-1];
680               cskipped--;
681               i++;
682               goto again; /* continue while */
683           }
684       }
685       /* Hangul Composition. We don't need to check for <LV,T>
686          pairs, since we always have decomposed data. */
687       code = PyUnicode_READ(kind, data, i);
688       if (LBase <= code && code < (LBase+LCount) &&
689           i + 1 < len &&
690           VBase <= PyUnicode_READ(kind, data, i+1) &&
691           PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
692           /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
693              and V character is a modern vowel (0x1161 ~ 0x1175). */
694           int LIndex, VIndex;
695           LIndex = code - LBase;
696           VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
697           code = SBase + (LIndex*VCount+VIndex)*TCount;
698           i+=2;
699           if (i < len &&
700               TBase < PyUnicode_READ(kind, data, i) &&
701               PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
702               /* check T character is a modern trailing consonant
703                  (0x11A8 ~ 0x11C2). */
704               code += PyUnicode_READ(kind, data, i)-TBase;
705               i++;
706           }
707           output[o++] = code;
708           continue;
709       }
710 
711       /* code is still input[i] here */
712       f = find_nfc_index(nfc_first, code);
713       if (f == -1) {
714           output[o++] = code;
715           i++;
716           continue;
717       }
718       /* Find next unblocked character. */
719       i1 = i+1;
720       comb = 0;
721       /* output base character for now; might be updated later. */
722       output[o] = PyUnicode_READ(kind, data, i);
723       while (i1 < len) {
724           Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
725           int comb1 = _getrecord_ex(code1)->combining;
726           if (comb) {
727               if (comb1 == 0)
728                   break;
729               if (comb >= comb1) {
730                   /* Character is blocked. */
731                   i1++;
732                   continue;
733               }
734           }
735           l = find_nfc_index(nfc_last, code1);
736           /* i1 cannot be combined with i. If i1
737              is a starter, we don't need to look further.
738              Otherwise, record the combining class. */
739           if (l == -1) {
740             not_combinable:
741               if (comb1 == 0)
742                   break;
743               comb = comb1;
744               i1++;
745               continue;
746           }
747           index = f*TOTAL_LAST + l;
748           index1 = comp_index[index >> COMP_SHIFT];
749           code = comp_data[(index1<<COMP_SHIFT)+
750                            (index&((1<<COMP_SHIFT)-1))];
751           if (code == 0)
752               goto not_combinable;
753 
754           /* Replace the original character. */
755           output[o] = code;
756           /* Mark the second character unused. */
757           assert(cskipped < 20);
758           skipped[cskipped++] = i1;
759           i1++;
760           f = find_nfc_index(nfc_first, output[o]);
761           if (f == -1)
762               break;
763       }
764       /* Output character was already written.
765          Just advance the indices. */
766       o++; i++;
767     }
768     if (o == len) {
769         /* No changes. Return original string. */
770         PyMem_Free(output);
771         return result;
772     }
773     Py_DECREF(result);
774     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
775                                        output, o);
776     PyMem_Free(output);
777     return result;
778 }
779 
// This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
// The numeric values matter: is_normalized_quickcheck() compares them with
// 2-bit fields extracted from the database records.
typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
783 
784 /* Run the Unicode normalization "quickcheck" algorithm.
785  *
786  * Return YES or NO if quickcheck determines the input is certainly
787  * normalized or certainly not, and MAYBE if quickcheck is unable to
788  * tell.
789  *
790  * If `yes_only` is true, then return MAYBE as soon as we determine
791  * the answer is not YES.
792  *
793  * For background and details on the algorithm, see UAX #15:
794  *   https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
795  */
796 static QuickcheckResult
is_normalized_quickcheck(PyObject * self,PyObject * input,bool nfc,bool k,bool yes_only)797 is_normalized_quickcheck(PyObject *self, PyObject *input,
798                          bool nfc, bool k, bool yes_only)
799 {
800     /* An older version of the database is requested, quickchecks must be
801        disabled. */
802     if (self && UCD_Check(self))
803         return NO;
804 
805     Py_ssize_t i, len;
806     int kind;
807     const void *data;
808     unsigned char prev_combining = 0;
809 
810     /* The two quickcheck bits at this shift have type QuickcheckResult. */
811     int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
812 
813     QuickcheckResult result = YES; /* certainly normalized, unless we find something */
814 
815     i = 0;
816     kind = PyUnicode_KIND(input);
817     data = PyUnicode_DATA(input);
818     len = PyUnicode_GET_LENGTH(input);
819     while (i < len) {
820         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
821         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
822 
823         unsigned char combining = record->combining;
824         if (combining && prev_combining > combining)
825             return NO; /* non-canonical sort order, not normalized */
826         prev_combining = combining;
827 
828         unsigned char quickcheck_whole = record->normalization_quick_check;
829         if (yes_only) {
830             if (quickcheck_whole & (3 << quickcheck_shift))
831                 return MAYBE;
832         } else {
833             switch ((quickcheck_whole >> quickcheck_shift) & 3) {
834             case NO:
835               return NO;
836             case MAYBE:
837               result = MAYBE; /* this string might need normalization */
838             }
839         }
840     }
841     return result;
842 }
843 
844 /*[clinic input]
845 unicodedata.UCD.is_normalized
846 
847     self: self
848     form: unicode
849     unistr as input: unicode
850     /
851 
852 Return whether the Unicode string unistr is in the normal form 'form'.
853 
854 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
855 [clinic start generated code]*/
856 
857 static PyObject *
unicodedata_UCD_is_normalized_impl(PyObject * self,PyObject * form,PyObject * input)858 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
859                                    PyObject *input)
860 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
861 {
862     if (PyUnicode_READY(input) == -1) {
863         return NULL;
864     }
865 
866     if (PyUnicode_GET_LENGTH(input) == 0) {
867         /* special case empty input strings. */
868         Py_RETURN_TRUE;
869     }
870 
871     PyObject *result;
872     bool nfc = false;
873     bool k = false;
874     QuickcheckResult m;
875 
876     PyObject *cmp;
877     int match = 0;
878 
879     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
880         nfc = true;
881     }
882     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
883         nfc = true;
884         k = true;
885     }
886     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
887         /* matches default values for `nfc` and `k` */
888     }
889     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
890         k = true;
891     }
892     else {
893         PyErr_SetString(PyExc_ValueError, "invalid normalization form");
894         return NULL;
895     }
896 
897     m = is_normalized_quickcheck(self, input, nfc, k, false);
898 
899     if (m == MAYBE) {
900         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
901         if (cmp == NULL) {
902             return NULL;
903         }
904         match = PyUnicode_Compare(input, cmp);
905         Py_DECREF(cmp);
906         result = (match == 0) ? Py_True : Py_False;
907     }
908     else {
909         result = (m == YES) ? Py_True : Py_False;
910     }
911 
912     Py_INCREF(result);
913     return result;
914 }
915 
916 
917 /*[clinic input]
918 unicodedata.UCD.normalize
919 
920     self: self
921     form: unicode
922     unistr as input: unicode
923     /
924 
925 Return the normal form 'form' for the Unicode string unistr.
926 
927 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
928 [clinic start generated code]*/
929 
930 static PyObject *
unicodedata_UCD_normalize_impl(PyObject * self,PyObject * form,PyObject * input)931 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
932                                PyObject *input)
933 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
934 {
935     if (PyUnicode_GET_LENGTH(input) == 0) {
936         /* Special case empty input strings, since resizing
937            them  later would cause internal errors. */
938         Py_INCREF(input);
939         return input;
940     }
941 
942     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
943         if (is_normalized_quickcheck(self, input, true,  false, true) == YES) {
944             Py_INCREF(input);
945             return input;
946         }
947         return nfc_nfkc(self, input, 0);
948     }
949     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
950         if (is_normalized_quickcheck(self, input, true,  true,  true) == YES) {
951             Py_INCREF(input);
952             return input;
953         }
954         return nfc_nfkc(self, input, 1);
955     }
956     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
957         if (is_normalized_quickcheck(self, input, false, false, true) == YES) {
958             Py_INCREF(input);
959             return input;
960         }
961         return nfd_nfkd(self, input, 0);
962     }
963     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
964         if (is_normalized_quickcheck(self, input, false, true,  true) == YES) {
965             Py_INCREF(input);
966             return input;
967         }
968         return nfd_nfkd(self, input, 1);
969     }
970     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
971     return NULL;
972 }
973 
974 /* -------------------------------------------------------------------- */
975 /* unicode character name tables */
976 
977 /* data file generated by Tools/unicode/makeunicodedata.py */
978 #include "unicodename_db.h"
979 
980 /* -------------------------------------------------------------------- */
981 /* database code (cut and pasted from the unidb package) */
982 
/* Hash the first `len` bytes of `s`, upper-casing each byte with Py_TOUPPER
 * so the result does not depend on ASCII letter case.  `scale` is the
 * multiplier applied to the running hash before each byte is added.
 * Whenever bits 24-31 of the running value become non-zero they are folded
 * back into the low byte and the value is masked to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(s[pos]);
        top = hash & 0xff000000;
        if (top != 0) {
            hash = (hash ^ ((top >> 24) & 0xff)) & 0x00ffffff;
        }
        pos++;
    }
    return hash;
}
997 
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
 * Column 0 is indexed by L, column 1 by V and column 2 by T (see
 * _getucname() and _getcode()).  The columns have different lengths;
 * slots past the end of a shorter column are NULL and must not be read. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
1028 
1029 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1030 static int
is_unified_ideograph(Py_UCS4 code)1031 is_unified_ideograph(Py_UCS4 code)
1032 {
1033     return
1034         (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
1035         (0x4E00 <= code && code <= 0x9FFC)   || /* CJK Ideograph */
1036         (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
1037         (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
1038         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1039         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1040         (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1041         (0x30000 <= code && code <= 0x3134A);   /* CJK Ideograph Extension G */
1042 }
1043 
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  Each test is a
 * half-open range check: start <= cp < end.
 *
 * The argument is parenthesized so that expressions such as `x & mask`
 * are compared as a whole.  It is still evaluated twice, so do not pass
 * an expression with side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
1049 
1050 static int
_getucname(PyObject * self,Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)1051 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
1052            int with_alias_and_seq)
1053 {
1054     /* Find the name associated with the given code point.
1055      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1056      * that we are using for aliases and named sequences. */
1057     int offset;
1058     int i;
1059     int word;
1060     const unsigned char* w;
1061 
1062     if (code >= 0x110000)
1063         return 0;
1064 
1065     /* XXX should we just skip all the code points in the PUAs here? */
1066     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1067         return 0;
1068 
1069     if (self && UCD_Check(self)) {
1070         /* in 3.2.0 there are no aliases and named sequences */
1071         const change_record *old;
1072         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1073             return 0;
1074         old = get_old_record(self, code);
1075         if (old->category_changed == 0) {
1076             /* unassigned */
1077             return 0;
1078         }
1079     }
1080 
1081     if (SBase <= code && code < SBase+SCount) {
1082         /* Hangul syllable. */
1083         int SIndex = code - SBase;
1084         int L = SIndex / NCount;
1085         int V = (SIndex % NCount) / TCount;
1086         int T = SIndex % TCount;
1087 
1088         if (buflen < 27)
1089             /* Worst case: HANGUL SYLLABLE <10chars>. */
1090             return 0;
1091         strcpy(buffer, "HANGUL SYLLABLE ");
1092         buffer += 16;
1093         strcpy(buffer, hangul_syllables[L][0]);
1094         buffer += strlen(hangul_syllables[L][0]);
1095         strcpy(buffer, hangul_syllables[V][1]);
1096         buffer += strlen(hangul_syllables[V][1]);
1097         strcpy(buffer, hangul_syllables[T][2]);
1098         buffer += strlen(hangul_syllables[T][2]);
1099         *buffer = '\0';
1100         return 1;
1101     }
1102 
1103     if (is_unified_ideograph(code)) {
1104         if (buflen < 28)
1105             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1106             return 0;
1107         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1108         return 1;
1109     }
1110 
1111     /* get offset into phrasebook */
1112     offset = phrasebook_offset1[(code>>phrasebook_shift)];
1113     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1114                                (code&((1<<phrasebook_shift)-1))];
1115     if (!offset)
1116         return 0;
1117 
1118     i = 0;
1119 
1120     for (;;) {
1121         /* get word index */
1122         word = phrasebook[offset] - phrasebook_short;
1123         if (word >= 0) {
1124             word = (word << 8) + phrasebook[offset+1];
1125             offset += 2;
1126         } else
1127             word = phrasebook[offset++];
1128         if (i) {
1129             if (i > buflen)
1130                 return 0; /* buffer overflow */
1131             buffer[i++] = ' ';
1132         }
1133         /* copy word string from lexicon.  the last character in the
1134            word has bit 7 set.  the last word in a string ends with
1135            0x80 */
1136         w = lexicon + lexicon_offset[word];
1137         while (*w < 128) {
1138             if (i >= buflen)
1139                 return 0; /* buffer overflow */
1140             buffer[i++] = *w++;
1141         }
1142         if (i >= buflen)
1143             return 0; /* buffer overflow */
1144         buffer[i++] = *w & 127;
1145         if (*w == 128)
1146             break; /* end of word */
1147     }
1148 
1149     return 1;
1150 }
1151 
1152 static int
_cmpname(PyObject * self,int code,const char * name,int namelen)1153 _cmpname(PyObject *self, int code, const char* name, int namelen)
1154 {
1155     /* check if code corresponds to the given name */
1156     int i;
1157     char buffer[NAME_MAXLEN+1];
1158     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1159         return 0;
1160     for (i = 0; i < namelen; i++) {
1161         if (Py_TOUPPER(name[i]) != buffer[i])
1162             return 0;
1163     }
1164     return buffer[namelen] == '\0';
1165 }
1166 
1167 static void
find_syllable(const char * str,int * len,int * pos,int count,int column)1168 find_syllable(const char *str, int *len, int *pos, int count, int column)
1169 {
1170     int i, len1;
1171     *len = -1;
1172     for (i = 0; i < count; i++) {
1173         const char *s = hangul_syllables[i][column];
1174         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1175         if (len1 <= *len)
1176             continue;
1177         if (strncmp(str, s, len1) == 0) {
1178             *len = len1;
1179             *pos = i;
1180         }
1181     }
1182     if (*len == -1) {
1183         *len = 0;
1184     }
1185 }
1186 
1187 static int
_check_alias_and_seq(unsigned int cp,Py_UCS4 * code,int with_named_seq)1188 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1189 {
1190     /* check if named sequences are allowed */
1191     if (!with_named_seq && IS_NAMED_SEQ(cp))
1192         return 0;
1193     /* if the code point is in the PUA range that we use for aliases,
1194      * convert it to obtain the right code point */
1195     if (IS_ALIAS(cp))
1196         *code = name_aliases[cp-aliases_start];
1197     else
1198         *code = cp;
1199     return 1;
1200 }
1201 
1202 static int
_getcode(PyObject * self,const char * name,int namelen,Py_UCS4 * code,int with_named_seq)1203 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1204          int with_named_seq)
1205 {
1206     /* Return the code point associated with the given name.
1207      * Named aliases are resolved too (unless self != NULL (i.e. we are using
1208      * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
1209      * using for the named sequence, and the caller must then convert it. */
1210     unsigned int h, v;
1211     unsigned int mask = code_size-1;
1212     unsigned int i, incr;
1213 
1214     /* Check for hangul syllables. */
1215     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1216         int len, L = -1, V = -1, T = -1;
1217         const char *pos = name + 16;
1218         find_syllable(pos, &len, &L, LCount, 0);
1219         pos += len;
1220         find_syllable(pos, &len, &V, VCount, 1);
1221         pos += len;
1222         find_syllable(pos, &len, &T, TCount, 2);
1223         pos += len;
1224         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1225             *code = SBase + (L*VCount+V)*TCount + T;
1226             return 1;
1227         }
1228         /* Otherwise, it's an illegal syllable name. */
1229         return 0;
1230     }
1231 
1232     /* Check for unified ideographs. */
1233     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1234         /* Four or five hexdigits must follow. */
1235         v = 0;
1236         name += 22;
1237         namelen -= 22;
1238         if (namelen != 4 && namelen != 5)
1239             return 0;
1240         while (namelen--) {
1241             v *= 16;
1242             if (*name >= '0' && *name <= '9')
1243                 v += *name - '0';
1244             else if (*name >= 'A' && *name <= 'F')
1245                 v += *name - 'A' + 10;
1246             else
1247                 return 0;
1248             name++;
1249         }
1250         if (!is_unified_ideograph(v))
1251             return 0;
1252         *code = v;
1253         return 1;
1254     }
1255 
1256     /* the following is the same as python's dictionary lookup, with
1257        only minor changes.  see the makeunicodedata script for more
1258        details */
1259 
1260     h = (unsigned int) _gethash(name, namelen, code_magic);
1261     i = (~h) & mask;
1262     v = code_hash[i];
1263     if (!v)
1264         return 0;
1265     if (_cmpname(self, v, name, namelen))
1266         return _check_alias_and_seq(v, code, with_named_seq);
1267     incr = (h ^ (h >> 3)) & mask;
1268     if (!incr)
1269         incr = mask;
1270     for (;;) {
1271         i = (i + incr) & mask;
1272         v = code_hash[i];
1273         if (!v)
1274             return 0;
1275         if (_cmpname(self, v, name, namelen))
1276             return _check_alias_and_seq(v, code, with_named_seq);
1277         incr = incr << 1;
1278         if (incr > mask)
1279             incr = incr ^ code_poly;
1280     }
1281 }
1282 
1283 static const _PyUnicode_Name_CAPI hashAPI =
1284 {
1285     sizeof(_PyUnicode_Name_CAPI),
1286     _getucname,
1287     _getcode
1288 };
1289 
1290 /* -------------------------------------------------------------------- */
1291 /* Python bindings */
1292 
1293 /*[clinic input]
1294 unicodedata.UCD.name
1295 
1296     self: self
1297     chr: int(accept={str})
1298     default: object=NULL
1299     /
1300 
1301 Returns the name assigned to the character chr as a string.
1302 
1303 If no name is defined, default is returned, or, if not given,
1304 ValueError is raised.
1305 [clinic start generated code]*/
1306 
1307 static PyObject *
unicodedata_UCD_name_impl(PyObject * self,int chr,PyObject * default_value)1308 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1309 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1310 {
1311     char name[NAME_MAXLEN+1];
1312     Py_UCS4 c = (Py_UCS4)chr;
1313 
1314     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1315         if (default_value == NULL) {
1316             PyErr_SetString(PyExc_ValueError, "no such name");
1317             return NULL;
1318         }
1319         else {
1320             Py_INCREF(default_value);
1321             return default_value;
1322         }
1323     }
1324 
1325     return PyUnicode_FromString(name);
1326 }
1327 
1328 /*[clinic input]
1329 unicodedata.UCD.lookup
1330 
1331     self: self
1332     name: str(accept={str, robuffer}, zeroes=True)
1333     /
1334 
1335 Look up character by name.
1336 
1337 If a character with the given name is found, return the
1338 corresponding character.  If not found, KeyError is raised.
1339 [clinic start generated code]*/
1340 
1341 static PyObject *
unicodedata_UCD_lookup_impl(PyObject * self,const char * name,Py_ssize_clean_t name_length)1342 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1343                             Py_ssize_clean_t name_length)
1344 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
1345 {
1346     Py_UCS4 code;
1347     unsigned int index;
1348     if (name_length > NAME_MAXLEN) {
1349         PyErr_SetString(PyExc_KeyError, "name too long");
1350         return NULL;
1351     }
1352 
1353     if (!_getcode(self, name, (int)name_length, &code, 1)) {
1354         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1355         return NULL;
1356     }
1357     /* check if code is in the PUA range that we use for named sequences
1358        and convert it */
1359     if (IS_NAMED_SEQ(code)) {
1360         index = code-named_sequences_start;
1361         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1362                                          named_sequences[index].seq,
1363                                          named_sequences[index].seqlen);
1364     }
1365     return PyUnicode_FromOrdinal(code);
1366 }
1367 
1368 /* XXX Add doc strings. */
1369 
1370 static PyMethodDef unicodedata_functions[] = {
1371     UNICODEDATA_UCD_DECIMAL_METHODDEF
1372     UNICODEDATA_UCD_DIGIT_METHODDEF
1373     UNICODEDATA_UCD_NUMERIC_METHODDEF
1374     UNICODEDATA_UCD_CATEGORY_METHODDEF
1375     UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1376     UNICODEDATA_UCD_COMBINING_METHODDEF
1377     UNICODEDATA_UCD_MIRRORED_METHODDEF
1378     UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1379     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1380     UNICODEDATA_UCD_NAME_METHODDEF
1381     UNICODEDATA_UCD_LOOKUP_METHODDEF
1382     UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1383     UNICODEDATA_UCD_NORMALIZE_METHODDEF
1384     {NULL, NULL}                /* sentinel */
1385 };
1386 
1387 static PyTypeObject UCD_Type = {
1388         /* The ob_type field must be initialized in the module init function
1389          * to be portable to Windows without using C++. */
1390         PyVarObject_HEAD_INIT(NULL, 0)
1391         "unicodedata.UCD",              /*tp_name*/
1392         sizeof(PreviousDBVersion),      /*tp_basicsize*/
1393         0,                      /*tp_itemsize*/
1394         /* methods */
1395         (destructor)PyObject_Del, /*tp_dealloc*/
1396         0,                      /*tp_vectorcall_offset*/
1397         0,                      /*tp_getattr*/
1398         0,                      /*tp_setattr*/
1399         0,                      /*tp_as_async*/
1400         0,                      /*tp_repr*/
1401         0,                      /*tp_as_number*/
1402         0,                      /*tp_as_sequence*/
1403         0,                      /*tp_as_mapping*/
1404         0,                      /*tp_hash*/
1405         0,                      /*tp_call*/
1406         0,                      /*tp_str*/
1407         PyObject_GenericGetAttr,/*tp_getattro*/
1408         0,                      /*tp_setattro*/
1409         0,                      /*tp_as_buffer*/
1410         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
1411         0,                      /*tp_doc*/
1412         0,                      /*tp_traverse*/
1413         0,                      /*tp_clear*/
1414         0,                      /*tp_richcompare*/
1415         0,                      /*tp_weaklistoffset*/
1416         0,                      /*tp_iter*/
1417         0,                      /*tp_iternext*/
1418         unicodedata_functions,  /*tp_methods*/
1419         DB_members,             /*tp_members*/
1420         0,                      /*tp_getset*/
1421         0,                      /*tp_base*/
1422         0,                      /*tp_dict*/
1423         0,                      /*tp_descr_get*/
1424         0,                      /*tp_descr_set*/
1425         0,                      /*tp_dictoffset*/
1426         0,                      /*tp_init*/
1427         0,                      /*tp_alloc*/
1428         0,                      /*tp_new*/
1429         0,                      /*tp_free*/
1430         0,                      /*tp_is_gc*/
1431 };
1432 
1433 PyDoc_STRVAR(unicodedata_docstring,
1434 "This module provides access to the Unicode Character Database which\n\
1435 defines character properties for all Unicode characters. The data in\n\
1436 this database is based on the UnicodeData.txt file version\n\
1437 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1438 \n\
1439 The module uses the same names and symbols as defined by the\n\
1440 UnicodeData File Format " UNIDATA_VERSION ".");
1441 
1442 static struct PyModuleDef unicodedatamodule = {
1443         PyModuleDef_HEAD_INIT,
1444         "unicodedata",
1445         unicodedata_docstring,
1446         -1,
1447         unicodedata_functions,
1448         NULL,
1449         NULL,
1450         NULL,
1451         NULL
1452 };
1453 
1454 PyMODINIT_FUNC
PyInit_unicodedata(void)1455 PyInit_unicodedata(void)
1456 {
1457     PyObject *m, *v;
1458 
1459     Py_SET_TYPE(&UCD_Type, &PyType_Type);
1460 
1461     m = PyModule_Create(&unicodedatamodule);
1462     if (!m)
1463         return NULL;
1464 
1465     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1466     Py_INCREF(&UCD_Type);
1467     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1468 
1469     /* Previous versions */
1470     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1471     if (v != NULL)
1472         PyModule_AddObject(m, "ucd_3_2_0", v);
1473 
1474     /* Export C API */
1475     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1476     if (v != NULL)
1477         PyModule_AddObject(m, "ucnhash_CAPI", v);
1478     return m;
1479 }
1480 
1481 /*
1482 Local variables:
1483 c-basic-offset: 4
1484 indent-tabs-mode: nil
1485 End:
1486 */
1487