• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ------------------------------------------------------------------------
2 
3    unicodedata -- Provides access to the Unicode database.
4 
5    Data was extracted from the UnicodeData.txt file.
6    The current version number is reported in the unidata_version constant.
7 
8    Written by Marc-Andre Lemburg (mal@lemburg.com).
9    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10    Modified by Martin v. Löwis (martin@v.loewis.de)
11 
12    Copyright (c) Corporation for National Research Initiatives.
13 
14    ------------------------------------------------------------------------ */
15 
16 #define PY_SSIZE_T_CLEAN
17 
18 #include "Python.h"
19 #include "ucnhash.h"
20 #include "structmember.h"
21 
22 /*[clinic input]
23 module unicodedata
24 class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
25 [clinic start generated code]*/
26 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
27 
28 /* character properties */
29 
/* One record of per-character properties; records are looked up through
   _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
    /* quick-check bits, see is_normalized() */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
41 
/* Per-character differences between the current database and an older
   database version.  The accessors in this file treat 0xFF in a *_changed
   byte as "nothing recorded", and category_changed == 0 as "unassigned in
   the old version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
51 
52 /* data file generated by Tools/unicode/makeunicodedata.py */
53 #include "unicodedata_db.h"
54 
55 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)56 _getrecord_ex(Py_UCS4 code)
57 {
58     int index;
59     if (code >= 0x110000)
60         index = 0;
61     else {
62         index = index1[(code>>SHIFT)];
63         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
64     }
65 
66     return &_PyUnicode_Database_Records[index];
67 }
68 
69 /* ------------- Previous-version API ------------------------------------- */
70 typedef struct previous_version {
71     PyObject_HEAD
72     const char *name;
73     const change_record* (*getrecord)(Py_UCS4);
74     Py_UCS4 (*normalization)(Py_UCS4);
75 } PreviousDBVersion;
76 
77 #include "clinic/unicodedata.c.h"
78 
/* Fetch the change record of code point v from the old database version
   wrapped by self (self must point to a PreviousDBVersion).  The argument
   is parenthesized so that an expression argument is cast as a whole. */
#define get_old_record(self, v) \
    ((((PreviousDBVersion *)(self))->getrecord)(v))
80 
81 static PyMemberDef DB_members[] = {
82         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
83         {NULL}
84 };
85 
86 /* forward declaration */
87 static PyTypeObject UCD_Type;
88 #define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
89 
90 static PyObject*
new_previous_version(const char * name,const change_record * (* getrecord)(Py_UCS4),Py_UCS4 (* normalization)(Py_UCS4))91 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
92                      Py_UCS4 (*normalization)(Py_UCS4))
93 {
94         PreviousDBVersion *self;
95         self = PyObject_New(PreviousDBVersion, &UCD_Type);
96         if (self == NULL)
97                 return NULL;
98         self->name = name;
99         self->getrecord = getrecord;
100         self->normalization = normalization;
101         return (PyObject*)self;
102 }
103 
104 
105 /* --- Module API --------------------------------------------------------- */
106 
107 /*[clinic input]
108 unicodedata.UCD.decimal
109 
110     self: self
111     chr: int(accept={str})
112     default: object=NULL
113     /
114 
115 Converts a Unicode character into its equivalent decimal value.
116 
117 Returns the decimal value assigned to the character chr as integer.
118 If no such value is defined, default is returned, or, if not given,
119 ValueError is raised.
120 [clinic start generated code]*/
121 
122 static PyObject *
unicodedata_UCD_decimal_impl(PyObject * self,int chr,PyObject * default_value)123 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
124                              PyObject *default_value)
125 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
126 {
127     int have_old = 0;
128     long rc;
129     Py_UCS4 c = (Py_UCS4)chr;
130 
131     if (self && UCD_Check(self)) {
132         const change_record *old = get_old_record(self, c);
133         if (old->category_changed == 0) {
134             /* unassigned */
135             have_old = 1;
136             rc = -1;
137         }
138         else if (old->decimal_changed != 0xFF) {
139             have_old = 1;
140             rc = old->decimal_changed;
141         }
142     }
143 
144     if (!have_old)
145         rc = Py_UNICODE_TODECIMAL(c);
146     if (rc < 0) {
147         if (default_value == NULL) {
148             PyErr_SetString(PyExc_ValueError,
149                             "not a decimal");
150             return NULL;
151         }
152         else {
153             Py_INCREF(default_value);
154             return default_value;
155         }
156     }
157     return PyLong_FromLong(rc);
158 }
159 
160 /*[clinic input]
161 unicodedata.UCD.digit
162 
163     self: self
164     chr: int(accept={str})
165     default: object=NULL
166     /
167 
168 Converts a Unicode character into its equivalent digit value.
169 
170 Returns the digit value assigned to the character chr as integer.
171 If no such value is defined, default is returned, or, if not given,
172 ValueError is raised.
173 [clinic start generated code]*/
174 
175 static PyObject *
unicodedata_UCD_digit_impl(PyObject * self,int chr,PyObject * default_value)176 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
177 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
178 {
179     long rc;
180     Py_UCS4 c = (Py_UCS4)chr;
181     rc = Py_UNICODE_TODIGIT(c);
182     if (rc < 0) {
183         if (default_value == NULL) {
184             PyErr_SetString(PyExc_ValueError, "not a digit");
185             return NULL;
186         }
187         else {
188             Py_INCREF(default_value);
189             return default_value;
190         }
191     }
192     return PyLong_FromLong(rc);
193 }
194 
195 /*[clinic input]
196 unicodedata.UCD.numeric
197 
198     self: self
199     chr: int(accept={str})
200     default: object=NULL
201     /
202 
203 Converts a Unicode character into its equivalent numeric value.
204 
205 Returns the numeric value assigned to the character chr as float.
206 If no such value is defined, default is returned, or, if not given,
207 ValueError is raised.
208 [clinic start generated code]*/
209 
210 static PyObject *
unicodedata_UCD_numeric_impl(PyObject * self,int chr,PyObject * default_value)211 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
212                              PyObject *default_value)
213 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
214 {
215     int have_old = 0;
216     double rc;
217     Py_UCS4 c = (Py_UCS4)chr;
218 
219     if (self && UCD_Check(self)) {
220         const change_record *old = get_old_record(self, c);
221         if (old->category_changed == 0) {
222             /* unassigned */
223             have_old = 1;
224             rc = -1.0;
225         }
226         else if (old->decimal_changed != 0xFF) {
227             have_old = 1;
228             rc = old->decimal_changed;
229         }
230     }
231 
232     if (!have_old)
233         rc = Py_UNICODE_TONUMERIC(c);
234     if (rc == -1.0) {
235         if (default_value == NULL) {
236             PyErr_SetString(PyExc_ValueError, "not a numeric character");
237             return NULL;
238         }
239         else {
240             Py_INCREF(default_value);
241             return default_value;
242         }
243     }
244     return PyFloat_FromDouble(rc);
245 }
246 
247 /*[clinic input]
248 unicodedata.UCD.category
249 
250     self: self
251     chr: int(accept={str})
252     /
253 
254 Returns the general category assigned to the character chr as string.
255 [clinic start generated code]*/
256 
257 static PyObject *
unicodedata_UCD_category_impl(PyObject * self,int chr)258 unicodedata_UCD_category_impl(PyObject *self, int chr)
259 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
260 {
261     int index;
262     Py_UCS4 c = (Py_UCS4)chr;
263     index = (int) _getrecord_ex(c)->category;
264     if (self && UCD_Check(self)) {
265         const change_record *old = get_old_record(self, c);
266         if (old->category_changed != 0xFF)
267             index = old->category_changed;
268     }
269     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
270 }
271 
272 /*[clinic input]
273 unicodedata.UCD.bidirectional
274 
275     self: self
276     chr: int(accept={str})
277     /
278 
279 Returns the bidirectional class assigned to the character chr as string.
280 
281 If no such value is defined, an empty string is returned.
282 [clinic start generated code]*/
283 
284 static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject * self,int chr)285 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
286 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
287 {
288     int index;
289     Py_UCS4 c = (Py_UCS4)chr;
290     index = (int) _getrecord_ex(c)->bidirectional;
291     if (self && UCD_Check(self)) {
292         const change_record *old = get_old_record(self, c);
293         if (old->category_changed == 0)
294             index = 0; /* unassigned */
295         else if (old->bidir_changed != 0xFF)
296             index = old->bidir_changed;
297     }
298     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
299 }
300 
301 /*[clinic input]
302 unicodedata.UCD.combining -> int
303 
304     self: self
305     chr: int(accept={str})
306     /
307 
308 Returns the canonical combining class assigned to the character chr as integer.
309 
310 Returns 0 if no combining class is defined.
311 [clinic start generated code]*/
312 
313 static int
unicodedata_UCD_combining_impl(PyObject * self,int chr)314 unicodedata_UCD_combining_impl(PyObject *self, int chr)
315 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
316 {
317     int index;
318     Py_UCS4 c = (Py_UCS4)chr;
319     index = (int) _getrecord_ex(c)->combining;
320     if (self && UCD_Check(self)) {
321         const change_record *old = get_old_record(self, c);
322         if (old->category_changed == 0)
323             index = 0; /* unassigned */
324     }
325     return index;
326 }
327 
328 /*[clinic input]
329 unicodedata.UCD.mirrored -> int
330 
331     self: self
332     chr: int(accept={str})
333     /
334 
335 Returns the mirrored property assigned to the character chr as integer.
336 
337 Returns 1 if the character has been identified as a "mirrored"
338 character in bidirectional text, 0 otherwise.
339 [clinic start generated code]*/
340 
341 static int
unicodedata_UCD_mirrored_impl(PyObject * self,int chr)342 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
343 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
344 {
345     int index;
346     Py_UCS4 c = (Py_UCS4)chr;
347     index = (int) _getrecord_ex(c)->mirrored;
348     if (self && UCD_Check(self)) {
349         const change_record *old = get_old_record(self, c);
350         if (old->category_changed == 0)
351             index = 0; /* unassigned */
352         else if (old->mirrored_changed != 0xFF)
353             index = old->mirrored_changed;
354     }
355     return index;
356 }
357 
358 /*[clinic input]
359 unicodedata.UCD.east_asian_width
360 
361     self: self
362     chr: int(accept={str})
363     /
364 
365 Returns the east asian width assigned to the character chr as string.
366 [clinic start generated code]*/
367 
368 static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject * self,int chr)369 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
370 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
371 {
372     int index;
373     Py_UCS4 c = (Py_UCS4)chr;
374     index = (int) _getrecord_ex(c)->east_asian_width;
375     if (self && UCD_Check(self)) {
376         const change_record *old = get_old_record(self, c);
377         if (old->category_changed == 0)
378             index = 0; /* unassigned */
379         else if (old->east_asian_width_changed != 0xFF)
380             index = old->east_asian_width_changed;
381     }
382     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
383 }
384 
385 /*[clinic input]
386 unicodedata.UCD.decomposition
387 
388     self: self
389     chr: int(accept={str})
390     /
391 
392 Returns the character decomposition mapping assigned to the character chr as string.
393 
394 An empty string is returned in case no such mapping is defined.
395 [clinic start generated code]*/
396 
397 static PyObject *
unicodedata_UCD_decomposition_impl(PyObject * self,int chr)398 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
399 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
400 {
401     char decomp[256];
402     int code, index, count;
403     size_t i;
404     unsigned int prefix_index;
405     Py_UCS4 c = (Py_UCS4)chr;
406 
407     code = (int)c;
408 
409     if (self && UCD_Check(self)) {
410         const change_record *old = get_old_record(self, c);
411         if (old->category_changed == 0)
412             return PyUnicode_FromString(""); /* unassigned */
413     }
414 
415     if (code < 0 || code >= 0x110000)
416         index = 0;
417     else {
418         index = decomp_index1[(code>>DECOMP_SHIFT)];
419         index = decomp_index2[(index<<DECOMP_SHIFT)+
420                              (code&((1<<DECOMP_SHIFT)-1))];
421     }
422 
423     /* high byte is number of hex bytes (usually one or two), low byte
424        is prefix code (from*/
425     count = decomp_data[index] >> 8;
426 
427     /* XXX: could allocate the PyString up front instead
428        (strlen(prefix) + 5 * count + 1 bytes) */
429 
430     /* Based on how index is calculated above and decomp_data is generated
431        from Tools/unicode/makeunicodedata.py, it should not be possible
432        to overflow decomp_prefix. */
433     prefix_index = decomp_data[index] & 255;
434     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
435 
436     /* copy prefix */
437     i = strlen(decomp_prefix[prefix_index]);
438     memcpy(decomp, decomp_prefix[prefix_index], i);
439 
440     while (count-- > 0) {
441         if (i)
442             decomp[i++] = ' ';
443         assert(i < sizeof(decomp));
444         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
445                       decomp_data[++index]);
446         i += strlen(decomp + i);
447     }
448     return PyUnicode_FromStringAndSize(decomp, i);
449 }
450 
451 static void
get_decomp_record(PyObject * self,Py_UCS4 code,int * index,int * prefix,int * count)452 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
453 {
454     if (code >= 0x110000) {
455         *index = 0;
456     } else if (self && UCD_Check(self) &&
457                get_old_record(self, code)->category_changed==0) {
458         /* unassigned in old version */
459         *index = 0;
460     }
461     else {
462         *index = decomp_index1[(code>>DECOMP_SHIFT)];
463         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
464                                (code&((1<<DECOMP_SHIFT)-1))];
465     }
466 
467     /* high byte is number of hex bytes (usually one or two), low byte
468        is prefix code (from*/
469     *count = decomp_data[*index] >> 8;
470     *prefix = decomp_data[*index] & 255;
471 
472     (*index)++;
473 }
474 
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00                  /* first precomposed syllable */
#define LBase   0x1100                  /* first leading consonant */
#define VBase   0x1161                  /* first vowel */
#define TBase   0x11A7                  /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)         /* syllables per leading consonant */
#define SCount  (LCount*NCount)         /* total number of syllables */
484 
485 static PyObject*
nfd_nfkd(PyObject * self,PyObject * input,int k)486 nfd_nfkd(PyObject *self, PyObject *input, int k)
487 {
488     PyObject *result;
489     Py_UCS4 *output;
490     Py_ssize_t i, o, osize;
491     int kind;
492     void *data;
493     /* Longest decomposition in Unicode 3.2: U+FDFA */
494     Py_UCS4 stack[20];
495     Py_ssize_t space, isize;
496     int index, prefix, count, stackptr;
497     unsigned char prev, cur;
498 
499     stackptr = 0;
500     isize = PyUnicode_GET_LENGTH(input);
501     space = isize;
502     /* Overallocate at most 10 characters. */
503     if (space > 10) {
504         if (space <= PY_SSIZE_T_MAX - 10)
505             space += 10;
506     }
507     else {
508         space *= 2;
509     }
510     osize = space;
511     output = PyMem_NEW(Py_UCS4, space);
512     if (!output) {
513         PyErr_NoMemory();
514         return NULL;
515     }
516     i = o = 0;
517     kind = PyUnicode_KIND(input);
518     data = PyUnicode_DATA(input);
519 
520     while (i < isize) {
521         stack[stackptr++] = PyUnicode_READ(kind, data, i++);
522         while(stackptr) {
523             Py_UCS4 code = stack[--stackptr];
524             /* Hangul Decomposition adds three characters in
525                a single step, so we need at least that much room. */
526             if (space < 3) {
527                 Py_UCS4 *new_output;
528                 osize += 10;
529                 space += 10;
530                 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
531                 if (new_output == NULL) {
532                     PyMem_Free(output);
533                     PyErr_NoMemory();
534                     return NULL;
535                 }
536                 output = new_output;
537             }
538             /* Hangul Decomposition. */
539             if (SBase <= code && code < (SBase+SCount)) {
540                 int SIndex = code - SBase;
541                 int L = LBase + SIndex / NCount;
542                 int V = VBase + (SIndex % NCount) / TCount;
543                 int T = TBase + SIndex % TCount;
544                 output[o++] = L;
545                 output[o++] = V;
546                 space -= 2;
547                 if (T != TBase) {
548                     output[o++] = T;
549                     space --;
550                 }
551                 continue;
552             }
553             /* normalization changes */
554             if (self && UCD_Check(self)) {
555                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
556                 if (value != 0) {
557                     stack[stackptr++] = value;
558                     continue;
559                 }
560             }
561 
562             /* Other decompositions. */
563             get_decomp_record(self, code, &index, &prefix, &count);
564 
565             /* Copy character if it is not decomposable, or has a
566                compatibility decomposition, but we do NFD. */
567             if (!count || (prefix && !k)) {
568                 output[o++] = code;
569                 space--;
570                 continue;
571             }
572             /* Copy decomposition onto the stack, in reverse
573                order.  */
574             while(count) {
575                 code = decomp_data[index + (--count)];
576                 stack[stackptr++] = code;
577             }
578         }
579     }
580 
581     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
582                                        output, o);
583     PyMem_Free(output);
584     if (!result)
585         return NULL;
586     /* result is guaranteed to be ready, as it is compact. */
587     kind = PyUnicode_KIND(result);
588     data = PyUnicode_DATA(result);
589 
590     /* Sort canonically. */
591     i = 0;
592     prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
593     for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
594         cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
595         if (prev == 0 || cur == 0 || prev <= cur) {
596             prev = cur;
597             continue;
598         }
599         /* Non-canonical order. Need to switch *i with previous. */
600         o = i - 1;
601         while (1) {
602             Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
603             PyUnicode_WRITE(kind, data, o+1,
604                             PyUnicode_READ(kind, data, o));
605             PyUnicode_WRITE(kind, data, o, tmp);
606             o--;
607             if (o < 0)
608                 break;
609             prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
610             if (prev == 0 || prev <= cur)
611                 break;
612         }
613         prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
614     }
615     return result;
616 }
617 
618 static int
find_nfc_index(PyObject * self,struct reindex * nfc,Py_UCS4 code)619 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
620 {
621     unsigned int index;
622     for (index = 0; nfc[index].start; index++) {
623         unsigned int start = nfc[index].start;
624         if (code < start)
625             return -1;
626         if (code <= start + nfc[index].count) {
627             unsigned int delta = code - start;
628             return nfc[index].index + delta;
629         }
630     }
631     return -1;
632 }
633 
634 static PyObject*
nfc_nfkc(PyObject * self,PyObject * input,int k)635 nfc_nfkc(PyObject *self, PyObject *input, int k)
636 {
637     PyObject *result;
638     int kind;
639     void *data;
640     Py_UCS4 *output;
641     Py_ssize_t i, i1, o, len;
642     int f,l,index,index1,comb;
643     Py_UCS4 code;
644     Py_ssize_t skipped[20];
645     int cskipped = 0;
646 
647     result = nfd_nfkd(self, input, k);
648     if (!result)
649         return NULL;
650     /* result will be "ready". */
651     kind = PyUnicode_KIND(result);
652     data = PyUnicode_DATA(result);
653     len = PyUnicode_GET_LENGTH(result);
654 
655     /* We allocate a buffer for the output.
656        If we find that we made no changes, we still return
657        the NFD result. */
658     output = PyMem_NEW(Py_UCS4, len);
659     if (!output) {
660         PyErr_NoMemory();
661         Py_DECREF(result);
662         return 0;
663     }
664     i = o = 0;
665 
666   again:
667     while (i < len) {
668       for (index = 0; index < cskipped; index++) {
669           if (skipped[index] == i) {
670               /* *i character is skipped.
671                  Remove from list. */
672               skipped[index] = skipped[cskipped-1];
673               cskipped--;
674               i++;
675               goto again; /* continue while */
676           }
677       }
678       /* Hangul Composition. We don't need to check for <LV,T>
679          pairs, since we always have decomposed data. */
680       code = PyUnicode_READ(kind, data, i);
681       if (LBase <= code && code < (LBase+LCount) &&
682           i + 1 < len &&
683           VBase <= PyUnicode_READ(kind, data, i+1) &&
684           PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
685           /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
686              and V character is a modern vowel (0x1161 ~ 0x1175). */
687           int LIndex, VIndex;
688           LIndex = code - LBase;
689           VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
690           code = SBase + (LIndex*VCount+VIndex)*TCount;
691           i+=2;
692           if (i < len &&
693               TBase < PyUnicode_READ(kind, data, i) &&
694               PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
695               /* check T character is a modern trailing consonant
696                  (0x11A8 ~ 0x11C2). */
697               code += PyUnicode_READ(kind, data, i)-TBase;
698               i++;
699           }
700           output[o++] = code;
701           continue;
702       }
703 
704       /* code is still input[i] here */
705       f = find_nfc_index(self, nfc_first, code);
706       if (f == -1) {
707           output[o++] = code;
708           i++;
709           continue;
710       }
711       /* Find next unblocked character. */
712       i1 = i+1;
713       comb = 0;
714       /* output base character for now; might be updated later. */
715       output[o] = PyUnicode_READ(kind, data, i);
716       while (i1 < len) {
717           Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
718           int comb1 = _getrecord_ex(code1)->combining;
719           if (comb) {
720               if (comb1 == 0)
721                   break;
722               if (comb >= comb1) {
723                   /* Character is blocked. */
724                   i1++;
725                   continue;
726               }
727           }
728           l = find_nfc_index(self, nfc_last, code1);
729           /* i1 cannot be combined with i. If i1
730              is a starter, we don't need to look further.
731              Otherwise, record the combining class. */
732           if (l == -1) {
733             not_combinable:
734               if (comb1 == 0)
735                   break;
736               comb = comb1;
737               i1++;
738               continue;
739           }
740           index = f*TOTAL_LAST + l;
741           index1 = comp_index[index >> COMP_SHIFT];
742           code = comp_data[(index1<<COMP_SHIFT)+
743                            (index&((1<<COMP_SHIFT)-1))];
744           if (code == 0)
745               goto not_combinable;
746 
747           /* Replace the original character. */
748           output[o] = code;
749           /* Mark the second character unused. */
750           assert(cskipped < 20);
751           skipped[cskipped++] = i1;
752           i1++;
753           f = find_nfc_index(self, nfc_first, output[o]);
754           if (f == -1)
755               break;
756       }
757       /* Output character was already written.
758          Just advance the indices. */
759       o++; i++;
760     }
761     if (o == len) {
762         /* No changes. Return original string. */
763         PyMem_Free(output);
764         return result;
765     }
766     Py_DECREF(result);
767     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
768                                        output, o);
769     PyMem_Free(output);
770     return result;
771 }
772 
773 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
774 static int
is_normalized(PyObject * self,PyObject * input,int nfc,int k)775 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
776 {
777     Py_ssize_t i, len;
778     int kind;
779     void *data;
780     unsigned char prev_combining = 0, quickcheck_mask;
781 
782     /* An older version of the database is requested, quickchecks must be
783        disabled. */
784     if (self && UCD_Check(self))
785         return 0;
786 
787     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
788        as described in http://unicode.org/reports/tr15/#Annex8. */
789     quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
790 
791     i = 0;
792     kind = PyUnicode_KIND(input);
793     data = PyUnicode_DATA(input);
794     len = PyUnicode_GET_LENGTH(input);
795     while (i < len) {
796         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
797         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
798         unsigned char combining = record->combining;
799         unsigned char quickcheck = record->normalization_quick_check;
800 
801         if (quickcheck & quickcheck_mask)
802             return 0; /* this string might need normalization */
803         if (combining && prev_combining > combining)
804             return 0; /* non-canonical sort order, not normalized */
805         prev_combining = combining;
806     }
807     return 1; /* certainly normalized */
808 }
809 
810 /*[clinic input]
811 unicodedata.UCD.normalize
812 
813     self: self
814     form: str
815     unistr as input: unicode
816     /
817 
818 Return the normal form 'form' for the Unicode string unistr.
819 
820 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
821 [clinic start generated code]*/
822 
823 static PyObject *
unicodedata_UCD_normalize_impl(PyObject * self,const char * form,PyObject * input)824 unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
825                                PyObject *input)
826 /*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
827 {
828     if (PyUnicode_GET_LENGTH(input) == 0) {
829         /* Special case empty input strings, since resizing
830            them  later would cause internal errors. */
831         Py_INCREF(input);
832         return input;
833     }
834 
835     if (strcmp(form, "NFC") == 0) {
836         if (is_normalized(self, input, 1, 0)) {
837             Py_INCREF(input);
838             return input;
839         }
840         return nfc_nfkc(self, input, 0);
841     }
842     if (strcmp(form, "NFKC") == 0) {
843         if (is_normalized(self, input, 1, 1)) {
844             Py_INCREF(input);
845             return input;
846         }
847         return nfc_nfkc(self, input, 1);
848     }
849     if (strcmp(form, "NFD") == 0) {
850         if (is_normalized(self, input, 0, 0)) {
851             Py_INCREF(input);
852             return input;
853         }
854         return nfd_nfkd(self, input, 0);
855     }
856     if (strcmp(form, "NFKD") == 0) {
857         if (is_normalized(self, input, 0, 1)) {
858             Py_INCREF(input);
859             return input;
860         }
861         return nfd_nfkd(self, input, 1);
862     }
863     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
864     return NULL;
865 }
866 
867 /* -------------------------------------------------------------------- */
868 /* unicode character name tables */
869 
870 /* data file generated by Tools/unicode/makeunicodedata.py */
871 #include "unicodename_db.h"
872 
873 /* -------------------------------------------------------------------- */
874 /* database code (cut and pasted from the unidb package) */
875 
/* Case-insensitive hash of the first len bytes of s.

   Every byte is upper-cased and folded in as hash * scale + byte.  Whenever
   any of bits 24-31 become set, that byte is XORed into the low bits and
   the hash is cut back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; ++pos) {
        unsigned long overflow;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        overflow = hash & 0xff000000;
        if (overflow != 0)
            hash = (hash ^ ((overflow>>24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
890 
/* Hangul jamo short names, one column per jamo kind: column 0 has 19
   entries (cf. LCount), column 1 has 21 (cf. VCount) and column 2 has 28
   (cf. TCount).  Shorter columns are padded with NULL.
   Presumably used to spell Hangul syllable names -- verify against the
   callers. */
static const char * const hangul_syllables[][3] = {
    /*  0 */ { "G",  "A",   ""   },
    /*  1 */ { "GG", "AE",  "G"  },
    /*  2 */ { "N",  "YA",  "GG" },
    /*  3 */ { "D",  "YAE", "GS" },
    /*  4 */ { "DD", "EO",  "N"  },
    /*  5 */ { "R",  "E",   "NJ" },
    /*  6 */ { "M",  "YEO", "NH" },
    /*  7 */ { "B",  "YE",  "D"  },
    /*  8 */ { "BB", "O",   "L"  },
    /*  9 */ { "S",  "WA",  "LG" },
    /* 10 */ { "SS", "WAE", "LM" },
    /* 11 */ { "",   "OE",  "LB" },
    /* 12 */ { "J",  "YO",  "LS" },
    /* 13 */ { "JJ", "U",   "LT" },
    /* 14 */ { "C",  "WEO", "LP" },
    /* 15 */ { "K",  "WE",  "LH" },
    /* 16 */ { "T",  "WI",  "M"  },
    /* 17 */ { "P",  "YU",  "B"  },
    /* 18 */ { "H",  "EU",  "BS" },
    /* 19 */ { NULL, "YI",  "S"  },
    /* 20 */ { NULL, "I",   "SS" },
    /* 21 */ { NULL, NULL,  "NG" },
    /* 22 */ { NULL, NULL,  "J"  },
    /* 23 */ { NULL, NULL,  "C"  },
    /* 24 */ { NULL, NULL,  "K"  },
    /* 25 */ { NULL, NULL,  "T"  },
    /* 26 */ { NULL, NULL,  "P"  },
    /* 27 */ { NULL, NULL,  "H"  }
};
921 
922 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
923 static int
is_unified_ideograph(Py_UCS4 code)924 is_unified_ideograph(Py_UCS4 code)
925 {
926     return
927         (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
928         (0x4E00 <= code && code <= 0x9FEF)   || /* CJK Ideograph */
929         (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
930         (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
931         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
932         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
933         (0x2CEB0 <= code && code <= 0x2EBEF);   /* CJK Ideograph Extension F */
934 }
935 
/* Macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  Both ranges are
 * half-open: the start value is included, the end value is not.
 * The argument is parenthesized so that compound expressions (for example
 * `a & b`) are compared as a whole; note that it is evaluated twice. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
941 
942 static int
_getucname(PyObject * self,Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)943 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
944            int with_alias_and_seq)
945 {
946     /* Find the name associated with the given code point.
947      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
948      * that we are using for aliases and named sequences. */
949     int offset;
950     int i;
951     int word;
952     unsigned char* w;
953 
954     if (code >= 0x110000)
955         return 0;
956 
957     /* XXX should we just skip all the code points in the PUAs here? */
958     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
959         return 0;
960 
961     if (self && UCD_Check(self)) {
962         /* in 3.2.0 there are no aliases and named sequences */
963         const change_record *old;
964         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
965             return 0;
966         old = get_old_record(self, code);
967         if (old->category_changed == 0) {
968             /* unassigned */
969             return 0;
970         }
971     }
972 
973     if (SBase <= code && code < SBase+SCount) {
974         /* Hangul syllable. */
975         int SIndex = code - SBase;
976         int L = SIndex / NCount;
977         int V = (SIndex % NCount) / TCount;
978         int T = SIndex % TCount;
979 
980         if (buflen < 27)
981             /* Worst case: HANGUL SYLLABLE <10chars>. */
982             return 0;
983         strcpy(buffer, "HANGUL SYLLABLE ");
984         buffer += 16;
985         strcpy(buffer, hangul_syllables[L][0]);
986         buffer += strlen(hangul_syllables[L][0]);
987         strcpy(buffer, hangul_syllables[V][1]);
988         buffer += strlen(hangul_syllables[V][1]);
989         strcpy(buffer, hangul_syllables[T][2]);
990         buffer += strlen(hangul_syllables[T][2]);
991         *buffer = '\0';
992         return 1;
993     }
994 
995     if (is_unified_ideograph(code)) {
996         if (buflen < 28)
997             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
998             return 0;
999         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1000         return 1;
1001     }
1002 
1003     /* get offset into phrasebook */
1004     offset = phrasebook_offset1[(code>>phrasebook_shift)];
1005     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1006                                (code&((1<<phrasebook_shift)-1))];
1007     if (!offset)
1008         return 0;
1009 
1010     i = 0;
1011 
1012     for (;;) {
1013         /* get word index */
1014         word = phrasebook[offset] - phrasebook_short;
1015         if (word >= 0) {
1016             word = (word << 8) + phrasebook[offset+1];
1017             offset += 2;
1018         } else
1019             word = phrasebook[offset++];
1020         if (i) {
1021             if (i > buflen)
1022                 return 0; /* buffer overflow */
1023             buffer[i++] = ' ';
1024         }
1025         /* copy word string from lexicon.  the last character in the
1026            word has bit 7 set.  the last word in a string ends with
1027            0x80 */
1028         w = lexicon + lexicon_offset[word];
1029         while (*w < 128) {
1030             if (i >= buflen)
1031                 return 0; /* buffer overflow */
1032             buffer[i++] = *w++;
1033         }
1034         if (i >= buflen)
1035             return 0; /* buffer overflow */
1036         buffer[i++] = *w & 127;
1037         if (*w == 128)
1038             break; /* end of word */
1039     }
1040 
1041     return 1;
1042 }
1043 
1044 static int
_cmpname(PyObject * self,int code,const char * name,int namelen)1045 _cmpname(PyObject *self, int code, const char* name, int namelen)
1046 {
1047     /* check if code corresponds to the given name */
1048     int i;
1049     char buffer[NAME_MAXLEN+1];
1050     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1051         return 0;
1052     for (i = 0; i < namelen; i++) {
1053         if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
1054             return 0;
1055     }
1056     return buffer[namelen] == '\0';
1057 }
1058 
1059 static void
find_syllable(const char * str,int * len,int * pos,int count,int column)1060 find_syllable(const char *str, int *len, int *pos, int count, int column)
1061 {
1062     int i, len1;
1063     *len = -1;
1064     for (i = 0; i < count; i++) {
1065         const char *s = hangul_syllables[i][column];
1066         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1067         if (len1 <= *len)
1068             continue;
1069         if (strncmp(str, s, len1) == 0) {
1070             *len = len1;
1071             *pos = i;
1072         }
1073     }
1074     if (*len == -1) {
1075         *len = 0;
1076     }
1077 }
1078 
1079 static int
_check_alias_and_seq(unsigned int cp,Py_UCS4 * code,int with_named_seq)1080 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1081 {
1082     /* check if named sequences are allowed */
1083     if (!with_named_seq && IS_NAMED_SEQ(cp))
1084         return 0;
1085     /* if the code point is in the PUA range that we use for aliases,
1086      * convert it to obtain the right code point */
1087     if (IS_ALIAS(cp))
1088         *code = name_aliases[cp-aliases_start];
1089     else
1090         *code = cp;
1091     return 1;
1092 }
1093 
1094 static int
_getcode(PyObject * self,const char * name,int namelen,Py_UCS4 * code,int with_named_seq)1095 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1096          int with_named_seq)
1097 {
1098     /* Return the code point associated with the given name.
1099      * Named aliases are resolved too (unless self != NULL (i.e. we are using
1100      * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
1101      * using for the named sequence, and the caller must then convert it. */
1102     unsigned int h, v;
1103     unsigned int mask = code_size-1;
1104     unsigned int i, incr;
1105 
1106     /* Check for hangul syllables. */
1107     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1108         int len, L = -1, V = -1, T = -1;
1109         const char *pos = name + 16;
1110         find_syllable(pos, &len, &L, LCount, 0);
1111         pos += len;
1112         find_syllable(pos, &len, &V, VCount, 1);
1113         pos += len;
1114         find_syllable(pos, &len, &T, TCount, 2);
1115         pos += len;
1116         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1117             *code = SBase + (L*VCount+V)*TCount + T;
1118             return 1;
1119         }
1120         /* Otherwise, it's an illegal syllable name. */
1121         return 0;
1122     }
1123 
1124     /* Check for unified ideographs. */
1125     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1126         /* Four or five hexdigits must follow. */
1127         v = 0;
1128         name += 22;
1129         namelen -= 22;
1130         if (namelen != 4 && namelen != 5)
1131             return 0;
1132         while (namelen--) {
1133             v *= 16;
1134             if (*name >= '0' && *name <= '9')
1135                 v += *name - '0';
1136             else if (*name >= 'A' && *name <= 'F')
1137                 v += *name - 'A' + 10;
1138             else
1139                 return 0;
1140             name++;
1141         }
1142         if (!is_unified_ideograph(v))
1143             return 0;
1144         *code = v;
1145         return 1;
1146     }
1147 
1148     /* the following is the same as python's dictionary lookup, with
1149        only minor changes.  see the makeunicodedata script for more
1150        details */
1151 
1152     h = (unsigned int) _gethash(name, namelen, code_magic);
1153     i = (~h) & mask;
1154     v = code_hash[i];
1155     if (!v)
1156         return 0;
1157     if (_cmpname(self, v, name, namelen))
1158         return _check_alias_and_seq(v, code, with_named_seq);
1159     incr = (h ^ (h >> 3)) & mask;
1160     if (!incr)
1161         incr = mask;
1162     for (;;) {
1163         i = (i + incr) & mask;
1164         v = code_hash[i];
1165         if (!v)
1166             return 0;
1167         if (_cmpname(self, v, name, namelen))
1168             return _check_alias_and_seq(v, code, with_named_seq);
1169         incr = incr << 1;
1170         if (incr > mask)
1171             incr = incr ^ code_poly;
1172     }
1173 }
1174 
1175 static const _PyUnicode_Name_CAPI hashAPI =
1176 {
1177     sizeof(_PyUnicode_Name_CAPI),
1178     _getucname,
1179     _getcode
1180 };
1181 
1182 /* -------------------------------------------------------------------- */
1183 /* Python bindings */
1184 
1185 /*[clinic input]
1186 unicodedata.UCD.name
1187 
1188     self: self
1189     chr: int(accept={str})
1190     default: object=NULL
1191     /
1192 
1193 Returns the name assigned to the character chr as a string.
1194 
1195 If no name is defined, default is returned, or, if not given,
1196 ValueError is raised.
1197 [clinic start generated code]*/
1198 
1199 static PyObject *
unicodedata_UCD_name_impl(PyObject * self,int chr,PyObject * default_value)1200 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1201 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1202 {
1203     char name[NAME_MAXLEN+1];
1204     Py_UCS4 c = (Py_UCS4)chr;
1205 
1206     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1207         if (default_value == NULL) {
1208             PyErr_SetString(PyExc_ValueError, "no such name");
1209             return NULL;
1210         }
1211         else {
1212             Py_INCREF(default_value);
1213             return default_value;
1214         }
1215     }
1216 
1217     return PyUnicode_FromString(name);
1218 }
1219 
1220 /*[clinic input]
1221 unicodedata.UCD.lookup
1222 
1223     self: self
1224     name: str(accept={str, robuffer}, zeroes=True)
1225     /
1226 
1227 Look up character by name.
1228 
1229 If a character with the given name is found, return the
1230 corresponding character.  If not found, KeyError is raised.
1231 [clinic start generated code]*/
1232 
1233 static PyObject *
unicodedata_UCD_lookup_impl(PyObject * self,const char * name,Py_ssize_clean_t name_length)1234 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1235                             Py_ssize_clean_t name_length)
1236 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
1237 {
1238     Py_UCS4 code;
1239     unsigned int index;
1240     if (name_length > NAME_MAXLEN) {
1241         PyErr_SetString(PyExc_KeyError, "name too long");
1242         return NULL;
1243     }
1244 
1245     if (!_getcode(self, name, (int)name_length, &code, 1)) {
1246         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1247         return NULL;
1248     }
1249     /* check if code is in the PUA range that we use for named sequences
1250        and convert it */
1251     if (IS_NAMED_SEQ(code)) {
1252         index = code-named_sequences_start;
1253         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1254                                          named_sequences[index].seq,
1255                                          named_sequences[index].seqlen);
1256     }
1257     return PyUnicode_FromOrdinal(code);
1258 }
1259 
1260 /* XXX Add doc strings. */
1261 
1262 static PyMethodDef unicodedata_functions[] = {
1263     UNICODEDATA_UCD_DECIMAL_METHODDEF
1264     UNICODEDATA_UCD_DIGIT_METHODDEF
1265     UNICODEDATA_UCD_NUMERIC_METHODDEF
1266     UNICODEDATA_UCD_CATEGORY_METHODDEF
1267     UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1268     UNICODEDATA_UCD_COMBINING_METHODDEF
1269     UNICODEDATA_UCD_MIRRORED_METHODDEF
1270     UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1271     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1272     UNICODEDATA_UCD_NAME_METHODDEF
1273     UNICODEDATA_UCD_LOOKUP_METHODDEF
1274     UNICODEDATA_UCD_NORMALIZE_METHODDEF
1275     {NULL, NULL}                /* sentinel */
1276 };
1277 
1278 static PyTypeObject UCD_Type = {
1279         /* The ob_type field must be initialized in the module init function
1280          * to be portable to Windows without using C++. */
1281         PyVarObject_HEAD_INIT(NULL, 0)
1282         "unicodedata.UCD",              /*tp_name*/
1283         sizeof(PreviousDBVersion),      /*tp_basicsize*/
1284         0,                      /*tp_itemsize*/
1285         /* methods */
1286         (destructor)PyObject_Del, /*tp_dealloc*/
1287         0,                      /*tp_print*/
1288         0,                      /*tp_getattr*/
1289         0,                      /*tp_setattr*/
1290         0,                      /*tp_reserved*/
1291         0,                      /*tp_repr*/
1292         0,                      /*tp_as_number*/
1293         0,                      /*tp_as_sequence*/
1294         0,                      /*tp_as_mapping*/
1295         0,                      /*tp_hash*/
1296         0,                      /*tp_call*/
1297         0,                      /*tp_str*/
1298         PyObject_GenericGetAttr,/*tp_getattro*/
1299         0,                      /*tp_setattro*/
1300         0,                      /*tp_as_buffer*/
1301         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
1302         0,                      /*tp_doc*/
1303         0,                      /*tp_traverse*/
1304         0,                      /*tp_clear*/
1305         0,                      /*tp_richcompare*/
1306         0,                      /*tp_weaklistoffset*/
1307         0,                      /*tp_iter*/
1308         0,                      /*tp_iternext*/
1309         unicodedata_functions,  /*tp_methods*/
1310         DB_members,             /*tp_members*/
1311         0,                      /*tp_getset*/
1312         0,                      /*tp_base*/
1313         0,                      /*tp_dict*/
1314         0,                      /*tp_descr_get*/
1315         0,                      /*tp_descr_set*/
1316         0,                      /*tp_dictoffset*/
1317         0,                      /*tp_init*/
1318         0,                      /*tp_alloc*/
1319         0,                      /*tp_new*/
1320         0,                      /*tp_free*/
1321         0,                      /*tp_is_gc*/
1322 };
1323 
1324 PyDoc_STRVAR(unicodedata_docstring,
1325 "This module provides access to the Unicode Character Database which\n\
1326 defines character properties for all Unicode characters. The data in\n\
1327 this database is based on the UnicodeData.txt file version\n\
1328 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1329 \n\
1330 The module uses the same names and symbols as defined by the\n\
1331 UnicodeData File Format " UNIDATA_VERSION ".");
1332 
1333 static struct PyModuleDef unicodedatamodule = {
1334         PyModuleDef_HEAD_INIT,
1335         "unicodedata",
1336         unicodedata_docstring,
1337         -1,
1338         unicodedata_functions,
1339         NULL,
1340         NULL,
1341         NULL,
1342         NULL
1343 };
1344 
1345 PyMODINIT_FUNC
PyInit_unicodedata(void)1346 PyInit_unicodedata(void)
1347 {
1348     PyObject *m, *v;
1349 
1350     Py_TYPE(&UCD_Type) = &PyType_Type;
1351 
1352     m = PyModule_Create(&unicodedatamodule);
1353     if (!m)
1354         return NULL;
1355 
1356     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1357     Py_INCREF(&UCD_Type);
1358     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1359 
1360     /* Previous versions */
1361     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1362     if (v != NULL)
1363         PyModule_AddObject(m, "ucd_3_2_0", v);
1364 
1365     /* Export C API */
1366     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1367     if (v != NULL)
1368         PyModule_AddObject(m, "ucnhash_CAPI", v);
1369     return m;
1370 }
1371 
1372 /*
1373 Local variables:
1374 c-basic-offset: 4
1375 indent-tabs-mode: nil
1376 End:
1377 */
1378