1 /* ------------------------------------------------------------------------
2
3 unicodedata -- Provides access to the Unicode database.
4
5 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
7
8 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10 Modified by Martin v. Löwis (martin@v.loewis.de)
11
12 Copyright (c) Corporation for National Research Initiatives.
13
14 ------------------------------------------------------------------------ */
15
16 #define PY_SSIZE_T_CLEAN
17
18 #include "Python.h"
19 #include "ucnhash.h"
20 #include "structmember.h" // PyMemberDef
21
22 #include <stdbool.h>
23
24 _Py_IDENTIFIER(NFC);
25 _Py_IDENTIFIER(NFD);
26 _Py_IDENTIFIER(NFKC);
27 _Py_IDENTIFIER(NFKD);
28
29 /*[clinic input]
30 module unicodedata
31 class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
32 [clinic start generated code]*/
33 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
34
35 /* character properties */
36
37 typedef struct {
38 const unsigned char category; /* index into
39 _PyUnicode_CategoryNames */
40 const unsigned char combining; /* combining class value 0 - 255 */
41 const unsigned char bidirectional; /* index into
42 _PyUnicode_BidirectionalNames */
43 const unsigned char mirrored; /* true if mirrored in bidir mode */
44 const unsigned char east_asian_width; /* index into
45 _PyUnicode_EastAsianWidth */
46 const unsigned char normalization_quick_check; /* see is_normalized() */
47 } _PyUnicode_DatabaseRecord;
48
49 typedef struct change_record {
50 /* sequence of fields should be the same as in merge_old_version */
51 const unsigned char bidir_changed;
52 const unsigned char category_changed;
53 const unsigned char decimal_changed;
54 const unsigned char mirrored_changed;
55 const unsigned char east_asian_width_changed;
56 const double numeric_changed;
57 } change_record;
58
59 /* data file generated by Tools/unicode/makeunicodedata.py */
60 #include "unicodedata_db.h"
61
62 static const _PyUnicode_DatabaseRecord*
63 _getrecord_ex(Py_UCS4 code)
64 {
65 int index;
66 if (code >= 0x110000)
67 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74 }
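/* Illustrative sketch only (assuming SHIFT were 7; the real value is
 * generated into unicodedata_db.h): the two-level lookup above is a
 * compressed trie.  The high bits of the code point select a block number
 * from index1, and that block number plus the low SHIFT bits select the
 * record index from index2.  Looking up U+00E9 would then read:
 *
 *     block  = index1[0xE9 >> 7];                      // index1[1]
 *     index  = index2[(block << 7) + (0xE9 & 0x7F)];   // low 7 bits = 0x69
 *     record = &_PyUnicode_Database_Records[index];
 *
 * Out-of-range code points (>= 0x110000) fall back to record 0, the
 * "unassigned" record.
 */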
75
76 /* ------------- Previous-version API ------------------------------------- */
77 typedef struct previous_version {
78 PyObject_HEAD
79 const char *name;
80 const change_record* (*getrecord)(Py_UCS4);
81 Py_UCS4 (*normalization)(Py_UCS4);
82 } PreviousDBVersion;
83
84 #include "clinic/unicodedata.c.h"
85
86 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
87
88 static PyMemberDef DB_members[] = {
89 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
90 {NULL}
91 };
92
93 /* forward declaration */
94 static PyTypeObject UCD_Type;
95 #define UCD_Check(o) Py_IS_TYPE(o, &UCD_Type)
96
97 static PyObject*
98 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
99 Py_UCS4 (*normalization)(Py_UCS4))
100 {
101 PreviousDBVersion *self;
102 self = PyObject_New(PreviousDBVersion, &UCD_Type);
103 if (self == NULL)
104 return NULL;
105 self->name = name;
106 self->getrecord = getrecord;
107 self->normalization = normalization;
108 return (PyObject*)self;
109 }
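/* The only previous version currently registered is 3.2.0 (see
 * PyInit_unicodedata below), exposed to Python as unicodedata.ucd_3_2_0.
 * Its methods take the same arguments as the module-level functions, but
 * consult the change records so that, for instance,
 * unicodedata.ucd_3_2_0.category(c) may differ from unicodedata.category(c)
 * for characters added or changed after Unicode 3.2. */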
110
111
112 /* --- Module API --------------------------------------------------------- */
113
114 /*[clinic input]
115 unicodedata.UCD.decimal
116
117 self: self
118 chr: int(accept={str})
119 default: object=NULL
120 /
121
122 Converts a Unicode character into its equivalent decimal value.
123
124 Returns the decimal value assigned to the character chr as integer.
125 If no such value is defined, default is returned, or, if not given,
126 ValueError is raised.
127 [clinic start generated code]*/
128
129 static PyObject *
130 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
131 PyObject *default_value)
132 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
133 {
134 int have_old = 0;
135 long rc;
136 Py_UCS4 c = (Py_UCS4)chr;
137
138 if (self && UCD_Check(self)) {
139 const change_record *old = get_old_record(self, c);
140 if (old->category_changed == 0) {
141 /* unassigned */
142 have_old = 1;
143 rc = -1;
144 }
145 else if (old->decimal_changed != 0xFF) {
146 have_old = 1;
147 rc = old->decimal_changed;
148 }
149 }
150
151 if (!have_old)
152 rc = Py_UNICODE_TODECIMAL(c);
153 if (rc < 0) {
154 if (default_value == NULL) {
155 PyErr_SetString(PyExc_ValueError,
156 "not a decimal");
157 return NULL;
158 }
159 else {
160 Py_INCREF(default_value);
161 return default_value;
162 }
163 }
164 return PyLong_FromLong(rc);
165 }
166
167 /*[clinic input]
168 unicodedata.UCD.digit
169
170 self: self
171 chr: int(accept={str})
172 default: object=NULL
173 /
174
175 Converts a Unicode character into its equivalent digit value.
176
177 Returns the digit value assigned to the character chr as integer.
178 If no such value is defined, default is returned, or, if not given,
179 ValueError is raised.
180 [clinic start generated code]*/
181
182 static PyObject *
183 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
184 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
185 {
186 long rc;
187 Py_UCS4 c = (Py_UCS4)chr;
188 rc = Py_UNICODE_TODIGIT(c);
189 if (rc < 0) {
190 if (default_value == NULL) {
191 PyErr_SetString(PyExc_ValueError, "not a digit");
192 return NULL;
193 }
194 else {
195 Py_INCREF(default_value);
196 return default_value;
197 }
198 }
199 return PyLong_FromLong(rc);
200 }
201
202 /*[clinic input]
203 unicodedata.UCD.numeric
204
205 self: self
206 chr: int(accept={str})
207 default: object=NULL
208 /
209
210 Converts a Unicode character into its equivalent numeric value.
211
212 Returns the numeric value assigned to the character chr as float.
213 If no such value is defined, default is returned, or, if not given,
214 ValueError is raised.
215 [clinic start generated code]*/
216
217 static PyObject *
218 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
219 PyObject *default_value)
220 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
221 {
222 int have_old = 0;
223 double rc;
224 Py_UCS4 c = (Py_UCS4)chr;
225
226 if (self && UCD_Check(self)) {
227 const change_record *old = get_old_record(self, c);
228 if (old->category_changed == 0) {
229 /* unassigned */
230 have_old = 1;
231 rc = -1.0;
232 }
233 else if (old->decimal_changed != 0xFF) {
234 have_old = 1;
235 rc = old->decimal_changed;
236 }
237 }
238
239 if (!have_old)
240 rc = Py_UNICODE_TONUMERIC(c);
241 if (rc == -1.0) {
242 if (default_value == NULL) {
243 PyErr_SetString(PyExc_ValueError, "not a numeric character");
244 return NULL;
245 }
246 else {
247 Py_INCREF(default_value);
248 return default_value;
249 }
250 }
251 return PyFloat_FromDouble(rc);
252 }
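/* The three converters above form a hierarchy of numeric properties from
 * UnicodeData.txt: every character with a decimal value also has a digit
 * and a numeric value, but not the other way around.  For example (values
 * from the UCD): '9' (U+0039) has decimal 9, digit 9 and numeric 9.0,
 * while '¼' (U+00BC VULGAR FRACTION ONE QUARTER) has only a numeric value,
 * 0.25, so decimal() and digit() fall back to the default or raise
 * ValueError for it. */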
253
254 /*[clinic input]
255 unicodedata.UCD.category
256
257 self: self
258 chr: int(accept={str})
259 /
260
261 Returns the general category assigned to the character chr as string.
262 [clinic start generated code]*/
263
264 static PyObject *
265 unicodedata_UCD_category_impl(PyObject *self, int chr)
266 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
267 {
268 int index;
269 Py_UCS4 c = (Py_UCS4)chr;
270 index = (int) _getrecord_ex(c)->category;
271 if (self && UCD_Check(self)) {
272 const change_record *old = get_old_record(self, c);
273 if (old->category_changed != 0xFF)
274 index = old->category_changed;
275 }
276 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
277 }
278
279 /*[clinic input]
280 unicodedata.UCD.bidirectional
281
282 self: self
283 chr: int(accept={str})
284 /
285
286 Returns the bidirectional class assigned to the character chr as string.
287
288 If no such value is defined, an empty string is returned.
289 [clinic start generated code]*/
290
291 static PyObject *
292 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
293 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
294 {
295 int index;
296 Py_UCS4 c = (Py_UCS4)chr;
297 index = (int) _getrecord_ex(c)->bidirectional;
298 if (self && UCD_Check(self)) {
299 const change_record *old = get_old_record(self, c);
300 if (old->category_changed == 0)
301 index = 0; /* unassigned */
302 else if (old->bidir_changed != 0xFF)
303 index = old->bidir_changed;
304 }
305 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
306 }
307
308 /*[clinic input]
309 unicodedata.UCD.combining -> int
310
311 self: self
312 chr: int(accept={str})
313 /
314
315 Returns the canonical combining class assigned to the character chr as integer.
316
317 Returns 0 if no combining class is defined.
318 [clinic start generated code]*/
319
320 static int
321 unicodedata_UCD_combining_impl(PyObject *self, int chr)
322 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
323 {
324 int index;
325 Py_UCS4 c = (Py_UCS4)chr;
326 index = (int) _getrecord_ex(c)->combining;
327 if (self && UCD_Check(self)) {
328 const change_record *old = get_old_record(self, c);
329 if (old->category_changed == 0)
330 index = 0; /* unassigned */
331 }
332 return index;
333 }
334
335 /*[clinic input]
336 unicodedata.UCD.mirrored -> int
337
338 self: self
339 chr: int(accept={str})
340 /
341
342 Returns the mirrored property assigned to the character chr as integer.
343
344 Returns 1 if the character has been identified as a "mirrored"
345 character in bidirectional text, 0 otherwise.
346 [clinic start generated code]*/
347
348 static int
349 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
350 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
351 {
352 int index;
353 Py_UCS4 c = (Py_UCS4)chr;
354 index = (int) _getrecord_ex(c)->mirrored;
355 if (self && UCD_Check(self)) {
356 const change_record *old = get_old_record(self, c);
357 if (old->category_changed == 0)
358 index = 0; /* unassigned */
359 else if (old->mirrored_changed != 0xFF)
360 index = old->mirrored_changed;
361 }
362 return index;
363 }
364
365 /*[clinic input]
366 unicodedata.UCD.east_asian_width
367
368 self: self
369 chr: int(accept={str})
370 /
371
372 Returns the east asian width assigned to the character chr as string.
373 [clinic start generated code]*/
374
375 static PyObject *
376 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
377 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
378 {
379 int index;
380 Py_UCS4 c = (Py_UCS4)chr;
381 index = (int) _getrecord_ex(c)->east_asian_width;
382 if (self && UCD_Check(self)) {
383 const change_record *old = get_old_record(self, c);
384 if (old->category_changed == 0)
385 index = 0; /* unassigned */
386 else if (old->east_asian_width_changed != 0xFF)
387 index = old->east_asian_width_changed;
388 }
389 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
390 }
391
392 /*[clinic input]
393 unicodedata.UCD.decomposition
394
395 self: self
396 chr: int(accept={str})
397 /
398
399 Returns the character decomposition mapping assigned to the character chr as string.
400
401 An empty string is returned in case no such mapping is defined.
402 [clinic start generated code]*/
403
404 static PyObject *
405 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
406 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
407 {
408 char decomp[256];
409 int code, index, count;
410 size_t i;
411 unsigned int prefix_index;
412 Py_UCS4 c = (Py_UCS4)chr;
413
414 code = (int)c;
415
416 if (self && UCD_Check(self)) {
417 const change_record *old = get_old_record(self, c);
418 if (old->category_changed == 0)
419 return PyUnicode_FromString(""); /* unassigned */
420 }
421
422 if (code < 0 || code >= 0x110000)
423 index = 0;
424 else {
425 index = decomp_index1[(code>>DECOMP_SHIFT)];
426 index = decomp_index2[(index<<DECOMP_SHIFT)+
427 (code&((1<<DECOMP_SHIFT)-1))];
428 }
429
430 /* high byte is number of hex bytes (usually one or two), low byte
431    is prefix code (an index into decomp_prefix) */
432 count = decomp_data[index] >> 8;
433
434 /* XXX: could allocate the PyString up front instead
435 (strlen(prefix) + 5 * count + 1 bytes) */
436
437 /* Based on how index is calculated above and decomp_data is generated
438 from Tools/unicode/makeunicodedata.py, it should not be possible
439 to overflow decomp_prefix. */
440 prefix_index = decomp_data[index] & 255;
441 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
442
443 /* copy prefix */
444 i = strlen(decomp_prefix[prefix_index]);
445 memcpy(decomp, decomp_prefix[prefix_index], i);
446
447 while (count-- > 0) {
448 if (i)
449 decomp[i++] = ' ';
450 assert(i < sizeof(decomp));
451 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
452 decomp_data[++index]);
453 i += strlen(decomp + i);
454 }
455 return PyUnicode_FromStringAndSize(decomp, i);
456 }
457
458 static void
459 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
460 {
461 if (code >= 0x110000) {
462 *index = 0;
463 } else if (self && UCD_Check(self) &&
464 get_old_record(self, code)->category_changed==0) {
465 /* unassigned in old version */
466 *index = 0;
467 }
468 else {
469 *index = decomp_index1[(code>>DECOMP_SHIFT)];
470 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
471 (code&((1<<DECOMP_SHIFT)-1))];
472 }
473
474 /* high byte is number of hex bytes (usually one or two), low byte
475    is prefix code (an index into decomp_prefix) */
476 *count = decomp_data[*index] >> 8;
477 *prefix = decomp_data[*index] & 255;
478
479 (*index)++;
480 }
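/* Layout sketch of decomp_data (generated by makeunicodedata.py), as it is
 * consumed by get_decomp_record() and unicodedata_UCD_decomposition_impl()
 * above:
 *
 *     decomp_data[index]          header: (count << 8) | prefix_index
 *     decomp_data[index+1]        first code point of the decomposition
 *     ...
 *     decomp_data[index+count]    last code point
 *
 * For example, U+00E9 (LATIN SMALL LETTER E WITH ACUTE) has the canonical
 * decomposition <U+0065, U+0301>, so its header encodes count == 2 with an
 * empty prefix, followed by the entries 0x0065 and 0x0301.
 */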
481
482 #define SBase 0xAC00
483 #define LBase 0x1100
484 #define VBase 0x1161
485 #define TBase 0x11A7
486 #define LCount 19
487 #define VCount 21
488 #define TCount 28
489 #define NCount (VCount*TCount)
490 #define SCount (LCount*NCount)
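/* Worked example of the arithmetic Hangul decomposition used below (per
 * the Unicode standard's Hangul syllable algorithm): for U+AC01 (HANGUL
 * SYLLABLE GAG),
 *
 *     SIndex = 0xAC01 - SBase                      = 1
 *     L      = LBase + SIndex / NCount             = 0x1100  (1 / 588 == 0)
 *     V      = VBase + (SIndex % NCount) / TCount  = 0x1161  (1 / 28  == 0)
 *     T      = TBase + SIndex % TCount             = 0x11A8
 *
 * T != TBase, so the decomposition is <U+1100, U+1161, U+11A8>.
 */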
491
492 static PyObject*
493 nfd_nfkd(PyObject *self, PyObject *input, int k)
494 {
495 PyObject *result;
496 Py_UCS4 *output;
497 Py_ssize_t i, o, osize;
498 int kind;
499 const void *data;
500 /* Longest decomposition in Unicode 3.2: U+FDFA */
501 Py_UCS4 stack[20];
502 Py_ssize_t space, isize;
503 int index, prefix, count, stackptr;
504 unsigned char prev, cur;
505
506 stackptr = 0;
507 isize = PyUnicode_GET_LENGTH(input);
508 space = isize;
509 /* Overallocate at most 10 characters. */
510 if (space > 10) {
511 if (space <= PY_SSIZE_T_MAX - 10)
512 space += 10;
513 }
514 else {
515 space *= 2;
516 }
517 osize = space;
518 output = PyMem_NEW(Py_UCS4, space);
519 if (!output) {
520 PyErr_NoMemory();
521 return NULL;
522 }
523 i = o = 0;
524 kind = PyUnicode_KIND(input);
525 data = PyUnicode_DATA(input);
526
527 while (i < isize) {
528 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
529 while(stackptr) {
530 Py_UCS4 code = stack[--stackptr];
531 /* Hangul Decomposition adds three characters in
532 a single step, so we need at least that much room. */
533 if (space < 3) {
534 Py_UCS4 *new_output;
535 osize += 10;
536 space += 10;
537 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
538 if (new_output == NULL) {
539 PyMem_Free(output);
540 PyErr_NoMemory();
541 return NULL;
542 }
543 output = new_output;
544 }
545 /* Hangul Decomposition. */
546 if (SBase <= code && code < (SBase+SCount)) {
547 int SIndex = code - SBase;
548 int L = LBase + SIndex / NCount;
549 int V = VBase + (SIndex % NCount) / TCount;
550 int T = TBase + SIndex % TCount;
551 output[o++] = L;
552 output[o++] = V;
553 space -= 2;
554 if (T != TBase) {
555 output[o++] = T;
556 space --;
557 }
558 continue;
559 }
560 /* normalization changes */
561 if (self && UCD_Check(self)) {
562 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
563 if (value != 0) {
564 stack[stackptr++] = value;
565 continue;
566 }
567 }
568
569 /* Other decompositions. */
570 get_decomp_record(self, code, &index, &prefix, &count);
571
572 /* Copy character if it is not decomposable, or if it has only a
573    compatibility decomposition but we are doing NFD. */
574 if (!count || (prefix && !k)) {
575 output[o++] = code;
576 space--;
577 continue;
578 }
579 /* Copy decomposition onto the stack, in reverse
580 order. */
581 while(count) {
582 code = decomp_data[index + (--count)];
583 stack[stackptr++] = code;
584 }
585 }
586 }
587
588 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
589 output, o);
590 PyMem_Free(output);
591 if (!result)
592 return NULL;
593 /* result is guaranteed to be ready, as it is compact. */
594 kind = PyUnicode_KIND(result);
595 data = PyUnicode_DATA(result);
596
597 /* Sort canonically. */
598 i = 0;
599 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
600 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
601 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
602 if (prev == 0 || cur == 0 || prev <= cur) {
603 prev = cur;
604 continue;
605 }
606 /* Non-canonical order. Need to switch *i with previous. */
607 o = i - 1;
608 while (1) {
609 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
610 PyUnicode_WRITE(kind, data, o+1,
611 PyUnicode_READ(kind, data, o));
612 PyUnicode_WRITE(kind, data, o, tmp);
613 o--;
614 if (o < 0)
615 break;
616 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
617 if (prev == 0 || prev <= cur)
618 break;
619 }
620 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
621 }
622 return result;
623 }
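/* The final loop above performs the Canonical Ordering step of UAX #15:
 * combining marks are bubbled into non-decreasing combining-class order,
 * but never across a starter (class 0).  For example, the sequence
 * <U+0065, U+0301 (ccc 230), U+0327 (ccc 202)> is rearranged into
 * <U+0065, U+0327, U+0301>; the base letter itself stays in place. */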
624
625 static int
626 find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
627 {
628 unsigned int index;
629 for (index = 0; nfc[index].start; index++) {
630 unsigned int start = nfc[index].start;
631 if (code < start)
632 return -1;
633 if (code <= start + nfc[index].count) {
634 unsigned int delta = code - start;
635 return nfc[index].index + delta;
636 }
637 }
638 return -1;
639 }
640
641 static PyObject*
642 nfc_nfkc(PyObject *self, PyObject *input, int k)
643 {
644 PyObject *result;
645 int kind;
646 const void *data;
647 Py_UCS4 *output;
648 Py_ssize_t i, i1, o, len;
649 int f,l,index,index1,comb;
650 Py_UCS4 code;
651 Py_ssize_t skipped[20];
652 int cskipped = 0;
653
654 result = nfd_nfkd(self, input, k);
655 if (!result)
656 return NULL;
657 /* result will be "ready". */
658 kind = PyUnicode_KIND(result);
659 data = PyUnicode_DATA(result);
660 len = PyUnicode_GET_LENGTH(result);
661
662 /* We allocate a buffer for the output.
663 If we find that we made no changes, we still return
664 the NFD result. */
665 output = PyMem_NEW(Py_UCS4, len);
666 if (!output) {
667 PyErr_NoMemory();
668 Py_DECREF(result);
669 return 0;
670 }
671 i = o = 0;
672
673 again:
674 while (i < len) {
675 for (index = 0; index < cskipped; index++) {
676 if (skipped[index] == i) {
677 /* *i character is skipped.
678 Remove from list. */
679 skipped[index] = skipped[cskipped-1];
680 cskipped--;
681 i++;
682 goto again; /* continue while */
683 }
684 }
685 /* Hangul Composition. We don't need to check for <LV,T>
686 pairs, since we always have decomposed data. */
687 code = PyUnicode_READ(kind, data, i);
688 if (LBase <= code && code < (LBase+LCount) &&
689 i + 1 < len &&
690 VBase <= PyUnicode_READ(kind, data, i+1) &&
691 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
692 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
693 and V character is a modern vowel (0x1161 ~ 0x1175). */
694 int LIndex, VIndex;
695 LIndex = code - LBase;
696 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
697 code = SBase + (LIndex*VCount+VIndex)*TCount;
698 i+=2;
699 if (i < len &&
700 TBase < PyUnicode_READ(kind, data, i) &&
701 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
702 /* check T character is a modern trailing consonant
703 (0x11A8 ~ 0x11C2). */
704 code += PyUnicode_READ(kind, data, i)-TBase;
705 i++;
706 }
707 output[o++] = code;
708 continue;
709 }
710
711 /* code is still input[i] here */
712 f = find_nfc_index(nfc_first, code);
713 if (f == -1) {
714 output[o++] = code;
715 i++;
716 continue;
717 }
718 /* Find next unblocked character. */
719 i1 = i+1;
720 comb = 0;
721 /* output base character for now; might be updated later. */
722 output[o] = PyUnicode_READ(kind, data, i);
723 while (i1 < len) {
724 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
725 int comb1 = _getrecord_ex(code1)->combining;
726 if (comb) {
727 if (comb1 == 0)
728 break;
729 if (comb >= comb1) {
730 /* Character is blocked. */
731 i1++;
732 continue;
733 }
734 }
735 l = find_nfc_index(nfc_last, code1);
736 /* i1 cannot be combined with i. If i1
737 is a starter, we don't need to look further.
738 Otherwise, record the combining class. */
739 if (l == -1) {
740 not_combinable:
741 if (comb1 == 0)
742 break;
743 comb = comb1;
744 i1++;
745 continue;
746 }
747 index = f*TOTAL_LAST + l;
748 index1 = comp_index[index >> COMP_SHIFT];
749 code = comp_data[(index1<<COMP_SHIFT)+
750 (index&((1<<COMP_SHIFT)-1))];
751 if (code == 0)
752 goto not_combinable;
753
754 /* Replace the original character. */
755 output[o] = code;
756 /* Mark the second character unused. */
757 assert(cskipped < 20);
758 skipped[cskipped++] = i1;
759 i1++;
760 f = find_nfc_index(nfc_first, output[o]);
761 if (f == -1)
762 break;
763 }
764 /* Output character was already written.
765 Just advance the indices. */
766 o++; i++;
767 }
768 if (o == len) {
769 /* No changes. Return original string. */
770 PyMem_Free(output);
771 return result;
772 }
773 Py_DECREF(result);
774 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
775 output, o);
776 PyMem_Free(output);
777 return result;
778 }
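/* Worked example of the Hangul branch above, the inverse of the
 * decomposition in nfd_nfkd(): for the sequence <U+1100, U+1161, U+11A8>,
 *
 *     LIndex = 0x1100 - LBase = 0,  VIndex = 0x1161 - VBase = 0
 *     code   = SBase + (LIndex*VCount + VIndex)*TCount  = 0xAC00
 *     code  += 0x11A8 - TBase                           = 0xAC01
 *
 * i.e. the three jamo recompose into U+AC01 (HANGUL SYLLABLE GAG). */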
779
780 // This needs to match the logic in makeunicodedata.py
781 // which constructs the quickcheck data.
782 typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
783
784 /* Run the Unicode normalization "quickcheck" algorithm.
785 *
786 * Return YES or NO if quickcheck determines the input is certainly
787 * normalized or certainly not, and MAYBE if quickcheck is unable to
788 * tell.
789 *
790 * If `yes_only` is true, then return MAYBE as soon as we determine
791 * the answer is not YES.
792 *
793 * For background and details on the algorithm, see UAX #15:
794 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
795 */
796 static QuickcheckResult
797 is_normalized_quickcheck(PyObject *self, PyObject *input,
798 bool nfc, bool k, bool yes_only)
799 {
800 /* If an older version of the database is requested, quickchecks must be
801    disabled, since the quickcheck bits describe the current version. */
802 if (self && UCD_Check(self))
803 return NO;
804
805 Py_ssize_t i, len;
806 int kind;
807 const void *data;
808 unsigned char prev_combining = 0;
809
810 /* The two quickcheck bits at this shift have type QuickcheckResult. */
811 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
812
813 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
814
815 i = 0;
816 kind = PyUnicode_KIND(input);
817 data = PyUnicode_DATA(input);
818 len = PyUnicode_GET_LENGTH(input);
819 while (i < len) {
820 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
821 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
822
823 unsigned char combining = record->combining;
824 if (combining && prev_combining > combining)
825 return NO; /* non-canonical sort order, not normalized */
826 prev_combining = combining;
827
828 unsigned char quickcheck_whole = record->normalization_quick_check;
829 if (yes_only) {
830 if (quickcheck_whole & (3 << quickcheck_shift))
831 return MAYBE;
832 } else {
833 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
834 case NO:
835 return NO;
836 case MAYBE:
837 result = MAYBE; /* this string might need normalization */
838 }
839 }
840 }
841 return result;
842 }
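/* Layout of normalization_quick_check as consumed above (the packing
 * itself is produced by makeunicodedata.py and must stay in sync):
 *
 *     bits 0-1  NFD quick check        bits 4-5  NFC quick check
 *     bits 2-3  NFKD quick check       bits 6-7  NFKC quick check
 *
 * Each two-bit field holds a QuickcheckResult, so for instance
 * ((record->normalization_quick_check >> 4) & 3) == NO means the character
 * can never appear in an NFC-normalized string. */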
843
844 /*[clinic input]
845 unicodedata.UCD.is_normalized
846
847 self: self
848 form: unicode
849 unistr as input: unicode
850 /
851
852 Return whether the Unicode string unistr is in the normal form 'form'.
853
854 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
855 [clinic start generated code]*/
856
857 static PyObject *
858 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
859 PyObject *input)
860 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
861 {
862 if (PyUnicode_READY(input) == -1) {
863 return NULL;
864 }
865
866 if (PyUnicode_GET_LENGTH(input) == 0) {
867 /* special case empty input strings. */
868 Py_RETURN_TRUE;
869 }
870
871 PyObject *result;
872 bool nfc = false;
873 bool k = false;
874 QuickcheckResult m;
875
876 PyObject *cmp;
877 int match = 0;
878
879 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
880 nfc = true;
881 }
882 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
883 nfc = true;
884 k = true;
885 }
886 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
887 /* matches default values for `nfc` and `k` */
888 }
889 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
890 k = true;
891 }
892 else {
893 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
894 return NULL;
895 }
896
897 m = is_normalized_quickcheck(self, input, nfc, k, false);
898
899 if (m == MAYBE) {
900 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
901 if (cmp == NULL) {
902 return NULL;
903 }
904 match = PyUnicode_Compare(input, cmp);
905 Py_DECREF(cmp);
906 result = (match == 0) ? Py_True : Py_False;
907 }
908 else {
909 result = (m == YES) ? Py_True : Py_False;
910 }
911
912 Py_INCREF(result);
913 return result;
914 }
915
916
917 /*[clinic input]
918 unicodedata.UCD.normalize
919
920 self: self
921 form: unicode
922 unistr as input: unicode
923 /
924
925 Return the normal form 'form' for the Unicode string unistr.
926
927 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
928 [clinic start generated code]*/
929
930 static PyObject *
931 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
932 PyObject *input)
933 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
934 {
935 if (PyUnicode_GET_LENGTH(input) == 0) {
936 /* Special case empty input strings, since resizing
937 them later would cause internal errors. */
938 Py_INCREF(input);
939 return input;
940 }
941
942 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
943 if (is_normalized_quickcheck(self, input, true, false, true) == YES) {
944 Py_INCREF(input);
945 return input;
946 }
947 return nfc_nfkc(self, input, 0);
948 }
949 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
950 if (is_normalized_quickcheck(self, input, true, true, true) == YES) {
951 Py_INCREF(input);
952 return input;
953 }
954 return nfc_nfkc(self, input, 1);
955 }
956 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
957 if (is_normalized_quickcheck(self, input, false, false, true) == YES) {
958 Py_INCREF(input);
959 return input;
960 }
961 return nfd_nfkd(self, input, 0);
962 }
963 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
964 if (is_normalized_quickcheck(self, input, false, true, true) == YES) {
965 Py_INCREF(input);
966 return input;
967 }
968 return nfd_nfkd(self, input, 1);
969 }
970 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
971 return NULL;
972 }
973
974 /* -------------------------------------------------------------------- */
975 /* unicode character name tables */
976
977 /* data file generated by Tools/unicode/makeunicodedata.py */
978 #include "unicodename_db.h"
979
980 /* -------------------------------------------------------------------- */
981 /* database code (cut and pasted from the unidb package) */
982
983 static unsigned long
984 _gethash(const char *s, int len, int scale)
985 {
986 int i;
987 unsigned long h = 0;
988 unsigned long ix;
989 for (i = 0; i < len; i++) {
990 h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
991 ix = h & 0xff000000;
992 if (ix)
993 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
994 }
995 return h;
996 }
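/* This hash must stay identical to the one used by
 * Tools/unicode/makeunicodedata.py when it builds code_hash, since the
 * lookups below probe that table with the same values.  Py_TOUPPER makes
 * name lookups case-insensitive, and folding bits 24-31 back into the low
 * bits keeps the running value within 24 bits without discarding them. */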
997
998 static const char * const hangul_syllables[][3] = {
999 { "G", "A", "" },
1000 { "GG", "AE", "G" },
1001 { "N", "YA", "GG" },
1002 { "D", "YAE", "GS" },
1003 { "DD", "EO", "N", },
1004 { "R", "E", "NJ" },
1005 { "M", "YEO", "NH" },
1006 { "B", "YE", "D" },
1007 { "BB", "O", "L" },
1008 { "S", "WA", "LG" },
1009 { "SS", "WAE", "LM" },
1010 { "", "OE", "LB" },
1011 { "J", "YO", "LS" },
1012 { "JJ", "U", "LT" },
1013 { "C", "WEO", "LP" },
1014 { "K", "WE", "LH" },
1015 { "T", "WI", "M" },
1016 { "P", "YU", "B" },
1017 { "H", "EU", "BS" },
1018 { 0, "YI", "S" },
1019 { 0, "I", "SS" },
1020 { 0, 0, "NG" },
1021 { 0, 0, "J" },
1022 { 0, 0, "C" },
1023 { 0, 0, "K" },
1024 { 0, 0, "T" },
1025 { 0, 0, "P" },
1026 { 0, 0, "H" }
1027 };
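/* Column 0 holds the romanized names of the LCount leading consonants,
 * column 1 the VCount vowels, and column 2 the TCount trailing consonants
 * (first entry empty, meaning "no trailing consonant").  A syllable name is
 * "HANGUL SYLLABLE " followed by the three parts: U+AC01 (L index 0,
 * V index 0, T index 1) yields "G" + "A" + "G", i.e. HANGUL SYLLABLE GAG.
 * _getucname() builds names this way and find_syllable()/_getcode() parse
 * them back. */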
1028
1029 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1030 static int
1031 is_unified_ideograph(Py_UCS4 code)
1032 {
1033 return
1034 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1035 (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */
1036 (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
1037 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
1038 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1039 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1040 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1041 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
1042 }
1043
1044 /* macros used to determine if the given code point is in the PUA range that
1045 * we are using to store aliases and named sequences */
1046 #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
1047 #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
1048 (cp < named_sequences_end))
1049
1050 static int
1051 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
1052 int with_alias_and_seq)
1053 {
1054 /* Find the name associated with the given code point.
1055 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1056 * that we are using for aliases and named sequences. */
1057 int offset;
1058 int i;
1059 int word;
1060 const unsigned char* w;
1061
1062 if (code >= 0x110000)
1063 return 0;
1064
1065 /* XXX should we just skip all the code points in the PUAs here? */
1066 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1067 return 0;
1068
1069 if (self && UCD_Check(self)) {
1070 /* in 3.2.0 there are no aliases and named sequences */
1071 const change_record *old;
1072 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1073 return 0;
1074 old = get_old_record(self, code);
1075 if (old->category_changed == 0) {
1076 /* unassigned */
1077 return 0;
1078 }
1079 }
1080
1081 if (SBase <= code && code < SBase+SCount) {
1082 /* Hangul syllable. */
1083 int SIndex = code - SBase;
1084 int L = SIndex / NCount;
1085 int V = (SIndex % NCount) / TCount;
1086 int T = SIndex % TCount;
1087
1088 if (buflen < 27)
1089 /* Worst case: HANGUL SYLLABLE <10chars>. */
1090 return 0;
1091 strcpy(buffer, "HANGUL SYLLABLE ");
1092 buffer += 16;
1093 strcpy(buffer, hangul_syllables[L][0]);
1094 buffer += strlen(hangul_syllables[L][0]);
1095 strcpy(buffer, hangul_syllables[V][1]);
1096 buffer += strlen(hangul_syllables[V][1]);
1097 strcpy(buffer, hangul_syllables[T][2]);
1098 buffer += strlen(hangul_syllables[T][2]);
1099 *buffer = '\0';
1100 return 1;
1101 }
1102
1103 if (is_unified_ideograph(code)) {
1104 if (buflen < 28)
1105 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1106 return 0;
1107 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1108 return 1;
1109 }
1110
1111 /* get offset into phrasebook */
1112 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1113 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1114 (code&((1<<phrasebook_shift)-1))];
1115 if (!offset)
1116 return 0;
1117
1118 i = 0;
1119
1120 for (;;) {
1121 /* get word index */
1122 word = phrasebook[offset] - phrasebook_short;
1123 if (word >= 0) {
1124 word = (word << 8) + phrasebook[offset+1];
1125 offset += 2;
1126 } else
1127 word = phrasebook[offset++];
1128 if (i) {
1129 if (i > buflen)
1130 return 0; /* buffer overflow */
1131 buffer[i++] = ' ';
1132 }
1133 /* copy word string from lexicon. the last character in the
1134 word has bit 7 set. the last word in a string ends with
1135 0x80 */
1136 w = lexicon + lexicon_offset[word];
1137 while (*w < 128) {
1138 if (i >= buflen)
1139 return 0; /* buffer overflow */
1140 buffer[i++] = *w++;
1141 }
1142 if (i >= buflen)
1143 return 0; /* buffer overflow */
1144 buffer[i++] = *w & 127;
1145 if (*w == 128)
1146 break; /* end of word */
1147 }
1148
1149 return 1;
1150 }
1151
1152 static int
1153 _cmpname(PyObject *self, int code, const char* name, int namelen)
1154 {
1155 /* check if code corresponds to the given name */
1156 int i;
1157 char buffer[NAME_MAXLEN+1];
1158 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1159 return 0;
1160 for (i = 0; i < namelen; i++) {
1161 if (Py_TOUPPER(name[i]) != buffer[i])
1162 return 0;
1163 }
1164 return buffer[namelen] == '\0';
1165 }
1166
1167 static void
1168 find_syllable(const char *str, int *len, int *pos, int count, int column)
1169 {
1170 int i, len1;
1171 *len = -1;
1172 for (i = 0; i < count; i++) {
1173 const char *s = hangul_syllables[i][column];
1174 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1175 if (len1 <= *len)
1176 continue;
1177 if (strncmp(str, s, len1) == 0) {
1178 *len = len1;
1179 *pos = i;
1180 }
1181 }
1182 if (*len == -1) {
1183 *len = 0;
1184 }
1185 }
1186
1187 static int
1188 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1189 {
1190 /* check if named sequences are allowed */
1191 if (!with_named_seq && IS_NAMED_SEQ(cp))
1192 return 0;
1193 /* if the code point is in the PUA range that we use for aliases,
1194 * convert it to obtain the right code point */
1195 if (IS_ALIAS(cp))
1196 *code = name_aliases[cp-aliases_start];
1197 else
1198 *code = cp;
1199 return 1;
1200 }
1201
1202 static int
1203 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1204 int with_named_seq)
1205 {
1206 /* Return the code point associated with the given name.
1207 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1208 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1209 * using for the named sequence, and the caller must then convert it. */
1210 unsigned int h, v;
1211 unsigned int mask = code_size-1;
1212 unsigned int i, incr;
1213
1214 /* Check for hangul syllables. */
1215 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1216 int len, L = -1, V = -1, T = -1;
1217 const char *pos = name + 16;
1218 find_syllable(pos, &len, &L, LCount, 0);
1219 pos += len;
1220 find_syllable(pos, &len, &V, VCount, 1);
1221 pos += len;
1222 find_syllable(pos, &len, &T, TCount, 2);
1223 pos += len;
1224 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1225 *code = SBase + (L*VCount+V)*TCount + T;
1226 return 1;
1227 }
1228 /* Otherwise, it's an illegal syllable name. */
1229 return 0;
1230 }
1231
1232 /* Check for unified ideographs. */
1233 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1234 /* Four or five hexdigits must follow. */
1235 v = 0;
1236 name += 22;
1237 namelen -= 22;
1238 if (namelen != 4 && namelen != 5)
1239 return 0;
1240 while (namelen--) {
1241 v *= 16;
1242 if (*name >= '0' && *name <= '9')
1243 v += *name - '0';
1244 else if (*name >= 'A' && *name <= 'F')
1245 v += *name - 'A' + 10;
1246 else
1247 return 0;
1248 name++;
1249 }
1250 if (!is_unified_ideograph(v))
1251 return 0;
1252 *code = v;
1253 return 1;
1254 }
1255
1256 /* the following is the same as python's dictionary lookup, with
1257 only minor changes. see the makeunicodedata script for more
1258 details */
1259
1260 h = (unsigned int) _gethash(name, namelen, code_magic);
1261 i = (~h) & mask;
1262 v = code_hash[i];
1263 if (!v)
1264 return 0;
1265 if (_cmpname(self, v, name, namelen))
1266 return _check_alias_and_seq(v, code, with_named_seq);
1267 incr = (h ^ (h >> 3)) & mask;
1268 if (!incr)
1269 incr = mask;
1270 for (;;) {
1271 i = (i + incr) & mask;
1272 v = code_hash[i];
1273 if (!v)
1274 return 0;
1275 if (_cmpname(self, v, name, namelen))
1276 return _check_alias_and_seq(v, code, with_named_seq);
1277 incr = incr << 1;
1278 if (incr > mask)
1279 incr = incr ^ code_poly;
1280 }
1281 }
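/* The probe sequence above mirrors the table generator in
 * Tools/unicode/makeunicodedata.py: start at (~h) & mask, step by
 * (h ^ (h >> 3)) & mask (or mask when that is zero), and whenever the
 * doubled step overflows the table it is reduced by the generator
 * polynomial code_poly, LFSR-style, which is designed so the probe does
 * not cycle prematurely.  An empty slot (v == 0) therefore proves the
 * name is not in the table. */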
1282
1283 static const _PyUnicode_Name_CAPI hashAPI =
1284 {
1285 sizeof(_PyUnicode_Name_CAPI),
1286 _getucname,
1287 _getcode
1288 };
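/* This table is exported below as the "unicodedata.ucnhash_CAPI" capsule;
 * the interpreter imports it to resolve \N{...} escapes in string literals,
 * which is why _getucname() and _getcode() can be reached without an
 * explicit "import unicodedata" at the Python level. */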
1289
1290 /* -------------------------------------------------------------------- */
1291 /* Python bindings */
1292
1293 /*[clinic input]
1294 unicodedata.UCD.name
1295
1296 self: self
1297 chr: int(accept={str})
1298 default: object=NULL
1299 /
1300
1301 Returns the name assigned to the character chr as a string.
1302
1303 If no name is defined, default is returned, or, if not given,
1304 ValueError is raised.
1305 [clinic start generated code]*/
1306
1307 static PyObject *
1308 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1309 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1310 {
1311 char name[NAME_MAXLEN+1];
1312 Py_UCS4 c = (Py_UCS4)chr;
1313
1314 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1315 if (default_value == NULL) {
1316 PyErr_SetString(PyExc_ValueError, "no such name");
1317 return NULL;
1318 }
1319 else {
1320 Py_INCREF(default_value);
1321 return default_value;
1322 }
1323 }
1324
1325 return PyUnicode_FromString(name);
1326 }
1327
1328 /*[clinic input]
1329 unicodedata.UCD.lookup
1330
1331 self: self
1332 name: str(accept={str, robuffer}, zeroes=True)
1333 /
1334
1335 Look up character by name.
1336
1337 If a character with the given name is found, return the
1338 corresponding character. If not found, KeyError is raised.
1339 [clinic start generated code]*/
1340
1341 static PyObject *
1342 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1343 Py_ssize_clean_t name_length)
1344 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
1345 {
1346 Py_UCS4 code;
1347 unsigned int index;
1348 if (name_length > NAME_MAXLEN) {
1349 PyErr_SetString(PyExc_KeyError, "name too long");
1350 return NULL;
1351 }
1352
1353 if (!_getcode(self, name, (int)name_length, &code, 1)) {
1354 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1355 return NULL;
1356 }
1357 /* check if code is in the PUA range that we use for named sequences
1358 and convert it */
1359 if (IS_NAMED_SEQ(code)) {
1360 index = code-named_sequences_start;
1361 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1362 named_sequences[index].seq,
1363 named_sequences[index].seqlen);
1364 }
1365 return PyUnicode_FromOrdinal(code);
1366 }
1367
1368 /* XXX Add doc strings. */
1369
1370 static PyMethodDef unicodedata_functions[] = {
1371 UNICODEDATA_UCD_DECIMAL_METHODDEF
1372 UNICODEDATA_UCD_DIGIT_METHODDEF
1373 UNICODEDATA_UCD_NUMERIC_METHODDEF
1374 UNICODEDATA_UCD_CATEGORY_METHODDEF
1375 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1376 UNICODEDATA_UCD_COMBINING_METHODDEF
1377 UNICODEDATA_UCD_MIRRORED_METHODDEF
1378 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1379 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1380 UNICODEDATA_UCD_NAME_METHODDEF
1381 UNICODEDATA_UCD_LOOKUP_METHODDEF
1382 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1383 UNICODEDATA_UCD_NORMALIZE_METHODDEF
1384 {NULL, NULL} /* sentinel */
1385 };
1386
1387 static PyTypeObject UCD_Type = {
1388 /* The ob_type field must be initialized in the module init function
1389 * to be portable to Windows without using C++. */
1390 PyVarObject_HEAD_INIT(NULL, 0)
1391 "unicodedata.UCD", /*tp_name*/
1392 sizeof(PreviousDBVersion), /*tp_basicsize*/
1393 0, /*tp_itemsize*/
1394 /* methods */
1395 (destructor)PyObject_Del, /*tp_dealloc*/
1396 0, /*tp_vectorcall_offset*/
1397 0, /*tp_getattr*/
1398 0, /*tp_setattr*/
1399 0, /*tp_as_async*/
1400 0, /*tp_repr*/
1401 0, /*tp_as_number*/
1402 0, /*tp_as_sequence*/
1403 0, /*tp_as_mapping*/
1404 0, /*tp_hash*/
1405 0, /*tp_call*/
1406 0, /*tp_str*/
1407 PyObject_GenericGetAttr,/*tp_getattro*/
1408 0, /*tp_setattro*/
1409 0, /*tp_as_buffer*/
1410 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1411 0, /*tp_doc*/
1412 0, /*tp_traverse*/
1413 0, /*tp_clear*/
1414 0, /*tp_richcompare*/
1415 0, /*tp_weaklistoffset*/
1416 0, /*tp_iter*/
1417 0, /*tp_iternext*/
1418 unicodedata_functions, /*tp_methods*/
1419 DB_members, /*tp_members*/
1420 0, /*tp_getset*/
1421 0, /*tp_base*/
1422 0, /*tp_dict*/
1423 0, /*tp_descr_get*/
1424 0, /*tp_descr_set*/
1425 0, /*tp_dictoffset*/
1426 0, /*tp_init*/
1427 0, /*tp_alloc*/
1428 0, /*tp_new*/
1429 0, /*tp_free*/
1430 0, /*tp_is_gc*/
1431 };
1432
1433 PyDoc_STRVAR(unicodedata_docstring,
1434 "This module provides access to the Unicode Character Database which\n\
1435 defines character properties for all Unicode characters. The data in\n\
1436 this database is based on the UnicodeData.txt file version\n\
1437 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1438 \n\
1439 The module uses the same names and symbols as defined by the\n\
1440 UnicodeData File Format " UNIDATA_VERSION ".");
1441
1442 static struct PyModuleDef unicodedatamodule = {
1443 PyModuleDef_HEAD_INIT,
1444 "unicodedata",
1445 unicodedata_docstring,
1446 -1,
1447 unicodedata_functions,
1448 NULL,
1449 NULL,
1450 NULL,
1451 NULL
1452 };
1453
1454 PyMODINIT_FUNC
1455 PyInit_unicodedata(void)
1456 {
1457 PyObject *m, *v;
1458
1459 Py_SET_TYPE(&UCD_Type, &PyType_Type);
1460
1461 m = PyModule_Create(&unicodedatamodule);
1462 if (!m)
1463 return NULL;
1464
1465 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1466 Py_INCREF(&UCD_Type);
1467 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1468
1469 /* Previous versions */
1470 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1471 if (v != NULL)
1472 PyModule_AddObject(m, "ucd_3_2_0", v);
1473
1474 /* Export C API */
1475 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1476 if (v != NULL)
1477 PyModule_AddObject(m, "ucnhash_CAPI", v);
1478 return m;
1479 }
1480
1481 /*
1482 Local variables:
1483 c-basic-offset: 4
1484 indent-tabs-mode: nil
1485 End:
1486 */
1487