1 /* ------------------------------------------------------------------------
2
3 unicodedata -- Provides access to the Unicode database.
4
5 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
7
8 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10 Modified by Martin v. Löwis (martin@v.loewis.de)
11
12 Copyright (c) Corporation for National Research Initiatives.
13
14 ------------------------------------------------------------------------ */
15
16 #define PY_SSIZE_T_CLEAN
17
18 #include "Python.h"
19 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
20 #include "structmember.h" // PyMemberDef
21
22 #include <stdbool.h>
23
24 _Py_IDENTIFIER(NFC);
25 _Py_IDENTIFIER(NFD);
26 _Py_IDENTIFIER(NFKC);
27 _Py_IDENTIFIER(NFKD);
28
29 /*[clinic input]
30 module unicodedata
31 class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
32 [clinic start generated code]*/
33 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
34
35 /* character properties */
36
37 typedef struct {
38 const unsigned char category; /* index into
39 _PyUnicode_CategoryNames */
40 const unsigned char combining; /* combining class value 0 - 255 */
41 const unsigned char bidirectional; /* index into
42 _PyUnicode_BidirectionalNames */
43 const unsigned char mirrored; /* true if mirrored in bidir mode */
44 const unsigned char east_asian_width; /* index into
45 _PyUnicode_EastAsianWidth */
46 const unsigned char normalization_quick_check; /* see is_normalized() */
47 } _PyUnicode_DatabaseRecord;
48
49 typedef struct change_record {
50 /* sequence of fields should be the same as in merge_old_version */
51 const unsigned char bidir_changed;
52 const unsigned char category_changed;
53 const unsigned char decimal_changed;
54 const unsigned char mirrored_changed;
55 const unsigned char east_asian_width_changed;
56 const double numeric_changed;
57 } change_record;
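/* Note on the delta records: they are produced by Tools/unicode/makeunicodedata.py
   for the 3.2.0 snapshot exposed as ucd_3_2_0.  As the consumers below show, a
   *_changed field of 0xFF means "same as the current database", any other value
   overrides the current property, and category_changed == 0 marks a code point
   that was unassigned in the old version. */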
58
59 /* data file generated by Tools/unicode/makeunicodedata.py */
60 #include "unicodedata_db.h"
61
62 static const _PyUnicode_DatabaseRecord*
63 _getrecord_ex(Py_UCS4 code)
64 {
65 int index;
66 if (code >= 0x110000)
67 index = 0;
68 else {
69 index = index1[(code>>SHIFT)];
70 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
71 }
72
73 return &_PyUnicode_Database_Records[index];
74 }
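/* The records are addressed through a two-level table generated by
   Tools/unicode/makeunicodedata.py: index1[] maps the upper bits of the code
   point (code >> SHIFT) to a block number, and index2[] maps
   (block << SHIFT) + the low SHIFT bits to the record index.  Code points at
   or above 0x110000 resolve to record 0, the default record (which also
   serves as the record for unassigned code points). */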
75
76 /* ------------- Previous-version API ------------------------------------- */
77 typedef struct previous_version {
78 PyObject_HEAD
79 const char *name;
80 const change_record* (*getrecord)(Py_UCS4);
81 Py_UCS4 (*normalization)(Py_UCS4);
82 } PreviousDBVersion;
83
84 #include "clinic/unicodedata.c.h"
85
86 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
87
88 static PyMemberDef DB_members[] = {
89 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
90 {NULL}
91 };
92
93 // Check if self is a unicodedata.UCD instance.
94 // If self is NULL (when the PyCapsule C API is used), return 0.
95 // PyModule_Check() is used to avoid having to retrieve the ucd_type.
96 // See the unicodedata_functions comment for the rationale behind this macro.
97 #define UCD_Check(self) (self != NULL && !PyModule_Check(self))
98
99 static PyObject*
100 new_previous_version(PyTypeObject *ucd_type,
101 const char*name, const change_record* (*getrecord)(Py_UCS4),
102 Py_UCS4 (*normalization)(Py_UCS4))
103 {
104 PreviousDBVersion *self;
105 self = PyObject_GC_New(PreviousDBVersion, ucd_type);
106 if (self == NULL)
107 return NULL;
108 self->name = name;
109 self->getrecord = getrecord;
110 self->normalization = normalization;
111 PyObject_GC_Track(self);
112 return (PyObject*)self;
113 }
114
115
116 /* --- Module API --------------------------------------------------------- */
117
118 /*[clinic input]
119 unicodedata.UCD.decimal
120
121 self: self
122 chr: int(accept={str})
123 default: object=NULL
124 /
125
126 Converts a Unicode character into its equivalent decimal value.
127
128 Returns the decimal value assigned to the character chr as integer.
129 If no such value is defined, default is returned, or, if not given,
130 ValueError is raised.
131 [clinic start generated code]*/
132
133 static PyObject *
134 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
135 PyObject *default_value)
136 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
137 {
138 int have_old = 0;
139 long rc;
140 Py_UCS4 c = (Py_UCS4)chr;
141
142 if (UCD_Check(self)) {
143 const change_record *old = get_old_record(self, c);
144 if (old->category_changed == 0) {
145 /* unassigned */
146 have_old = 1;
147 rc = -1;
148 }
149 else if (old->decimal_changed != 0xFF) {
150 have_old = 1;
151 rc = old->decimal_changed;
152 }
153 }
154
155 if (!have_old)
156 rc = Py_UNICODE_TODECIMAL(c);
157 if (rc < 0) {
158 if (default_value == NULL) {
159 PyErr_SetString(PyExc_ValueError,
160 "not a decimal");
161 return NULL;
162 }
163 else {
164 Py_INCREF(default_value);
165 return default_value;
166 }
167 }
168 return PyLong_FromLong(rc);
169 }
170
171 /*[clinic input]
172 unicodedata.UCD.digit
173
174 self: self
175 chr: int(accept={str})
176 default: object=NULL
177 /
178
179 Converts a Unicode character into its equivalent digit value.
180
181 Returns the digit value assigned to the character chr as integer.
182 If no such value is defined, default is returned, or, if not given,
183 ValueError is raised.
184 [clinic start generated code]*/
185
186 static PyObject *
187 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
188 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
189 {
190 long rc;
191 Py_UCS4 c = (Py_UCS4)chr;
192 rc = Py_UNICODE_TODIGIT(c);
193 if (rc < 0) {
194 if (default_value == NULL) {
195 PyErr_SetString(PyExc_ValueError, "not a digit");
196 return NULL;
197 }
198 else {
199 Py_INCREF(default_value);
200 return default_value;
201 }
202 }
203 return PyLong_FromLong(rc);
204 }
205
206 /*[clinic input]
207 unicodedata.UCD.numeric
208
209 self: self
210 chr: int(accept={str})
211 default: object=NULL
212 /
213
214 Converts a Unicode character into its equivalent numeric value.
215
216 Returns the numeric value assigned to the character chr as float.
217 If no such value is defined, default is returned, or, if not given,
218 ValueError is raised.
219 [clinic start generated code]*/
220
221 static PyObject *
222 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
223 PyObject *default_value)
224 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
225 {
226 int have_old = 0;
227 double rc;
228 Py_UCS4 c = (Py_UCS4)chr;
229
230 if (UCD_Check(self)) {
231 const change_record *old = get_old_record(self, c);
232 if (old->category_changed == 0) {
233 /* unassigned */
234 have_old = 1;
235 rc = -1.0;
236 }
237 else if (old->decimal_changed != 0xFF) {
238 have_old = 1;
239 rc = old->decimal_changed;
240 }
241 }
242
243 if (!have_old)
244 rc = Py_UNICODE_TONUMERIC(c);
245 if (rc == -1.0) {
246 if (default_value == NULL) {
247 PyErr_SetString(PyExc_ValueError, "not a numeric character");
248 return NULL;
249 }
250 else {
251 Py_INCREF(default_value);
252 return default_value;
253 }
254 }
255 return PyFloat_FromDouble(rc);
256 }
257
258 /*[clinic input]
259 unicodedata.UCD.category
260
261 self: self
262 chr: int(accept={str})
263 /
264
265 Returns the general category assigned to the character chr as string.
266 [clinic start generated code]*/
267
268 static PyObject *
269 unicodedata_UCD_category_impl(PyObject *self, int chr)
270 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
271 {
272 int index;
273 Py_UCS4 c = (Py_UCS4)chr;
274 index = (int) _getrecord_ex(c)->category;
275 if (UCD_Check(self)) {
276 const change_record *old = get_old_record(self, c);
277 if (old->category_changed != 0xFF)
278 index = old->category_changed;
279 }
280 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
281 }
282
283 /*[clinic input]
284 unicodedata.UCD.bidirectional
285
286 self: self
287 chr: int(accept={str})
288 /
289
290 Returns the bidirectional class assigned to the character chr as string.
291
292 If no such value is defined, an empty string is returned.
293 [clinic start generated code]*/
294
295 static PyObject *
296 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
297 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
298 {
299 int index;
300 Py_UCS4 c = (Py_UCS4)chr;
301 index = (int) _getrecord_ex(c)->bidirectional;
302 if (UCD_Check(self)) {
303 const change_record *old = get_old_record(self, c);
304 if (old->category_changed == 0)
305 index = 0; /* unassigned */
306 else if (old->bidir_changed != 0xFF)
307 index = old->bidir_changed;
308 }
309 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
310 }
311
312 /*[clinic input]
313 unicodedata.UCD.combining -> int
314
315 self: self
316 chr: int(accept={str})
317 /
318
319 Returns the canonical combining class assigned to the character chr as integer.
320
321 Returns 0 if no combining class is defined.
322 [clinic start generated code]*/
323
324 static int
325 unicodedata_UCD_combining_impl(PyObject *self, int chr)
326 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
327 {
328 int index;
329 Py_UCS4 c = (Py_UCS4)chr;
330 index = (int) _getrecord_ex(c)->combining;
331 if (UCD_Check(self)) {
332 const change_record *old = get_old_record(self, c);
333 if (old->category_changed == 0)
334 index = 0; /* unassigned */
335 }
336 return index;
337 }
338
339 /*[clinic input]
340 unicodedata.UCD.mirrored -> int
341
342 self: self
343 chr: int(accept={str})
344 /
345
346 Returns the mirrored property assigned to the character chr as integer.
347
348 Returns 1 if the character has been identified as a "mirrored"
349 character in bidirectional text, 0 otherwise.
350 [clinic start generated code]*/
351
352 static int
353 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
354 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
355 {
356 int index;
357 Py_UCS4 c = (Py_UCS4)chr;
358 index = (int) _getrecord_ex(c)->mirrored;
359 if (UCD_Check(self)) {
360 const change_record *old = get_old_record(self, c);
361 if (old->category_changed == 0)
362 index = 0; /* unassigned */
363 else if (old->mirrored_changed != 0xFF)
364 index = old->mirrored_changed;
365 }
366 return index;
367 }
368
369 /*[clinic input]
370 unicodedata.UCD.east_asian_width
371
372 self: self
373 chr: int(accept={str})
374 /
375
376 Returns the east asian width assigned to the character chr as string.
377 [clinic start generated code]*/
378
379 static PyObject *
380 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
381 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
382 {
383 int index;
384 Py_UCS4 c = (Py_UCS4)chr;
385 index = (int) _getrecord_ex(c)->east_asian_width;
386 if (UCD_Check(self)) {
387 const change_record *old = get_old_record(self, c);
388 if (old->category_changed == 0)
389 index = 0; /* unassigned */
390 else if (old->east_asian_width_changed != 0xFF)
391 index = old->east_asian_width_changed;
392 }
393 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
394 }
395
396 /*[clinic input]
397 unicodedata.UCD.decomposition
398
399 self: self
400 chr: int(accept={str})
401 /
402
403 Returns the character decomposition mapping assigned to the character chr as string.
404
405 An empty string is returned in case no such mapping is defined.
406 [clinic start generated code]*/
407
408 static PyObject *
409 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
410 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
411 {
412 char decomp[256];
413 int code, index, count;
414 size_t i;
415 unsigned int prefix_index;
416 Py_UCS4 c = (Py_UCS4)chr;
417
418 code = (int)c;
419
420 if (UCD_Check(self)) {
421 const change_record *old = get_old_record(self, c);
422 if (old->category_changed == 0)
423 return PyUnicode_FromString(""); /* unassigned */
424 }
425
426 if (code < 0 || code >= 0x110000)
427 index = 0;
428 else {
429 index = decomp_index1[(code>>DECOMP_SHIFT)];
430 index = decomp_index2[(index<<DECOMP_SHIFT)+
431 (code&((1<<DECOMP_SHIFT)-1))];
432 }
433
434 /* high byte is number of hex bytes (usually one or two), low byte
435 is the prefix code (an index into decomp_prefix) */
436 count = decomp_data[index] >> 8;
437
438 /* XXX: could allocate the PyString up front instead
439 (strlen(prefix) + 5 * count + 1 bytes) */
440
441 /* Based on how index is calculated above and decomp_data is generated
442 from Tools/unicode/makeunicodedata.py, it should not be possible
443 to overflow decomp_prefix. */
444 prefix_index = decomp_data[index] & 255;
445 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
446
447 /* copy prefix */
448 i = strlen(decomp_prefix[prefix_index]);
449 memcpy(decomp, decomp_prefix[prefix_index], i);
450
451 while (count-- > 0) {
452 if (i)
453 decomp[i++] = ' ';
454 assert(i < sizeof(decomp));
455 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
456 decomp_data[++index]);
457 i += strlen(decomp + i);
458 }
459 return PyUnicode_FromStringAndSize(decomp, i);
460 }
461
462 static void
463 get_decomp_record(PyObject *self, Py_UCS4 code,
464 int *index, int *prefix, int *count)
465 {
466 if (code >= 0x110000) {
467 *index = 0;
468 }
469 else if (UCD_Check(self)
470 && get_old_record(self, code)->category_changed==0) {
471 /* unassigned in old version */
472 *index = 0;
473 }
474 else {
475 *index = decomp_index1[(code>>DECOMP_SHIFT)];
476 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
477 (code&((1<<DECOMP_SHIFT)-1))];
478 }
479
480 /* high byte is number of hex bytes (usually one or two), low byte
481 is the prefix code (an index into decomp_prefix) */
482 *count = decomp_data[*index] >> 8;
483 *prefix = decomp_data[*index] & 255;
484
485 (*index)++;
486 }
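/* Layout of the data consumed above: decomp_index1/decomp_index2 form the
   same kind of two-level table as index1/index2.  For each entry,
   decomp_data[index] packs the decomposition length in its high byte and an
   index into decomp_prefix (the compatibility tag; index 0, the empty string,
   for canonical decompositions) in its low byte.  The decomposed code points
   themselves follow in decomp_data[index+1 .. index+count]. */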
487
488 #define SBase 0xAC00
489 #define LBase 0x1100
490 #define VBase 0x1161
491 #define TBase 0x11A7
492 #define LCount 19
493 #define VCount 21
494 #define TCount 28
495 #define NCount (VCount*TCount)
496 #define SCount (LCount*NCount)
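/* These constants implement the Unicode Standard's algorithmic mapping for
   precomposed Hangul syllables.  A syllable S in [SBase, SBase+SCount)
   decomposes as

       SIndex = S - SBase
       L = LBase + SIndex / NCount
       V = VBase + (SIndex % NCount) / TCount
       T = TBase + SIndex % TCount    (no trailing jamo when T == TBase)

   For example, U+AC01 (HANGUL SYLLABLE GAG) has SIndex = 1, giving
   L = U+1100, V = U+1161, T = U+11A8.  nfd_nfkd() below applies exactly this
   arithmetic; nfc_nfkc() inverts it. */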
497
498 static PyObject*
499 nfd_nfkd(PyObject *self, PyObject *input, int k)
500 {
501 PyObject *result;
502 Py_UCS4 *output;
503 Py_ssize_t i, o, osize;
504 int kind;
505 const void *data;
506 /* Longest decomposition in Unicode 3.2: U+FDFA */
507 Py_UCS4 stack[20];
508 Py_ssize_t space, isize;
509 int index, prefix, count, stackptr;
510 unsigned char prev, cur;
511
512 stackptr = 0;
513 isize = PyUnicode_GET_LENGTH(input);
514 space = isize;
515 /* Overallocate at most 10 characters. */
516 if (space > 10) {
517 if (space <= PY_SSIZE_T_MAX - 10)
518 space += 10;
519 }
520 else {
521 space *= 2;
522 }
523 osize = space;
524 output = PyMem_NEW(Py_UCS4, space);
525 if (!output) {
526 PyErr_NoMemory();
527 return NULL;
528 }
529 i = o = 0;
530 kind = PyUnicode_KIND(input);
531 data = PyUnicode_DATA(input);
532
533 while (i < isize) {
534 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
535 while(stackptr) {
536 Py_UCS4 code = stack[--stackptr];
537 /* Hangul Decomposition adds three characters in
538 a single step, so we need at least that much room. */
539 if (space < 3) {
540 Py_UCS4 *new_output;
541 osize += 10;
542 space += 10;
543 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
544 if (new_output == NULL) {
545 PyMem_Free(output);
546 PyErr_NoMemory();
547 return NULL;
548 }
549 output = new_output;
550 }
551 /* Hangul Decomposition. */
552 if (SBase <= code && code < (SBase+SCount)) {
553 int SIndex = code - SBase;
554 int L = LBase + SIndex / NCount;
555 int V = VBase + (SIndex % NCount) / TCount;
556 int T = TBase + SIndex % TCount;
557 output[o++] = L;
558 output[o++] = V;
559 space -= 2;
560 if (T != TBase) {
561 output[o++] = T;
562 space --;
563 }
564 continue;
565 }
566 /* normalization changes */
567 if (UCD_Check(self)) {
568 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
569 if (value != 0) {
570 stack[stackptr++] = value;
571 continue;
572 }
573 }
574
575 /* Other decompositions. */
576 get_decomp_record(self, code, &index, &prefix, &count);
577
578 /* Copy character if it is not decomposable, or has a
579 compatibility decomposition, but we do NFD. */
580 if (!count || (prefix && !k)) {
581 output[o++] = code;
582 space--;
583 continue;
584 }
585 /* Copy decomposition onto the stack, in reverse
586 order. */
587 while(count) {
588 code = decomp_data[index + (--count)];
589 stack[stackptr++] = code;
590 }
591 }
592 }
593
594 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
595 output, o);
596 PyMem_Free(output);
597 if (!result)
598 return NULL;
599 /* result is guaranteed to be ready, as it is compact. */
600 kind = PyUnicode_KIND(result);
601 data = PyUnicode_DATA(result);
602
603 /* Sort canonically. */
604 i = 0;
605 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
606 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
607 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
608 if (prev == 0 || cur == 0 || prev <= cur) {
609 prev = cur;
610 continue;
611 }
612 /* Non-canonical order. Need to switch *i with previous. */
613 o = i - 1;
614 while (1) {
615 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
616 PyUnicode_WRITE(kind, data, o+1,
617 PyUnicode_READ(kind, data, o));
618 PyUnicode_WRITE(kind, data, o, tmp);
619 o--;
620 if (o < 0)
621 break;
622 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
623 if (prev == 0 || prev <= cur)
624 break;
625 }
626 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
627 }
628 return result;
629 }
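/* The loop above performs the Canonical Ordering step of UAX #15: after
   decomposition, runs of characters with a nonzero canonical combining class
   are sorted into non-decreasing class order with a simple exchange sort,
   while starters (combining class 0) act as barriers that are never moved. */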
630
631 static int
632 find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
633 {
634 unsigned int index;
635 for (index = 0; nfc[index].start; index++) {
636 unsigned int start = nfc[index].start;
637 if (code < start)
638 return -1;
639 if (code <= start + nfc[index].count) {
640 unsigned int delta = code - start;
641 return nfc[index].index + delta;
642 }
643 }
644 return -1;
645 }
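/* nfc_first and nfc_last are "reindex" tables generated by
   Tools/unicode/makeunicodedata.py.  Each entry covers the code point range
   [start, start+count] and maps it onto a dense index range beginning at
   .index; find_nfc_index() returns that dense index, or -1 if the code point
   can never occur as the first (resp. second) character of a canonical
   composition.  nfc_nfkc() combines the two results as f*TOTAL_LAST + l to
   look up the composed character in comp_data. */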
646
647 static PyObject*
648 nfc_nfkc(PyObject *self, PyObject *input, int k)
649 {
650 PyObject *result;
651 int kind;
652 const void *data;
653 Py_UCS4 *output;
654 Py_ssize_t i, i1, o, len;
655 int f,l,index,index1,comb;
656 Py_UCS4 code;
657 Py_ssize_t skipped[20];
658 int cskipped = 0;
659
660 result = nfd_nfkd(self, input, k);
661 if (!result)
662 return NULL;
663 /* result will be "ready". */
664 kind = PyUnicode_KIND(result);
665 data = PyUnicode_DATA(result);
666 len = PyUnicode_GET_LENGTH(result);
667
668 /* We allocate a buffer for the output.
669 If we find that we made no changes, we still return
670 the NFD result. */
671 output = PyMem_NEW(Py_UCS4, len);
672 if (!output) {
673 PyErr_NoMemory();
674 Py_DECREF(result);
675 return 0;
676 }
677 i = o = 0;
678
679 again:
680 while (i < len) {
681 for (index = 0; index < cskipped; index++) {
682 if (skipped[index] == i) {
683 /* *i character is skipped.
684 Remove from list. */
685 skipped[index] = skipped[cskipped-1];
686 cskipped--;
687 i++;
688 goto again; /* continue while */
689 }
690 }
691 /* Hangul Composition. We don't need to check for <LV,T>
692 pairs, since we always have decomposed data. */
693 code = PyUnicode_READ(kind, data, i);
694 if (LBase <= code && code < (LBase+LCount) &&
695 i + 1 < len &&
696 VBase <= PyUnicode_READ(kind, data, i+1) &&
697 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
698 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
699 and V character is a modern vowel (0x1161 ~ 0x1175). */
700 int LIndex, VIndex;
701 LIndex = code - LBase;
702 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
703 code = SBase + (LIndex*VCount+VIndex)*TCount;
704 i+=2;
705 if (i < len &&
706 TBase < PyUnicode_READ(kind, data, i) &&
707 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
708 /* check T character is a modern trailing consonant
709 (0x11A8 ~ 0x11C2). */
710 code += PyUnicode_READ(kind, data, i)-TBase;
711 i++;
712 }
713 output[o++] = code;
714 continue;
715 }
716
717 /* code is still input[i] here */
718 f = find_nfc_index(nfc_first, code);
719 if (f == -1) {
720 output[o++] = code;
721 i++;
722 continue;
723 }
724 /* Find next unblocked character. */
725 i1 = i+1;
726 comb = 0;
727 /* output base character for now; might be updated later. */
728 output[o] = PyUnicode_READ(kind, data, i);
729 while (i1 < len) {
730 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
731 int comb1 = _getrecord_ex(code1)->combining;
732 if (comb) {
733 if (comb1 == 0)
734 break;
735 if (comb >= comb1) {
736 /* Character is blocked. */
737 i1++;
738 continue;
739 }
740 }
741 l = find_nfc_index(nfc_last, code1);
742 /* i1 cannot be combined with i. If i1
743 is a starter, we don't need to look further.
744 Otherwise, record the combining class. */
745 if (l == -1) {
746 not_combinable:
747 if (comb1 == 0)
748 break;
749 comb = comb1;
750 i1++;
751 continue;
752 }
753 index = f*TOTAL_LAST + l;
754 index1 = comp_index[index >> COMP_SHIFT];
755 code = comp_data[(index1<<COMP_SHIFT)+
756 (index&((1<<COMP_SHIFT)-1))];
757 if (code == 0)
758 goto not_combinable;
759
760 /* Replace the original character. */
761 output[o] = code;
762 /* Mark the second character unused. */
763 assert(cskipped < 20);
764 skipped[cskipped++] = i1;
765 i1++;
766 f = find_nfc_index(nfc_first, output[o]);
767 if (f == -1)
768 break;
769 }
770 /* Output character was already written.
771 Just advance the indices. */
772 o++; i++;
773 }
774 if (o == len) {
775 /* No changes. Return original string. */
776 PyMem_Free(output);
777 return result;
778 }
779 Py_DECREF(result);
780 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
781 output, o);
782 PyMem_Free(output);
783 return result;
784 }
785
786 // This needs to match the logic in makeunicodedata.py
787 // which constructs the quickcheck data.
788 typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
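/* Each database record packs the four Quick_Check properties into one byte
   (normalization_quick_check).  Judging from the shift computed in
   is_normalized_quickcheck() below, (nfc ? 4 : 0) + (k ? 2 : 0), the layout
   is two bits per form: bits 0-1 NFD, 2-3 NFKD, 4-5 NFC, 6-7 NFKC, each
   holding a QuickcheckResult value. */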
789
790 /* Run the Unicode normalization "quickcheck" algorithm.
791 *
792 * Return YES or NO if quickcheck determines the input is certainly
793 * normalized or certainly not, and MAYBE if quickcheck is unable to
794 * tell.
795 *
796 * If `yes_only` is true, then return MAYBE as soon as we determine
797 * the answer is not YES.
798 *
799 * For background and details on the algorithm, see UAX #15:
800 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
801 */
802 static QuickcheckResult
803 is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
804 bool yes_only)
805 {
806 /* If UCD 3.2.0 is requested, quickchecks must be disabled. */
807 if (UCD_Check(self)) {
808 return NO;
809 }
810
811 Py_ssize_t i, len;
812 int kind;
813 const void *data;
814 unsigned char prev_combining = 0;
815
816 /* The two quickcheck bits at this shift have type QuickcheckResult. */
817 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
818
819 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
820
821 i = 0;
822 kind = PyUnicode_KIND(input);
823 data = PyUnicode_DATA(input);
824 len = PyUnicode_GET_LENGTH(input);
825 while (i < len) {
826 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
827 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
828
829 unsigned char combining = record->combining;
830 if (combining && prev_combining > combining)
831 return NO; /* non-canonical sort order, not normalized */
832 prev_combining = combining;
833
834 unsigned char quickcheck_whole = record->normalization_quick_check;
835 if (yes_only) {
836 if (quickcheck_whole & (3 << quickcheck_shift))
837 return MAYBE;
838 } else {
839 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
840 case NO:
841 return NO;
842 case MAYBE:
843 result = MAYBE; /* this string might need normalization */
844 }
845 }
846 }
847 return result;
848 }
849
850 /*[clinic input]
851 unicodedata.UCD.is_normalized
852
853 self: self
854 form: unicode
855 unistr as input: unicode
856 /
857
858 Return whether the Unicode string unistr is in the normal form 'form'.
859
860 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
861 [clinic start generated code]*/
862
863 static PyObject *
864 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
865 PyObject *input)
866 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
867 {
868 if (PyUnicode_READY(input) == -1) {
869 return NULL;
870 }
871
872 if (PyUnicode_GET_LENGTH(input) == 0) {
873 /* special case empty input strings. */
874 Py_RETURN_TRUE;
875 }
876
877 PyObject *result;
878 bool nfc = false;
879 bool k = false;
880 QuickcheckResult m;
881
882 PyObject *cmp;
883 int match = 0;
884
885 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
886 nfc = true;
887 }
888 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
889 nfc = true;
890 k = true;
891 }
892 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
893 /* matches default values for `nfc` and `k` */
894 }
895 else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
896 k = true;
897 }
898 else {
899 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
900 return NULL;
901 }
902
903 m = is_normalized_quickcheck(self, input, nfc, k, false);
904
905 if (m == MAYBE) {
906 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
907 if (cmp == NULL) {
908 return NULL;
909 }
910 match = PyUnicode_Compare(input, cmp);
911 Py_DECREF(cmp);
912 result = (match == 0) ? Py_True : Py_False;
913 }
914 else {
915 result = (m == YES) ? Py_True : Py_False;
916 }
917
918 Py_INCREF(result);
919 return result;
920 }
921
922
923 /*[clinic input]
924 unicodedata.UCD.normalize
925
926 self: self
927 form: unicode
928 unistr as input: unicode
929 /
930
931 Return the normal form 'form' for the Unicode string unistr.
932
933 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
934 [clinic start generated code]*/
935
936 static PyObject *
937 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
938 PyObject *input)
939 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
940 {
941 if (PyUnicode_GET_LENGTH(input) == 0) {
942 /* Special case empty input strings, since resizing
943 them later would cause internal errors. */
944 Py_INCREF(input);
945 return input;
946 }
947
948 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
949 if (is_normalized_quickcheck(self, input,
950 true, false, true) == YES) {
951 Py_INCREF(input);
952 return input;
953 }
954 return nfc_nfkc(self, input, 0);
955 }
956 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
957 if (is_normalized_quickcheck(self, input,
958 true, true, true) == YES) {
959 Py_INCREF(input);
960 return input;
961 }
962 return nfc_nfkc(self, input, 1);
963 }
964 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
965 if (is_normalized_quickcheck(self, input,
966 false, false, true) == YES) {
967 Py_INCREF(input);
968 return input;
969 }
970 return nfd_nfkd(self, input, 0);
971 }
972 if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
973 if (is_normalized_quickcheck(self, input,
974 false, true, true) == YES) {
975 Py_INCREF(input);
976 return input;
977 }
978 return nfd_nfkd(self, input, 1);
979 }
980 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
981 return NULL;
982 }
983
984 /* -------------------------------------------------------------------- */
985 /* unicode character name tables */
986
987 /* data file generated by Tools/unicode/makeunicodedata.py */
988 #include "unicodename_db.h"
989
990 /* -------------------------------------------------------------------- */
991 /* database code (cut and pasted from the unidb package) */
992
993 static unsigned long
994 _gethash(const char *s, int len, int scale)
995 {
996 int i;
997 unsigned long h = 0;
998 unsigned long ix;
999 for (i = 0; i < len; i++) {
1000 h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
1001 ix = h & 0xff000000;
1002 if (ix)
1003 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
1004 }
1005 return h;
1006 }
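/* This must match the string hash used by Tools/unicode/makeunicodedata.py
   when it builds the static name hash table: each byte of the (uppercased)
   name is folded in with `scale` as multiplier, and bits that overflow above
   bit 24 are XOR-ed back in, so the result always fits in 24 bits.
   _getcode() below calls it with scale == code_magic so that probing
   code_hash[] reproduces the generator's layout. */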
1007
1008 static const char * const hangul_syllables[][3] = {
1009 { "G", "A", "" },
1010 { "GG", "AE", "G" },
1011 { "N", "YA", "GG" },
1012 { "D", "YAE", "GS" },
1013 { "DD", "EO", "N", },
1014 { "R", "E", "NJ" },
1015 { "M", "YEO", "NH" },
1016 { "B", "YE", "D" },
1017 { "BB", "O", "L" },
1018 { "S", "WA", "LG" },
1019 { "SS", "WAE", "LM" },
1020 { "", "OE", "LB" },
1021 { "J", "YO", "LS" },
1022 { "JJ", "U", "LT" },
1023 { "C", "WEO", "LP" },
1024 { "K", "WE", "LH" },
1025 { "T", "WI", "M" },
1026 { "P", "YU", "B" },
1027 { "H", "EU", "BS" },
1028 { 0, "YI", "S" },
1029 { 0, "I", "SS" },
1030 { 0, 0, "NG" },
1031 { 0, 0, "J" },
1032 { 0, 0, "C" },
1033 { 0, 0, "K" },
1034 { 0, 0, "T" },
1035 { 0, 0, "P" },
1036 { 0, 0, "H" }
1037 };
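/* Romanized jamo names used for Hangul syllable names.  Column 0 holds the
   19 leading consonants (indexed by L), column 1 the 21 vowels (indexed by
   V), and column 2 the 28 trailing consonants (indexed by T, with an empty
   first entry meaning "no trailing consonant").  _getucname() concatenates
   one entry from each column after "HANGUL SYLLABLE "; find_syllable()
   parses a name back by picking the longest matching entry in each column. */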
1038
1039 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1040 static int
1041 is_unified_ideograph(Py_UCS4 code)
1042 {
1043 return
1044 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1045 (0x4E00 <= code && code <= 0x9FFC) || /* CJK Ideograph */
1046 (0x20000 <= code && code <= 0x2A6DD) || /* CJK Ideograph Extension B */
1047 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
1048 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1049 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1050 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1051 (0x30000 <= code && code <= 0x3134A); /* CJK Ideograph Extension G */
1052 }
1053
1054 /* macros used to determine if the given code point is in the PUA range that
1055 * we are using to store aliases and named sequences */
1056 #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
1057 #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
1058 (cp < named_sequences_end))
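/* makeunicodedata.py assigns each name alias and each named sequence a
   synthetic code point in a Private Use Area range ([aliases_start,
   aliases_end) and [named_sequences_start, named_sequences_end)
   respectively), so both kinds of names can share the phrasebook and hash
   table used for regular character names.  _check_alias_and_seq() maps alias
   code points back to the real character, and unicodedata_UCD_lookup_impl()
   expands named-sequence code points into the actual sequence. */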
1059
1060 static int
1061 _getucname(PyObject *self,
1062 Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1063 {
1064 /* Find the name associated with the given code point.
1065 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1066 * that we are using for aliases and named sequences. */
1067 int offset;
1068 int i;
1069 int word;
1070 const unsigned char* w;
1071
1072 if (code >= 0x110000)
1073 return 0;
1074
1075 /* XXX should we just skip all the code points in the PUAs here? */
1076 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1077 return 0;
1078
1079 if (UCD_Check(self)) {
1080 /* in 3.2.0 there are no aliases and named sequences */
1081 const change_record *old;
1082 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1083 return 0;
1084 old = get_old_record(self, code);
1085 if (old->category_changed == 0) {
1086 /* unassigned */
1087 return 0;
1088 }
1089 }
1090
1091 if (SBase <= code && code < SBase+SCount) {
1092 /* Hangul syllable. */
1093 int SIndex = code - SBase;
1094 int L = SIndex / NCount;
1095 int V = (SIndex % NCount) / TCount;
1096 int T = SIndex % TCount;
1097
1098 if (buflen < 27)
1099 /* Worst case: HANGUL SYLLABLE <10chars>. */
1100 return 0;
1101 strcpy(buffer, "HANGUL SYLLABLE ");
1102 buffer += 16;
1103 strcpy(buffer, hangul_syllables[L][0]);
1104 buffer += strlen(hangul_syllables[L][0]);
1105 strcpy(buffer, hangul_syllables[V][1]);
1106 buffer += strlen(hangul_syllables[V][1]);
1107 strcpy(buffer, hangul_syllables[T][2]);
1108 buffer += strlen(hangul_syllables[T][2]);
1109 *buffer = '\0';
1110 return 1;
1111 }
1112
1113 if (is_unified_ideograph(code)) {
1114 if (buflen < 28)
1115 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1116 return 0;
1117 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1118 return 1;
1119 }
1120
1121 /* get offset into phrasebook */
1122 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1123 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1124 (code&((1<<phrasebook_shift)-1))];
1125 if (!offset)
1126 return 0;
1127
1128 i = 0;
1129
1130 for (;;) {
1131 /* get word index */
1132 word = phrasebook[offset] - phrasebook_short;
1133 if (word >= 0) {
1134 word = (word << 8) + phrasebook[offset+1];
1135 offset += 2;
1136 } else
1137 word = phrasebook[offset++];
1138 if (i) {
1139 if (i > buflen)
1140 return 0; /* buffer overflow */
1141 buffer[i++] = ' ';
1142 }
1143 /* copy word string from lexicon. the last character in the
1144 word has bit 7 set. the last word in a string ends with
1145 0x80 */
1146 w = lexicon + lexicon_offset[word];
1147 while (*w < 128) {
1148 if (i >= buflen)
1149 return 0; /* buffer overflow */
1150 buffer[i++] = *w++;
1151 }
1152 if (i >= buflen)
1153 return 0; /* buffer overflow */
1154 buffer[i++] = *w & 127;
1155 if (*w == 128)
1156 break; /* end of word */
1157 }
1158
1159 return 1;
1160 }
1161
1162 static int
1163 capi_getucname(Py_UCS4 code,
1164 char* buffer, int buflen,
1165 int with_alias_and_seq)
1166 {
1167 return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
1168
1169 }
1170
1171 static int
1172 _cmpname(PyObject *self, int code, const char* name, int namelen)
1173 {
1174 /* check if code corresponds to the given name */
1175 int i;
1176 char buffer[NAME_MAXLEN+1];
1177 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1178 return 0;
1179 for (i = 0; i < namelen; i++) {
1180 if (Py_TOUPPER(name[i]) != buffer[i])
1181 return 0;
1182 }
1183 return buffer[namelen] == '\0';
1184 }
1185
1186 static void
1187 find_syllable(const char *str, int *len, int *pos, int count, int column)
1188 {
1189 int i, len1;
1190 *len = -1;
1191 for (i = 0; i < count; i++) {
1192 const char *s = hangul_syllables[i][column];
1193 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1194 if (len1 <= *len)
1195 continue;
1196 if (strncmp(str, s, len1) == 0) {
1197 *len = len1;
1198 *pos = i;
1199 }
1200 }
1201 if (*len == -1) {
1202 *len = 0;
1203 }
1204 }
1205
1206 static int
1207 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1208 {
1209 /* check if named sequences are allowed */
1210 if (!with_named_seq && IS_NAMED_SEQ(cp))
1211 return 0;
1212 /* if the code point is in the PUA range that we use for aliases,
1213 * convert it to obtain the right code point */
1214 if (IS_ALIAS(cp))
1215 *code = name_aliases[cp-aliases_start];
1216 else
1217 *code = cp;
1218 return 1;
1219 }
1220
1221 static int
1222 _getcode(PyObject* self,
1223 const char* name, int namelen, Py_UCS4* code, int with_named_seq)
1224 {
1225 /* Return the code point associated with the given name.
1226 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1227 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1228 * using for the named sequence, and the caller must then convert it. */
1229 unsigned int h, v;
1230 unsigned int mask = code_size-1;
1231 unsigned int i, incr;
1232
1233 /* Check for hangul syllables. */
1234 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1235 int len, L = -1, V = -1, T = -1;
1236 const char *pos = name + 16;
1237 find_syllable(pos, &len, &L, LCount, 0);
1238 pos += len;
1239 find_syllable(pos, &len, &V, VCount, 1);
1240 pos += len;
1241 find_syllable(pos, &len, &T, TCount, 2);
1242 pos += len;
1243 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1244 *code = SBase + (L*VCount+V)*TCount + T;
1245 return 1;
1246 }
1247 /* Otherwise, it's an illegal syllable name. */
1248 return 0;
1249 }
1250
1251 /* Check for unified ideographs. */
1252 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1253 /* Four or five hexdigits must follow. */
1254 v = 0;
1255 name += 22;
1256 namelen -= 22;
1257 if (namelen != 4 && namelen != 5)
1258 return 0;
1259 while (namelen--) {
1260 v *= 16;
1261 if (*name >= '0' && *name <= '9')
1262 v += *name - '0';
1263 else if (*name >= 'A' && *name <= 'F')
1264 v += *name - 'A' + 10;
1265 else
1266 return 0;
1267 name++;
1268 }
1269 if (!is_unified_ideograph(v))
1270 return 0;
1271 *code = v;
1272 return 1;
1273 }
1274
1275 /* the following is the same as python's dictionary lookup, with
1276 only minor changes. see the makeunicodedata script for more
1277 details */
1278
1279 h = (unsigned int) _gethash(name, namelen, code_magic);
1280 i = (~h) & mask;
1281 v = code_hash[i];
1282 if (!v)
1283 return 0;
1284 if (_cmpname(self, v, name, namelen)) {
1285 return _check_alias_and_seq(v, code, with_named_seq);
1286 }
1287 incr = (h ^ (h >> 3)) & mask;
1288 if (!incr)
1289 incr = mask;
1290 for (;;) {
1291 i = (i + incr) & mask;
1292 v = code_hash[i];
1293 if (!v)
1294 return 0;
1295 if (_cmpname(self, v, name, namelen)) {
1296 return _check_alias_and_seq(v, code, with_named_seq);
1297 }
1298 incr = incr << 1;
1299 if (incr > mask)
1300 incr = incr ^ code_poly;
1301 }
1302 }
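/* Note on the probing above: as in CPython's dict lookup, the search starts
   at slot (~h) & mask and then repeatedly advances by `incr`, initially
   (h ^ (h >> 3)) & mask (or mask if that is zero).  Whenever doubling `incr`
   overflows the table size it is reduced with the generator polynomial
   code_poly, so the probe sequence eventually visits every slot.  An empty
   slot (v == 0) means the name is not present. */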
1303
1304 static int
1305 capi_getcode(const char* name, int namelen, Py_UCS4* code,
1306 int with_named_seq)
1307 {
1308 return _getcode(NULL, name, namelen, code, with_named_seq);
1309
1310 }
1311
1312 static void
1313 unicodedata_destroy_capi(PyObject *capsule)
1314 {
1315 void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1316 PyMem_Free(capi);
1317 }
1318
1319 static PyObject *
1320 unicodedata_create_capi(void)
1321 {
1322 _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1323 if (capi == NULL) {
1324 PyErr_NoMemory();
1325 return NULL;
1326 }
1327 capi->getname = capi_getucname;
1328 capi->getcode = capi_getcode;
1329
1330 PyObject *capsule = PyCapsule_New(capi,
1331 PyUnicodeData_CAPSULE_NAME,
1332 unicodedata_destroy_capi);
1333 if (capsule == NULL) {
1334 PyMem_Free(capi);
1335 }
1336 return capsule;
1337 };
1338
1339
1340 /* -------------------------------------------------------------------- */
1341 /* Python bindings */
1342
1343 /*[clinic input]
1344 unicodedata.UCD.name
1345
1346 self: self
1347 chr: int(accept={str})
1348 default: object=NULL
1349 /
1350
1351 Returns the name assigned to the character chr as a string.
1352
1353 If no name is defined, default is returned, or, if not given,
1354 ValueError is raised.
1355 [clinic start generated code]*/
1356
1357 static PyObject *
1358 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1359 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1360 {
1361 char name[NAME_MAXLEN+1];
1362 Py_UCS4 c = (Py_UCS4)chr;
1363
1364 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1365 if (default_value == NULL) {
1366 PyErr_SetString(PyExc_ValueError, "no such name");
1367 return NULL;
1368 }
1369 else {
1370 Py_INCREF(default_value);
1371 return default_value;
1372 }
1373 }
1374
1375 return PyUnicode_FromString(name);
1376 }
1377
1378 /*[clinic input]
1379 unicodedata.UCD.lookup
1380
1381 self: self
1382 name: str(accept={str, robuffer}, zeroes=True)
1383 /
1384
1385 Look up character by name.
1386
1387 If a character with the given name is found, return the
1388 corresponding character. If not found, KeyError is raised.
1389 [clinic start generated code]*/
1390
1391 static PyObject *
1392 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1393 Py_ssize_clean_t name_length)
1394 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
1395 {
1396 Py_UCS4 code;
1397 unsigned int index;
1398 if (name_length > NAME_MAXLEN) {
1399 PyErr_SetString(PyExc_KeyError, "name too long");
1400 return NULL;
1401 }
1402
1403 if (!_getcode(self, name, (int)name_length, &code, 1)) {
1404 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1405 return NULL;
1406 }
1407 /* check if code is in the PUA range that we use for named sequences
1408 and convert it */
1409 if (IS_NAMED_SEQ(code)) {
1410 index = code-named_sequences_start;
1411 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1412 named_sequences[index].seq,
1413 named_sequences[index].seqlen);
1414 }
1415 return PyUnicode_FromOrdinal(code);
1416 }
1417
1418 // List of functions used to define module functions *AND* unicodedata.UCD
1419 // methods. For module functions, self is the module. For UCD methods, self
1420 // is a UCD instance. The UCD_Check() macro is used to check if self is
1421 // a UCD instance.
1422 static PyMethodDef unicodedata_functions[] = {
1423 UNICODEDATA_UCD_DECIMAL_METHODDEF
1424 UNICODEDATA_UCD_DIGIT_METHODDEF
1425 UNICODEDATA_UCD_NUMERIC_METHODDEF
1426 UNICODEDATA_UCD_CATEGORY_METHODDEF
1427 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1428 UNICODEDATA_UCD_COMBINING_METHODDEF
1429 UNICODEDATA_UCD_MIRRORED_METHODDEF
1430 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1431 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1432 UNICODEDATA_UCD_NAME_METHODDEF
1433 UNICODEDATA_UCD_LOOKUP_METHODDEF
1434 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1435 UNICODEDATA_UCD_NORMALIZE_METHODDEF
1436 {NULL, NULL} /* sentinel */
1437 };
1438
1439 static int
1440 ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
1441 {
1442 Py_VISIT(Py_TYPE(self));
1443 return 0;
1444 }
1445
1446 static void
1447 ucd_dealloc(PreviousDBVersion *self)
1448 {
1449 PyTypeObject *tp = Py_TYPE(self);
1450 PyObject_GC_UnTrack(self);
1451 PyObject_GC_Del(self);
1452 Py_DECREF(tp);
1453 }
1454
1455 static PyType_Slot ucd_type_slots[] = {
1456 {Py_tp_dealloc, ucd_dealloc},
1457 {Py_tp_traverse, ucd_traverse},
1458 {Py_tp_getattro, PyObject_GenericGetAttr},
1459 {Py_tp_methods, unicodedata_functions},
1460 {Py_tp_members, DB_members},
1461 {0, 0}
1462 };
1463
1464 static PyType_Spec ucd_type_spec = {
1465 .name = "unicodedata.UCD",
1466 .basicsize = sizeof(PreviousDBVersion),
1467 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
1468 Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
1469 .slots = ucd_type_slots
1470 };
1471
1472 PyDoc_STRVAR(unicodedata_docstring,
1473 "This module provides access to the Unicode Character Database which\n\
1474 defines character properties for all Unicode characters. The data in\n\
1475 this database is based on the UnicodeData.txt file version\n\
1476 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1477 \n\
1478 The module uses the same names and symbols as defined by the\n\
1479 UnicodeData File Format " UNIDATA_VERSION ".");
1480
1481 static int
1482 unicodedata_exec(PyObject *module)
1483 {
1484 if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1485 return -1;
1486 }
1487
1488 PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1489 if (ucd_type == NULL) {
1490 return -1;
1491 }
1492
1493 if (PyModule_AddType(module, ucd_type) < 0) {
1494 Py_DECREF(ucd_type);
1495 return -1;
1496 }
1497
1498 // Unicode database version 3.2.0 used by the IDNA encoding
1499 PyObject *v;
1500 v = new_previous_version(ucd_type, "3.2.0",
1501 get_change_3_2_0, normalization_3_2_0);
1502 Py_DECREF(ucd_type);
1503 if (v == NULL) {
1504 return -1;
1505 }
1506 if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
1507 Py_DECREF(v);
1508 return -1;
1509 }
1510
1511 /* Export C API */
1512 PyObject *capsule = unicodedata_create_capi();
1513 if (capsule == NULL) {
1514 return -1;
1515 }
1516 int rc = PyModule_AddObjectRef(module, "_ucnhash_CAPI", capsule);
1517 Py_DECREF(capsule);
1518 if (rc < 0) {
1519 return -1;
1520 }
1521 return 0;
1522 }
1523
1524 static PyModuleDef_Slot unicodedata_slots[] = {
1525 {Py_mod_exec, unicodedata_exec},
1526 {0, NULL}
1527 };
1528
1529 static struct PyModuleDef unicodedata_module = {
1530 PyModuleDef_HEAD_INIT,
1531 .m_name = "unicodedata",
1532 .m_doc = unicodedata_docstring,
1533 .m_size = 0,
1534 .m_methods = unicodedata_functions,
1535 .m_slots = unicodedata_slots,
1536 };
1537
1538 PyMODINIT_FUNC
1539 PyInit_unicodedata(void)
1540 {
1541 return PyModuleDef_Init(&unicodedata_module);
1542 }
1543
1544
1545 /*
1546 Local variables:
1547 c-basic-offset: 4
1548 indent-tabs-mode: nil
1549 End:
1550 */
1551