• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * wchar_t helpers for CPython >= 3.3.
 *
 * CPython 3.3 added support for sys.maxunicode == 0x10FFFF on all
 * platforms, even ones with wchar_t limited to 2 bytes.  As such,
 * this code behaves, as seen from the outside, like wchar_helper.h in
 * the case Py_UNICODE_SIZE == 4, but the implementation is very different.
 */
9 
/* Fixed-width code-unit types used by the helpers in this file:
 * one UTF-16 code unit and one UTF-32 code unit respectively. */
typedef uint16_t cffi_char16_t;
typedef uint32_t cffi_char32_t;
12 
13 
14 static PyObject *
_my_PyUnicode_FromChar32(const cffi_char32_t * w,Py_ssize_t size)15 _my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
16 {
17     return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, w, size);
18 }
19 
20 static PyObject *
_my_PyUnicode_FromChar16(const cffi_char16_t * w,Py_ssize_t size)21 _my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
22 {
23     /* are there any surrogate pairs, and if so, how many? */
24     Py_ssize_t i, count_surrogates = 0;
25     for (i = 0; i < size - 1; i++) {
26         if (0xD800 <= w[i] && w[i] <= 0xDBFF &&
27                 0xDC00 <= w[i+1] && w[i+1] <= 0xDFFF)
28             count_surrogates++;
29     }
30     if (count_surrogates == 0) {
31         /* no, fast path */
32         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, w, size);
33     }
34     else
35     {
36         PyObject *result = PyUnicode_New(size - count_surrogates, 0x10FFFF);
37         Py_UCS4 *data;
38         assert(PyUnicode_KIND(result) == PyUnicode_4BYTE_KIND);
39         data = PyUnicode_4BYTE_DATA(result);
40 
41         for (i = 0; i < size; i++)
42         {
43             cffi_char32_t ch = w[i];
44             if (0xD800 <= ch && ch <= 0xDBFF && i < size - 1) {
45                 cffi_char32_t ch2 = w[i + 1];
46                 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
47                     ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
48                     i++;
49                 }
50             }
51             *data++ = ch;
52         }
53         return result;
54     }
55 }
56 
57 static int
_my_PyUnicode_AsSingleChar16(PyObject * unicode,cffi_char16_t * result,char * err_got)58 _my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
59                              char *err_got)
60 {
61     cffi_char32_t ch;
62     if (PyUnicode_GET_LENGTH(unicode) != 1) {
63         sprintf(err_got, "unicode string of length %zd",
64                 PyUnicode_GET_LENGTH(unicode));
65         return -1;
66     }
67     ch = PyUnicode_READ_CHAR(unicode, 0);
68 
69     if (ch > 0xFFFF)
70     {
71         sprintf(err_got, "larger-than-0xFFFF character");
72         return -1;
73     }
74     *result = (cffi_char16_t)ch;
75     return 0;
76 }
77 
78 static int
_my_PyUnicode_AsSingleChar32(PyObject * unicode,cffi_char32_t * result,char * err_got)79 _my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
80                              char *err_got)
81 {
82     if (PyUnicode_GET_LENGTH(unicode) != 1) {
83         sprintf(err_got, "unicode string of length %zd",
84                 PyUnicode_GET_LENGTH(unicode));
85         return -1;
86     }
87     *result = PyUnicode_READ_CHAR(unicode, 0);
88     return 0;
89 }
90 
_my_PyUnicode_SizeAsChar16(PyObject * unicode)91 static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
92 {
93     Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
94     Py_ssize_t result = length;
95     unsigned int kind = PyUnicode_KIND(unicode);
96 
97     if (kind == PyUnicode_4BYTE_KIND)
98     {
99         Py_UCS4 *data = PyUnicode_4BYTE_DATA(unicode);
100         Py_ssize_t i;
101         for (i = 0; i < length; i++) {
102             if (data[i] > 0xFFFF)
103                 result++;
104         }
105     }
106     return result;
107 }
108 
_my_PyUnicode_SizeAsChar32(PyObject * unicode)109 static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
110 {
111     return PyUnicode_GET_LENGTH(unicode);
112 }
113 
/*
 * Copy 'unicode' into 'result' as 16-bit code units.
 *
 * Characters up to 0xFFFF are written as one unit; characters above that
 * are written as a high/low surrogate pair (two units).  No terminating
 * null unit is written.
 *
 * 'resultlen' is the capacity of 'result' in 16-bit units.  The number of
 * units required is what _my_PyUnicode_SizeAsChar16() computes.
 *
 * Returns 0 on success.  Returns -1 with a Python exception set if a
 * character is above 0x10FFFF (ValueError) or if the output would not fit
 * in 'resultlen' units (SystemError, the same exception type that
 * PyUnicode_AsUCS4() is documented to raise for a too-small buffer).
 */
static int _my_PyUnicode_AsChar16(PyObject *unicode,
                                  cffi_char16_t *result,
                                  Py_ssize_t resultlen)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    unsigned int kind = PyUnicode_KIND(unicode);
    void *data = PyUnicode_DATA(unicode);
    Py_ssize_t room = resultlen;    /* units still free in 'result' */
    Py_ssize_t i;

    for (i = 0; i < len; i++) {
        cffi_char32_t ordinal = PyUnicode_READ(kind, data, i);
        Py_ssize_t needed = (ordinal > 0xFFFF) ? 2 : 1;

        if (ordinal > 0x10FFFF) {
            PyErr_Format(PyExc_ValueError,
                         "unicode character out of range for "
                         "conversion to char16_t: 0x%x", (int)ordinal);
            return -1;
        }
        if (needed > room) {
            /* never write past the end of the caller's buffer */
            PyErr_SetString(PyExc_SystemError,
                            "string is longer than the buffer");
            return -1;
        }
        room -= needed;

        if (needed == 2) {
            /* encode as a surrogate pair */
            ordinal -= 0x10000;
            *result++ = 0xD800 | (ordinal >> 10);
            *result++ = 0xDC00 | (ordinal & 0x3FF);
        }
        else
            *result++ = ordinal;
    }
    return 0;
}
141 
/*
 * Copy 'unicode' into 'result' as 32-bit code units by delegating to
 * PyUnicode_AsUCS4() with a buffer capacity of 'resultlen' and 0 as the
 * final (copy_null) argument.
 *
 * Returns 0 on success, or -1 if PyUnicode_AsUCS4() returned NULL.
 */
static int _my_PyUnicode_AsChar32(PyObject *unicode,
                                  cffi_char32_t *result,
                                  Py_ssize_t resultlen)
{
    Py_UCS4 *copied = PyUnicode_AsUCS4(unicode, (Py_UCS4 *)result,
                                       resultlen, 0);
    return (copied != NULL) ? 0 : -1;
}
150