/*
 * wchar_t helpers, version for CPython >= 3.3.
 *
 * CPython 3.3 added support for sys.maxunicode == 0x10FFFF on all
 * platforms, even ones with wchar_t limited to 2 bytes.  As such,
 * the code here works from the outside like wchar_helper.h in the
 * case Py_UNICODE_SIZE == 4, but the implementation is very different.
 */
9
/* Fixed-width code-unit types used by the helpers below: one 16-bit
   unit (half of a surrogate pair, or a whole character <= 0xFFFF)
   and one 32-bit unit (a whole code point). */
typedef uint16_t cffi_char16_t;
typedef uint32_t cffi_char32_t;
12
13
14 static PyObject *
_my_PyUnicode_FromChar32(const cffi_char32_t * w,Py_ssize_t size)15 _my_PyUnicode_FromChar32(const cffi_char32_t *w, Py_ssize_t size)
16 {
17 return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, w, size);
18 }
19
20 static PyObject *
_my_PyUnicode_FromChar16(const cffi_char16_t * w,Py_ssize_t size)21 _my_PyUnicode_FromChar16(const cffi_char16_t *w, Py_ssize_t size)
22 {
23 /* are there any surrogate pairs, and if so, how many? */
24 Py_ssize_t i, count_surrogates = 0;
25 for (i = 0; i < size - 1; i++) {
26 if (0xD800 <= w[i] && w[i] <= 0xDBFF &&
27 0xDC00 <= w[i+1] && w[i+1] <= 0xDFFF)
28 count_surrogates++;
29 }
30 if (count_surrogates == 0) {
31 /* no, fast path */
32 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, w, size);
33 }
34 else
35 {
36 PyObject *result = PyUnicode_New(size - count_surrogates, 0x10FFFF);
37 Py_UCS4 *data;
38 assert(PyUnicode_KIND(result) == PyUnicode_4BYTE_KIND);
39 data = PyUnicode_4BYTE_DATA(result);
40
41 for (i = 0; i < size; i++)
42 {
43 cffi_char32_t ch = w[i];
44 if (0xD800 <= ch && ch <= 0xDBFF && i < size - 1) {
45 cffi_char32_t ch2 = w[i + 1];
46 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
47 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
48 i++;
49 }
50 }
51 *data++ = ch;
52 }
53 return result;
54 }
55 }
56
57 static int
_my_PyUnicode_AsSingleChar16(PyObject * unicode,cffi_char16_t * result,char * err_got)58 _my_PyUnicode_AsSingleChar16(PyObject *unicode, cffi_char16_t *result,
59 char *err_got)
60 {
61 cffi_char32_t ch;
62 if (PyUnicode_GET_LENGTH(unicode) != 1) {
63 sprintf(err_got, "unicode string of length %zd",
64 PyUnicode_GET_LENGTH(unicode));
65 return -1;
66 }
67 ch = PyUnicode_READ_CHAR(unicode, 0);
68
69 if (ch > 0xFFFF)
70 {
71 sprintf(err_got, "larger-than-0xFFFF character");
72 return -1;
73 }
74 *result = (cffi_char16_t)ch;
75 return 0;
76 }
77
78 static int
_my_PyUnicode_AsSingleChar32(PyObject * unicode,cffi_char32_t * result,char * err_got)79 _my_PyUnicode_AsSingleChar32(PyObject *unicode, cffi_char32_t *result,
80 char *err_got)
81 {
82 if (PyUnicode_GET_LENGTH(unicode) != 1) {
83 sprintf(err_got, "unicode string of length %zd",
84 PyUnicode_GET_LENGTH(unicode));
85 return -1;
86 }
87 *result = PyUnicode_READ_CHAR(unicode, 0);
88 return 0;
89 }
90
_my_PyUnicode_SizeAsChar16(PyObject * unicode)91 static Py_ssize_t _my_PyUnicode_SizeAsChar16(PyObject *unicode)
92 {
93 Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
94 Py_ssize_t result = length;
95 unsigned int kind = PyUnicode_KIND(unicode);
96
97 if (kind == PyUnicode_4BYTE_KIND)
98 {
99 Py_UCS4 *data = PyUnicode_4BYTE_DATA(unicode);
100 Py_ssize_t i;
101 for (i = 0; i < length; i++) {
102 if (data[i] > 0xFFFF)
103 result++;
104 }
105 }
106 return result;
107 }
108
_my_PyUnicode_SizeAsChar32(PyObject * unicode)109 static Py_ssize_t _my_PyUnicode_SizeAsChar32(PyObject *unicode)
110 {
111 return PyUnicode_GET_LENGTH(unicode);
112 }
113
/* Copy 'unicode' into the buffer 'result' as 16-bit units, writing a
   surrogate pair for every character above 0xFFFF.  No terminating
   null unit is written.

   'resultlen' is the capacity of 'result' in 16-bit units.  Returns 0
   on success.  Returns -1 with a Python exception set if a character
   is above 0x10FFFF (ValueError) or if the buffer is too small for the
   converted string (SystemError, the same error PyUnicode_AsUCS4()
   reports for a too-small buffer).  On failure, a prefix of the string
   may already have been written to 'result'. */
static int _my_PyUnicode_AsChar16(PyObject *unicode,
                                  cffi_char16_t *result,
                                  Py_ssize_t resultlen)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    unsigned int kind = PyUnicode_KIND(unicode);
    void *data = PyUnicode_DATA(unicode);
    Py_ssize_t room = resultlen;    /* units still free in 'result' */
    Py_ssize_t i;

    for (i = 0; i < len; i++) {
        cffi_char32_t ordinal = PyUnicode_READ(kind, data, i);
        Py_ssize_t needed = (ordinal > 0xFFFF) ? 2 : 1;

        if (ordinal > 0x10FFFF) {
            PyErr_Format(PyExc_ValueError,
                         "unicode character out of range for "
                         "conversion to char16_t: 0x%x", (int)ordinal);
            return -1;
        }
        if (room < needed) {
            /* never write past the end of the caller's buffer */
            PyErr_SetString(PyExc_SystemError,
                            "string is longer than the buffer");
            return -1;
        }
        room -= needed;

        if (needed == 2) {
            /* split into high and low surrogate */
            ordinal -= 0x10000;
            *result++ = 0xD800 | (ordinal >> 10);
            *result++ = 0xDC00 | (ordinal & 0x3FF);
        }
        else
            *result++ = ordinal;
    }
    return 0;
}
141
/* Copy 'unicode' into the buffer 'result' as 32-bit units by calling
   PyUnicode_AsUCS4() with 'resultlen' as the buffer length and without
   asking for a terminating null.  Returns 0 on success, or -1 if
   PyUnicode_AsUCS4() returned NULL. */
static int _my_PyUnicode_AsChar32(PyObject *unicode,
                                  cffi_char32_t *result,
                                  Py_ssize_t resultlen)
{
    Py_UCS4 *copied;

    copied = PyUnicode_AsUCS4(unicode, (Py_UCS4 *)result, resultlen, 0);
    return (copied != NULL) ? 0 : -1;
}
150