1 /* stringlib: split implementation */
2
3 #ifndef STRINGLIB_FASTSEARCH_H
4 #error must include "stringlib/fastsearch.h" before including this module
5 #endif
6
7 /* Overallocate the initial list to reduce the number of reallocs for small
8 split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
9 resizes, to sizes 4, 8, then 16. Most observed string splits are for human
10 text (roughly 11 words per line) and field delimited data (usually 1-10
11 fields). For large strings the split algorithms are bandwidth limited
12 so increasing the preallocation likely will not improve things.*/
13
14 #define MAX_PREALLOC 12
15
16 /* 5 splits gives 6 elements */
17 #define PREALLOC_SIZE(maxsplit) \
18 (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
19
20 #define SPLIT_APPEND(data, left, right) \
21 sub = STRINGLIB_NEW((data) + (left), \
22 (right) - (left)); \
23 if (sub == NULL) \
24 goto onError; \
25 if (PyList_Append(list, sub)) { \
26 Py_DECREF(sub); \
27 goto onError; \
28 } \
29 else \
30 Py_DECREF(sub);
31
32 #define SPLIT_ADD(data, left, right) { \
33 sub = STRINGLIB_NEW((data) + (left), \
34 (right) - (left)); \
35 if (sub == NULL) \
36 goto onError; \
37 if (count < MAX_PREALLOC) { \
38 PyList_SET_ITEM(list, count, sub); \
39 } else { \
40 if (PyList_Append(list, sub)) { \
41 Py_DECREF(sub); \
42 goto onError; \
43 } \
44 else \
45 Py_DECREF(sub); \
46 } \
47 count++; }
48
49
50 /* Always force the list to the expected size. */
51 #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
52
53 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(split_whitespace)54 STRINGLIB(split_whitespace)(PyObject* str_obj,
55 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
56 Py_ssize_t maxcount)
57 {
58 Py_ssize_t i, j, count=0;
59 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
60 PyObject *sub;
61
62 if (list == NULL)
63 return NULL;
64
65 i = j = 0;
66 while (maxcount-- > 0) {
67 while (i < str_len && STRINGLIB_ISSPACE(str[i]))
68 i++;
69 if (i == str_len) break;
70 j = i; i++;
71 while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
72 i++;
73 #ifndef STRINGLIB_MUTABLE
74 if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
75 /* No whitespace in str_obj, so just use it as list[0] */
76 Py_INCREF(str_obj);
77 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
78 count++;
79 break;
80 }
81 #endif
82 SPLIT_ADD(str, j, i);
83 }
84
85 if (i < str_len) {
86 /* Only occurs when maxcount was reached */
87 /* Skip any remaining whitespace and copy to end of string */
88 while (i < str_len && STRINGLIB_ISSPACE(str[i]))
89 i++;
90 if (i != str_len)
91 SPLIT_ADD(str, i, str_len);
92 }
93 FIX_PREALLOC_SIZE(list);
94 return list;
95
96 onError:
97 Py_DECREF(list);
98 return NULL;
99 }
100
101 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(split_char)102 STRINGLIB(split_char)(PyObject* str_obj,
103 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
104 const STRINGLIB_CHAR ch,
105 Py_ssize_t maxcount)
106 {
107 Py_ssize_t i, j, count=0;
108 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
109 PyObject *sub;
110
111 if (list == NULL)
112 return NULL;
113
114 i = j = 0;
115 while ((j < str_len) && (maxcount-- > 0)) {
116 for(; j < str_len; j++) {
117 /* I found that using memchr makes no difference */
118 if (str[j] == ch) {
119 SPLIT_ADD(str, i, j);
120 i = j = j + 1;
121 break;
122 }
123 }
124 }
125 #ifndef STRINGLIB_MUTABLE
126 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
127 /* ch not in str_obj, so just use str_obj as list[0] */
128 Py_INCREF(str_obj);
129 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
130 count++;
131 } else
132 #endif
133 if (i <= str_len) {
134 SPLIT_ADD(str, i, str_len);
135 }
136 FIX_PREALLOC_SIZE(list);
137 return list;
138
139 onError:
140 Py_DECREF(list);
141 return NULL;
142 }
143
144 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(split)145 STRINGLIB(split)(PyObject* str_obj,
146 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
147 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
148 Py_ssize_t maxcount)
149 {
150 Py_ssize_t i, j, pos, count=0;
151 PyObject *list, *sub;
152
153 if (sep_len == 0) {
154 PyErr_SetString(PyExc_ValueError, "empty separator");
155 return NULL;
156 }
157 else if (sep_len == 1)
158 return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount);
159
160 list = PyList_New(PREALLOC_SIZE(maxcount));
161 if (list == NULL)
162 return NULL;
163
164 i = j = 0;
165 while (maxcount-- > 0) {
166 pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
167 if (pos < 0)
168 break;
169 j = i + pos;
170 SPLIT_ADD(str, i, j);
171 i = j + sep_len;
172 }
173 #ifndef STRINGLIB_MUTABLE
174 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
175 /* No match in str_obj, so just use it as list[0] */
176 Py_INCREF(str_obj);
177 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
178 count++;
179 } else
180 #endif
181 {
182 SPLIT_ADD(str, i, str_len);
183 }
184 FIX_PREALLOC_SIZE(list);
185 return list;
186
187 onError:
188 Py_DECREF(list);
189 return NULL;
190 }
191
192 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(rsplit_whitespace)193 STRINGLIB(rsplit_whitespace)(PyObject* str_obj,
194 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
195 Py_ssize_t maxcount)
196 {
197 Py_ssize_t i, j, count=0;
198 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
199 PyObject *sub;
200
201 if (list == NULL)
202 return NULL;
203
204 i = j = str_len - 1;
205 while (maxcount-- > 0) {
206 while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
207 i--;
208 if (i < 0) break;
209 j = i; i--;
210 while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
211 i--;
212 #ifndef STRINGLIB_MUTABLE
213 if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
214 /* No whitespace in str_obj, so just use it as list[0] */
215 Py_INCREF(str_obj);
216 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
217 count++;
218 break;
219 }
220 #endif
221 SPLIT_ADD(str, i + 1, j + 1);
222 }
223
224 if (i >= 0) {
225 /* Only occurs when maxcount was reached */
226 /* Skip any remaining whitespace and copy to beginning of string */
227 while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
228 i--;
229 if (i >= 0)
230 SPLIT_ADD(str, 0, i + 1);
231 }
232 FIX_PREALLOC_SIZE(list);
233 if (PyList_Reverse(list) < 0)
234 goto onError;
235 return list;
236
237 onError:
238 Py_DECREF(list);
239 return NULL;
240 }
241
242 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(rsplit_char)243 STRINGLIB(rsplit_char)(PyObject* str_obj,
244 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
245 const STRINGLIB_CHAR ch,
246 Py_ssize_t maxcount)
247 {
248 Py_ssize_t i, j, count=0;
249 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
250 PyObject *sub;
251
252 if (list == NULL)
253 return NULL;
254
255 i = j = str_len - 1;
256 while ((i >= 0) && (maxcount-- > 0)) {
257 for(; i >= 0; i--) {
258 if (str[i] == ch) {
259 SPLIT_ADD(str, i + 1, j + 1);
260 j = i = i - 1;
261 break;
262 }
263 }
264 }
265 #ifndef STRINGLIB_MUTABLE
266 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
267 /* ch not in str_obj, so just use str_obj as list[0] */
268 Py_INCREF(str_obj);
269 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
270 count++;
271 } else
272 #endif
273 if (j >= -1) {
274 SPLIT_ADD(str, 0, j + 1);
275 }
276 FIX_PREALLOC_SIZE(list);
277 if (PyList_Reverse(list) < 0)
278 goto onError;
279 return list;
280
281 onError:
282 Py_DECREF(list);
283 return NULL;
284 }
285
286 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(rsplit)287 STRINGLIB(rsplit)(PyObject* str_obj,
288 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
289 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
290 Py_ssize_t maxcount)
291 {
292 Py_ssize_t j, pos, count=0;
293 PyObject *list, *sub;
294
295 if (sep_len == 0) {
296 PyErr_SetString(PyExc_ValueError, "empty separator");
297 return NULL;
298 }
299 else if (sep_len == 1)
300 return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount);
301
302 list = PyList_New(PREALLOC_SIZE(maxcount));
303 if (list == NULL)
304 return NULL;
305
306 j = str_len;
307 while (maxcount-- > 0) {
308 pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH);
309 if (pos < 0)
310 break;
311 SPLIT_ADD(str, pos + sep_len, j);
312 j = pos;
313 }
314 #ifndef STRINGLIB_MUTABLE
315 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
316 /* No match in str_obj, so just use it as list[0] */
317 Py_INCREF(str_obj);
318 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
319 count++;
320 } else
321 #endif
322 {
323 SPLIT_ADD(str, 0, j);
324 }
325 FIX_PREALLOC_SIZE(list);
326 if (PyList_Reverse(list) < 0)
327 goto onError;
328 return list;
329
330 onError:
331 Py_DECREF(list);
332 return NULL;
333 }
334
335 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(splitlines)336 STRINGLIB(splitlines)(PyObject* str_obj,
337 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
338 int keepends)
339 {
340 /* This does not use the preallocated list because splitlines is
341 usually run with hundreds of newlines. The overhead of
342 switching between PyList_SET_ITEM and append causes about a
343 2-3% slowdown for that common case. A smarter implementation
344 could move the if check out, so the SET_ITEMs are done first
345 and the appends only done when the prealloc buffer is full.
346 That's too much work for little gain.*/
347
348 Py_ssize_t i;
349 Py_ssize_t j;
350 PyObject *list = PyList_New(0);
351 PyObject *sub;
352
353 if (list == NULL)
354 return NULL;
355
356 for (i = j = 0; i < str_len; ) {
357 Py_ssize_t eol;
358
359 /* Find a line and append it */
360 while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
361 i++;
362
363 /* Skip the line break reading CRLF as one line break */
364 eol = i;
365 if (i < str_len) {
366 if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
367 i += 2;
368 else
369 i++;
370 if (keepends)
371 eol = i;
372 }
373 #ifndef STRINGLIB_MUTABLE
374 if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
375 /* No linebreak in str_obj, so just use it as list[0] */
376 if (PyList_Append(list, str_obj))
377 goto onError;
378 break;
379 }
380 #endif
381 SPLIT_APPEND(str, j, eol);
382 j = i;
383 }
384 return list;
385
386 onError:
387 Py_DECREF(list);
388 return NULL;
389 }
390
391