1 /* stringlib: split implementation */
2
3 #ifndef STRINGLIB_SPLIT_H
4 #define STRINGLIB_SPLIT_H
5
6 #ifndef STRINGLIB_FASTSEARCH_H
7 #error must include "stringlib/fastsearch.h" before including this module
8 #endif
9
10 /* Overallocate the initial list to reduce the number of reallocs for small
11 split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
12 resizes, to sizes 4, 8, then 16. Most observed string splits are for human
13 text (roughly 11 words per line) and field delimited data (usually 1-10
14 fields). For large strings the split algorithms are bandwidth limited
15 so increasing the preallocation likely will not improve things.*/
16
17 #define MAX_PREALLOC 12
18
19 /* 5 splits gives 6 elements */
20 #define PREALLOC_SIZE(maxsplit) \
21 (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
22
23 #define SPLIT_APPEND(data, left, right) \
24 sub = STRINGLIB_NEW((data) + (left), \
25 (right) - (left)); \
26 if (sub == NULL) \
27 goto onError; \
28 if (PyList_Append(list, sub)) { \
29 Py_DECREF(sub); \
30 goto onError; \
31 } \
32 else \
33 Py_DECREF(sub);
34
35 #define SPLIT_ADD(data, left, right) { \
36 sub = STRINGLIB_NEW((data) + (left), \
37 (right) - (left)); \
38 if (sub == NULL) \
39 goto onError; \
40 if (count < MAX_PREALLOC) { \
41 PyList_SET_ITEM(list, count, sub); \
42 } else { \
43 if (PyList_Append(list, sub)) { \
44 Py_DECREF(sub); \
45 goto onError; \
46 } \
47 else \
48 Py_DECREF(sub); \
49 } \
50 count++; }
51
52
53 /* Always force the list to the expected size. */
54 #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
55
56 Py_LOCAL_INLINE(PyObject *)
stringlib_split_whitespace(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,Py_ssize_t maxcount)57 stringlib_split_whitespace(PyObject* str_obj,
58 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
59 Py_ssize_t maxcount)
60 {
61 Py_ssize_t i, j, count=0;
62 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
63 PyObject *sub;
64
65 if (list == NULL)
66 return NULL;
67
68 i = j = 0;
69 while (maxcount-- > 0) {
70 while (i < str_len && STRINGLIB_ISSPACE(str[i]))
71 i++;
72 if (i == str_len) break;
73 j = i; i++;
74 while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
75 i++;
76 #ifndef STRINGLIB_MUTABLE
77 if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
78 /* No whitespace in str_obj, so just use it as list[0] */
79 Py_INCREF(str_obj);
80 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
81 count++;
82 break;
83 }
84 #endif
85 SPLIT_ADD(str, j, i);
86 }
87
88 if (i < str_len) {
89 /* Only occurs when maxcount was reached */
90 /* Skip any remaining whitespace and copy to end of string */
91 while (i < str_len && STRINGLIB_ISSPACE(str[i]))
92 i++;
93 if (i != str_len)
94 SPLIT_ADD(str, i, str_len);
95 }
96 FIX_PREALLOC_SIZE(list);
97 return list;
98
99 onError:
100 Py_DECREF(list);
101 return NULL;
102 }
103
104 Py_LOCAL_INLINE(PyObject *)
stringlib_split_char(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,const STRINGLIB_CHAR ch,Py_ssize_t maxcount)105 stringlib_split_char(PyObject* str_obj,
106 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
107 const STRINGLIB_CHAR ch,
108 Py_ssize_t maxcount)
109 {
110 Py_ssize_t i, j, count=0;
111 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
112 PyObject *sub;
113
114 if (list == NULL)
115 return NULL;
116
117 i = j = 0;
118 while ((j < str_len) && (maxcount-- > 0)) {
119 for(; j < str_len; j++) {
120 /* I found that using memchr makes no difference */
121 if (str[j] == ch) {
122 SPLIT_ADD(str, i, j);
123 i = j = j + 1;
124 break;
125 }
126 }
127 }
128 #ifndef STRINGLIB_MUTABLE
129 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
130 /* ch not in str_obj, so just use str_obj as list[0] */
131 Py_INCREF(str_obj);
132 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
133 count++;
134 } else
135 #endif
136 if (i <= str_len) {
137 SPLIT_ADD(str, i, str_len);
138 }
139 FIX_PREALLOC_SIZE(list);
140 return list;
141
142 onError:
143 Py_DECREF(list);
144 return NULL;
145 }
146
147 Py_LOCAL_INLINE(PyObject *)
stringlib_split(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,const STRINGLIB_CHAR * sep,Py_ssize_t sep_len,Py_ssize_t maxcount)148 stringlib_split(PyObject* str_obj,
149 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
150 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
151 Py_ssize_t maxcount)
152 {
153 Py_ssize_t i, j, pos, count=0;
154 PyObject *list, *sub;
155
156 if (sep_len == 0) {
157 PyErr_SetString(PyExc_ValueError, "empty separator");
158 return NULL;
159 }
160 else if (sep_len == 1)
161 return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);
162
163 list = PyList_New(PREALLOC_SIZE(maxcount));
164 if (list == NULL)
165 return NULL;
166
167 i = j = 0;
168 while (maxcount-- > 0) {
169 pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
170 if (pos < 0)
171 break;
172 j = i + pos;
173 SPLIT_ADD(str, i, j);
174 i = j + sep_len;
175 }
176 #ifndef STRINGLIB_MUTABLE
177 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
178 /* No match in str_obj, so just use it as list[0] */
179 Py_INCREF(str_obj);
180 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
181 count++;
182 } else
183 #endif
184 {
185 SPLIT_ADD(str, i, str_len);
186 }
187 FIX_PREALLOC_SIZE(list);
188 return list;
189
190 onError:
191 Py_DECREF(list);
192 return NULL;
193 }
194
195 Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_whitespace(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,Py_ssize_t maxcount)196 stringlib_rsplit_whitespace(PyObject* str_obj,
197 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
198 Py_ssize_t maxcount)
199 {
200 Py_ssize_t i, j, count=0;
201 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
202 PyObject *sub;
203
204 if (list == NULL)
205 return NULL;
206
207 i = j = str_len - 1;
208 while (maxcount-- > 0) {
209 while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
210 i--;
211 if (i < 0) break;
212 j = i; i--;
213 while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
214 i--;
215 #ifndef STRINGLIB_MUTABLE
216 if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
217 /* No whitespace in str_obj, so just use it as list[0] */
218 Py_INCREF(str_obj);
219 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
220 count++;
221 break;
222 }
223 #endif
224 SPLIT_ADD(str, i + 1, j + 1);
225 }
226
227 if (i >= 0) {
228 /* Only occurs when maxcount was reached */
229 /* Skip any remaining whitespace and copy to beginning of string */
230 while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
231 i--;
232 if (i >= 0)
233 SPLIT_ADD(str, 0, i + 1);
234 }
235 FIX_PREALLOC_SIZE(list);
236 if (PyList_Reverse(list) < 0)
237 goto onError;
238 return list;
239
240 onError:
241 Py_DECREF(list);
242 return NULL;
243 }
244
245 Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_char(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,const STRINGLIB_CHAR ch,Py_ssize_t maxcount)246 stringlib_rsplit_char(PyObject* str_obj,
247 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
248 const STRINGLIB_CHAR ch,
249 Py_ssize_t maxcount)
250 {
251 Py_ssize_t i, j, count=0;
252 PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
253 PyObject *sub;
254
255 if (list == NULL)
256 return NULL;
257
258 i = j = str_len - 1;
259 while ((i >= 0) && (maxcount-- > 0)) {
260 for(; i >= 0; i--) {
261 if (str[i] == ch) {
262 SPLIT_ADD(str, i + 1, j + 1);
263 j = i = i - 1;
264 break;
265 }
266 }
267 }
268 #ifndef STRINGLIB_MUTABLE
269 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
270 /* ch not in str_obj, so just use str_obj as list[0] */
271 Py_INCREF(str_obj);
272 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
273 count++;
274 } else
275 #endif
276 if (j >= -1) {
277 SPLIT_ADD(str, 0, j + 1);
278 }
279 FIX_PREALLOC_SIZE(list);
280 if (PyList_Reverse(list) < 0)
281 goto onError;
282 return list;
283
284 onError:
285 Py_DECREF(list);
286 return NULL;
287 }
288
289 Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,const STRINGLIB_CHAR * sep,Py_ssize_t sep_len,Py_ssize_t maxcount)290 stringlib_rsplit(PyObject* str_obj,
291 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
292 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
293 Py_ssize_t maxcount)
294 {
295 Py_ssize_t j, pos, count=0;
296 PyObject *list, *sub;
297
298 if (sep_len == 0) {
299 PyErr_SetString(PyExc_ValueError, "empty separator");
300 return NULL;
301 }
302 else if (sep_len == 1)
303 return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
304
305 list = PyList_New(PREALLOC_SIZE(maxcount));
306 if (list == NULL)
307 return NULL;
308
309 j = str_len;
310 while (maxcount-- > 0) {
311 pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH);
312 if (pos < 0)
313 break;
314 SPLIT_ADD(str, pos + sep_len, j);
315 j = pos;
316 }
317 #ifndef STRINGLIB_MUTABLE
318 if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
319 /* No match in str_obj, so just use it as list[0] */
320 Py_INCREF(str_obj);
321 PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
322 count++;
323 } else
324 #endif
325 {
326 SPLIT_ADD(str, 0, j);
327 }
328 FIX_PREALLOC_SIZE(list);
329 if (PyList_Reverse(list) < 0)
330 goto onError;
331 return list;
332
333 onError:
334 Py_DECREF(list);
335 return NULL;
336 }
337
338 Py_LOCAL_INLINE(PyObject *)
stringlib_splitlines(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,int keepends)339 stringlib_splitlines(PyObject* str_obj,
340 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
341 int keepends)
342 {
343 /* This does not use the preallocated list because splitlines is
344 usually run with hundreds of newlines. The overhead of
345 switching between PyList_SET_ITEM and append causes about a
346 2-3% slowdown for that common case. A smarter implementation
347 could move the if check out, so the SET_ITEMs are done first
348 and the appends only done when the prealloc buffer is full.
349 That's too much work for little gain.*/
350
351 register Py_ssize_t i;
352 register Py_ssize_t j;
353 PyObject *list = PyList_New(0);
354 PyObject *sub;
355
356 if (list == NULL)
357 return NULL;
358
359 for (i = j = 0; i < str_len; ) {
360 Py_ssize_t eol;
361
362 /* Find a line and append it */
363 while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
364 i++;
365
366 /* Skip the line break reading CRLF as one line break */
367 eol = i;
368 if (i < str_len) {
369 if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
370 i += 2;
371 else
372 i++;
373 if (keepends)
374 eol = i;
375 }
376 #ifndef STRINGLIB_MUTABLE
377 if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
378 /* No linebreak in str_obj, so just use it as list[0] */
379 if (PyList_Append(list, str_obj))
380 goto onError;
381 break;
382 }
383 #endif
384 SPLIT_APPEND(str, j, eol);
385 j = i;
386 }
387 return list;
388
389 onError:
390 Py_DECREF(list);
391 return NULL;
392 }
393
394 #endif
395