• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* stringlib: split implementation */
2 
3 #ifndef STRINGLIB_SPLIT_H
4 #define STRINGLIB_SPLIT_H
5 
6 #ifndef STRINGLIB_FASTSEARCH_H
7 #error must include "stringlib/fastsearch.h" before including this module
8 #endif
9 
10 /* Overallocate the initial list to reduce the number of reallocs for small
11    split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
12    resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
13    text (roughly 11 words per line) and field delimited data (usually 1-10
14    fields).  For large strings the split algorithms are bandwidth limited
15    so increasing the preallocation likely will not improve things.*/
16 
17 #define MAX_PREALLOC 12
18 
19 /* 5 splits gives 6 elements */
20 #define PREALLOC_SIZE(maxsplit) \
21     (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
22 
23 #define SPLIT_APPEND(data, left, right)         \
24     sub = STRINGLIB_NEW((data) + (left),        \
25                         (right) - (left));      \
26     if (sub == NULL)                            \
27         goto onError;                           \
28     if (PyList_Append(list, sub)) {             \
29         Py_DECREF(sub);                         \
30         goto onError;                           \
31     }                                           \
32     else                                        \
33         Py_DECREF(sub);
34 
35 #define SPLIT_ADD(data, left, right) {          \
36     sub = STRINGLIB_NEW((data) + (left),        \
37                         (right) - (left));      \
38     if (sub == NULL)                            \
39         goto onError;                           \
40     if (count < MAX_PREALLOC) {                 \
41         PyList_SET_ITEM(list, count, sub);      \
42     } else {                                    \
43         if (PyList_Append(list, sub)) {         \
44             Py_DECREF(sub);                     \
45             goto onError;                       \
46         }                                       \
47         else                                    \
48             Py_DECREF(sub);                     \
49     }                                           \
50     count++; }
51 
52 
53 /* Always force the list to the expected size. */
54 #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
55 
56 Py_LOCAL_INLINE(PyObject *)
stringlib_split_whitespace(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,Py_ssize_t maxcount)57 stringlib_split_whitespace(PyObject* str_obj,
58                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
59                            Py_ssize_t maxcount)
60 {
61     Py_ssize_t i, j, count=0;
62     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
63     PyObject *sub;
64 
65     if (list == NULL)
66         return NULL;
67 
68     i = j = 0;
69     while (maxcount-- > 0) {
70         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
71             i++;
72         if (i == str_len) break;
73         j = i; i++;
74         while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
75             i++;
76 #ifndef STRINGLIB_MUTABLE
77         if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
78             /* No whitespace in str_obj, so just use it as list[0] */
79             Py_INCREF(str_obj);
80             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
81             count++;
82             break;
83         }
84 #endif
85         SPLIT_ADD(str, j, i);
86     }
87 
88     if (i < str_len) {
89         /* Only occurs when maxcount was reached */
90         /* Skip any remaining whitespace and copy to end of string */
91         while (i < str_len && STRINGLIB_ISSPACE(str[i]))
92             i++;
93         if (i != str_len)
94             SPLIT_ADD(str, i, str_len);
95     }
96     FIX_PREALLOC_SIZE(list);
97     return list;
98 
99   onError:
100     Py_DECREF(list);
101     return NULL;
102 }
103 
104 Py_LOCAL_INLINE(PyObject *)
stringlib_split_char(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,const STRINGLIB_CHAR ch,Py_ssize_t maxcount)105 stringlib_split_char(PyObject* str_obj,
106                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
107                      const STRINGLIB_CHAR ch,
108                      Py_ssize_t maxcount)
109 {
110     Py_ssize_t i, j, count=0;
111     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
112     PyObject *sub;
113 
114     if (list == NULL)
115         return NULL;
116 
117     i = j = 0;
118     while ((j < str_len) && (maxcount-- > 0)) {
119         for(; j < str_len; j++) {
120             /* I found that using memchr makes no difference */
121             if (str[j] == ch) {
122                 SPLIT_ADD(str, i, j);
123                 i = j = j + 1;
124                 break;
125             }
126         }
127     }
128 #ifndef STRINGLIB_MUTABLE
129     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
130         /* ch not in str_obj, so just use str_obj as list[0] */
131         Py_INCREF(str_obj);
132         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
133         count++;
134     } else
135 #endif
136     if (i <= str_len) {
137         SPLIT_ADD(str, i, str_len);
138     }
139     FIX_PREALLOC_SIZE(list);
140     return list;
141 
142   onError:
143     Py_DECREF(list);
144     return NULL;
145 }
146 
147 Py_LOCAL_INLINE(PyObject *)
stringlib_split(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,const STRINGLIB_CHAR * sep,Py_ssize_t sep_len,Py_ssize_t maxcount)148 stringlib_split(PyObject* str_obj,
149                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
150                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
151                 Py_ssize_t maxcount)
152 {
153     Py_ssize_t i, j, pos, count=0;
154     PyObject *list, *sub;
155 
156     if (sep_len == 0) {
157         PyErr_SetString(PyExc_ValueError, "empty separator");
158         return NULL;
159     }
160     else if (sep_len == 1)
161         return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);
162 
163     list = PyList_New(PREALLOC_SIZE(maxcount));
164     if (list == NULL)
165         return NULL;
166 
167     i = j = 0;
168     while (maxcount-- > 0) {
169         pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
170         if (pos < 0)
171             break;
172         j = i + pos;
173         SPLIT_ADD(str, i, j);
174         i = j + sep_len;
175     }
176 #ifndef STRINGLIB_MUTABLE
177     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
178         /* No match in str_obj, so just use it as list[0] */
179         Py_INCREF(str_obj);
180         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
181         count++;
182     } else
183 #endif
184     {
185         SPLIT_ADD(str, i, str_len);
186     }
187     FIX_PREALLOC_SIZE(list);
188     return list;
189 
190   onError:
191     Py_DECREF(list);
192     return NULL;
193 }
194 
195 Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_whitespace(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,Py_ssize_t maxcount)196 stringlib_rsplit_whitespace(PyObject* str_obj,
197                             const STRINGLIB_CHAR* str, Py_ssize_t str_len,
198                             Py_ssize_t maxcount)
199 {
200     Py_ssize_t i, j, count=0;
201     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
202     PyObject *sub;
203 
204     if (list == NULL)
205         return NULL;
206 
207     i = j = str_len - 1;
208     while (maxcount-- > 0) {
209         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
210             i--;
211         if (i < 0) break;
212         j = i; i--;
213         while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
214             i--;
215 #ifndef STRINGLIB_MUTABLE
216         if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
217             /* No whitespace in str_obj, so just use it as list[0] */
218             Py_INCREF(str_obj);
219             PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
220             count++;
221             break;
222         }
223 #endif
224         SPLIT_ADD(str, i + 1, j + 1);
225     }
226 
227     if (i >= 0) {
228         /* Only occurs when maxcount was reached */
229         /* Skip any remaining whitespace and copy to beginning of string */
230         while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
231             i--;
232         if (i >= 0)
233             SPLIT_ADD(str, 0, i + 1);
234     }
235     FIX_PREALLOC_SIZE(list);
236     if (PyList_Reverse(list) < 0)
237         goto onError;
238     return list;
239 
240   onError:
241     Py_DECREF(list);
242     return NULL;
243 }
244 
245 Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_char(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,const STRINGLIB_CHAR ch,Py_ssize_t maxcount)246 stringlib_rsplit_char(PyObject* str_obj,
247                       const STRINGLIB_CHAR* str, Py_ssize_t str_len,
248                       const STRINGLIB_CHAR ch,
249                       Py_ssize_t maxcount)
250 {
251     Py_ssize_t i, j, count=0;
252     PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
253     PyObject *sub;
254 
255     if (list == NULL)
256         return NULL;
257 
258     i = j = str_len - 1;
259     while ((i >= 0) && (maxcount-- > 0)) {
260         for(; i >= 0; i--) {
261             if (str[i] == ch) {
262                 SPLIT_ADD(str, i + 1, j + 1);
263                 j = i = i - 1;
264                 break;
265             }
266         }
267     }
268 #ifndef STRINGLIB_MUTABLE
269     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
270         /* ch not in str_obj, so just use str_obj as list[0] */
271         Py_INCREF(str_obj);
272         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
273         count++;
274     } else
275 #endif
276     if (j >= -1) {
277         SPLIT_ADD(str, 0, j + 1);
278     }
279     FIX_PREALLOC_SIZE(list);
280     if (PyList_Reverse(list) < 0)
281         goto onError;
282     return list;
283 
284   onError:
285     Py_DECREF(list);
286     return NULL;
287 }
288 
289 Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,const STRINGLIB_CHAR * sep,Py_ssize_t sep_len,Py_ssize_t maxcount)290 stringlib_rsplit(PyObject* str_obj,
291                  const STRINGLIB_CHAR* str, Py_ssize_t str_len,
292                  const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
293                  Py_ssize_t maxcount)
294 {
295     Py_ssize_t j, pos, count=0;
296     PyObject *list, *sub;
297 
298     if (sep_len == 0) {
299         PyErr_SetString(PyExc_ValueError, "empty separator");
300         return NULL;
301     }
302     else if (sep_len == 1)
303         return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
304 
305     list = PyList_New(PREALLOC_SIZE(maxcount));
306     if (list == NULL)
307         return NULL;
308 
309     j = str_len;
310     while (maxcount-- > 0) {
311         pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH);
312         if (pos < 0)
313             break;
314         SPLIT_ADD(str, pos + sep_len, j);
315         j = pos;
316     }
317 #ifndef STRINGLIB_MUTABLE
318     if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
319         /* No match in str_obj, so just use it as list[0] */
320         Py_INCREF(str_obj);
321         PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
322         count++;
323     } else
324 #endif
325     {
326         SPLIT_ADD(str, 0, j);
327     }
328     FIX_PREALLOC_SIZE(list);
329     if (PyList_Reverse(list) < 0)
330         goto onError;
331     return list;
332 
333   onError:
334     Py_DECREF(list);
335     return NULL;
336 }
337 
338 Py_LOCAL_INLINE(PyObject *)
stringlib_splitlines(PyObject * str_obj,const STRINGLIB_CHAR * str,Py_ssize_t str_len,int keepends)339 stringlib_splitlines(PyObject* str_obj,
340                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
341                      int keepends)
342 {
343     /* This does not use the preallocated list because splitlines is
344        usually run with hundreds of newlines.  The overhead of
345        switching between PyList_SET_ITEM and append causes about a
346        2-3% slowdown for that common case.  A smarter implementation
347        could move the if check out, so the SET_ITEMs are done first
348        and the appends only done when the prealloc buffer is full.
349        That's too much work for little gain.*/
350 
351     register Py_ssize_t i;
352     register Py_ssize_t j;
353     PyObject *list = PyList_New(0);
354     PyObject *sub;
355 
356     if (list == NULL)
357         return NULL;
358 
359     for (i = j = 0; i < str_len; ) {
360         Py_ssize_t eol;
361 
362         /* Find a line and append it */
363         while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
364             i++;
365 
366         /* Skip the line break reading CRLF as one line break */
367         eol = i;
368         if (i < str_len) {
369             if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
370                 i += 2;
371             else
372                 i++;
373             if (keepends)
374                 eol = i;
375         }
376 #ifndef STRINGLIB_MUTABLE
377         if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
378             /* No linebreak in str_obj, so just use it as list[0] */
379             if (PyList_Append(list, str_obj))
380                 goto onError;
381             break;
382         }
383 #endif
384         SPLIT_APPEND(str, j, eol);
385         j = i;
386     }
387     return list;
388 
389   onError:
390     Py_DECREF(list);
391     return NULL;
392 }
393 
394 #endif
395