• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import csv
2import cffi
3
4# IN-PROGRESS.  See the demo at the end of the file
5
6
def _make_ffi_from_dialect(dialect_name):
    """Generate and compile the cffi parser module for one csv dialect.

    The settings of the registered dialect are substituted as integer
    constants into a C state machine, which is handed to cffi under the
    module name ``'_fastcsv_' + dialect_name`` and built with
    ``ffi.compile()``.

    The generated module exports a single C function::

        long parse_line(char *rawline, long inputlength);

    It processes the first ``inputlength`` bytes of ``rawline`` followed by
    one terminating 0, and rewrites the buffer in place so that every field
    is followed by a NUL byte.  Because the final NUL may be stored at index
    ``inputlength``, the buffer must hold at least ``inputlength + 1`` bytes.
    The return value is the number of bytes written minus one (the length of
    the fields joined by NUL, without the last terminator), or -1 when the
    state machine reports an error.  A line that produces no field at all
    (empty, or made only of CR/LF characters) also gives -1, because nothing
    is written to the buffer.

    :param dialect_name: name of a dialect registered with the ``csv``
        module; it is also used verbatim as the suffix of the module name.
    :raises csv.Error: from ``csv.get_dialect`` when the name is not
        registered.
    """
    dialect = csv.get_dialect(dialect_name)

    ffi = cffi.FFI()

    ffi.cdef("""
        long parse_line(char *rawline, long inputlength);
    """)

    # A dialect registered with quoting=QUOTE_NONE may have quotechar=None,
    # and ord(None) would raise TypeError.  Every quotechar comparison in the
    # C template below is combined with a "quoting != QUOTE_NONE" test, so the
    # placeholder value is never actually matched against input.
    if dialect.quotechar is None:
        quotechar = 0
    else:
        quotechar = ord(dialect.quotechar)

    # Values substituted into the C template with the % operator.
    d = {'quotechar': quotechar,
         'quoting': int(dialect.quoting),
         'skipinitialspace': int(dialect.skipinitialspace),
         'delimiter': ord(dialect.delimiter),
         'doublequote': int(dialect.doublequote),
         'strict': int(dialect.strict),
         }
    # 'is_escape_char' is the right-hand part of a C condition "c <...>":
    # a real comparison when an escape character is configured, otherwise
    # "&& 0" so that the branch can never be taken.
    if dialect.escapechar is not None:
        d['is_escape_char'] = '== %d' % ord(dialect.escapechar)
    else:
        d['is_escape_char'] = '&& 0'

    # NOTE: the source below goes through %-formatting, so any literal
    # percent sign in the C code has to be written as '%%'.
    ffi.set_source('_fastcsv_' + dialect_name, r'''

    typedef enum {
        START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
        IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
        EAT_CRNL
    } ParserState;

    typedef enum {
        QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE
    } QuoteStyle;

    typedef struct {
        ParserState state;          /* current CSV parse state */
        char *field;                /* build current field in here */
        int field_size;             /* size of allocated buffer */
        int field_len;              /* length of current field */
        int numeric_field;          /* treat field as numeric */
    } ReaderObj;

    static void
    parse_add_char(ReaderObj *self, char c)
    {
        *self->field++ = c;
    }

    static void
    parse_save_field(ReaderObj *self)
    {
        *self->field++ = 0;
    }

    static int
    parse_process_char(ReaderObj *self, char c)
    {
        switch (self->state) {
        case START_RECORD:
            /* start of record */
            if (c == '\0')
                /* empty line - return [] */
                break;
            else if (c == '\n' || c == '\r') {
                self->state = EAT_CRNL;
                break;
            }
            /* normal character - handle as START_FIELD */
            self->state = START_FIELD;
            /* fallthru */
        case START_FIELD:
            /* expecting field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* save empty field - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                /* start quoted field */
                self->state = IN_QUOTED_FIELD;
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == ' ' && %(skipinitialspace)d)
                /* ignore space at start of field */
                ;
            else if (c == %(delimiter)d) {
                /* save empty field */
                parse_save_field(self);
            }
            else {
                /* begin new unquoted field */
                if (%(quoting)d == QUOTE_NONNUMERIC)
                    self->numeric_field = 1;
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            break;

        case ESCAPED_CHAR:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_FIELD;
            break;

        case IN_FIELD:
            /* in unquoted field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case IN_QUOTED_FIELD:
            /* in quoted field */
            if (c == '\0')
                ;
            else if (c %(is_escape_char)s) {
                /* Possible escape character */
                self->state = ESCAPE_IN_QUOTED_FIELD;
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                if (%(doublequote)d) {
                    /* doublequote; " represented by "" */
                    self->state = QUOTE_IN_QUOTED_FIELD;
                }
                else {
                    /* end of quote part of field */
                    self->state = IN_FIELD;
                }
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case ESCAPE_IN_QUOTED_FIELD:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_QUOTED_FIELD;
            break;

        case QUOTE_IN_QUOTED_FIELD:
            /* doublequote - seen a quote in an quoted field */
            if (%(quoting)d != QUOTE_NONE &&
                c == %(quotechar)d) {
                /* save "" as " */
                parse_add_char(self, c);
                self->state = IN_QUOTED_FIELD;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (!%(strict)d) {
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            else {
                /* illegal */
                /*PyErr_Format(error_obj, "'%%c' expected after '%%c'",
                                dialect->delimiter,
                                dialect->quotechar);*/
                return -1;
            }
            break;

        case EAT_CRNL:
            if (c == '\n' || c == '\r')
                ;
            else if (c == '\0')
                self->state = START_RECORD;
            else {
                /*PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");*/
                return -1;
            }
            break;

        }
        return 0;
    }

    static void
    parse_reset(ReaderObj *self, char *rawline)
    {
        self->field = rawline;
        self->state = START_RECORD;
        self->numeric_field = 0;
    }

    long parse_line(char *rawline, long inputlength)
    {
        char *p;
        ReaderObj reader;
        parse_reset(&reader, rawline);

        for (p=rawline; inputlength > 0; inputlength--, p++) {
            if (parse_process_char(&reader, *p) < 0)
                return -1;
        }
        if (parse_process_char(&reader, 0) < 0)
            return -1;
        return reader.field - rawline - 1;
    }
    ''' % d)

    ffi.compile()
240
241
def fastcsv_reader(f, dialect_name):
    """Yield each line of *f* as a list of fields, parsed by compiled C code.

    The extension module ``'_fastcsv_' + dialect_name`` is imported; if the
    import fails it is first generated with ``_make_ffi_from_dialect``.  Each
    line is copied into a reusable C buffer, parsed in place by the module's
    ``parse_line`` function, and the NUL-separated result is split into
    fields.

    :param f: iterable of byte-string lines, e.g. a file opened in ``'rb'``
        mode.
    :param dialect_name: name of a dialect registered with the ``csv``
        module.
    :returns: a generator of lists of byte strings, one list per line; a
        blank line gives an empty list.
    :raises csv.Error: if a line cannot be parsed.
    """
    module_name = '_fastcsv_' + dialect_name
    try:
        module = __import__(module_name)
    except ImportError:
        # No compiled module for this dialect yet: build it, then retry.
        _make_ffi_from_dialect(dialect_name)
        module = __import__(module_name)
    ffi, lib = module.ffi, module.lib
    #
    linelen = -1
    for line in f:
        if not line.strip(b'\r\n'):
            # A line holding nothing but CR/LF produces no field, and
            # parse_line reports that as -1 just like a parse error.  Handle
            # it here and return an empty row instead.
            yield []
            continue
        if linelen <= len(line):
            # Grow the buffer geometrically; it stays strictly larger than
            # the line, leaving room for the NUL that parse_line may store
            # just past the input.
            linelen = 2 * len(line)
            rawline = ffi.new("char[]", linelen)
        ffi.buffer(rawline, len(line))[:] = line
        n = lib.parse_line(rawline, len(line))
        if n < 0:
            # Explicit check instead of an assert, which "python -O" removes.
            raise csv.Error("fastcsv: could not parse line %r" % (line,))
        # b'\x00' is the same as '\x00' on Python 2 and also matches the
        # bytes returned by ffi.buffer() on Python 3.
        yield ffi.buffer(rawline, n)[:].split(b'\x00')
259
260
if __name__ == '__main__':
    # Demo: parse /etc/passwd as colon-separated records without quoting and
    # print every row.
    csv.register_dialect('unixpwd', delimiter=':', quoting=csv.QUOTE_NONE)
    with open('/etc/passwd', 'rb') as f:
        reader = fastcsv_reader(f, 'unixpwd')
        for row in reader:
            # Function-call form: same output on Python 2 for a single
            # argument, and not a SyntaxError on Python 3.
            print(row)
267