import csv
import cffi

# IN-PROGRESS.  See the demo at the end of the file.

# Value returned by the generated C function parse_line() for a record that
# contains no field at all (a blank line).  -1 is kept for parse errors; any
# value >= 0 is the number of useful bytes left in the buffer.
_EMPTY_RECORD = -2


def _make_ffi_from_dialect(dialect_name):
    """Generate and compile the cffi module for a registered csv dialect.

    The dialect settings are substituted into a C template as integer
    constants, and the result is compiled as an out-of-line cffi module
    named ``'_fastcsv_' + dialect_name``.

    The module exposes ``long parse_line(char *rawline, long inputlength)``.
    It rewrites ``rawline`` in place into fields, each followed by a NUL
    byte, and returns:

    * ``n >= 0``: the first ``n`` bytes of ``rawline`` hold the fields joined
      by NUL bytes (the NUL closing the last field is not counted);
    * ``-1``: the line could not be parsed;
    * ``_EMPTY_RECORD``: the line is blank and holds no field at all.

    The parser state is reset on every call, so a quoted field that is still
    open at the end of ``rawline`` is not continued on the next call.

    dialect_name -- name of a dialect known to ``csv.get_dialect()``.
    """
    dialect = csv.get_dialect(dialect_name)

    ffi = cffi.FFI()

    ffi.cdef("""
        long parse_line(char *rawline, long inputlength);
    """)

    # quotechar may be None for a QUOTE_NONE dialect.  The template only
    # compares against it when quoting != QUOTE_NONE, so any placeholder
    # value is fine in that case.
    quotechar = dialect.quotechar
    subst = {
        'quotechar': ord(quotechar) if quotechar is not None else 0,
        'quoting': int(dialect.quoting),
        'skipinitialspace': int(dialect.skipinitialspace),
        'delimiter': ord(dialect.delimiter),
        'doublequote': int(dialect.doublequote),
        'strict': int(dialect.strict),
        'empty_record': _EMPTY_RECORD,
    }
    # Without an escape character the test must compile to "never true".
    if dialect.escapechar is not None:
        subst['is_escape_char'] = '== %d' % ord(dialect.escapechar)
    else:
        subst['is_escape_char'] = '&& 0'

    # NOTE: the template goes through Python %-formatting, so a literal
    # percent sign inside the C code would have to be doubled.
    ffi.set_source('_fastcsv_' + dialect_name, r'''

    typedef enum {
        START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD,
        IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
        EAT_CRNL
    } ParserState;

    /* same numbering as the csv.QUOTE_* constants */
    typedef enum {
        QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE
    } QuoteStyle;

    typedef struct {
        ParserState state;          /* current CSV parse state */
        char *field;                /* build current field in here */
        int field_size;             /* size of allocated buffer */
        int field_len;              /* length of current field */
        int numeric_field;          /* treat field as numeric */
    } ReaderObj;

    /* append one character to the field being built */
    static void
    parse_add_char(ReaderObj *self, char c)
    {
        *self->field++ = c;
    }

    /* close the current field with a NUL byte */
    static void
    parse_save_field(ReaderObj *self)
    {
        *self->field++ = 0;
    }

    /* feed one character to the state machine; returns -1 on error */
    static int
    parse_process_char(ReaderObj *self, char c)
    {
        switch (self->state) {
        case START_RECORD:
            /* start of record */
            if (c == '\0')
                /* empty line - return [] */
                break;
            else if (c == '\n' || c == '\r') {
                self->state = EAT_CRNL;
                break;
            }
            /* normal character - handle as START_FIELD */
            self->state = START_FIELD;
            /* fallthru */
        case START_FIELD:
            /* expecting field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* save empty field - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                /* start quoted field */
                self->state = IN_QUOTED_FIELD;
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == ' ' && %(skipinitialspace)d)
                /* ignore space at start of field */
                ;
            else if (c == %(delimiter)d) {
                /* save empty field */
                parse_save_field(self);
            }
            else {
                /* begin new unquoted field */
                if (%(quoting)d == QUOTE_NONNUMERIC)
                    self->numeric_field = 1;
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            break;

        case ESCAPED_CHAR:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_FIELD;
            break;

        case IN_FIELD:
            /* in unquoted field */
            if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (c %(is_escape_char)s) {
                /* possible escaped character */
                self->state = ESCAPED_CHAR;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case IN_QUOTED_FIELD:
            /* in quoted field */
            if (c == '\0')
                ;
            else if (c %(is_escape_char)s) {
                /* Possible escape character */
                self->state = ESCAPE_IN_QUOTED_FIELD;
            }
            else if (c == %(quotechar)d &&
                     %(quoting)d != QUOTE_NONE) {
                if (%(doublequote)d) {
                    /* doublequote; " represented by "" */
                    self->state = QUOTE_IN_QUOTED_FIELD;
                }
                else {
                    /* end of quote part of field */
                    self->state = IN_FIELD;
                }
            }
            else {
                /* normal character - save in field */
                parse_add_char(self, c);
            }
            break;

        case ESCAPE_IN_QUOTED_FIELD:
            if (c == '\0')
                c = '\n';
            parse_add_char(self, c);
            self->state = IN_QUOTED_FIELD;
            break;

        case QUOTE_IN_QUOTED_FIELD:
            /* doublequote - seen a quote in a quoted field */
            if (%(quoting)d != QUOTE_NONE &&
                c == %(quotechar)d) {
                /* save "" as " */
                parse_add_char(self, c);
                self->state = IN_QUOTED_FIELD;
            }
            else if (c == %(delimiter)d) {
                /* save field - wait for new field */
                parse_save_field(self);
                self->state = START_FIELD;
            }
            else if (c == '\n' || c == '\r' || c == '\0') {
                /* end of line - return [fields] */
                parse_save_field(self);
                self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
            }
            else if (!%(strict)d) {
                parse_add_char(self, c);
                self->state = IN_FIELD;
            }
            else {
                /* illegal: delimiter expected after quotechar */
                return -1;
            }
            break;

        case EAT_CRNL:
            if (c == '\n' || c == '\r')
                ;
            else if (c == '\0')
                self->state = START_RECORD;
            else {
                /* illegal: new-line character seen in unquoted field */
                return -1;
            }
            break;

        }
        return 0;
    }

    /* start a new record whose fields are written over 'rawline' itself */
    static void
    parse_reset(ReaderObj *self, char *rawline)
    {
        self->field = rawline;
        self->state = START_RECORD;
        self->numeric_field = 0;
    }

    long parse_line(char *rawline, long inputlength)
    {
        char *p;
        ReaderObj reader;
        parse_reset(&reader, rawline);

        for (p=rawline; inputlength > 0; inputlength--, p++) {
            if (parse_process_char(&reader, *p) < 0)
                return -1;
        }
        /* a final NUL tells the state machine that the input is over */
        if (parse_process_char(&reader, 0) < 0)
            return -1;
        /* A blank line saves no field at all.  Report it with its own
           code, otherwise the length below would be -1, i.e. an error. */
        if (reader.field == rawline && reader.state == START_RECORD)
            return %(empty_record)d;
        /* do not count the NUL that terminates the last field */
        return reader.field - rawline - 1;
    }
    ''' % subst)

    ffi.compile()


def fastcsv_reader(f, dialect_name):
    """Iterate over the rows of 'f', parsed according to a csv dialect.

    The compiled helper module ``'_fastcsv_' + dialect_name`` is imported;
    if the import fails it is generated with _make_ffi_from_dialect() and
    imported again.

    f            -- iterable of byte strings, one per line (for example a
                    file opened in 'rb' mode).
    dialect_name -- name of a dialect registered with the csv module.

    Yields one list of byte-string fields per line; a blank line yields an
    empty list.  Raises csv.Error when a line cannot be parsed.
    """
    try:
        module = __import__('_fastcsv_' + dialect_name)
    except ImportError:
        _make_ffi_from_dialect(dialect_name)
        module = __import__('_fastcsv_' + dialect_name)
    ffi, lib = module.ffi, module.lib
    #
    linelen = -1
    for line in f:
        # The C code rewrites the line in place and may append one final
        # NUL, so the buffer must stay strictly longer than the line.  It is
        # reused across lines and only reallocated when it gets too small.
        if linelen <= len(line):
            linelen = 2 * len(line)
            rawline = ffi.new("char[]", linelen)
        ffi.buffer(rawline, len(line))[:] = line
        n = lib.parse_line(rawline, len(line))
        if n == _EMPTY_RECORD:
            yield []
            continue
        if n < 0:
            # An explicit error instead of an assert, which disappears
            # under "python -O".
            raise csv.Error("could not parse line %r" % (line,))
        yield ffi.buffer(rawline, n)[:].split(b'\x00')


if __name__ == '__main__':
    csv.register_dialect('unixpwd', delimiter=':', quoting=csv.QUOTE_NONE)
    with open('/etc/passwd', 'rb') as f:
        reader = fastcsv_reader(f, 'unixpwd')
        for row in reader:
            print(row)