1 2r""" 3CSV parsing and writing. 4 5This module provides classes that assist in the reading and writing 6of Comma Separated Value (CSV) files, and implements the interface 7described by PEP 305. Although many CSV files are simple to parse, 8the format is not formally defined by a stable specification and 9is subtle enough that parsing lines of a CSV file with something 10like line.split(",") is bound to fail. The module supports three 11basic APIs: reading, writing, and registration of dialects. 12 13 14DIALECT REGISTRATION: 15 16Readers and writers support a dialect argument, which is a convenient 17handle on a group of settings. When the dialect argument is a string, 18it identifies one of the dialects previously registered with the module. 19If it is a class or instance, the attributes of the argument are used as 20the settings for the reader or writer: 21 22 class excel: 23 delimiter = ',' 24 quotechar = '"' 25 escapechar = None 26 doublequote = True 27 skipinitialspace = False 28 lineterminator = '\r\n' 29 quoting = QUOTE_MINIMAL 30 31SETTINGS: 32 33 * quotechar - specifies a one-character string to use as the 34 quoting character. It defaults to '"'. 35 * delimiter - specifies a one-character string to use as the 36 field separator. It defaults to ','. 37 * skipinitialspace - specifies how to interpret spaces which 38 immediately follow a delimiter. It defaults to False, which 39 means that spaces immediately following a delimiter is part 40 of the following field. 41 * lineterminator - specifies the character sequence which should 42 terminate rows. 43 * quoting - controls when quotes should be generated by the writer. 44 It can take on any of the following module constants: 45 46 csv.QUOTE_MINIMAL means only when required, for example, when a 47 field contains either the quotechar or the delimiter 48 csv.QUOTE_ALL means that quotes are always placed around fields. 49 csv.QUOTE_NONNUMERIC means that quotes are always placed around 50 fields which do not parse as integers or floating-point 51 numbers. 52 csv.QUOTE_STRINGS means that quotes are always placed around 53 fields which are strings. Note that the Python value None 54 is not a string. 55 csv.QUOTE_NOTNULL means that quotes are only placed around fields 56 that are not the Python value None. 57 csv.QUOTE_NONE means that quotes are never placed around fields. 58 * escapechar - specifies a one-character string used to escape 59 the delimiter when quoting is set to QUOTE_NONE. 60 * doublequote - controls the handling of quotes inside fields. When 61 True, two consecutive quotes are interpreted as one during read, 62 and when writing, each quote character embedded in the data is 63 written as two quotes 64""" 65 66import re 67import types 68from _csv import Error, writer, reader, register_dialect, \ 69 unregister_dialect, get_dialect, list_dialects, \ 70 field_size_limit, \ 71 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ 72 QUOTE_STRINGS, QUOTE_NOTNULL 73from _csv import Dialect as _Dialect 74 75from io import StringIO 76 77__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", 78 "QUOTE_STRINGS", "QUOTE_NOTNULL", 79 "Error", "Dialect", "excel", "excel_tab", 80 "field_size_limit", "reader", "writer", 81 "register_dialect", "get_dialect", "list_dialects", "Sniffer", 82 "unregister_dialect", "DictReader", "DictWriter", 83 "unix_dialect"] 84 85__version__ = "1.0" 86 87 88class Dialect: 89 """Describe a CSV dialect. 90 91 This must be subclassed (see csv.excel). Valid attributes are: 92 delimiter, quotechar, escapechar, doublequote, skipinitialspace, 93 lineterminator, quoting. 94 95 """ 96 _name = "" 97 _valid = False 98 # placeholders 99 delimiter = None 100 quotechar = None 101 escapechar = None 102 doublequote = None 103 skipinitialspace = None 104 lineterminator = None 105 quoting = None 106 107 def __init__(self): 108 if self.__class__ != Dialect: 109 self._valid = True 110 self._validate() 111 112 def _validate(self): 113 try: 114 _Dialect(self) 115 except TypeError as e: 116 # Re-raise to get a traceback showing more user code. 117 raise Error(str(e)) from None 118 119class excel(Dialect): 120 """Describe the usual properties of Excel-generated CSV files.""" 121 delimiter = ',' 122 quotechar = '"' 123 doublequote = True 124 skipinitialspace = False 125 lineterminator = '\r\n' 126 quoting = QUOTE_MINIMAL 127register_dialect("excel", excel) 128 129class excel_tab(excel): 130 """Describe the usual properties of Excel-generated TAB-delimited files.""" 131 delimiter = '\t' 132register_dialect("excel-tab", excel_tab) 133 134class unix_dialect(Dialect): 135 """Describe the usual properties of Unix-generated CSV files.""" 136 delimiter = ',' 137 quotechar = '"' 138 doublequote = True 139 skipinitialspace = False 140 lineterminator = '\n' 141 quoting = QUOTE_ALL 142register_dialect("unix", unix_dialect) 143 144 145class DictReader: 146 def __init__(self, f, fieldnames=None, restkey=None, restval=None, 147 dialect="excel", *args, **kwds): 148 if fieldnames is not None and iter(fieldnames) is fieldnames: 149 fieldnames = list(fieldnames) 150 self._fieldnames = fieldnames # list of keys for the dict 151 self.restkey = restkey # key to catch long rows 152 self.restval = restval # default value for short rows 153 self.reader = reader(f, dialect, *args, **kwds) 154 self.dialect = dialect 155 self.line_num = 0 156 157 def __iter__(self): 158 return self 159 160 @property 161 def fieldnames(self): 162 if self._fieldnames is None: 163 try: 164 self._fieldnames = next(self.reader) 165 except StopIteration: 166 pass 167 self.line_num = self.reader.line_num 168 return self._fieldnames 169 170 @fieldnames.setter 171 def fieldnames(self, value): 172 self._fieldnames = value 173 174 def __next__(self): 175 if self.line_num == 0: 176 # Used only for its side effect. 177 self.fieldnames 178 row = next(self.reader) 179 self.line_num = self.reader.line_num 180 181 # unlike the basic reader, we prefer not to return blanks, 182 # because we will typically wind up with a dict full of None 183 # values 184 while row == []: 185 row = next(self.reader) 186 d = dict(zip(self.fieldnames, row)) 187 lf = len(self.fieldnames) 188 lr = len(row) 189 if lf < lr: 190 d[self.restkey] = row[lf:] 191 elif lf > lr: 192 for key in self.fieldnames[lr:]: 193 d[key] = self.restval 194 return d 195 196 __class_getitem__ = classmethod(types.GenericAlias) 197 198 199class DictWriter: 200 def __init__(self, f, fieldnames, restval="", extrasaction="raise", 201 dialect="excel", *args, **kwds): 202 if fieldnames is not None and iter(fieldnames) is fieldnames: 203 fieldnames = list(fieldnames) 204 self.fieldnames = fieldnames # list of keys for the dict 205 self.restval = restval # for writing short dicts 206 extrasaction = extrasaction.lower() 207 if extrasaction not in ("raise", "ignore"): 208 raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'" 209 % extrasaction) 210 self.extrasaction = extrasaction 211 self.writer = writer(f, dialect, *args, **kwds) 212 213 def writeheader(self): 214 header = dict(zip(self.fieldnames, self.fieldnames)) 215 return self.writerow(header) 216 217 def _dict_to_list(self, rowdict): 218 if self.extrasaction == "raise": 219 wrong_fields = rowdict.keys() - self.fieldnames 220 if wrong_fields: 221 raise ValueError("dict contains fields not in fieldnames: " 222 + ", ".join([repr(x) for x in wrong_fields])) 223 return (rowdict.get(key, self.restval) for key in self.fieldnames) 224 225 def writerow(self, rowdict): 226 return self.writer.writerow(self._dict_to_list(rowdict)) 227 228 def writerows(self, rowdicts): 229 return self.writer.writerows(map(self._dict_to_list, rowdicts)) 230 231 __class_getitem__ = classmethod(types.GenericAlias) 232 233 234class Sniffer: 235 ''' 236 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar) 237 Returns a Dialect object. 238 ''' 239 def __init__(self): 240 # in case there is more than one possible delimiter 241 self.preferred = [',', '\t', ';', ' ', ':'] 242 243 244 def sniff(self, sample, delimiters=None): 245 """ 246 Returns a dialect (or None) corresponding to the sample 247 """ 248 249 quotechar, doublequote, delimiter, skipinitialspace = \ 250 self._guess_quote_and_delimiter(sample, delimiters) 251 if not delimiter: 252 delimiter, skipinitialspace = self._guess_delimiter(sample, 253 delimiters) 254 255 if not delimiter: 256 raise Error("Could not determine delimiter") 257 258 class dialect(Dialect): 259 _name = "sniffed" 260 lineterminator = '\r\n' 261 quoting = QUOTE_MINIMAL 262 # escapechar = '' 263 264 dialect.doublequote = doublequote 265 dialect.delimiter = delimiter 266 # _csv.reader won't accept a quotechar of '' 267 dialect.quotechar = quotechar or '"' 268 dialect.skipinitialspace = skipinitialspace 269 270 return dialect 271 272 273 def _guess_quote_and_delimiter(self, data, delimiters): 274 """ 275 Looks for text enclosed between two identical quotes 276 (the probable quotechar) which are preceded and followed 277 by the same character (the probable delimiter). 278 For example: 279 ,'some text', 280 The quote with the most wins, same with the delimiter. 281 If there is no quotechar the delimiter can't be determined 282 this way. 283 """ 284 285 matches = [] 286 for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?", 287 r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?", 288 r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?" 289 r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space) 290 regexp = re.compile(restr, re.DOTALL | re.MULTILINE) 291 matches = regexp.findall(data) 292 if matches: 293 break 294 295 if not matches: 296 # (quotechar, doublequote, delimiter, skipinitialspace) 297 return ('', False, None, 0) 298 quotes = {} 299 delims = {} 300 spaces = 0 301 groupindex = regexp.groupindex 302 for m in matches: 303 n = groupindex['quote'] - 1 304 key = m[n] 305 if key: 306 quotes[key] = quotes.get(key, 0) + 1 307 try: 308 n = groupindex['delim'] - 1 309 key = m[n] 310 except KeyError: 311 continue 312 if key and (delimiters is None or key in delimiters): 313 delims[key] = delims.get(key, 0) + 1 314 try: 315 n = groupindex['space'] - 1 316 except KeyError: 317 continue 318 if m[n]: 319 spaces += 1 320 321 quotechar = max(quotes, key=quotes.get) 322 323 if delims: 324 delim = max(delims, key=delims.get) 325 skipinitialspace = delims[delim] == spaces 326 if delim == '\n': # most likely a file with a single column 327 delim = '' 328 else: 329 # there is *no* delimiter, it's a single column of quoted data 330 delim = '' 331 skipinitialspace = 0 332 333 # if we see an extra quote between delimiters, we've got a 334 # double quoted format 335 dq_regexp = re.compile( 336 r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \ 337 {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE) 338 339 340 341 if dq_regexp.search(data): 342 doublequote = True 343 else: 344 doublequote = False 345 346 return (quotechar, doublequote, delim, skipinitialspace) 347 348 349 def _guess_delimiter(self, data, delimiters): 350 """ 351 The delimiter /should/ occur the same number of times on 352 each row. However, due to malformed data, it may not. We don't want 353 an all or nothing approach, so we allow for small variations in this 354 number. 355 1) build a table of the frequency of each character on every line. 356 2) build a table of frequencies of this frequency (meta-frequency?), 357 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows, 358 7 times in 2 rows' 359 3) use the mode of the meta-frequency to determine the /expected/ 360 frequency for that character 361 4) find out how often the character actually meets that goal 362 5) the character that best meets its goal is the delimiter 363 For performance reasons, the data is evaluated in chunks, so it can 364 try and evaluate the smallest portion of the data possible, evaluating 365 additional chunks as necessary. 366 """ 367 368 data = list(filter(None, data.split('\n'))) 369 370 ascii = [chr(c) for c in range(127)] # 7-bit ASCII 371 372 # build frequency tables 373 chunkLength = min(10, len(data)) 374 iteration = 0 375 charFrequency = {} 376 modes = {} 377 delims = {} 378 start, end = 0, chunkLength 379 while start < len(data): 380 iteration += 1 381 for line in data[start:end]: 382 for char in ascii: 383 metaFrequency = charFrequency.get(char, {}) 384 # must count even if frequency is 0 385 freq = line.count(char) 386 # value is the mode 387 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 388 charFrequency[char] = metaFrequency 389 390 for char in charFrequency.keys(): 391 items = list(charFrequency[char].items()) 392 if len(items) == 1 and items[0][0] == 0: 393 continue 394 # get the mode of the frequencies 395 if len(items) > 1: 396 modes[char] = max(items, key=lambda x: x[1]) 397 # adjust the mode - subtract the sum of all 398 # other frequencies 399 items.remove(modes[char]) 400 modes[char] = (modes[char][0], modes[char][1] 401 - sum(item[1] for item in items)) 402 else: 403 modes[char] = items[0] 404 405 # build a list of possible delimiters 406 modeList = modes.items() 407 total = float(min(chunkLength * iteration, len(data))) 408 # (rows of consistent data) / (number of rows) = 100% 409 consistency = 1.0 410 # minimum consistency threshold 411 threshold = 0.9 412 while len(delims) == 0 and consistency >= threshold: 413 for k, v in modeList: 414 if v[0] > 0 and v[1] > 0: 415 if ((v[1]/total) >= consistency and 416 (delimiters is None or k in delimiters)): 417 delims[k] = v 418 consistency -= 0.01 419 420 if len(delims) == 1: 421 delim = list(delims.keys())[0] 422 skipinitialspace = (data[0].count(delim) == 423 data[0].count("%c " % delim)) 424 return (delim, skipinitialspace) 425 426 # analyze another chunkLength lines 427 start = end 428 end += chunkLength 429 430 if not delims: 431 return ('', 0) 432 433 # if there's more than one, fall back to a 'preferred' list 434 if len(delims) > 1: 435 for d in self.preferred: 436 if d in delims.keys(): 437 skipinitialspace = (data[0].count(d) == 438 data[0].count("%c " % d)) 439 return (d, skipinitialspace) 440 441 # nothing else indicates a preference, pick the character that 442 # dominates(?) 443 items = [(v,k) for (k,v) in delims.items()] 444 items.sort() 445 delim = items[-1][1] 446 447 skipinitialspace = (data[0].count(delim) == 448 data[0].count("%c " % delim)) 449 return (delim, skipinitialspace) 450 451 452 def has_header(self, sample): 453 # Creates a dictionary of types of data in each column. If any 454 # column is of a single type (say, integers), *except* for the first 455 # row, then the first row is presumed to be labels. If the type 456 # can't be determined, it is assumed to be a string in which case 457 # the length of the string is the determining factor: if all of the 458 # rows except for the first are the same length, it's a header. 459 # Finally, a 'vote' is taken at the end for each column, adding or 460 # subtracting from the likelihood of the first row being a header. 461 462 rdr = reader(StringIO(sample), self.sniff(sample)) 463 464 header = next(rdr) # assume first row is header 465 466 columns = len(header) 467 columnTypes = {} 468 for i in range(columns): columnTypes[i] = None 469 470 checked = 0 471 for row in rdr: 472 # arbitrary number of rows to check, to keep it sane 473 if checked > 20: 474 break 475 checked += 1 476 477 if len(row) != columns: 478 continue # skip rows that have irregular number of columns 479 480 for col in list(columnTypes.keys()): 481 thisType = complex 482 try: 483 thisType(row[col]) 484 except (ValueError, OverflowError): 485 # fallback to length of string 486 thisType = len(row[col]) 487 488 if thisType != columnTypes[col]: 489 if columnTypes[col] is None: # add new column type 490 columnTypes[col] = thisType 491 else: 492 # type is inconsistent, remove column from 493 # consideration 494 del columnTypes[col] 495 496 # finally, compare results against first row and "vote" 497 # on whether it's a header 498 hasHeader = 0 499 for col, colType in columnTypes.items(): 500 if isinstance(colType, int): # it's a length 501 if len(header[col]) != colType: 502 hasHeader += 1 503 else: 504 hasHeader -= 1 505 else: # attempt typecast 506 try: 507 colType(header[col]) 508 except (ValueError, TypeError): 509 hasHeader += 1 510 else: 511 hasHeader -= 1 512 513 return hasHeader > 0 514