1 2""" 3csv.py - read/write/investigate CSV files 4""" 5 6import re 7from functools import reduce 8from _csv import Error, __version__, writer, reader, register_dialect, \ 9 unregister_dialect, get_dialect, list_dialects, \ 10 field_size_limit, \ 11 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ 12 __doc__ 13from _csv import Dialect as _Dialect 14 15try: 16 from cStringIO import StringIO 17except ImportError: 18 from StringIO import StringIO 19 20__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", 21 "Error", "Dialect", "__doc__", "excel", "excel_tab", 22 "field_size_limit", "reader", "writer", 23 "register_dialect", "get_dialect", "list_dialects", "Sniffer", 24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ] 25 26class Dialect: 27 """Describe an Excel dialect. 28 29 This must be subclassed (see csv.excel). Valid attributes are: 30 delimiter, quotechar, escapechar, doublequote, skipinitialspace, 31 lineterminator, quoting. 32 33 """ 34 _name = "" 35 _valid = False 36 # placeholders 37 delimiter = None 38 quotechar = None 39 escapechar = None 40 doublequote = None 41 skipinitialspace = None 42 lineterminator = None 43 quoting = None 44 45 def __init__(self): 46 if self.__class__ != Dialect: 47 self._valid = True 48 self._validate() 49 50 def _validate(self): 51 try: 52 _Dialect(self) 53 except TypeError, e: 54 # We do this for compatibility with py2.3 55 raise Error(str(e)) 56 57class excel(Dialect): 58 """Describe the usual properties of Excel-generated CSV files.""" 59 delimiter = ',' 60 quotechar = '"' 61 doublequote = True 62 skipinitialspace = False 63 lineterminator = '\r\n' 64 quoting = QUOTE_MINIMAL 65register_dialect("excel", excel) 66 67class excel_tab(excel): 68 """Describe the usual properties of Excel-generated TAB-delimited files.""" 69 delimiter = '\t' 70register_dialect("excel-tab", excel_tab) 71 72 73class DictReader: 74 def __init__(self, f, fieldnames=None, restkey=None, restval=None, 75 dialect="excel", *args, **kwds): 76 self._fieldnames = fieldnames # list of keys for the dict 77 self.restkey = restkey # key to catch long rows 78 self.restval = restval # default value for short rows 79 self.reader = reader(f, dialect, *args, **kwds) 80 self.dialect = dialect 81 self.line_num = 0 82 83 def __iter__(self): 84 return self 85 86 @property 87 def fieldnames(self): 88 if self._fieldnames is None: 89 try: 90 self._fieldnames = self.reader.next() 91 except StopIteration: 92 pass 93 self.line_num = self.reader.line_num 94 return self._fieldnames 95 96 # Issue 20004: Because DictReader is a classic class, this setter is 97 # ignored. At this point in 2.7's lifecycle, it is too late to change the 98 # base class for fear of breaking working code. If you want to change 99 # fieldnames without overwriting the getter, set _fieldnames directly. 100 @fieldnames.setter 101 def fieldnames(self, value): 102 self._fieldnames = value 103 104 def next(self): 105 if self.line_num == 0: 106 # Used only for its side effect. 107 self.fieldnames 108 row = self.reader.next() 109 self.line_num = self.reader.line_num 110 111 # unlike the basic reader, we prefer not to return blanks, 112 # because we will typically wind up with a dict full of None 113 # values 114 while row == []: 115 row = self.reader.next() 116 d = dict(zip(self.fieldnames, row)) 117 lf = len(self.fieldnames) 118 lr = len(row) 119 if lf < lr: 120 d[self.restkey] = row[lf:] 121 elif lf > lr: 122 for key in self.fieldnames[lr:]: 123 d[key] = self.restval 124 return d 125 126 127class DictWriter: 128 def __init__(self, f, fieldnames, restval="", extrasaction="raise", 129 dialect="excel", *args, **kwds): 130 self.fieldnames = fieldnames # list of keys for the dict 131 self.restval = restval # for writing short dicts 132 if extrasaction.lower() not in ("raise", "ignore"): 133 raise ValueError, \ 134 ("extrasaction (%s) must be 'raise' or 'ignore'" % 135 extrasaction) 136 self.extrasaction = extrasaction 137 self.writer = writer(f, dialect, *args, **kwds) 138 139 def writeheader(self): 140 header = dict(zip(self.fieldnames, self.fieldnames)) 141 self.writerow(header) 142 143 def _dict_to_list(self, rowdict): 144 if self.extrasaction == "raise": 145 wrong_fields = [k for k in rowdict if k not in self.fieldnames] 146 if wrong_fields: 147 raise ValueError("dict contains fields not in fieldnames: " 148 + ", ".join([repr(x) for x in wrong_fields])) 149 return [rowdict.get(key, self.restval) for key in self.fieldnames] 150 151 def writerow(self, rowdict): 152 return self.writer.writerow(self._dict_to_list(rowdict)) 153 154 def writerows(self, rowdicts): 155 rows = [] 156 for rowdict in rowdicts: 157 rows.append(self._dict_to_list(rowdict)) 158 return self.writer.writerows(rows) 159 160# Guard Sniffer's type checking against builds that exclude complex() 161try: 162 complex 163except NameError: 164 complex = float 165 166class Sniffer: 167 ''' 168 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar) 169 Returns a Dialect object. 170 ''' 171 def __init__(self): 172 # in case there is more than one possible delimiter 173 self.preferred = [',', '\t', ';', ' ', ':'] 174 175 176 def sniff(self, sample, delimiters=None): 177 """ 178 Returns a dialect (or None) corresponding to the sample 179 """ 180 181 quotechar, doublequote, delimiter, skipinitialspace = \ 182 self._guess_quote_and_delimiter(sample, delimiters) 183 if not delimiter: 184 delimiter, skipinitialspace = self._guess_delimiter(sample, 185 delimiters) 186 187 if not delimiter: 188 raise Error, "Could not determine delimiter" 189 190 class dialect(Dialect): 191 _name = "sniffed" 192 lineterminator = '\r\n' 193 quoting = QUOTE_MINIMAL 194 # escapechar = '' 195 196 dialect.doublequote = doublequote 197 dialect.delimiter = delimiter 198 # _csv.reader won't accept a quotechar of '' 199 dialect.quotechar = quotechar or '"' 200 dialect.skipinitialspace = skipinitialspace 201 202 return dialect 203 204 205 def _guess_quote_and_delimiter(self, data, delimiters): 206 """ 207 Looks for text enclosed between two identical quotes 208 (the probable quotechar) which are preceded and followed 209 by the same character (the probable delimiter). 210 For example: 211 ,'some text', 212 The quote with the most wins, same with the delimiter. 213 If there is no quotechar the delimiter can't be determined 214 this way. 215 """ 216 217 matches = [] 218 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?", 219 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?", 220 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?" 221 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space) 222 regexp = re.compile(restr, re.DOTALL | re.MULTILINE) 223 matches = regexp.findall(data) 224 if matches: 225 break 226 227 if not matches: 228 # (quotechar, doublequote, delimiter, skipinitialspace) 229 return ('', False, None, 0) 230 quotes = {} 231 delims = {} 232 spaces = 0 233 for m in matches: 234 n = regexp.groupindex['quote'] - 1 235 key = m[n] 236 if key: 237 quotes[key] = quotes.get(key, 0) + 1 238 try: 239 n = regexp.groupindex['delim'] - 1 240 key = m[n] 241 except KeyError: 242 continue 243 if key and (delimiters is None or key in delimiters): 244 delims[key] = delims.get(key, 0) + 1 245 try: 246 n = regexp.groupindex['space'] - 1 247 except KeyError: 248 continue 249 if m[n]: 250 spaces += 1 251 252 quotechar = reduce(lambda a, b, quotes = quotes: 253 (quotes[a] > quotes[b]) and a or b, quotes.keys()) 254 255 if delims: 256 delim = reduce(lambda a, b, delims = delims: 257 (delims[a] > delims[b]) and a or b, delims.keys()) 258 skipinitialspace = delims[delim] == spaces 259 if delim == '\n': # most likely a file with a single column 260 delim = '' 261 else: 262 # there is *no* delimiter, it's a single column of quoted data 263 delim = '' 264 skipinitialspace = 0 265 266 # if we see an extra quote between delimiters, we've got a 267 # double quoted format 268 dq_regexp = re.compile( 269 r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \ 270 {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE) 271 272 273 274 if dq_regexp.search(data): 275 doublequote = True 276 else: 277 doublequote = False 278 279 return (quotechar, doublequote, delim, skipinitialspace) 280 281 282 def _guess_delimiter(self, data, delimiters): 283 """ 284 The delimiter /should/ occur the same number of times on 285 each row. However, due to malformed data, it may not. We don't want 286 an all or nothing approach, so we allow for small variations in this 287 number. 288 1) build a table of the frequency of each character on every line. 289 2) build a table of frequencies of this frequency (meta-frequency?), 290 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows, 291 7 times in 2 rows' 292 3) use the mode of the meta-frequency to determine the /expected/ 293 frequency for that character 294 4) find out how often the character actually meets that goal 295 5) the character that best meets its goal is the delimiter 296 For performance reasons, the data is evaluated in chunks, so it can 297 try and evaluate the smallest portion of the data possible, evaluating 298 additional chunks as necessary. 299 """ 300 301 data = filter(None, data.split('\n')) 302 303 ascii = [chr(c) for c in range(127)] # 7-bit ASCII 304 305 # build frequency tables 306 chunkLength = min(10, len(data)) 307 iteration = 0 308 charFrequency = {} 309 modes = {} 310 delims = {} 311 start, end = 0, min(chunkLength, len(data)) 312 while start < len(data): 313 iteration += 1 314 for line in data[start:end]: 315 for char in ascii: 316 metaFrequency = charFrequency.get(char, {}) 317 # must count even if frequency is 0 318 freq = line.count(char) 319 # value is the mode 320 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 321 charFrequency[char] = metaFrequency 322 323 for char in charFrequency.keys(): 324 items = charFrequency[char].items() 325 if len(items) == 1 and items[0][0] == 0: 326 continue 327 # get the mode of the frequencies 328 if len(items) > 1: 329 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b, 330 items) 331 # adjust the mode - subtract the sum of all 332 # other frequencies 333 items.remove(modes[char]) 334 modes[char] = (modes[char][0], modes[char][1] 335 - reduce(lambda a, b: (0, a[1] + b[1]), 336 items)[1]) 337 else: 338 modes[char] = items[0] 339 340 # build a list of possible delimiters 341 modeList = modes.items() 342 total = float(chunkLength * iteration) 343 # (rows of consistent data) / (number of rows) = 100% 344 consistency = 1.0 345 # minimum consistency threshold 346 threshold = 0.9 347 while len(delims) == 0 and consistency >= threshold: 348 for k, v in modeList: 349 if v[0] > 0 and v[1] > 0: 350 if ((v[1]/total) >= consistency and 351 (delimiters is None or k in delimiters)): 352 delims[k] = v 353 consistency -= 0.01 354 355 if len(delims) == 1: 356 delim = delims.keys()[0] 357 skipinitialspace = (data[0].count(delim) == 358 data[0].count("%c " % delim)) 359 return (delim, skipinitialspace) 360 361 # analyze another chunkLength lines 362 start = end 363 end += chunkLength 364 365 if not delims: 366 return ('', 0) 367 368 # if there's more than one, fall back to a 'preferred' list 369 if len(delims) > 1: 370 for d in self.preferred: 371 if d in delims.keys(): 372 skipinitialspace = (data[0].count(d) == 373 data[0].count("%c " % d)) 374 return (d, skipinitialspace) 375 376 # nothing else indicates a preference, pick the character that 377 # dominates(?) 378 items = [(v,k) for (k,v) in delims.items()] 379 items.sort() 380 delim = items[-1][1] 381 382 skipinitialspace = (data[0].count(delim) == 383 data[0].count("%c " % delim)) 384 return (delim, skipinitialspace) 385 386 387 def has_header(self, sample): 388 # Creates a dictionary of types of data in each column. If any 389 # column is of a single type (say, integers), *except* for the first 390 # row, then the first row is presumed to be labels. If the type 391 # can't be determined, it is assumed to be a string in which case 392 # the length of the string is the determining factor: if all of the 393 # rows except for the first are the same length, it's a header. 394 # Finally, a 'vote' is taken at the end for each column, adding or 395 # subtracting from the likelihood of the first row being a header. 396 397 rdr = reader(StringIO(sample), self.sniff(sample)) 398 399 header = rdr.next() # assume first row is header 400 401 columns = len(header) 402 columnTypes = {} 403 for i in range(columns): columnTypes[i] = None 404 405 checked = 0 406 for row in rdr: 407 # arbitrary number of rows to check, to keep it sane 408 if checked > 20: 409 break 410 checked += 1 411 412 if len(row) != columns: 413 continue # skip rows that have irregular number of columns 414 415 for col in columnTypes.keys(): 416 417 for thisType in [int, long, float, complex]: 418 try: 419 thisType(row[col]) 420 break 421 except (ValueError, OverflowError): 422 pass 423 else: 424 # fallback to length of string 425 thisType = len(row[col]) 426 427 # treat longs as ints 428 if thisType == long: 429 thisType = int 430 431 if thisType != columnTypes[col]: 432 if columnTypes[col] is None: # add new column type 433 columnTypes[col] = thisType 434 else: 435 # type is inconsistent, remove column from 436 # consideration 437 del columnTypes[col] 438 439 # finally, compare results against first row and "vote" 440 # on whether it's a header 441 hasHeader = 0 442 for col, colType in columnTypes.items(): 443 if type(colType) == type(0): # it's a length 444 if len(header[col]) != colType: 445 hasHeader += 1 446 else: 447 hasHeader -= 1 448 else: # attempt typecast 449 try: 450 colType(header[col]) 451 except (ValueError, TypeError): 452 hasHeader += 1 453 else: 454 hasHeader -= 1 455 456 return hasHeader > 0 457