• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from functools import reduce
8from _csv import Error, __version__, writer, reader, register_dialect, \
9                 unregister_dialect, get_dialect, list_dialects, \
10                 field_size_limit, \
11                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12                 __doc__
13from _csv import Dialect as _Dialect
14
15try:
16    from cStringIO import StringIO
17except ImportError:
18    from StringIO import StringIO
19
20__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
21            "Error", "Dialect", "__doc__", "excel", "excel_tab",
22            "field_size_limit", "reader", "writer",
23            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
24            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25
26class Dialect:
27    """Describe an Excel dialect.
28
29    This must be subclassed (see csv.excel).  Valid attributes are:
30    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
31    lineterminator, quoting.
32
33    """
34    _name = ""
35    _valid = False
36    # placeholders
37    delimiter = None
38    quotechar = None
39    escapechar = None
40    doublequote = None
41    skipinitialspace = None
42    lineterminator = None
43    quoting = None
44
45    def __init__(self):
46        if self.__class__ != Dialect:
47            self._valid = True
48        self._validate()
49
50    def _validate(self):
51        try:
52            _Dialect(self)
53        except TypeError, e:
54            # We do this for compatibility with py2.3
55            raise Error(str(e))
56
57class excel(Dialect):
58    """Describe the usual properties of Excel-generated CSV files."""
59    delimiter = ','
60    quotechar = '"'
61    doublequote = True
62    skipinitialspace = False
63    lineterminator = '\r\n'
64    quoting = QUOTE_MINIMAL
65register_dialect("excel", excel)
66
67class excel_tab(excel):
68    """Describe the usual properties of Excel-generated TAB-delimited files."""
69    delimiter = '\t'
70register_dialect("excel-tab", excel_tab)
71
72
73class DictReader:
74    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
75                 dialect="excel", *args, **kwds):
76        self._fieldnames = fieldnames   # list of keys for the dict
77        self.restkey = restkey          # key to catch long rows
78        self.restval = restval          # default value for short rows
79        self.reader = reader(f, dialect, *args, **kwds)
80        self.dialect = dialect
81        self.line_num = 0
82
83    def __iter__(self):
84        return self
85
86    @property
87    def fieldnames(self):
88        if self._fieldnames is None:
89            try:
90                self._fieldnames = self.reader.next()
91            except StopIteration:
92                pass
93        self.line_num = self.reader.line_num
94        return self._fieldnames
95
96    # Issue 20004: Because DictReader is a classic class, this setter is
97    # ignored.  At this point in 2.7's lifecycle, it is too late to change the
98    # base class for fear of breaking working code.  If you want to change
99    # fieldnames without overwriting the getter, set _fieldnames directly.
100    @fieldnames.setter
101    def fieldnames(self, value):
102        self._fieldnames = value
103
104    def next(self):
105        if self.line_num == 0:
106            # Used only for its side effect.
107            self.fieldnames
108        row = self.reader.next()
109        self.line_num = self.reader.line_num
110
111        # unlike the basic reader, we prefer not to return blanks,
112        # because we will typically wind up with a dict full of None
113        # values
114        while row == []:
115            row = self.reader.next()
116        d = dict(zip(self.fieldnames, row))
117        lf = len(self.fieldnames)
118        lr = len(row)
119        if lf < lr:
120            d[self.restkey] = row[lf:]
121        elif lf > lr:
122            for key in self.fieldnames[lr:]:
123                d[key] = self.restval
124        return d
125
126
127class DictWriter:
128    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
129                 dialect="excel", *args, **kwds):
130        self.fieldnames = fieldnames    # list of keys for the dict
131        self.restval = restval          # for writing short dicts
132        if extrasaction.lower() not in ("raise", "ignore"):
133            raise ValueError, \
134                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
135                   extrasaction)
136        self.extrasaction = extrasaction
137        self.writer = writer(f, dialect, *args, **kwds)
138
139    def writeheader(self):
140        header = dict(zip(self.fieldnames, self.fieldnames))
141        self.writerow(header)
142
143    def _dict_to_list(self, rowdict):
144        if self.extrasaction == "raise":
145            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
146            if wrong_fields:
147                raise ValueError("dict contains fields not in fieldnames: "
148                                 + ", ".join([repr(x) for x in wrong_fields]))
149        return [rowdict.get(key, self.restval) for key in self.fieldnames]
150
151    def writerow(self, rowdict):
152        return self.writer.writerow(self._dict_to_list(rowdict))
153
154    def writerows(self, rowdicts):
155        rows = []
156        for rowdict in rowdicts:
157            rows.append(self._dict_to_list(rowdict))
158        return self.writer.writerows(rows)
159
160# Guard Sniffer's type checking against builds that exclude complex()
161try:
162    complex
163except NameError:
164    complex = float
165
166class Sniffer:
167    '''
168    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
169    Returns a Dialect object.
170    '''
171    def __init__(self):
172        # in case there is more than one possible delimiter
173        self.preferred = [',', '\t', ';', ' ', ':']
174
175
176    def sniff(self, sample, delimiters=None):
177        """
178        Returns a dialect (or None) corresponding to the sample
179        """
180
181        quotechar, doublequote, delimiter, skipinitialspace = \
182                   self._guess_quote_and_delimiter(sample, delimiters)
183        if not delimiter:
184            delimiter, skipinitialspace = self._guess_delimiter(sample,
185                                                                delimiters)
186
187        if not delimiter:
188            raise Error, "Could not determine delimiter"
189
190        class dialect(Dialect):
191            _name = "sniffed"
192            lineterminator = '\r\n'
193            quoting = QUOTE_MINIMAL
194            # escapechar = ''
195
196        dialect.doublequote = doublequote
197        dialect.delimiter = delimiter
198        # _csv.reader won't accept a quotechar of ''
199        dialect.quotechar = quotechar or '"'
200        dialect.skipinitialspace = skipinitialspace
201
202        return dialect
203
204
205    def _guess_quote_and_delimiter(self, data, delimiters):
206        """
207        Looks for text enclosed between two identical quotes
208        (the probable quotechar) which are preceded and followed
209        by the same character (the probable delimiter).
210        For example:
211                         ,'some text',
212        The quote with the most wins, same with the delimiter.
213        If there is no quotechar the delimiter can't be determined
214        this way.
215        """
216
217        matches = []
218        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
219                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
220                      '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
221                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
222            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
223            matches = regexp.findall(data)
224            if matches:
225                break
226
227        if not matches:
228            # (quotechar, doublequote, delimiter, skipinitialspace)
229            return ('', False, None, 0)
230        quotes = {}
231        delims = {}
232        spaces = 0
233        for m in matches:
234            n = regexp.groupindex['quote'] - 1
235            key = m[n]
236            if key:
237                quotes[key] = quotes.get(key, 0) + 1
238            try:
239                n = regexp.groupindex['delim'] - 1
240                key = m[n]
241            except KeyError:
242                continue
243            if key and (delimiters is None or key in delimiters):
244                delims[key] = delims.get(key, 0) + 1
245            try:
246                n = regexp.groupindex['space'] - 1
247            except KeyError:
248                continue
249            if m[n]:
250                spaces += 1
251
252        quotechar = reduce(lambda a, b, quotes = quotes:
253                           (quotes[a] > quotes[b]) and a or b, quotes.keys())
254
255        if delims:
256            delim = reduce(lambda a, b, delims = delims:
257                           (delims[a] > delims[b]) and a or b, delims.keys())
258            skipinitialspace = delims[delim] == spaces
259            if delim == '\n': # most likely a file with a single column
260                delim = ''
261        else:
262            # there is *no* delimiter, it's a single column of quoted data
263            delim = ''
264            skipinitialspace = 0
265
266        # if we see an extra quote between delimiters, we've got a
267        # double quoted format
268        dq_regexp = re.compile(
269                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
270                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
271
272
273
274        if dq_regexp.search(data):
275            doublequote = True
276        else:
277            doublequote = False
278
279        return (quotechar, doublequote, delim, skipinitialspace)
280
281
282    def _guess_delimiter(self, data, delimiters):
283        """
284        The delimiter /should/ occur the same number of times on
285        each row. However, due to malformed data, it may not. We don't want
286        an all or nothing approach, so we allow for small variations in this
287        number.
288          1) build a table of the frequency of each character on every line.
289          2) build a table of frequencies of this frequency (meta-frequency?),
290             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
291             7 times in 2 rows'
292          3) use the mode of the meta-frequency to determine the /expected/
293             frequency for that character
294          4) find out how often the character actually meets that goal
295          5) the character that best meets its goal is the delimiter
296        For performance reasons, the data is evaluated in chunks, so it can
297        try and evaluate the smallest portion of the data possible, evaluating
298        additional chunks as necessary.
299        """
300
301        data = filter(None, data.split('\n'))
302
303        ascii = [chr(c) for c in range(127)] # 7-bit ASCII
304
305        # build frequency tables
306        chunkLength = min(10, len(data))
307        iteration = 0
308        charFrequency = {}
309        modes = {}
310        delims = {}
311        start, end = 0, min(chunkLength, len(data))
312        while start < len(data):
313            iteration += 1
314            for line in data[start:end]:
315                for char in ascii:
316                    metaFrequency = charFrequency.get(char, {})
317                    # must count even if frequency is 0
318                    freq = line.count(char)
319                    # value is the mode
320                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
321                    charFrequency[char] = metaFrequency
322
323            for char in charFrequency.keys():
324                items = charFrequency[char].items()
325                if len(items) == 1 and items[0][0] == 0:
326                    continue
327                # get the mode of the frequencies
328                if len(items) > 1:
329                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
330                                         items)
331                    # adjust the mode - subtract the sum of all
332                    # other frequencies
333                    items.remove(modes[char])
334                    modes[char] = (modes[char][0], modes[char][1]
335                                   - reduce(lambda a, b: (0, a[1] + b[1]),
336                                            items)[1])
337                else:
338                    modes[char] = items[0]
339
340            # build a list of possible delimiters
341            modeList = modes.items()
342            total = float(chunkLength * iteration)
343            # (rows of consistent data) / (number of rows) = 100%
344            consistency = 1.0
345            # minimum consistency threshold
346            threshold = 0.9
347            while len(delims) == 0 and consistency >= threshold:
348                for k, v in modeList:
349                    if v[0] > 0 and v[1] > 0:
350                        if ((v[1]/total) >= consistency and
351                            (delimiters is None or k in delimiters)):
352                            delims[k] = v
353                consistency -= 0.01
354
355            if len(delims) == 1:
356                delim = delims.keys()[0]
357                skipinitialspace = (data[0].count(delim) ==
358                                    data[0].count("%c " % delim))
359                return (delim, skipinitialspace)
360
361            # analyze another chunkLength lines
362            start = end
363            end += chunkLength
364
365        if not delims:
366            return ('', 0)
367
368        # if there's more than one, fall back to a 'preferred' list
369        if len(delims) > 1:
370            for d in self.preferred:
371                if d in delims.keys():
372                    skipinitialspace = (data[0].count(d) ==
373                                        data[0].count("%c " % d))
374                    return (d, skipinitialspace)
375
376        # nothing else indicates a preference, pick the character that
377        # dominates(?)
378        items = [(v,k) for (k,v) in delims.items()]
379        items.sort()
380        delim = items[-1][1]
381
382        skipinitialspace = (data[0].count(delim) ==
383                            data[0].count("%c " % delim))
384        return (delim, skipinitialspace)
385
386
387    def has_header(self, sample):
388        # Creates a dictionary of types of data in each column. If any
389        # column is of a single type (say, integers), *except* for the first
390        # row, then the first row is presumed to be labels. If the type
391        # can't be determined, it is assumed to be a string in which case
392        # the length of the string is the determining factor: if all of the
393        # rows except for the first are the same length, it's a header.
394        # Finally, a 'vote' is taken at the end for each column, adding or
395        # subtracting from the likelihood of the first row being a header.
396
397        rdr = reader(StringIO(sample), self.sniff(sample))
398
399        header = rdr.next() # assume first row is header
400
401        columns = len(header)
402        columnTypes = {}
403        for i in range(columns): columnTypes[i] = None
404
405        checked = 0
406        for row in rdr:
407            # arbitrary number of rows to check, to keep it sane
408            if checked > 20:
409                break
410            checked += 1
411
412            if len(row) != columns:
413                continue # skip rows that have irregular number of columns
414
415            for col in columnTypes.keys():
416
417                for thisType in [int, long, float, complex]:
418                    try:
419                        thisType(row[col])
420                        break
421                    except (ValueError, OverflowError):
422                        pass
423                else:
424                    # fallback to length of string
425                    thisType = len(row[col])
426
427                # treat longs as ints
428                if thisType == long:
429                    thisType = int
430
431                if thisType != columnTypes[col]:
432                    if columnTypes[col] is None: # add new column type
433                        columnTypes[col] = thisType
434                    else:
435                        # type is inconsistent, remove column from
436                        # consideration
437                        del columnTypes[col]
438
439        # finally, compare results against first row and "vote"
440        # on whether it's a header
441        hasHeader = 0
442        for col, colType in columnTypes.items():
443            if type(colType) == type(0): # it's a length
444                if len(header[col]) != colType:
445                    hasHeader += 1
446                else:
447                    hasHeader -= 1
448            else: # attempt typecast
449                try:
450                    colType(header[col])
451                except (ValueError, TypeError):
452                    hasHeader += 1
453                else:
454                    hasHeader -= 1
455
456        return hasHeader > 0
457