• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from _csv import Error, __version__, writer, reader, register_dialect, \
8                 unregister_dialect, get_dialect, list_dialects, \
9                 field_size_limit, \
10                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11                 __doc__
12from _csv import Dialect as _Dialect
13
14from io import StringIO
15
# Public API: constants and functions re-exported from the C accelerator
# module (_csv) plus the pure-Python classes defined in this file.
__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "Error", "Dialect", "__doc__", "excel", "excel_tab",
           "field_size_limit", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "__version__", "DictReader", "DictWriter",
           "unix_dialect"]
22
class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # Placeholder attributes; concrete subclasses are expected to
    # override these with real values.
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses count as usable dialects; the abstract base
        # (all placeholders None) never flips _valid to True.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Let the C implementation check the attribute values, then
        # normalize its TypeError into this module's Error type.
        try:
            _Dialect(self)
        except TypeError as err:
            # We do this for compatibility with py2.3
            raise Error(str(err))
53
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True          # embedded quotes are doubled, not escaped
    skipinitialspace = False
    lineterminator = '\r\n'     # Excel writes CRLF line endings
    quoting = QUOTE_MINIMAL     # quote only fields that require it
register_dialect("excel", excel)
63
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the excel dialect except for the delimiter.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
68
class unix_dialect(Dialect):
    """Describe the usual properties of Unix-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'       # LF only, unlike excel's CRLF
    quoting = QUOTE_ALL         # quote every field unconditionally
register_dialect("unix", unix_dialect)
78
79
class DictReader:
    """Read rows from *f* as dicts mapping field names to values.

    If *fieldnames* is omitted, the first row of the file supplies them.
    Extra values in long rows are collected under *restkey*; missing
    values in short rows are filled with *restval*.
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily read the field names from the first row if they were
        # not supplied to the constructor; a StopIteration here simply
        # leaves them as None (empty input).
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: stash the surplus values in a list under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the remaining fields with restval.
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
128
129
class DictWriter:
    """Write dicts to *f* as CSV rows, ordered by *fieldnames*.

    Keys missing from a row dict are written as *restval*; keys not in
    *fieldnames* either raise ValueError or are ignored, depending on
    *extrasaction* ("raise" or "ignore").
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # determines column order
        self.restval = restval          # filler for keys absent from a row
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        # A header row is just each field name mapped to itself.
        return self.writerow({name: name for name in self.fieldnames})

    def _dict_to_list(self, rowdict):
        # Reject unknown keys up front when configured to do so.
        if self.extrasaction == "raise":
            wrong_fields = rowdict.keys() - self.fieldnames
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join(map(repr, wrong_fields)))
        return (rowdict.get(key, self.restval) for key in self.fieldnames)

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(self._dict_to_list(rowdict)
                                     for rowdict in rowdicts)
158
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # No complex() in this build; fall back to float for the numeric
    # probe used by Sniffer.has_header().
    complex = float
164
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        # Try the quote-based heuristic first: it can determine the
        # quote character and the delimiter in a single pass.
        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # No quoted fields found -- fall back to character
            # frequency analysis.
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        quotes = {}
        delims = {}
        spaces = 0
        # findall() returns flat tuples, so map group names to tuple
        # positions via groupindex (1-based, hence the -1 below).
        groupindex = regexp.groupindex
        for m in matches:
            n = groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # The matched pattern has no 'delim' group.
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # The most frequently seen quote character wins.
        quotechar = max(quotes, key=quotes.get)

        if delims:
            delim = max(delims, key=delims.get)
            # If every delimiter occurrence was followed by a space,
            # assume skipinitialspace.
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # Drop empty lines; analysis works on the non-empty rows only.
        data = list(filter(None, data.split('\n')))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, chunkLength
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    # metaFrequency maps per-line count -> number of
                    # lines with that count, for this character.
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = list(charFrequency[char].items())
                # Skip characters that never appear on any line.
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = max(items, key=lambda x: x[1])
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(min(chunkLength * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            # Relax the consistency requirement in 1% steps until at
            # least one candidate delimiter qualifies (or we hit the
            # threshold floor).
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        """Return True if the first row of *sample* looks like a header."""
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in list(columnTypes.keys()):
                # Probe with complex() (accepts any numeric literal).
                thisType = complex
                try:
                    thisType(row[col])
                except (ValueError, OverflowError):
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
445