r"""
CSV parsing and writing.

This module provides classes that assist in the reading and writing
of Comma Separated Value (CSV) files, and implements the interface
described by PEP 305.  Although many CSV files are simple to parse,
the format is not formally defined by a stable specification and
is subtle enough that parsing lines of a CSV file with something
like line.split(",") is bound to fail.  The module supports three
basic APIs: reading, writing, and registration of dialects.


DIALECT REGISTRATION:

Readers and writers support a dialect argument, which is a convenient
handle on a group of settings.  When the dialect argument is a string,
it identifies one of the dialects previously registered with the module.
If it is a class or instance, the attributes of the argument are used as
the settings for the reader or writer:

    class excel:
        delimiter = ','
        quotechar = '"'
        escapechar = None
        doublequote = True
        skipinitialspace = False
        lineterminator = '\r\n'
        quoting = QUOTE_MINIMAL

SETTINGS:

    * quotechar - specifies a one-character string to use as the
        quoting character.  It defaults to '"'.
    * delimiter - specifies a one-character string to use as the
        field separator.  It defaults to ','.
    * skipinitialspace - specifies how to interpret spaces which
        immediately follow a delimiter.  It defaults to False, which
        means that spaces immediately following a delimiter are part
        of the following field.
    * lineterminator - specifies the character sequence which should
        terminate rows.
    * quoting - controls when quotes should be generated by the writer.
        It can take on any of the following module constants:

        csv.QUOTE_MINIMAL means only when required, for example, when a
            field contains either the quotechar or the delimiter
        csv.QUOTE_ALL means that quotes are always placed around fields.
        csv.QUOTE_NONNUMERIC means that quotes are always placed around
            fields which do not parse as integers or floating-point
            numbers.
        csv.QUOTE_STRINGS means that quotes are always placed around
            fields which are strings.  Note that the Python value None
            is not a string.
        csv.QUOTE_NOTNULL means that quotes are only placed around fields
            that are not the Python value None.
        csv.QUOTE_NONE means that quotes are never placed around fields.
    * escapechar - specifies a one-character string used to escape
        the delimiter when quoting is set to QUOTE_NONE.
    * doublequote - controls the handling of quotes inside fields.  When
        True, two consecutive quotes are interpreted as one during read,
        and when writing, each quote character embedded in the data is
        written as two quotes
"""
65
import re
import types
# The heavy lifting (parsing, quoting, dialect validation) lives in the
# C accelerator module _csv; this module adds the dict-based and
# sniffing conveniences on top of it.
from _csv import Error, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 field_size_limit, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 QUOTE_STRINGS, QUOTE_NOTNULL
from _csv import Dialect as _Dialect

from io import StringIO

# Public API of this module; re-exports from _csv plus the pure-Python
# classes defined below.
__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "QUOTE_STRINGS", "QUOTE_NOTNULL",
           "Error", "Dialect", "excel", "excel_tab",
           "field_size_limit", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "DictReader", "DictWriter",
           "unix_dialect"]

__version__ = "1.0"
86
87
class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders; subclasses override the settings they care about
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are considered valid dialects; instantiating
        # the base class directly fails validation below.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate validation to the C implementation, which enforces
        # the full set of constraints (one-character delimiter, etc.).
        try:
            _Dialect(self)
        except TypeError as e:
            # Re-raise to get a traceback showing more user code.
            raise Error(str(e)) from None
118
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
128
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
133
class unix_dialect(Dialect):
    """Describe the usual properties of Unix-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
    quoting = QUOTE_ALL
register_dialect("unix", unix_dialect)
143
144
class DictReader:
    """Read rows of a CSV file as dictionaries keyed by field names.

    Field names come from the *fieldnames* argument or, if omitted, are
    read lazily from the first row of the file.  Extra values in a long
    row are collected in a list under *restkey*; missing values in a
    short row are filled with *restval*.  Blank rows are skipped.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            # Materialize one-shot iterators so fieldnames survives reuse.
            fieldnames = list(fieldnames)
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the first row as the header if none was given.
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def __next__(self):
        if self.line_num == 0:
            # Used only for its side effect (consuming the header row).
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # long row: stash the extras under restkey
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # short row: pad the missing keys with restval
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    __class_getitem__ = classmethod(types.GenericAlias)
197
198
class DictWriter:
    """Write dictionaries to a CSV file as rows ordered by *fieldnames*.

    Missing keys are filled with *restval*; keys not listed in
    *fieldnames* either raise ValueError (extrasaction="raise", the
    default) or are silently dropped (extrasaction="ignore").
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        if fieldnames is not None and iter(fieldnames) is fieldnames:
            # Materialize one-shot iterators so fieldnames survives reuse.
            fieldnames = list(fieldnames)
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        extrasaction = extrasaction.lower()
        if extrasaction not in ("raise", "ignore"):
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        # A header row is just a row mapping each field name to itself.
        header = dict(zip(self.fieldnames, self.fieldnames))
        return self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Convert a dict row into the field-ordered sequence the
        # underlying writer expects, enforcing the extrasaction policy.
        if self.extrasaction == "raise":
            wrong_fields = rowdict.keys() - self.fieldnames
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: "
                                 + ", ".join([repr(x) for x in wrong_fields]))
        return (rowdict.get(key, self.restval) for key in self.fieldnames)

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(map(self._dict_to_list, rowdicts))

    __class_getitem__ = classmethod(types.GenericAlias)
232
233
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample.

        *delimiters*, if given, restricts the candidate delimiter
        characters.  Raises Error if no delimiter can be determined.
        """

        # First try the quote-based heuristic; fall back to the
        # character-frequency heuristic when the sample is unquoted.
        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        # Try progressively weaker patterns; stop at the first that hits.
        matches = []
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        # Tally candidate quote and delimiter characters across matches;
        # the most frequent of each wins.
        quotes = {}
        delims = {}
        spaces = 0
        groupindex = regexp.groupindex
        for m in matches:
            n = groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # pattern had no delim group (last regex above)
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = max(quotes, key=quotes.get)

        if delims:
            delim = max(delims, key=delims.get)
            # if every delimiter was followed by a space, assume
            # skipinitialspace
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = list(filter(None, data.split('\n')))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, chunkLength
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    # character never appears; not a candidate
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = max(items, key=lambda x: x[1])
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - sum(item[1] for item in items))
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(min(chunkLength * iteration, len(data)))
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            # relax the consistency requirement until a candidate appears
            # (or we drop below the threshold)
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = next(iter(delims))
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in list(columnTypes.keys()):
                # complex() accepts any numeric string, so it stands in
                # for "this cell is a number"
                thisType = complex
                try:
                    thisType(row[col])
                except (ValueError, OverflowError):
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if isinstance(colType, int): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
514