• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2#
3# Copyright (C) 2018 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16"""Merge multiple CSV files, possibly with different columns.
17"""
18
19import argparse
20import csv
21import io
22import heapq
23import itertools
24import operator
25
26from zipfile import ZipFile
27
# Command line interface. The parser is built and the arguments are parsed
# at module level, because this file is meant to be executed as a script.
args_parser = argparse.ArgumentParser(
    description='Merge given CSV files into a single one.')

# Optional explicit header; otherwise it is derived from the input files.
args_parser.add_argument(
    '--header',
    help=('Comma separated field names; '
          'if missing determines the header from input files.'))

# Switches the inputs from plain CSV files to ZIP archives of CSV files.
args_parser.add_argument(
    '--zip_input',
    action='store_true',
    help='Treat files as ZIP archives containing CSV files to merge.')

# Optional sort key; enables the merge-sort of the (pre-sorted) inputs.
args_parser.add_argument(
    '--key_field',
    help=('The name of the field by which the rows should be sorted. '
          'Must be in the field names. '
          'Will be the first field in the output. '
          'All input files must be sorted by that field.'))

# Destination of the merged CSV; '-' (the default) is opened by
# argparse.FileType as the standard output stream.
args_parser.add_argument(
    '--output',
    type=argparse.FileType('w'),
    default='-',
    help='Output file for merged CSV.')

# Everything left on the command line is taken as the list of input files.
args_parser.add_argument('files', nargs=argparse.REMAINDER)

args = args_parser.parse_args()
56
57
def dict_reader(csvfile):
    """Return a ``csv.DictReader`` over *csvfile* in this script's CSV format.

    Fields are separated by ',' and quoted with '|'.
    """
    csv_format = {'delimiter': ',', 'quotechar': '|'}
    return csv.DictReader(csvfile, **csv_format)
60
61
# Open every input as a csv.DictReader. The readers are consumed lazily
# further down, so the plain files are deliberately left open here.
csv_readers = []
if not args.zip_input:
    for file in args.files:
        csv_readers.append(dict_reader(open(file, 'r')))
else:
    for file in args.files:
        # NOTE(review): the entry streams opened below are read only after
        # this ZipFile context has exited; this relies on zipfile keeping
        # already-opened members readable -- confirm.
        with ZipFile(file) as zipfile:
            for entry in zipfile.namelist():
                # Only archive members with the '.uau' suffix are merged.
                if entry.endswith('.uau'):
                    csv_readers.append(
                        dict_reader(io.TextIOWrapper(zipfile.open(entry, 'r')))
                    )

if args.header:
    fieldnames = args.header.split(',')
else:
    # Build union of all columns from source files. A dict is used as an
    # insertion-ordered set so columns keep the order they were first seen.
    headers = {}
    for reader in csv_readers:
        # DictReader.fieldnames is None when the input has no header row
        # (e.g. an empty file); such an input contributes no columns.
        for fieldname in reader.fieldnames or ():
            headers[fieldname] = ""
    fieldnames = list(headers)

# By default chain the csv readers together so that the resulting output is
# the concatenation of the rows from each of them:
all_rows = itertools.chain.from_iterable(csv_readers)

key_field = args.key_field
if csv_readers and key_field:
    # Report a bad --key_field as a usage error instead of an assert, which
    # would be stripped when running under ``python -O``.
    if key_field not in fieldnames:
        args_parser.error(
            "--key_field {} not found, must be one of {}".format(
                key_field, ",".join(fieldnames)
            )
        )
    # Make the key field the first field in the output
    fieldnames.insert(0, fieldnames.pop(fieldnames.index(key_field)))
    # Create an iterable that performs a lazy merge sort on the csv readers
    # sorting the rows by the key field.
    all_rows = heapq.merge(*csv_readers, key=operator.itemgetter(key_field))

# Write all rows from the input files to the output. The explicit keyword
# arguments take precedence over the settings of the 'unix' dialect.
writer = csv.DictWriter(
    args.output,
    delimiter=',',
    quotechar='|',
    quoting=csv.QUOTE_MINIMAL,
    dialect='unix',
    fieldnames=fieldnames,
)
writer.writeheader()

# Read all the rows from the input and write them to the output in the correct
# order:
writer.writerows(all_rows)
117