"""Helpers for reading, writing, parsing, and rendering simple text tables."""

from collections import namedtuple
import csv
import re
import textwrap

from . import NOT_SET, strutil, fsutil


# Canonical markers that fix_row() substitutes into a row.
EMPTY = '-'
UNKNOWN = '???'


def parse_markers(markers, default=None):
    """Normalize a "markers" argument into a collection of marker strings.

    * NOT_SET -> *default*
    * any other falsy value -> None
    * a non-str value -> returned unchanged
    * a str consisting of one repeated character (e.g. '???') -> [markers]
    * any other str -> a list of its individual characters
    """
    if markers is NOT_SET:
        return default
    if not markers:
        return None
    if type(markers) is not str:
        return markers
    if markers == markers[0] * len(markers):
        # A run of a single character is treated as one multi-char marker.
        return [markers]
    return list(markers)


def fix_row(row, **markers):
    """Return *row* as a tuple with its marker values normalized.

    Falsy values become None.  Values found in the "empty" markers
    (keyword "empty", default ('-',)) become EMPTY and values found in
    the "unknown" markers (keyword "unknown", default ('???',)) become
    UNKNOWN.  Passing a str as *row* is not supported.
    """
    if isinstance(row, str):
        raise NotImplementedError(row)
    empty = parse_markers(markers.pop('empty', ('-',)))
    unknown = parse_markers(markers.pop('unknown', ('???',)))
    row = (val if val else None for val in row)
    if not empty:
        if unknown:
            row = (UNKNOWN if val in unknown else val for val in row)
    elif not unknown:
        row = (EMPTY if val in empty else val for val in row)
    else:
        row = (EMPTY if val in empty else (UNKNOWN if val in unknown else val)
               for val in row)
    return tuple(row)


def _fix_read_default(row):
    """Yield each value of *row* with surrounding whitespace stripped."""
    for value in row:
        yield value.strip()


def _fix_write_default(row, empty=''):
    """Yield each value of *row* as a str, with None replaced by *empty*."""
    for value in row:
        yield empty if value is None else str(value)


def _normalize_fix_read(fix):
    """Return a row-fixing function for read_table().

    *fix* may be None (same as ''), a callable applied to each row before
    the values are stripped, or a str; stripped values equal to that str
    are replaced with None.
    """
    if fix is None:
        fix = ''
    if callable(fix):
        def fix_row(row):
            values = fix(row)
            return _fix_read_default(values)
    elif isinstance(fix, str):
        def fix_row(row):
            values = _fix_read_default(row)
            return (None if v == fix else v
                    for v in values)
    else:
        raise NotImplementedError(fix)
    return fix_row


def _normalize_fix_write(fix, empty=''):
    """Return a row-fixing function for write_table().

    *fix* may be None (same as *empty*), a callable applied to each row
    before the values are stringified, or a str used in place of None
    values.
    """
    if fix is None:
        fix = empty
    if callable(fix):
        def fix_row(row):
            values = fix(row)
            return _fix_write_default(values, empty)
    elif isinstance(fix, str):
        def fix_row(row):
            return _fix_write_default(row, fix)
    else:
        raise NotImplementedError(fix)
    return fix_row


def read_table(infile, header, *,
               sep='\t',
               fix=None,
               _open=open,
               _get_reader=csv.reader,
               ):
    """Yield each row (as a tuple) of the given *sep*-separated file.

    *infile* is either a filename or an iterable of lines.  *header* is
    the expected header, either as a str or as a sequence of column
    names; ValueError is raised if the first significant line does not
    match it.  *fix* is interpreted by _normalize_fix_read().
    """
    if isinstance(infile, str):
        with _open(infile, newline='') as infile:
            yield from read_table(
                infile,
                header,
                sep=sep,
                fix=fix,
                _open=_open,
                _get_reader=_get_reader,
            )
            return
    # NOTE(review): presumably this skips blank/comment lines; see strutil.
    lines = strutil._iter_significant_lines(infile)

    # Validate the header.
    if not isinstance(header, str):
        # Use the same fallback separator as the reader below.
        header = (sep or '\t').join(header)
    try:
        actualheader = next(lines).strip()
    except StopIteration:
        actualheader = ''
    if actualheader != header:
        raise ValueError(f'bad header {actualheader!r}')

    fix_row = _normalize_fix_read(fix)
    for row in _get_reader(lines, delimiter=sep or '\t'):
        yield tuple(fix_row(row))


def write_table(outfile, header, rows, *,
                sep='\t',
                fix=None,
                backup=True,
                _open=open,
                _get_writer=csv.writer,
                ):
    """Write the header and each of the rows to a *sep*-separated file.

    *outfile* is either a filename or a writable file object.  *header*
    is a str (split on *sep*) or a sequence of column names.  If *backup*
    is truthy then fsutil.create_backup() is called first.  *fix* is
    interpreted by _normalize_fix_write().
    """
    if backup:
        fsutil.create_backup(outfile, backup)
    if isinstance(outfile, str):
        with _open(outfile, 'w', newline='') as outfile:
            return write_table(
                outfile,
                header,
                rows,
                sep=sep,
                fix=fix,
                # The backup was already requested above.  Asking again
                # now would happen after the file was truncated by 'w'.
                backup=False,
                _open=_open,
                _get_writer=_get_writer,
            )

    if isinstance(header, str):
        header = header.split(sep or '\t')
    fix_row = _normalize_fix_write(fix)
    writer = _get_writer(outfile, delimiter=sep or '\t')
    writer.writerow(header)
    for row in rows:
        writer.writerow(
            tuple(fix_row(row))
        )


def parse_table(entries, sep, header=None, rawsep=None, *,
                default=NOT_SET,
                strict=True,
                ):
    """Yield (row, filename) for each line produced from *entries*.

    Lines come from strutil.parse_entries().  When a header is given,
    the first line of each new file must be that header (it is skipped).
    For lines that have no filename and do not contain *sep*, *rawsep*
    (if given) is used to split instead.  With *strict*, every row must
    have the same number of columns as the header (or as the first row);
    short rows are padded with *default* if it is set.
    """
    header, sep = _normalize_table_file_props(header, sep)
    if not sep:
        raise ValueError('missing "sep"')

    ncols = None
    if header:
        if strict:
            ncols = len(header.split(sep))
    cur_file = None
    for line, filename in strutil.parse_entries(entries, ignoresep=sep):
        _sep = sep
        if filename:
            if header and cur_file != filename:
                cur_file = filename
                # Skip the first line if it's the header.
                if line.strip() == header:
                    continue
                else:
                    # We expected the header.
                    raise NotImplementedError((header, line))
        elif rawsep and sep not in line:
            _sep = rawsep

        row = _parse_row(line, _sep, ncols, default)
        if strict and not ncols:
            ncols = len(row)
        yield row, filename


def parse_row(line, sep, *, ncols=None, default=NOT_SET):
    """Split *line* on *sep* and return the stripped values as a tuple.

    See _parse_row() for the meaning of *ncols* and *default*.
    """
    if not sep:
        raise ValueError('missing "sep"')
    return _parse_row(line, sep, ncols, default)


def _parse_row(line, sep, ncols, default):
    """Split *line* into a tuple of stripped values.

    If *ncols* is a positive number then the row must have exactly that
    many columns.  A short row is padded with *default*, unless that is
    NOT_SET.  Otherwise ValueError is raised.
    """
    row = tuple(v.strip() for v in line.split(sep))
    if (ncols or 0) > 0:
        diff = ncols - len(row)
        if diff:
            if default is NOT_SET or diff < 0:
                raise ValueError(
                    f'bad row (expected {ncols} columns, got {row!r})')
            row += (default,) * diff
    return row


def _normalize_table_file_props(header, sep):
    """Return (header, sep) with the header as a str (or None).

    A non-str header is joined with *sep*.  If *sep* is not given then
    it is guessed from a str header by looking for a tab, a comma, or
    a space (in that order); it is None if none of them is found.
    """
    if not header:
        return None, sep

    if not isinstance(header, str):
        if not sep:
            raise NotImplementedError(header)
        header = sep.join(header)
    elif not sep:
        for sep in ('\t', ',', ' '):
            if sep in header:
                break
        else:
            sep = None
    return header, sep


##################################
# stdout tables

# The column width used when nothing more specific is available.
WIDTH = 20


def resolve_columns(specs):
    """Return the list of ColumnSpec objects for the given raw specs.

    *specs* may be a str of comma- and/or whitespace-separated specs
    or an iterable of raw specs (see ColumnSpec.from_raw()).
    """
    if isinstance(specs, str):
        specs = specs.replace(',', ' ').strip().split()
    resolved = []
    for raw in specs:
        column = ColumnSpec.from_raw(raw)
        resolved.append(column)
    return resolved


def build_table(specs, *, sep=' ', defaultwidth=None):
    """Return (header, divider, row format) strings for the given specs."""
    columns = resolve_columns(specs)
    return _build_table(columns, sep=sep, defaultwidth=defaultwidth)


class ColumnSpec(namedtuple('ColumnSpec', 'field label fmt')):
    """The field name, label, and format of a single table column."""

    # Matches "[<label>]<field>" optionally followed by either
    # ":<align>[<width>]" or "[:<width>][:<fmt>]".
    REGEX = re.compile(textwrap.dedent(r'''
        ^
        (?:
            \[
            (
                (?: [^\s\]] [^\]]* )?
                [^\s\]]
            )  # <label>
            ]
        )?
        ( [-\w]+ )  # <field>
        (?:
            (?:
                :
                ( [<^>] )  # <align>
                ( \d+ )?  # <width1>
            )
            |
            (?:
                (?:
                    :
                    ( \d+ )  # <width2>
                )?
                (?:
                    :
                    ( .*? )  # <fmt>
                )?
            )
        )?
        $
    '''), re.VERBOSE)

    @classmethod
    def from_raw(cls, raw):
        """Return the ColumnSpec for *raw*.

        *raw* may be a ColumnSpec (returned as-is), a spec string, or a
        sequence accepted by _normalize().  ValueError is raised if it
        is missing or cannot be parsed.
        """
        if not raw:
            raise ValueError('missing column spec')
        elif isinstance(raw, cls):
            return raw

        if isinstance(raw, str):
            parsed = cls._parse(raw)
        else:
            parsed = cls._normalize(raw)
        # Check before unpacking; unpacking None would raise TypeError.
        if parsed is None:
            raise ValueError(f'unsupported column spec {raw!r}')
        *values, _ = parsed
        return cls(*values)

    @classmethod
    def parse(cls, specstr):
        """Return the ColumnSpec for the spec string (None if no match)."""
        parsed = cls._parse(specstr)
        if not parsed:
            return None
        *values, _ = parsed
        return cls(*values)

    @classmethod
    def _parse(cls, specstr):
        """Return (field, label, fmt, width) parsed from *specstr*.

        None is returned if the string does not match REGEX.  The label
        defaults to the field name.  NotImplementedError is raised for
        a format that _parse_fmt() does not understand or that
        disagrees with an explicit width.
        """
        m = cls.REGEX.match(specstr)
        if not m:
            return None
        (label, field,
         align, width1,
         width2, fmt,
         ) = m.groups()
        if not label:
            label = field
        if fmt:
            assert not align and not width1, (specstr,)
            _parsed = _parse_fmt(fmt)
            if not _parsed:
                raise NotImplementedError
            # Always bind the width, even without an explicit <width2>.
            width, _ = _parsed
            if width2 and width != int(width2):
                raise NotImplementedError(specstr)
        elif width2:
            fmt = width2
            width = int(width2)
        else:
            assert not fmt, (fmt, specstr)
            if align:
                width = int(width1) if width1 else len(label)
                fmt = f'{align}{width}'
            else:
                width = None
        return field, label, fmt, width

    @classmethod
    def _normalize(cls, spec):
        """Return (field, label, fmt, width) for a non-str raw spec.

        Supported shapes:

        * (label, field, width, fmt)
        * (label, field, fmt)
        * (field, fmt) or (label, field)

        The values are combined into a spec string which is handed
        to _parse(), so None is returned if that string does not match.
        """
        if len(spec) == 1:
            raise NotImplementedError(spec)

        if len(spec) == 4:
            label, field, width, fmt = spec
            if width:
                if not fmt:
                    fmt = str(width)
                else:
                    # An unparsable fmt is left for _parse() to reject.
                    parsed = _parse_fmt(fmt)
                    if parsed and parsed[0] != width:
                        raise ValueError(f'width mismatch in {spec}')
        elif len(spec) == 3:
            label, field, fmt = spec
            if not field:
                label, field = None, label
            elif not isinstance(field, str) or not field.isidentifier():
                # XXX This doesn't seem right...
                fmt = f'{field}:{fmt}' if fmt else field
                label, field = None, label
        elif len(spec) == 2:
            label = None
            field, fmt = spec
            if not field:
                field, fmt = fmt, None
            elif fmt and (not field.isidentifier() or fmt.isidentifier()):
                # The pair is (label, field) so there is no format.
                label, field, fmt = field, fmt, None
        else:
            raise NotImplementedError

        fmt = f':{fmt}' if fmt else ''
        if label:
            return cls._parse(f'[{label}]{field}{fmt}')
        else:
            return cls._parse(f'{field}{fmt}')

    @property
    def width(self):
        """The width encoded in the format (None if there isn't one)."""
        if not self.fmt:
            return None
        parsed = _parse_fmt(self.fmt)
        if not parsed:
            return None
        width, _ = parsed
        return width

    def resolve_width(self, default=None):
        """Return the column's width, falling back to *default* or WIDTH."""
        return _resolve_width(self.width, self.fmt, self.label, default)


def _parse_fmt(fmt):
    """Return (width, align) for "<N", "^N", ">N", or "N" (else None).

    A bare number is treated as left-aligned.
    """
    if fmt.startswith(tuple('<^>')):
        align = fmt[0]
        width = fmt[1:]
        if width.isdigit():
            return int(width), align
    elif fmt.isdigit():
        return int(fmt), '<'
    return None


def _resolve_width(width, fmt, label, default):
    """Return the width to use for a column.

    The explicit *width* wins, then the width encoded in *fmt*.
    Otherwise *default* is used: a mapping is looked up by *label*
    (falling back to its None key), and a falsy value means WIDTH.
    """
    if width:
        if not isinstance(width, int):
            raise NotImplementedError
        return width
    elif fmt:
        parsed = _parse_fmt(fmt)
        if parsed:
            width, _ = parsed
            if width:
                return width

    if not default:
        return WIDTH
    elif hasattr(default, 'get'):
        defaults = default
        default = defaults.get(None) or WIDTH
        return defaults.get(label) or default
    else:
        return default or WIDTH


def _build_table(columns, *, sep=' ', defaultwidth=None):
    """Return (header, divider, row format) strings for the columns.

    The header holds each label centered in its column, the divider is
    a run of dashes per column, and the row format is a str.format()
    template keyed by field name.
    """
    header = []
    div = []
    rowfmt = []
    for spec in columns:
        width = spec.resolve_width(defaultwidth)
        colfmt = f':{spec.fmt}' if spec.fmt else f':{width}'

        header.append(f' {{:^{width}}} '.format(spec.label))
        # The extra 2 covers the single space of padding on each side.
        div.append('-' * (width + 2))
        rowfmt.append(f' {{{spec.field}{colfmt}}} ')
    return (
        sep.join(header),
        sep.join(div),
        sep.join(rowfmt),
    )