"""text_file

provides the TextFile class, which gives an interface to text files
that (optionally) takes care of stripping comments, ignoring blank
lines, and joining lines with backslashes."""

import io
import sys


class TextFile:
    """Provides a file-like object that takes care of all the things you
    commonly want to do when processing a text file that has some
    line-by-line syntax: strip comments (as long as "#" is your
    comment character), skip blank lines, join adjacent lines by
    escaping the newline (ie. backslash at end of line), strip
    leading and/or trailing whitespace.  All of these are optional
    and independently controllable.

    Provides a 'warn()' method so you can generate warning messages that
    report physical line number, even if the logical line in question
    spans multiple physical lines.  Also provides 'unreadline()' for
    implementing line-at-a-time lookahead.

    Constructor is called as:

        TextFile (filename=None, file=None, **options)

    It bombs (RuntimeError) if both 'filename' and 'file' are None;
    'filename' should be a string, and 'file' a file object (or
    something that provides 'readline()' and 'close()' methods).  It is
    recommended that you supply at least 'filename', so that TextFile
    can include it in warning messages.  If 'file' is not supplied,
    TextFile creates its own using 'io.open()'.

    The options are all boolean, and affect the value returned by
    'readline()':
      strip_comments [default: true]
        strip from "#" to end-of-line, as well as any whitespace
        leading up to the "#" -- unless it is escaped by a backslash
      lstrip_ws [default: false]
        strip leading whitespace from each line before returning it
      rstrip_ws [default: true]
        strip trailing whitespace (including line terminator!) from
        each line before returning it
      skip_blanks [default: true]
        skip lines that are empty *after* stripping comments and
        whitespace.  (If both lstrip_ws and rstrip_ws are false,
        then some lines may consist of solely whitespace: these will
        *not* be skipped, even if 'skip_blanks' is true.)
      join_lines [default: false]
        if a backslash is the last non-newline character on a line
        after stripping comments and whitespace, join the following line
        to it to form one "logical line"; if N consecutive lines end
        with a backslash, then N+1 physical lines will be joined to
        form one logical line.
      collapse_join [default: false]
        strip leading whitespace from lines that are joined to their
        predecessor; only matters if (join_lines and not lstrip_ws)
      errors [default: 'strict']
        error handler used to decode the file content

    Note that since 'rstrip_ws' can strip the trailing newline, the
    semantics of 'readline()' must differ from those of the builtin file
    object's 'readline()' method!  In particular, 'readline()' returns
    None for end-of-file: an empty string might just be a blank line (or
    an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is
    not."""

    default_options = { 'strip_comments': 1,
                        'skip_blanks':    1,
                        'lstrip_ws':      0,
                        'rstrip_ws':      1,
                        'join_lines':     0,
                        'collapse_join':  0,
                        'errors':         'strict',
                      }

    def __init__(self, filename=None, file=None, **options):
        """Construct a new TextFile object.  At least one of 'filename'
           (a string) and 'file' (a file-like object) must be supplied.
           The keyword argument options are described above and affect
           the values returned by 'readline()'.

           Raises RuntimeError if neither 'filename' nor 'file' is given,
           and KeyError if an unknown option name is passed."""
        if filename is None and file is None:
            raise RuntimeError("you must supply either or both of 'filename' and 'file'")

        # sanity check client option hash before touching any state
        for opt in options:
            if opt not in self.default_options:
                raise KeyError("invalid TextFile option '%s'" % opt)

        # set values for all options -- either from client option hash
        # or fallback to default_options
        for opt, default in self.default_options.items():
            setattr(self, opt, options.get(opt, default))

        if file is None:
            self.open(filename)
        else:
            self.filename = filename
            self.file = file
            self.current_line = 0       # assuming that file is at BOF!

        # 'linebuf' is a stack of lines that will be emptied before we
        # actually read from the file; it's only populated by an
        # 'unreadline()' operation
        self.linebuf = []

    def open(self, filename):
        """Open a new file named 'filename'.  This overrides both the
           'filename' and 'file' arguments to the constructor."""
        self.filename = filename
        self.file = io.open(self.filename, 'r', errors=self.errors)
        self.current_line = 0

    def close(self):
        """Close the current file and forget everything we know about it
           (filename, current line number).  Calling 'close()' again on
           an already-closed TextFile is a no-op."""
        file = self.file
        self.file = None
        self.filename = None
        self.current_line = None
        if file is not None:
            file.close()

    def gen_error(self, msg, line=None):
        """Return 'msg' prefixed with the filename and line number(s).

           'line' may be an integer (single physical line) or a list or
           tuple of two integers (range of physical lines); if omitted,
           the current line number is used.  The filename prefix is left
           out when no filename is known (ie. the object was built from
           a file object only)."""
        outmsg = []
        if line is None:
            line = self.current_line
        if self.filename is not None:
            outmsg.append(self.filename + ", ")
        if isinstance(line, (list, tuple)):
            outmsg.append("lines %d-%d: " % tuple(line))
        else:
            outmsg.append("line %d: " % line)
        outmsg.append(str(msg))
        return "".join(outmsg)

    def error(self, msg, line=None):
        """Raise ValueError with 'msg' tied to the current logical line
           (or to 'line', interpreted as for 'gen_error()')."""
        raise ValueError("error: " + self.gen_error(msg, line))

    def warn(self, msg, line=None):
        """Print (to stderr) a warning message tied to the current logical
           line in the current file.  If the current logical line in the
           file spans multiple physical lines, the warning refers to the
           whole range, eg. "lines 3-5".  If 'line' supplied, it overrides
           the current line number; it may be a list or tuple to indicate a
           range of physical lines, or an integer for a single physical
           line."""
        sys.stderr.write("warning: " + self.gen_error(msg, line) + "\n")

    def _count_physical_line(self, joining):
        """Advance 'current_line' past one physical line just consumed.

           'current_line' is an integer while the logical line is a
           single physical line, and a two-element list [first, last]
           once it spans several.  If 'joining' is true the physical
           line extends the current logical line; otherwise it starts a
           new one."""
        spanning = isinstance(self.current_line, list)
        if joining:
            if spanning:
                self.current_line[1] = self.current_line[1] + 1
            else:
                self.current_line = [self.current_line,
                                     self.current_line + 1]
        elif spanning:
            self.current_line = self.current_line[1] + 1
        else:
            self.current_line = self.current_line + 1

    def readline(self):
        """Read and return a single logical line from the current file (or
           from an internal buffer if lines have previously been "unread"
           with 'unreadline()').  If the 'join_lines' option is true, this
           may involve reading multiple physical lines concatenated into a
           single string.  Updates the current line number, so calling
           'warn()' after 'readline()' emits a warning about the physical
           line(s) just read.  Returns None on end-of-file, since the empty
           string can occur if 'rstrip_ws' is true but 'skip_blanks' is
           not."""
        # If any "unread" lines waiting in 'linebuf', return the top
        # one.  (We don't actually buffer read-ahead data -- lines only
        # get put in 'linebuf' if the client explicitly does an
        # 'unreadline()'.)
        if self.linebuf:
            return self.linebuf.pop()

        buildup_line = ''

        while True:
            # read the line, make it None if EOF
            line = self.file.readline()
            if line == '':
                line = None

            if self.strip_comments and line:

                # Look for the first "#" in the line.  If none, never
                # mind.  If we find one and it's the first character, or
                # is not preceded by "\", then it starts a comment --
                # strip the comment, strip whitespace before it, and
                # carry on.  Otherwise, it's just an escaped "#", so
                # unescape it (and any other escaped "#"'s that might be
                # lurking in there) and otherwise leave the line alone.

                pos = line.find("#")
                if pos == -1: # no "#" -- no comments
                    pass

                # It's definitely a comment -- either "#" is the first
                # character, or it's elsewhere and unescaped.
                elif pos == 0 or line[pos-1] != "\\":
                    # Have to preserve the trailing newline, because it's
                    # the job of a later step (rstrip_ws) to remove it --
                    # and if rstrip_ws is false, we'd better preserve it!
                    # (NB. this means that if the final line is all comment
                    # and has no trailing newline, we will think that it's
                    # EOF; I think that's OK.)
                    eol = '\n' if line.endswith('\n') else ''
                    line = line[0:pos] + eol

                    # If all that's left is whitespace, then skip line
                    # *now*, before we try to join it to 'buildup_line' --
                    # that way constructs like
                    #   hello \\
                    #   # comment that should be ignored
                    #   there
                    # result in "hello there".
                    if line.strip() == "":
                        # The skipped line is still a physical line:
                        # count it, so that the line numbers reported by
                        # 'warn()' and 'error()' stay accurate.
                        self._count_physical_line(
                            bool(self.join_lines and buildup_line))
                        continue
                else: # it's an escaped "#"
                    line = line.replace("\\#", "#")

            # did previous line end with a backslash? then accumulate
            if self.join_lines and buildup_line:
                # oops: end of file
                if line is None:
                    self.warn("continuation line immediately precedes "
                              "end-of-file")
                    return buildup_line

                if self.collapse_join:
                    line = line.lstrip()
                line = buildup_line + line

                # this physical line extends the current logical line
                self._count_physical_line(True)
            # just an ordinary line, read it as usual
            else:
                if line is None: # eof
                    return None

                # this physical line starts a new logical line
                self._count_physical_line(False)

            # strip whitespace however the client wants (leading and
            # trailing, or one or the other, or neither)
            if self.lstrip_ws and self.rstrip_ws:
                line = line.strip()
            elif self.lstrip_ws:
                line = line.lstrip()
            elif self.rstrip_ws:
                line = line.rstrip()

            # blank line (whether we rstrip'ed or not)? skip to next line
            # if appropriate
            if (line == '' or line == '\n') and self.skip_blanks:
                continue

            if self.join_lines:
                # 'line' may be empty here (rstrip_ws true, skip_blanks
                # false), so test with endswith() rather than indexing.
                if line.endswith('\\'):
                    buildup_line = line[:-1]
                    continue

                if line[-2:] == '\\\n':
                    buildup_line = line[0:-2] + '\n'
                    continue

            # well, I guess there's some actual content there: return it
            return line

    def readlines(self):
        """Read and return the list of all logical lines remaining in the
           current file."""
        lines = []
        while True:
            line = self.readline()
            if line is None:
                return lines
            lines.append(line)

    def unreadline(self, line):
        """Push 'line' (a string) onto an internal buffer that will be
           checked by future 'readline()' calls.  Handy for implementing
           a parser with line-at-a-time lookahead."""
        self.linebuf.append(line)