"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.

Suspicious lines are reported in a comma-separated-file,
``suspicious.csv``, located in the output directory.

The file is utf-8 encoded, and each line contains four fields:

 * document name (normalized)
 * line number in the source document
 * problematic text
 * complete line showing the problematic text in context

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``susp-ignored.csv`` file
(loaded from the parent directory of the directory containing this
module). The file has the same format as ``suspicious.csv`` with a
few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ``susp-ignored.csv`` file is by copying
undesired entries from ``suspicious.csv`` (possibly trimming the last
field.)

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder

# Bound ``finditer`` of the pattern that recognizes markup-looking fragments.
# Calling ``detect_all(text)`` yields one match object per suspicious fragment.
detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer

# True when running under Python 3; selects text-mode vs. byte-mode csv I/O.
py3 = sys.version_info >= (3, 0)


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname  # document to which this rule applies
        self.lineno = lineno    # line number in the original source;
                                # this rule matches only near that.
                                # None -> don't care
        self.issue = issue      # the markup fragment that triggered this rule
        self.line = line        # text of the container element (single line only)
        self.used = False       # set to True once the rule has matched an issue

    def __repr__(self):
        # Rendered in the csv layout with the line-number field left empty.
        return '{0.docname},,{0.issue},{0.line}'.format(self)


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'

    def init(self):
        """Create an empty log file and load the ignore rules."""
        # create (or truncate) the output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Report every found document as outdated, so all get checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI; this builder produces no linkable output."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* and check every text node of *docname*."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched any issue."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless an ignore rule matches it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Returns True (and marks the rule as used) for the first rule whose
        document name and issue text are equal to the current ones, whose
        text is contained in *line*, and whose line number (if given) is
        within 5 lines of *lineno*.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with levenshtein distance could work,
            # but that means bringing other libraries...
            # Ok, relax that requirement: just check if the rule fragment
            # is contained in the document line
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                    abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log the issue to the csv file, emit a warning, and flag failure."""
        if not self.any_issue:
            # first issue in this document: emit an empty info message
            self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        # Show the same stripped context in the warning on both Python
        # versions (it is also what gets written to the csv log).
        context = text.strip()
        if py3:
            self.warn('[%s:%d] "%s" found in "%-.120s"' %
                      (self.docname, lineno, issue, context))
        else:
            encoding = sys.getdefaultencoding()
            self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(encoding, 'replace'),
                lineno,
                issue.encode(encoding, 'replace'),
                context.encode(encoding, 'replace')))
        # make the build exit with a non-zero status
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, stripped text) to the log."""
        if py3:
            # The log is documented as utf-8; newline='' lets the csv
            # dialect alone decide the line terminator.
            with open(self.log_file_name, 'a',
                      encoding='utf-8', newline='') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname, lineno, issue, text.strip()])
        else:
            # Python 2's csv module works on bytes: encode explicitly.
            with open(self.log_file_name, 'ab') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname.encode('utf-8'),
                                 lineno,
                                 issue.encode('utf-8'),
                                 text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        A file that cannot be opened leaves ``self.rules`` empty.
        Raises ValueError if a row does not have exactly four fields.
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError:
            # No rules file is not an error; just finish the progress line.
            self.info('no rules file found')
            return
        # 'with' guarantees the file is closed even if a row is malformed.
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                if lineno:
                    lineno = int(lineno)
                else:
                    # empty field: the rule applies to any line number
                    lineno = None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node.

    Walks up through the ancestors of *node* (the node itself is not
    inspected) and returns the first ``line`` value that is not None.
    Returns None when no ancestor carries a line number.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        if node is None:
            # walked past the root without finding a line number
            break
        lineno = node.line
    return lineno


def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 8, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    # start: just after the last newline before index (0 if there is none)
    p = text.rfind('\n', 0, index) + 1
    # end: the first newline at or after index (end of text if there is none)
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Visit every node, passing suspicious text fragments to the builder."""

    # highest line number seen so far in the current document
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        """Scan text and image nodes for fragments matched by detect_all."""
        if isinstance(node, (nodes.Text, nodes.image)):  # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set()  # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        """Reset the line counter at the start of each document."""
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too much false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode