1""" 2Try to detect suspicious constructs, resembling markup 3that has leaked into the final output. 4 5Suspicious lines are reported in a comma-separated-file, 6``suspicious.csv``, located in the output directory. 7 8The file is utf-8 encoded, and each line contains four fields: 9 10 * document name (normalized) 11 * line number in the source document 12 * problematic text 13 * complete line showing the problematic text in context 14 15It is common to find many false positives. To avoid reporting them 16again and again, they may be added to the ``ignored.csv`` file 17(located in the configuration directory). The file has the same 18format as ``suspicious.csv`` with a few differences: 19 20 - each line defines a rule; if the rule matches, the issue 21 is ignored. 22 - line number may be empty (that is, nothing between the 23 commas: ",,"). In this case, line numbers are ignored (the 24 rule matches anywhere in the file). 25 - the last field does not have to be a complete line; some 26 surrounding text (never more than a line) is enough for 27 context. 28 29Rules are processed sequentially. A rule matches when: 30 31 * document names are the same 32 * problematic texts are the same 33 * line numbers are close to each other (5 lines up or down) 34 * the rule text is completely contained into the source line 35 36The simplest way to create the ignored.csv file is by copying 37undesired entries from suspicious.csv (possibly trimming the last 38field.) 39 40Copyright 2009 Gabriel A. Genellina 41 42""" 43 44import os 45import re 46import csv 47import sys 48 49from docutils import nodes 50from sphinx.builders import Builder 51import sphinx.util 52 53detect_all = re.compile(r''' 54 ::(?=[^=])| # two :: (but NOT ::=) 55 :[a-zA-Z][a-zA-Z0-9]+| # :foo 56 `| # ` (seldom used by itself) 57 (?<!\.)\.\.[ \t]*\w+: # .. foo: (but NOT ... else:) 58 ''', re.UNICODE | re.VERBOSE).finditer 59 60py3 = sys.version_info >= (3, 0) 61 62 63class Rule: 64 def __init__(self, docname, lineno, issue, line): 65 """A rule for ignoring issues""" 66 self.docname = docname # document to which this rule applies 67 self.lineno = lineno # line number in the original source; 68 # this rule matches only near that. 69 # None -> don't care 70 self.issue = issue # the markup fragment that triggered this rule 71 self.line = line # text of the container element (single line only) 72 self.used = False 73 74 def __repr__(self): 75 return '{0.docname},,{0.issue},{0.line}'.format(self) 76 77 78 79class dialect(csv.excel): 80 """Our dialect: uses only linefeed as newline.""" 81 lineterminator = '\n' 82 83 84class CheckSuspiciousMarkupBuilder(Builder): 85 """ 86 Checks for possibly invalid markup that may leak into the output. 
87 """ 88 name = 'suspicious' 89 logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder") 90 91 def init(self): 92 # create output file 93 self.log_file_name = os.path.join(self.outdir, 'suspicious.csv') 94 open(self.log_file_name, 'w').close() 95 # load database of previously ignored issues 96 self.load_rules(os.path.join(os.path.dirname(__file__), '..', 97 'susp-ignored.csv')) 98 99 def get_outdated_docs(self): 100 return self.env.found_docs 101 102 def get_target_uri(self, docname, typ=None): 103 return '' 104 105 def prepare_writing(self, docnames): 106 pass 107 108 def write_doc(self, docname, doctree): 109 # set when any issue is encountered in this document 110 self.any_issue = False 111 self.docname = docname 112 visitor = SuspiciousVisitor(doctree, self) 113 doctree.walk(visitor) 114 115 def finish(self): 116 unused_rules = [rule for rule in self.rules if not rule.used] 117 if unused_rules: 118 self.logger.warning( 119 'Found %s/%s unused rules: %s' % ( 120 len(unused_rules), len(self.rules), 121 ''.join(repr(rule) for rule in unused_rules), 122 ) 123 ) 124 return 125 126 def check_issue(self, line, lineno, issue): 127 if not self.is_ignored(line, lineno, issue): 128 self.report_issue(line, lineno, issue) 129 130 def is_ignored(self, line, lineno, issue): 131 """Determine whether this issue should be ignored.""" 132 docname = self.docname 133 for rule in self.rules: 134 if rule.docname != docname: continue 135 if rule.issue != issue: continue 136 # Both lines must match *exactly*. This is rather strict, 137 # and probably should be improved. 138 # Doing fuzzy matches with levenshtein distance could work, 139 # but that means bringing other libraries... 140 # Ok, relax that requirement: just check if the rule fragment 141 # is contained in the document line 142 if rule.line not in line: continue 143 # Check both line numbers. If they're "near" 144 # this rule matches. (lineno=None means "don't care") 145 if (rule.lineno is not None) and \ 146 abs(rule.lineno - lineno) > 5: continue 147 # if it came this far, the rule matched 148 rule.used = True 149 return True 150 return False 151 152 def report_issue(self, text, lineno, issue): 153 self.any_issue = True 154 self.write_log_entry(lineno, issue, text) 155 if py3: 156 self.logger.warning('[%s:%d] "%s" found in "%-.120s"' % 157 (self.docname, lineno, issue, text)) 158 else: 159 self.logger.warning( 160 '[%s:%d] "%s" found in "%-.120s"' % ( 161 self.docname.encode(sys.getdefaultencoding(),'replace'), 162 lineno, 163 issue.encode(sys.getdefaultencoding(),'replace'), 164 text.strip().encode(sys.getdefaultencoding(),'replace'))) 165 self.app.statuscode = 1 166 167 def write_log_entry(self, lineno, issue, text): 168 if py3: 169 f = open(self.log_file_name, 'a') 170 writer = csv.writer(f, dialect) 171 writer.writerow([self.docname, lineno, issue, text.strip()]) 172 f.close() 173 else: 174 f = open(self.log_file_name, 'ab') 175 writer = csv.writer(f, dialect) 176 writer.writerow([self.docname.encode('utf-8'), 177 lineno, 178 issue.encode('utf-8'), 179 text.strip().encode('utf-8')]) 180 f.close() 181 182 def load_rules(self, filename): 183 """Load database of previously ignored issues. 184 185 A csv file, with exactly the same format as suspicious.csv 186 Fields: document name (normalized), line number, issue, surrounding text 187 """ 188 self.logger.info("loading ignore rules... 
", nonl=1) 189 self.rules = rules = [] 190 try: 191 if py3: 192 f = open(filename, 'r') 193 else: 194 f = open(filename, 'rb') 195 except IOError: 196 return 197 for i, row in enumerate(csv.reader(f)): 198 if len(row) != 4: 199 raise ValueError( 200 "wrong format in %s, line %d: %s" % (filename, i+1, row)) 201 docname, lineno, issue, text = row 202 if lineno: 203 lineno = int(lineno) 204 else: 205 lineno = None 206 if not py3: 207 docname = docname.decode('utf-8') 208 issue = issue.decode('utf-8') 209 text = text.decode('utf-8') 210 rule = Rule(docname, lineno, issue, text) 211 rules.append(rule) 212 f.close() 213 self.logger.info('done, %d rules loaded' % len(self.rules)) 214 215 216def get_lineno(node): 217 """Obtain line number information for a node.""" 218 lineno = None 219 while lineno is None and node: 220 node = node.parent 221 lineno = node.line 222 return lineno 223 224 225def extract_line(text, index): 226 """text may be a multiline string; extract 227 only the line containing the given character index. 228 229 >>> extract_line("abc\ndefgh\ni", 6) 230 >>> 'defgh' 231 >>> for i in (0, 2, 3, 4, 10): 232 ... print extract_line("abc\ndefgh\ni", i) 233 abc 234 abc 235 abc 236 defgh 237 defgh 238 i 239 """ 240 p = text.rfind('\n', 0, index) + 1 241 q = text.find('\n', index) 242 if q < 0: 243 q = len(text) 244 return text[p:q] 245 246 247class SuspiciousVisitor(nodes.GenericNodeVisitor): 248 249 lastlineno = 0 250 251 def __init__(self, document, builder): 252 nodes.GenericNodeVisitor.__init__(self, document) 253 self.builder = builder 254 255 def default_visit(self, node): 256 if isinstance(node, (nodes.Text, nodes.image)): # direct text containers 257 text = node.astext() 258 # lineno seems to go backwards sometimes (?) 259 self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno) 260 seen = set() # don't report the same issue more than only once per line 261 for match in detect_all(text): 262 issue = match.group() 263 line = extract_line(text, match.start()) 264 if (issue, line) not in seen: 265 self.builder.check_issue(line, lineno, issue) 266 seen.add((issue, line)) 267 268 unknown_visit = default_visit 269 270 def visit_document(self, node): 271 self.lastlineno = 0 272 273 def visit_comment(self, node): 274 # ignore comments -- too much false positives. 275 # (although doing this could miss some errors; 276 # there were two sections "commented-out" by mistake 277 # in the Python docs that would not be caught) 278 raise nodes.SkipNode 279