from __future__ import annotations
import collections
import dataclasses as dc
import re
import shlex
from typing import Any

import libclinic
from libclinic import fail, ClinicError
from libclinic.language import Language
from libclinic.function import (
    Module, Class, Function)


@dc.dataclass(slots=True, repr=False)
class Block:
    r"""
    Represents a single block of text embedded in
    another file.  If dsl_name is None, the block represents
    verbatim text, raw original text from the file, in
    which case "input" will be the only non-false member.
    If dsl_name is not None, the block represents a Clinic
    block.

    input is always str, with embedded \n characters.
    input represents the original text from the file;
    if it's a Clinic block, it is the original text with
    the body_prefix and redundant leading whitespace removed.

    dsl_name is either str or None.  If str, it's the text
    found on the start line of the block between the square
    brackets.

    signatures is a list.
    It may only contain clinic.Module, clinic.Class, and
    clinic.Function objects.  At the moment it should
    contain at most one of each.

    output is either str or None.  If str, it's the output
    from this block, with embedded '\n' characters.

    indent is a str.  It's the leading whitespace
    that was found on every line of input.  (If body_prefix is
    not empty, this is the indent *after* removing the
    body_prefix.)

    "indent" is different from the concept of "preindent"
    (which is not stored as state on Block objects).
    "preindent" is the whitespace that
    was found in front of every line of input *before* the
    "body_prefix" (see the Language object).  If body_prefix
    is empty, preindent must always be empty too.

    To illustrate the difference between "indent" and "preindent":

    Assume that '_' represents whitespace.
    If the block processed was in a Python file, and looked like this:
      ____#/*[python]
      ____#__for a in range(20):
      ____#____print(a)
      ____#[python]*/
    "preindent" would be "____" and "indent" would be "__".

    """
    input: str
    dsl_name: str | None = None
    signatures: list[Module | Class | Function] = dc.field(default_factory=list)
    output: Any = None  # TODO: Very dynamic; probably untypeable in its current form?
    indent: str = ''

    def __repr__(self) -> str:
        """
        Return a compact debugging representation of the block.

        Shows the DSL name ("text" for verbatim blocks) and
        abbreviated reprs of the input and output.
        """
        dsl_name = self.dsl_name or "text"

        def summarize(s: object) -> str:
            # Cap long reprs: keep the first 26 characters, add an
            # ellipsis, then repeat the repr's first character (the
            # opening quote for a str) so the summary looks closed.
            s = repr(s)
            if len(s) > 30:
                return s[:26] + "..." + s[0]
            return s

        parts = (
            repr(dsl_name),
            f"input={summarize(self.input)}",
            f"output={summarize(self.output)}"
        )
        return f"<clinic.Block {' '.join(parts)}>"


class BlockParser:
    """
    Block-oriented parser for Argument Clinic.
    Iterator, yields Block objects.
    """

    def __init__(
            self,
            input: str,
            language: Language,
            *,
            verify: bool = True
    ) -> None:
        r"""
        "input" should be a str object
        with embedded \n characters.

        "language" should be a Language object.

        "verify" controls whether the checksum found on a block's
        checksum line is compared against the checksum computed
        from the block's existing output.
        """
        language.validate()

        # Lines are stored reversed so that the next line to read is
        # always at the right-hand end: pop() takes it, and extend()
        # can push lines back for re-reading.
        self.input = collections.deque(reversed(input.splitlines(keepends=True)))
        self.block_start_line_number = self.line_number = 0

        self.language = language
        before, sep, after = language.start_line.partition('{dsl_name}')
        assert sep == '{dsl_name}'
        self.find_start_re = libclinic.create_regex(before, after,
                                                    whole_line=False)
        self.start_re = libclinic.create_regex(before, after)
        self.verify = verify
        # Cache of the checksum regex built for the most recent DSL name.
        self.last_checksum_re: re.Pattern[str] | None = None
        self.last_dsl_name: str | None = None
        # Set when a start line has been consumed and the Clinic block
        # it introduces still has to be parsed.
        self.dsl_name: str | None = None
        self.first_block = True

    def __iter__(self) -> BlockParser:
        """Return self; the parser is its own iterator."""
        return self

    def __next__(self) -> Block:
        """
        Return the next Block, alternating between verbatim text and
        Clinic blocks as start lines are encountered.

        Iteration stops as soon as no input lines remain.  A
        ClinicError raised while parsing a Clinic block is annotated
        with the language's filename and the current line number
        before being re-raised.
        """
        while True:
            if not self.input:
                raise StopIteration

            if self.dsl_name:
                try:
                    return_value = self.parse_clinic_block(self.dsl_name)
                except ClinicError as exc:
                    exc.filename = self.language.filename
                    exc.lineno = self.line_number
                    raise
                self.dsl_name = None
                self.first_block = False
                return return_value
            block = self.parse_verbatim_block()
            # Don't emit an empty verbatim block when the input opens
            # directly with a start line.
            if self.first_block and not block.input:
                continue
            self.first_block = False
            return block

    def is_start_line(self, line: str) -> str | None:
        """
        Return the DSL name if "line" (ignoring leading whitespace)
        matches the language's start line, otherwise None.
        """
        match = self.start_re.match(line.lstrip())
        return match.group(1) if match else None

    def _line(self, lookahead: bool = False) -> str:
        """
        Consume and return the next input line, advancing line_number.

        Unless "lookahead" is true, the line is also passed to
        language.parse_line().
        """
        self.line_number += 1
        line = self.input.pop()
        if not lookahead:
            self.language.parse_line(line)
        return line

    def parse_verbatim_block(self) -> Block:
        """
        Collect raw text up to the next start line (or end of input)
        and return it as a verbatim Block.

        The start line itself is consumed but not included; its DSL
        name is stored in self.dsl_name for the following call to
        parse_clinic_block().
        """
        lines = []
        self.block_start_line_number = self.line_number

        while self.input:
            line = self._line()
            dsl_name = self.is_start_line(line)
            if dsl_name:
                self.dsl_name = dsl_name
                break
            lines.append(line)

        return Block("".join(lines))

    def parse_clinic_block(self, dsl_name: str) -> Block:
        """
        Parse the Clinic block whose start line has just been consumed.

        Reads the block body up to the stop line (or a new start
        line), then scans forward for the checksum line.  If one is
        found, the lines in between become the block's output and,
        when self.verify is true, their checksum is checked against
        the one recorded on the checksum line.  If none is found, the
        scanned lines are pushed back onto the input and the block's
        output is None.

        Malformed stop or marker lines and checksum mismatches are
        reported through fail().
        """
        in_lines = []
        self.block_start_line_number = self.line_number + 1
        stop_line = self.language.stop_line.format(dsl_name=dsl_name)
        body_prefix = self.language.body_prefix.format(dsl_name=dsl_name)

        def is_stop_line(line: str) -> bool:
            # make sure to recognize stop line even if it
            # doesn't end with EOL (it could be the very end of the file)
            if line.startswith(stop_line):
                remainder = line.removeprefix(stop_line)
                if remainder and not remainder.isspace():
                    fail(f"Garbage after stop line: {remainder!r}")
                return True
            else:
                # gh-92256: don't allow incorrectly formatted stop lines
                if line.lstrip().startswith(stop_line):
                    fail(f"Whitespace is not allowed before the stop line: {line!r}")
                return False

        # consume body of program
        while self.input:
            line = self._line()
            if is_stop_line(line) or self.is_start_line(line):
                break
            if body_prefix:
                line = line.lstrip()
                assert line.startswith(body_prefix)
                line = line.removeprefix(body_prefix)
            in_lines.append(line)

        # consume output and checksum line, if present.
        # The regex is reused when consecutive blocks share a DSL name.
        if self.last_dsl_name == dsl_name:
            checksum_re = self.last_checksum_re
        else:
            checksum_line = self.language.checksum_line.format(
                dsl_name=dsl_name, arguments='{arguments}')
            before, sep, after = checksum_line.partition('{arguments}')
            assert sep == '{arguments}'
            checksum_re = libclinic.create_regex(before, after, word=False)
            self.last_dsl_name = dsl_name
            self.last_checksum_re = checksum_re
        assert checksum_re is not None

        # scan forward for checksum line
        # (lookahead: these lines may be pushed back and read again,
        # so they are not handed to language.parse_line() here)
        out_lines = []
        arguments = None
        while self.input:
            line = self._line(lookahead=True)
            match = checksum_re.match(line.lstrip())
            arguments = match.group(1) if match else None
            if arguments:
                break
            out_lines.append(line)
            if self.is_start_line(line):
                break

        output: str | None
        output = "".join(out_lines)
        if arguments:
            d = {}
            for field in shlex.split(arguments):
                name, equals, value = field.partition('=')
                if not equals:
                    fail(f"Mangled Argument Clinic marker line: {line!r}")
                d[name.strip()] = value.strip()

            if self.verify:
                # A marker carrying an "input" field keeps its output
                # checksum under "output"; otherwise it is under
                # "checksum".
                key = 'output' if 'input' in d else 'checksum'
                checksum = d.get(key)
                if checksum is None:
                    # The marker matched but lacks the checksum field:
                    # report it like any other malformed marker rather
                    # than leaking a KeyError.
                    fail(f"Mangled Argument Clinic marker line: {line!r}")

                computed = libclinic.compute_checksum(output, len(checksum))
                if checksum != computed:
                    fail("Checksum mismatch! "
                         f"Expected {checksum!r}, computed {computed!r}. "
                         "Suggested fix: remove all generated code including "
                         "the end marker, or use the '-f' option.")
        else:
            # put back output
            output_lines = output.splitlines(keepends=True)
            self.line_number -= len(output_lines)
            self.input.extend(reversed(output_lines))
            output = None

        return Block("".join(in_lines), dsl_name, output=output)