"""Generate a corpus of random XML files, a fraction of them deliberately malformed."""

import xml.etree.ElementTree as ET
import random
import os

# Alphabets used for random attribute values and element text.  The text
# alphabet deliberately contains XML-special characters so that the
# serializer's escaping is exercised.
ATTRIBUTE_ALPHABET = "abcdefghijklmnopqrstuvwxyz "
TEXT_ALPHABET = "abcdefghijklmnopqrstuvwxyz <>&\"'"

# Kinds of damage that apply_corruption() knows how to introduce.
CORRUPTION_TYPES = ("missing_end_tag", "invalid_attribute", "unescaped_chars")

# Same declaration ElementTree.write() emits for encoding="utf-8", so that
# corrupted and valid files share an identical header.
XML_DECLARATION = "<?xml version='1.0' encoding='utf-8'?>\n"


def generate_random_xml(depth=0, max_depth=3, num_attributes=2, max_text_length=20):
    """
    Recursively generates a random XML structure.

    Args:
        depth: Current nesting depth.
        max_depth: Maximum allowed nesting depth. Elements created at this
            depth receive no children.
        num_attributes: Upper bound on the number of attributes per element.
            Between 0 and ``num_attributes`` names are drawn; a repeated name
            overwrites the earlier value, so fewer may remain.
        max_text_length: Maximum length for text content in elements.

    Returns:
        ET.Element: The generated XML element.
    """
    tag_name = f"element_{random.randint(1, 100)}"
    element = ET.Element(tag_name)

    # Add attributes
    for _ in range(random.randint(0, num_attributes)):
        attr_name = f"attr_{random.randint(1, 10)}"
        attr_value = "".join(
            random.choices(ATTRIBUTE_ALPHABET, k=random.randint(0, 15))
        )
        element.set(attr_name, attr_value)

    # Add text content (may be the empty string)
    element.text = "".join(
        random.choices(TEXT_ALPHABET, k=random.randint(0, max_text_length))
    )

    # Recursively add child elements
    if depth < max_depth:
        for _ in range(random.randint(0, 3)):
            element.append(
                generate_random_xml(depth + 1, max_depth, num_attributes, max_text_length)
            )

    return element


def apply_corruption(xml_text, corruption_type):
    """
    Return a copy of ``xml_text`` that is no longer well-formed XML.

    Args:
        xml_text: A serialized element that begins with its own start tag
            (no XML declaration or leading whitespace), e.g. the result of
            ``ET.tostring(element, encoding="unicode")``.
        corruption_type: One of ``CORRUPTION_TYPES``.

    Returns:
        str: The corrupted document text.

    Raises:
        ValueError: If ``corruption_type`` is not a known corruption.
    """
    if corruption_type == "missing_end_tag":
        last_close = xml_text.rfind("</")
        if last_close != -1:
            # Drop the final closing tag, leaving its element open.
            return xml_text[:last_close]
        self_close = xml_text.rfind("/>")
        if self_close != -1:
            # The root is self-closing: turn it into an unclosed start tag.
            return xml_text[:self_close].rstrip() + ">"
        # Nothing that could be un-closed; open a tag that is never closed.
        return xml_text + "<unclosed>"

    if corruption_type == "invalid_attribute":
        # The tag name ends at the first whitespace, '/' or '>' after '<'.
        name_end = next(
            (i for i, ch in enumerate(xml_text) if i > 0 and ch in " \t\r\n/>"),
            len(xml_text),
        )
        # An attribute value without quotes is rejected by XML parsers.
        return xml_text[:name_end] + " broken_attr=unquoted" + xml_text[name_end:]

    if corruption_type == "unescaped_chars":
        # Insert raw markup characters right after the root's start tag.  For
        # a self-closing root they land after the document element instead,
        # which is equally ill-formed.
        insert_at = xml_text.find(">") + 1
        return xml_text[:insert_at] + "&<" + xml_text[insert_at:]

    raise ValueError(f"unknown corruption type: {corruption_type!r}")


def main(num_files=100, corpus_dir="xml_corpus", corruption_rate=0.2):
    """
    Write ``num_files`` random XML documents into ``corpus_dir``.

    Args:
        num_files: Number of XML files to generate.
        corpus_dir: Directory to store the generated files; created if absent.
        corruption_rate: Probability (0.0-1.0) that a given file is written
            in a deliberately malformed state.
    """
    os.makedirs(corpus_dir, exist_ok=True)

    for i in range(num_files):
        root_element = generate_random_xml()
        filename = os.path.join(corpus_dir, f"sample_{i}.xml")

        if random.random() < corruption_rate:
            corruption_type = random.choice(CORRUPTION_TYPES)
            body = ET.tostring(root_element, encoding="unicode")
            # ElementTree cannot serialize malformed XML, so the damaged text
            # is written by hand.  newline="\n" keeps the bytes identical
            # across platforms, matching ElementTree's binary write.
            with open(filename, "w", encoding="utf-8", newline="\n") as handle:
                handle.write(XML_DECLARATION + apply_corruption(body, corruption_type))
        else:
            tree = ET.ElementTree(root_element)
            tree.write(filename, encoding="utf-8", xml_declaration=True)


if __name__ == "__main__":
    main()