• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
import os
import random
import xml.etree.ElementTree as ET
4
def generate_random_xml(depth=0, max_depth=3, num_attributes=2, max_text_length=20):
    """
    Recursively build a random XML element tree.

    Every element gets a tag of the form ``element_<1..100>``, a random set
    of ``attr_<1..10>`` attributes, random text content and, while ``depth``
    is below ``max_depth``, between 0 and 3 children built the same way.

    Args:
        depth: Current nesting depth (0 for the root element).
        max_depth: Depth at which elements stop receiving children.
        num_attributes: Upper bound on the attributes per element. The count
            is drawn from ``0..num_attributes`` and may end up lower still,
            because a repeated attribute name overwrites the earlier value.
        max_text_length: Maximum length of an element's text content.

    Returns:
        ET.Element: The generated XML element.

    Raises:
        ValueError: If ``num_attributes`` or ``max_text_length`` is negative
            (``random.randint`` rejects the resulting empty range).
    """
    element = ET.Element(f"element_{random.randint(1, 100)}")

    # Attribute values are limited to lowercase letters and spaces.
    for _ in range(random.randint(0, num_attributes)):
        attr_name = f"attr_{random.randint(1, 10)}"
        value_length = random.randint(0, 15)
        attr_value = "".join(random.choices("abcdefghijklmnopqrstuvwxyz ", k=value_length))
        element.set(attr_name, attr_value)

    # The text alphabet includes XML-special characters (< > & " ');
    # ElementTree escapes them when the tree is serialized.
    text_length = random.randint(0, max_text_length)
    element.text = "".join(random.choices("abcdefghijklmnopqrstuvwxyz  <>&\"'", k=text_length))

    # Recurse only while below the depth limit so the tree is always finite.
    if depth < max_depth:
        for _ in range(random.randint(0, 3)):
            child = generate_random_xml(depth + 1, max_depth, num_attributes, max_text_length)
            element.append(child)

    return element
40
41
if __name__ == "__main__":
    # Script entry point: write a corpus of randomly generated XML files.
    num_files = 100  # Number of XML files to generate
    corpus_dir = "xml_corpus"  # Directory to store the generated files

    # exist_ok=True lets the script be re-run against an existing corpus
    # directory; files with the same name are overwritten.
    os.makedirs(corpus_dir, exist_ok=True)

    for i in range(num_files):
        root_element = generate_random_xml()
        tree = ET.ElementTree(root_element)

        # Add corruptions (optional)
        # TODO: the corruption type is selected but not applied yet, so every
        # file is currently written as well-formed XML.
        if random.random() < 0.2:  # 20% chance of introducing corruption
            corruption_type = random.choice(["missing_end_tag", "invalid_attribute", "unescaped_chars"])
            # ... add logic to introduce the specific corruption type ...

        # Save to file
        filename = os.path.join(corpus_dir, f"sample_{i}.xml")
        tree.write(filename, encoding="utf-8", xml_declaration=True)
60