• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# coding=utf-8
2#
3# Copyright (c) 2025 Huawei Device Co., Ltd.
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15
16"""Creates a unique name.
17
18The mangling scheme preserves information about original underscore positions in identifiers
19while creating a flat namespace. This is useful for languages that allow underscores in
20package names but need to generate unique identifiers for linking or bytecode generation.
21
22Algorithm:
231. Join all segments with underscore separator
242. Record positions of "special" underscores (those in original identifiers)
253. Encode special positions using ULEB8 compression
264. Append kind marker and encoded positions as suffix
27
28Input:
29    pkg_segments: package/name segments (e.g. ['a', 'b_c', 'd', 'e_f_g']).
30    Last segment is assumed to be the identifier name
31
32Returns: Mangled name that preserves underscore position information
33Format: {joined_segments}_{kind}{encoded_positions}
34
35Example:
36    Input:
37        pkg_segments = ['a', 'b_c', 'd', 'e_f_g']
38        kind = 'f' (function)
39
40    Mangling steps:
41    1. Join segments:      'a_b_c_d_e_f_g'
42       Underscore positions: 0 1 2 3 4 5
43
44    2. Record special underscore positions:
45        - 'b_c'   -> position 1
46        - 'e_f_g' -> positions 4,5
47        special_positions = [1, 4, 5]
48
49    3. Convert to position deltas:
50        [1, 4-1=3, 5-4=1] = [1, 3, 1]
51
52    4. Encode as ULEB8:
53        [0b0001, 0b0011, 0b0001] = 0x131
54
55    5. Add kind marker and suffix:
56        Final result = 'a_b_c_d_e_f_g_f131'
57
58Note:
59    - Position encoding uses ULEB128 for compact representation
60    - Kind marker is a single character indicating symbol type:
61    - Decoder can reconstruct original identifier structure using
62        encoded position information
63"""
64
65from enum import Enum
66
67
68class DeclKind(Enum):
69    FUNC = "f"
70    TYPE = "t"
71    IID = "i"
72    UNION = "union"
73    FTABLE = "ftable"
74    VTABLE = "vtable"
75    DYNAMIC_CAST = "dynamic"
76    STATIC_CAST = "static"
77
78
79def _encode_uleb8(value: int) -> list[int]:
80    """Encodes a single integer in ULEB8 format.
81
82    Args:
83        value: Non-negative integer to encode
84
85    Returns:
86        List of bytes representing the ULEB8 encoding
87
88    Raises:
89        ValueError: If value is negative
90    """
91    if value < 0:
92        raise ValueError("ULEB8 encoding requires non-negative values")
93
94    result = []
95    while True:
96        byte = value & 0x0F
97        value >>= 4
98        if value:
99            byte |= 0x10  # Set continuation bit
100        result.append(byte)
101        if not value:
102            break
103    return result
104
105
106def _decode_uleb8(encoded: str, start_pos: int) -> tuple[int, int]:
107    """Decodes a single ULEB8 value from a hex string.
108
109    Args:
110        encoded: Hex string containing ULEB8 encoded values
111        start_pos: Starting position in the hex string
112
113    Returns:
114        Tuple of (decoded value, number of bytes consumed)
115
116    Raises:
117        ValueError: If invalid hex characters are encountered
118    """
119    value = 0
120    shift = 0
121    pos = start_pos
122
123    while pos < len(encoded):
124        try:
125            byte = int(encoded[pos], 16)
126        except ValueError:
127            raise ValueError(f"Invalid hex character at position {pos}") from None
128
129        value |= (byte & 0x0F) << shift
130        pos += 1
131        if not (byte & 0x10):
132            break
133        shift += 4
134
135    return value, pos - start_pos
136
137
138def encode(segments: list[str], kind: DeclKind) -> str:
139    """Encodes segments into a mangled name.
140
141    Args:
142        segments: List of name segments
143        kind: Single character kind marker
144
145    Returns:
146        Mangled name string
147
148    Raises:
149        ValueError: If segments is empty or kind is invalid
150    """
151    if not segments:
152        raise ValueError("segments list cannot be empty")
153
154    # Find positions of "special" underscores (those within segments)
155    special_positions = []
156    total_underscores = 0
157
158    for segment in segments:
159        for char in segment:
160            if char == "_":
161                special_positions.append(total_underscores)
162                total_underscores += 1
163        total_underscores += 1  # For segment separator
164
165    # Convert to position deltas
166    deltas = []
167    if special_positions:
168        deltas.append(special_positions[0])  # First position is absolute
169        for i in range(1, len(special_positions)):
170            deltas.append(special_positions[i] - special_positions[i - 1])
171
172    # Encode positions
173    encoded_bytes = []
174    for delta in deltas:
175        encoded_bytes.extend(_encode_uleb8(delta))
176
177    # Convert to hex string
178    encoded_str = "".join(f"{byte:x}" for byte in encoded_bytes)
179
180    base = "_".join(segments)
181    return f"{base}_{kind.value}{encoded_str}"
182
183
184def decode(mangled_name: str) -> tuple[list[str], DeclKind]:
185    """Decodes a mangled name back into original segments.
186
187    Args:
188        mangled_name: The mangled name to decode
189
190    Returns:
191        List of original name segments
192
193    Raises:
194        ValueError: If mangled_name format is invalid
195    """
196    try:
197        last_underscore = mangled_name.rindex("_")
198    except ValueError:
199        raise ValueError("Invalid mangled name: missing underscore separator") from None
200
201    if last_underscore + 2 >= len(mangled_name):
202        raise ValueError("Invalid mangled name: missing kind marker or encoding")
203
204    base_name = mangled_name[:last_underscore]
205    kind = DeclKind(mangled_name[last_underscore + 1])
206    encoding = mangled_name[last_underscore + 2 :]
207
208    # Decode special underscore positions
209    special_positions = []
210    pos = 0
211
212    try:
213        while pos < len(encoding):
214            delta, consumed = _decode_uleb8(encoding, pos)
215            if pos == 0:
216                special_positions.append(delta)
217            else:
218                special_positions.append(special_positions[-1] + delta)
219            pos += consumed
220    except ValueError as e:
221        raise ValueError(f"Failed to decode positions: {e!s}") from None
222
223    # Split and reconstruct segments
224    parts = base_name.split("_")
225    if not parts:
226        raise ValueError("Invalid mangled name: empty base name") from None
227
228    segments = []
229    current_parts = [parts[0]]
230    underscore_pos = 0
231
232    for underscore_pos in range(1, len(parts)):
233        if underscore_pos - 1 in special_positions:
234            current_parts.append(parts[underscore_pos])
235        else:
236            segments.append("_".join(current_parts))
237            current_parts = [parts[underscore_pos]]
238
239    if current_parts:
240        segments.append("_".join(current_parts))
241
242    return segments, kind