1# coding=utf-8 2# 3# Copyright (c) 2025 Huawei Device Co., Ltd. 4# Licensed under the Apache License, Version 2.0 (the "License"); 5# you may not use this file except in compliance with the License. 6# You may obtain a copy of the License at 7# 8# http://www.apache.org/licenses/LICENSE-2.0 9# 10# Unless required by applicable law or agreed to in writing, software 11# distributed under the License is distributed on an "AS IS" BASIS, 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13# See the License for the specific language governing permissions and 14# limitations under the License. 15 16"""Creates a unique name. 17 18The mangling scheme preserves information about original underscore positions in identifiers 19while creating a flat namespace. This is useful for languages that allow underscores in 20package names but need to generate unique identifiers for linking or bytecode generation. 21 22Algorithm: 231. Join all segments with underscore separator 242. Record positions of "special" underscores (those in original identifiers) 253. Encode special positions using ULEB8 compression 264. Append kind marker and encoded positions as suffix 27 28Input: 29 pkg_segments: package/name segments (e.g. ['a', 'b_c', 'd', 'e_f_g']). 30 Last segment is assumed to be the identifier name 31 32Returns: Mangled name that preserves underscore position information 33Format: {joined_segments}_{kind}{encoded_positions} 34 35Example: 36 Input: 37 pkg_segments = ['a', 'b_c', 'd', 'e_f_g'] 38 kind = 'f' (function) 39 40 Mangling steps: 41 1. Join segments: 'a_b_c_d_e_f_g' 42 Underscore positions: 0 1 2 3 4 5 43 44 2. Record special underscore positions: 45 - 'b_c' -> position 1 46 - 'e_f_g' -> positions 4,5 47 special_positions = [1, 4, 5] 48 49 3. Convert to position deltas: 50 [1, 4-1=3, 5-4=1] = [1, 3, 1] 51 52 4. Encode as ULEB8: 53 [0b0001, 0b0011, 0b0001] = 0x131 54 55 5. Add kind marker and suffix: 56 Final result = 'a_b_c_d_e_f_g_f131' 57 58Note: 59 - Position encoding uses ULEB128 for compact representation 60 - Kind marker is a single character indicating symbol type: 61 - Decoder can reconstruct original identifier structure using 62 encoded position information 63""" 64 65from enum import Enum 66 67 68class DeclKind(Enum): 69 FUNC = "f" 70 TYPE = "t" 71 IID = "i" 72 UNION = "union" 73 FTABLE = "ftable" 74 VTABLE = "vtable" 75 DYNAMIC_CAST = "dynamic" 76 STATIC_CAST = "static" 77 78 79def _encode_uleb8(value: int) -> list[int]: 80 """Encodes a single integer in ULEB8 format. 81 82 Args: 83 value: Non-negative integer to encode 84 85 Returns: 86 List of bytes representing the ULEB8 encoding 87 88 Raises: 89 ValueError: If value is negative 90 """ 91 if value < 0: 92 raise ValueError("ULEB8 encoding requires non-negative values") 93 94 result = [] 95 while True: 96 byte = value & 0x0F 97 value >>= 4 98 if value: 99 byte |= 0x10 # Set continuation bit 100 result.append(byte) 101 if not value: 102 break 103 return result 104 105 106def _decode_uleb8(encoded: str, start_pos: int) -> tuple[int, int]: 107 """Decodes a single ULEB8 value from a hex string. 108 109 Args: 110 encoded: Hex string containing ULEB8 encoded values 111 start_pos: Starting position in the hex string 112 113 Returns: 114 Tuple of (decoded value, number of bytes consumed) 115 116 Raises: 117 ValueError: If invalid hex characters are encountered 118 """ 119 value = 0 120 shift = 0 121 pos = start_pos 122 123 while pos < len(encoded): 124 try: 125 byte = int(encoded[pos], 16) 126 except ValueError: 127 raise ValueError(f"Invalid hex character at position {pos}") from None 128 129 value |= (byte & 0x0F) << shift 130 pos += 1 131 if not (byte & 0x10): 132 break 133 shift += 4 134 135 return value, pos - start_pos 136 137 138def encode(segments: list[str], kind: DeclKind) -> str: 139 """Encodes segments into a mangled name. 140 141 Args: 142 segments: List of name segments 143 kind: Single character kind marker 144 145 Returns: 146 Mangled name string 147 148 Raises: 149 ValueError: If segments is empty or kind is invalid 150 """ 151 if not segments: 152 raise ValueError("segments list cannot be empty") 153 154 # Find positions of "special" underscores (those within segments) 155 special_positions = [] 156 total_underscores = 0 157 158 for segment in segments: 159 for char in segment: 160 if char == "_": 161 special_positions.append(total_underscores) 162 total_underscores += 1 163 total_underscores += 1 # For segment separator 164 165 # Convert to position deltas 166 deltas = [] 167 if special_positions: 168 deltas.append(special_positions[0]) # First position is absolute 169 for i in range(1, len(special_positions)): 170 deltas.append(special_positions[i] - special_positions[i - 1]) 171 172 # Encode positions 173 encoded_bytes = [] 174 for delta in deltas: 175 encoded_bytes.extend(_encode_uleb8(delta)) 176 177 # Convert to hex string 178 encoded_str = "".join(f"{byte:x}" for byte in encoded_bytes) 179 180 base = "_".join(segments) 181 return f"{base}_{kind.value}{encoded_str}" 182 183 184def decode(mangled_name: str) -> tuple[list[str], DeclKind]: 185 """Decodes a mangled name back into original segments. 186 187 Args: 188 mangled_name: The mangled name to decode 189 190 Returns: 191 List of original name segments 192 193 Raises: 194 ValueError: If mangled_name format is invalid 195 """ 196 try: 197 last_underscore = mangled_name.rindex("_") 198 except ValueError: 199 raise ValueError("Invalid mangled name: missing underscore separator") from None 200 201 if last_underscore + 2 >= len(mangled_name): 202 raise ValueError("Invalid mangled name: missing kind marker or encoding") 203 204 base_name = mangled_name[:last_underscore] 205 kind = DeclKind(mangled_name[last_underscore + 1]) 206 encoding = mangled_name[last_underscore + 2 :] 207 208 # Decode special underscore positions 209 special_positions = [] 210 pos = 0 211 212 try: 213 while pos < len(encoding): 214 delta, consumed = _decode_uleb8(encoding, pos) 215 if pos == 0: 216 special_positions.append(delta) 217 else: 218 special_positions.append(special_positions[-1] + delta) 219 pos += consumed 220 except ValueError as e: 221 raise ValueError(f"Failed to decode positions: {e!s}") from None 222 223 # Split and reconstruct segments 224 parts = base_name.split("_") 225 if not parts: 226 raise ValueError("Invalid mangled name: empty base name") from None 227 228 segments = [] 229 current_parts = [parts[0]] 230 underscore_pos = 0 231 232 for underscore_pos in range(1, len(parts)): 233 if underscore_pos - 1 in special_positions: 234 current_parts.append(parts[underscore_pos]) 235 else: 236 segments.append("_".join(current_parts)) 237 current_parts = [parts[underscore_pos]] 238 239 if current_parts: 240 segments.append("_".join(current_parts)) 241 242 return segments, kind