1 // Copyright 2024 The Abseil Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/debugging/internal/demangle_rust.h"
16
17 #include <cstddef>
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21
22 #include "absl/base/attributes.h"
23 #include "absl/base/config.h"
24
25 namespace absl {
26 ABSL_NAMESPACE_BEGIN
27 namespace debugging_internal {
28
29 namespace {
30
31 // Same step limit as the C++ demangler in demangle.cc uses.
32 constexpr int kMaxReturns = 1 << 17;
33
IsDigit(char c)34 bool IsDigit(char c) { return '0' <= c && c <= '9'; }
IsLower(char c)35 bool IsLower(char c) { return 'a' <= c && c <= 'z'; }
IsUpper(char c)36 bool IsUpper(char c) { return 'A' <= c && c <= 'Z'; }
IsAlpha(char c)37 bool IsAlpha(char c) { return IsLower(c) || IsUpper(c); }
IsIdentifierChar(char c)38 bool IsIdentifierChar(char c) { return IsAlpha(c) || IsDigit(c) || c == '_'; }
IsLowerHexDigit(char c)39 bool IsLowerHexDigit(char c) { return IsDigit(c) || ('a' <= c && c <= 'f'); }
40
BasicTypeName(char c)41 const char* BasicTypeName(char c) {
42 switch (c) {
43 case 'a': return "i8";
44 case 'b': return "bool";
45 case 'c': return "char";
46 case 'd': return "f64";
47 case 'e': return "str";
48 case 'f': return "f32";
49 case 'h': return "u8";
50 case 'i': return "isize";
51 case 'j': return "usize";
52 case 'l': return "i32";
53 case 'm': return "u32";
54 case 'n': return "i128";
55 case 'o': return "u128";
56 case 'p': return "_";
57 case 's': return "i16";
58 case 't': return "u16";
59 case 'u': return "()";
60 case 'v': return "...";
61 case 'x': return "i64";
62 case 'y': return "u64";
63 case 'z': return "!";
64 }
65 return nullptr;
66 }
67
68 // Parser for Rust symbol mangling v0, whose grammar is defined here:
69 //
70 // https://doc.rust-lang.org/rustc/symbol-mangling/v0.html#symbol-grammar-summary
71 class RustSymbolParser {
72 public:
73 // Prepares to demangle the given encoding, a Rust symbol name starting with
74 // _R, into the output buffer [out, out_end). The caller is expected to
75 // continue by calling the new object's Parse function.
RustSymbolParser(const char * encoding,char * out,char * const out_end)76 RustSymbolParser(const char* encoding, char* out, char* const out_end)
77 : encoding_(encoding), out_(out), out_end_(out_end) {
78 if (out_ != out_end_) *out_ = '\0';
79 }
80
81 // Parses the constructor's encoding argument, writing output into the range
82 // [out, out_end). Returns true on success and false for input whose
83 // structure was not recognized or exceeded implementation limits, such as by
84 // nesting structures too deep. In either case *this should not be used
85 // again.
Parse()86 ABSL_MUST_USE_RESULT bool Parse() && {
87 // Recursively parses the grammar production named by callee, then resumes
88 // execution at the next statement.
89 //
90 // Recursive-descent parsing is a beautifully readable translation of a
91 // grammar, but it risks stack overflow if implemented by naive recursion on
92 // the C++ call stack. So we simulate recursion by goto and switch instead,
93 // keeping a bounded stack of "return addresses" in the recursion_stack_
94 // member.
95 //
96 // The callee argument is a statement label. We goto that label after
97 // saving the "return address" on recursion_stack_. The next continue
98 // statement in the for loop below "returns" from this "call".
99 //
100 // The caller argument names the return point. Each value of caller must
101 // appear in only one ABSL_DEMANGLER_RECURSE call and be listed in the
102 // definition of enum ReturnAddress. The switch implements the control
103 // transfer from the end of a "called" subroutine back to the statement
104 // after the "call".
105 //
106 // Note that not all the grammar productions have to be packed into the
107 // switch, but only those which appear in a cycle in the grammar. Anything
108 // acyclic can be written as ordinary functions and function calls, e.g.,
109 // ParseIdentifier.
110 #define ABSL_DEMANGLER_RECURSE(callee, caller) \
111 do { \
112 if (recursion_depth_ == kStackSize) return false; \
113 /* The next continue will switch on this saved value ... */ \
114 recursion_stack_[recursion_depth_++] = caller; \
115 goto callee; \
116 /* ... and will land here, resuming the suspended code. */ \
117 case caller: {} \
118 } while (0)
119
120 // Parse the encoding, counting completed recursive calls to guard against
121 // excessively complex input and infinite-loop bugs.
122 int iter = 0;
123 goto whole_encoding;
124 for (; iter < kMaxReturns && recursion_depth_ > 0; ++iter) {
125 // This switch resumes the code path most recently suspended by
126 // ABSL_DEMANGLER_RECURSE.
127 switch (recursion_stack_[--recursion_depth_]) {
128 //
129 // symbol-name ->
130 // _R decimal-number? path instantiating-crate? vendor-specific-suffix?
131 whole_encoding:
132 if (!Eat('_') || !Eat('R')) return false;
133 // decimal-number? is always empty today, so proceed to path, which
134 // can't start with a decimal digit.
135 ABSL_DEMANGLER_RECURSE(path, kInstantiatingCrate);
136 if (IsAlpha(Peek())) {
137 ++silence_depth_; // Print nothing more from here on.
138 ABSL_DEMANGLER_RECURSE(path, kVendorSpecificSuffix);
139 }
140 switch (Take()) {
141 case '.': case '$': case '\0': return true;
142 }
143 return false; // unexpected trailing content
144
145 // path -> crate-root | inherent-impl | trait-impl | trait-definition |
146 // nested-path | generic-args | backref
147 //
148 // Note that ABSL_DEMANGLER_RECURSE does not work inside a nested switch
149 // (which would hide the generated case label). Thus we jump out of the
150 // inner switch with gotos before performing any fake recursion.
151 path:
152 switch (Take()) {
153 case 'C': goto crate_root;
154 case 'M': goto inherent_impl;
155 case 'X': goto trait_impl;
156 case 'Y': goto trait_definition;
157 case 'N': goto nested_path;
158 case 'I': goto generic_args;
159 case 'B': goto path_backref;
160 default: return false;
161 }
162
163 // crate-root -> C identifier (C consumed above)
164 crate_root:
165 if (!ParseIdentifier()) return false;
166 continue;
167
168 // inherent-impl -> M impl-path type (M already consumed)
169 inherent_impl:
170 if (!Emit("<")) return false;
171 ABSL_DEMANGLER_RECURSE(impl_path, kInherentImplType);
172 ABSL_DEMANGLER_RECURSE(type, kInherentImplEnding);
173 if (!Emit(">")) return false;
174 continue;
175
176 // trait-impl -> X impl-path type path (X already consumed)
177 trait_impl:
178 if (!Emit("<")) return false;
179 ABSL_DEMANGLER_RECURSE(impl_path, kTraitImplType);
180 ABSL_DEMANGLER_RECURSE(type, kTraitImplInfix);
181 if (!Emit(" as ")) return false;
182 ABSL_DEMANGLER_RECURSE(path, kTraitImplEnding);
183 if (!Emit(">")) return false;
184 continue;
185
186 // impl-path -> disambiguator? path (but never print it!)
187 impl_path:
188 ++silence_depth_;
189 {
190 int ignored_disambiguator;
191 if (!ParseDisambiguator(ignored_disambiguator)) return false;
192 }
193 ABSL_DEMANGLER_RECURSE(path, kImplPathEnding);
194 --silence_depth_;
195 continue;
196
197 // trait-definition -> Y type path (Y already consumed)
198 trait_definition:
199 if (!Emit("<")) return false;
200 ABSL_DEMANGLER_RECURSE(type, kTraitDefinitionInfix);
201 if (!Emit(" as ")) return false;
202 ABSL_DEMANGLER_RECURSE(path, kTraitDefinitionEnding);
203 if (!Emit(">")) return false;
204 continue;
205
206 // nested-path -> N namespace path identifier (N already consumed)
207 // namespace -> lower | upper
208 nested_path:
209 // Uppercase namespaces must be saved on a stack so we can print
210 // ::{closure#0} or ::{shim:vtable#0} or ::{X:name#0} as needed.
211 if (IsUpper(Peek())) {
212 if (!PushNamespace(Take())) return false;
213 ABSL_DEMANGLER_RECURSE(path, kIdentifierInUppercaseNamespace);
214 if (!Emit("::")) return false;
215 if (!ParseIdentifier(PopNamespace())) return false;
216 continue;
217 }
218
219 // Lowercase namespaces, however, are never represented in the output;
220 // they all emit just ::name.
221 if (IsLower(Take())) {
222 ABSL_DEMANGLER_RECURSE(path, kIdentifierInLowercaseNamespace);
223 if (!Emit("::")) return false;
224 if (!ParseIdentifier()) return false;
225 continue;
226 }
227
228 // Neither upper or lower
229 return false;
230
231 // type -> basic-type | array-type | slice-type | tuple-type |
232 // ref-type | mut-ref-type | const-ptr-type | mut-ptr-type |
233 // fn-type | dyn-trait-type | path | backref
234 //
235 // We use ifs instead of switch (Take()) because the default case jumps
236 // to path, which will need to see the first character not yet Taken
237 // from the input. Because we do not use a nested switch here,
238 // ABSL_DEMANGLER_RECURSE works fine in the 'S' case.
239 type:
240 if (IsLower(Peek())) {
241 const char* type_name = BasicTypeName(Take());
242 if (type_name == nullptr || !Emit(type_name)) return false;
243 continue;
244 }
245 if (Eat('A')) {
246 // array-type = A type const
247 if (!Emit("[")) return false;
248 ABSL_DEMANGLER_RECURSE(type, kArraySize);
249 if (!Emit("; ")) return false;
250 ABSL_DEMANGLER_RECURSE(constant, kFinishArray);
251 if (!Emit("]")) return false;
252 continue;
253 }
254 if (Eat('S')) {
255 if (!Emit("[")) return false;
256 ABSL_DEMANGLER_RECURSE(type, kSliceEnding);
257 if (!Emit("]")) return false;
258 continue;
259 }
260 if (Eat('T')) goto tuple_type;
261 if (Eat('R')) {
262 if (!Emit("&")) return false;
263 if (!ParseOptionalLifetime()) return false;
264 goto type;
265 }
266 if (Eat('Q')) {
267 if (!Emit("&mut ")) return false;
268 if (!ParseOptionalLifetime()) return false;
269 goto type;
270 }
271 if (Eat('P')) {
272 if (!Emit("*const ")) return false;
273 goto type;
274 }
275 if (Eat('O')) {
276 if (!Emit("*mut ")) return false;
277 goto type;
278 }
279 if (Eat('F')) goto fn_type;
280 if (Eat('D')) goto dyn_trait_type;
281 if (Eat('B')) goto type_backref;
282 goto path;
283
284 // tuple-type -> T type* E (T already consumed)
285 tuple_type:
286 if (!Emit("(")) return false;
287
288 // The toolchain should call the unit type u instead of TE, but the
289 // grammar and other demanglers also recognize TE, so we do too.
290 if (Eat('E')) {
291 if (!Emit(")")) return false;
292 continue;
293 }
294
295 // A tuple with one element is rendered (type,) instead of (type).
296 ABSL_DEMANGLER_RECURSE(type, kAfterFirstTupleElement);
297 if (Eat('E')) {
298 if (!Emit(",)")) return false;
299 continue;
300 }
301
302 // A tuple with two elements is of course (x, y).
303 if (!Emit(", ")) return false;
304 ABSL_DEMANGLER_RECURSE(type, kAfterSecondTupleElement);
305 if (Eat('E')) {
306 if (!Emit(")")) return false;
307 continue;
308 }
309
310 // And (x, y, z) for three elements.
311 if (!Emit(", ")) return false;
312 ABSL_DEMANGLER_RECURSE(type, kAfterThirdTupleElement);
313 if (Eat('E')) {
314 if (!Emit(")")) return false;
315 continue;
316 }
317
318 // For longer tuples we write (x, y, z, ...), printing none of the
319 // content of the fourth and later types. Thus we avoid exhausting
320 // output buffers and human readers' patience when some library has a
321 // long tuple as an implementation detail, without having to
322 // completely obfuscate all tuples.
323 if (!Emit(", ...)")) return false;
324 ++silence_depth_;
325 while (!Eat('E')) {
326 ABSL_DEMANGLER_RECURSE(type, kAfterSubsequentTupleElement);
327 }
328 --silence_depth_;
329 continue;
330
331 // fn-type -> F fn-sig (F already consumed)
332 // fn-sig -> binder? U? (K abi)? type* E type
333 // abi -> C | undisambiguated-identifier
334 //
335 // We follow the C++ demangler in suppressing details of function
336 // signatures. Every function type is rendered "fn...".
337 fn_type:
338 if (!Emit("fn...")) return false;
339 ++silence_depth_;
340 if (!ParseOptionalBinder()) return false;
341 (void)Eat('U');
342 if (Eat('K')) {
343 if (!Eat('C') && !ParseUndisambiguatedIdentifier()) return false;
344 }
345 while (!Eat('E')) {
346 ABSL_DEMANGLER_RECURSE(type, kContinueParameterList);
347 }
348 ABSL_DEMANGLER_RECURSE(type, kFinishFn);
349 --silence_depth_;
350 continue;
351
352 // dyn-trait-type -> D dyn-bounds lifetime (D already consumed)
353 // dyn-bounds -> binder? dyn-trait* E
354 //
355 // The grammar strangely allows an empty trait list, even though the
356 // compiler should never output one. We follow existing demanglers in
357 // rendering DEL_ as "dyn ".
358 //
359 // Because auto traits lengthen a type name considerably without
360 // providing much value to a search for related source code, it would be
361 // desirable to abbreviate
362 // dyn main::Trait + std::marker::Copy + std::marker::Send
363 // to
364 // dyn main::Trait + ...,
365 // eliding the auto traits. But it is difficult to do so correctly, in
366 // part because there is no guarantee that the mangling will list the
367 // main trait first. So we just print all the traits in their order of
368 // appearance in the mangled name.
369 dyn_trait_type:
370 if (!Emit("dyn ")) return false;
371 if (!ParseOptionalBinder()) return false;
372 if (!Eat('E')) {
373 ABSL_DEMANGLER_RECURSE(dyn_trait, kBeginAutoTraits);
374 while (!Eat('E')) {
375 if (!Emit(" + ")) return false;
376 ABSL_DEMANGLER_RECURSE(dyn_trait, kContinueAutoTraits);
377 }
378 }
379 if (!ParseRequiredLifetime()) return false;
380 continue;
381
382 // dyn-trait -> path dyn-trait-assoc-binding*
383 // dyn-trait-assoc-binding -> p undisambiguated-identifier type
384 //
385 // We render nonempty binding lists as <>, omitting their contents as
386 // for generic-args.
387 dyn_trait:
388 ABSL_DEMANGLER_RECURSE(path, kContinueDynTrait);
389 if (Peek() == 'p') {
390 if (!Emit("<>")) return false;
391 ++silence_depth_;
392 while (Eat('p')) {
393 if (!ParseUndisambiguatedIdentifier()) return false;
394 ABSL_DEMANGLER_RECURSE(type, kContinueAssocBinding);
395 }
396 --silence_depth_;
397 }
398 continue;
399
400 // const -> type const-data | p | backref
401 //
402 // const is a C++ keyword, so we use the label `constant` instead.
403 constant:
404 if (Eat('B')) goto const_backref;
405 if (Eat('p')) {
406 if (!Emit("_")) return false;
407 continue;
408 }
409
410 // Scan the type without printing it.
411 //
412 // The Rust language restricts the type of a const generic argument
413 // much more than the mangling grammar does. We do not enforce this.
414 //
415 // We also do not bother printing false, true, 'A', and '\u{abcd}' for
416 // the types bool and char. Because we do not print generic-args
417 // contents, we expect to print constants only in array sizes, and
418 // those should not be bool or char.
419 ++silence_depth_;
420 ABSL_DEMANGLER_RECURSE(type, kConstData);
421 --silence_depth_;
422
423 // const-data -> n? hex-digit* _
424 //
425 // Although the grammar doesn't say this, existing demanglers expect
426 // that zero is 0, not an empty digit sequence, and no nonzero value
427 // may have leading zero digits. Also n0_ is accepted and printed as
428 // -0, though a toolchain will probably never write that encoding.
429 if (Eat('n') && !EmitChar('-')) return false;
430 if (!Emit("0x")) return false;
431 if (Eat('0')) {
432 if (!EmitChar('0')) return false;
433 if (!Eat('_')) return false;
434 continue;
435 }
436 while (IsLowerHexDigit(Peek())) {
437 if (!EmitChar(Take())) return false;
438 }
439 if (!Eat('_')) return false;
440 continue;
441
442 // generic-args -> I path generic-arg* E (I already consumed)
443 //
444 // We follow the C++ demangler in omitting all the arguments from the
445 // output, printing only the list opening and closing tokens.
446 generic_args:
447 ABSL_DEMANGLER_RECURSE(path, kBeginGenericArgList);
448 if (!Emit("::<>")) return false;
449 ++silence_depth_;
450 while (!Eat('E')) {
451 ABSL_DEMANGLER_RECURSE(generic_arg, kContinueGenericArgList);
452 }
453 --silence_depth_;
454 continue;
455
456 // generic-arg -> lifetime | type | K const
457 generic_arg:
458 if (Peek() == 'L') {
459 if (!ParseOptionalLifetime()) return false;
460 continue;
461 }
462 if (Eat('K')) goto constant;
463 goto type;
464
465 // backref -> B base-62-number (B already consumed)
466 //
467 // The BeginBackref call parses and range-checks the base-62-number. We
468 // always do that much.
469 //
470 // The recursive call parses and prints what the backref points at. We
471 // save CPU and stack by skipping this work if the output would be
472 // suppressed anyway.
473 path_backref:
474 if (!BeginBackref()) return false;
475 if (silence_depth_ == 0) {
476 ABSL_DEMANGLER_RECURSE(path, kPathBackrefEnding);
477 }
478 EndBackref();
479 continue;
480
481 // This represents the same backref production as in path_backref but
482 // parses the target as a type instead of a path.
483 type_backref:
484 if (!BeginBackref()) return false;
485 if (silence_depth_ == 0) {
486 ABSL_DEMANGLER_RECURSE(type, kTypeBackrefEnding);
487 }
488 EndBackref();
489 continue;
490
491 const_backref:
492 if (!BeginBackref()) return false;
493 if (silence_depth_ == 0) {
494 ABSL_DEMANGLER_RECURSE(constant, kConstantBackrefEnding);
495 }
496 EndBackref();
497 continue;
498 }
499 }
500
501 return false; // hit iteration limit or a bug in our stack handling
502 }
503
504 private:
505 // Enumerates resumption points for ABSL_DEMANGLER_RECURSE calls.
506 enum ReturnAddress : std::uint8_t {
507 kInstantiatingCrate,
508 kVendorSpecificSuffix,
509 kIdentifierInUppercaseNamespace,
510 kIdentifierInLowercaseNamespace,
511 kInherentImplType,
512 kInherentImplEnding,
513 kTraitImplType,
514 kTraitImplInfix,
515 kTraitImplEnding,
516 kImplPathEnding,
517 kTraitDefinitionInfix,
518 kTraitDefinitionEnding,
519 kArraySize,
520 kFinishArray,
521 kSliceEnding,
522 kAfterFirstTupleElement,
523 kAfterSecondTupleElement,
524 kAfterThirdTupleElement,
525 kAfterSubsequentTupleElement,
526 kContinueParameterList,
527 kFinishFn,
528 kBeginAutoTraits,
529 kContinueAutoTraits,
530 kContinueDynTrait,
531 kContinueAssocBinding,
532 kConstData,
533 kBeginGenericArgList,
534 kContinueGenericArgList,
535 kPathBackrefEnding,
536 kTypeBackrefEnding,
537 kConstantBackrefEnding,
538 };
539
540 // Element counts for the stack arrays. Larger stack sizes accommodate more
541 // deeply nested names at the cost of a larger footprint on the C++ call
542 // stack.
543 enum {
544 // Maximum recursive calls outstanding at one time.
545 kStackSize = 256,
546
547 // Maximum N<uppercase> nested-paths open at once. We do not expect
548 // closures inside closures inside closures as much as functions inside
549 // modules inside other modules, so we can use a smaller array here.
550 kNamespaceStackSize = 64,
551
552 // Maximum number of nested backrefs. We can keep this stack pretty small
553 // because we do not follow backrefs inside generic-args or other contexts
554 // that suppress printing, so deep stacking is unlikely in practice.
555 kPositionStackSize = 16,
556 };
557
558 // Returns the next input character without consuming it.
Peek() const559 char Peek() const { return encoding_[pos_]; }
560
561 // Consumes and returns the next input character.
Take()562 char Take() { return encoding_[pos_++]; }
563
564 // If the next input character is the given character, consumes it and returns
565 // true; otherwise returns false without consuming a character.
Eat(char want)566 ABSL_MUST_USE_RESULT bool Eat(char want) {
567 if (encoding_[pos_] != want) return false;
568 ++pos_;
569 return true;
570 }
571
572 // Provided there is enough remaining output space, appends c to the output,
573 // writing a fresh NUL terminator afterward, and returns true. Returns false
574 // if the output buffer had less than two bytes free.
EmitChar(char c)575 ABSL_MUST_USE_RESULT bool EmitChar(char c) {
576 if (silence_depth_ > 0) return true;
577 if (out_end_ - out_ < 2) return false;
578 *out_++ = c;
579 *out_ = '\0';
580 return true;
581 }
582
583 // Provided there is enough remaining output space, appends the C string token
584 // to the output, followed by a NUL character, and returns true. Returns
585 // false if not everything fit into the output buffer.
Emit(const char * token)586 ABSL_MUST_USE_RESULT bool Emit(const char* token) {
587 if (silence_depth_ > 0) return true;
588 const std::size_t token_length = std::strlen(token);
589 const std::size_t bytes_to_copy = token_length + 1; // token and final NUL
590 if (static_cast<std::size_t>(out_end_ - out_) < bytes_to_copy) return false;
591 std::memcpy(out_, token, bytes_to_copy);
592 out_ += token_length;
593 return true;
594 }
595
596 // Provided there is enough remaining output space, appends the decimal form
597 // of disambiguator (if it's nonnegative) or "?" (if it's negative) to the
598 // output, followed by a NUL character, and returns true. Returns false if
599 // not everything fit into the output buffer.
EmitDisambiguator(int disambiguator)600 ABSL_MUST_USE_RESULT bool EmitDisambiguator(int disambiguator) {
601 if (disambiguator < 0) return EmitChar('?'); // parsed but too large
602 if (disambiguator == 0) return EmitChar('0');
603 // Convert disambiguator to decimal text. Three digits per byte is enough
604 // because 999 > 256. The bound will remain correct even if future
605 // maintenance changes the type of the disambiguator variable.
606 char digits[3 * sizeof(disambiguator)] = {};
607 std::size_t leading_digit_index = sizeof(digits) - 1;
608 for (; disambiguator > 0; disambiguator /= 10) {
609 digits[--leading_digit_index] =
610 static_cast<char>('0' + disambiguator % 10);
611 }
612 return Emit(digits + leading_digit_index);
613 }
614
615 // Consumes an optional disambiguator (s123_) from the input.
616 //
617 // On success returns true and fills value with the encoded value if it was
618 // not too big, otherwise with -1. If the optional disambiguator was omitted,
619 // value is 0. On parse failure returns false and sets value to -1.
ParseDisambiguator(int & value)620 ABSL_MUST_USE_RESULT bool ParseDisambiguator(int& value) {
621 value = -1;
622
623 // disambiguator = s base-62-number
624 //
625 // Disambiguators are optional. An omitted disambiguator is zero.
626 if (!Eat('s')) {
627 value = 0;
628 return true;
629 }
630 int base_62_value = 0;
631 if (!ParseBase62Number(base_62_value)) return false;
632 value = base_62_value < 0 ? -1 : base_62_value + 1;
633 return true;
634 }
635
636 // Consumes a base-62 number like _ or 123_ from the input.
637 //
638 // On success returns true and fills value with the encoded value if it was
639 // not too big, otherwise with -1. On parse failure returns false and sets
640 // value to -1.
ParseBase62Number(int & value)641 ABSL_MUST_USE_RESULT bool ParseBase62Number(int& value) {
642 value = -1;
643
644 // base-62-number = (digit | lower | upper)* _
645 //
646 // An empty base-62 digit sequence means 0.
647 if (Eat('_')) {
648 value = 0;
649 return true;
650 }
651
652 // A nonempty digit sequence denotes its base-62 value plus 1.
653 int encoded_number = 0;
654 bool overflowed = false;
655 while (IsAlpha(Peek()) || IsDigit(Peek())) {
656 const char c = Take();
657 if (encoded_number >= std::numeric_limits<int>::max()/62) {
658 // If we are close to overflowing an int, keep parsing but stop updating
659 // encoded_number and remember to return -1 at the end. The point is to
660 // avoid undefined behavior while parsing crate-root disambiguators,
661 // which are large in practice but not shown in demangling, while
662 // successfully computing closure and shim disambiguators, which are
663 // typically small and are printed out.
664 overflowed = true;
665 } else {
666 int digit;
667 if (IsDigit(c)) {
668 digit = c - '0';
669 } else if (IsLower(c)) {
670 digit = c - 'a' + 10;
671 } else {
672 digit = c - 'A' + 36;
673 }
674 encoded_number = 62 * encoded_number + digit;
675 }
676 }
677
678 if (!Eat('_')) return false;
679 if (!overflowed) value = encoded_number + 1;
680 return true;
681 }
682
683 // Consumes an identifier from the input, returning true on success.
684 //
685 // A nonzero uppercase_namespace specifies the character after the N in a
686 // nested-identifier, e.g., 'C' for a closure, allowing ParseIdentifier to
687 // write out the name with the conventional decoration for that namespace.
ParseIdentifier(char uppercase_namespace='\\0')688 ABSL_MUST_USE_RESULT bool ParseIdentifier(char uppercase_namespace = '\0') {
689 // identifier -> disambiguator? undisambiguated-identifier
690 int disambiguator = 0;
691 if (!ParseDisambiguator(disambiguator)) return false;
692
693 return ParseUndisambiguatedIdentifier(uppercase_namespace, disambiguator);
694 }
695
696 // Consumes from the input an identifier with no preceding disambiguator,
697 // returning true on success.
698 //
699 // When ParseIdentifier calls this, it passes the N<namespace> character and
700 // disambiguator value so that "{closure#42}" and similar forms can be
701 // rendered correctly.
702 //
703 // At other appearances of undisambiguated-identifier in the grammar, this
704 // treatment is not applicable, and the call site omits both arguments.
ParseUndisambiguatedIdentifier(char uppercase_namespace='\\0',int disambiguator=0)705 ABSL_MUST_USE_RESULT bool ParseUndisambiguatedIdentifier(
706 char uppercase_namespace = '\0', int disambiguator = 0) {
707 // undisambiguated-identifier -> u? decimal-number _? bytes
708 const bool is_punycoded = Eat('u');
709 if (!IsDigit(Peek())) return false;
710 int num_bytes = 0;
711 if (!ParseDecimalNumber(num_bytes)) return false;
712 (void)Eat('_'); // optional separator, needed if a digit follows
713
714 // Emit the beginnings of braced forms like {shim:vtable#0}.
715 if (uppercase_namespace == '\0') {
716 // Decoding of Punycode is not yet implemented. For now we emit
717 // "{Punycode ...}" with the raw encoding inside.
718 if (is_punycoded && !Emit("{Punycode ")) return false;
719 } else {
720 switch (uppercase_namespace) {
721 case 'C':
722 if (!Emit("{closure")) return false;
723 break;
724 case 'S':
725 if (!Emit("{shim")) return false;
726 break;
727 default:
728 if (!EmitChar('{') || !EmitChar(uppercase_namespace)) return false;
729 break;
730 }
731 if (num_bytes > 0 && !Emit(":")) return false;
732 }
733
734 // Emit the name itself.
735 for (int i = 0; i < num_bytes; ++i) {
736 const char c = Take();
737 if (!IsIdentifierChar(c) &&
738 // The spec gives toolchains the choice of Punycode or raw UTF-8 for
739 // identifiers containing code points above 0x7f, so accept bytes with
740 // the high bit set if this is not a u... encoding.
741 (is_punycoded || (c & 0x80) == 0)) {
742 return false;
743 }
744 if (!EmitChar(c)) return false;
745 }
746
747 // Emit the endings of braced forms: "#42}" or "}".
748 if (uppercase_namespace != '\0') {
749 if (!EmitChar('#')) return false;
750 if (!EmitDisambiguator(disambiguator)) return false;
751 }
752 if (uppercase_namespace != '\0' || is_punycoded) {
753 if (!EmitChar('}')) return false;
754 }
755
756 return true;
757 }
758
759 // Consumes a decimal number like 0 or 123 from the input. On success returns
760 // true and fills value with the encoded value. If the encoded value is too
761 // large or otherwise unparsable, returns false and sets value to -1.
ParseDecimalNumber(int & value)762 ABSL_MUST_USE_RESULT bool ParseDecimalNumber(int& value) {
763 value = -1;
764 if (!IsDigit(Peek())) return false;
765 int encoded_number = Take() - '0';
766 if (encoded_number == 0) {
767 // Decimal numbers are never encoded with extra leading zeroes.
768 value = 0;
769 return true;
770 }
771 while (IsDigit(Peek()) &&
772 // avoid overflow
773 encoded_number < std::numeric_limits<int>::max()/10) {
774 encoded_number = 10 * encoded_number + (Take() - '0');
775 }
776 if (IsDigit(Peek())) return false; // too big
777 value = encoded_number;
778 return true;
779 }
780
781 // Consumes a binder of higher-ranked lifetimes if one is present. On success
782 // returns true and discards the encoded lifetime count. On parse failure
783 // returns false.
ParseOptionalBinder()784 ABSL_MUST_USE_RESULT bool ParseOptionalBinder() {
785 // binder -> G base-62-number
786 if (!Eat('G')) return true;
787 int ignored_binding_count;
788 return ParseBase62Number(ignored_binding_count);
789 }
790
791 // Consumes a lifetime if one is present.
792 //
793 // On success returns true and discards the lifetime index. We do not print
794 // or even range-check lifetimes because they are a finer detail than other
795 // things we omit from output, such as the entire contents of generic-args.
796 //
797 // On parse failure returns false.
ParseOptionalLifetime()798 ABSL_MUST_USE_RESULT bool ParseOptionalLifetime() {
799 // lifetime -> L base-62-number
800 if (!Eat('L')) return true;
801 int ignored_de_bruijn_index;
802 return ParseBase62Number(ignored_de_bruijn_index);
803 }
804
805 // Consumes a lifetime just like ParseOptionalLifetime, but returns false if
806 // there is no lifetime here.
ParseRequiredLifetime()807 ABSL_MUST_USE_RESULT bool ParseRequiredLifetime() {
808 if (Peek() != 'L') return false;
809 return ParseOptionalLifetime();
810 }
811
812 // Pushes ns onto the namespace stack and returns true if the stack is not
813 // full, else returns false.
PushNamespace(char ns)814 ABSL_MUST_USE_RESULT bool PushNamespace(char ns) {
815 if (namespace_depth_ == kNamespaceStackSize) return false;
816 namespace_stack_[namespace_depth_++] = ns;
817 return true;
818 }
819
820 // Pops the last pushed namespace. Requires that the namespace stack is not
821 // empty (namespace_depth_ > 0).
PopNamespace()822 char PopNamespace() { return namespace_stack_[--namespace_depth_]; }
823
824 // Pushes position onto the position stack and returns true if the stack is
825 // not full, else returns false.
PushPosition(int position)826 ABSL_MUST_USE_RESULT bool PushPosition(int position) {
827 if (position_depth_ == kPositionStackSize) return false;
828 position_stack_[position_depth_++] = position;
829 return true;
830 }
831
832 // Pops the last pushed input position. Requires that the position stack is
833 // not empty (position_depth_ > 0).
PopPosition()834 int PopPosition() { return position_stack_[--position_depth_]; }
835
836 // Consumes a base-62-number denoting a backref target, pushes the current
837 // input position on the data stack, and sets the input position to the
838 // beginning of the backref target. Returns true on success. Returns false
839 // if parsing failed, the stack is exhausted, or the backref target position
840 // is out of range.
BeginBackref()841 ABSL_MUST_USE_RESULT bool BeginBackref() {
842 // backref = B base-62-number (B already consumed)
843 //
844 // Reject backrefs that don't parse, overflow int, or don't point backward.
845 // If the offset looks fine, adjust it to account for the _R prefix.
846 int offset = 0;
847 const int offset_of_this_backref =
848 pos_ - 2 /* _R */ - 1 /* B already consumed */;
849 if (!ParseBase62Number(offset) || offset < 0 ||
850 offset >= offset_of_this_backref) {
851 return false;
852 }
853 offset += 2;
854
855 // Save the old position to restore later.
856 if (!PushPosition(pos_)) return false;
857
858 // Move the input position to the backref target.
859 //
860 // Note that we do not check whether the new position points to the
861 // beginning of a construct matching the context in which the backref
862 // appeared. We just jump to it and see whether nested parsing succeeds.
863 // We therefore accept various wrong manglings, e.g., a type backref
864 // pointing to an 'l' character inside an identifier, which happens to mean
865 // i32 when parsed as a type mangling. This saves the complexity and RAM
866 // footprint of remembering which offsets began which kinds of
867 // substructures. Existing demanglers take similar shortcuts.
868 pos_ = offset;
869 return true;
870 }
871
872 // Cleans up after a backref production by restoring the previous input
873 // position from the data stack.
EndBackref()874 void EndBackref() { pos_ = PopPosition(); }
875
876 // The leftmost recursion_depth_ elements of recursion_stack_ contain the
877 // ReturnAddresses pushed by ABSL_DEMANGLER_RECURSE calls not yet completed.
878 ReturnAddress recursion_stack_[kStackSize] = {};
879 int recursion_depth_ = 0;
880
881 // The leftmost namespace_depth_ elements of namespace_stack_ contain the
882 // uppercase namespace identifiers for open nested-paths, e.g., 'C' for a
883 // closure.
884 char namespace_stack_[kNamespaceStackSize] = {};
885 int namespace_depth_ = 0;
886
887 // The leftmost position_depth_ elements of position_stack_ contain the input
888 // positions to return to after fully printing the targets of backrefs.
889 int position_stack_[kPositionStackSize] = {};
890 int position_depth_ = 0;
891
892 // Anything parsed while silence_depth_ > 0 contributes nothing to the
893 // demangled output. For constructs omitted from the demangling, such as
894 // impl-path and the contents of generic-args, we will increment
895 // silence_depth_ on the way in and decrement silence_depth_ on the way out.
896 int silence_depth_ = 0;
897
898 // Input: encoding_ points to a Rust mangled symbol, and encoding_[pos_] is
899 // the next input character to be scanned.
900 int pos_ = 0;
901 const char* encoding_ = nullptr;
902
903 // Output: *out_ is where the next output character should be written, and
904 // out_end_ points past the last byte of available space.
905 char* out_ = nullptr;
906 char* out_end_ = nullptr;
907 };
908
909 } // namespace
910
DemangleRustSymbolEncoding(const char * mangled,char * out,std::size_t out_size)911 bool DemangleRustSymbolEncoding(const char* mangled, char* out,
912 std::size_t out_size) {
913 return RustSymbolParser(mangled, out, out + out_size).Parse();
914 }
915
916 } // namespace debugging_internal
917 ABSL_NAMESPACE_END
918 } // namespace absl
919