• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2024 The Abseil Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/debugging/internal/demangle_rust.h"
16 
17 #include <cstddef>
18 #include <cstdint>
19 #include <cstring>
20 #include <limits>
21 
22 #include "absl/base/attributes.h"
23 #include "absl/base/config.h"
24 
25 namespace absl {
26 ABSL_NAMESPACE_BEGIN
27 namespace debugging_internal {
28 
29 namespace {
30 
31 // Same step limit as the C++ demangler in demangle.cc uses.
32 constexpr int kMaxReturns = 1 << 17;
33 
IsDigit(char c)34 bool IsDigit(char c) { return '0' <= c && c <= '9'; }
IsLower(char c)35 bool IsLower(char c) { return 'a' <= c && c <= 'z'; }
IsUpper(char c)36 bool IsUpper(char c) { return 'A' <= c && c <= 'Z'; }
IsAlpha(char c)37 bool IsAlpha(char c) { return IsLower(c) || IsUpper(c); }
IsIdentifierChar(char c)38 bool IsIdentifierChar(char c) { return IsAlpha(c) || IsDigit(c) || c == '_'; }
IsLowerHexDigit(char c)39 bool IsLowerHexDigit(char c) { return IsDigit(c) || ('a' <= c && c <= 'f'); }
40 
BasicTypeName(char c)41 const char* BasicTypeName(char c) {
42   switch (c) {
43     case 'a': return "i8";
44     case 'b': return "bool";
45     case 'c': return "char";
46     case 'd': return "f64";
47     case 'e': return "str";
48     case 'f': return "f32";
49     case 'h': return "u8";
50     case 'i': return "isize";
51     case 'j': return "usize";
52     case 'l': return "i32";
53     case 'm': return "u32";
54     case 'n': return "i128";
55     case 'o': return "u128";
56     case 'p': return "_";
57     case 's': return "i16";
58     case 't': return "u16";
59     case 'u': return "()";
60     case 'v': return "...";
61     case 'x': return "i64";
62     case 'y': return "u64";
63     case 'z': return "!";
64   }
65   return nullptr;
66 }
67 
68 // Parser for Rust symbol mangling v0, whose grammar is defined here:
69 //
70 // https://doc.rust-lang.org/rustc/symbol-mangling/v0.html#symbol-grammar-summary
71 class RustSymbolParser {
72  public:
73   // Prepares to demangle the given encoding, a Rust symbol name starting with
74   // _R, into the output buffer [out, out_end).  The caller is expected to
75   // continue by calling the new object's Parse function.
RustSymbolParser(const char * encoding,char * out,char * const out_end)76   RustSymbolParser(const char* encoding, char* out, char* const out_end)
77       : encoding_(encoding), out_(out), out_end_(out_end) {
78     if (out_ != out_end_) *out_ = '\0';
79   }
80 
81   // Parses the constructor's encoding argument, writing output into the range
82   // [out, out_end).  Returns true on success and false for input whose
83   // structure was not recognized or exceeded implementation limits, such as by
84   // nesting structures too deep.  In either case *this should not be used
85   // again.
Parse()86   ABSL_MUST_USE_RESULT bool Parse() && {
87     // Recursively parses the grammar production named by callee, then resumes
88     // execution at the next statement.
89     //
90     // Recursive-descent parsing is a beautifully readable translation of a
91     // grammar, but it risks stack overflow if implemented by naive recursion on
92     // the C++ call stack.  So we simulate recursion by goto and switch instead,
93     // keeping a bounded stack of "return addresses" in the recursion_stack_
94     // member.
95     //
96     // The callee argument is a statement label.  We goto that label after
97     // saving the "return address" on recursion_stack_.  The next continue
98     // statement in the for loop below "returns" from this "call".
99     //
100     // The caller argument names the return point.  Each value of caller must
101     // appear in only one ABSL_DEMANGLER_RECURSE call and be listed in the
102     // definition of enum ReturnAddress.  The switch implements the control
103     // transfer from the end of a "called" subroutine back to the statement
104     // after the "call".
105     //
106     // Note that not all the grammar productions have to be packed into the
107     // switch, but only those which appear in a cycle in the grammar.  Anything
108     // acyclic can be written as ordinary functions and function calls, e.g.,
109     // ParseIdentifier.
110 #define ABSL_DEMANGLER_RECURSE(callee, caller) \
111     do { \
112       if (recursion_depth_ == kStackSize) return false; \
113       /* The next continue will switch on this saved value ... */ \
114       recursion_stack_[recursion_depth_++] = caller; \
115       goto callee; \
116       /* ... and will land here, resuming the suspended code. */ \
117       case caller: {} \
118     } while (0)
119 
120     // Parse the encoding, counting completed recursive calls to guard against
121     // excessively complex input and infinite-loop bugs.
122     int iter = 0;
123     goto whole_encoding;
124     for (; iter < kMaxReturns && recursion_depth_ > 0; ++iter) {
125       // This switch resumes the code path most recently suspended by
126       // ABSL_DEMANGLER_RECURSE.
127       switch (recursion_stack_[--recursion_depth_]) {
128         //
129         // symbol-name ->
130         // _R decimal-number? path instantiating-crate? vendor-specific-suffix?
131         whole_encoding:
132           if (!Eat('_') || !Eat('R')) return false;
133           // decimal-number? is always empty today, so proceed to path, which
134           // can't start with a decimal digit.
135           ABSL_DEMANGLER_RECURSE(path, kInstantiatingCrate);
136           if (IsAlpha(Peek())) {
137             ++silence_depth_;  // Print nothing more from here on.
138             ABSL_DEMANGLER_RECURSE(path, kVendorSpecificSuffix);
139           }
140           switch (Take()) {
141             case '.': case '$': case '\0': return true;
142           }
143           return false;  // unexpected trailing content
144 
145         // path -> crate-root | inherent-impl | trait-impl | trait-definition |
146         //         nested-path | generic-args | backref
147         //
148         // Note that ABSL_DEMANGLER_RECURSE does not work inside a nested switch
149         // (which would hide the generated case label).  Thus we jump out of the
150         // inner switch with gotos before performing any fake recursion.
151         path:
152           switch (Take()) {
153             case 'C': goto crate_root;
154             case 'M': goto inherent_impl;
155             case 'X': goto trait_impl;
156             case 'Y': goto trait_definition;
157             case 'N': goto nested_path;
158             case 'I': goto generic_args;
159             case 'B': goto path_backref;
160             default: return false;
161           }
162 
163         // crate-root -> C identifier (C consumed above)
164         crate_root:
165           if (!ParseIdentifier()) return false;
166           continue;
167 
168         // inherent-impl -> M impl-path type (M already consumed)
169         inherent_impl:
170           if (!Emit("<")) return false;
171           ABSL_DEMANGLER_RECURSE(impl_path, kInherentImplType);
172           ABSL_DEMANGLER_RECURSE(type, kInherentImplEnding);
173           if (!Emit(">")) return false;
174           continue;
175 
176         // trait-impl -> X impl-path type path (X already consumed)
177         trait_impl:
178           if (!Emit("<")) return false;
179           ABSL_DEMANGLER_RECURSE(impl_path, kTraitImplType);
180           ABSL_DEMANGLER_RECURSE(type, kTraitImplInfix);
181           if (!Emit(" as ")) return false;
182           ABSL_DEMANGLER_RECURSE(path, kTraitImplEnding);
183           if (!Emit(">")) return false;
184           continue;
185 
186         // impl-path -> disambiguator? path (but never print it!)
187         impl_path:
188           ++silence_depth_;
189           {
190             int ignored_disambiguator;
191             if (!ParseDisambiguator(ignored_disambiguator)) return false;
192           }
193           ABSL_DEMANGLER_RECURSE(path, kImplPathEnding);
194           --silence_depth_;
195           continue;
196 
197         // trait-definition -> Y type path (Y already consumed)
198         trait_definition:
199           if (!Emit("<")) return false;
200           ABSL_DEMANGLER_RECURSE(type, kTraitDefinitionInfix);
201           if (!Emit(" as ")) return false;
202           ABSL_DEMANGLER_RECURSE(path, kTraitDefinitionEnding);
203           if (!Emit(">")) return false;
204           continue;
205 
206         // nested-path -> N namespace path identifier (N already consumed)
207         // namespace -> lower | upper
208         nested_path:
209           // Uppercase namespaces must be saved on a stack so we can print
210           // ::{closure#0} or ::{shim:vtable#0} or ::{X:name#0} as needed.
211           if (IsUpper(Peek())) {
212             if (!PushNamespace(Take())) return false;
213             ABSL_DEMANGLER_RECURSE(path, kIdentifierInUppercaseNamespace);
214             if (!Emit("::")) return false;
215             if (!ParseIdentifier(PopNamespace())) return false;
216             continue;
217           }
218 
219           // Lowercase namespaces, however, are never represented in the output;
220           // they all emit just ::name.
221           if (IsLower(Take())) {
222             ABSL_DEMANGLER_RECURSE(path, kIdentifierInLowercaseNamespace);
223             if (!Emit("::")) return false;
224             if (!ParseIdentifier()) return false;
225             continue;
226           }
227 
228           // Neither upper or lower
229           return false;
230 
231         // type -> basic-type | array-type | slice-type | tuple-type |
232         //         ref-type | mut-ref-type | const-ptr-type | mut-ptr-type |
233         //         fn-type | dyn-trait-type | path | backref
234         //
235         // We use ifs instead of switch (Take()) because the default case jumps
236         // to path, which will need to see the first character not yet Taken
237         // from the input.  Because we do not use a nested switch here,
238         // ABSL_DEMANGLER_RECURSE works fine in the 'S' case.
239         type:
240           if (IsLower(Peek())) {
241             const char* type_name = BasicTypeName(Take());
242             if (type_name == nullptr || !Emit(type_name)) return false;
243             continue;
244           }
245           if (Eat('A')) {
246             // array-type = A type const
247             if (!Emit("[")) return false;
248             ABSL_DEMANGLER_RECURSE(type, kArraySize);
249             if (!Emit("; ")) return false;
250             ABSL_DEMANGLER_RECURSE(constant, kFinishArray);
251             if (!Emit("]")) return false;
252             continue;
253           }
254           if (Eat('S')) {
255             if (!Emit("[")) return false;
256             ABSL_DEMANGLER_RECURSE(type, kSliceEnding);
257             if (!Emit("]")) return false;
258             continue;
259           }
260           if (Eat('T')) goto tuple_type;
261           if (Eat('R')) {
262             if (!Emit("&")) return false;
263             if (!ParseOptionalLifetime()) return false;
264             goto type;
265           }
266           if (Eat('Q')) {
267             if (!Emit("&mut ")) return false;
268             if (!ParseOptionalLifetime()) return false;
269             goto type;
270           }
271           if (Eat('P')) {
272             if (!Emit("*const ")) return false;
273             goto type;
274           }
275           if (Eat('O')) {
276             if (!Emit("*mut ")) return false;
277             goto type;
278           }
279           if (Eat('F')) goto fn_type;
280           if (Eat('D')) goto dyn_trait_type;
281           if (Eat('B')) goto type_backref;
282           goto path;
283 
284         // tuple-type -> T type* E (T already consumed)
285         tuple_type:
286           if (!Emit("(")) return false;
287 
288           // The toolchain should call the unit type u instead of TE, but the
289           // grammar and other demanglers also recognize TE, so we do too.
290           if (Eat('E')) {
291             if (!Emit(")")) return false;
292             continue;
293           }
294 
295           // A tuple with one element is rendered (type,) instead of (type).
296           ABSL_DEMANGLER_RECURSE(type, kAfterFirstTupleElement);
297           if (Eat('E')) {
298             if (!Emit(",)")) return false;
299             continue;
300           }
301 
302           // A tuple with two elements is of course (x, y).
303           if (!Emit(", ")) return false;
304           ABSL_DEMANGLER_RECURSE(type, kAfterSecondTupleElement);
305           if (Eat('E')) {
306             if (!Emit(")")) return false;
307             continue;
308           }
309 
310           // And (x, y, z) for three elements.
311           if (!Emit(", ")) return false;
312           ABSL_DEMANGLER_RECURSE(type, kAfterThirdTupleElement);
313           if (Eat('E')) {
314             if (!Emit(")")) return false;
315             continue;
316           }
317 
318           // For longer tuples we write (x, y, z, ...), printing none of the
319           // content of the fourth and later types.  Thus we avoid exhausting
320           // output buffers and human readers' patience when some library has a
321           // long tuple as an implementation detail, without having to
322           // completely obfuscate all tuples.
323           if (!Emit(", ...)")) return false;
324           ++silence_depth_;
325           while (!Eat('E')) {
326             ABSL_DEMANGLER_RECURSE(type, kAfterSubsequentTupleElement);
327           }
328           --silence_depth_;
329           continue;
330 
331         // fn-type -> F fn-sig (F already consumed)
332         // fn-sig -> binder? U? (K abi)? type* E type
333         // abi -> C | undisambiguated-identifier
334         //
335         // We follow the C++ demangler in suppressing details of function
336         // signatures.  Every function type is rendered "fn...".
337         fn_type:
338           if (!Emit("fn...")) return false;
339           ++silence_depth_;
340           if (!ParseOptionalBinder()) return false;
341           (void)Eat('U');
342           if (Eat('K')) {
343             if (!Eat('C') && !ParseUndisambiguatedIdentifier()) return false;
344           }
345           while (!Eat('E')) {
346             ABSL_DEMANGLER_RECURSE(type, kContinueParameterList);
347           }
348           ABSL_DEMANGLER_RECURSE(type, kFinishFn);
349           --silence_depth_;
350           continue;
351 
352         // dyn-trait-type -> D dyn-bounds lifetime (D already consumed)
353         // dyn-bounds -> binder? dyn-trait* E
354         //
355         // The grammar strangely allows an empty trait list, even though the
356         // compiler should never output one.  We follow existing demanglers in
357         // rendering DEL_ as "dyn ".
358         //
359         // Because auto traits lengthen a type name considerably without
360         // providing much value to a search for related source code, it would be
361         // desirable to abbreviate
362         //     dyn main::Trait + std::marker::Copy + std::marker::Send
363         // to
364         //     dyn main::Trait + ...,
365         // eliding the auto traits.  But it is difficult to do so correctly, in
366         // part because there is no guarantee that the mangling will list the
367         // main trait first.  So we just print all the traits in their order of
368         // appearance in the mangled name.
369         dyn_trait_type:
370           if (!Emit("dyn ")) return false;
371           if (!ParseOptionalBinder()) return false;
372           if (!Eat('E')) {
373             ABSL_DEMANGLER_RECURSE(dyn_trait, kBeginAutoTraits);
374             while (!Eat('E')) {
375               if (!Emit(" + ")) return false;
376               ABSL_DEMANGLER_RECURSE(dyn_trait, kContinueAutoTraits);
377             }
378           }
379           if (!ParseRequiredLifetime()) return false;
380           continue;
381 
382         // dyn-trait -> path dyn-trait-assoc-binding*
383         // dyn-trait-assoc-binding -> p undisambiguated-identifier type
384         //
385         // We render nonempty binding lists as <>, omitting their contents as
386         // for generic-args.
387         dyn_trait:
388           ABSL_DEMANGLER_RECURSE(path, kContinueDynTrait);
389           if (Peek() == 'p') {
390             if (!Emit("<>")) return false;
391             ++silence_depth_;
392             while (Eat('p')) {
393               if (!ParseUndisambiguatedIdentifier()) return false;
394               ABSL_DEMANGLER_RECURSE(type, kContinueAssocBinding);
395             }
396             --silence_depth_;
397           }
398           continue;
399 
400         // const -> type const-data | p | backref
401         //
402         // const is a C++ keyword, so we use the label `constant` instead.
403         constant:
404           if (Eat('B')) goto const_backref;
405           if (Eat('p')) {
406             if (!Emit("_")) return false;
407             continue;
408           }
409 
410           // Scan the type without printing it.
411           //
412           // The Rust language restricts the type of a const generic argument
413           // much more than the mangling grammar does.  We do not enforce this.
414           //
415           // We also do not bother printing false, true, 'A', and '\u{abcd}' for
416           // the types bool and char.  Because we do not print generic-args
417           // contents, we expect to print constants only in array sizes, and
418           // those should not be bool or char.
419           ++silence_depth_;
420           ABSL_DEMANGLER_RECURSE(type, kConstData);
421           --silence_depth_;
422 
423           // const-data -> n? hex-digit* _
424           //
425           // Although the grammar doesn't say this, existing demanglers expect
426           // that zero is 0, not an empty digit sequence, and no nonzero value
427           // may have leading zero digits.  Also n0_ is accepted and printed as
428           // -0, though a toolchain will probably never write that encoding.
429           if (Eat('n') && !EmitChar('-')) return false;
430           if (!Emit("0x")) return false;
431           if (Eat('0')) {
432             if (!EmitChar('0')) return false;
433             if (!Eat('_')) return false;
434             continue;
435           }
436           while (IsLowerHexDigit(Peek())) {
437             if (!EmitChar(Take())) return false;
438           }
439           if (!Eat('_')) return false;
440           continue;
441 
442         // generic-args -> I path generic-arg* E (I already consumed)
443         //
444         // We follow the C++ demangler in omitting all the arguments from the
445         // output, printing only the list opening and closing tokens.
446         generic_args:
447           ABSL_DEMANGLER_RECURSE(path, kBeginGenericArgList);
448           if (!Emit("::<>")) return false;
449           ++silence_depth_;
450           while (!Eat('E')) {
451             ABSL_DEMANGLER_RECURSE(generic_arg, kContinueGenericArgList);
452           }
453           --silence_depth_;
454           continue;
455 
456         // generic-arg -> lifetime | type | K const
457         generic_arg:
458           if (Peek() == 'L') {
459             if (!ParseOptionalLifetime()) return false;
460             continue;
461           }
462           if (Eat('K')) goto constant;
463           goto type;
464 
465         // backref -> B base-62-number (B already consumed)
466         //
467         // The BeginBackref call parses and range-checks the base-62-number.  We
468         // always do that much.
469         //
470         // The recursive call parses and prints what the backref points at.  We
471         // save CPU and stack by skipping this work if the output would be
472         // suppressed anyway.
473         path_backref:
474           if (!BeginBackref()) return false;
475           if (silence_depth_ == 0) {
476             ABSL_DEMANGLER_RECURSE(path, kPathBackrefEnding);
477           }
478           EndBackref();
479           continue;
480 
481         // This represents the same backref production as in path_backref but
482         // parses the target as a type instead of a path.
483         type_backref:
484           if (!BeginBackref()) return false;
485           if (silence_depth_ == 0) {
486             ABSL_DEMANGLER_RECURSE(type, kTypeBackrefEnding);
487           }
488           EndBackref();
489           continue;
490 
491         const_backref:
492           if (!BeginBackref()) return false;
493           if (silence_depth_ == 0) {
494             ABSL_DEMANGLER_RECURSE(constant, kConstantBackrefEnding);
495           }
496           EndBackref();
497           continue;
498       }
499     }
500 
501     return false;  // hit iteration limit or a bug in our stack handling
502   }
503 
504  private:
505   // Enumerates resumption points for ABSL_DEMANGLER_RECURSE calls.
506   enum ReturnAddress : std::uint8_t {
507     kInstantiatingCrate,
508     kVendorSpecificSuffix,
509     kIdentifierInUppercaseNamespace,
510     kIdentifierInLowercaseNamespace,
511     kInherentImplType,
512     kInherentImplEnding,
513     kTraitImplType,
514     kTraitImplInfix,
515     kTraitImplEnding,
516     kImplPathEnding,
517     kTraitDefinitionInfix,
518     kTraitDefinitionEnding,
519     kArraySize,
520     kFinishArray,
521     kSliceEnding,
522     kAfterFirstTupleElement,
523     kAfterSecondTupleElement,
524     kAfterThirdTupleElement,
525     kAfterSubsequentTupleElement,
526     kContinueParameterList,
527     kFinishFn,
528     kBeginAutoTraits,
529     kContinueAutoTraits,
530     kContinueDynTrait,
531     kContinueAssocBinding,
532     kConstData,
533     kBeginGenericArgList,
534     kContinueGenericArgList,
535     kPathBackrefEnding,
536     kTypeBackrefEnding,
537     kConstantBackrefEnding,
538   };
539 
540   // Element counts for the stack arrays.  Larger stack sizes accommodate more
541   // deeply nested names at the cost of a larger footprint on the C++ call
542   // stack.
543   enum {
544     // Maximum recursive calls outstanding at one time.
545     kStackSize = 256,
546 
547     // Maximum N<uppercase> nested-paths open at once.  We do not expect
548     // closures inside closures inside closures as much as functions inside
549     // modules inside other modules, so we can use a smaller array here.
550     kNamespaceStackSize = 64,
551 
552     // Maximum number of nested backrefs.  We can keep this stack pretty small
553     // because we do not follow backrefs inside generic-args or other contexts
554     // that suppress printing, so deep stacking is unlikely in practice.
555     kPositionStackSize = 16,
556   };
557 
558   // Returns the next input character without consuming it.
Peek() const559   char Peek() const { return encoding_[pos_]; }
560 
561   // Consumes and returns the next input character.
Take()562   char Take() { return encoding_[pos_++]; }
563 
564   // If the next input character is the given character, consumes it and returns
565   // true; otherwise returns false without consuming a character.
Eat(char want)566   ABSL_MUST_USE_RESULT bool Eat(char want) {
567     if (encoding_[pos_] != want) return false;
568     ++pos_;
569     return true;
570   }
571 
572   // Provided there is enough remaining output space, appends c to the output,
573   // writing a fresh NUL terminator afterward, and returns true.  Returns false
574   // if the output buffer had less than two bytes free.
EmitChar(char c)575   ABSL_MUST_USE_RESULT bool EmitChar(char c) {
576     if (silence_depth_ > 0) return true;
577     if (out_end_ - out_ < 2) return false;
578     *out_++ = c;
579     *out_ = '\0';
580     return true;
581   }
582 
583   // Provided there is enough remaining output space, appends the C string token
584   // to the output, followed by a NUL character, and returns true.  Returns
585   // false if not everything fit into the output buffer.
Emit(const char * token)586   ABSL_MUST_USE_RESULT bool Emit(const char* token) {
587     if (silence_depth_ > 0) return true;
588     const std::size_t token_length = std::strlen(token);
589     const std::size_t bytes_to_copy = token_length + 1;  // token and final NUL
590     if (static_cast<std::size_t>(out_end_ - out_) < bytes_to_copy) return false;
591     std::memcpy(out_, token, bytes_to_copy);
592     out_ += token_length;
593     return true;
594   }
595 
596   // Provided there is enough remaining output space, appends the decimal form
597   // of disambiguator (if it's nonnegative) or "?" (if it's negative) to the
598   // output, followed by a NUL character, and returns true.  Returns false if
599   // not everything fit into the output buffer.
EmitDisambiguator(int disambiguator)600   ABSL_MUST_USE_RESULT bool EmitDisambiguator(int disambiguator) {
601     if (disambiguator < 0) return EmitChar('?');  // parsed but too large
602     if (disambiguator == 0) return EmitChar('0');
603     // Convert disambiguator to decimal text.  Three digits per byte is enough
604     // because 999 > 256.  The bound will remain correct even if future
605     // maintenance changes the type of the disambiguator variable.
606     char digits[3 * sizeof(disambiguator)] = {};
607     std::size_t leading_digit_index = sizeof(digits) - 1;
608     for (; disambiguator > 0; disambiguator /= 10) {
609       digits[--leading_digit_index] =
610           static_cast<char>('0' + disambiguator % 10);
611     }
612     return Emit(digits + leading_digit_index);
613   }
614 
615   // Consumes an optional disambiguator (s123_) from the input.
616   //
617   // On success returns true and fills value with the encoded value if it was
618   // not too big, otherwise with -1.  If the optional disambiguator was omitted,
619   // value is 0.  On parse failure returns false and sets value to -1.
ParseDisambiguator(int & value)620   ABSL_MUST_USE_RESULT bool ParseDisambiguator(int& value) {
621     value = -1;
622 
623     // disambiguator = s base-62-number
624     //
625     // Disambiguators are optional.  An omitted disambiguator is zero.
626     if (!Eat('s')) {
627       value = 0;
628       return true;
629     }
630     int base_62_value = 0;
631     if (!ParseBase62Number(base_62_value)) return false;
632     value = base_62_value < 0 ? -1 : base_62_value + 1;
633     return true;
634   }
635 
636   // Consumes a base-62 number like _ or 123_ from the input.
637   //
638   // On success returns true and fills value with the encoded value if it was
639   // not too big, otherwise with -1.  On parse failure returns false and sets
640   // value to -1.
ParseBase62Number(int & value)641   ABSL_MUST_USE_RESULT bool ParseBase62Number(int& value) {
642     value = -1;
643 
644     // base-62-number = (digit | lower | upper)* _
645     //
646     // An empty base-62 digit sequence means 0.
647     if (Eat('_')) {
648       value = 0;
649       return true;
650     }
651 
652     // A nonempty digit sequence denotes its base-62 value plus 1.
653     int encoded_number = 0;
654     bool overflowed = false;
655     while (IsAlpha(Peek()) || IsDigit(Peek())) {
656       const char c = Take();
657       if (encoded_number >= std::numeric_limits<int>::max()/62) {
658         // If we are close to overflowing an int, keep parsing but stop updating
659         // encoded_number and remember to return -1 at the end.  The point is to
660         // avoid undefined behavior while parsing crate-root disambiguators,
661         // which are large in practice but not shown in demangling, while
662         // successfully computing closure and shim disambiguators, which are
663         // typically small and are printed out.
664         overflowed = true;
665       } else {
666         int digit;
667         if (IsDigit(c)) {
668           digit = c - '0';
669         } else if (IsLower(c)) {
670           digit = c - 'a' + 10;
671         } else {
672           digit = c - 'A' + 36;
673         }
674         encoded_number = 62 * encoded_number + digit;
675       }
676     }
677 
678     if (!Eat('_')) return false;
679     if (!overflowed) value = encoded_number + 1;
680     return true;
681   }
682 
683   // Consumes an identifier from the input, returning true on success.
684   //
685   // A nonzero uppercase_namespace specifies the character after the N in a
686   // nested-identifier, e.g., 'C' for a closure, allowing ParseIdentifier to
687   // write out the name with the conventional decoration for that namespace.
ParseIdentifier(char uppercase_namespace='\\0')688   ABSL_MUST_USE_RESULT bool ParseIdentifier(char uppercase_namespace = '\0') {
689     // identifier -> disambiguator? undisambiguated-identifier
690     int disambiguator = 0;
691     if (!ParseDisambiguator(disambiguator)) return false;
692 
693     return ParseUndisambiguatedIdentifier(uppercase_namespace, disambiguator);
694   }
695 
696   // Consumes from the input an identifier with no preceding disambiguator,
697   // returning true on success.
698   //
699   // When ParseIdentifier calls this, it passes the N<namespace> character and
700   // disambiguator value so that "{closure#42}" and similar forms can be
701   // rendered correctly.
702   //
703   // At other appearances of undisambiguated-identifier in the grammar, this
704   // treatment is not applicable, and the call site omits both arguments.
ParseUndisambiguatedIdentifier(char uppercase_namespace='\\0',int disambiguator=0)705   ABSL_MUST_USE_RESULT bool ParseUndisambiguatedIdentifier(
706       char uppercase_namespace = '\0', int disambiguator = 0) {
707     // undisambiguated-identifier -> u? decimal-number _? bytes
708     const bool is_punycoded = Eat('u');
709     if (!IsDigit(Peek())) return false;
710     int num_bytes = 0;
711     if (!ParseDecimalNumber(num_bytes)) return false;
712     (void)Eat('_');  // optional separator, needed if a digit follows
713 
714     // Emit the beginnings of braced forms like {shim:vtable#0}.
715     if (uppercase_namespace == '\0') {
716       // Decoding of Punycode is not yet implemented.  For now we emit
717       // "{Punycode ...}" with the raw encoding inside.
718       if (is_punycoded && !Emit("{Punycode ")) return false;
719     } else {
720       switch (uppercase_namespace) {
721         case 'C':
722           if (!Emit("{closure")) return false;
723           break;
724         case 'S':
725           if (!Emit("{shim")) return false;
726           break;
727         default:
728           if (!EmitChar('{') || !EmitChar(uppercase_namespace)) return false;
729           break;
730       }
731       if (num_bytes > 0 && !Emit(":")) return false;
732     }
733 
734     // Emit the name itself.
735     for (int i = 0; i < num_bytes; ++i) {
736       const char c = Take();
737       if (!IsIdentifierChar(c) &&
738           // The spec gives toolchains the choice of Punycode or raw UTF-8 for
739           // identifiers containing code points above 0x7f, so accept bytes with
740           // the high bit set if this is not a u... encoding.
741           (is_punycoded || (c & 0x80) == 0)) {
742         return false;
743       }
744       if (!EmitChar(c)) return false;
745     }
746 
747     // Emit the endings of braced forms: "#42}" or "}".
748     if (uppercase_namespace != '\0') {
749       if (!EmitChar('#')) return false;
750       if (!EmitDisambiguator(disambiguator)) return false;
751     }
752     if (uppercase_namespace != '\0' || is_punycoded) {
753       if (!EmitChar('}')) return false;
754     }
755 
756     return true;
757   }
758 
759   // Consumes a decimal number like 0 or 123 from the input.  On success returns
760   // true and fills value with the encoded value.  If the encoded value is too
761   // large or otherwise unparsable, returns false and sets value to -1.
ParseDecimalNumber(int & value)762   ABSL_MUST_USE_RESULT bool ParseDecimalNumber(int& value) {
763     value = -1;
764     if (!IsDigit(Peek())) return false;
765     int encoded_number = Take() - '0';
766     if (encoded_number == 0) {
767       // Decimal numbers are never encoded with extra leading zeroes.
768       value = 0;
769       return true;
770     }
771     while (IsDigit(Peek()) &&
772            // avoid overflow
773            encoded_number < std::numeric_limits<int>::max()/10) {
774       encoded_number = 10 * encoded_number + (Take() - '0');
775     }
776     if (IsDigit(Peek())) return false;  // too big
777     value = encoded_number;
778     return true;
779   }
780 
781   // Consumes a binder of higher-ranked lifetimes if one is present.  On success
782   // returns true and discards the encoded lifetime count.  On parse failure
783   // returns false.
ParseOptionalBinder()784   ABSL_MUST_USE_RESULT bool ParseOptionalBinder() {
785     // binder -> G base-62-number
786     if (!Eat('G')) return true;
787     int ignored_binding_count;
788     return ParseBase62Number(ignored_binding_count);
789   }
790 
791   // Consumes a lifetime if one is present.
792   //
793   // On success returns true and discards the lifetime index.  We do not print
794   // or even range-check lifetimes because they are a finer detail than other
795   // things we omit from output, such as the entire contents of generic-args.
796   //
797   // On parse failure returns false.
ParseOptionalLifetime()798   ABSL_MUST_USE_RESULT bool ParseOptionalLifetime() {
799     // lifetime -> L base-62-number
800     if (!Eat('L')) return true;
801     int ignored_de_bruijn_index;
802     return ParseBase62Number(ignored_de_bruijn_index);
803   }
804 
805   // Consumes a lifetime just like ParseOptionalLifetime, but returns false if
806   // there is no lifetime here.
ParseRequiredLifetime()807   ABSL_MUST_USE_RESULT bool ParseRequiredLifetime() {
808     if (Peek() != 'L') return false;
809     return ParseOptionalLifetime();
810   }
811 
812   // Pushes ns onto the namespace stack and returns true if the stack is not
813   // full, else returns false.
PushNamespace(char ns)814   ABSL_MUST_USE_RESULT bool PushNamespace(char ns) {
815     if (namespace_depth_ == kNamespaceStackSize) return false;
816     namespace_stack_[namespace_depth_++] = ns;
817     return true;
818   }
819 
820   // Pops the last pushed namespace.  Requires that the namespace stack is not
821   // empty (namespace_depth_ > 0).
PopNamespace()822   char PopNamespace() { return namespace_stack_[--namespace_depth_]; }
823 
824   // Pushes position onto the position stack and returns true if the stack is
825   // not full, else returns false.
PushPosition(int position)826   ABSL_MUST_USE_RESULT bool PushPosition(int position) {
827     if (position_depth_ == kPositionStackSize) return false;
828     position_stack_[position_depth_++] = position;
829     return true;
830   }
831 
832   // Pops the last pushed input position.  Requires that the position stack is
833   // not empty (position_depth_ > 0).
PopPosition()834   int PopPosition() { return position_stack_[--position_depth_]; }
835 
836   // Consumes a base-62-number denoting a backref target, pushes the current
837   // input position on the data stack, and sets the input position to the
838   // beginning of the backref target.  Returns true on success.  Returns false
839   // if parsing failed, the stack is exhausted, or the backref target position
840   // is out of range.
BeginBackref()841   ABSL_MUST_USE_RESULT bool BeginBackref() {
842     // backref = B base-62-number (B already consumed)
843     //
844     // Reject backrefs that don't parse, overflow int, or don't point backward.
845     // If the offset looks fine, adjust it to account for the _R prefix.
846     int offset = 0;
847     const int offset_of_this_backref =
848         pos_ - 2 /* _R */ - 1 /* B already consumed */;
849     if (!ParseBase62Number(offset) || offset < 0 ||
850         offset >= offset_of_this_backref) {
851       return false;
852     }
853     offset += 2;
854 
855     // Save the old position to restore later.
856     if (!PushPosition(pos_)) return false;
857 
858     // Move the input position to the backref target.
859     //
860     // Note that we do not check whether the new position points to the
861     // beginning of a construct matching the context in which the backref
862     // appeared.  We just jump to it and see whether nested parsing succeeds.
863     // We therefore accept various wrong manglings, e.g., a type backref
864     // pointing to an 'l' character inside an identifier, which happens to mean
865     // i32 when parsed as a type mangling.  This saves the complexity and RAM
866     // footprint of remembering which offsets began which kinds of
867     // substructures.  Existing demanglers take similar shortcuts.
868     pos_ = offset;
869     return true;
870   }
871 
872   // Cleans up after a backref production by restoring the previous input
873   // position from the data stack.
EndBackref()874   void EndBackref() { pos_ = PopPosition(); }
875 
876   // The leftmost recursion_depth_ elements of recursion_stack_ contain the
877   // ReturnAddresses pushed by ABSL_DEMANGLER_RECURSE calls not yet completed.
878   ReturnAddress recursion_stack_[kStackSize] = {};
879   int recursion_depth_ = 0;
880 
881   // The leftmost namespace_depth_ elements of namespace_stack_ contain the
882   // uppercase namespace identifiers for open nested-paths, e.g., 'C' for a
883   // closure.
884   char namespace_stack_[kNamespaceStackSize] = {};
885   int namespace_depth_ = 0;
886 
887   // The leftmost position_depth_ elements of position_stack_ contain the input
888   // positions to return to after fully printing the targets of backrefs.
889   int position_stack_[kPositionStackSize] = {};
890   int position_depth_ = 0;
891 
892   // Anything parsed while silence_depth_ > 0 contributes nothing to the
893   // demangled output.  For constructs omitted from the demangling, such as
894   // impl-path and the contents of generic-args, we will increment
895   // silence_depth_ on the way in and decrement silence_depth_ on the way out.
896   int silence_depth_ = 0;
897 
898   // Input: encoding_ points to a Rust mangled symbol, and encoding_[pos_] is
899   // the next input character to be scanned.
900   int pos_ = 0;
901   const char* encoding_ = nullptr;
902 
903   // Output: *out_ is where the next output character should be written, and
904   // out_end_ points past the last byte of available space.
905   char* out_ = nullptr;
906   char* out_end_ = nullptr;
907 };
908 
909 }  // namespace
910 
DemangleRustSymbolEncoding(const char * mangled,char * out,std::size_t out_size)911 bool DemangleRustSymbolEncoding(const char* mangled, char* out,
912                                 std::size_t out_size) {
913   return RustSymbolParser(mangled, out, out + out_size).Parse();
914 }
915 
916 }  // namespace debugging_internal
917 ABSL_NAMESPACE_END
918 }  // namespace absl
919