1 //===--- FuzzySymbolIndex.cpp - Lookup symbols for autocomplete -*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "FuzzySymbolIndex.h"
9 #include "llvm/Support/Regex.h"
10
11 using clang::find_all_symbols::SymbolAndSignals;
12 using llvm::StringRef;
13
14 namespace clang {
15 namespace include_fixer {
16 namespace {
17
18 class MemSymbolIndex : public FuzzySymbolIndex {
19 public:
MemSymbolIndex(std::vector<SymbolAndSignals> Symbols)20 MemSymbolIndex(std::vector<SymbolAndSignals> Symbols) {
21 for (auto &Symbol : Symbols) {
22 auto Tokens = tokenize(Symbol.Symbol.getName());
23 this->Symbols.emplace_back(
24 StringRef(llvm::join(Tokens.begin(), Tokens.end(), " ")),
25 std::move(Symbol));
26 }
27 }
28
search(StringRef Query)29 std::vector<SymbolAndSignals> search(StringRef Query) override {
30 auto Tokens = tokenize(Query);
31 llvm::Regex Pattern("^" + queryRegexp(Tokens));
32 std::vector<SymbolAndSignals> Results;
33 for (const Entry &E : Symbols)
34 if (Pattern.match(E.first))
35 Results.push_back(E.second);
36 return Results;
37 }
38
39 private:
40 using Entry = std::pair<llvm::SmallString<32>, SymbolAndSignals>;
41 std::vector<Entry> Symbols;
42 };
43
44 // Helpers for tokenize state machine.
45 enum TokenizeState {
46 EMPTY, // No pending characters.
47 ONE_BIG, // Read one uppercase letter, could be WORD or Word.
48 BIG_WORD, // Reading an uppercase WORD.
49 SMALL_WORD, // Reading a lowercase word.
50 NUMBER // Reading a number.
51 };
52
53 enum CharType { UPPER, LOWER, DIGIT, MISC };
classify(char c)54 CharType classify(char c) {
55 if (isupper(c))
56 return UPPER;
57 if (islower(c))
58 return LOWER;
59 if (isdigit(c))
60 return DIGIT;
61 return MISC;
62 }
63
64 } // namespace
65
tokenize(StringRef Text)66 std::vector<std::string> FuzzySymbolIndex::tokenize(StringRef Text) {
67 std::vector<std::string> Result;
68 // State describes the treatment of text from Start to I.
69 // Once text is Flush()ed into Result, we're done with it and advance Start.
70 TokenizeState State = EMPTY;
71 size_t Start = 0;
72 auto Flush = [&](size_t End) {
73 if (State != EMPTY) {
74 Result.push_back(Text.substr(Start, End - Start).lower());
75 State = EMPTY;
76 }
77 Start = End;
78 };
79 for (size_t I = 0; I < Text.size(); ++I) {
80 CharType Type = classify(Text[I]);
81 if (Type == MISC)
82 Flush(I);
83 else if (Type == LOWER)
84 switch (State) {
85 case BIG_WORD:
86 Flush(I - 1); // FOOBar: first token is FOO, not FOOB.
87 LLVM_FALLTHROUGH;
88 case ONE_BIG:
89 State = SMALL_WORD;
90 LLVM_FALLTHROUGH;
91 case SMALL_WORD:
92 break;
93 default:
94 Flush(I);
95 State = SMALL_WORD;
96 }
97 else if (Type == UPPER)
98 switch (State) {
99 case ONE_BIG:
100 State = BIG_WORD;
101 LLVM_FALLTHROUGH;
102 case BIG_WORD:
103 break;
104 default:
105 Flush(I);
106 State = ONE_BIG;
107 }
108 else if (Type == DIGIT && State != NUMBER) {
109 Flush(I);
110 State = NUMBER;
111 }
112 }
113 Flush(Text.size());
114 return Result;
115 }
116
117 std::string
queryRegexp(const std::vector<std::string> & Tokens)118 FuzzySymbolIndex::queryRegexp(const std::vector<std::string> &Tokens) {
119 std::string Result;
120 for (size_t I = 0; I < Tokens.size(); ++I) {
121 if (I)
122 Result.append("[[:alnum:]]* ");
123 for (size_t J = 0; J < Tokens[I].size(); ++J) {
124 if (J)
125 Result.append("([[:alnum:]]* )?");
126 Result.push_back(Tokens[I][J]);
127 }
128 }
129 return Result;
130 }
131
132 llvm::Expected<std::unique_ptr<FuzzySymbolIndex>>
createFromYAML(StringRef FilePath)133 FuzzySymbolIndex::createFromYAML(StringRef FilePath) {
134 auto Buffer = llvm::MemoryBuffer::getFile(FilePath);
135 if (!Buffer)
136 return llvm::errorCodeToError(Buffer.getError());
137 return std::make_unique<MemSymbolIndex>(
138 find_all_symbols::ReadSymbolInfosFromYAML(Buffer.get()->getBuffer()));
139 }
140
141 } // namespace include_fixer
142 } // namespace clang
143