1 //===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements fuzzy-matching of strings against identifiers. 10 // It indicates both the existence and quality of a match: 11 // 'eb' matches both 'emplace_back' and 'embed', the former has a better score. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H 16 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H 17 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/ADT/Optional.h" 20 #include "llvm/ADT/SmallString.h" 21 #include "llvm/ADT/StringRef.h" 22 #include "llvm/Support/raw_ostream.h" 23 24 namespace clang { 25 namespace clangd { 26 27 // Utilities for word segmentation. 28 // FuzzyMatcher already incorporates this logic, so most users don't need this. 29 // 30 // A name like "fooBar_baz" consists of several parts foo, bar, baz. 31 // Aligning segmentation of word and pattern improves the fuzzy-match. 32 // For example: [lol] matches "LaughingOutLoud" better than "LionPopulation" 33 // 34 // First we classify each character into types (uppercase, lowercase, etc). 35 // Then we look at the sequence: e.g. [upper, lower] is the start of a segment. 36 37 // We distinguish the types of characters that affect segmentation. 38 // It's not obvious how to segment digits, we treat them as lowercase letters. 39 // As we don't decode UTF-8, we treat bytes over 127 as lowercase too. 40 // This means we require exact (case-sensitive) match for those characters. 41 enum CharType : unsigned char { 42 Empty = 0, // Before-the-start and after-the-end (and control chars). 43 Lower = 1, // Lowercase letters, digits, and non-ASCII bytes. 44 Upper = 2, // Uppercase letters. 45 Punctuation = 3, // ASCII punctuation (including Space) 46 }; 47 // A CharTypeSet is a bitfield representing all the character types in a word. 48 // Its bits are 1<<Empty, 1<<Lower, etc. 49 using CharTypeSet = unsigned char; 50 51 // Each character's Role is the Head or Tail of a segment, or a Separator. 52 // e.g. XMLHttpRequest_Async 53 // +--+---+------ +---- 54 // ^Head ^Tail ^Separator 55 enum CharRole : unsigned char { 56 Unknown = 0, // Stray control characters or impossible states. 57 Tail = 1, // Part of a word segment, but not the first character. 58 Head = 2, // The first character of a word segment. 59 Separator = 3, // Punctuation characters that separate word segments. 60 }; 61 62 // Compute segmentation of Text. 63 // Character roles are stored in Roles (Roles.size() must equal Text.size()). 64 // The set of character types encountered is returned, this may inform 65 // heuristics for dealing with poorly-segmented identifiers like "strndup". 66 CharTypeSet calculateRoles(llvm::StringRef Text, 67 llvm::MutableArrayRef<CharRole> Roles); 68 69 // A matcher capable of matching and scoring strings against a single pattern. 70 // It's optimized for matching against many strings - match() does not allocate. 71 class FuzzyMatcher { 72 public: 73 // Characters beyond MaxPat are ignored. 74 FuzzyMatcher(llvm::StringRef Pattern); 75 76 // If Word matches the pattern, return a score indicating the quality match. 77 // Scores usually fall in a [0,1] range, with 1 being a very good score. 78 // "Super" scores in (1,2] are possible if the pattern is the full word. 79 // Characters beyond MaxWord are ignored. 80 llvm::Optional<float> match(llvm::StringRef Word); 81 pattern()82 llvm::StringRef pattern() const { return llvm::StringRef(Pat, PatN); } empty()83 bool empty() const { return PatN == 0; } 84 85 // Dump internal state from the last match() to the stream, for debugging. 86 // Returns the pattern with [] around matched characters, e.g. 87 // [u_p] + "unique_ptr" --> "[u]nique[_p]tr" 88 llvm::SmallString<256> dumpLast(llvm::raw_ostream &) const; 89 90 private: 91 // We truncate the pattern and the word to bound the cost of matching. 92 constexpr static int MaxPat = 63, MaxWord = 127; 93 // Action describes how a word character was matched to the pattern. 94 // It should be an enum, but this causes bitfield problems: 95 // - for MSVC the enum type must be explicitly unsigned for correctness 96 // - GCC 4.8 complains not all values fit if the type is unsigned 97 using Action = bool; 98 constexpr static Action Miss = false; // Word character was skipped. 99 constexpr static Action Match = true; // Matched against a pattern character. 100 101 bool init(llvm::StringRef Word); 102 void buildGraph(); 103 bool allowMatch(int P, int W, Action Last) const; 104 int skipPenalty(int W, Action Last) const; 105 int matchBonus(int P, int W, Action Last) const; 106 107 // Pattern data is initialized by the constructor, then constant. 108 char Pat[MaxPat]; // Pattern data 109 int PatN; // Length 110 char LowPat[MaxPat]; // Pattern in lowercase 111 CharRole PatRole[MaxPat]; // Pattern segmentation info 112 CharTypeSet PatTypeSet; // Bitmask of 1<<CharType for all Pattern characters 113 float ScoreScale; // Normalizes scores for the pattern length. 114 115 // Word data is initialized on each call to match(), mostly by init(). 116 char Word[MaxWord]; // Word data 117 int WordN; // Length 118 char LowWord[MaxWord]; // Word in lowercase 119 CharRole WordRole[MaxWord]; // Word segmentation info 120 CharTypeSet WordTypeSet; // Bitmask of 1<<CharType for all Word characters 121 bool WordContainsPattern; // Simple substring check 122 123 // Cumulative best-match score table. 124 // Boundary conditions are filled in by the constructor. 125 // The rest is repopulated for each match(), by buildGraph(). 126 struct ScoreInfo { 127 signed int Score : 15; 128 Action Prev : 1; 129 }; 130 ScoreInfo Scores[MaxPat + 1][MaxWord + 1][/* Last Action */ 2]; 131 }; 132 133 } // namespace clangd 134 } // namespace clang 135 136 #endif 137