1 //===--- SourceCode.h - Manipulating source code as strings -----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Various code that examines C++ source code without using heavy AST machinery 10 // (and often not even the lexer). To be used sparingly! 11 // 12 //===----------------------------------------------------------------------===// 13 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H 14 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SOURCECODE_H 15 16 #include "Protocol.h" 17 #include "support/Context.h" 18 #include "support/ThreadsafeFS.h" 19 #include "clang/Basic/Diagnostic.h" 20 #include "clang/Basic/LangOptions.h" 21 #include "clang/Basic/SourceLocation.h" 22 #include "clang/Basic/SourceManager.h" 23 #include "clang/Format/Format.h" 24 #include "clang/Tooling/Core/Replacement.h" 25 #include "clang/Tooling/Syntax/Tokens.h" 26 #include "llvm/ADT/StringRef.h" 27 #include "llvm/ADT/StringSet.h" 28 #include "llvm/Support/Error.h" 29 #include "llvm/Support/SHA1.h" 30 #include <string> 31 32 namespace clang { 33 class SourceManager; 34 35 namespace clangd { 36 37 // We tend to generate digests for source codes in a lot of different places. 38 // This represents the type for those digests to prevent us hard coding details 39 // of hashing function at every place that needs to store this information. 40 using FileDigest = std::array<uint8_t, 8>; 41 FileDigest digest(StringRef Content); 42 Optional<FileDigest> digestFile(const SourceManager &SM, FileID FID); 43 44 // This context variable controls the behavior of functions in this file 45 // that convert between LSP offsets and native clang byte offsets. 46 // If not set, defaults to UTF-16 for backwards-compatibility. 47 extern Key<OffsetEncoding> kCurrentOffsetEncoding; 48 49 // Counts the number of UTF-16 code units needed to represent a string (LSP 50 // specifies string lengths in UTF-16 code units). 51 // Use of UTF-16 may be overridden by kCurrentOffsetEncoding. 52 size_t lspLength(StringRef Code); 53 54 /// Turn a [line, column] pair into an offset in Code. 55 /// 56 /// If P.character exceeds the line length, returns the offset at end-of-line. 57 /// (If !AllowColumnsBeyondLineLength, then returns an error instead). 58 /// If the line number is out of range, returns an error. 59 /// 60 /// The returned value is in the range [0, Code.size()]. 61 llvm::Expected<size_t> 62 positionToOffset(llvm::StringRef Code, Position P, 63 bool AllowColumnsBeyondLineLength = true); 64 65 /// Turn an offset in Code into a [line, column] pair. 66 /// The offset must be in range [0, Code.size()]. 67 Position offsetToPosition(llvm::StringRef Code, size_t Offset); 68 69 /// Turn a SourceLocation into a [line, column] pair. 70 /// FIXME: This should return an error if the location is invalid. 71 Position sourceLocToPosition(const SourceManager &SM, SourceLocation Loc); 72 73 /// Return the file location, corresponding to \p P. Note that one should take 74 /// care to avoid comparing the result with expansion locations. 75 llvm::Expected<SourceLocation> sourceLocationInMainFile(const SourceManager &SM, 76 Position P); 77 78 /// Returns true iff \p Loc is inside the main file. This function handles 79 /// file & macro locations. For macro locations, returns iff the macro is being 80 /// expanded inside the main file. 81 /// 82 /// The function is usually used to check whether a declaration is inside the 83 /// the main file. 84 bool isInsideMainFile(SourceLocation Loc, const SourceManager &SM); 85 86 /// Returns the #include location through which IncludedFIle was loaded. 87 /// Where SM.getIncludeLoc() returns the location of the *filename*, which may 88 /// be in a macro, includeHashLoc() returns the location of the #. 89 SourceLocation includeHashLoc(FileID IncludedFile, const SourceManager &SM); 90 91 /// Returns true if the token at Loc is spelled in the source code. 92 /// This is not the case for: 93 /// * symbols formed via macro concatenation, the spelling location will 94 /// be "<scratch space>" 95 /// * symbols controlled and defined by a compile command-line option 96 /// `-DName=foo`, the spelling location will be "<command line>". 97 bool isSpelledInSource(SourceLocation Loc, const SourceManager &SM); 98 99 /// Turns a token range into a half-open range and checks its correctness. 100 /// The resulting range will have only valid source location on both sides, both 101 /// of which are file locations. 102 /// 103 /// File locations always point to a particular offset in a file, i.e. they 104 /// never refer to a location inside a macro expansion. Turning locations from 105 /// macro expansions into file locations is ambiguous - one can use 106 /// SourceManager::{getExpansion|getFile|getSpelling}Loc. This function 107 /// calls SourceManager::getFileLoc on both ends of \p R to do the conversion. 108 /// 109 /// User input (e.g. cursor position) is expressed as a file location, so this 110 /// function can be viewed as a way to normalize the ranges used in the clang 111 /// AST so that they are comparable with ranges coming from the user input. 112 llvm::Optional<SourceRange> toHalfOpenFileRange(const SourceManager &Mgr, 113 const LangOptions &LangOpts, 114 SourceRange R); 115 116 /// Returns true iff all of the following conditions hold: 117 /// - start and end locations are valid, 118 /// - start and end locations are file locations from the same file 119 /// (i.e. expansion locations are not taken into account). 120 /// - start offset <= end offset. 121 /// FIXME: introduce a type for source range with this invariant. 122 bool isValidFileRange(const SourceManager &Mgr, SourceRange R); 123 124 /// Returns the source code covered by the source range. 125 /// EXPECTS: isValidFileRange(R) == true. 126 llvm::StringRef toSourceCode(const SourceManager &SM, SourceRange R); 127 128 // Converts a half-open clang source range to an LSP range. 129 // Note that clang also uses closed source ranges, which this can't handle! 130 Range halfOpenToRange(const SourceManager &SM, CharSourceRange R); 131 132 // Converts an offset to a clang line/column (1-based, columns are bytes). 133 // The offset must be in range [0, Code.size()]. 134 // Prefer to use SourceManager if one is available. 135 std::pair<size_t, size_t> offsetToClangLineColumn(llvm::StringRef Code, 136 size_t Offset); 137 138 /// From "a::b::c", return {"a::b::", "c"}. Scope is empty if there's no 139 /// qualifier. 140 std::pair<llvm::StringRef, llvm::StringRef> 141 splitQualifiedName(llvm::StringRef QName); 142 143 TextEdit replacementToEdit(StringRef Code, const tooling::Replacement &R); 144 145 std::vector<TextEdit> replacementsToEdits(StringRef Code, 146 const tooling::Replacements &Repls); 147 148 TextEdit toTextEdit(const FixItHint &FixIt, const SourceManager &M, 149 const LangOptions &L); 150 151 /// Get the canonical path of \p F. This means: 152 /// 153 /// - Absolute path 154 /// - Symlinks resolved 155 /// - No "." or ".." component 156 /// - No duplicate or trailing directory separator 157 /// 158 /// This function should be used when paths needs to be used outside the 159 /// component that generate it, so that paths are normalized as much as 160 /// possible. 161 llvm::Optional<std::string> getCanonicalPath(const FileEntry *F, 162 const SourceManager &SourceMgr); 163 164 /// Choose the clang-format style we should apply to a certain file. 165 /// This will usually use FS to look for .clang-format directories. 166 /// FIXME: should we be caching the .clang-format file search? 167 /// This uses format::DefaultFormatStyle and format::DefaultFallbackStyle, 168 /// though the latter may have been overridden in main()! 169 format::FormatStyle getFormatStyleForFile(llvm::StringRef File, 170 llvm::StringRef Content, 171 const ThreadsafeFS &TFS); 172 173 /// Cleanup and format the given replacements. 174 llvm::Expected<tooling::Replacements> 175 cleanupAndFormat(StringRef Code, const tooling::Replacements &Replaces, 176 const format::FormatStyle &Style); 177 178 /// A set of edits generated for a single file. Can verify whether it is safe to 179 /// apply these edits to a code block. 180 struct Edit { 181 tooling::Replacements Replacements; 182 std::string InitialCode; 183 184 Edit() = default; 185 EditEdit186 Edit(llvm::StringRef Code, tooling::Replacements Reps) 187 : Replacements(std::move(Reps)), InitialCode(Code) {} 188 189 /// Returns the file contents after changes are applied. 190 llvm::Expected<std::string> apply() const; 191 192 /// Represents Replacements as TextEdits that are available for use in LSP. 193 std::vector<TextEdit> asTextEdits() const; 194 195 /// Checks whether the Replacements are applicable to given Code. 196 bool canApplyTo(llvm::StringRef Code) const; 197 }; 198 /// A mapping from absolute file path (the one used for accessing the underlying 199 /// VFS) to edits. 200 using FileEdits = llvm::StringMap<Edit>; 201 202 /// Formats the edits and code around it according to Style. Changes 203 /// Replacements to formatted ones if succeeds. 204 llvm::Error reformatEdit(Edit &E, const format::FormatStyle &Style); 205 206 /// Collects identifiers with counts in the source code. 207 llvm::StringMap<unsigned> collectIdentifiers(llvm::StringRef Content, 208 const format::FormatStyle &Style); 209 210 /// Collects all ranges of the given identifier in the source code. 211 std::vector<Range> collectIdentifierRanges(llvm::StringRef Identifier, 212 llvm::StringRef Content, 213 const LangOptions &LangOpts); 214 215 /// Collects words from the source code. 216 /// Unlike collectIdentifiers: 217 /// - also finds text in comments: 218 /// - splits text into words 219 /// - drops stopwords like "get" and "for" 220 llvm::StringSet<> collectWords(llvm::StringRef Content); 221 222 // Something that looks like a word in the source code. 223 // Could be a "real" token that's "live" in the AST, a spelled token consumed by 224 // the preprocessor, or part of a spelled token (e.g. word in a comment). 225 struct SpelledWord { 226 // (Spelling) location of the start of the word. 227 SourceLocation Location; 228 // The range of the word itself, excluding any quotes. 229 // This is a subrange of the file buffer. 230 llvm::StringRef Text; 231 // Whether this word is likely to refer to an identifier. True if: 232 // - the word is a spelled identifier token 233 // - Text is identifier-like (e.g. "foo_bar") 234 // - Text is surrounded by backticks (e.g. Foo in "// returns `Foo`") 235 bool LikelyIdentifier = false; 236 // Set if the word is contained in a token spelled in the file. 237 // (This should always be true, but comments aren't retained by TokenBuffer). 238 const syntax::Token *PartOfSpelledToken = nullptr; 239 // Set if the word is exactly a token spelled in the file. 240 const syntax::Token *SpelledToken = nullptr; 241 // Set if the word is a token spelled in the file, and that token survives 242 // preprocessing to emit an expanded token spelled the same way. 243 const syntax::Token *ExpandedToken = nullptr; 244 245 // Find the unique word that contains SpelledLoc or starts/ends there. 246 static llvm::Optional<SpelledWord> touching(SourceLocation SpelledLoc, 247 const syntax::TokenBuffer &TB, 248 const LangOptions &LangOpts); 249 }; 250 251 /// Return true if the \p TokenName is in the list of reversed keywords of the 252 /// language. 253 bool isKeyword(llvm::StringRef TokenName, const LangOptions &LangOpts); 254 255 /// Heuristically determine namespaces visible at a point, without parsing Code. 256 /// This considers using-directives and enclosing namespace-declarations that 257 /// are visible (and not obfuscated) in the file itself (not headers). 258 /// Code should be truncated at the point of interest. 259 /// 260 /// The returned vector is always non-empty. 261 /// - The first element is the namespace that encloses the point: a declaration 262 /// near the point would be within this namespace. 263 /// - The elements are the namespaces in scope at the point: an unqualified 264 /// lookup would search within these namespaces. 265 /// 266 /// Using directives are resolved against all enclosing scopes, but no other 267 /// namespace directives. 268 /// 269 /// example: 270 /// using namespace a; 271 /// namespace foo { 272 /// using namespace b; 273 /// 274 /// visibleNamespaces are {"foo::", "", "a::", "b::", "foo::b::"}, not "a::b::". 275 std::vector<std::string> visibleNamespaces(llvm::StringRef Code, 276 const LangOptions &LangOpts); 277 278 /// Represents locations that can accept a definition. 279 struct EligibleRegion { 280 /// Namespace that owns all of the EligiblePoints, e.g. 281 /// namespace a{ namespace b {^ void foo();^} } 282 /// It will be “a::b” for both carrot locations. 283 std::string EnclosingNamespace; 284 /// Offsets into the code marking eligible points to insert a function 285 /// definition. 286 std::vector<Position> EligiblePoints; 287 }; 288 289 /// Returns most eligible region to insert a definition for \p 290 /// FullyQualifiedName in the \p Code. 291 /// Pseudo parses \pCode under the hood to determine namespace decls and 292 /// possible insertion points. Choses the region that matches the longest prefix 293 /// of \p FullyQualifiedName. Returns EOF if there are no shared namespaces. 294 /// \p FullyQualifiedName should not contain anonymous namespaces. 295 EligibleRegion getEligiblePoints(llvm::StringRef Code, 296 llvm::StringRef FullyQualifiedName, 297 const LangOptions &LangOpts); 298 299 struct DefinedMacro { 300 llvm::StringRef Name; 301 const MacroInfo *Info; 302 /// Location of the identifier that names the macro. 303 /// Unlike Info->Location, this translates preamble-patch locations to 304 /// main-file locations. 305 SourceLocation NameLoc; 306 }; 307 /// Gets the macro referenced by \p SpelledTok. It must be a spelled token 308 /// aligned to the beginning of an identifier. 309 llvm::Optional<DefinedMacro> locateMacroAt(const syntax::Token &SpelledTok, 310 Preprocessor &PP); 311 312 /// Infers whether this is a header from the FileName and LangOpts (if 313 /// presents). 314 bool isHeaderFile(llvm::StringRef FileName, 315 llvm::Optional<LangOptions> LangOpts = llvm::None); 316 317 /// Returns true if the given location is in a generated protobuf file. 318 bool isProtoFile(SourceLocation Loc, const SourceManager &SourceMgr); 319 320 } // namespace clangd 321 } // namespace clang 322 #endif 323