1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "gn/string_utils.h"
6
7 #include <stddef.h>
8 #include <cctype>
9
10 #include "base/strings/string_number_conversions.h"
11 #include "gn/err.h"
12 #include "gn/input_file.h"
13 #include "gn/parser.h"
14 #include "gn/scope.h"
15 #include "gn/token.h"
16 #include "gn/tokenizer.h"
17 #include "gn/value.h"
18
19 namespace {
20
21 // Constructs an Err indicating a range inside a string. We assume that the
22 // token has quotes around it that are not counted by the offset.
ErrInsideStringToken(const Token & token,size_t offset,size_t size,const std::string & msg,const std::string & help=std::string ())23 Err ErrInsideStringToken(const Token& token,
24 size_t offset,
25 size_t size,
26 const std::string& msg,
27 const std::string& help = std::string()) {
28 // The "+1" is skipping over the " at the beginning of the token.
29 int int_offset = static_cast<int>(offset);
30 Location begin_loc(token.location().file(), token.location().line_number(),
31 token.location().column_number() + int_offset + 1);
32 Location end_loc(token.location().file(), token.location().line_number(),
33 token.location().column_number() + int_offset + 1 +
34 static_cast<int>(size));
35 return Err(LocationRange(begin_loc, end_loc), msg, help);
36 }
37
38 // Notes about expression interpolation. This is based loosly on Dart but is
39 // slightly less flexible. In Dart, seeing the ${ in a string is something
40 // the toplevel parser knows about, and it will recurse into the block
41 // treating it as a first-class {...} block. So even things like this work:
42 // "hello ${"foo}"*2+"bar"}" => "hello foo}foo}bar"
43 // (you can see it did not get confused by the nested strings or the nested "}"
44 // inside the block).
45 //
46 // This is cool but complicates the parser for almost no benefit for this
47 // non-general-purpose programming language. The main reason expressions are
48 // supported here at all are to support "${scope.variable}" and "${list[0]}",
49 // neither of which have any of these edge-cases.
50 //
51 // In this simplified approach, we search for the terminating '}' and execute
52 // the result. This means we can't support any expressions with embedded '}'
53 // or '"'. To keep people from getting confusing about what's supported and
54 // what's not, only identifier and accessor expressions are allowed (neither
55 // of these run into any of these edge-cases).
AppendInterpolatedExpression(Scope * scope,const Token & token,const char * input,size_t begin_offset,size_t end_offset,std::string * output,Err * err)56 bool AppendInterpolatedExpression(Scope* scope,
57 const Token& token,
58 const char* input,
59 size_t begin_offset,
60 size_t end_offset,
61 std::string* output,
62 Err* err) {
63 SourceFile empty_source_file; // Prevent most vexing parse.
64 InputFile input_file(empty_source_file);
65 input_file.SetContents(
66 std::string(&input[begin_offset], end_offset - begin_offset));
67
68 // Tokenize.
69 std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err);
70 if (err->has_error()) {
71 // The error will point into our temporary buffer, rewrite it to refer
72 // to the original token. This will make the location information less
73 // precise, but generally there won't be complicated things in string
74 // interpolations.
75 *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
76 err->message(), err->help_text());
77 return false;
78 }
79
80 // Parse.
81 std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err);
82 if (err->has_error()) {
83 // Rewrite error as above.
84 *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
85 err->message(), err->help_text());
86 return false;
87 }
88 if (!(node->AsIdentifier() || node->AsAccessor())) {
89 *err = ErrInsideStringToken(
90 token, begin_offset, end_offset - begin_offset,
91 "Invalid string interpolation.",
92 "The thing inside the ${} must be an identifier ${foo},\n"
93 "a scope access ${foo.bar}, or a list access ${foo[0]}.");
94 return false;
95 }
96
97 // Evaluate.
98 Value result = node->Execute(scope, err);
99 if (err->has_error()) {
100 // Rewrite error as above.
101 *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
102 err->message(), err->help_text());
103 return false;
104 }
105
106 output->append(result.ToString(false));
107 return true;
108 }
109
AppendInterpolatedIdentifier(Scope * scope,const Token & token,const char * input,size_t begin_offset,size_t end_offset,std::string * output,Err * err)110 bool AppendInterpolatedIdentifier(Scope* scope,
111 const Token& token,
112 const char* input,
113 size_t begin_offset,
114 size_t end_offset,
115 std::string* output,
116 Err* err) {
117 std::string_view identifier(&input[begin_offset], end_offset - begin_offset);
118 const Value* value = scope->GetValue(identifier, true);
119 if (!value) {
120 // We assume the input points inside the token.
121 *err = ErrInsideStringToken(
122 token, identifier.data() - token.value().data() - 1, identifier.size(),
123 "Undefined identifier in string expansion.",
124 std::string("\"") + identifier + "\" is not currently in scope.");
125 return false;
126 }
127
128 output->append(value->ToString(false));
129 return true;
130 }
131
132 // Handles string interpolations: $identifier and ${expression}
133 //
134 // |*i| is the index into |input| after the $. This will be updated to point to
135 // the last character consumed on success. The token is the original string
136 // to blame on failure.
137 //
138 // On failure, returns false and sets the error. On success, appends the
139 // result of the interpolation to |*output|.
AppendStringInterpolation(Scope * scope,const Token & token,const char * input,size_t size,size_t * i,std::string * output,Err * err)140 bool AppendStringInterpolation(Scope* scope,
141 const Token& token,
142 const char* input,
143 size_t size,
144 size_t* i,
145 std::string* output,
146 Err* err) {
147 size_t dollars_index = *i - 1;
148
149 if (input[*i] == '{') {
150 // Bracketed expression.
151 (*i)++;
152 size_t begin_offset = *i;
153
154 // Find the closing } and check for non-identifier chars. Don't need to
155 // bother checking for the more-restricted first character of an identifier
156 // since the {} unambiguously denotes the range, and identifiers with
157 // invalid names just won't be found later.
158 bool has_non_ident_chars = false;
159 while (*i < size && input[*i] != '}') {
160 has_non_ident_chars |= Tokenizer::IsIdentifierContinuingChar(input[*i]);
161 (*i)++;
162 }
163 if (*i == size) {
164 *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index,
165 "Unterminated ${...");
166 return false;
167 }
168
169 // In the common case, the thing inside the {} will actually be a
170 // simple identifier. Avoid all the complicated parsing of accessors
171 // in this case.
172 if (!has_non_ident_chars) {
173 return AppendInterpolatedIdentifier(scope, token, input, begin_offset, *i,
174 output, err);
175 }
176 return AppendInterpolatedExpression(scope, token, input, begin_offset, *i,
177 output, err);
178 }
179
180 // Simple identifier.
181 // The first char of an identifier is more restricted.
182 if (!Tokenizer::IsIdentifierFirstChar(input[*i])) {
183 *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
184 "$ not followed by an identifier char.",
185 "It you want a literal $ use \"\\$\".");
186 return false;
187 }
188 size_t begin_offset = *i;
189 (*i)++;
190
191 // Find the first non-identifier char following the string.
192 while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i]))
193 (*i)++;
194 size_t end_offset = *i;
195 (*i)--; // Back up to mark the last character consumed.
196 return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
197 end_offset, output, err);
198 }
199
200 // Handles a hex literal: $0xFF
201 //
202 // |*i| is the index into |input| after the $. This will be updated to point to
203 // the last character consumed on success. The token is the original string
204 // to blame on failure.
205 //
206 // On failure, returns false and sets the error. On success, appends the
207 // char with the given hex value to |*output|.
AppendHexByte(Scope * scope,const Token & token,const char * input,size_t size,size_t * i,std::string * output,Err * err)208 bool AppendHexByte(Scope* scope,
209 const Token& token,
210 const char* input,
211 size_t size,
212 size_t* i,
213 std::string* output,
214 Err* err) {
215 size_t dollars_index = *i - 1;
216 // "$0" is already known to exist.
217 if (*i + 3 >= size || input[*i + 1] != 'x' || !std::isxdigit(input[*i + 2]) ||
218 !std::isxdigit(input[*i + 3])) {
219 *err = ErrInsideStringToken(
220 token, dollars_index, *i - dollars_index + 1,
221 "Invalid hex character. Hex values must look like 0xFF.");
222 return false;
223 }
224 int value = 0;
225 if (!base::HexStringToInt(std::string_view(&input[*i + 2], 2), &value)) {
226 *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
227 "Could not convert hex value.");
228 return false;
229 }
230 *i += 3;
231 output->push_back(value);
232 return true;
233 }
234
235 } // namespace
236
ExpandStringLiteral(Scope * scope,const Token & literal,Value * result,Err * err)237 bool ExpandStringLiteral(Scope* scope,
238 const Token& literal,
239 Value* result,
240 Err* err) {
241 DCHECK(literal.type() == Token::STRING);
242 DCHECK(literal.value().size() > 1); // Should include quotes.
243 DCHECK(result->type() == Value::STRING); // Should be already set.
244
245 // The token includes the surrounding quotes, so strip those off.
246 const char* input = &literal.value().data()[1];
247 size_t size = literal.value().size() - 2;
248
249 std::string& output = result->string_value();
250 output.reserve(size);
251 for (size_t i = 0; i < size; i++) {
252 if (input[i] == '\\') {
253 if (i < size - 1) {
254 switch (input[i + 1]) {
255 case '\\':
256 case '"':
257 case '$':
258 output.push_back(input[i + 1]);
259 i++;
260 continue;
261 default: // Everything else has no meaning: pass the literal.
262 break;
263 }
264 }
265 output.push_back(input[i]);
266 } else if (input[i] == '$') {
267 i++;
268 if (i == size) {
269 *err = ErrInsideStringToken(
270 literal, i - 1, 1, "$ at end of string.",
271 "I was expecting an identifier, 0xFF, or {...} after the $.");
272 return false;
273 }
274 if (input[i] == '0') {
275 if (!AppendHexByte(scope, literal, input, size, &i, &output, err))
276 return false;
277 } else if (!AppendStringInterpolation(scope, literal, input, size, &i,
278 &output, err))
279 return false;
280 } else {
281 output.push_back(input[i]);
282 }
283 }
284 return true;
285 }
286
EditDistance(std::string_view s1,std::string_view s2,size_t max_edit_distance)287 size_t EditDistance(std::string_view s1,
288 std::string_view s2,
289 size_t max_edit_distance) {
290 // The algorithm implemented below is the "classic"
291 // dynamic-programming algorithm for computing the Levenshtein
292 // distance, which is described here:
293 //
294 // http://en.wikipedia.org/wiki/Levenshtein_distance
295 //
296 // Although the algorithm is typically described using an m x n
297 // array, only one row plus one element are used at a time, so this
298 // implementation just keeps one vector for the row. To update one entry,
299 // only the entries to the left, top, and top-left are needed. The left
300 // entry is in row[x-1], the top entry is what's in row[x] from the last
301 // iteration, and the top-left entry is stored in previous.
302 size_t m = s1.size();
303 size_t n = s2.size();
304
305 std::vector<size_t> row(n + 1);
306 for (size_t i = 1; i <= n; ++i)
307 row[i] = i;
308
309 for (size_t y = 1; y <= m; ++y) {
310 row[0] = y;
311 size_t best_this_row = row[0];
312
313 size_t previous = y - 1;
314 for (size_t x = 1; x <= n; ++x) {
315 size_t old_row = row[x];
316 row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u),
317 std::min(row[x - 1], row[x]) + 1u);
318 previous = old_row;
319 best_this_row = std::min(best_this_row, row[x]);
320 }
321
322 if (max_edit_distance && best_this_row > max_edit_distance)
323 return max_edit_distance + 1;
324 }
325
326 return row[n];
327 }
328
SpellcheckString(std::string_view text,const std::vector<std::string_view> & words)329 std::string_view SpellcheckString(std::string_view text,
330 const std::vector<std::string_view>& words) {
331 const size_t kMaxValidEditDistance = 3u;
332
333 size_t min_distance = kMaxValidEditDistance + 1u;
334 std::string_view result;
335 for (std::string_view word : words) {
336 size_t distance = EditDistance(word, text, kMaxValidEditDistance);
337 if (distance < min_distance) {
338 min_distance = distance;
339 result = word;
340 }
341 }
342 return result;
343 }
344
ReadStdin()345 std::string ReadStdin() {
346 char buffer[4 << 10];
347 std::string result;
348 size_t len;
349 while ((len = fread(buffer, 1, sizeof(buffer), stdin)) > 0)
350 result.append(buffer, len);
351 // TODO(thakis): Check ferror(stdin)?
352 return result;
353 }
354