• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "gn/string_utils.h"
6 
7 #include <stddef.h>
8 #include <cctype>
9 
10 #include "base/strings/string_number_conversions.h"
11 #include "gn/err.h"
12 #include "gn/input_file.h"
13 #include "gn/parser.h"
14 #include "gn/scope.h"
15 #include "gn/token.h"
16 #include "gn/tokenizer.h"
17 #include "gn/value.h"
18 
19 namespace {
20 
21 // Constructs an Err indicating a range inside a string. We assume that the
22 // token has quotes around it that are not counted by the offset.
ErrInsideStringToken(const Token & token,size_t offset,size_t size,const std::string & msg,const std::string & help=std::string ())23 Err ErrInsideStringToken(const Token& token,
24                          size_t offset,
25                          size_t size,
26                          const std::string& msg,
27                          const std::string& help = std::string()) {
28   // The "+1" is skipping over the " at the beginning of the token.
29   int int_offset = static_cast<int>(offset);
30   Location begin_loc(token.location().file(), token.location().line_number(),
31                      token.location().column_number() + int_offset + 1);
32   Location end_loc(token.location().file(), token.location().line_number(),
33                    token.location().column_number() + int_offset + 1 +
34                        static_cast<int>(size));
35   return Err(LocationRange(begin_loc, end_loc), msg, help);
36 }
37 
38 // Notes about expression interpolation. This is based loosly on Dart but is
39 // slightly less flexible. In Dart, seeing the ${ in a string is something
40 // the toplevel parser knows about, and it will recurse into the block
41 // treating it as a first-class {...} block. So even things like this work:
42 //   "hello ${"foo}"*2+"bar"}"  =>  "hello foo}foo}bar"
43 // (you can see it did not get confused by the nested strings or the nested "}"
44 // inside the block).
45 //
46 // This is cool but complicates the parser for almost no benefit for this
47 // non-general-purpose programming language. The main reason expressions are
48 // supported here at all are to support "${scope.variable}" and "${list[0]}",
49 // neither of which have any of these edge-cases.
50 //
51 // In this simplified approach, we search for the terminating '}' and execute
52 // the result. This means we can't support any expressions with embedded '}'
53 // or '"'. To keep people from getting confusing about what's supported and
54 // what's not, only identifier and accessor expressions are allowed (neither
55 // of these run into any of these edge-cases).
AppendInterpolatedExpression(Scope * scope,const Token & token,const char * input,size_t begin_offset,size_t end_offset,std::string * output,Err * err)56 bool AppendInterpolatedExpression(Scope* scope,
57                                   const Token& token,
58                                   const char* input,
59                                   size_t begin_offset,
60                                   size_t end_offset,
61                                   std::string* output,
62                                   Err* err) {
63   SourceFile empty_source_file;  // Prevent most vexing parse.
64   InputFile input_file(empty_source_file);
65   input_file.SetContents(
66       std::string(&input[begin_offset], end_offset - begin_offset));
67 
68   // Tokenize.
69   std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err);
70   if (err->has_error()) {
71     // The error will point into our temporary buffer, rewrite it to refer
72     // to the original token. This will make the location information less
73     // precise, but generally there won't be complicated things in string
74     // interpolations.
75     *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
76                                 err->message(), err->help_text());
77     return false;
78   }
79 
80   // Parse.
81   std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err);
82   if (err->has_error()) {
83     // Rewrite error as above.
84     *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
85                                 err->message(), err->help_text());
86     return false;
87   }
88   if (!(node->AsIdentifier() || node->AsAccessor())) {
89     *err = ErrInsideStringToken(
90         token, begin_offset, end_offset - begin_offset,
91         "Invalid string interpolation.",
92         "The thing inside the ${} must be an identifier ${foo},\n"
93         "a scope access ${foo.bar}, or a list access ${foo[0]}.");
94     return false;
95   }
96 
97   // Evaluate.
98   Value result = node->Execute(scope, err);
99   if (err->has_error()) {
100     // Rewrite error as above.
101     *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
102                                 err->message(), err->help_text());
103     return false;
104   }
105 
106   output->append(result.ToString(false));
107   return true;
108 }
109 
AppendInterpolatedIdentifier(Scope * scope,const Token & token,const char * input,size_t begin_offset,size_t end_offset,std::string * output,Err * err)110 bool AppendInterpolatedIdentifier(Scope* scope,
111                                   const Token& token,
112                                   const char* input,
113                                   size_t begin_offset,
114                                   size_t end_offset,
115                                   std::string* output,
116                                   Err* err) {
117   std::string_view identifier(&input[begin_offset], end_offset - begin_offset);
118   const Value* value = scope->GetValue(identifier, true);
119   if (!value) {
120     // We assume the input points inside the token.
121     *err = ErrInsideStringToken(
122         token, identifier.data() - token.value().data() - 1, identifier.size(),
123         "Undefined identifier in string expansion.",
124         std::string("\"") + identifier + "\" is not currently in scope.");
125     return false;
126   }
127 
128   output->append(value->ToString(false));
129   return true;
130 }
131 
132 // Handles string interpolations: $identifier and ${expression}
133 //
134 // |*i| is the index into |input| after the $. This will be updated to point to
135 // the last character consumed on success. The token is the original string
136 // to blame on failure.
137 //
138 // On failure, returns false and sets the error. On success, appends the
139 // result of the interpolation to |*output|.
AppendStringInterpolation(Scope * scope,const Token & token,const char * input,size_t size,size_t * i,std::string * output,Err * err)140 bool AppendStringInterpolation(Scope* scope,
141                                const Token& token,
142                                const char* input,
143                                size_t size,
144                                size_t* i,
145                                std::string* output,
146                                Err* err) {
147   size_t dollars_index = *i - 1;
148 
149   if (input[*i] == '{') {
150     // Bracketed expression.
151     (*i)++;
152     size_t begin_offset = *i;
153 
154     // Find the closing } and check for non-identifier chars. Don't need to
155     // bother checking for the more-restricted first character of an identifier
156     // since the {} unambiguously denotes the range, and identifiers with
157     // invalid names just won't be found later.
158     bool has_non_ident_chars = false;
159     while (*i < size && input[*i] != '}') {
160       has_non_ident_chars |= Tokenizer::IsIdentifierContinuingChar(input[*i]);
161       (*i)++;
162     }
163     if (*i == size) {
164       *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index,
165                                   "Unterminated ${...");
166       return false;
167     }
168 
169     // In the common case, the thing inside the {} will actually be a
170     // simple identifier. Avoid all the complicated parsing of accessors
171     // in this case.
172     if (!has_non_ident_chars) {
173       return AppendInterpolatedIdentifier(scope, token, input, begin_offset, *i,
174                                           output, err);
175     }
176     return AppendInterpolatedExpression(scope, token, input, begin_offset, *i,
177                                         output, err);
178   }
179 
180   // Simple identifier.
181   // The first char of an identifier is more restricted.
182   if (!Tokenizer::IsIdentifierFirstChar(input[*i])) {
183     *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
184                                 "$ not followed by an identifier char.",
185                                 "It you want a literal $ use \"\\$\".");
186     return false;
187   }
188   size_t begin_offset = *i;
189   (*i)++;
190 
191   // Find the first non-identifier char following the string.
192   while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i]))
193     (*i)++;
194   size_t end_offset = *i;
195   (*i)--;  // Back up to mark the last character consumed.
196   return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
197                                       end_offset, output, err);
198 }
199 
200 // Handles a hex literal: $0xFF
201 //
202 // |*i| is the index into |input| after the $. This will be updated to point to
203 // the last character consumed on success. The token is the original string
204 // to blame on failure.
205 //
206 // On failure, returns false and sets the error. On success, appends the
207 // char with the given hex value to |*output|.
AppendHexByte(Scope * scope,const Token & token,const char * input,size_t size,size_t * i,std::string * output,Err * err)208 bool AppendHexByte(Scope* scope,
209                    const Token& token,
210                    const char* input,
211                    size_t size,
212                    size_t* i,
213                    std::string* output,
214                    Err* err) {
215   size_t dollars_index = *i - 1;
216   // "$0" is already known to exist.
217   if (*i + 3 >= size || input[*i + 1] != 'x' || !std::isxdigit(input[*i + 2]) ||
218       !std::isxdigit(input[*i + 3])) {
219     *err = ErrInsideStringToken(
220         token, dollars_index, *i - dollars_index + 1,
221         "Invalid hex character. Hex values must look like 0xFF.");
222     return false;
223   }
224   int value = 0;
225   if (!base::HexStringToInt(std::string_view(&input[*i + 2], 2), &value)) {
226     *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
227                                 "Could not convert hex value.");
228     return false;
229   }
230   *i += 3;
231   output->push_back(value);
232   return true;
233 }
234 
235 }  // namespace
236 
ExpandStringLiteral(Scope * scope,const Token & literal,Value * result,Err * err)237 bool ExpandStringLiteral(Scope* scope,
238                          const Token& literal,
239                          Value* result,
240                          Err* err) {
241   DCHECK(literal.type() == Token::STRING);
242   DCHECK(literal.value().size() > 1);       // Should include quotes.
243   DCHECK(result->type() == Value::STRING);  // Should be already set.
244 
245   // The token includes the surrounding quotes, so strip those off.
246   const char* input = &literal.value().data()[1];
247   size_t size = literal.value().size() - 2;
248 
249   std::string& output = result->string_value();
250   output.reserve(size);
251   for (size_t i = 0; i < size; i++) {
252     if (input[i] == '\\') {
253       if (i < size - 1) {
254         switch (input[i + 1]) {
255           case '\\':
256           case '"':
257           case '$':
258             output.push_back(input[i + 1]);
259             i++;
260             continue;
261           default:  // Everything else has no meaning: pass the literal.
262             break;
263         }
264       }
265       output.push_back(input[i]);
266     } else if (input[i] == '$') {
267       i++;
268       if (i == size) {
269         *err = ErrInsideStringToken(
270             literal, i - 1, 1, "$ at end of string.",
271             "I was expecting an identifier, 0xFF, or {...} after the $.");
272         return false;
273       }
274       if (input[i] == '0') {
275         if (!AppendHexByte(scope, literal, input, size, &i, &output, err))
276           return false;
277       } else if (!AppendStringInterpolation(scope, literal, input, size, &i,
278                                             &output, err))
279         return false;
280     } else {
281       output.push_back(input[i]);
282     }
283   }
284   return true;
285 }
286 
EditDistance(std::string_view s1,std::string_view s2,size_t max_edit_distance)287 size_t EditDistance(std::string_view s1,
288                     std::string_view s2,
289                     size_t max_edit_distance) {
290   // The algorithm implemented below is the "classic"
291   // dynamic-programming algorithm for computing the Levenshtein
292   // distance, which is described here:
293   //
294   //   http://en.wikipedia.org/wiki/Levenshtein_distance
295   //
296   // Although the algorithm is typically described using an m x n
297   // array, only one row plus one element are used at a time, so this
298   // implementation just keeps one vector for the row.  To update one entry,
299   // only the entries to the left, top, and top-left are needed.  The left
300   // entry is in row[x-1], the top entry is what's in row[x] from the last
301   // iteration, and the top-left entry is stored in previous.
302   size_t m = s1.size();
303   size_t n = s2.size();
304 
305   std::vector<size_t> row(n + 1);
306   for (size_t i = 1; i <= n; ++i)
307     row[i] = i;
308 
309   for (size_t y = 1; y <= m; ++y) {
310     row[0] = y;
311     size_t best_this_row = row[0];
312 
313     size_t previous = y - 1;
314     for (size_t x = 1; x <= n; ++x) {
315       size_t old_row = row[x];
316       row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u),
317                         std::min(row[x - 1], row[x]) + 1u);
318       previous = old_row;
319       best_this_row = std::min(best_this_row, row[x]);
320     }
321 
322     if (max_edit_distance && best_this_row > max_edit_distance)
323       return max_edit_distance + 1;
324   }
325 
326   return row[n];
327 }
328 
SpellcheckString(std::string_view text,const std::vector<std::string_view> & words)329 std::string_view SpellcheckString(std::string_view text,
330                                   const std::vector<std::string_view>& words) {
331   const size_t kMaxValidEditDistance = 3u;
332 
333   size_t min_distance = kMaxValidEditDistance + 1u;
334   std::string_view result;
335   for (std::string_view word : words) {
336     size_t distance = EditDistance(word, text, kMaxValidEditDistance);
337     if (distance < min_distance) {
338       min_distance = distance;
339       result = word;
340     }
341   }
342   return result;
343 }
344 
ReadStdin()345 std::string ReadStdin() {
346   char buffer[4 << 10];
347   std::string result;
348   size_t len;
349   while ((len = fread(buffer, 1, sizeof(buffer), stdin)) > 0)
350     result.append(buffer, len);
351   // TODO(thakis): Check ferror(stdin)?
352   return result;
353 }
354