• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "gn/string_utils.h"
6 
7 #include <stddef.h>
8 #include <cctype>
9 
10 #include "base/strings/string_number_conversions.h"
11 #include "gn/err.h"
12 #include "gn/input_file.h"
13 #include "gn/parser.h"
14 #include "gn/scope.h"
15 #include "gn/token.h"
16 #include "gn/tokenizer.h"
17 #include "gn/value.h"
18 
19 namespace {
20 
21 // Constructs an Err indicating a range inside a string. We assume that the
22 // token has quotes around it that are not counted by the offset.
ErrInsideStringToken(const Token & token,size_t offset,size_t size,const std::string & msg,const std::string & help=std::string ())23 Err ErrInsideStringToken(const Token& token,
24                          size_t offset,
25                          size_t size,
26                          const std::string& msg,
27                          const std::string& help = std::string()) {
28   // The "+1" is skipping over the " at the beginning of the token.
29   int int_offset = static_cast<int>(offset);
30   Location begin_loc(token.location().file(), token.location().line_number(),
31                      token.location().column_number() + int_offset + 1,
32                      token.location().byte() + int_offset + 1);
33   Location end_loc(
34       token.location().file(), token.location().line_number(),
35       token.location().column_number() + int_offset + 1 +
36           static_cast<int>(size),
37       token.location().byte() + int_offset + 1 + static_cast<int>(size));
38   return Err(LocationRange(begin_loc, end_loc), msg, help);
39 }
40 
41 // Notes about expression interpolation. This is based loosly on Dart but is
42 // slightly less flexible. In Dart, seeing the ${ in a string is something
43 // the toplevel parser knows about, and it will recurse into the block
44 // treating it as a first-class {...} block. So even things like this work:
45 //   "hello ${"foo}"*2+"bar"}"  =>  "hello foo}foo}bar"
46 // (you can see it did not get confused by the nested strings or the nested "}"
47 // inside the block).
48 //
49 // This is cool but complicates the parser for almost no benefit for this
50 // non-general-purpose programming language. The main reason expressions are
51 // supported here at all are to support "${scope.variable}" and "${list[0]}",
52 // neither of which have any of these edge-cases.
53 //
54 // In this simplified approach, we search for the terminating '}' and execute
55 // the result. This means we can't support any expressions with embedded '}'
56 // or '"'. To keep people from getting confusing about what's supported and
57 // what's not, only identifier and accessor expressions are allowed (neither
58 // of these run into any of these edge-cases).
AppendInterpolatedExpression(Scope * scope,const Token & token,const char * input,size_t begin_offset,size_t end_offset,std::string * output,Err * err)59 bool AppendInterpolatedExpression(Scope* scope,
60                                   const Token& token,
61                                   const char* input,
62                                   size_t begin_offset,
63                                   size_t end_offset,
64                                   std::string* output,
65                                   Err* err) {
66   SourceFile empty_source_file;  // Prevent most vexing parse.
67   InputFile input_file(empty_source_file);
68   input_file.SetContents(
69       std::string(&input[begin_offset], end_offset - begin_offset));
70 
71   // Tokenize.
72   std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err);
73   if (err->has_error()) {
74     // The error will point into our temporary buffer, rewrite it to refer
75     // to the original token. This will make the location information less
76     // precise, but generally there won't be complicated things in string
77     // interpolations.
78     *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
79                                 err->message(), err->help_text());
80     return false;
81   }
82 
83   // Parse.
84   std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err);
85   if (err->has_error()) {
86     // Rewrite error as above.
87     *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
88                                 err->message(), err->help_text());
89     return false;
90   }
91   if (!(node->AsIdentifier() || node->AsAccessor())) {
92     *err = ErrInsideStringToken(
93         token, begin_offset, end_offset - begin_offset,
94         "Invalid string interpolation.",
95         "The thing inside the ${} must be an identifier ${foo},\n"
96         "a scope access ${foo.bar}, or a list access ${foo[0]}.");
97     return false;
98   }
99 
100   // Evaluate.
101   Value result = node->Execute(scope, err);
102   if (err->has_error()) {
103     // Rewrite error as above.
104     *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
105                                 err->message(), err->help_text());
106     return false;
107   }
108 
109   output->append(result.ToString(false));
110   return true;
111 }
112 
AppendInterpolatedIdentifier(Scope * scope,const Token & token,const char * input,size_t begin_offset,size_t end_offset,std::string * output,Err * err)113 bool AppendInterpolatedIdentifier(Scope* scope,
114                                   const Token& token,
115                                   const char* input,
116                                   size_t begin_offset,
117                                   size_t end_offset,
118                                   std::string* output,
119                                   Err* err) {
120   std::string_view identifier(&input[begin_offset], end_offset - begin_offset);
121   const Value* value = scope->GetValue(identifier, true);
122   if (!value) {
123     // We assume the input points inside the token.
124     *err = ErrInsideStringToken(
125         token, identifier.data() - token.value().data() - 1, identifier.size(),
126         "Undefined identifier in string expansion.",
127         std::string("\"") + identifier + "\" is not currently in scope.");
128     return false;
129   }
130 
131   output->append(value->ToString(false));
132   return true;
133 }
134 
135 // Handles string interpolations: $identifier and ${expression}
136 //
137 // |*i| is the index into |input| after the $. This will be updated to point to
138 // the last character consumed on success. The token is the original string
139 // to blame on failure.
140 //
141 // On failure, returns false and sets the error. On success, appends the
142 // result of the interpolation to |*output|.
AppendStringInterpolation(Scope * scope,const Token & token,const char * input,size_t size,size_t * i,std::string * output,Err * err)143 bool AppendStringInterpolation(Scope* scope,
144                                const Token& token,
145                                const char* input,
146                                size_t size,
147                                size_t* i,
148                                std::string* output,
149                                Err* err) {
150   size_t dollars_index = *i - 1;
151 
152   if (input[*i] == '{') {
153     // Bracketed expression.
154     (*i)++;
155     size_t begin_offset = *i;
156 
157     // Find the closing } and check for non-identifier chars. Don't need to
158     // bother checking for the more-restricted first character of an identifier
159     // since the {} unambiguously denotes the range, and identifiers with
160     // invalid names just won't be found later.
161     bool has_non_ident_chars = false;
162     while (*i < size && input[*i] != '}') {
163       has_non_ident_chars |= Tokenizer::IsIdentifierContinuingChar(input[*i]);
164       (*i)++;
165     }
166     if (*i == size) {
167       *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index,
168                                   "Unterminated ${...");
169       return false;
170     }
171 
172     // In the common case, the thing inside the {} will actually be a
173     // simple identifier. Avoid all the complicated parsing of accessors
174     // in this case.
175     if (!has_non_ident_chars) {
176       return AppendInterpolatedIdentifier(scope, token, input, begin_offset, *i,
177                                           output, err);
178     }
179     return AppendInterpolatedExpression(scope, token, input, begin_offset, *i,
180                                         output, err);
181   }
182 
183   // Simple identifier.
184   // The first char of an identifier is more restricted.
185   if (!Tokenizer::IsIdentifierFirstChar(input[*i])) {
186     *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
187                                 "$ not followed by an identifier char.",
188                                 "It you want a literal $ use \"\\$\".");
189     return false;
190   }
191   size_t begin_offset = *i;
192   (*i)++;
193 
194   // Find the first non-identifier char following the string.
195   while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i]))
196     (*i)++;
197   size_t end_offset = *i;
198   (*i)--;  // Back up to mark the last character consumed.
199   return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
200                                       end_offset, output, err);
201 }
202 
203 // Handles a hex literal: $0xFF
204 //
205 // |*i| is the index into |input| after the $. This will be updated to point to
206 // the last character consumed on success. The token is the original string
207 // to blame on failure.
208 //
209 // On failure, returns false and sets the error. On success, appends the
210 // char with the given hex value to |*output|.
AppendHexByte(Scope * scope,const Token & token,const char * input,size_t size,size_t * i,std::string * output,Err * err)211 bool AppendHexByte(Scope* scope,
212                    const Token& token,
213                    const char* input,
214                    size_t size,
215                    size_t* i,
216                    std::string* output,
217                    Err* err) {
218   size_t dollars_index = *i - 1;
219   // "$0" is already known to exist.
220   if (*i + 3 >= size || input[*i + 1] != 'x' || !std::isxdigit(input[*i + 2]) ||
221       !std::isxdigit(input[*i + 3])) {
222     *err = ErrInsideStringToken(
223         token, dollars_index, *i - dollars_index + 1,
224         "Invalid hex character. Hex values must look like 0xFF.");
225     return false;
226   }
227   int value = 0;
228   if (!base::HexStringToInt(std::string_view(&input[*i + 2], 2), &value)) {
229     *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
230                                 "Could not convert hex value.");
231     return false;
232   }
233   *i += 3;
234   output->push_back(value);
235   return true;
236 }
237 
238 }  // namespace
239 
ExpandStringLiteral(Scope * scope,const Token & literal,Value * result,Err * err)240 bool ExpandStringLiteral(Scope* scope,
241                          const Token& literal,
242                          Value* result,
243                          Err* err) {
244   DCHECK(literal.type() == Token::STRING);
245   DCHECK(literal.value().size() > 1);       // Should include quotes.
246   DCHECK(result->type() == Value::STRING);  // Should be already set.
247 
248   // The token includes the surrounding quotes, so strip those off.
249   const char* input = &literal.value().data()[1];
250   size_t size = literal.value().size() - 2;
251 
252   std::string& output = result->string_value();
253   output.reserve(size);
254   for (size_t i = 0; i < size; i++) {
255     if (input[i] == '\\') {
256       if (i < size - 1) {
257         switch (input[i + 1]) {
258           case '\\':
259           case '"':
260           case '$':
261             output.push_back(input[i + 1]);
262             i++;
263             continue;
264           default:  // Everything else has no meaning: pass the literal.
265             break;
266         }
267       }
268       output.push_back(input[i]);
269     } else if (input[i] == '$') {
270       i++;
271       if (i == size) {
272         *err = ErrInsideStringToken(
273             literal, i - 1, 1, "$ at end of string.",
274             "I was expecting an identifier, 0xFF, or {...} after the $.");
275         return false;
276       }
277       if (input[i] == '0') {
278         if (!AppendHexByte(scope, literal, input, size, &i, &output, err))
279           return false;
280       } else if (!AppendStringInterpolation(scope, literal, input, size, &i,
281                                             &output, err))
282         return false;
283     } else {
284       output.push_back(input[i]);
285     }
286   }
287   return true;
288 }
289 
EditDistance(const std::string_view & s1,const std::string_view & s2,size_t max_edit_distance)290 size_t EditDistance(const std::string_view& s1,
291                     const std::string_view& s2,
292                     size_t max_edit_distance) {
293   // The algorithm implemented below is the "classic"
294   // dynamic-programming algorithm for computing the Levenshtein
295   // distance, which is described here:
296   //
297   //   http://en.wikipedia.org/wiki/Levenshtein_distance
298   //
299   // Although the algorithm is typically described using an m x n
300   // array, only one row plus one element are used at a time, so this
301   // implementation just keeps one vector for the row.  To update one entry,
302   // only the entries to the left, top, and top-left are needed.  The left
303   // entry is in row[x-1], the top entry is what's in row[x] from the last
304   // iteration, and the top-left entry is stored in previous.
305   size_t m = s1.size();
306   size_t n = s2.size();
307 
308   std::vector<size_t> row(n + 1);
309   for (size_t i = 1; i <= n; ++i)
310     row[i] = i;
311 
312   for (size_t y = 1; y <= m; ++y) {
313     row[0] = y;
314     size_t best_this_row = row[0];
315 
316     size_t previous = y - 1;
317     for (size_t x = 1; x <= n; ++x) {
318       size_t old_row = row[x];
319       row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u),
320                         std::min(row[x - 1], row[x]) + 1u);
321       previous = old_row;
322       best_this_row = std::min(best_this_row, row[x]);
323     }
324 
325     if (max_edit_distance && best_this_row > max_edit_distance)
326       return max_edit_distance + 1;
327   }
328 
329   return row[n];
330 }
331 
SpellcheckString(const std::string_view & text,const std::vector<std::string_view> & words)332 std::string_view SpellcheckString(const std::string_view& text,
333                                   const std::vector<std::string_view>& words) {
334   const size_t kMaxValidEditDistance = 3u;
335 
336   size_t min_distance = kMaxValidEditDistance + 1u;
337   std::string_view result;
338   for (std::string_view word : words) {
339     size_t distance = EditDistance(word, text, kMaxValidEditDistance);
340     if (distance < min_distance) {
341       min_distance = distance;
342       result = word;
343     }
344   }
345   return result;
346 }
347 
ReadStdin()348 std::string ReadStdin() {
349   char buffer[4 << 10];
350   std::string result;
351   size_t len;
352   while ((len = fread(buffer, 1, sizeof(buffer), stdin)) > 0)
353     result.append(buffer, len);
354   // TODO(thakis): Check ferror(stdin)?
355   return result;
356 }
357