• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 // Create a state machine for validating UTF-8. The algorithm in brief:
11 // 1. Convert the complete unicode range of code points, except for the
12 //    surrogate code points, to an ordered array of sequences of bytes in
13 //    UTF-8.
14 // 2. Convert individual bytes to ranges, starting from the right of each byte
15 //    sequence. For each range, ensure the bytes on the left and the ranges
16 //    on the right are the identical.
17 // 3. Convert the resulting list of ranges into a state machine, collapsing
18 //    identical states.
19 // 4. Convert the state machine to an array of bytes.
20 // 5. Output as a C++ file.
21 //
22 // To use:
23 //  $ ninja -C out/Release build_utf8_validator_tables
24 //  $ out/Release/build_utf8_validator_tables
25 //                                   --output=base/i18n/utf8_validator_tables.cc
26 //  $ git add base/i18n/utf8_validator_tables.cc
27 //
28 // Because the table is not expected to ever change, it is checked into the
29 // repository rather than being regenerated at build time.
30 //
31 // This code uses type uint8_t throughout to represent bytes, to avoid
32 // signed/unsigned char confusion.
33 
34 #include <stddef.h>
35 #include <stdint.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 
40 #include <algorithm>
41 #include <map>
42 #include <string>
43 #include <vector>
44 
45 #include "base/command_line.h"
46 #include "base/files/file_path.h"
47 #include "base/files/file_util.h"
48 #include "base/logging.h"
49 #include "base/memory/raw_ptr.h"
50 #include "base/numerics/safe_conversions.h"
51 #include "base/strings/stringprintf.h"
52 #include "third_party/icu/source/common/unicode/utf8.h"
53 
54 namespace {
55 
56 const char kHelpText[] =
57     "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";
58 
59 const char kProlog[] =
60     "// Copyright 2013 The Chromium Authors\n"
61     "// Use of this source code is governed by a BSD-style license that can "
62     "be\n"
63     "// found in the LICENSE file.\n"
64     "\n"
65     "// This file is auto-generated by build_utf8_validator_tables.\n"
66     "// DO NOT EDIT.\n"
67     "\n"
68     "#include \"base/i18n/utf8_validator_tables.h\"\n"
69     "\n"
70     "namespace base {\n"
71     "namespace internal {\n"
72     "\n"
73     "const uint8_t kUtf8ValidatorTables[] = {\n";
74 
75 const char kEpilog[] =
76     "};\n"
77     "\n"
78     "const size_t kUtf8ValidatorTablesSize = "
79     "std::size(kUtf8ValidatorTables);\n"
80     "\n"
81     "}  // namespace internal\n"
82     "}  // namespace base\n";
83 
84 // Ranges are inclusive at both ends--they represent [from, to]
85 class Range {
86  public:
87   // Ranges always start with just one byte.
Range(uint8_t value)88   explicit Range(uint8_t value) : from_(value), to_(value) {}
89 
90   // Range objects are copyable and assignable to be used in STL
91   // containers. Since they only contain non-pointer POD types, the default copy
92   // constructor, assignment operator and destructor will work.
93 
94   // Add a byte to the range. We intentionally only support adding a byte at the
95   // end, since that is the only operation the code needs.
AddByte(uint8_t to)96   void AddByte(uint8_t to) {
97     CHECK(to == to_ + 1);
98     to_ = to;
99   }
100 
from() const101   uint8_t from() const { return from_; }
to() const102   uint8_t to() const { return to_; }
103 
operator <(const Range & rhs) const104   bool operator<(const Range& rhs) const {
105     return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));
106   }
107 
operator ==(const Range & rhs) const108   bool operator==(const Range& rhs) const {
109     return from() == rhs.from() && to() == rhs.to();
110   }
111 
112  private:
113   uint8_t from_;
114   uint8_t to_;
115 };
116 
117 // A vector of Ranges is like a simple regular expression--it corresponds to
118 // a set of strings of the same length that have bytes in each position in
119 // the appropriate range.
120 typedef std::vector<Range> StringSet;
121 
122 // A UTF-8 "character" is represented by a sequence of bytes.
123 typedef std::vector<uint8_t> Character;
124 
125 // In the second stage of the algorithm, we want to convert a large list of
126 // Characters into a small list of StringSets.
127 struct Pair {
128   Character character;
129   StringSet set;
130 };
131 
132 typedef std::vector<Pair> PairVector;
133 
134 // A class to print a table of numbers in the same style as clang-format.
135 class TablePrinter {
136  public:
TablePrinter(FILE * stream)137   explicit TablePrinter(FILE* stream)
138       : stream_(stream), values_on_this_line_(0), current_offset_(0) {}
139 
140   TablePrinter(const TablePrinter&) = delete;
141   TablePrinter& operator=(const TablePrinter&) = delete;
142 
PrintValue(uint8_t value)143   void PrintValue(uint8_t value) {
144     if (values_on_this_line_ == 0) {
145       fputs("   ", stream_);
146     } else if (values_on_this_line_ == kMaxValuesPerLine) {
147       fprintf(stream_.get(), "  // 0x%02x\n   ", current_offset_);
148       values_on_this_line_ = 0;
149     }
150     fprintf(stream_.get(), " 0x%02x,", static_cast<int>(value));
151     ++values_on_this_line_;
152     ++current_offset_;
153   }
154 
NewLine()155   void NewLine() {
156     while (values_on_this_line_ < kMaxValuesPerLine) {
157       fputs("      ", stream_);
158       ++values_on_this_line_;
159     }
160     fprintf(stream_.get(), "  // 0x%02x\n", current_offset_);
161     values_on_this_line_ = 0;
162   }
163 
164  private:
165   // stdio stream. Not owned.
166   raw_ptr<FILE> stream_;
167 
168   // Number of values so far printed on this line.
169   int values_on_this_line_;
170 
171   // Total values printed so far.
172   int current_offset_;
173 
174   static const int kMaxValuesPerLine = 8;
175 };
176 
177 // Start by filling a PairVector with characters. The resulting vector goes from
178 // "\x00" to "\xf4\x8f\xbf\xbf".
InitializeCharacters()179 PairVector InitializeCharacters() {
180   PairVector vector;
181   for (int i = 0; i <= 0x10FFFF; ++i) {
182     if (i >= 0xD800 && i < 0xE000) {
183       // Surrogate codepoints are not permitted. Non-character code points are
184       // explicitly permitted.
185       continue;
186     }
187     uint8_t bytes[4];
188     unsigned int offset = 0;
189     UBool is_error = false;
190     U8_APPEND(bytes, offset, std::size(bytes), i, is_error);
191     DCHECK(!is_error);
192     DCHECK_GT(offset, 0u);
193     DCHECK_LE(offset, std::size(bytes));
194     Pair pair = {Character(bytes, bytes + offset), StringSet()};
195     vector.push_back(pair);
196   }
197   return vector;
198 }
199 
200 // Construct a new Pair from |character| and the concatenation of |new_range|
201 // and |existing_set|, and append it to |pairs|.
ConstructPairAndAppend(const Character & character,const Range & new_range,const StringSet & existing_set,PairVector * pairs)202 void ConstructPairAndAppend(const Character& character,
203                             const Range& new_range,
204                             const StringSet& existing_set,
205                             PairVector* pairs) {
206   Pair new_pair = {character, StringSet(1, new_range)};
207   new_pair.set.insert(
208       new_pair.set.end(), existing_set.begin(), existing_set.end());
209   pairs->push_back(new_pair);
210 }
211 
212 // Each pass over the PairVector strips one byte off the right-hand-side of the
213 // characters and adds a range to the set on the right. For example, the first
214 // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
215 // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
216 // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
217 // [\xa0-\xbf][\x80-\xbf]).
MoveRightMostCharToSet(PairVector * pairs)218 void MoveRightMostCharToSet(PairVector* pairs) {
219   PairVector new_pairs;
220   PairVector::const_iterator it = pairs->begin();
221   while (it != pairs->end() && it->character.empty()) {
222     new_pairs.push_back(*it);
223     ++it;
224   }
225   CHECK(it != pairs->end());
226   Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
227   Range new_range(it->character.back());
228   StringSet converted = it->set;
229   ++it;
230   while (it != pairs->end()) {
231     const Pair& current_pair = *it++;
232     if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
233         std::equal(unconverted_bytes.begin(),
234                    unconverted_bytes.end(),
235                    current_pair.character.begin()) &&
236         converted == current_pair.set) {
237       // The particular set of UTF-8 codepoints we are validating guarantees
238       // that each byte range will be contiguous. This would not necessarily be
239       // true for an arbitrary set of UTF-8 codepoints.
240       DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
241       new_range.AddByte(current_pair.character.back());
242       continue;
243     }
244     ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
245     unconverted_bytes = Character(current_pair.character.begin(),
246                                   current_pair.character.end() - 1);
247     new_range = Range(current_pair.character.back());
248     converted = current_pair.set;
249   }
250   ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
251   new_pairs.swap(*pairs);
252 }
253 
MoveAllCharsToSets(PairVector * pairs)254 void MoveAllCharsToSets(PairVector* pairs) {
255   // Since each pass of the function moves one character, and UTF-8 sequences
256   // are at most 4 characters long, this simply runs the algorithm four times.
257   for (int i = 0; i < 4; ++i) {
258     MoveRightMostCharToSet(pairs);
259   }
260 #if DCHECK_IS_ON()
261   for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
262        ++it) {
263     DCHECK(it->character.empty());
264   }
265 #endif
266 }
267 
268 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
269 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
270 // algorithm is working. Use the command-line option
271 // --vmodule=build_utf8_validator_tables=1 to see this output.
LogStringSets(const PairVector & pairs)272 void LogStringSets(const PairVector& pairs) {
273   for (const auto& pair_it : pairs) {
274     std::string set_as_string;
275     for (auto set_it = pair_it.set.begin(); set_it != pair_it.set.end();
276          ++set_it) {
277       set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
278                                           static_cast<int>(set_it->from()),
279                                           static_cast<int>(set_it->to()));
280     }
281     VLOG(1) << set_as_string;
282   }
283 }
284 
285 // A single state in the state machine is represented by a sorted vector of
286 // start bytes and target states. All input bytes in the range between the start
287 // byte and the next entry in the vector (or 0xFF) result in a transition to the
288 // target state.
289 struct StateRange {
290   uint8_t from;
291   uint8_t target_state;
292 };
293 
294 typedef std::vector<StateRange> State;
295 
296 // Generates a state where all bytes go to state 1 (invalid). This is also used
297 // as an initialiser for other states (since bytes from outside the desired
298 // range are invalid).
GenerateInvalidState()299 State GenerateInvalidState() {
300   const StateRange range = {0, 1};
301   return State(1, range);
302 }
303 
304 // A map from a state (ie. a set of strings which will match from this state) to
305 // a number (which is an index into the array of states).
306 typedef std::map<StringSet, uint8_t> StateMap;
307 
308 // Create a new state corresponding to |set|, add it |states| and |state_map|
309 // and return the index it was given in |states|.
MakeState(const StringSet & set,std::vector<State> * states,StateMap * state_map)310 uint8_t MakeState(const StringSet& set,
311                   std::vector<State>* states,
312                   StateMap* state_map) {
313   DCHECK(!set.empty());
314   const Range& range = set.front();
315   const StringSet rest(set.begin() + 1, set.end());
316   const StateMap::const_iterator where = state_map->find(rest);
317   const uint8_t target_state = where == state_map->end()
318                                    ? MakeState(rest, states, state_map)
319                                    : where->second;
320   DCHECK_LT(0, range.from());
321   DCHECK_LT(range.to(), 0xFF);
322   const StateRange new_state_initializer[] = {
323       {0, 1},
324       {range.from(), target_state},
325       {static_cast<uint8_t>(range.to() + 1), 1}};
326   states->push_back(
327       State(new_state_initializer,
328             new_state_initializer + std::size(new_state_initializer)));
329   const uint8_t new_state_number =
330       base::checked_cast<uint8_t>(states->size() - 1);
331   CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
332   return new_state_number;
333 }
334 
GenerateStates(const PairVector & pairs)335 std::vector<State> GenerateStates(const PairVector& pairs) {
336   // States 0 and 1 are the initial/valid state and invalid state, respectively.
337   std::vector<State> states(2, GenerateInvalidState());
338   StateMap state_map;
339   state_map.insert(std::make_pair(StringSet(), 0));
340   for (auto it = pairs.begin(); it != pairs.end(); ++it) {
341     DCHECK(it->character.empty());
342     DCHECK(!it->set.empty());
343     const Range& range = it->set.front();
344     const StringSet rest(it->set.begin() + 1, it->set.end());
345     const StateMap::const_iterator where = state_map.find(rest);
346     const uint8_t target_state = where == state_map.end()
347                                      ? MakeState(rest, &states, &state_map)
348                                      : where->second;
349     if (states[0].back().from == range.from()) {
350       DCHECK_EQ(1, states[0].back().target_state);
351       states[0].back().target_state = target_state;
352       DCHECK_LT(range.to(), 0xFF);
353       const StateRange new_range = {static_cast<uint8_t>(range.to() + 1), 1};
354       states[0].push_back(new_range);
355     } else {
356       DCHECK_LT(range.to(), 0xFF);
357       const StateRange new_range_initializer[] = {
358           {range.from(), target_state},
359           {static_cast<uint8_t>(range.to() + 1), 1}};
360       states[0].insert(
361           states[0].end(), new_range_initializer,
362           new_range_initializer + std::size(new_range_initializer));
363     }
364   }
365   return states;
366 }
367 
368 // Output the generated states as a C++ table. Two tricks are used to compact
369 // the table: each state in the table starts with a shift value which indicates
370 // how many bits we can discard from the right-hand-side of the byte before
371 // doing the table lookup. Secondly, only the state-transitions for bytes
372 // with the top-bit set are included in the table; bytes without the top-bit set
373 // are just ASCII and are handled directly by the code.
PrintStates(const std::vector<State> & states,FILE * stream)374 void PrintStates(const std::vector<State>& states, FILE* stream) {
375   // First calculate the start-offset of each state. This allows the state
376   // machine to jump directly to the correct offset, avoiding an extra
377   // indirection. State 0 starts at offset 0.
378   std::vector<uint8_t> state_offset(1, 0);
379   std::vector<uint8_t> shifts;
380   uint8_t pos = 0;
381 
382   for (const auto& state_it : states) {
383     // We want to set |shift| to the (0-based) index of the least-significant
384     // set bit in any of the ranges for this state, since this tells us how many
385     // bits we can discard and still determine what range a byte lies in. Sadly
386     // it appears that ffs() is not portable, so we do it clumsily.
387     uint8_t shift = 7;
388     for (auto range_it = state_it.begin(); range_it != state_it.end();
389          ++range_it) {
390       while (shift > 0 && range_it->from % (1 << shift) != 0) {
391         --shift;
392       }
393     }
394     shifts.push_back(shift);
395     pos += 1 + (1 << (7 - shift));
396     state_offset.push_back(pos);
397   }
398 
399   DCHECK_EQ(129, state_offset[1]);
400 
401   fputs(kProlog, stream);
402   TablePrinter table_printer(stream);
403 
404   for (uint8_t state_index = 0; state_index < states.size(); ++state_index) {
405     const uint8_t shift = shifts[state_index];
406     uint8_t next_range = 0;
407     uint8_t target_state = 1;
408     fprintf(stream,
409             "    // State %d, offset 0x%02x\n",
410             static_cast<int>(state_index),
411             static_cast<int>(state_offset[state_index]));
412     table_printer.PrintValue(shift);
413     for (int i = 0; i < 0x100; i += (1 << shift)) {
414       if (next_range < states[state_index].size() &&
415           states[state_index][next_range].from == i) {
416         target_state = states[state_index][next_range].target_state;
417         ++next_range;
418       }
419       if (i >= 0x80) {
420         table_printer.PrintValue(state_offset[target_state]);
421       }
422     }
423     table_printer.NewLine();
424   }
425 
426   fputs(kEpilog, stream);
427 }
428 
429 }  // namespace
430 
main(int argc,char * argv[])431 int main(int argc, char* argv[]) {
432   base::CommandLine::Init(argc, argv);
433   logging::LoggingSettings settings;
434   settings.logging_dest =
435       logging::LOG_TO_SYSTEM_DEBUG_LOG | logging::LOG_TO_STDERR;
436   logging::InitLogging(settings);
437   if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {
438     fwrite(kHelpText, 1, std::size(kHelpText), stdout);
439     exit(EXIT_SUCCESS);
440   }
441   base::FilePath filename =
442       base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");
443 
444   FILE* output = stdout;
445   if (!filename.empty()) {
446     output = base::OpenFile(filename, "wb");
447     if (!output)
448       PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
449                   << "' for writing";
450   }
451 
452   // Step 1: Enumerate the characters
453   PairVector pairs = InitializeCharacters();
454   // Step 2: Convert to sets.
455   MoveAllCharsToSets(&pairs);
456   if (VLOG_IS_ON(1)) {
457     LogStringSets(pairs);
458   }
459   // Step 3: Generate states.
460   std::vector<State> states = GenerateStates(pairs);
461   // Step 4/5: Print output
462   PrintStates(states, output);
463 
464   if (!filename.empty()) {
465     if (!base::CloseFile(output))
466       PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
467                   << "'";
468   }
469 
470   return EXIT_SUCCESS;
471 }
472