naiveproxy/base/i18n/build_utf8_validator_tables.cc

// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Create a state machine for validating UTF-8. The algorithm in brief:
// 1. Convert the complete unicode range of code points, except for the
//    surrogate code points, to an ordered array of sequences of bytes in
//    UTF-8.
// 2. Convert individual bytes to ranges, starting from the right of each byte
//    sequence. For each range, ensure the bytes on the left and the ranges
//    on the right are identical.
// 3. Convert the resulting list of ranges into a state machine, collapsing
//    identical states.
// 4. Convert the state machine to an array of bytes.
// 5. Output as a C++ file.
//
// To use:
//  $ ninja -C out/Release build_utf8_validator_tables
//  $ out/Release/build_utf8_validator_tables
//                                   --output=base/i18n/utf8_validator_tables.cc
//  $ git add base/i18n/utf8_validator_tables.cc
//
// Because the table is not expected to ever change, it is checked into the
// repository rather than being regenerated at build time.
//
// This code uses type uint8_t throughout to represent bytes, to avoid
// signed/unsigned char confusion.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <map>
#include <string>
#include <vector>

#include "base/command_line.h"
#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/logging.h"
#include "base/macros.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/stringprintf.h"
#include "third_party/icu/source/common/unicode/utf8.h"

namespace {

// Usage message written to stdout when --help is given on the command line.
const char kHelpText[] =
    "Usage: build_utf8_validator_tables "
    "[ --help ] [ --output=<file> ]\n";

// Text written at the very top of the generated file, before any table
// values: the license header, the "auto-generated" notice, the #include, the
// namespace openers and the opening of the kUtf8ValidatorTables initializer.
const char kProlog[] =
    // License header.
    "// Copyright 2013 The Chromium Authors. All rights reserved.\n"
    "// Use of this source code is governed by a BSD-style license "
    "that can be\n"
    "// found in the LICENSE file.\n\n"
    // Provenance notice.
    "// This file is auto-generated by build_utf8_validator_tables.\n"
    "// DO NOT EDIT.\n\n"
    // Include and namespaces.
    "#include \"base/i18n/utf8_validator_tables.h\"\n\n"
    "namespace base {\n"
    "namespace internal {\n\n"
    // Start of the table definition; PrintStates() fills in the values.
    "const uint8_t kUtf8ValidatorTables[] = {\n";

// Text written after the last table value: closes the array initializer,
// defines kUtf8ValidatorTablesSize and closes both namespaces.
const char kEpilog[] =
    "};\n\n"
    "const size_t kUtf8ValidatorTablesSize = "
    "arraysize(kUtf8ValidatorTables);\n\n"
    "}  // namespace internal\n"
    "}  // namespace base\n";

// A closed interval of byte values: both |from| and |to| belong to the range.
class Range {
 public:
  // A newly constructed Range covers exactly one byte value.
  explicit Range(uint8_t value) : from_(value), to_(value) {}

  // Copying, assignment and destruction are deliberately left to the
  // compiler-generated versions: the class holds nothing but two plain bytes,
  // and instances need to be storable in STL containers.

  // Grows the range upwards by one byte. |to| has to be exactly one greater
  // than the current upper bound (enforced with a CHECK); growing at the upper
  // end is the only kind of extension the algorithm requires.
  void AddByte(uint8_t to) {
    CHECK(to == to_ + 1);
    to_ = to;
  }

  uint8_t from() const { return from_; }
  uint8_t to() const { return to_; }

  // Lexicographic ordering: by lower bound first, then by upper bound.
  bool operator<(const Range& rhs) const {
    if (from_ != rhs.from_)
      return from_ < rhs.from_;
    return to_ < rhs.to_;
  }

  // Two ranges are equal when both of their bounds match.
  bool operator==(const Range& rhs) const {
    return !(from_ != rhs.from_ || to_ != rhs.to_);
  }

 private:
  uint8_t from_;
  uint8_t to_;
};

// A sequence of Ranges acts as a very simple regular expression: it denotes
// every string of that exact length whose byte at each position lies within
// the Range at the same position.
using StringSet = std::vector<Range>;

// One UTF-8 encoded "character", stored as its sequence of bytes.
using Character = std::vector<uint8_t>;

// Working element for the second stage of the algorithm, which reduces a
// large list of Characters to a small list of StringSets. |character| holds
// the bytes that have not been converted yet; |set| holds the ranges built so
// far from the bytes that were stripped off the right-hand side.
struct Pair {
  Character character;
  StringSet set;
};

using PairVector = std::vector<Pair>;

// Prints a table of byte values in the same style as clang-format: rows of at
// most kMaxValuesPerLine hex literals, each finished row followed by a comment
// giving the number of values printed so far.
class TablePrinter {
 public:
  explicit TablePrinter(FILE* stream)
      : stream_(stream), values_on_this_line_(0), current_offset_(0) {}

  // Appends |value| to the table as " 0xNN,". If the current row is already
  // full it is first terminated with its offset comment and a new row begun.
  void PrintValue(uint8_t value) {
    if (values_on_this_line_ == kMaxValuesPerLine) {
      // Row is full: close it off and indent the next one.
      fprintf(stream_, "  // 0x%02x\n   ", current_offset_);
      values_on_this_line_ = 0;
    } else if (values_on_this_line_ == 0) {
      // First value of a fresh row only needs the indentation.
      fputs("   ", stream_);
    }
    fprintf(stream_, " 0x%02x,", static_cast<int>(value));
    ++current_offset_;
    ++values_on_this_line_;
  }

  // Finishes the current row early. Unused slots are padded with blanks (six
  // characters, the width of one printed value) so that the trailing offset
  // comment lines up with those of the full rows.
  void NewLine() {
    for (; values_on_this_line_ < kMaxValuesPerLine; ++values_on_this_line_) {
      fputs("      ", stream_);
    }
    fprintf(stream_, "  // 0x%02x\n", current_offset_);
    values_on_this_line_ = 0;
  }

 private:
  // Destination stdio stream. Not owned by this object.
  FILE* stream_;

  // How many values have been written on the row in progress.
  int values_on_this_line_;

  // How many values have been written in total.
  int current_offset_;

  static const int kMaxValuesPerLine = 8;

  DISALLOW_COPY_AND_ASSIGN(TablePrinter);
};

// Start by filling a PairVector with characters: one Pair for every code point
// from 0 to 0x10FFFF except the surrogates, in increasing code point order.
// Each Pair holds the UTF-8 encoding of the code point in |character| and an
// empty |set|. The resulting vector goes from "\x00" to "\xf4\x8f\xbf\xbf".
PairVector InitializeCharacters() {
  const int kMaxCodePoint = 0x10FFFF;
  const int kSurrogateBegin = 0xD800;  // First surrogate code point.
  const int kSurrogateEnd = 0xE000;    // One past the last surrogate.

  PairVector vector;
  // The number of elements is known in advance, so allocate once instead of
  // repeatedly reallocating and copying a vector of over a million Pairs.
  vector.reserve((kMaxCodePoint + 1) - (kSurrogateEnd - kSurrogateBegin));
  for (int i = 0; i <= kMaxCodePoint; ++i) {
    if (i >= kSurrogateBegin && i < kSurrogateEnd) {
      // Surrogate codepoints are not permitted. Non-character code points are
      // explicitly permitted.
      continue;
    }
    // U8_APPEND writes the encoding of |i| into |bytes| and advances |offset|
    // by the number of bytes written.
    uint8_t bytes[4];
    unsigned int offset = 0;
    UBool is_error = false;
    U8_APPEND(bytes, offset, arraysize(bytes), i, is_error);
    DCHECK(!is_error);
    DCHECK_GT(offset, 0u);
    DCHECK_LE(offset, arraysize(bytes));
    Pair pair = {Character(bytes, bytes + offset), StringSet()};
    vector.push_back(pair);
  }
  return vector;
}

// Construct a new Pair from |character| and the concatenation of |new_range|
// and |existing_set|, and append it to |pairs|.
void ConstructPairAndAppend(const Character& character,
                            const Range& new_range,
                            const StringSet& existing_set,
                            PairVector* pairs) {
  Pair new_pair = {character, StringSet(1, new_range)};
  new_pair.set.insert(
      new_pair.set.end(), existing_set.begin(), existing_set.end());
  pairs->push_back(new_pair);
}

// Performs one pass over |pairs|, removing the right-most byte from every
// character that still has bytes left and prepending a range for it to the
// set. Consecutive pairs that share the same remaining prefix and the same
// existing set are merged into a single pair whose new range spans all of
// their final bytes. For example, the first pass converts the range from
// "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", [\x80-\xbf]), then the
// second pass converts the range from ("\xe0\xa0", [\x80-\xbf]) to
// ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", [\xa0-\xbf][\x80-\xbf]).
// At least one pair must still have a non-empty character (CHECKed).
void MoveRightMostCharToSet(PairVector* pairs) {
  PairVector result;
  PairVector::const_iterator it = pairs->begin();

  // Leading pairs whose characters are already used up are copied verbatim.
  for (; it != pairs->end() && it->character.empty(); ++it) {
    result.push_back(*it);
  }
  CHECK(it != pairs->end());

  // State of the run being accumulated: the bytes that stay in the character,
  // the range collecting the stripped final bytes, and the set already
  // attached to the pairs in the run.
  Character prefix(it->character.begin(), it->character.end() - 1);
  Range new_range(it->character.back());
  StringSet suffix_set = it->set;

  for (++it; it != pairs->end(); ++it) {
    const Pair& current_pair = *it;
    const bool same_prefix =
        current_pair.character.size() == prefix.size() + 1 &&
        std::equal(prefix.begin(), prefix.end(),
                   current_pair.character.begin());
    if (same_prefix && suffix_set == current_pair.set) {
      // The particular set of UTF-8 codepoints we are validating guarantees
      // that each byte range will be contiguous. This would not necessarily be
      // true for an arbitrary set of UTF-8 codepoints.
      DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
      new_range.AddByte(current_pair.character.back());
    } else {
      // The run has ended: emit it and start a new one at |current_pair|.
      ConstructPairAndAppend(prefix, new_range, suffix_set, &result);
      prefix.assign(current_pair.character.begin(),
                    current_pair.character.end() - 1);
      new_range = Range(current_pair.character.back());
      suffix_set = current_pair.set;
    }
  }
  // Emit the run that was still open when the input ran out.
  ConstructPairAndAppend(prefix, new_range, suffix_set, &result);
  pairs->swap(result);
}

// Runs MoveRightMostCharToSet() until every character in |pairs| has been
// fully converted into ranges.
void MoveAllCharsToSets(PairVector* pairs) {
  // Each pass strips one byte from every character that still has bytes left,
  // and a UTF-8 sequence is at most four bytes long, so four passes suffice.
  int passes_left = 4;
  while (passes_left-- > 0) {
    MoveRightMostCharToSet(pairs);
  }
#if DCHECK_IS_ON()
  // Sanity check: no unconverted bytes may remain.
  for (auto it = pairs->cbegin(); it != pairs->cend(); ++it) {
    DCHECK(it->character.empty());
  }
#endif
}

// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
// algorithm is working. Use the command-line option
// --vmodule=build_utf8_validator_tables=1 to see this output.
void LogStringSets(const PairVector& pairs) {
  for (PairVector::const_iterator pair_it = pairs.begin();
       pair_it != pairs.end();
       ++pair_it) {
    std::string set_as_string;
    for (StringSet::const_iterator set_it = pair_it->set.begin();
         set_it != pair_it->set.end();
         ++set_it) {
      set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
                                          static_cast<int>(set_it->from()),
                                          static_cast<int>(set_it->to()));
    }
    VLOG(1) << set_as_string;
  }
}

// One entry of a state's transition list: every input byte from |from| up to
// (but not including) the |from| of the next entry in the list, or up to 0xFF
// for the last entry, causes a transition to |target_state|.
struct StateRange {
  uint8_t from;
  uint8_t target_state;
};

// A state of the state machine: its StateRange entries, sorted by start byte.
using State = std::vector<StateRange>;

// Returns a state in which every byte leads to state 1 (the invalid state).
// The same value also serves as the starting point for other states, since
// any byte outside a state's accepted range is invalid.
State GenerateInvalidState() {
  State invalid_state;
  const StateRange everything_is_invalid = {0, 1};
  invalid_state.push_back(everything_is_invalid);
  return invalid_state;
}

// Maps a state, identified by the set of strings that will match starting
// from it, to its number, which is an index into the array of states.
using StateMap = std::map<StringSet, uint8_t>;

// Create a new state corresponding to |set|, add it |states| and |state_map|
// and return the index it was given in |states|.
uint8_t MakeState(const StringSet& set,
                  std::vector<State>* states,
                  StateMap* state_map) {
  DCHECK(!set.empty());
  const Range& range = set.front();
  const StringSet rest(set.begin() + 1, set.end());
  const StateMap::const_iterator where = state_map->find(rest);
  const uint8_t target_state = where == state_map->end()
                                   ? MakeState(rest, states, state_map)
                                   : where->second;
  DCHECK_LT(0, range.from());
  DCHECK_LT(range.to(), 0xFF);
  const StateRange new_state_initializer[] = {
      {0, 1},
      {range.from(), target_state},
      {static_cast<uint8_t>(range.to() + 1), 1}};
  states->push_back(
      State(new_state_initializer,
            new_state_initializer + arraysize(new_state_initializer)));
  const uint8_t new_state_number =
      base::checked_cast<uint8_t>(states->size() - 1);
  CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
  return new_state_number;
}

std::vector<State> GenerateStates(const PairVector& pairs) {
  // States 0 and 1 are the initial/valid state and invalid state, respectively.
  std::vector<State> states(2, GenerateInvalidState());
  StateMap state_map;
  state_map.insert(std::make_pair(StringSet(), 0));
  for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) {
    DCHECK(it->character.empty());
    DCHECK(!it->set.empty());
    const Range& range = it->set.front();
    const StringSet rest(it->set.begin() + 1, it->set.end());
    const StateMap::const_iterator where = state_map.find(rest);
    const uint8_t target_state = where == state_map.end()
                                     ? MakeState(rest, &states, &state_map)
                                     : where->second;
    if (states[0].back().from == range.from()) {
      DCHECK_EQ(1, states[0].back().target_state);
      states[0].back().target_state = target_state;
      DCHECK_LT(range.to(), 0xFF);
      const StateRange new_range = {static_cast<uint8_t>(range.to() + 1), 1};
      states[0].push_back(new_range);
    } else {
      DCHECK_LT(range.to(), 0xFF);
      const StateRange new_range_initializer[] = {
          {range.from(), target_state},
          {static_cast<uint8_t>(range.to() + 1), 1}};
      states[0]
          .insert(states[0].end(),
                  new_range_initializer,
                  new_range_initializer + arraysize(new_range_initializer));
    }
  }
  return states;
}

// Output the generated states as a C++ table. Two tricks are used to compact
// the table: each state in the table starts with a shift value which indicates
// how many bits we can discard from the right-hand-side of the byte before
// doing the table lookup. Secondly, only the state-transitions for bytes
// with the top-bit set are included in the table; bytes without the top-bit set
// are just ASCII and are handled directly by the code.
void PrintStates(const std::vector<State>& states, FILE* stream) {
  // First calculate the start-offset of each state. This allows the state
  // machine to jump directly to the correct offset, avoiding an extra
  // indirection. State 0 starts at offset 0.
  std::vector<uint8_t> state_offset(1, 0);
  std::vector<uint8_t> shifts;
  uint8_t pos = 0;

  for (std::vector<State>::const_iterator state_it = states.begin();
       state_it != states.end();
       ++state_it) {
    // We want to set |shift| to the (0-based) index of the least-significant
    // set bit in any of the ranges for this state, since this tells us how many
    // bits we can discard and still determine what range a byte lies in. Sadly
    // it appears that ffs() is not portable, so we do it clumsily.
    uint8_t shift = 7;
    for (State::const_iterator range_it = state_it->begin();
         range_it != state_it->end();
         ++range_it) {
      while (shift > 0 && range_it->from % (1 << shift) != 0) {
        --shift;
      }
    }
    shifts.push_back(shift);
    pos += 1 + (1 << (7 - shift));
    state_offset.push_back(pos);
  }

  DCHECK_EQ(129, state_offset[1]);

  fputs(kProlog, stream);
  TablePrinter table_printer(stream);

  for (uint8_t state_index = 0; state_index < states.size(); ++state_index) {
    const uint8_t shift = shifts[state_index];
    uint8_t next_range = 0;
    uint8_t target_state = 1;
    fprintf(stream,
            "    // State %d, offset 0x%02x\n",
            static_cast<int>(state_index),
            static_cast<int>(state_offset[state_index]));
    table_printer.PrintValue(shift);
    for (int i = 0; i < 0x100; i += (1 << shift)) {
      if (next_range < states[state_index].size() &&
          states[state_index][next_range].from == i) {
        target_state = states[state_index][next_range].target_state;
        ++next_range;
      }
      if (i >= 0x80) {
        table_printer.PrintValue(state_offset[target_state]);
      }
    }
    table_printer.NewLine();
  }

  fputs(kEpilog, stream);
}

}  // namespace

// Entry point. Recognised switches:
//   --help           print the usage text to stdout and exit successfully.
//   --output=<file>  write the generated C++ source to <file>; when absent or
//                    empty the source is written to stdout.
// Failure to open or to close the output file is fatal.
int main(int argc, char* argv[]) {
  base::CommandLine::Init(argc, argv);
  logging::LoggingSettings settings;
  settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
  logging::InitLogging(settings);
  if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {
    // fputs() stops at the terminating NUL. Writing arraysize(kHelpText) bytes
    // with fwrite() would emit the NUL itself as a stray byte on stdout.
    fputs(kHelpText, stdout);
    exit(EXIT_SUCCESS);
  }
  base::FilePath filename =
      base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");

  FILE* output = stdout;
  if (!filename.empty()) {
    output = base::OpenFile(filename, "wb");
    if (!output)
      PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
                  << "' for writing";
  }

  // Step 1: Enumerate the characters
  PairVector pairs = InitializeCharacters();
  // Step 2: Convert to sets.
  MoveAllCharsToSets(&pairs);
  if (VLOG_IS_ON(1)) {
    LogStringSets(pairs);
  }
  // Step 3: Generate states.
  std::vector<State> states = GenerateStates(pairs);
  // Step 4/5: Print output
  PrintStates(states, output);

  // Only a file opened above is closed here; stdout is left alone.
  if (!filename.empty()) {
    if (!base::CloseFile(output))
      PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
                  << "'";
  }

  return EXIT_SUCCESS;
}
Import chromium-64.0.3282.140 2018-02-02 05:49:39 -05:00			`// Copyright 2014 The Chromium Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style license that can be`
			`// found in the LICENSE file.`

			`// Create a state machine for validating UTF-8. The algorithm in brief:`
			`// 1. Convert the complete unicode range of code points, except for the`
			`// surrogate code points, to an ordered array of sequences of bytes in`
			`// UTF-8.`
			`// 2. Convert individual bytes to ranges, starting from the right of each byte`
			`// sequence. For each range, ensure the bytes on the left and the ranges`
			`// on the right are the identical.`
			`// 3. Convert the resulting list of ranges into a state machine, collapsing`
			`// identical states.`
			`// 4. Convert the state machine to an array of bytes.`
			`// 5. Output as a C++ file.`
			`//`
			`// To use:`
			`// $ ninja -C out/Release build_utf8_validator_tables`
			`// $ out/Release/build_utf8_validator_tables`
			`// --output=base/i18n/utf8_validator_tables.cc`
			`// $ git add base/i18n/utf8_validator_tables.cc`
			`//`
			`// Because the table is not expected to ever change, it is checked into the`
			`// repository rather than being regenerated at build time.`
			`//`
			`// This code uses type uint8_t throughout to represent bytes, to avoid`
			`// signed/unsigned char confusion.`

			`#include <stddef.h>`
			`#include <stdint.h>`
			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <string.h>`

			`#include <algorithm>`
			`#include <map>`
			`#include <string>`
			`#include <vector>`

			`#include "base/command_line.h"`
			`#include "base/files/file_path.h"`
			`#include "base/files/file_util.h"`
			`#include "base/logging.h"`
			`#include "base/macros.h"`
			`#include "base/numerics/safe_conversions.h"`
			`#include "base/strings/stringprintf.h"`
			`#include "third_party/icu/source/common/unicode/utf8.h"`

			`namespace {`

			`const char kHelpText[] =`
			`"Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";`

			`const char kProlog[] =`
			`"// Copyright 2013 The Chromium Authors. All rights reserved.\n"`
			`"// Use of this source code is governed by a BSD-style license that can "`
			`"be\n"`
			`"// found in the LICENSE file.\n"`
			`"\n"`
			`"// This file is auto-generated by build_utf8_validator_tables.\n"`
			`"// DO NOT EDIT.\n"`
			`"\n"`
			`"#include \"base/i18n/utf8_validator_tables.h\"\n"`
			`"\n"`
			`"namespace base {\n"`
			`"namespace internal {\n"`
			`"\n"`
			`"const uint8_t kUtf8ValidatorTables[] = {\n";`

			`const char kEpilog[] =`
			`"};\n"`
			`"\n"`
			`"const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n"`
			`"\n"`
			`"} // namespace internal\n"`
			`"} // namespace base\n";`

			`// Ranges are inclusive at both ends--they represent [from, to]`
			`class Range {`
			`public:`
			`// Ranges always start with just one byte.`
			`explicit Range(uint8_t value) : from_(value), to_(value) {}`

			`// Range objects are copyable and assignable to be used in STL`
			`// containers. Since they only contain non-pointer POD types, the default copy`
			`// constructor, assignment operator and destructor will work.`

			`// Add a byte to the range. We intentionally only support adding a byte at the`
			`// end, since that is the only operation the code needs.`
			`void AddByte(uint8_t to) {`
			`CHECK(to == to_ + 1);`
			`to_ = to;`
			`}`

			`uint8_t from() const { return from_; }`
			`uint8_t to() const { return to_; }`

			`bool operator<(const Range& rhs) const {`
			`return (from() < rhs.from() \|\| (from() == rhs.from() && to() < rhs.to()));`
			`}`

			`bool operator==(const Range& rhs) const {`
			`return from() == rhs.from() && to() == rhs.to();`
			`}`

			`private:`
			`uint8_t from_;`
			`uint8_t to_;`
			`};`

			`// A vector of Ranges is like a simple regular expression--it corresponds to`
			`// a set of strings of the same length that have bytes in each position in`
			`// the appropriate range.`
			`typedef std::vector<Range> StringSet;`

			`// A UTF-8 "character" is represented by a sequence of bytes.`
			`typedef std::vector<uint8_t> Character;`

			`// In the second stage of the algorithm, we want to convert a large list of`
			`// Characters into a small list of StringSets.`
			`struct Pair {`
			`Character character;`
			`StringSet set;`
			`};`

			`typedef std::vector<Pair> PairVector;`

			`// A class to print a table of numbers in the same style as clang-format.`
			`class TablePrinter {`
			`public:`
			`explicit TablePrinter(FILE* stream)`
			`: stream_(stream), values_on_this_line_(0), current_offset_(0) {}`

			`void PrintValue(uint8_t value) {`
			`if (values_on_this_line_ == 0) {`
			`fputs(" ", stream_);`
			`} else if (values_on_this_line_ == kMaxValuesPerLine) {`
			`fprintf(stream_, " // 0x%02x\n ", current_offset_);`
			`values_on_this_line_ = 0;`
			`}`
			`fprintf(stream_, " 0x%02x,", static_cast<int>(value));`
			`++values_on_this_line_;`
			`++current_offset_;`
			`}`

			`void NewLine() {`
			`while (values_on_this_line_ < kMaxValuesPerLine) {`
			`fputs(" ", stream_);`
			`++values_on_this_line_;`
			`}`
			`fprintf(stream_, " // 0x%02x\n", current_offset_);`
			`values_on_this_line_ = 0;`
			`}`

			`private:`
			`// stdio stream. Not owned.`
			`FILE* stream_;`

			`// Number of values so far printed on this line.`
			`int values_on_this_line_;`

			`// Total values printed so far.`
			`int current_offset_;`

			`static const int kMaxValuesPerLine = 8;`

			`DISALLOW_COPY_AND_ASSIGN(TablePrinter);`
			`};`

			`// Start by filling a PairVector with characters. The resulting vector goes from`
			`// "\x00" to "\xf4\x8f\xbf\xbf".`
			`PairVector InitializeCharacters() {`
			`PairVector vector;`
			`for (int i = 0; i <= 0x10FFFF; ++i) {`
			`if (i >= 0xD800 && i < 0xE000) {`
			`// Surrogate codepoints are not permitted. Non-character code points are`
			`// explicitly permitted.`
			`continue;`
			`}`
			`uint8_t bytes[4];`
			`unsigned int offset = 0;`
			`UBool is_error = false;`
			`U8_APPEND(bytes, offset, arraysize(bytes), i, is_error);`
			`DCHECK(!is_error);`
			`DCHECK_GT(offset, 0u);`
			`DCHECK_LE(offset, arraysize(bytes));`
			`Pair pair = {Character(bytes, bytes + offset), StringSet()};`
			`vector.push_back(pair);`
			`}`
			`return vector;`
			`}`

			`// Construct a new Pair from \|character\| and the concatenation of \|new_range\|`
			`// and \|existing_set\|, and append it to \|pairs\|.`
			`void ConstructPairAndAppend(const Character& character,`
			`const Range& new_range,`
			`const StringSet& existing_set,`
			`PairVector* pairs) {`
			`Pair new_pair = {character, StringSet(1, new_range)};`
			`new_pair.set.insert(`
			`new_pair.set.end(), existing_set.begin(), existing_set.end());`
			`pairs->push_back(new_pair);`
			`}`

			`// Each pass over the PairVector strips one byte off the right-hand-side of the`
			`// characters and adds a range to the set on the right. For example, the first`
			`// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",`
			`// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",`
			`// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",`
			`// [\xa0-\xbf][\x80-\xbf]).`
			`void MoveRightMostCharToSet(PairVector* pairs) {`
			`PairVector new_pairs;`
			`PairVector::const_iterator it = pairs->begin();`
			`while (it != pairs->end() && it->character.empty()) {`
			`new_pairs.push_back(*it);`
			`++it;`
			`}`
			`CHECK(it != pairs->end());`
			`Character unconverted_bytes(it->character.begin(), it->character.end() - 1);`
			`Range new_range(it->character.back());`
			`StringSet converted = it->set;`
			`++it;`
			`while (it != pairs->end()) {`
			`const Pair& current_pair = *it++;`
			`if (current_pair.character.size() == unconverted_bytes.size() + 1 &&`
			`std::equal(unconverted_bytes.begin(),`
			`unconverted_bytes.end(),`
			`current_pair.character.begin()) &&`
			`converted == current_pair.set) {`
			`// The particular set of UTF-8 codepoints we are validating guarantees`
			`// that each byte range will be contiguous. This would not necessarily be`
			`// true for an arbitrary set of UTF-8 codepoints.`
			`DCHECK_EQ(new_range.to() + 1, current_pair.character.back());`
			`new_range.AddByte(current_pair.character.back());`
			`continue;`
			`}`
			`ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);`
			`unconverted_bytes = Character(current_pair.character.begin(),`
			`current_pair.character.end() - 1);`
			`new_range = Range(current_pair.character.back());`
			`converted = current_pair.set;`
			`}`
			`ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);`
			`new_pairs.swap(*pairs);`
			`}`

			`void MoveAllCharsToSets(PairVector* pairs) {`
			`// Since each pass of the function moves one character, and UTF-8 sequences`
			`// are at most 4 characters long, this simply runs the algorithm four times.`
			`for (int i = 0; i < 4; ++i) {`
			`MoveRightMostCharToSet(pairs);`
			`}`
			`#if DCHECK_IS_ON()`
			`for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();`
			`++it) {`
			`DCHECK(it->character.empty());`
			`}`
			`#endif`
			`}`

			`// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],`
			`// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the`
			`// algorithm is working. Use the command-line option`
			`// --vmodule=build_utf8_validator_tables=1 to see this output.`
			`void LogStringSets(const PairVector& pairs) {`
			`for (PairVector::const_iterator pair_it = pairs.begin();`
			`pair_it != pairs.end();`
			`++pair_it) {`
			`std::string set_as_string;`
			`for (StringSet::const_iterator set_it = pair_it->set.begin();`
			`set_it != pair_it->set.end();`
			`++set_it) {`
			`set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",`
			`static_cast<int>(set_it->from()),`
			`static_cast<int>(set_it->to()));`
			`}`
			`VLOG(1) << set_as_string;`
			`}`
			`}`

			`// A single state in the state machine is represented by a sorted vector of`
			`// start bytes and target states. All input bytes in the range between the start`
			`// byte and the next entry in the vector (or 0xFF) result in a transition to the`
			`// target state.`
			`struct StateRange {`
			`uint8_t from;`
			`uint8_t target_state;`
			`};`

			`typedef std::vector<StateRange> State;`

			`// Generates a state where all bytes go to state 1 (invalid). This is also used`
			`// as an initialiser for other states (since bytes from outside the desired`
			`// range are invalid).`
			`State GenerateInvalidState() {`
			`const StateRange range = {0, 1};`
			`return State(1, range);`
			`}`

			`// A map from a state (ie. a set of strings which will match from this state) to`
			`// a number (which is an index into the array of states).`
			`typedef std::map<StringSet, uint8_t> StateMap;`

			`// Create a new state corresponding to \|set\|, add it \|states\| and \|state_map\|`
			`// and return the index it was given in \|states\|.`
			`uint8_t MakeState(const StringSet& set,`
			`std::vector<State>* states,`
			`StateMap* state_map) {`
			`DCHECK(!set.empty());`
			`const Range& range = set.front();`
			`const StringSet rest(set.begin() + 1, set.end());`
			`const StateMap::const_iterator where = state_map->find(rest);`
			`const uint8_t target_state = where == state_map->end()`
			`? MakeState(rest, states, state_map)`
			`: where->second;`
			`DCHECK_LT(0, range.from());`
			`DCHECK_LT(range.to(), 0xFF);`
			`const StateRange new_state_initializer[] = {`
			`{0, 1},`
			`{range.from(), target_state},`
			`{static_cast<uint8_t>(range.to() + 1), 1}};`
			`states->push_back(`
			`State(new_state_initializer,`
			`new_state_initializer + arraysize(new_state_initializer)));`
			`const uint8_t new_state_number =`
			`base::checked_cast<uint8_t>(states->size() - 1);`
			`CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);`
			`return new_state_number;`
			`}`

			`std::vector<State> GenerateStates(const PairVector& pairs) {`
			`// States 0 and 1 are the initial/valid state and invalid state, respectively.`
			`std::vector<State> states(2, GenerateInvalidState());`
			`StateMap state_map;`
			`state_map.insert(std::make_pair(StringSet(), 0));`
			`for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) {`
			`DCHECK(it->character.empty());`
			`DCHECK(!it->set.empty());`
			`const Range& range = it->set.front();`
			`const StringSet rest(it->set.begin() + 1, it->set.end());`
			`const StateMap::const_iterator where = state_map.find(rest);`
			`const uint8_t target_state = where == state_map.end()`
			`? MakeState(rest, &states, &state_map)`
			`: where->second;`
			`if (states[0].back().from == range.from()) {`
			`DCHECK_EQ(1, states[0].back().target_state);`
			`states[0].back().target_state = target_state;`
			`DCHECK_LT(range.to(), 0xFF);`
			`const StateRange new_range = {static_cast<uint8_t>(range.to() + 1), 1};`
			`states[0].push_back(new_range);`
			`} else {`
			`DCHECK_LT(range.to(), 0xFF);`
			`const StateRange new_range_initializer[] = {`
			`{range.from(), target_state},`
			`{static_cast<uint8_t>(range.to() + 1), 1}};`
			`states[0]`
			`.insert(states[0].end(),`
			`new_range_initializer,`
			`new_range_initializer + arraysize(new_range_initializer));`
			`}`
			`}`
			`return states;`
			`}`

			`// Output the generated states as a C++ table. Two tricks are used to compact`
			`// the table: each state in the table starts with a shift value which indicates`
			`// how many bits we can discard from the right-hand-side of the byte before`
			`// doing the table lookup. Secondly, only the state-transitions for bytes`
			`// with the top-bit set are included in the table; bytes without the top-bit set`
			`// are just ASCII and are handled directly by the code.`
			`void PrintStates(const std::vector<State>& states, FILE* stream) {`
			`// First calculate the start-offset of each state. This allows the state`
			`// machine to jump directly to the correct offset, avoiding an extra`
			`// indirection. State 0 starts at offset 0.`
			`std::vector<uint8_t> state_offset(1, 0);`
			`std::vector<uint8_t> shifts;`
			`uint8_t pos = 0;`

			`for (std::vector<State>::const_iterator state_it = states.begin();`
			`state_it != states.end();`
			`++state_it) {`
			`// We want to set \|shift\| to the (0-based) index of the least-significant`
			`// set bit in any of the ranges for this state, since this tells us how many`
			`// bits we can discard and still determine what range a byte lies in. Sadly`
			`// it appears that ffs() is not portable, so we do it clumsily.`
			`uint8_t shift = 7;`
			`for (State::const_iterator range_it = state_it->begin();`
			`range_it != state_it->end();`
			`++range_it) {`
			`while (shift > 0 && range_it->from % (1 << shift) != 0) {`
			`--shift;`
			`}`
			`}`
			`shifts.push_back(shift);`
			`pos += 1 + (1 << (7 - shift));`
			`state_offset.push_back(pos);`
			`}`

			`DCHECK_EQ(129, state_offset[1]);`

			`fputs(kProlog, stream);`
			`TablePrinter table_printer(stream);`

			`for (uint8_t state_index = 0; state_index < states.size(); ++state_index) {`
			`const uint8_t shift = shifts[state_index];`
			`uint8_t next_range = 0;`
			`uint8_t target_state = 1;`
			`fprintf(stream,`
			`" // State %d, offset 0x%02x\n",`
			`static_cast<int>(state_index),`
			`static_cast<int>(state_offset[state_index]));`
			`table_printer.PrintValue(shift);`
			`for (int i = 0; i < 0x100; i += (1 << shift)) {`
			`if (next_range < states[state_index].size() &&`
			`states[state_index][next_range].from == i) {`
			`target_state = states[state_index][next_range].target_state;`
			`++next_range;`
			`}`
			`if (i >= 0x80) {`
			`table_printer.PrintValue(state_offset[target_state]);`
			`}`
			`}`
			`table_printer.NewLine();`
			`}`

			`fputs(kEpilog, stream);`
			`}`

			`} // namespace`

			`int main(int argc, char* argv[]) {`
			`base::CommandLine::Init(argc, argv);`
			`logging::LoggingSettings settings;`
			`settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;`
			`logging::InitLogging(settings);`
			`if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {`
			`fwrite(kHelpText, 1, arraysize(kHelpText), stdout);`
			`exit(EXIT_SUCCESS);`
			`}`
			`base::FilePath filename =`
			`base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");`

			`FILE* output = stdout;`
			`if (!filename.empty()) {`
			`output = base::OpenFile(filename, "wb");`
			`if (!output)`
			`PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()`
			`<< "' for writing";`
			`}`

			`// Step 1: Enumerate the characters`
			`PairVector pairs = InitializeCharacters();`
			`// Step 2: Convert to sets.`
			`MoveAllCharsToSets(&pairs);`
			`if (VLOG_IS_ON(1)) {`
			`LogStringSets(pairs);`
			`}`
			`// Step 3: Generate states.`
			`std::vector<State> states = GenerateStates(pairs);`
			`// Step 4/5: Print output`
			`PrintStates(states, output);`

			`if (!filename.empty()) {`
			`if (!base::CloseFile(output))`
			`PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()`
			`<< "'";`
			`}`

			`return EXIT_SUCCESS;`
			`}`