naiveproxy/base/strings/utf_offset_string_conversions.cc

// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/utf_offset_string_conversions.h"

#include <stdint.h>

#include <algorithm>
#include <memory>

#include "base/logging.h"
#include "base/strings/string_piece.h"
#include "base/strings/utf_string_conversion_utils.h"

namespace base {

OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
                                       size_t original_length,
                                       size_t output_length)
    : original_offset(original_offset),
      original_length(original_length),
      output_length(output_length) {
}

// static
// Runs AdjustOffset() over every element of |offsets_for_adjustment|,
// rewriting each one in place with the same |adjustments| and |limit|.
// |offsets_for_adjustment| must be non-null (DCHECKed).
void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
                                   std::vector<size_t>* offsets_for_adjustment,
                                   size_t limit) {
  DCHECK(offsets_for_adjustment);
  for (size_t& offset : *offsets_for_adjustment)
    AdjustOffset(adjustments, &offset, limit);
}

// static
void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
                                  size_t* offset,
                                  size_t limit) {
  DCHECK(offset);
  if (*offset == string16::npos)
    return;
  int adjustment = 0;
  for (Adjustments::const_iterator i = adjustments.begin();
       i != adjustments.end(); ++i) {
    if (*offset <= i->original_offset)
      break;
    if (*offset < (i->original_offset + i->original_length)) {
      *offset = string16::npos;
      return;
    }
    adjustment += static_cast<int>(i->original_length - i->output_length);
  }
  *offset -= adjustment;

  if (*offset > limit)
    *offset = string16::npos;
}

// static
// Runs UnadjustOffset() over every element of |offsets_for_unadjustment|,
// rewriting each one in place.  Does nothing when the vector pointer is null
// or when |adjustments| is empty.
void OffsetAdjuster::UnadjustOffsets(
    const Adjustments& adjustments,
    std::vector<size_t>* offsets_for_unadjustment) {
  if (offsets_for_unadjustment == nullptr || adjustments.empty())
    return;
  for (size_t& offset : *offsets_for_unadjustment)
    UnadjustOffset(adjustments, &offset);
}

// static
void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
                                    size_t* offset) {
  if (*offset == string16::npos)
    return;
  int adjustment = 0;
  for (Adjustments::const_iterator i = adjustments.begin();
       i != adjustments.end(); ++i) {
    if (*offset + adjustment <= i->original_offset)
      break;
    adjustment += static_cast<int>(i->original_length - i->output_length);
    if ((*offset + adjustment) <
        (i->original_offset + i->original_length)) {
      *offset = string16::npos;
      return;
    }
  }
  *offset += adjustment;
}

// static
// Combines two sets of adjustments that were made one after the other.
// |first_adjustments| holds offsets relative to the original string;
// |adjustments_on_adjusted_string| is rewritten in place: its existing
// entries have their offsets shifted (and, where a first adjustment falls
// inside them, their original lengths grown), and entries of
// |first_adjustments| that do not fall inside one of them are inserted in
// order.  Both inputs are walked front to back in a single pass, which
// presumes each is sorted by increasing offset.
void OffsetAdjuster::MergeSequentialAdjustments(
    const Adjustments& first_adjustments,
    Adjustments* adjustments_on_adjusted_string) {
  Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
  Adjustments::const_iterator first_iter = first_adjustments.begin();
  // Simultaneously iterate over all |adjustments_on_adjusted_string| and
  // |first_adjustments|, adding adjustments to or correcting the adjustments
  // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the
  // current number of characters collapsed by |first_adjustments| up to this
  // point.  |currently_collapsing| keeps track of the number of characters
  // collapsed by |first_adjustments| into the current |adjusted_iter|'s
  // length.  These are characters that will change |shift| as soon as we're
  // done processing the current |adjusted_iter|; they are not yet reflected in
  // |shift|.
  size_t shift = 0;
  size_t currently_collapsing = 0;
  while (adjusted_iter != adjustments_on_adjusted_string->end()) {
    if ((first_iter == first_adjustments.end()) ||
        ((adjusted_iter->original_offset + shift +
          adjusted_iter->original_length) <= first_iter->original_offset)) {
      // Entire |adjusted_iter| (accounting for its shift and including its
      // whole original length) comes before |first_iter|, or there are no
      // first adjustments left to consider.
      //
      // Correct the offset at |adjusted_iter| and move onto the next
      // adjustment that needs revising.  Anything that was collapsed into
      // this entry now counts toward |shift| for the entries that follow.
      adjusted_iter->original_offset += shift;
      shift += currently_collapsing;
      currently_collapsing = 0;
      ++adjusted_iter;
    } else if ((adjusted_iter->original_offset + shift) >
               first_iter->original_offset) {
      // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).

      // It's not possible for the adjustments to overlap.  (It shouldn't
      // be possible that we have an |adjusted_iter->original_offset| that,
      // when adjusted by the computed |shift|, is in the middle of
      // |first_iter|'s output's length.  After all, that would mean the
      // current adjustment_on_adjusted_string somehow points to an offset
      // that was supposed to have been eliminated by the first set of
      // adjustments.)
      DCHECK_LE(first_iter->original_offset + first_iter->output_length,
                adjusted_iter->original_offset + shift);

      // Add the |first_iter| adjustment to the full set of adjustments while
      // making sure |adjusted_iter| continues pointing to the same element.
      // We do this by inserting a copy of |*first_iter| right before
      // |adjusted_iter|, re-seating |adjusted_iter| on the iterator returned
      // by insert() (the old one may have been invalidated), then
      // incrementing it so it points to the following element again.
      shift += first_iter->original_length - first_iter->output_length;
      adjusted_iter = adjustments_on_adjusted_string->insert(
          adjusted_iter, *first_iter);
      ++adjusted_iter;
      ++first_iter;
    } else {
      // The first adjustment adjusted something that then got further adjusted
      // by the second set of adjustments.  In other words, |first_iter| points
      // to something in the range covered by |adjusted_iter|'s length (after
      // accounting for |shift|).  Precisely (the two branches above having
      // been ruled out),
      //   adjusted_iter->original_offset + shift
      //   <=
      //   first_iter->original_offset
      //   <
      //   adjusted_iter->original_offset + shift +
      //       adjusted_iter->original_length

      // Modify the current |adjusted_iter| to include whatever collapsing
      // happened in |first_iter|, then advance to the next |first_adjustments|
      // because we dealt with the current one.
      const int collapse = static_cast<int>(first_iter->original_length) -
          static_cast<int>(first_iter->output_length);
      // This function does not know how to deal with a string that expands and
      // then gets modified, only strings that collapse and then get modified.
      DCHECK_GT(collapse, 0);
      adjusted_iter->original_length += collapse;
      currently_collapsing += collapse;
      ++first_iter;
    }
  }
  // The loop only exits through the first branch's ++adjusted_iter, which
  // has just folded |currently_collapsing| into |shift| and zeroed it (or
  // the loop never ran at all).
  DCHECK_EQ(0u, currently_collapsing);
  if (first_iter != first_adjustments.end()) {
    // Only first adjustments are left.  These do not need to be modified.
    // (Their offsets are already correct with respect to the original string.)
    // Append them all.
    DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
    adjustments_on_adjusted_string->insert(
        adjustments_on_adjusted_string->end(), first_iter,
        first_adjustments.end());
  }
}

// Transcodes the |src_len| code units at |src| and appends the result to
// |output| via WriteUnicodeCharacter().  Input that ReadUnicodeCharacter()
// rejects is replaced with U+FFFD and makes the function return false;
// conversion still continues to the end of the input.
//
// If |adjustments| is non-null it is cleared first and then receives one
// entry for every character whose number of code units read differs from the
// number written.  Entries are appended in input order, so the list is sorted
// by increasing offset.
template<typename SrcChar, typename DestStdString>
bool ConvertUnicode(const SrcChar* src,
                    size_t src_len,
                    DestStdString* output,
                    OffsetAdjuster::Adjustments* adjustments) {
  if (adjustments)
    adjustments->clear();
  // ICU requires 32-bit numbers.
  const int32_t src_len32 = static_cast<int32_t>(src_len);
  bool all_valid = true;
  for (int32_t i = 0; i < src_len32; i++) {
    const size_t start = i;
    uint32_t code_point;
    const bool decoded = ReadUnicodeCharacter(src, src_len32, &i, &code_point);
    if (!decoded)
      all_valid = false;
    // Undecodable input is emitted as the replacement character.
    const size_t chars_written =
        WriteUnicodeCharacter(decoded ? code_point : 0xFFFD, output);

    // NOTE: ReadUnicodeCharacter() leaves |i| pointing _at_ the last code
    // unit it consumed rather than one past it (the loop's own increment
    // finishes the step), hence the "+ 1" when working out how much was read.
    const size_t chars_read = i - start + 1;

    // An adjustment is only worth recording when the character changed
    // length.
    if (adjustments && (chars_read != chars_written)) {
      adjustments->push_back(
          OffsetAdjuster::Adjustment(start, chars_read, chars_written));
    }
  }
  return all_valid;
}

// Converts the |src_len| bytes at |src| into |output|: first lets
// PrepareForUTF16Or32Output() set up |output|, then runs ConvertUnicode(),
// which fills |adjustments| (if non-null).  Returns ConvertUnicode()'s
// result, i.e. false if any input had to be replaced.
bool UTF8ToUTF16WithAdjustments(
    const char* src,
    size_t src_len,
    string16* output,
    base::OffsetAdjuster::Adjustments* adjustments) {
  PrepareForUTF16Or32Output(src, src_len, output);
  const bool converted_cleanly =
      ConvertUnicode(src, src_len, output, adjustments);
  return converted_cleanly;
}

// Convenience overload: converts |utf8| and returns the resulting string by
// value, forwarding |adjustments| to the pointer/length overload.  The
// success flag of that overload is intentionally not reported here.
string16 UTF8ToUTF16WithAdjustments(
    const base::StringPiece& utf8,
    base::OffsetAdjuster::Adjustments* adjustments) {
  string16 utf16;
  UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &utf16, adjustments);
  return utf16;
}

// Converts |utf8| to UTF-16 and rewrites each entry of
// |offsets_for_adjustment| in place so that it refers to the converted
// string.  Entries that start out beyond the end of |utf8| are set to
// string16::npos before conversion; the rest are mapped by
// OffsetAdjuster::AdjustOffsets().
string16 UTF8ToUTF16AndAdjustOffsets(
    const base::StringPiece& utf8,
    std::vector<size_t>* offsets_for_adjustment) {
  // Invalidate anything that is already out of range for the input.
  const size_t utf8_length = utf8.length();
  for (size_t idx = 0; idx < offsets_for_adjustment->size(); ++idx) {
    if ((*offsets_for_adjustment)[idx] > utf8_length)
      (*offsets_for_adjustment)[idx] = string16::npos;
  }

  OffsetAdjuster::Adjustments adjustments;
  string16 utf16 = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
  OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
  return utf16;
}

// Converts |utf16| to UTF-8 and rewrites each entry of
// |offsets_for_adjustment| in place so that it refers to the converted
// string.  Entries that start out beyond the end of |utf16| are set to
// string16::npos before conversion; the rest are mapped by
// OffsetAdjuster::AdjustOffsets().  The success flag from ConvertUnicode()
// is not reported.
std::string UTF16ToUTF8AndAdjustOffsets(
    const base::StringPiece16& utf16,
    std::vector<size_t>* offsets_for_adjustment) {
  // Invalidate anything that is already out of range for the input.
  const size_t utf16_length = utf16.length();
  for (std::vector<size_t>::iterator it = offsets_for_adjustment->begin();
       it != offsets_for_adjustment->end(); ++it) {
    if (*it > utf16_length)
      *it = string16::npos;
  }

  std::string utf8;
  PrepareForUTF8Output(utf16.data(), utf16.length(), &utf8);
  OffsetAdjuster::Adjustments adjustments;
  ConvertUnicode(utf16.data(), utf16.length(), &utf8, &adjustments);
  OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
  return utf8;
}

}  // namespace base
Import chromium-64.0.3282.119 2018-01-28 21:32:06 +03:00			`// Copyright (c) 2011 The Chromium Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style license that can be`
			`// found in the LICENSE file.`

			`#include "base/strings/utf_offset_string_conversions.h"`

			`#include <stdint.h>`

			`#include <algorithm>`
			`#include <memory>`

			`#include "base/logging.h"`
			`#include "base/strings/string_piece.h"`
			`#include "base/strings/utf_string_conversion_utils.h"`

			`namespace base {`

			`OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,`
			`size_t original_length,`
			`size_t output_length)`
			`: original_offset(original_offset),`
			`original_length(original_length),`
			`output_length(output_length) {`
			`}`

			`// static`
			`void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,`
			`std::vector<size_t>* offsets_for_adjustment,`
			`size_t limit) {`
			`DCHECK(offsets_for_adjustment);`
			`for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());`
			`i != offsets_for_adjustment->end(); ++i)`
			`AdjustOffset(adjustments, &(*i), limit);`
			`}`

			`// static`
			`void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,`
			`size_t* offset,`
			`size_t limit) {`
			`DCHECK(offset);`
			`if (*offset == string16::npos)`
			`return;`
			`int adjustment = 0;`
			`for (Adjustments::const_iterator i = adjustments.begin();`
			`i != adjustments.end(); ++i) {`
			`if (*offset <= i->original_offset)`
			`break;`
			`if (*offset < (i->original_offset + i->original_length)) {`
			`*offset = string16::npos;`
			`return;`
			`}`
			`adjustment += static_cast<int>(i->original_length - i->output_length);`
			`}`
			`*offset -= adjustment;`

			`if (*offset > limit)`
			`*offset = string16::npos;`
			`}`

			`// static`
			`void OffsetAdjuster::UnadjustOffsets(`
			`const Adjustments& adjustments,`
			`std::vector<size_t>* offsets_for_unadjustment) {`
			`if (!offsets_for_unadjustment \|\| adjustments.empty())`
			`return;`
			`for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());`
			`i != offsets_for_unadjustment->end(); ++i)`
			`UnadjustOffset(adjustments, &(*i));`
			`}`

			`// static`
			`void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,`
			`size_t* offset) {`
			`if (*offset == string16::npos)`
			`return;`
			`int adjustment = 0;`
			`for (Adjustments::const_iterator i = adjustments.begin();`
			`i != adjustments.end(); ++i) {`
			`if (*offset + adjustment <= i->original_offset)`
			`break;`
			`adjustment += static_cast<int>(i->original_length - i->output_length);`
			`if ((*offset + adjustment) <`
			`(i->original_offset + i->original_length)) {`
			`*offset = string16::npos;`
			`return;`
			`}`
			`}`
			`*offset += adjustment;`
			`}`

			`// static`
			`void OffsetAdjuster::MergeSequentialAdjustments(`
			`const Adjustments& first_adjustments,`
			`Adjustments* adjustments_on_adjusted_string) {`
			`Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();`
			`Adjustments::const_iterator first_iter = first_adjustments.begin();`
			`// Simultaneously iterate over all \|adjustments_on_adjusted_string\| and`
			`// \|first_adjustments\|, adding adjustments to or correcting the adjustments`
			`// in \|adjustments_on_adjusted_string\| as we go. \|shift\| keeps track of the`
			`// current number of characters collapsed by \|first_adjustments\| up to this`
			`// point. \|currently_collapsing\| keeps track of the number of characters`
			`// collapsed by \|first_adjustments\| into the current \|adjusted_iter\|'s`
			`// length. These are characters that will change \|shift\| as soon as we're`
			`// done processing the current \|adjusted_iter\|; they are not yet reflected in`
			`// \|shift\|.`
			`size_t shift = 0;`
			`size_t currently_collapsing = 0;`
			`while (adjusted_iter != adjustments_on_adjusted_string->end()) {`
			`if ((first_iter == first_adjustments.end()) \|\|`
			`((adjusted_iter->original_offset + shift +`
			`adjusted_iter->original_length) <= first_iter->original_offset)) {`
			`// Entire \|adjusted_iter\| (accounting for its shift and including its`
			`// whole original length) comes before \|first_iter\|.`
			`//`
			`// Correct the offset at \|adjusted_iter\| and move onto the next`
			`// adjustment that needs revising.`
			`adjusted_iter->original_offset += shift;`
			`shift += currently_collapsing;`
			`currently_collapsing = 0;`
			`++adjusted_iter;`
			`} else if ((adjusted_iter->original_offset + shift) >`
			`first_iter->original_offset) {`
			`// \|first_iter\| comes before the \|adjusted_iter\| (as adjusted by \|shift\|).`

			`// It's not possible for the adjustments to overlap. (It shouldn't`
			`// be possible that we have an \|adjusted_iter->original_offset\| that,`
			`// when adjusted by the computed \|shift\|, is in the middle of`
			`// \|first_iter\|'s output's length. After all, that would mean the`
			`// current adjustment_on_adjusted_string somehow points to an offset`
			`// that was supposed to have been eliminated by the first set of`
			`// adjustments.)`
			`DCHECK_LE(first_iter->original_offset + first_iter->output_length,`
			`adjusted_iter->original_offset + shift);`

			`// Add the \|first_adjustment_iter\| to the full set of adjustments while`
			`// making sure \|adjusted_iter\| continues pointing to the same element.`
			`// We do this by inserting the \|first_adjustment_iter\| right before`
			`// \|adjusted_iter\|, then incrementing \|adjusted_iter\| so it points to`
			`// the following element.`
			`shift += first_iter->original_length - first_iter->output_length;`
			`adjusted_iter = adjustments_on_adjusted_string->insert(`
			`adjusted_iter, *first_iter);`
			`++adjusted_iter;`
			`++first_iter;`
			`} else {`
			`// The first adjustment adjusted something that then got further adjusted`
			`// by the second set of adjustments. In other words, \|first_iter\| points`
			`// to something in the range covered by \|adjusted_iter\|'s length (after`
			`// accounting for \|shift\|). Precisely,`
			`// adjusted_iter->original_offset + shift`
			`// <=`
			`// first_iter->original_offset`
			`// <=`
			`// adjusted_iter->original_offset + shift +`
			`// adjusted_iter->original_length`

			`// Modify the current \|adjusted_iter\| to include whatever collapsing`
			`// happened in \|first_iter\|, then advance to the next \|first_adjustments\|`
			`// because we dealt with the current one.`
			`const int collapse = static_cast<int>(first_iter->original_length) -`
			`static_cast<int>(first_iter->output_length);`
			`// This function does not know how to deal with a string that expands and`
			`// then gets modified, only strings that collapse and then get modified.`
			`DCHECK_GT(collapse, 0);`
			`adjusted_iter->original_length += collapse;`
			`currently_collapsing += collapse;`
			`++first_iter;`
			`}`
			`}`
			`DCHECK_EQ(0u, currently_collapsing);`
			`if (first_iter != first_adjustments.end()) {`
			`// Only first adjustments are left. These do not need to be modified.`
			`// (Their offsets are already correct with respect to the original string.)`
			`// Append them all.`
			`DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());`
			`adjustments_on_adjusted_string->insert(`
			`adjustments_on_adjusted_string->end(), first_iter,`
			`first_adjustments.end());`
			`}`
			`}`

			`// Converts the given source Unicode character type to the given destination`
			`// Unicode character type as a STL string. The given input buffer and size`
			`// determine the source, and the given output STL string will be replaced by`
			`// the result. If non-NULL, \|adjustments\| is set to reflect the all the`
			`// alterations to the string that are not one-character-to-one-character.`
			`// It will always be sorted by increasing offset.`
			`template<typename SrcChar, typename DestStdString>`
			`bool ConvertUnicode(const SrcChar* src,`
			`size_t src_len,`
			`DestStdString* output,`
			`OffsetAdjuster::Adjustments* adjustments) {`
			`if (adjustments)`
			`adjustments->clear();`
			`// ICU requires 32-bit numbers.`
			`bool success = true;`
			`int32_t src_len32 = static_cast<int32_t>(src_len);`
			`for (int32_t i = 0; i < src_len32; i++) {`
			`uint32_t code_point;`
			`size_t original_i = i;`
			`size_t chars_written = 0;`
			`if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {`
			`chars_written = WriteUnicodeCharacter(code_point, output);`
			`} else {`
			`chars_written = WriteUnicodeCharacter(0xFFFD, output);`
			`success = false;`
			`}`

			`// Only bother writing an adjustment if this modification changed the`
			`// length of this character.`
			`// NOTE: ReadUnicodeCharacter() adjusts \|i\| to point _at_ the last`
			`// character read, not after it (so that incrementing it in the loop`
			`// increment will place it at the right location), so we need to account`
			`// for that in determining the amount that was read.`
			`if (adjustments && ((i - original_i + 1) != chars_written)) {`
			`adjustments->push_back(OffsetAdjuster::Adjustment(`
			`original_i, i - original_i + 1, chars_written));`
			`}`
			`}`
			`return success;`
			`}`

			`bool UTF8ToUTF16WithAdjustments(`
			`const char* src,`
			`size_t src_len,`
			`string16* output,`
			`base::OffsetAdjuster::Adjustments* adjustments) {`
			`PrepareForUTF16Or32Output(src, src_len, output);`
			`return ConvertUnicode(src, src_len, output, adjustments);`
			`}`

			`string16 UTF8ToUTF16WithAdjustments(`
			`const base::StringPiece& utf8,`
			`base::OffsetAdjuster::Adjustments* adjustments) {`
			`string16 result;`
			`UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);`
			`return result;`
			`}`

			`string16 UTF8ToUTF16AndAdjustOffsets(`
			`const base::StringPiece& utf8,`
			`std::vector<size_t>* offsets_for_adjustment) {`
			`for (size_t& offset : *offsets_for_adjustment) {`
			`if (offset > utf8.length())`
			`offset = string16::npos;`
			`}`
			`OffsetAdjuster::Adjustments adjustments;`
			`string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);`
			`OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);`
			`return result;`
			`}`

			`std::string UTF16ToUTF8AndAdjustOffsets(`
			`const base::StringPiece16& utf16,`
			`std::vector<size_t>* offsets_for_adjustment) {`
			`for (size_t& offset : *offsets_for_adjustment) {`
			`if (offset > utf16.length())`
			`offset = string16::npos;`
			`}`
			`std::string result;`
			`PrepareForUTF8Output(utf16.data(), utf16.length(), &result);`
			`OffsetAdjuster::Adjustments adjustments;`
			`ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);`
			`OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);`
			`return result;`
			`}`

			`} // namespace base`