mirror of
https://github.com/klzgrad/naiveproxy.git
synced 2024-11-28 08:16:09 +03:00
199 lines
7.2 KiB
C++
199 lines
7.2 KiB
C++
|
// Copyright 2015 The Chromium Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style license that can be
|
||
|
// found in the LICENSE file.
|
||
|
|
||
|
#include "net/base/lookup_string_in_fixed_set.h"
|
||
|
|
||
|
#include "base/logging.h"
|
||
|
|
||
|
namespace net {
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
// Read next offset from |pos|, increment |offset| by that amount, and increment
|
||
|
// |pos| either to point to the start of the next encoded offset in its node, or
|
||
|
// nullptr, if there are no remaining offsets.
|
||
|
//
|
||
|
// Returns true if an offset could be read; false otherwise.
|
||
|
inline bool GetNextOffset(const unsigned char** pos,
|
||
|
const unsigned char** offset) {
|
||
|
if (*pos == nullptr)
|
||
|
return false;
|
||
|
|
||
|
size_t bytes_consumed;
|
||
|
switch (**pos & 0x60) {
|
||
|
case 0x60: // Read three byte offset
|
||
|
*offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
|
||
|
bytes_consumed = 3;
|
||
|
break;
|
||
|
case 0x40: // Read two byte offset
|
||
|
*offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
|
||
|
bytes_consumed = 2;
|
||
|
break;
|
||
|
default:
|
||
|
*offset += (*pos)[0] & 0x3F;
|
||
|
bytes_consumed = 1;
|
||
|
}
|
||
|
if ((**pos & 0x80) != 0) {
|
||
|
*pos = nullptr;
|
||
|
} else {
|
||
|
*pos += bytes_consumed;
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
// Check if byte at |offset| is last in label.
|
||
|
bool IsEOL(const unsigned char* offset) {
|
||
|
return (*offset & 0x80) != 0;
|
||
|
}
|
||
|
|
||
|
// Check if byte at |offset| matches key. This version matches both end-of-label
|
||
|
// chars and not-end-of-label chars.
|
||
|
bool IsMatch(const unsigned char* offset, char key) {
|
||
|
return (*offset & 0x7F) == key;
|
||
|
}
|
||
|
|
||
|
// Read return value at |offset|, if it is a return value. Returns true if a
|
||
|
// return value could be read, false otherwise.
|
||
|
bool GetReturnValue(const unsigned char* offset, int* return_value) {
|
||
|
// Return values are always encoded as end-of-label chars (so the high bit is
|
||
|
// set). So byte values in the inclusive range [0x80, 0x9F] encode the return
|
||
|
// values 0 through 31 (though make_dafsa.py doesn't currently encode values
|
||
|
// higher than 7). The following code does that translation.
|
||
|
if ((*offset & 0xE0) == 0x80) {
|
||
|
*return_value = *offset & 0x1F;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
FixedSetIncrementalLookup::FixedSetIncrementalLookup(const unsigned char* graph,
|
||
|
size_t length)
|
||
|
: pos_(graph), end_(graph + length), pos_is_label_character_(false) {}
|
||
|
|
||
|
FixedSetIncrementalLookup::FixedSetIncrementalLookup(
|
||
|
const FixedSetIncrementalLookup& other) = default;
|
||
|
|
||
|
FixedSetIncrementalLookup& FixedSetIncrementalLookup::operator=(
|
||
|
const FixedSetIncrementalLookup& other) = default;
|
||
|
|
||
|
FixedSetIncrementalLookup::~FixedSetIncrementalLookup() = default;
|
||
|
|
||
|
bool FixedSetIncrementalLookup::Advance(char input) {
|
||
|
if (!pos_) {
|
||
|
// A previous input exhausted the graph, so there are no possible matches.
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Only ASCII printable chars are supported by the current DAFSA format -- the
|
||
|
// high bit (values 0x80-0xFF) is reserved as a label-end signifier, and the
|
||
|
// low values (values 0x00-0x1F) are reserved to encode the return values. So
|
||
|
// values outside this range will never be in the dictionary.
|
||
|
if (input >= 0x20) {
|
||
|
if (pos_is_label_character_) {
|
||
|
// Currently processing a label, so it is only necessary to check the byte
|
||
|
// at |pos_| to see if it encodes a character matching |input|.
|
||
|
bool is_last_char_in_label = IsEOL(pos_);
|
||
|
bool is_match = IsMatch(pos_, input);
|
||
|
if (is_match) {
|
||
|
// If this is not the last character in the label, the next byte should
|
||
|
// be interpreted as a character or return value. Otherwise, the next
|
||
|
// byte should be interpreted as a list of child node offsets.
|
||
|
++pos_;
|
||
|
DCHECK(pos_ < end_);
|
||
|
pos_is_label_character_ = !is_last_char_in_label;
|
||
|
return true;
|
||
|
}
|
||
|
} else {
|
||
|
const unsigned char* offset = pos_;
|
||
|
// Read offsets from |pos_| until the label of the child node at |offset|
|
||
|
// matches |input|, or until there are no more offsets.
|
||
|
while (GetNextOffset(&pos_, &offset)) {
|
||
|
DCHECK(offset < end_);
|
||
|
DCHECK((pos_ == nullptr) || (pos_ < end_));
|
||
|
|
||
|
// |offset| points to a DAFSA node that is a child of the original node.
|
||
|
//
|
||
|
// The low 7 bits of a node encodes a character value; the high bit
|
||
|
// indicates whether it's the last character in the label.
|
||
|
//
|
||
|
// Note that |*offset| could also be a result code value, but these are
|
||
|
// really just out-of-range ASCII values, encoded the same way as
|
||
|
// characters. Since |input| was already validated as a printable ASCII
|
||
|
// value ASCII value, IsMatch will never return true if |offset| is a
|
||
|
// result code.
|
||
|
bool is_last_char_in_label = IsEOL(offset);
|
||
|
bool is_match = IsMatch(offset, input);
|
||
|
|
||
|
if (is_match) {
|
||
|
// If this is not the last character in the label, the next byte
|
||
|
// should be interpreted as a character or return value. Otherwise,
|
||
|
// the next byte should be interpreted as a list of child node
|
||
|
// offsets.
|
||
|
pos_ = offset + 1;
|
||
|
DCHECK(pos_ < end_);
|
||
|
pos_is_label_character_ = !is_last_char_in_label;
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// If no match was found, then end of the DAFSA has been reached.
|
||
|
pos_ = nullptr;
|
||
|
pos_is_label_character_ = false;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
int FixedSetIncrementalLookup::GetResultForCurrentSequence() const {
|
||
|
int value = kDafsaNotFound;
|
||
|
// Look to see if there is a next character that's a return value.
|
||
|
if (pos_is_label_character_) {
|
||
|
// Currently processing a label, so it is only necessary to check the byte
|
||
|
// at |pos_| to see if encodes a return value.
|
||
|
GetReturnValue(pos_, &value);
|
||
|
} else {
|
||
|
// Otherwise, |pos_| is an offset list (or nullptr). Explore the list of
|
||
|
// child nodes (given by their offsets) to find one whose label is a result
|
||
|
// code.
|
||
|
//
|
||
|
// This search uses a temporary copy of |pos_|, since mutating |pos_| could
|
||
|
// skip over a node that would be important to a subsequent Advance() call.
|
||
|
const unsigned char* temp_pos = pos_;
|
||
|
|
||
|
// Read offsets from |temp_pos| until either |temp_pos| is nullptr or until
|
||
|
// the byte at |offset| contains a result code (encoded as an ASCII
|
||
|
// character below 0x20).
|
||
|
const unsigned char* offset = pos_;
|
||
|
while (GetNextOffset(&temp_pos, &offset)) {
|
||
|
DCHECK(offset < end_);
|
||
|
DCHECK((temp_pos == nullptr) || temp_pos < end_);
|
||
|
if (GetReturnValue(offset, &value))
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
return value;
|
||
|
}
|
||
|
|
||
|
int LookupStringInFixedSet(const unsigned char* graph,
|
||
|
size_t length,
|
||
|
const char* key,
|
||
|
size_t key_length) {
|
||
|
// Do an incremental lookup until either the end of the graph is reached, or
|
||
|
// until every character in |key| is consumed.
|
||
|
FixedSetIncrementalLookup lookup(graph, length);
|
||
|
const char* key_end = key + key_length;
|
||
|
while (key != key_end) {
|
||
|
if (!lookup.Advance(*key))
|
||
|
return kDafsaNotFound;
|
||
|
key++;
|
||
|
}
|
||
|
// The entire input was consumed without reaching the end of the graph. Return
|
||
|
// the result code (if present) for the current position, or kDafsaNotFound.
|
||
|
return lookup.GetResultForCurrentSequence();
|
||
|
}
|
||
|
|
||
|
} // namespace net
|