mirror of
https://github.com/klzgrad/naiveproxy.git
synced 2024-11-22 21:36:12 +03:00
429 lines
17 KiB
C++
429 lines
17 KiB
C++
// Copyright 2013 The Chromium Authors
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file.
|
|
|
|
// Canonicalizers for random bits that aren't big enough for their own files.
|
|
|
|
#include <string.h>
|
|
|
|
#include "url/url_canon.h"
|
|
#include "url/url_canon_internal.h"
|
|
|
|
namespace url {
|
|
|
|
namespace {
|
|
|
|
// Reports whether |ch| is one of the characters (tab, line feed, carriage
// return) that are stripped out of the middle of a URL.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
|
|
|
|
// Backend for RemoveURLWhitespace (see declaration in url_canon.h).
|
|
// It sucks that we have to do this, since this takes about 13% of the total URL
|
|
// canonicalization time.
|
|
template <typename CHAR>
|
|
const CHAR* DoRemoveURLWhitespace(const CHAR* input,
|
|
int input_len,
|
|
CanonOutputT<CHAR>* buffer,
|
|
int* output_len,
|
|
bool* potentially_dangling_markup) {
|
|
// Fast verification that there's nothing that needs removal. This is the 99%
|
|
// case, so we want it to be fast and don't care about impacting the speed
|
|
// when we do find whitespace.
|
|
bool found_whitespace = false;
|
|
if (sizeof(*input) == 1 && input_len >= kMinimumLengthForSIMD) {
|
|
// For large strings, memchr is much faster than any scalar code we can
|
|
// write, even if we need to run it three times. (If this turns out to still
|
|
// be a bottleneck, we could write our own vector code, but given that
|
|
// memchr is so fast, it's unlikely to be relevant.)
|
|
found_whitespace = memchr(input, '\n', input_len) != nullptr ||
|
|
memchr(input, '\r', input_len) != nullptr ||
|
|
memchr(input, '\t', input_len) != nullptr;
|
|
} else {
|
|
for (int i = 0; i < input_len; i++) {
|
|
if (!IsRemovableURLWhitespace(input[i]))
|
|
continue;
|
|
found_whitespace = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!found_whitespace) {
|
|
// Didn't find any whitespace, we don't need to do anything. We can just
|
|
// return the input as the output.
|
|
*output_len = input_len;
|
|
return input;
|
|
}
|
|
|
|
// Skip whitespace removal for `data:` URLs.
|
|
//
|
|
// TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but
|
|
// that turns out to be difficult to do correctly given this function's
|
|
// character type templating.
|
|
if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
|
|
input[3] == 'a' && input[4] == ':') {
|
|
*output_len = input_len;
|
|
return input;
|
|
}
|
|
|
|
// Remove the whitespace into the new buffer and return it.
|
|
for (int i = 0; i < input_len; i++) {
|
|
if (!IsRemovableURLWhitespace(input[i])) {
|
|
if (potentially_dangling_markup && input[i] == 0x3C)
|
|
*potentially_dangling_markup = true;
|
|
buffer->push_back(input[i]);
|
|
}
|
|
}
|
|
*output_len = buffer->length();
|
|
return buffer->data();
|
|
}
|
|
|
|
// Lookup table indexed by ASCII code (0x00-0x7F) giving the canonical form of
// that character inside a scheme, which is basically its lower-cased form. An
// entry of 0 means the character is not allowed in a scheme.
// clang-format off
const char kSchemeCanonical[0x80] = {
    // 0x00 - 0x1F: all are invalid.
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0,
    // 0x20 - 0x27: ' ' ! " # $ % & '
    0,   0,   0,   0,   0,   0,   0,   0,
    // 0x28 - 0x2F: ( ) * + , - . /
    0,   0,   0,   '+', 0,   '-', '.', 0,
    // 0x30 - 0x37: 0 1 2 3 4 5 6 7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 0x38 - 0x3F: 8 9 : ; < = > ?
    '8', '9', 0,   0,   0,   0,   0,   0,
    // 0x40 - 0x47: @ A B C D E F G
    0,   'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x48 - 0x4F: H I J K L M N O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50 - 0x57: P Q R S T U V W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x58 - 0x5F: X Y Z [ (backslash) ] ^ _
    'x', 'y', 'z', 0,   0,   0,   0,   0,
    // 0x60 - 0x67: ` a b c d e f g
    0,   'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x68 - 0x6F: h i j k l m n o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70 - 0x77: p q r s t u v w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x78 - 0x7F: x y z { | } ~ DEL
    'x', 'y', 'z', 0,   0,   0,   0,   0,
};
// clang-format on
|
|
|
|
// Reports whether |c| is an ASCII letter, the only kind of character accepted
// as the first character of a scheme.
//
// This could have been folded into the scheme lookup table by setting the high
// bit on each valid entry, but it is only called once per URL and keeping it
// separate leaves the table easier to read.
inline bool IsSchemeFirstChar(unsigned char c) {
  // Setting bit 0x20 maps 'A'-'Z' onto 'a'-'z' and leaves 'a'-'z' unchanged;
  // no other byte value lands inside that range.
  const unsigned char folded = static_cast<unsigned char>(c | 0x20);
  return folded >= 'a' && folded <= 'z';
}
|
|
|
|
template <typename CHAR, typename UCHAR>
|
|
bool DoScheme(const CHAR* spec,
|
|
const Component& scheme,
|
|
CanonOutput* output,
|
|
Component* out_scheme) {
|
|
if (scheme.is_empty()) {
|
|
// Scheme is unspecified or empty, convert to empty by appending a colon.
|
|
*out_scheme = Component(output->length(), 0);
|
|
output->push_back(':');
|
|
return false;
|
|
}
|
|
|
|
// The output scheme starts from the current position.
|
|
out_scheme->begin = output->length();
|
|
|
|
// Danger: it's important that this code does not strip any characters;
|
|
// it only emits the canonical version (be it valid or escaped) for each
|
|
// of the input characters. Stripping would put it out of sync with
|
|
// FindAndCompareScheme, which could cause some security checks on
|
|
// schemes to be incorrect.
|
|
bool success = true;
|
|
size_t begin = static_cast<size_t>(scheme.begin);
|
|
size_t end = static_cast<size_t>(scheme.end());
|
|
for (size_t i = begin; i < end; i++) {
|
|
UCHAR ch = static_cast<UCHAR>(spec[i]);
|
|
char replacement = 0;
|
|
if (ch < 0x80) {
|
|
if (i == begin) {
|
|
// Need to do a special check for the first letter of the scheme.
|
|
if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
|
|
replacement = kSchemeCanonical[ch];
|
|
} else {
|
|
replacement = kSchemeCanonical[ch];
|
|
}
|
|
}
|
|
|
|
if (replacement) {
|
|
output->push_back(replacement);
|
|
} else if (ch == '%') {
|
|
// Canonicalizing the scheme multiple times should lead to the same
|
|
// result. Since invalid characters will be escaped, we need to preserve
|
|
// the percent to avoid multiple escaping. The scheme will be invalid.
|
|
success = false;
|
|
output->push_back('%');
|
|
} else {
|
|
// Invalid character, store it but mark this scheme as invalid.
|
|
success = false;
|
|
|
|
// This will escape the output and also handle encoding issues.
|
|
// Ignore the return value since we already failed.
|
|
AppendUTF8EscapedChar(spec, &i, end, output);
|
|
}
|
|
}
|
|
|
|
// The output scheme ends with the the current position, before appending
|
|
// the colon.
|
|
out_scheme->len = output->length() - out_scheme->begin;
|
|
output->push_back(':');
|
|
return success;
|
|
}
|
|
|
|
// The username and password components reference ranges in the corresponding
|
|
// *_spec strings. Typically, these specs will be the same (we're
|
|
// canonicalizing a single source string), but may be different when
|
|
// replacing components.
|
|
template <typename CHAR, typename UCHAR>
|
|
bool DoUserInfo(const CHAR* username_spec,
|
|
const Component& username,
|
|
const CHAR* password_spec,
|
|
const Component& password,
|
|
CanonOutput* output,
|
|
Component* out_username,
|
|
Component* out_password) {
|
|
if (username.is_empty() && password.is_empty()) {
|
|
// Common case: no user info. We strip empty username/passwords.
|
|
*out_username = Component();
|
|
*out_password = Component();
|
|
return true;
|
|
}
|
|
|
|
// Write the username.
|
|
out_username->begin = output->length();
|
|
if (username.is_nonempty()) {
|
|
// This will escape characters not valid for the username.
|
|
AppendStringOfType(&username_spec[username.begin],
|
|
static_cast<size_t>(username.len), CHAR_USERINFO,
|
|
output);
|
|
}
|
|
out_username->len = output->length() - out_username->begin;
|
|
|
|
// When there is a password, we need the separator. Note that we strip
|
|
// empty but specified passwords.
|
|
if (password.is_nonempty()) {
|
|
output->push_back(':');
|
|
out_password->begin = output->length();
|
|
AppendStringOfType(&password_spec[password.begin],
|
|
static_cast<size_t>(password.len), CHAR_USERINFO,
|
|
output);
|
|
out_password->len = output->length() - out_password->begin;
|
|
} else {
|
|
*out_password = Component();
|
|
}
|
|
|
|
output->push_back('@');
|
|
return true;
|
|
}
|
|
|
|
// Helper functions for converting port integers to strings.
|
|
inline void WritePortInt(char* output, int output_len, int port) {
|
|
_itoa_s(port, output, output_len, 10);
|
|
}
|
|
|
|
// This function will prepend the colon if there will be a port.
|
|
template <typename CHAR, typename UCHAR>
|
|
bool DoPort(const CHAR* spec,
|
|
const Component& port,
|
|
int default_port_for_scheme,
|
|
CanonOutput* output,
|
|
Component* out_port) {
|
|
int port_num = ParsePort(spec, port);
|
|
if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
|
|
*out_port = Component();
|
|
return true; // Leave port empty.
|
|
}
|
|
|
|
if (port_num == PORT_INVALID) {
|
|
// Invalid port: We'll copy the text from the input so the user can see
|
|
// what the error was, and mark the URL as invalid by returning false.
|
|
output->push_back(':');
|
|
out_port->begin = output->length();
|
|
AppendInvalidNarrowString(spec, static_cast<size_t>(port.begin),
|
|
static_cast<size_t>(port.end()), output);
|
|
out_port->len = output->length() - out_port->begin;
|
|
return false;
|
|
}
|
|
|
|
// Convert port number back to an integer. Max port value is 5 digits, and
|
|
// the Parsed::ExtractPort will have made sure the integer is in range.
|
|
const int buf_size = 6;
|
|
char buf[buf_size];
|
|
WritePortInt(buf, buf_size, port_num);
|
|
|
|
// Append the port number to the output, preceded by a colon.
|
|
output->push_back(':');
|
|
out_port->begin = output->length();
|
|
for (int i = 0; i < buf_size && buf[i]; i++)
|
|
output->push_back(buf[i]);
|
|
|
|
out_port->len = output->length() - out_port->begin;
|
|
return true;
|
|
}
|
|
|
|
// clang-format off
// Lookup table indexed by ASCII code (0x00-0x7F): true when the character
// belongs to the fragment percent-encode set and must be percent-escaped.
// https://url.spec.whatwg.org/#fragment-percent-encode-set
const bool kShouldEscapeCharInFragment[0x80] = {
    /* 0x00 - 0x07: control characters */
    true,  true,  true,  true,  true,  true,  true,  true,
    /* 0x08 - 0x0F: control characters */
    true,  true,  true,  true,  true,  true,  true,  true,
    /* 0x10 - 0x17: control characters */
    true,  true,  true,  true,  true,  true,  true,  true,
    /* 0x18 - 0x1F: control characters */
    true,  true,  true,  true,  true,  true,  true,  true,
    /* 0x20 - 0x27: ' ' ! " # $ % & ' */
    true,  false, true,  false, false, false, false, false,
    /* 0x28 - 0x2F: ( ) * + , - . / */
    false, false, false, false, false, false, false, false,
    /* 0x30 - 0x37: 0 1 2 3 4 5 6 7 */
    false, false, false, false, false, false, false, false,
    /* 0x38 - 0x3F: 8 9 : ; < = > ? */
    false, false, false, false, true,  false, true,  false,
    /* 0x40 - 0x47: @ A B C D E F G */
    false, false, false, false, false, false, false, false,
    /* 0x48 - 0x4F: H I J K L M N O */
    false, false, false, false, false, false, false, false,
    /* 0x50 - 0x57: P Q R S T U V W */
    false, false, false, false, false, false, false, false,
    /* 0x58 - 0x5F: X Y Z [ (backslash) ] ^ _ */
    false, false, false, false, false, false, false, false,
    /* 0x60 - 0x67: ` a b c d e f g */
    true,  false, false, false, false, false, false, false,
    /* 0x68 - 0x6F: h i j k l m n o */
    false, false, false, false, false, false, false, false,
    /* 0x70 - 0x77: p q r s t u v w */
    false, false, false, false, false, false, false, false,
    /* 0x78 - 0x7F: x y z { | } ~ DELETE */
    false, false, false, false, false, false, false, true,
};
// clang-format on
|
|
|
|
template <typename CHAR, typename UCHAR>
|
|
void DoCanonicalizeRef(const CHAR* spec,
|
|
const Component& ref,
|
|
CanonOutput* output,
|
|
Component* out_ref) {
|
|
if (!ref.is_valid()) {
|
|
// Common case of no ref.
|
|
*out_ref = Component();
|
|
return;
|
|
}
|
|
|
|
// Append the ref separator. Note that we need to do this even when the ref
|
|
// is empty but present.
|
|
output->push_back('#');
|
|
out_ref->begin = output->length();
|
|
|
|
// Now iterate through all the characters, converting to UTF-8 and validating.
|
|
size_t end = static_cast<size_t>(ref.end());
|
|
for (size_t i = static_cast<size_t>(ref.begin); i < end; i++) {
|
|
UCHAR current_char = static_cast<UCHAR>(spec[i]);
|
|
if (current_char < 0x80) {
|
|
if (kShouldEscapeCharInFragment[current_char])
|
|
AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
|
|
else
|
|
output->push_back(static_cast<char>(spec[i]));
|
|
} else {
|
|
AppendUTF8EscapedChar(spec, &i, end, output);
|
|
}
|
|
}
|
|
|
|
out_ref->len = output->length() - out_ref->begin;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
const char* RemoveURLWhitespace(const char* input,
|
|
int input_len,
|
|
CanonOutputT<char>* buffer,
|
|
int* output_len,
|
|
bool* potentially_dangling_markup) {
|
|
return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
|
|
potentially_dangling_markup);
|
|
}
|
|
|
|
const char16_t* RemoveURLWhitespace(const char16_t* input,
|
|
int input_len,
|
|
CanonOutputT<char16_t>* buffer,
|
|
int* output_len,
|
|
bool* potentially_dangling_markup) {
|
|
return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
|
|
potentially_dangling_markup);
|
|
}
|
|
|
|
char CanonicalSchemeChar(char16_t ch) {
|
|
if (ch >= 0x80)
|
|
return 0; // Non-ASCII is not supported by schemes.
|
|
return kSchemeCanonical[ch];
|
|
}
|
|
|
|
bool CanonicalizeScheme(const char* spec,
|
|
const Component& scheme,
|
|
CanonOutput* output,
|
|
Component* out_scheme) {
|
|
return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
|
|
}
|
|
|
|
bool CanonicalizeScheme(const char16_t* spec,
|
|
const Component& scheme,
|
|
CanonOutput* output,
|
|
Component* out_scheme) {
|
|
return DoScheme<char16_t, char16_t>(spec, scheme, output, out_scheme);
|
|
}
|
|
|
|
bool CanonicalizeUserInfo(const char* username_source,
|
|
const Component& username,
|
|
const char* password_source,
|
|
const Component& password,
|
|
CanonOutput* output,
|
|
Component* out_username,
|
|
Component* out_password) {
|
|
return DoUserInfo<char, unsigned char>(username_source, username,
|
|
password_source, password, output,
|
|
out_username, out_password);
|
|
}
|
|
|
|
bool CanonicalizeUserInfo(const char16_t* username_source,
|
|
const Component& username,
|
|
const char16_t* password_source,
|
|
const Component& password,
|
|
CanonOutput* output,
|
|
Component* out_username,
|
|
Component* out_password) {
|
|
return DoUserInfo<char16_t, char16_t>(username_source, username,
|
|
password_source, password, output,
|
|
out_username, out_password);
|
|
}
|
|
|
|
bool CanonicalizePort(const char* spec,
|
|
const Component& port,
|
|
int default_port_for_scheme,
|
|
CanonOutput* output,
|
|
Component* out_port) {
|
|
return DoPort<char, unsigned char>(spec, port, default_port_for_scheme,
|
|
output, out_port);
|
|
}
|
|
|
|
bool CanonicalizePort(const char16_t* spec,
|
|
const Component& port,
|
|
int default_port_for_scheme,
|
|
CanonOutput* output,
|
|
Component* out_port) {
|
|
return DoPort<char16_t, char16_t>(spec, port, default_port_for_scheme, output,
|
|
out_port);
|
|
}
|
|
|
|
void CanonicalizeRef(const char* spec,
|
|
const Component& ref,
|
|
CanonOutput* output,
|
|
Component* out_ref) {
|
|
DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
|
|
}
|
|
|
|
void CanonicalizeRef(const char16_t* spec,
|
|
const Component& ref,
|
|
CanonOutput* output,
|
|
Component* out_ref) {
|
|
DoCanonicalizeRef<char16_t, char16_t>(spec, ref, output, out_ref);
|
|
}
|
|
|
|
} // namespace url
|