mirror of
https://github.com/klzgrad/naiveproxy.git
synced 2024-12-01 01:36:09 +03:00
437 lines
14 KiB
C++
437 lines
14 KiB
C++
|
// Copyright 2013 The Chromium Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style license that can be
|
||
|
// found in the LICENSE file.
|
||
|
|
||
|
#include "net/base/url_util.h"
|
||
|
|
||
|
#include "build/build_config.h"
|
||
|
|
||
|
#if defined(OS_POSIX)
|
||
|
#include <netinet/in.h>
|
||
|
#elif defined(OS_WIN)
|
||
|
#include <ws2tcpip.h>
|
||
|
#endif
|
||
|
|
||
|
#include "base/logging.h"
|
||
|
#include "base/strings/string_util.h"
|
||
|
#include "base/strings/stringprintf.h"
|
||
|
#include "net/base/escape.h"
|
||
|
#include "net/base/ip_address.h"
|
||
|
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
|
||
|
#include "url/gurl.h"
|
||
|
#include "url/url_canon.h"
|
||
|
#include "url/url_canon_ip.h"
|
||
|
|
||
|
namespace net {
|
||
|
|
||
|
namespace {
|
||
|
|
||
|
// Returns true when |c| is an ASCII lowercase letter ('a'-'z') or a decimal
// digit ('0'-'9'). Uppercase letters are intentionally rejected: uppercase
// characters are expected to have been normalized away before this is called.
bool IsHostCharAlphanumeric(char c) {
  const bool is_lowercase_letter = (c >= 'a') && (c <= 'z');
  const bool is_decimal_digit = (c >= '0') && (c <= '9');
  return is_lowercase_letter || is_decimal_digit;
}
|
||
|
|
||
|
bool IsNormalizedLocalhostTLD(const std::string& host) {
|
||
|
return base::EndsWith(host, ".localhost", base::CompareCase::SENSITIVE);
|
||
|
}
|
||
|
|
||
|
} // namespace
|
||
|
|
||
|
// Returns a copy of |url| with "name=value" appended to its query. |name| and
// |value| are each passed through EscapeQueryParamValue() before being
// appended. An "&" separator is inserted only when |url| already has a
// non-empty query. Existing parameters are never inspected or replaced.
GURL AppendQueryParameter(const GURL& url,
                          const std::string& name,
                          const std::string& value) {
  const std::string escaped_pair = EscapeQueryParamValue(name, true) + "=" +
                                   EscapeQueryParamValue(value, true);

  std::string new_query(url.query());
  if (!new_query.empty())
    new_query += "&";
  new_query += escaped_pair;

  GURL::Replacements replacements;
  replacements.SetQueryStr(new_query);
  return url.ReplaceComponents(replacements);
}
|
||
|
|
||
|
// Returns a copy of |url| in which the first query parameter whose key equals
// the escaped form of |name| has been replaced by "name=value" (both escaped
// with EscapeQueryParamValue()). Later parameters with the same key are copied
// through unchanged. If no parameter matches, the new pair is appended at the
// end of the query.
GURL AppendOrReplaceQueryParameter(const GURL& url,
                                   const std::string& name,
                                   const std::string& value) {
  const std::string escaped_name = EscapeQueryParamValue(name, true);
  const std::string replacement_pair =
      escaped_name + "=" + EscapeQueryParamValue(value, true);

  const std::string old_query = url.query();
  std::string new_query;
  bool replaced = false;

  // Walk the existing query one key/value pair at a time, rebuilding it.
  url::Component remaining(0, old_query.size());
  url::Component key_range, value_range;
  while (url::ExtractQueryKeyValue(old_query.data(), &remaining, &key_range,
                                   &value_range)) {
    if (!new_query.empty())
      new_query += "&";

    const base::StringPiece key(old_query.data() + key_range.begin,
                                key_range.len);
    // |replaced| guards against rewriting more than the first match.
    if (!replaced && key == escaped_name) {
      replaced = true;
      new_query += replacement_pair;
    } else {
      // Copy the original "key=value" text verbatim: from the start of the key
      // through the end of the value.
      new_query.append(old_query, key_range.begin,
                       value_range.end() - key_range.begin);
    }
  }

  // Nothing matched, so the parameter is new: append it.
  if (!replaced) {
    if (!new_query.empty())
      new_query += "&";
    new_query += replacement_pair;
  }

  GURL::Replacements replacements;
  replacements.SetQueryStr(new_query);
  return url.ReplaceComponents(replacements);
}
|
||
|
|
||
|
// Builds an iterator over the key/value pairs in the query of |url|. For an
// invalid |url| the iterator starts out at the end; otherwise it is
// positioned on the first pair (or at the end if there is none) by an initial
// call to Advance().
QueryIterator::QueryIterator(const GURL& url)
    : url_(url), at_end_(!url.is_valid()) {
  if (at_end_)
    return;

  query_ = url.parsed_for_possibly_invalid_spec().query;
  Advance();
}
|
||
|
|
||
|
// Defaulted: the compiler-generated destructor is used.
QueryIterator::~QueryIterator() = default;
|
||
|
|
||
|
std::string QueryIterator::GetKey() const {
|
||
|
DCHECK(!at_end_);
|
||
|
if (key_.is_nonempty())
|
||
|
return url_.spec().substr(key_.begin, key_.len);
|
||
|
return std::string();
|
||
|
}
|
||
|
|
||
|
std::string QueryIterator::GetValue() const {
|
||
|
DCHECK(!at_end_);
|
||
|
if (value_.is_nonempty())
|
||
|
return url_.spec().substr(value_.begin, value_.len);
|
||
|
return std::string();
|
||
|
}
|
||
|
|
||
|
const std::string& QueryIterator::GetUnescapedValue() {
|
||
|
DCHECK(!at_end_);
|
||
|
if (value_.is_nonempty() && unescaped_value_.empty()) {
|
||
|
unescaped_value_ = UnescapeURLComponent(
|
||
|
GetValue(), UnescapeRule::SPACES | UnescapeRule::PATH_SEPARATORS |
|
||
|
UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS |
|
||
|
UnescapeRule::REPLACE_PLUS_WITH_SPACE);
|
||
|
}
|
||
|
return unescaped_value_;
|
||
|
}
|
||
|
|
||
|
bool QueryIterator::IsAtEnd() const {
|
||
|
return at_end_;
|
||
|
}
|
||
|
|
||
|
// Moves to the next key/value pair in the query. If no further pair can be
// extracted the iterator becomes "at end".
void QueryIterator::Advance() {
  DCHECK(!at_end_);

  // Forget everything derived from the previous pair, including the cached
  // unescaped value.
  key_.reset();
  value_.reset();
  unescaped_value_.clear();

  const bool found_pair =
      url::ExtractQueryKeyValue(url_.spec().c_str(), &query_, &key_, &value_);
  at_end_ = !found_pair;
}
|
||
|
|
||
|
bool GetValueForKeyInQuery(const GURL& url,
|
||
|
const std::string& search_key,
|
||
|
std::string* out_value) {
|
||
|
for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
|
||
|
if (it.GetKey() == search_key) {
|
||
|
*out_value = it.GetUnescapedValue();
|
||
|
return true;
|
||
|
}
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Splits |input| of the form "host" or "host:port" into its parts.
//
// On success returns true, stores the host in |*host| (with the surrounding
// brackets removed for an IPv6 literal) and the port in |*port|; |*port| is -1
// when |input| carries no port. On failure returns false and leaves both
// out-parameters untouched.
//
// Rejected inputs: the empty string, anything containing a username or
// password, an empty host, a port that fails to parse, a trailing ":" with no
// port after it, and a host that starts with '[' but is not a valid bracketed
// IPv6 literal.
bool ParseHostAndPort(base::StringPiece input, std::string* host, int* port) {
  if (input.empty())
    return false;

  url::Component whole_input(0, input.size());
  url::Component username_component;
  url::Component password_component;
  url::Component hostname_component;
  url::Component port_component;
  url::ParseAuthority(input.data(), whole_input, &username_component,
                      &password_component, &hostname_component,
                      &port_component);

  // Credentials are not allowed here.
  if (username_component.is_valid() || password_component.is_valid())
    return false;

  // A host is mandatory.
  if (!hostname_component.is_nonempty())
    return false;

  // A port separator followed by nothing, as in "foo:", is an error.
  if (port_component.len == 0)
    return false;

  int parsed_port_number = -1;
  if (port_component.is_nonempty()) {
    parsed_port_number = url::ParsePort(input.data(), port_component);
    // A negative result (PORT_INVALID or PORT_UNSPECIFIED) means the port text
    // could not be parsed.
    if (parsed_port_number < 0)
      return false;
  }

  // A host starting with '[' must be a well-formed IPv6 literal; if it is, the
  // brackets are dropped from the reported host.
  const bool starts_with_bracket =
      hostname_component.len > 0 && input[hostname_component.begin] == '[';
  if (starts_with_bracket) {
    unsigned char ipv6_bytes[16];
    const bool is_ipv6_literal =
        input[hostname_component.end() - 1] == ']' &&
        url::IPv6AddressToNumber(input.data(), hostname_component, ipv6_bytes);
    if (!is_ipv6_literal)
      return false;

    hostname_component.begin++;
    hostname_component.len -= 2;
  }

  // Only now, with every check passed, write the results.
  host->assign(input.data() + hostname_component.begin, hostname_component.len);
  *port = parsed_port_number;
  return true;
}
|
||
|
|
||
|
|
||
|
std::string GetHostAndPort(const GURL& url) {
|
||
|
// For IPv6 literals, GURL::host() already includes the brackets so it is
|
||
|
// safe to just append a colon.
|
||
|
return base::StringPrintf("%s:%d", url.host().c_str(),
|
||
|
url.EffectiveIntPort());
|
||
|
}
|
||
|
|
||
|
std::string GetHostAndOptionalPort(const GURL& url) {
|
||
|
// For IPv6 literals, GURL::host() already includes the brackets
|
||
|
// so it is safe to just append a colon.
|
||
|
if (url.has_port())
|
||
|
return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
|
||
|
return url.host();
|
||
|
}
|
||
|
|
||
|
// Returns |host| with a single trailing '.' removed, if there is one. A host
// of length one (e.g. just ".") is returned unchanged, and at most one dot is
// stripped.
std::string TrimEndingDot(base::StringPiece host) {
  const size_t length = host.length();
  const bool has_trailing_dot = length > 1 && host[length - 1] == '.';
  if (has_trailing_dot)
    host.remove_suffix(1);
  return host.as_string();
}
|
||
|
|
||
|
std::string GetHostOrSpecFromURL(const GURL& url) {
|
||
|
return url.has_host() ? TrimEndingDot(url.host_piece()) : url.spec();
|
||
|
}
|
||
|
|
||
|
// Canonicalizes |host| and returns the canonical form, filling |*host_info|
// with the details reported by url::CanonicalizeHostVerbose(). Returns an
// empty string when the canonical host is empty or when the host is classified
// as BROKEN.
std::string CanonicalizeHost(base::StringPiece host,
                             url::CanonHostInfo* host_info) {
  std::string canon_host;
  url::StdStringCanonOutput canon_host_output(&canon_host);

  const url::Component raw_host_component(0, static_cast<int>(host.length()));
  url::CanonicalizeHostVerbose(host.data(), raw_host_component,
                               &canon_host_output, host_info);

  const bool canonicalized =
      host_info->out_host.is_nonempty() &&
      host_info->family != url::CanonHostInfo::BROKEN;
  if (!canonicalized) {
    // Empty or broken host: report failure with an empty result.
    canon_host.clear();
  } else {
    // Finalize the output and verify it holds exactly the canonical host.
    canon_host_output.Complete();
    DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
  }

  return canon_host;
}
|
||
|
|
||
|
bool IsCanonicalizedHostCompliant(const std::string& host) {
|
||
|
if (host.empty())
|
||
|
return false;
|
||
|
|
||
|
bool in_component = false;
|
||
|
bool most_recent_component_started_alphanumeric = false;
|
||
|
|
||
|
for (std::string::const_iterator i(host.begin()); i != host.end(); ++i) {
|
||
|
const char c = *i;
|
||
|
if (!in_component) {
|
||
|
most_recent_component_started_alphanumeric = IsHostCharAlphanumeric(c);
|
||
|
if (!most_recent_component_started_alphanumeric && (c != '-') &&
|
||
|
(c != '_')) {
|
||
|
return false;
|
||
|
}
|
||
|
in_component = true;
|
||
|
} else if (c == '.') {
|
||
|
in_component = false;
|
||
|
} else if (!IsHostCharAlphanumeric(c) && (c != '-') && (c != '_')) {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return most_recent_component_started_alphanumeric;
|
||
|
}
|
||
|
|
||
|
bool IsHostnameNonUnique(const std::string& hostname) {
|
||
|
// CanonicalizeHost requires surrounding brackets to parse an IPv6 address.
|
||
|
const std::string host_or_ip = hostname.find(':') != std::string::npos ?
|
||
|
"[" + hostname + "]" : hostname;
|
||
|
url::CanonHostInfo host_info;
|
||
|
std::string canonical_name = CanonicalizeHost(host_or_ip, &host_info);
|
||
|
|
||
|
// If canonicalization fails, then the input is truly malformed. However,
|
||
|
// to avoid mis-reporting bad inputs as "non-unique", treat them as unique.
|
||
|
if (canonical_name.empty())
|
||
|
return false;
|
||
|
|
||
|
// If |hostname| is an IP address, check to see if it's in an IANA-reserved
|
||
|
// range reserved for non-publicly routable networks.
|
||
|
if (host_info.IsIPAddress()) {
|
||
|
IPAddress host_addr;
|
||
|
if (!host_addr.AssignFromIPLiteral(hostname.substr(
|
||
|
host_info.out_host.begin, host_info.out_host.len))) {
|
||
|
return false;
|
||
|
}
|
||
|
switch (host_info.family) {
|
||
|
case url::CanonHostInfo::IPV4:
|
||
|
case url::CanonHostInfo::IPV6:
|
||
|
return !host_addr.IsPubliclyRoutable();
|
||
|
case url::CanonHostInfo::NEUTRAL:
|
||
|
case url::CanonHostInfo::BROKEN:
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Check for a registry controlled portion of |hostname|, ignoring private
|
||
|
// registries, as they already chain to ICANN-administered registries,
|
||
|
// and explicitly ignoring unknown registries.
|
||
|
//
|
||
|
// Note: This means that as new gTLDs are introduced on the Internet, they
|
||
|
// will be treated as non-unique until the registry controlled domain list
|
||
|
// is updated. However, because gTLDs are expected to provide significant
|
||
|
// advance notice to deprecate older versions of this code, this an
|
||
|
// acceptable tradeoff.
|
||
|
return !registry_controlled_domains::HostHasRegistryControlledDomain(
|
||
|
canonical_name, registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
|
||
|
registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
|
||
|
}
|
||
|
|
||
|
bool IsLocalhost(const GURL& url) {
|
||
|
return HostStringIsLocalhost(url.HostNoBracketsPiece());
|
||
|
}
|
||
|
|
||
|
// Returns true when |host| denotes the local machine: either a name accepted
// by IsLocalHostname(), an IPv4 literal whose first byte is 127, or the IPv6
// localhost address. |host| must not carry IPv6 brackets.
bool HostStringIsLocalhost(base::StringPiece host) {
  if (IsLocalHostname(host, nullptr))
    return true;

  IPAddress ip_address;
  if (!ip_address.AssignFromIPLiteral(host))
    return false;

  switch (ip_address.size()) {
    case IPAddress::kIPv4AddressSize: {
      // Any address starting with 127 counts.
      const uint8_t loopback_prefix[] = {127};
      return IPAddressStartsWith(ip_address, loopback_prefix);
    }

    case IPAddress::kIPv6AddressSize:
      return ip_address == IPAddress::IPv6Localhost();

    default:
      // A parsed literal is expected to be IPv4- or IPv6-sized.
      NOTREACHED();
  }

  return false;
}
|
||
|
|
||
|
// Returns |url| with its username, password and ref removed. |url| must be
// valid.
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  // Fast path: with nothing to strip, return a plain copy and skip the
  // re-canonicalization that ReplaceComponents would perform.
  const bool has_something_to_strip =
      url.has_username() || url.has_password() || url.has_ref();
  if (!has_something_to_strip)
    return url;

  GURL::Replacements replacements;
  replacements.ClearUsername();
  replacements.ClearPassword();
  replacements.ClearRef();
  return url.ReplaceComponents(replacements);
}
|
||
|
|
||
|
void GetIdentityFromURL(const GURL& url,
|
||
|
base::string16* username,
|
||
|
base::string16* password) {
|
||
|
UnescapeRule::Type flags =
|
||
|
UnescapeRule::SPACES | UnescapeRule::PATH_SEPARATORS |
|
||
|
UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS;
|
||
|
*username = UnescapeAndDecodeUTF8URLComponent(url.username(), flags);
|
||
|
*password = UnescapeAndDecodeUTF8URLComponent(url.password(), flags);
|
||
|
}
|
||
|
|
||
|
bool HasGoogleHost(const GURL& url) {
|
||
|
static const char* kGoogleHostSuffixes[] = {
|
||
|
".google.com",
|
||
|
".youtube.com",
|
||
|
".gmail.com",
|
||
|
".doubleclick.net",
|
||
|
".gstatic.com",
|
||
|
".googlevideo.com",
|
||
|
".googleusercontent.com",
|
||
|
".googlesyndication.com",
|
||
|
".google-analytics.com",
|
||
|
".googleadservices.com",
|
||
|
".googleapis.com",
|
||
|
".ytimg.com",
|
||
|
};
|
||
|
base::StringPiece host = url.host_piece();
|
||
|
for (const char* suffix : kGoogleHostSuffixes) {
|
||
|
// Here it's possible to get away with faster case-sensitive comparisons
|
||
|
// because the list above is all lowercase, and a GURL's host name will
|
||
|
// always be canonicalized to lowercase as well.
|
||
|
if (base::EndsWith(host, suffix, base::CompareCase::SENSITIVE))
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
// Returns true when |host| is exactly one of "inbox.google.com",
// "mail.google.com" or "gmail.com". The match is exact and case-sensitive; no
// suffix or subdomain matching is done.
bool IsTLS13ExperimentHost(base::StringPiece host) {
  if (host == "inbox.google.com")
    return true;
  if (host == "mail.google.com")
    return true;
  return host == "gmail.com";
}
|
||
|
|
||
|
// Returns true when |host| is one of the recognized localhost names, compared
// ASCII-case-insensitively and ignoring one trailing '.':
//   "localhost", "localhost.localdomain", anything ending in ".localhost",
//   "localhost6" and "localhost6.localdomain6".
// When |is_local6| is non-null it is always written: true for the two
// "localhost6" names, false for every other input (matching or not).
bool IsLocalHostname(base::StringPiece host, bool* is_local6) {
  std::string normalized_host = base::ToLowerASCII(host);
  // Drop a single trailing '.'.
  if (!normalized_host.empty() && normalized_host.back() == '.')
    normalized_host.pop_back();

  const bool is_ipv6_localhost_name =
      normalized_host == "localhost6" ||
      normalized_host == "localhost6.localdomain6";
  if (is_local6)
    *is_local6 = is_ipv6_localhost_name;
  if (is_ipv6_localhost_name)
    return true;

  return normalized_host == "localhost" ||
         normalized_host == "localhost.localdomain" ||
         IsNormalizedLocalhostTLD(normalized_host);
}
|
||
|
|
||
|
} // namespace net
|