// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "net/tools/tld_cleanup/tld_cleanup_util.h"

#include "base/files/file_util.h"
#include "base/logging.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "url/gurl.h"
#include "url/third_party/mozilla/url_parse.h"

namespace {

const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

const int kExceptionRule = 1;
const int kWildcardRule = 2;
const int kPrivateRule = 4;
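
// Encoding sketch (illustration only): the flags above combine additively in
// the 'type' column emitted by WriteRules() below, e.g. a wildcard rule in
// the private section is written as kWildcardRule + kPrivateRule = 6, while
// a plain public rule is written as 0.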
}  // namespace

namespace net {
namespace tld_cleanup {

// Writes the list of domain rules contained in the 'rules' set to the
// 'outfile', with each rule terminated by a LF. The file must already have
// been created with write access.
bool WriteRules(const RuleMap& rules, const base::FilePath& outfile) {
  std::string data;
  data.append("%{\n"
              "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
              "// Use of this source code is governed by a BSD-style license "
              "that can be\n"
              "// found in the LICENSE file.\n\n"
              "// This file is generated by net/tools/tld_cleanup/.\n"
              "// DO NOT MANUALLY EDIT!\n"
              "%}\n"
              "struct DomainRule {\n"
              "  int name_offset;\n"
              "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
              "};\n"
              "%%\n");

  for (RuleMap::const_iterator i = rules.begin(); i != rules.end(); ++i) {
    data.append(i->first);
    data.append(", ");
    int type = 0;
    if (i->second.exception) {
      type = kExceptionRule;
    } else if (i->second.wildcard) {
      type = kWildcardRule;
    }
    if (i->second.is_private) {
      type += kPrivateRule;
    }
    data.append(base::IntToString(type));
    data.append("\n");
  }

  data.append("%%\n");

  int written = base::WriteFile(outfile,
                                data.data(),
                                static_cast<int>(data.size()));

  return written == static_cast<int>(data.size());
}
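
// A minimal sketch of what WriteRules() produces (illustrative only): for a
// RuleMap built from the source rules "jp", "*.ck" and "!www.ck", the body
// between the two "%%" markers of the generated gperf-style file would be
//
//   jp, 0
//   ck, 2
//   www.ck, 1
//
// because NormalizeRule() strips the '*.' and '!' markers from the domain
// text and records them only in the numeric type flags.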

// Adjusts the rule to a standard form: removes single extraneous dots and
// canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
// valid; logs a warning and returns kWarning if it is probably invalid; and
// logs an error and returns kError if the rule is (almost) certainly invalid.
NormalizeResult NormalizeRule(std::string* domain, Rule* rule) {
  NormalizeResult result = kSuccess;

  // Strip single leading and trailing dots.
  if (domain->at(0) == '.')
    domain->erase(0, 1);
  if (domain->empty()) {
    LOG(WARNING) << "Ignoring empty rule";
    return kWarning;
  }
  if (domain->at(domain->size() - 1) == '.')
    domain->erase(domain->size() - 1, 1);
  if (domain->empty()) {
    LOG(WARNING) << "Ignoring empty rule";
    return kWarning;
  }

  // Allow single leading '*.' or '!', saved here so it's not canonicalized.
  size_t start_offset = 0;
  if (domain->at(0) == '!') {
    domain->erase(0, 1);
    rule->exception = true;
  } else if (domain->find("*.") == 0) {
    domain->erase(0, 2);
    rule->wildcard = true;
  }
  if (domain->empty()) {
    LOG(WARNING) << "Ignoring empty rule";
    return kWarning;
  }

  // Warn about additional '*.' or '!'.
  if (domain->find("*.", start_offset) != std::string::npos ||
      domain->find('!', start_offset) != std::string::npos) {
    LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
    result = kWarning;
  }

  // Make a GURL and normalize it, then get the host back out.
  std::string url = "http://";
  url.append(*domain);
  GURL gurl(url);
  const std::string& spec = gurl.possibly_invalid_spec();
  url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
  if (host.len < 0) {
    LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
    return kError;
  }
  if (!gurl.is_valid()) {
    LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
    result = kWarning;
  }
  domain->assign(spec.substr(host.begin, host.len));

  return result;
}
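
// Sketch of NormalizeRule() on a few representative inputs (illustrative,
// not exhaustive):
//
//   ".jp"      -> domain "jp",      no flags set
//   "*.ck"     -> domain "ck",      rule->wildcard = true
//   "!www.ck"  -> domain "www.ck",  rule->exception = true
//
// An IDN rule such as "東京.jp" is converted by GURL's host canonicalizer to
// its punycode ("xn--") form before the caller stores it in the RuleMap.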

NormalizeResult NormalizeDataToRuleMap(const std::string data,
                                       RuleMap* rules) {
  CHECK(rules);
  // We do a lot of string assignment during parsing, but simplicity is more
  // important than performance here.
  std::string domain;
  NormalizeResult result = kSuccess;
  size_t line_start = 0;
  size_t line_end = 0;
  bool is_private = false;
  RuleMap extra_rules;
  int begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
  int end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
  while (line_start < data.size()) {
    if (line_start + begin_private_length < data.size() &&
        !data.compare(line_start, begin_private_length,
                      kBeginPrivateDomainsComment)) {
      is_private = true;
      line_end = line_start + begin_private_length;
    } else if (line_start + end_private_length < data.size() &&
               !data.compare(line_start, end_private_length,
                             kEndPrivateDomainsComment)) {
      is_private = false;
      line_end = line_start + end_private_length;
    } else if (line_start + 1 < data.size() &&
               data[line_start] == '/' &&
               data[line_start + 1] == '/') {
      // Skip comments.
      line_end = data.find_first_of("\r\n", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
    } else {
      // Truncate at first whitespace.
      line_end = data.find_first_of("\r\n \t", line_start);
      if (line_end == std::string::npos)
        line_end = data.size();
      domain.assign(data, line_start, line_end - line_start);

      Rule rule;
      rule.wildcard = false;
      rule.exception = false;
      rule.is_private = is_private;
      NormalizeResult new_result = NormalizeRule(&domain, &rule);
      if (new_result != kError) {
        // Check the existing rules to make sure we don't have an exception
        // and wildcard for the same rule, or that the same domain is listed
        // as both private and not private. If we did, we'd have to update
        // our parsing code to handle this case.
        CHECK(rules->find(domain) == rules->end())
            << "Duplicate rule found for " << domain;

        (*rules)[domain] = rule;
        // Add true TLD for multi-level rules. We don't add them right now,
        // in case there's an exception or wild card that either exists or
        // might be added in a later iteration. In those cases, there's no
        // need to add it and it would just slow down parsing the data.
        size_t tld_start = domain.find_last_of('.');
        if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
          std::string extra_rule_domain = domain.substr(tld_start + 1);
          RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
          Rule extra_rule;
          extra_rule.exception = false;
          extra_rule.wildcard = false;
          if (iter == extra_rules.end()) {
            extra_rule.is_private = is_private;
          } else {
            // A rule already exists, so we ensure that if any of the entries
            // is not private the result should be that the entry is not
            // private. An example is .au which is not listed as a real TLD,
            // but only lists second-level domains such as com.au. Subdomains
            // of .au (eg. blogspot.com.au) are also listed in the private
            // section, which is processed later, so this ensures that the
            // real TLD (eg. .au) is listed as public.
            extra_rule.is_private = is_private && iter->second.is_private;
          }
          extra_rules[extra_rule_domain] = extra_rule;
        }
      }
      result = std::max(result, new_result);
    }

    // Find beginning of next non-empty line.
    line_start = data.find_first_of("\r\n", line_end);
    if (line_start == std::string::npos)
      line_start = data.size();
    line_start = data.find_first_not_of("\r\n", line_start);
    if (line_start == std::string::npos)
      line_start = data.size();
  }

  for (RuleMap::const_iterator iter = extra_rules.begin();
       iter != extra_rules.end();
       ++iter) {
    if (rules->find(iter->first) == rules->end()) {
      (*rules)[iter->first] = iter->second;
    }
  }

  return result;
}
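
// A minimal sketch of NormalizeDataToRuleMap() on a tiny, hypothetical input,
// showing the private-section and implicit-TLD handling above:
//
//   // ===BEGIN PRIVATE DOMAINS===
//   blogspot.com.au
//   // ===END PRIVATE DOMAINS===
//
// yields a rule for "blogspot.com.au" with is_private = true plus, via
// extra_rules, an implicit entry for "au". Because "au" is only seen inside
// the private section here it stays private; had it (or another *.au rule)
// also appeared in the public section, it would end up public instead.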

NormalizeResult NormalizeFile(const base::FilePath& in_filename,
                              const base::FilePath& out_filename) {
  RuleMap rules;
  std::string data;
  if (!base::ReadFileToString(in_filename, &data)) {
    LOG(ERROR) << "Unable to read file";
    // We return success since we've already reported the error.
    return kSuccess;
  }

  NormalizeResult result = NormalizeDataToRuleMap(data, &rules);

  if (!WriteRules(rules, out_filename)) {
    LOG(ERROR) << "Error(s) writing output file";
    result = kError;
  }

  return result;
}
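
// Usage sketch (hypothetical file names, shown for illustration only): a
// driver in net/tools/tld_cleanup/ would call this roughly as
//
//   NormalizeResult result = NormalizeFile(
//       base::FilePath(FILE_PATH_LITERAL("effective_tld_names.dat")),
//       base::FilePath(FILE_PATH_LITERAL("effective_tld_names.gperf")));
//   if (result != kSuccess)
//     LOG(WARNING) << "Some input rules were dropped or fixed up.";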

}  // namespace tld_cleanup
}  // namespace net