mirror of
https://bitbucket.org/anticensority/antizapret-pac-generator-light.git
synced 2024-11-22 05:06:04 +03:00
New compression feature: replace most common two-character sequences
This commit is contained in:
parent
e630f46ce2
commit
2d4ba7ef9b
@ -4,6 +4,8 @@ set -e
|
||||
source config/config.sh
|
||||
echo -n > "$PACFILE"
|
||||
|
||||
python3 scripts/topsequences.py result/hostlist_zones.txt temp/replace-common-sequences.awk temp/pacpatterns.js
|
||||
|
||||
# .pac header
|
||||
echo "// ProstoVPN.AntiZapret PAC-host File
|
||||
// Generated on $(date), by https://bitbucket.org/anticensority/antizapret-pac-generator-light/
|
||||
@ -27,12 +29,22 @@ sort -Vu temp/include-ips.txt result/iplist_blockedbyip_noid2971_collapsed.txt |
|
||||
SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \
|
||||
awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')"
|
||||
|
||||
PATTERNS=$(cat temp/pacpatterns.js)
|
||||
|
||||
echo "var special = [
|
||||
$SPECIAL
|
||||
];
|
||||
var az_initialized = 0;
|
||||
// CIDR to netmask, for special
|
||||
function nmfc(b) {
  // Build a dotted netmask from a prefix length, one octet at a time.
  var octets = [];
  var remaining = b;
  while (octets.length < 4) {
    // Each octet takes at most 8 of the remaining prefix bits.
    var bits = Math.min(remaining, 8);
    octets.push(256 - Math.pow(2, 8 - bits));
    remaining -= bits;
  }
  return octets.join('.');
}
|
||||
// replace repeating sequences in domain
|
||||
function patternreplace(s) {
  // Compress a host name the same way the generated domain list was
  // compressed: every occurrence of a two-character sequence is replaced
  // by its single-character code, in table order.
  // The table (code -> sequence) is substituted in by the generator script.
  var patterns = $PATTERNS;
  // Declared locally so the loop variable does not leak as an implicit global.
  var pattern;
  for (pattern in patterns) {
    if (!patterns.hasOwnProperty(pattern)) {continue;}
    s = s.split(patterns[pattern]).join(pattern);
  }
  return s;
}
|
||||
|
||||
function FindProxyForURL(url, host) {" >> "$PACFILE"
|
||||
|
||||
@ -93,6 +105,7 @@ echo "
|
||||
if (!curdomain || !curdomain[2]) {return \"DIRECT\";}
|
||||
var curhost = curdomain[1];
|
||||
var curzone = curdomain[2];
|
||||
curhost = patternreplace(curhost);
|
||||
var curarr = []; // dummy empty array
|
||||
if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) {
|
||||
if (typeof domains[curzone][curhost.length] === 'string') {
|
||||
|
@ -1,6 +1,9 @@
|
||||
{
|
||||
domainzone = gensub(/(.*)\.([^.]+$)/, "\\2", 1)
|
||||
domainname = gensub(/(.*)\.([^.]+$)/, "\\1", 1)
|
||||
}
|
||||
@include "temp/replace-common-sequences.awk"
|
||||
{
|
||||
domainlength = length(domainname)
|
||||
domainarray[domainzone][domainlength][domainname] = domainname
|
||||
#print "adding", $0, ":", domainzone, domainlength, domainname
|
||||
|
57
scripts/topsequences.py
Executable file
57
scripts/topsequences.py
Executable file
@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python3
'''
This script finds the most common two-character sequences in a host list
and replaces them with a single uppercase character or special character,
for compression purposes.

Usage: topsequences.py <host list.txt> <awk output.awk> <pac function.js>

Outputs:
  * awk output: one gsub() rule per selected sequence, applied to the awk
    variable "domainname";
  * pac function: a JSON object mapping each replacement character to the
    two-character sequence it stands for.
'''

import json
import sys

# "&" character should be prepended with two backslashes for awk's gsub.
wordreplace = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
               "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
               "U", "V", "W", "X", "Y", "Z",
               "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
               "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}"]

# Characters that have a special meaning inside an awk /regex/ literal and
# therefore must be backslash-escaped to be matched literally.
_AWK_REGEX_SPECIAL = frozenset('\\.[]()*+?{}|^$/')


def _strip_zone(domain):
    '''Return the domain without its last dot-separated label.'''
    return '.'.join(domain.split(".")[:-1])


def _count_pairs(text):
    '''
    Count two-character slices of text taken at even offsets.

    Only full two-character slices are counted: the loop stops before a
    trailing single character or the empty slice at the end of the string.
    '''
    patternhit = {}
    for position in range(0, len(text) - 1, 2):
        cut = text[position:position + 2]
        patternhit[cut] = patternhit.get(cut, 0) + 1
    return patternhit


def _top_sequences(patternhit, limit):
    '''Return up to limit most frequent sequences, least frequent first.'''
    ranked = sorted(patternhit.items(), key=lambda x: x[1])
    return [sequence for sequence, _ in ranked][-limit:]


def _escape_awk_regex(sequence):
    '''
    Escape regex metacharacters so awk matches the sequence literally.

    The PAC side matches sequences literally (split/join), so the awk side
    must not treat e.g. "." as "any character".
    '''
    return ''.join('\\' + char if char in _AWK_REGEX_SPECIAL else char
                   for char in sequence)


def main(argv=None):
    '''
    Generate the awk rules and the PAC lookup table.

    argv defaults to sys.argv and must hold the program name followed by
    the host list path, the awk output path and the PAC output path.
    Returns the process exit code: 0 on success, 1 on wrong usage.
    '''
    if argv is None:
        argv = sys.argv

    if len(argv) != 4:
        print("{}: <host list.txt> <awk output.awk> <pac function.js>".format(argv[0]))
        return 1

    with open(argv[1], "r") as dfile:
        domains = dfile.read().split("\n")

    # Sequences are counted over all host names glued together, without zones.
    joined = ''.join(_strip_zone(domain) for domain in domains)
    finallist = _top_sequences(_count_pairs(joined), len(wordreplace))

    with open(argv[2], "w") as awkfile:
        print("{", file=awkfile)
        for w, replacement in zip(finallist, wordreplace):
            print('gsub(/{}/, "{}", domainname)'.format(_escape_awk_regex(w), replacement),
                  file=awkfile)
        print("}", file=awkfile)

    with open(argv[3], "w") as pacfile:
        # The awk-only backslashes are dropped: the PAC side needs the bare
        # character. JSON is emitted because it is a valid JavaScript object
        # literal, unlike a Python dict repr in the general case.
        pacdict = {replacement.strip('\\'): w
                   for w, replacement in zip(finallist, wordreplace)}
        print(json.dumps(pacdict), file=pacfile)

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
Loading…
Reference in New Issue
Block a user