Compress PAC host names by replacing frequent two-character sequences.

scripts/topsequences.py (new file) counts non-overlapping two-character
slices of the zone-stripped names read from its first argument and keeps the
most frequent ones, at most one per entry of wordreplace. It writes an awk
rule block of gsub() calls (second argument) and a replacement-character to
sequence table (third argument).
scripts/generate-pac-domains.awk @includes the generated rules between
splitting a name into zone/name and measuring/storing the name.
generate-pac.sh runs the script first, embeds the table into a
patternreplace() helper in the PAC header, and rewrites curhost with it
before the domains lookup.

Review changes relative to the original patch:
  * patternreplace(): the for-in loop variable is declared with var, so the
    PAC script no longer creates an implicit global named pattern.
  * topsequences.py: the counting loop stops at the last full two-character
    slice; previously an empty or one-character trailing slice was counted and
    could be selected as a pattern when the input has few distinct sequences.
  * topsequences.py: "." is escaped in the generated awk regex, so awk matches
    the same literal text that the PAC side splits on.
  * topsequences.py: the usage message names the three arguments; docstring
    grammar fixed.
NOTE(review): the blob ids on the index lines predate these changes.

diff --git a/generate-pac.sh b/generate-pac.sh index 51c9154..d128e27 100755 --- a/generate-pac.sh +++ b/generate-pac.sh @@ -4,6 +4,8 @@ set -e source config/config.sh echo -n > "$PACFILE" +python3 scripts/topsequences.py result/hostlist_zones.txt temp/replace-common-sequences.awk temp/pacpatterns.js + # .pac header echo "// ProstoVPN.AntiZapret PAC-host File // Generated on $(date), by https://bitbucket.org/anticensority/antizapret-pac-generator-light/ @@ -27,12 +29,22 @@ sort -Vu temp/include-ips.txt result/iplist_blockedbyip_noid2971_collapsed.txt | SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \ awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')" +PATTERNS=$(cat temp/pacpatterns.js) + echo "var special = [ $SPECIAL ]; var az_initialized = 0; // CIDR to netmask, for special function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');} +// replace repeating sequences in domain +function patternreplace(s) { + var patterns = $PATTERNS; + for (var pattern in patterns) { + s = s.split(patterns[pattern]).join(pattern); + } + return s; +} function FindProxyForURL(url, host) {" >> "$PACFILE" @@ -93,6 +105,7 @@ echo " if (!curdomain || !curdomain[2]) {return \"DIRECT\";} var curhost = curdomain[1]; var curzone = curdomain[2]; + curhost = patternreplace(curhost); var curarr = []; // dummy empty array if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) { if (typeof domains[curzone][curhost.length] === 'string') { diff --git a/scripts/generate-pac-domains.awk b/scripts/generate-pac-domains.awk index c773863..c90df43 100644 --- a/scripts/generate-pac-domains.awk +++ b/scripts/generate-pac-domains.awk @@ -1,6 +1,9 @@ { domainzone = gensub(/(.*)\.([^.]+$)/, "\\2", 1) domainname = gensub(/(.*)\.([^.]+$)/, "\\1", 1) +} + @include "temp/replace-common-sequences.awk" 
+{ domainlength = length(domainname) domainarray[domainzone][domainlength][domainname] = domainname #print "adding", $0, ":", domainzone, domainlength, domainname diff --git a/scripts/topsequences.py b/scripts/topsequences.py new file mode 100755 index 0000000..1548ef0 --- /dev/null +++ b/scripts/topsequences.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +import sys + +''' +This script finds the most common two-character sequences +and replaces them with a single uppercase character or +special character, for compression purposes. +''' + +if len(sys.argv) != 4: + print("usage: {} HOSTLIST AWK_OUT PAC_OUT".format(sys.argv[0])) + sys.exit(1) + +patternhit = {} +# "&" character should be prepended with two backslashes for awk's gsub. +wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", + "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", + "U", "V", "W", "X", "Y", "Z", + "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")", + "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}"] + +with open(sys.argv[1], "r") as dfile: + domains = dfile.read().split("\n") + + new_domains = [] + for domain in domains: + new_domains.append('.'.join(domain.split(".")[:-1])) + domains = ''.join(new_domains) + + domain_len = len(domains) + position = 0 + + # Count only full two-character slices: an empty or one-character tail must never become a pattern. + while position + 2 <= domain_len: + cut = domains[position:position+2] + if not patternhit.get(cut): + patternhit[cut] = 0 + patternhit[cut] += 1 + position += 2 + +patternhit = dict(sorted(patternhit.items(), key=lambda x: x[1])) + +#print(patternhit, file=sys.stderr) +finallist = list(patternhit)[-1 * len(wordreplace):] +#print(finallist, file=sys.stderr) + +with open(sys.argv[2], "w") as awkfile: + print("{", file=awkfile) + for i, w in enumerate(finallist): + # "." is a regex wildcard in awk but literal in the PAC split(); escape it so both sides agree. + print('gsub(/{}/, "{}", domainname)'.format(w.replace(".", "\\."), wordreplace[i]), file=awkfile) + print("}", file=awkfile) + +with open(sys.argv[3], "w") as pacfile: + pacdict = {} + for i, w in enumerate(finallist): + pacdict[wordreplace[i].strip('\\')] = w + print(pacdict, file=pacfile)