mirror of
https://bitbucket.org/anticensority/antizapret-pac-generator-light.git
synced 2024-11-29 16:46:03 +03:00
TEST: replace top repeating sequences
This commit is contained in:
parent
e98ac41a00
commit
fe0f9fbf92
1000
dict/google-1000.txt
1000
dict/google-1000.txt
File diff suppressed because it is too large
Load Diff
@ -1,48 +1,46 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
from pprint import pprint
|
|
||||||
|
|
||||||
wordhit = {}
|
patternhit = {}
|
||||||
wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
|
wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
|
||||||
"K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
|
"K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
|
||||||
"U", "V", "W", "X", "Y", "Z",
|
"U", "V", "W", "X", "Y", "Z",
|
||||||
"!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
|
"!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
|
||||||
"=", "+", "/", ",", "<", ">", "~"]
|
"=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}"]
|
||||||
|
|
||||||
with open(sys.argv[1], "r") as wfile:
|
with open(sys.argv[1], "r") as dfile:
|
||||||
with open(sys.argv[2], "r") as dfile:
|
|
||||||
domains = dfile.read().split("\n")
|
domains = dfile.read().split("\n")
|
||||||
words = wfile.read().split("\n")
|
|
||||||
|
|
||||||
new_domains = []
|
new_domains = []
|
||||||
for domain in domains:
|
for domain in domains:
|
||||||
new_domains.append('.'.join(domain.split(".")[:-1]))
|
new_domains.append('.'.join(domain.split(".")[:-1]))
|
||||||
domains = new_domains
|
domains = ''.join(new_domains)
|
||||||
for word in words:
|
|
||||||
wordhit[word] = 0
|
|
||||||
|
|
||||||
domain_len = len(domains)
|
domain_len = len(domains)
|
||||||
for i, domain in enumerate(domains):
|
position = 0
|
||||||
if (i % 1000) == 0:
|
|
||||||
print(i, "/", domain_len, end="\r", file=sys.stderr)
|
|
||||||
for word in words:
|
|
||||||
if word in domain:
|
|
||||||
wordhit[word] += 1
|
|
||||||
|
|
||||||
wordhit_c = {}
|
while position <= domain_len:
|
||||||
for word in wordhit:
|
cut = domains[position:position+2]
|
||||||
value = wordhit[word]
|
if not patternhit.get(cut):
|
||||||
if value != 0 and word != '':
|
patternhit[cut] = 0
|
||||||
wordhit_c[word] = value
|
patternhit[cut] += 1
|
||||||
|
position += 2
|
||||||
|
|
||||||
wordhit_c = dict(sorted(wordhit_c.items(), key=lambda x: x[1]))
|
patternhit = dict(sorted(patternhit.items(), key=lambda x: x[1]))
|
||||||
|
|
||||||
#print(wordhit_c)
|
#print(patternhit, file=sys.stderr)
|
||||||
finallist = list(wordhit_c)[-43:]
|
finallist = list(patternhit)[-1 * len(wordreplace):]
|
||||||
finallist = sorted(finallist, key=lambda x: 1000 - len(x))
|
|
||||||
print(finallist, file=sys.stderr)
|
print(finallist, file=sys.stderr)
|
||||||
print("{")
|
|
||||||
|
with open(sys.argv[2], "w") as awkfile:
|
||||||
|
print("{", file=awkfile)
|
||||||
for i, w in enumerate(finallist):
|
for i, w in enumerate(finallist):
|
||||||
print('gsub(/{}/, "{}", domainname)'.format(w, wordreplace[i]))
|
print('gsub(/{}/, "{}", domainname)'.format(w, wordreplace[i]), file=awkfile)
|
||||||
print("}")
|
print("}", file=awkfile)
|
||||||
|
|
||||||
|
with open(sys.argv[3], "w") as pacfile:
|
||||||
|
pacdict = {}
|
||||||
|
for i, w in enumerate(finallist):
|
||||||
|
pacdict[wordreplace[i].strip('\\')] = w
|
||||||
|
print(pacdict, file=pacfile)
|
||||||
|
@ -4,6 +4,8 @@ set -e
|
|||||||
source config/config.sh
|
source config/config.sh
|
||||||
echo -n > "$PACFILE"
|
echo -n > "$PACFILE"
|
||||||
|
|
||||||
|
python3 dict/topwords.py result/hostlist_zones.txt temp/replace-common-words.awk temp/pacpatterns.js
|
||||||
|
|
||||||
# .pac header
|
# .pac header
|
||||||
echo "// ProstoVPN.AntiZapret PAC-host File
|
echo "// ProstoVPN.AntiZapret PAC-host File
|
||||||
// Generated on $(date), by https://bitbucket.org/anticensority/antizapret-pac-generator-light/
|
// Generated on $(date), by https://bitbucket.org/anticensority/antizapret-pac-generator-light/
|
||||||
@ -27,12 +29,22 @@ sort -Vu temp/include-ips.txt result/iplist_blockedbyip_noid2971_collapsed.txt |
|
|||||||
SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \
|
SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \
|
||||||
awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')"
|
awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')"
|
||||||
|
|
||||||
|
PATTERNS=$(cat temp/pacpatterns.js)
|
||||||
|
|
||||||
echo "var special = [
|
echo "var special = [
|
||||||
$SPECIAL
|
$SPECIAL
|
||||||
];
|
];
|
||||||
var az_initialized = 0;
|
var az_initialized = 0;
|
||||||
// CIDR to netmask, for special
|
// CIDR to netmask, for special
|
||||||
function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');}
|
function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');}
|
||||||
|
// replace repeating sequences in domain
|
||||||
|
function patternreplace(s) {
|
||||||
|
var patterns = $PATTERNS;
|
||||||
|
for (pattern in patterns) {
|
||||||
|
s = s.split(patterns[pattern]).join(pattern);
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
function FindProxyForURL(url, host) {" >> "$PACFILE"
|
function FindProxyForURL(url, host) {" >> "$PACFILE"
|
||||||
|
|
||||||
@ -93,6 +105,7 @@ echo "
|
|||||||
if (!curdomain || !curdomain[2]) {return \"DIRECT\";}
|
if (!curdomain || !curdomain[2]) {return \"DIRECT\";}
|
||||||
var curhost = curdomain[1];
|
var curhost = curdomain[1];
|
||||||
var curzone = curdomain[2];
|
var curzone = curdomain[2];
|
||||||
|
curhost = patternreplace(curhost);
|
||||||
var curarr = []; // dummy empty array
|
var curarr = []; // dummy empty array
|
||||||
if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) {
|
if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) {
|
||||||
if (typeof domains[curzone][curhost.length] === 'string') {
|
if (typeof domains[curzone][curhost.length] === 'string') {
|
||||||
|
2
parse.sh
2
parse.sh
@ -30,8 +30,6 @@ then
|
|||||||
awk -f scripts/getzones.awk temp/hostlist_original_with_include.txt | grep -v -F -x -f temp/exclude-hosts.txt | sort -u > result/hostlist_zones.txt
|
awk -f scripts/getzones.awk temp/hostlist_original_with_include.txt | grep -v -F -x -f temp/exclude-hosts.txt | sort -u > result/hostlist_zones.txt
|
||||||
fi
|
fi
|
||||||
|
|
||||||
python dict/topwords.py dict/google-1000.txt result/hostlist_zones.txt > temp/replace-common-words.awk
|
|
||||||
|
|
||||||
# Generate a list of IP addresses
|
# Generate a list of IP addresses
|
||||||
awk -F';' '$1 ~ /\// {print $1}' temp/list.csv | grep -P '([0-9]{1,3}\.){3}[0-9]{1,3}\/[0-9]{1,2}' -o | sort -Vu > result/iplist_special_range.txt
|
awk -F';' '$1 ~ /\// {print $1}' temp/list.csv | grep -P '([0-9]{1,3}\.){3}[0-9]{1,3}\/[0-9]{1,2}' -o | sort -Vu > result/iplist_special_range.txt
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user