mirror of
https://bitbucket.org/anticensority/antizapret-pac-generator-light.git
synced 2024-11-22 05:06:04 +03:00
Improve topsequence compression algorithm
This commit is contained in:
parent
b0073641a1
commit
bf2cb905ec
@ -12,37 +12,78 @@ if len(sys.argv) != 4:
|
|||||||
print("{}: <host list.txt> <awk output.awk> <pac function.js>".format(sys.argv[0]))
|
print("{}: <host list.txt> <awk output.awk> <pac function.js>".format(sys.argv[0]))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
#patternhit = {"cloudfront": 999999999}
|
||||||
patternhit = {}
|
patternhit = {}
|
||||||
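# In the replacement string of awk's gsub(), a bare "&" means "the matched text", so a literal "&" must be preceded by two backslashes.
|
# In the replacement string of awk's gsub(), a bare "&" means "the matched text", so a literal "&" must be preceded by two backslashes.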
|
||||||
wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
|
wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
|
||||||
"K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
|
"K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
|
||||||
"U", "V", "W", "X", "Y", "Z",
|
"U", "V", "W", "X", "Y", "Z",
|
||||||
"!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
|
"@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
|
||||||
"=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}"]
|
"=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}",
|
||||||
|
"`", ":", ";", "?"]
|
||||||
|
wordreplace_big = ["!" + chr(x) for x in range(ord("A"), ord("Z") + 1)]
|
||||||
|
|
||||||
with open(sys.argv[1], "r") as dfile:
|
with open(sys.argv[1], "r") as dfile:
|
||||||
domains = dfile.read().split("\n")
|
domains = dfile.read().split("\n")
|
||||||
|
|
||||||
new_domains = []
|
new_domains = []
|
||||||
for domain in domains:
|
for domain in domains:
|
||||||
|
#print('.'.join(domain.split(".")[:-1]))
|
||||||
|
#sys.exit(1)
|
||||||
new_domains.append('.'.join(domain.split(".")[:-1]))
|
new_domains.append('.'.join(domain.split(".")[:-1]))
|
||||||
domains = ''.join(new_domains)
|
domains = ''.join(new_domains)
|
||||||
|
|
||||||
domain_len = len(domains)
|
domain_len = len(domains)
|
||||||
position = 0
|
position = 0
|
||||||
|
|
||||||
while position <= domain_len:
|
pattern_found = {}
|
||||||
cut = domains[position:position+2]
|
|
||||||
if not patternhit.get(cut):
|
|
||||||
patternhit[cut] = 0
|
|
||||||
patternhit[cut] += 1
|
|
||||||
position += 2
|
|
||||||
|
|
||||||
patternhit = dict(sorted(patternhit.items(), key=lambda x: x[1]))
|
for patternlen in (4,):
|
||||||
|
for round, _ in enumerate(wordreplace_big):
|
||||||
|
position = 0
|
||||||
|
while position <= domain_len:
|
||||||
|
cut = domains[position:position+patternlen]
|
||||||
|
position += 1
|
||||||
|
if len(cut) != patternlen:
|
||||||
|
continue
|
||||||
|
if not patternhit.get(cut):
|
||||||
|
patternhit[cut] = 0
|
||||||
|
patternhit[cut] += 1
|
||||||
|
#print("Round", round, "patternhit", patternhit)
|
||||||
|
patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])[-1:])
|
||||||
|
domains = domains.replace(list(patternhit.keys())[0], '')
|
||||||
|
pattern_found.update(patternhit)
|
||||||
|
patternhit = {}
|
||||||
|
print("Big round", round, "pattern_found", pattern_found)
|
||||||
|
|
||||||
|
for patternlen in (2,):
|
||||||
|
for round, _ in enumerate(wordreplace):
|
||||||
|
position = 0
|
||||||
|
while position <= domain_len:
|
||||||
|
cut = domains[position:position+patternlen]
|
||||||
|
position += 1
|
||||||
|
if len(cut) != patternlen:
|
||||||
|
continue
|
||||||
|
if not patternhit.get(cut):
|
||||||
|
patternhit[cut] = 0
|
||||||
|
patternhit[cut] += 1
|
||||||
|
#print("Round", round, "patternhit", patternhit)
|
||||||
|
patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])[-1:])
|
||||||
|
domains = domains.replace(list(patternhit.keys())[0], '')
|
||||||
|
pattern_found.update(patternhit)
|
||||||
|
patternhit = {}
|
||||||
|
print("Round", round, "pattern_found", pattern_found)
|
||||||
|
|
||||||
|
#patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1]))
|
||||||
|
patternhit = pattern_found
|
||||||
|
print(patternhit)
|
||||||
|
|
||||||
#print(patternhit, file=sys.stderr)
|
#print(patternhit, file=sys.stderr)
|
||||||
finallist = list(patternhit)[-1 * len(wordreplace):]
|
#finallist = list(patternhit)[:len(wordreplace)]
|
||||||
#print(finallist, file=sys.stderr)
|
finallist = list(patternhit)
|
||||||
|
#finallist.reverse()
|
||||||
|
print(finallist, file=sys.stderr)
|
||||||
|
wordreplace = wordreplace_big + wordreplace
|
||||||
|
|
||||||
with open(sys.argv[2], "w") as awkfile:
|
with open(sys.argv[2], "w") as awkfile:
|
||||||
print("{", file=awkfile)
|
print("{", file=awkfile)
|
||||||
|
Loading…
Reference in New Issue
Block a user