antizapret-pac-generator-light/scripts/topsequences.py

#!/usr/bin/env python3

import sys

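'''
This script finds the most common four-character and two-character
sequences in the host list and maps each of them to a short replacement
token (an uppercase letter or a special character, prefixed with "!" for
four-character sequences), for compression purposes. It writes the
matching awk gsub() rules and the token dictionary for the PAC file.
'''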

# Single-character replacement tokens, used for two-character sequences.
# "&" character should be prepended with two backslashes for awk's gsub.
wordreplace = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
               "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
               "U", "V", "W", "X", "Y", "Z",
               "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
               "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}",
               "`", ":", ";", "?"]
# Two-character tokens ("!" + single token), used for four-character sequences.
wordreplace_big = ["!" + x for x in wordreplace]


def load_domain_text(path):
    """Read the host list at *path* and return it as one searchable string.

    The file is split on newlines, the last dot-separated label of every
    line is dropped, and the remainders are concatenated without any
    separator. Lines that contain no dot contribute nothing.
    """
    with open(path, "r") as dfile:
        domains = dfile.read().split("\n")
    return ''.join('.'.join(domain.split(".")[:-1]) for domain in domains)


def most_common_pattern(text, patternlen):
    """Return ``(pattern, count)`` for the most frequent window in *text*.

    Every overlapping window of *patternlen* characters is counted. When
    several windows share the highest count, the one whose first occurrence
    is the latest wins, which is the choice a stable ascending sort followed
    by taking the last element would make.

    Returns ``None`` when *text* is shorter than *patternlen*, i.e. when
    there is nothing left to count.
    """
    counts = {}
    # The bound is derived from the current text, so no out-of-range
    # (short or empty) slices are ever produced.
    for start in range(len(text) - patternlen + 1):
        cut = text[start:start + patternlen]
        counts[cut] = counts.get(cut, 0) + 1

    best = None
    best_count = 0
    for pattern, count in counts.items():
        # ">=" lets a later-inserted pattern win a tie.
        if count >= best_count:
            best, best_count = pattern, count
    if best is None:
        return None
    return best, best_count


def collect_patterns(text, patternlen, rounds, pattern_found, label):
    """Greedily extract up to *rounds* top patterns of *patternlen* from *text*.

    Each round records the most frequent pattern and its count in
    *pattern_found* (mutated in place; a pattern found again keeps its
    original position and gets its count overwritten), removes every
    occurrence of it from the text and prints a progress line starting
    with *label*. The search stops early once the text is too short to
    hold another pattern instead of failing on an empty result.

    Returns the text that is left after all removals.
    """
    for round_no in range(rounds):
        top = most_common_pattern(text, patternlen)
        if top is None:
            break
        pattern, hits = top
        text = text.replace(pattern, '')
        pattern_found[pattern] = hits
        print(label, round_no, "pattern_found", pattern_found)
    return text


def write_awk(path, patterns, tokens):
    """Write an awk action block with one gsub() per pattern to *path*.

    Patterns and tokens are paired by position. Dots in a pattern are
    escaped so that awk matches them literally.
    """
    with open(path, "w") as awkfile:
        print("{", file=awkfile)
        for pattern, token in zip(patterns, tokens):
            print('gsub(/{}/, "{}", domainname)'.format(pattern.replace(".", "\\."), token), file=awkfile)
        print("}", file=awkfile)


def write_pac(path, patterns, tokens):
    """Write the token -> pattern dictionary to *path*.

    Backslashes are stripped from the tokens: they are only needed as
    escaping inside the awk script.
    """
    pacdict = {}
    for pattern, token in zip(patterns, tokens):
        pacdict[token.replace('\\', '')] = pattern
    with open(path, "w") as pacfile:
        print(pacdict, file=pacfile)


def main(argv):
    """Run the generator; *argv* has the same layout as ``sys.argv``."""
    if len(argv) != 4:
        print("{}: <host list.txt> <awk output.awk> <pac function.js>".format(argv[0]))
        sys.exit(1)

    domains = load_domain_text(argv[1])

    pattern_found = {}
    # Four-character sequences first, then two-character sequences are
    # searched in whatever text remains after the first pass.
    domains = collect_patterns(domains, 4, len(wordreplace_big), pattern_found, "Big round")
    collect_patterns(domains, 2, len(wordreplace), pattern_found, "Round")
    print(pattern_found)

    finallist = list(pattern_found)
    print(finallist, file=sys.stderr)
    # Tokens are handed out in discovery order: "!"-prefixed ones first.
    tokens = wordreplace_big + wordreplace

    write_awk(argv[2], finallist, tokens)
    write_pac(argv[3], finallist, tokens)


if __name__ == "__main__":
    main(sys.argv)
New compression feature: replace most common two-character sequences 2023-06-03 13:56:46 +03:00			`#!/usr/bin/env python3`

			`import sys`

			`'''`
			`This script finds the most common two-character sequences`
			`and replace them with a single uppercase character or`
			`special character, to compression purposes.`
			`'''`

			`if len(sys.argv) != 4:`
			`print("{}: <host list.txt> <awk output.awk> <pac function.js>".format(sys.argv[0]))`
			`sys.exit(1)`

Improve topsequence compression algorithm 2024-01-06 03:12:44 +03:00			`#patternhit = {"cloudfront": 999999999}`
New compression feature: replace most common two-character sequences 2023-06-03 13:56:46 +03:00			`patternhit = {}`
			`# "&" character should be prepended with two backslashes for awk's gsub.`
			`wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",`
			`"K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",`
			`"U", "V", "W", "X", "Y", "Z",`
Improve topsequence compression algorithm 2024-01-06 03:12:44 +03:00			`"@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",`
			`"=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}",`
			"`", ":", ";", "?"]
LZP compression for domains 2024-03-24 14:32:07 +03:00			`wordreplace_big = ["!" + x for x in wordreplace]`
New compression feature: replace most common two-character sequences 2023-06-03 13:56:46 +03:00
			`with open(sys.argv[1], "r") as dfile:`
			`domains = dfile.read().split("\n")`

			`new_domains = []`
			`for domain in domains:`
Improve topsequence compression algorithm 2024-01-06 03:12:44 +03:00			`#print('.'.join(domain.split(".")[:-1]))`
			`#sys.exit(1)`
New compression feature: replace most common two-character sequences 2023-06-03 13:56:46 +03:00			`new_domains.append('.'.join(domain.split(".")[:-1]))`
			`domains = ''.join(new_domains)`

			`domain_len = len(domains)`
			`position = 0`

Improve topsequence compression algorithm 2024-01-06 03:12:44 +03:00			`pattern_found = {}`
New compression feature: replace most common two-character sequences 2023-06-03 13:56:46 +03:00
Improve topsequence compression algorithm 2024-01-06 03:12:44 +03:00			`for patternlen in (4,):`
			`for round, _ in enumerate(wordreplace_big):`
			`position = 0`
			`while position <= domain_len:`
			`cut = domains[position:position+patternlen]`
			`position += 1`
			`if len(cut) != patternlen:`
			`continue`
			`if not patternhit.get(cut):`
			`patternhit[cut] = 0`
			`patternhit[cut] += 1`
			`#print("Round", round, "patternhit", patternhit)`
			`patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])[-1:])`
			`domains = domains.replace(list(patternhit.keys())[0], '')`
			`pattern_found.update(patternhit)`
			`patternhit = {}`
			`print("Big round", round, "pattern_found", pattern_found)`

			`for patternlen in (2,):`
			`for round, _ in enumerate(wordreplace):`
			`position = 0`
			`while position <= domain_len:`
			`cut = domains[position:position+patternlen]`
			`position += 1`
			`if len(cut) != patternlen:`
			`continue`
			`if not patternhit.get(cut):`
			`patternhit[cut] = 0`
			`patternhit[cut] += 1`
			`#print("Round", round, "patternhit", patternhit)`
			`patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])[-1:])`
			`domains = domains.replace(list(patternhit.keys())[0], '')`
			`pattern_found.update(patternhit)`
			`patternhit = {}`
			`print("Round", round, "pattern_found", pattern_found)`

			`#patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1]))`
			`patternhit = pattern_found`
			`print(patternhit)`
New compression feature: replace most common two-character sequences 2023-06-03 13:56:46 +03:00
			`#print(patternhit, file=sys.stderr)`
Improve topsequence compression algorithm 2024-01-06 03:12:44 +03:00			`#finallist = list(patternhit)[:len(wordreplace)]`
			`finallist = list(patternhit)`
			`#finallist.reverse()`
			`print(finallist, file=sys.stderr)`
			`wordreplace = wordreplace_big + wordreplace`
New compression feature: replace most common two-character sequences 2023-06-03 13:56:46 +03:00
			`with open(sys.argv[2], "w") as awkfile:`
			`print("{", file=awkfile)`
			`for i, w in enumerate(finallist):`
Bugfix: quote dot in topsequence awk regex generation 2023-06-11 16:07:16 +03:00			`print('gsub(/{}/, "{}", domainname)'.format(w.replace(".", "\\."), wordreplace[i]), file=awkfile)`
New compression feature: replace most common two-character sequences 2023-06-03 13:56:46 +03:00			`print("}", file=awkfile)`

			`with open(sys.argv[3], "w") as pacfile:`
			`pacdict = {}`
			`for i, w in enumerate(finallist):`
Properly remove ampersand escaping for large topsequence 2024-07-26 22:52:33 +03:00			`pacdict[wordreplace[i].replace('\\', '')] = w`
New compression feature: replace most common two-character sequences 2023-06-03 13:56:46 +03:00			`print(pacdict, file=pacfile)`