From bf2cb905ec06286623f63e2b8c9ac230c39ab050 Mon Sep 17 00:00:00 2001 From: ValdikSS Date: Sat, 6 Jan 2024 03:12:44 +0300 Subject: [PATCH] Improve topsequence compression algorithm --- scripts/topsequences.py | 63 ++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/scripts/topsequences.py b/scripts/topsequences.py index 510eaea..60ed611 100755 --- a/scripts/topsequences.py +++ b/scripts/topsequences.py @@ -12,37 +12,78 @@ if len(sys.argv) != 4: print("{}: ".format(sys.argv[0])) sys.exit(1) +#patternhit = {"cloudfront": 999999999} patternhit = {} # "&" character should be prepended with two backslashes for awk's gsub. wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", - "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")", - "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}"] + "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")", + "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}", + "`", ":", ";", "?"] +wordreplace_big = ["!" + chr(x) for x in range(ord("A"), ord("Z") + 1)] with open(sys.argv[1], "r") as dfile: domains = dfile.read().split("\n") new_domains = [] for domain in domains: + #print('.'.join(domain.split(".")[:-1])) + #sys.exit(1) new_domains.append('.'.join(domain.split(".")[:-1])) domains = ''.join(new_domains) domain_len = len(domains) position = 0 - while position <= domain_len: - cut = domains[position:position+2] - if not patternhit.get(cut): - patternhit[cut] = 0 - patternhit[cut] += 1 - position += 2 + pattern_found = {} -patternhit = dict(sorted(patternhit.items(), key=lambda x: x[1])) + for patternlen in (4,): + for round, _ in enumerate(wordreplace_big): + position = 0 + while position <= domain_len: + cut = domains[position:position+patternlen] + position += 1 + if len(cut) != patternlen: + continue + if not patternhit.get(cut): + patternhit[cut] = 0 + patternhit[cut] += 1 + #print("Round", round, "patternhit", patternhit) + patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])[-1:]) + domains = domains.replace(list(patternhit.keys())[0], '') + pattern_found.update(patternhit) + patternhit = {} + print("Big round", round, "pattern_found", pattern_found) + + for patternlen in (2,): + for round, _ in enumerate(wordreplace): + position = 0 + while position <= domain_len: + cut = domains[position:position+patternlen] + position += 1 + if len(cut) != patternlen: + continue + if not patternhit.get(cut): + patternhit[cut] = 0 + patternhit[cut] += 1 + #print("Round", round, "patternhit", patternhit) + patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])[-1:]) + domains = domains.replace(list(patternhit.keys())[0], '') + pattern_found.update(patternhit) + patternhit = {} + print("Round", round, "pattern_found", pattern_found) + +#patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])) +patternhit = pattern_found +print(patternhit) #print(patternhit, file=sys.stderr) -finallist = list(patternhit)[-1 * len(wordreplace):] -#print(finallist, file=sys.stderr) +#finallist = list(patternhit)[:len(wordreplace)] +finallist = list(patternhit) +#finallist.reverse() +print(finallist, file=sys.stderr) +wordreplace = wordreplace_big + wordreplace with open(sys.argv[2], "w") as awkfile: print("{", file=awkfile)