Improve topsequence compression algorithm

2025-02-19 16:03:14 +03:00 · 2024-01-06 03:12:44 +03:00 · 2024-01-06 03:12:44 +03:00 · bf2cb905ec
commit bf2cb905ec
parent b0073641a1
1 changed files with 52 additions and 11 deletions
--- a/scripts/topsequences.py
+++ b/scripts/topsequences.py
@ -12,37 +12,78 @@ if len(sys.argv) != 4:
    print("{}: <host list.txt> <awk output.awk> <pac function.js>".format(sys.argv[0]))
    sys.exit(1)

+#patternhit = {"cloudfront": 999999999}
 patternhit = {}
 # "&" character should be prepended with two backslashes for awk's gsub.
 wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
             "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
             "U", "V", "W", "X", "Y", "Z",
-             "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
-             "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}"]
+             "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
+             "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}",
+             "`", ":", ";", "?"]
+wordreplace_big = ["!" + chr(x) for x in range(ord("A"), ord("Z") + 1)]

 with open(sys.argv[1], "r") as dfile:
    domains = dfile.read().split("\n")

    new_domains = []
    for domain in domains:
+        #print('.'.join(domain.split(".")[:-1]))
+        #sys.exit(1)
        new_domains.append('.'.join(domain.split(".")[:-1]))
    domains = ''.join(new_domains)

    domain_len = len(domains)
    position = 0

-    while position <= domain_len:
-        cut = domains[position:position+2]
-        if not patternhit.get(cut):
-            patternhit[cut] = 0
-        patternhit[cut] += 1
-        position += 2
+    pattern_found = {}

-patternhit = dict(sorted(patternhit.items(), key=lambda x: x[1]))
+    for patternlen in (4,):
+        for round, _ in enumerate(wordreplace_big):
+            position = 0
+            while position <= domain_len:
+                cut = domains[position:position+patternlen]
+                position += 1
+                if len(cut) != patternlen:
+                    continue
+                if not patternhit.get(cut):
+                    patternhit[cut] = 0
+                patternhit[cut] += 1
+            #print("Round", round, "patternhit", patternhit)
+            patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])[-1:])
+            domains = domains.replace(list(patternhit.keys())[0], '')
+            pattern_found.update(patternhit)
+            patternhit = {}
+            print("Big round", round, "pattern_found", pattern_found)
+
+    for patternlen in (2,):
+        for round, _ in enumerate(wordreplace):
+            position = 0
+            while position <= domain_len:
+                cut = domains[position:position+patternlen]
+                position += 1
+                if len(cut) != patternlen:
+                    continue
+                if not patternhit.get(cut):
+                    patternhit[cut] = 0
+                patternhit[cut] += 1
+            #print("Round", round, "patternhit", patternhit)
+            patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])[-1:])
+            domains = domains.replace(list(patternhit.keys())[0], '')
+            pattern_found.update(patternhit)
+            patternhit = {}
+            print("Round", round, "pattern_found", pattern_found)
+
+#patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1]))
+patternhit = pattern_found
+print(patternhit)

 #print(patternhit, file=sys.stderr)
-finallist = list(patternhit)[-1 * len(wordreplace):]
-#print(finallist, file=sys.stderr)
+#finallist = list(patternhit)[:len(wordreplace)]
+finallist = list(patternhit)
+#finallist.reverse()
+print(finallist, file=sys.stderr)
+wordreplace = wordreplace_big + wordreplace

 with open(sys.argv[2], "w") as awkfile:
    print("{", file=awkfile)