TEST: replace top repeating sequences

2024-11-25 22:56:05 +03:00 · 2023-06-03 15:01:32 +03:00 · 2023-06-03 15:01:32 +03:00 · fe0f9fbf92
commit fe0f9fbf92
parent e98ac41a00
4 changed files with 45 additions and 1036 deletions
--- a/dict/google-1000.txt
+++ b/dict/google-1000.txt
--- a/dict/topwords.py
+++ b/dict/topwords.py
@@ -1,48 +1,46 @@
 #!/usr/bin/env python3

 import sys
-from pprint import pprint

-wordhit = {}
+patternhit = {}
 wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
             "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
             "U", "V", "W", "X", "Y", "Z",
             "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
-             "=", "+", "/", ",", "<", ">", "~"]
+             "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}"]

-with open(sys.argv[1], "r") as wfile:
-    with open(sys.argv[2], "r") as dfile:
+with open(sys.argv[1], "r") as dfile:
    domains = dfile.read().split("\n")
-        words = wfile.read().split("\n")

    new_domains = []
    for domain in domains:
        new_domains.append('.'.join(domain.split(".")[:-1]))
-        domains = new_domains
-        for word in words:
-            wordhit[word] = 0
+    domains = ''.join(new_domains)

    domain_len = len(domains)
-        for i, domain in enumerate(domains):
-            if (i % 1000) == 0:
-                print(i, "/", domain_len, end="\r", file=sys.stderr)
-            for word in words:
-                if word in domain:
-                    wordhit[word] += 1
+    position = 0

-wordhit_c = {}
-for word in wordhit:
-    value = wordhit[word]
-    if value != 0 and word != '':
-        wordhit_c[word] = value
+    while position <= domain_len:
+        cut = domains[position:position+2]
+        if not patternhit.get(cut):
+            patternhit[cut] = 0
+        patternhit[cut] += 1
+        position += 2

-wordhit_c = dict(sorted(wordhit_c.items(), key=lambda x: x[1]))
+patternhit = dict(sorted(patternhit.items(), key=lambda x: x[1]))

-#print(wordhit_c)
-finallist = list(wordhit_c)[-43:]
-finallist = sorted(finallist, key=lambda x: 1000 - len(x))
+#print(patternhit, file=sys.stderr)
+finallist = list(patternhit)[-1 * len(wordreplace):]
 print(finallist, file=sys.stderr)
-print("{")
-for i, w in enumerate(finallist):
-    print('gsub(/{}/, "{}", domainname)'.format(w, wordreplace[i]))
-print("}")
+
+with open(sys.argv[2], "w") as awkfile:
+    print("{", file=awkfile)
+    for i, w in enumerate(finallist):
+        print('gsub(/{}/, "{}", domainname)'.format(w, wordreplace[i]), file=awkfile)
+    print("}", file=awkfile)
+
+with open(sys.argv[3], "w") as pacfile:
+    pacdict = {}
+    for i, w in enumerate(finallist):
+        pacdict[wordreplace[i].strip('\\')] = w
+    print(pacdict, file=pacfile)
--- a/generate-pac.sh
+++ b/generate-pac.sh
@@ -4,6 +4,8 @@ set -e
 source config/config.sh
 echo -n > "$PACFILE"

+python3 dict/topwords.py result/hostlist_zones.txt temp/replace-common-words.awk temp/pacpatterns.js
+
 # .pac header
 echo "// ProstoVPN.AntiZapret PAC-host File
 // Generated on $(date), by https://bitbucket.org/anticensority/antizapret-pac-generator-light/
@@ -27,12 +29,22 @@ sort -Vu temp/include-ips.txt result/iplist_blockedbyip_noid2971_collapsed.txt |
 SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \
    awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')"

+PATTERNS=$(cat temp/pacpatterns.js)
+
 echo "var special = [
 $SPECIAL
 ];
 var az_initialized = 0;
 // CIDR to netmask, for special
 function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');}
+// replace repeating sequences in domain
+function patternreplace(s) {
+  var patterns = $PATTERNS;
+  for (pattern in patterns) {
+    s = s.split(patterns[pattern]).join(pattern);
+  }
+  return s;
+}

 function FindProxyForURL(url, host) {" >> "$PACFILE"

@@ -93,6 +105,7 @@ echo "
  if (!curdomain || !curdomain[2]) {return \"DIRECT\";}
  var curhost = curdomain[1];
  var curzone = curdomain[2];
+  curhost = patternreplace(curhost);
  var curarr = []; // dummy empty array
  if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) {
    if (typeof domains[curzone][curhost.length] === 'string') {
--- a/parse.sh
+++ b/parse.sh
@@ -30,8 +30,6 @@ then
    awk -f scripts/getzones.awk temp/hostlist_original_with_include.txt | grep -v -F -x -f temp/exclude-hosts.txt | sort -u > result/hostlist_zones.txt
 fi

-python dict/topwords.py dict/google-1000.txt result/hostlist_zones.txt > temp/replace-common-words.awk
-
 # Generate a list of IP addresses
 awk -F';' '$1 ~ /\// {print $1}' temp/list.csv | grep -P '([0-9]{1,3}\.){3}[0-9]{1,3}\/[0-9]{1,2}' -o | sort -Vu > result/iplist_special_range.txt