From 2d4ba7ef9b6c6e191c04d56e5e3361bcf5a94631 Mon Sep 17 00:00:00 2001
From: ValdikSS <iam@valdikss.org.ru>
Date: Sat, 3 Jun 2023 13:56:46 +0300
Subject: [PATCH] New compression feature: replace most common two-character
 sequences

---
 generate-pac.sh                  | 13 ++++++++
 scripts/generate-pac-domains.awk |  3 ++
 scripts/topsequences.py          | 57 ++++++++++++++++++++++++++++++++
 3 files changed, 73 insertions(+)
 create mode 100755 scripts/topsequences.py
diff --git a/generate-pac.sh b/generate-pac.sh
index 51c9154..d128e27 100755
--- a/generate-pac.sh
+++ b/generate-pac.sh
@@ -4,6 +4,8 @@ set -e
 source config/config.sh
 echo -n > "$PACFILE"
 
+python3 scripts/topsequences.py result/hostlist_zones.txt temp/replace-common-sequences.awk temp/pacpatterns.js
+
 # .pac header
 echo "// ProstoVPN.AntiZapret PAC-host File
 // Generated on $(date), by https://bitbucket.org/anticensority/antizapret-pac-generator-light/
@@ -27,12 +29,22 @@ sort -Vu temp/include-ips.txt result/iplist_blockedbyip_noid2971_collapsed.txt |
 SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \
     awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')"
 
+PATTERNS=$(cat temp/pacpatterns.js)
+
 echo "var special = [
 $SPECIAL
 ];
 var az_initialized = 0;
 // CIDR to netmask, for special
 function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');}
+// replace repeating sequences in domain
+function patternreplace(s) {
+  var patterns = $PATTERNS;
+  for (var pattern in patterns) {
+    s = s.split(patterns[pattern]).join(pattern);
+  }
+  return s;
+}
 
 function FindProxyForURL(url, host) {" >> "$PACFILE"
 
@@ -93,6 +105,7 @@ echo "
   if (!curdomain || !curdomain[2]) {return \"DIRECT\";}
   var curhost = curdomain[1];
   var curzone = curdomain[2];
+  curhost = patternreplace(curhost);
   var curarr = []; // dummy empty array
   if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) {
     if (typeof domains[curzone][curhost.length] === 'string') {
diff --git a/scripts/generate-pac-domains.awk b/scripts/generate-pac-domains.awk
index c773863..c90df43 100644
--- a/scripts/generate-pac-domains.awk
+++ b/scripts/generate-pac-domains.awk
@@ -1,6 +1,9 @@
 {
     domainzone = gensub(/(.*)\.([^.]+$)/, "\\2", 1)
     domainname = gensub(/(.*)\.([^.]+$)/, "\\1", 1)
+}
+    @include "temp/replace-common-sequences.awk"
+{
     domainlength = length(domainname)
     domainarray[domainzone][domainlength][domainname] = domainname
     #print "adding", $0, ":", domainzone, domainlength, domainname
diff --git a/scripts/topsequences.py b/scripts/topsequences.py
new file mode 100755
index 0000000..1548ef0
--- /dev/null
+++ b/scripts/topsequences.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+
+"""
+Find the most common two-character sequences in the host names (zone
+excluded) and emit rules that replace each of them with a single uppercase
+or special character, for compression purposes.
+
+Usage: topsequences.py <host list.txt> <awk output.awk> <pac function.js>
+Writes an awk block of gsub() calls and a JS object literal (code -> pair).
+"""
+
+import sys
+from collections import Counter
+
+if len(sys.argv) != 4:
+    print("{}: <host list.txt> <awk output.awk> <pac function.js>".format(sys.argv[0]))
+    sys.exit(1)
+
+# Single-character codes, assigned to the selected pairs in list order.
+# "&" character should be prepended with two backslashes for awk's gsub.
+wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
+             "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T",
+             "U", "V", "W", "X", "Y", "Z",
+             "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
+             "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}"]
+
+with open(sys.argv[1], "r") as dfile:
+    # Drop the last label (the zone) of every host and glue the rest into
+    # one long string; hosts without a dot contribute nothing.
+    names = ['.'.join(host.split(".")[:-1]) for host in dfile.read().split("\n")]
+domains = ''.join(names)
+
+# Count the pairs found at even offsets. Stopping at len - 1 keeps the empty
+# or one-character tail from being counted as a candidate pattern.
+patternhit = Counter(domains[pos:pos+2] for pos in range(0, len(domains) - 1, 2))
+
+# Stable ascending sort by hit count, then keep as many of the most frequent
+# pairs as there are codes; the least frequent kept pair gets the first code.
+ranked = [pair for pair, _ in sorted(patternhit.items(), key=lambda x: x[1])]
+finallist = ranked[-1 * len(wordreplace):]
+
+with open(sys.argv[2], "w") as awkfile:
+    print("{", file=awkfile)
+    for i, w in enumerate(finallist):
+        # Escape "." so awk matches it literally instead of as "any character",
+        # the same way the PAC side does with split()/join().
+        regex = w.replace(".", "\\.")
+        print('gsub(/{}/, "{}", domainname)'.format(regex, wordreplace[i]), file=awkfile)
+    print("}", file=awkfile)
+
+with open(sys.argv[3], "w") as pacfile:
+    # The JS key is the bare code character, without the awk-only backslashes.
+    # The dict repr printed below is embedded as a JavaScript object literal.
+    pacdict = {}
+    for i, w in enumerate(finallist):
+        pacdict[wordreplace[i].strip('\\')] = w
+    print(pacdict, file=pacfile)