From 4c88eb126430848be97c2c0abefd89a2a4ae8dda Mon Sep 17 00:00:00 2001
From: ValdikSS <iam@valdikss.org.ru>
Date: Sun, 24 Mar 2024 18:32:07 +0700
Subject: [PATCH] LZP compression for domains

---
 generate-pac.sh                  |  70 ++++++++++++++++++++-
 scripts/generate-pac-domains.awk |  33 +++++++---
 scripts/lzp.py                   | 105 +++++++++++++++++++++++++++++++
 scripts/topsequences.py          |   2 +-
 4 files changed, 198 insertions(+), 12 deletions(-)
 create mode 100755 scripts/lzp.py

diff --git a/generate-pac.sh b/generate-pac.sh
index f3f3fcb..82105d4 100755
--- a/generate-pac.sh
+++ b/generate-pac.sh
@@ -18,6 +18,8 @@ echo "// ProstoVPN.AntiZapret PAC-host File
 " >> "$PACFILE"
 
 awk -f scripts/generate-pac-domains.awk result/hostlist_zones.txt >> "$PACFILE"
+awk -v lzp=1 -f scripts/generate-pac-domains.awk result/hostlist_zones.txt > temp/domains-oneline.txt  # lzp=1: print all domain names as one unseparated string
+python3 scripts/lzp.py temp/domains-oneline.txt temp/domains-oneline-data.txt temp/domains-oneline-mask.txt temp/domains-oneline-pac.js  # LZP-encode it: literal data, packed mask, mask pattern table
 
 # Collapse IP list
 scripts/collapse_blockedbyip_noid2971.py
@@ -30,21 +32,72 @@ SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \
     awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')"
 
 PATTERNS=$(cat temp/pacpatterns.js)
+PATTERNS_LZP=$(cat temp/domains-oneline-pac.js)  # mask pattern table, pasted into patternreplace() below
+DOMAINS_LZP=$(cat temp/domains-oneline-data.txt)  # LZP literal bytes of the domain names
+MASK_LZP=$(cat temp/domains-oneline-mask.txt)  # packed LZP mask
 
 echo "var special = [
 $SPECIAL
 ];
+
+// domain names: LZP literal bytes only, the prediction mask is kept separately in mask_lzp
+var domains_lzp = \"$DOMAINS_LZP\";
+
+// LZP mask bytes, base64-encoded and then pattern-substituted (undone by patternreplace and a2b)
+var mask_lzp = \"$MASK_LZP\";
+
 var az_initialized = 0;
 // CIDR to netmask, for special
 function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');}
 // replace repeating sequences in domain
-function patternreplace(s) {
+function patternreplace(s, lzpmask) { // lzpmask: truthy selects the LZP mask pattern table, otherwise the domain pattern table
   var patterns = $PATTERNS;
+  if (lzpmask) // same variable as above (var is function-scoped), only the value is swapped
+   var patterns = $PATTERNS_LZP; // table written by scripts/lzp.py
   for (pattern in patterns) {
     s = s.split(patterns[pattern]).join(pattern);
   }
   return s;
 }
+// LZP decoder (as in PPP, different hash func): d = literal data, m = mask bytes as a string; returns the decoded string
+function unlzp(d, m) {
+  var TABLE_LEN_BITS = 18;
+  var HASH_MASK = (1 << TABLE_LEN_BITS) - 1;
+  var hash = 0, mask = 0, maskpos = 0, dpos = 0, table = Array(1 << TABLE_LEN_BITS), out = Array(8), outpos = 0, outfinal = '', c;
+
+  for (;;) {
+    mask = m[maskpos++];
+    if (!mask && dpos >= d.length) // no mask byte and no literal left: done
+      break
+    mask = mask ? mask.charCodeAt(0) : 0; // tolerate a mask string that lost trailing zero bytes: leftover data is all literals
+    outpos = 0;
+    for (var i = 0; i < 8; i++) {
+      if (mask & (1 << i)) {
+        c = table[hash]; // predicted byte, not present in d
+      } else {
+        c = d[dpos++];
+        if (!c)
+          break
+        c = c.charCodeAt(0);
+        table[hash] = c;
+      }
+      out[outpos++] = String.fromCharCode(c);
+      hash = ( (hash << 7) ^ c ) & HASH_MASK // must match encode() in scripts/lzp.py
+    }
+    if (outpos == 8)
+      outfinal += out.join('');
+  }
+  if (outpos < 8)
+    outfinal += out.slice(0, outpos).join('');
+  return outfinal;
+}
+
+function a2b(a) { // base64 decoder: returns the decoded bytes as a string of character codes 0-255
+  var b, c, e = {}, f = 0, g = 0, h = \"\", i = String.fromCharCode, j = (a + '=').indexOf('='); // j: input length without the = padding
+  for (b = 0; 64 > b; b++) e[\"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/\".charAt(b)] = b;
+  for (c = 0; j > c; c++) for (b = e[a.charAt(c)], f = (f << 6) + b, g += 6; g >= 8; ) h += i(255 & f >>> (g -= 8)); // zero bytes are kept: a trailing LZP mask byte may be 0
+  return h;
+}
 
 function FindProxyForURL(url, host) {" >> "$PACFILE"
 
@@ -75,6 +128,19 @@ echo "  if (domains.length < 10) return \"DIRECT\"; // list is broken
      special[i][1] = nmfc(special[i][1]);
     }
 
+    mask_lzp = patternreplace(mask_lzp, true); // undo the pattern substitution, leaving base64
+    mask_lzp = a2b(mask_lzp);
+    domains_lzp = unlzp(domains_lzp, mask_lzp);
+    mask_lzp = 0; // release the mask string
+
+    for (var dmn in domains) { // until now domains[zone][length] holds a character count
+     for (var dcnt in domains[dmn]) {
+      var dmnl = domains[dmn][dcnt];
+      domains[dmn][dcnt] = domains_lzp.slice(0, dmnl); // swap the count for that many decoded characters
+      domains_lzp = domains_lzp.slice(dmnl);
+     }
+    }
+
     az_initialized = 1;
   }
 
@@ -105,7 +171,7 @@ echo "
   if (!curdomain || !curdomain[2]) {return \"DIRECT\";}
   var curhost = curdomain[1];
   var curzone = curdomain[2];
-  curhost = patternreplace(curhost);
+  curhost = patternreplace(curhost, false);
   var curarr = []; // dummy empty array
   if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) {
     if (typeof domains[curzone][curhost.length] === 'string') {
diff --git a/scripts/generate-pac-domains.awk b/scripts/generate-pac-domains.awk
index c90df43..4d11165 100644
--- a/scripts/generate-pac-domains.awk
+++ b/scripts/generate-pac-domains.awk
@@ -19,25 +19,40 @@ function printarray(arrname, arr) {
     for (domainzone in arr) {
         if (firsttime_1 == 0) {printf ",\n"} firsttime_1 = 0;
 
-        print "\"" domainzone "\":{"
+        printf "\"" domainzone "\":{"
 
         for (domainlength in arr[domainzone]) {
-            if (firsttime_2 == 0) {printf ",\n"} firsttime_2 = 0;
+            if (firsttime_2 == 0) {printf ","} firsttime_2 = 0;
 
-            printf " %s", "" domainlength ":\""
-            for (domainname in arr[domainzone][domainlength]) {
-                printf "%s", domainname
-            }
-            printf "\""
+            printf "%s", "" domainlength ":"
+            printf "%d", length(arr[domainzone][domainlength]) * domainlength
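+            # The value printed above is the total number of characters of all
+            # names of this length in this zone (name count * length).  The
+            # names themselves are emitted by printarray_oneline(); the PAC
+            # restores them from the LZP stream, this many characters at a time.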
         }
 
         firsttime_2 = 1;
-        printf "\n}"
+        printf "}"
     }
     print "};"
 }
 
+# Write all names in arr[zone][length][name] back to back, no separators.
+# NOTE(review): the PAC start-up code cuts this stream up in its own for-in
+# order over the domains table - confirm both traversals agree.
+function printarray_oneline(arr) {
+    for (domainzone in arr)
+        for (domainlength in arr[domainzone])
+            for (domainname in arr[domainzone][domainlength])
+                printf "%s", domainname
+}
+
 # Final function
 END {
-    printarray("domains", domainarray)
+    if (lzp) {  # lzp is set on the command line (-v lzp=1) to request the raw name stream
+        printarray_oneline(domainarray)
+    } else {
+        printarray("domains", domainarray)
+    }
 }
diff --git a/scripts/lzp.py b/scripts/lzp.py
new file mode 100755
index 0000000..852872b
--- /dev/null
+++ b/scripts/lzp.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+import base64
+import textwrap
+
+'''
+This script implements LZP compression of the domain list for the PAC file.
+'''
+
+def encode(inputname):
+    """LZP-encode the file *inputname*.
+
+    Each input byte is predicted from a hash of the preceding bytes.
+    Return ``[literals, masks]`` (both bytearrays): every mask byte
+    covers up to 8 input bytes, bit i set meaning byte i equalled the
+    table prediction and was therefore left out of ``literals``.
+    """
+    TABLE_LEN_BITS = 18
+    HASH_MASK = (1 << TABLE_LEN_BITS) - 1
+
+    # The input path comes from the argument (the original ignored it
+    # and always opened sys.argv[1]).
+    with open(inputname, "rb") as ifile:
+        data = ifile.read()
+
+    table = bytearray(1 << TABLE_LEN_BITS)
+    masks = bytearray()
+    obytes = bytearray()
+    hashed = 0
+
+    # The +1 keeps the original framing: a trailing (possibly empty)
+    # group is always emitted, even when len(data) is a multiple of 8.
+    for start in range(0, len(data) + 1, 8):
+        mask = 0
+        for i, c in enumerate(data[start:start + 8]):
+            if c == table[hashed]:
+                mask |= 1 << i
+            else:
+                table[hashed] = c
+                obytes.append(c)
+            # Must stay in sync with unlzp() in generate-pac.sh.
+            hashed = ((hashed << 7) ^ c) & HASH_MASK
+        masks.append(mask)
+
+    return [obytes, masks]
+
+
+def findsequence(inputstr):
+    """Pick frequent 2-character sequences, one per replacement character.
+
+    Each round counts all (overlapping) pairs, keeps the most frequent
+    one and deletes its occurrences before the next round.  Return a
+    dict mapping each chosen pair to a single replacement character.
+    """
+    wordreplace=["!", "@", "#", "$", "%", "^", "*", "(", ")", "[", "]", "-", ",", ".", "?"]
+    patternlen = 2
+    pattern_found = {}
+
+    for round_no, _ in enumerate(wordreplace):
+        patternhit = {}
+        for position in range(len(inputstr) - patternlen + 1):
+            cut = inputstr[position:position+patternlen]
+            patternhit[cut] = patternhit.get(cut, 0) + 1
+        if not patternhit:
+            # Fewer than patternlen characters left: the original code
+            # raised IndexError here; stop with what was found so far.
+            break
+        # Ties go to the pair first seen latest, as the stable sort did.
+        best = max(reversed(list(patternhit)), key=patternhit.get)
+        inputstr = inputstr.replace(best, '')
+        pattern_found[best] = patternhit[best]
+        print("Round", round_no, "pattern_found", pattern_found)
+
+    # zip() pairs the patterns, in discovery order, with their characters.
+    return dict(zip(pattern_found, wordreplace))
+
+if __name__ == "__main__":
+    if len(sys.argv) != 5:
+        print("{}: <input.txt> <output_data.txt> <output_mask.txt> <pac function.js>".format(sys.argv[0]))
+        sys.exit(1)
+
+    def jswrap(text):
+        # Cut into 8192-character lines joined by backslash-newline, which
+        # continues a JavaScript string literal across lines.
+        return "\\\n".join(textwrap.wrap(text, 8192, expand_tabs=False, replace_whitespace=False, drop_whitespace=False, break_long_words=True, break_on_hyphens=False))
+
+    obytes, masks = encode(sys.argv[1])
+    masks_b64 = base64.b64encode(masks).decode()
+    masks_sequences = findsequence(masks_b64)
+    masks_sequenced = masks_b64
+    for k, v in masks_sequences.items():
+        masks_sequenced = masks_sequenced.replace(k, v)
+
+    print("masks:", len(masks), " masks_b64:", len(masks_b64), " masks_sequenced:", len(masks_sequenced))
+    print("obytes:", len(obytes))
+    print("overall:", len(obytes) + len(masks_sequenced))
+
+    # Written with file.write(): os.write() may write only part of a large buffer.
+    with open(sys.argv[2], "wb") as ofile, open(sys.argv[3], "wb") as mfile:
+        ofile.write(jswrap(obytes.decode()).encode())
+        mfile.write(jswrap(masks_sequenced).encode())
+    with open(sys.argv[4], "w") as pacfile:
+        print(masks_sequences, file=pacfile)
diff --git a/scripts/topsequences.py b/scripts/topsequences.py
index 60ed611..b5eda14 100755
--- a/scripts/topsequences.py
+++ b/scripts/topsequences.py
@@ -21,7 +21,7 @@ wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
              "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
              "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}",
              "`", ":", ";", "?"]
-wordreplace_big = ["!" + chr(x) for x in range(ord("A"), ord("Z") + 1)]
+wordreplace_big = ["!" + x for x in wordreplace]  # two-character codes: "!" followed by each entry of wordreplace
 
 with open(sys.argv[1], "r") as dfile:
     domains = dfile.read().split("\n")