LZP compression for domains

2024-11-22 05:06:04 +03:00 · 2024-03-24 18:32:07 +07:00 · 2024-03-24 18:32:07 +07:00 · 4c88eb1264
commit 4c88eb1264
parent f95edbfdf2
4 changed files with 198 additions and 12 deletions
--- a/generate-pac.sh
+++ b/generate-pac.sh
@ -18,6 +18,8 @@ echo "// ProstoVPN.AntiZapret PAC-host File
 " >> "$PACFILE"
 awk -f scripts/generate-pac-domains.awk result/hostlist_zones.txt >> "$PACFILE"
 awk -v lzp=1 -f scripts/generate-pac-domains.awk result/hostlist_zones.txt > temp/domains-oneline.txt
 python3 scripts/lzp.py temp/domains-oneline.txt temp/domains-oneline-data.txt temp/domains-oneline-mask.txt temp/domains-oneline-pac.js
 # Collapse IP list
 scripts/collapse_blockedbyip_noid2971.py
@ -30,21 +32,72 @@ SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \
    awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')"
 # Pattern table spliced into patternreplace() for the domain strings
 PATTERNS=$(cat temp/pacpatterns.js)
 # Pattern table written by scripts/lzp.py, spliced into patternreplace() for the LZP mask
 PATTERNS_LZP=$(cat temp/domains-oneline-pac.js)
 # LZP literal data written by scripts/lzp.py, becomes the domains_lzp variable of the PAC file
 DOMAINS_LZP=$(cat temp/domains-oneline-data.txt)
 # LZP mask (base64 + pattern replacement) written by scripts/lzp.py, becomes mask_lzp
 MASK_LZP=$(cat temp/domains-oneline-mask.txt)
 echo "var special = [
 $SPECIAL
 ];
 // domain name data encoded with LZP, without mask data
 var domains_lzp = \"$DOMAINS_LZP\";
 // LZP mask data, b64+patternreplace
 var mask_lzp = \"$MASK_LZP\";
 var az_initialized = 0;
 // Convert a CIDR prefix length into a dotted netmask string (applied to the entries of special)
 function nmfc(b) {
   var octets = [];
   for (var k = 0; k < 4; k++) {
     // each octet consumes at most 8 of the remaining prefix bits
     var bits = Math.min(b, 8);
     octets.push(256 - Math.pow(2, 8 - bits));
     b -= bits;
   }
   return octets.join('.');
 }
 // Replace repeating sequences: for every entry of the pattern table, each
 // occurrence of the entry value in s is swapped for the entry key
 // (split on the value, join with the key).
 //   s       - string to transform
 //   lzpmask - when truthy, the pattern table generated for the LZP mask is
 //             used instead of the domain pattern table
 // Returns the transformed string.
-function patternreplace(s) {
+function patternreplace(s, lzpmask) {
  var patterns = $PATTERNS;
  if (lzpmask)
   var patterns = $PATTERNS_LZP;
  // NOTE(review): pattern is not declared with var, so it is an implicit global
  for (pattern in patterns) {
    s = s.split(patterns[pattern]).join(pattern);
  }
  return s;
 }
 // LZP decoder (as in PPP, different hash func).
 //   d - literal characters, i.e. the symbols the prediction table did not supply
 //   m - mask string, one character per group of up to 8 symbols; bit i set means
 //       symbol i comes from the prediction table, bit i clear means it is the
 //       next character of d
 // Returns the decoded string.
 function unlzp(d, m) {
  var TABLE_LEN_BITS = 18;
  var HASH_MASK = (1 << TABLE_LEN_BITS) - 1;
  // c is part of this var list so that it stays local instead of becoming an implicit global
  var hash = 0, mask = 0, maskpos = 0, dpos = 0, table = Array(1 << TABLE_LEN_BITS), out = Array(8), outpos = 0, outfinal = '', c;
  for (;;) {
    mask = m[maskpos++];
    if (!mask)
      break; // ran past the end of the mask string
    mask = mask.charCodeAt(0);
    outpos = 0;
    for (var i = 0; i < 8; i++) {
      if (mask & (1 << i)) {
        // predicted symbol: reuse what was last stored for this hash context
        c = table[hash];
      } else {
        c = d[dpos++];
        if (!c)
          break; // literals exhausted, the current group stays partial
        c = c.charCodeAt(0);
        table[hash] = c;
      }
      out[outpos++] = String.fromCharCode(c);
      hash = ( (hash << 7) ^ c ) & HASH_MASK;
    }
    // full groups are appended immediately
    if (outpos == 8)
      outfinal += out.join('');
  }
  // a trailing partial group is appended once the mask string ends
  if (outpos < 8)
    outfinal += out.slice(0, outpos).join('');
  return outfinal;
 }
 // Base64 decoder: turns the base64 string a into a string holding one character per decoded byte.
 // e maps every character of the base64 alphabet to its 6-bit value, f accumulates the bits,
 // g counts how many bits are pending, and a byte is taken each time at least 8 are pending.
 // A zero byte is only kept while more than two input characters remain (j - 2 > c);
 // '=' is not in the alphabet table, so the padding at the tail only yields such dropped zero bytes.
 // NOTE(review): the same rule also drops a genuine zero byte at the very end of the data - confirm callers tolerate that.
 function a2b(a) {
  var b, c, d, e = {}, f = 0, g = 0, h = \"\", i = String.fromCharCode, j = a.length;
  for (b = 0; 64 > b; b++) e[\"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/\".charAt(b)] = b;
  for (c = 0; j > c; c++) for (b = e[a.charAt(c)], f = (f << 6) + b, g += 6; g >= 8; ) ((d = 255 & f >>> (g -= 8)) || j - 2 > c) && (h += i(d));
  return h;
 }
 function FindProxyForURL(url, host) {" >> "$PACFILE"
@ -75,6 +128,19 @@ echo "  if (domains.length < 10) return \"DIRECT\"; // list is broken
     special[i][1] = nmfc(special[i][1]);
    }
    mask_lzp = patternreplace(mask_lzp, true);
    mask_lzp = a2b(mask_lzp);
    domains_lzp = unlzp(domains_lzp, mask_lzp);
    mask_lzp = 0;
    for (dmn in domains) {
     for (dcnt in domains[dmn]) {
      dmnl = domains[dmn][dcnt];
      domains[dmn][dcnt] = domains_lzp.slice(0, dmnl);
      domains_lzp = domains_lzp.slice(dmnl);
     }
    }
    az_initialized = 1;
  }
@ -105,7 +171,7 @@ echo "
  if (!curdomain || !curdomain[2]) {return \"DIRECT\";}
  var curhost = curdomain[1];
  var curzone = curdomain[2];
-  curhost = patternreplace(curhost);
+  curhost = patternreplace(curhost, false);
  var curarr = []; // dummy empty array
  if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) {
    if (typeof domains[curzone][curhost.length] === 'string') {
--- a/scripts/generate-pac-domains.awk
+++ b/scripts/generate-pac-domains.awk
@ -19,25 +19,40 @@ function printarray(arrname, arr) {
    for (domainzone in arr) {
        if (firsttime_1 == 0) {printf ",\n"} firsttime_1 = 0;
-        print "\"" domainzone "\":{"
+        printf "\"" domainzone "\":{"
        for (domainlength in arr[domainzone]) {
-            if (firsttime_2 == 0) {printf ",\n"} firsttime_2 = 0;
+            if (firsttime_2 == 0) {printf ","} firsttime_2 = 0;
-            printf " %s", "" domainlength ":\""
+            printf "%s", "" domainlength ":"
-            for (domainname in arr[domainzone][domainlength]) {
+            printf "%d", length(arr[domainzone][domainlength]) * domainlength
-                printf "%s", domainname
+            #for (domainname in arr[domainzone][domainlength]) {
-            }
+            #    printf "%d", length(domainname)
-            printf "\""
+            #}
            #printf "\""
        }
        firsttime_2 = 1;
-        printf "\n}"
+        printf "}"
    }
    print "};"
 }
 # Print every domain name stored in arr[zone][length][name] back to back,
 # with no separators and no trailing newline.
 # NOTE(review): domainzone, domainlength and domainname are not declared as
 # extra function parameters, so they are awk globals.
 # NOTE(review): the consumer presumably cuts this stream apart using the
 # per-zone/per-length sizes printed by printarray, which requires both
 # traversals to visit the keys in the same order - confirm.
 function printarray_oneline(arr) {
    for (domainzone in arr) {
        for (domainlength in arr[domainzone]) {
            for (domainname in arr[domainzone][domainlength]) {
                # names are emitted without any delimiter
                printf "%s", domainname
            }
        }
    }
 }
 # Final function: pick the output format once all input has been read.
 # With lzp set (generate-pac.sh passes -v lzp=1) only the concatenated domain
 # names are printed; otherwise the "domains" table is printed.
 END {
-    printarray("domains", domainarray)
+    if (lzp) {
        printarray_oneline(domainarray)
    } else {
        printarray("domains", domainarray)
    }
 }
--- a/scripts/lzp.py
+++ b/scripts/lzp.py
@ -0,0 +1,105 @@
 #!/usr/bin/env python3
 import sys
 import os
 import base64
 import textwrap
 '''
 This script implements LZP compression for PAC file.
 '''
 def encode(inputname):
    """Compress the file at *inputname* with LZP.

    Each input byte is predicted from a table indexed by a rolling hash of
    the preceding bytes. Correctly predicted bytes are recorded only as a set
    bit in the mask stream; mispredicted bytes are stored as literals and
    update the table. TABLE_LEN_BITS and the hash step must stay in sync with
    the unlzp() decoder embedded in generate-pac.sh.

    Args:
        inputname: path of the file to compress.

    Returns:
        A two-item list ``[obytes, masks]`` of bytearrays: the literal bytes
        and the mask bytes (one per group of up to 8 input bytes, bit i set
        when byte i of the group was predicted). A final mask byte is always
        emitted, so an input whose length is a multiple of 8 (including an
        empty one) ends with an extra zero mask byte.
    """
    TABLE_LEN_BITS = 18
    HASH_MASK = (1 << TABLE_LEN_BITS) - 1
    # Read the path the caller passed in rather than sys.argv[1]; the context
    # manager closes the file even if reading fails.
    with open(inputname, "rb") as ifile:
        data = ifile.read()
    table = bytearray(1 << TABLE_LEN_BITS)
    masks = bytearray()
    obytes = bytearray()
    hashed = 0
    # The stop value is len(data) + 1 so that the trailing (possibly empty)
    # group still produces its mask byte, as the decoder input expects.
    for start in range(0, len(data) + 1, 8):
        mask = 0
        for i, c in enumerate(data[start:start + 8]):
            if c == table[hashed]:
                mask |= 1 << i
            else:
                table[hashed] = c
                obytes.append(c)
            hashed = ((hashed << 7) ^ c) & HASH_MASK
        masks.append(mask)
    return [obytes, masks]
 def findsequence(inputstr):
    """Pick the most frequent 2-character sequences of *inputstr*.

    Runs one round per available replacement character. Each round counts
    every (overlapping) 2-character substring of the working string, takes
    the most frequent one (on a tie, the one first seen latest), removes all
    its occurrences from the working string and records it. Progress is
    printed to stdout after every round.

    Args:
        inputstr: text to analyse (the base64 mask in this script).

    Returns:
        A dict mapping each chosen sequence to its single-character
        replacement, in the order the sequences were found. It holds fewer
        than 15 entries when the working string runs out of sequences or a
        sequence is chosen twice, and is empty for inputs shorter than two
        characters.
    """
    wordreplace = ["!", "@", "#", "$", "%", "^", "*", "(", ")", "[", "]", "-", ",", ".", "?"]
    pattern_found = {}
    for patternlen in (2,):
        for round_no in range(len(wordreplace)):
            patternhit = {}
            for position in range(len(inputstr) - patternlen + 1):
                cut = inputstr[position:position + patternlen]
                patternhit[cut] = patternhit.get(cut, 0) + 1
            # Nothing left to count: stop instead of indexing an empty table.
            if not patternhit:
                break
            # ">=" keeps the last of equally frequent sequences, matching the
            # stable-sort-and-take-last selection this replaces.
            best = None
            for cut, hits in patternhit.items():
                if best is None or hits >= patternhit[best]:
                    best = cut
            inputstr = inputstr.replace(best, '')
            pattern_found[best] = patternhit[best]
            print("Round", round_no, "pattern_found", pattern_found)
    return {p: wordreplace[i] for i, p in enumerate(pattern_found)}
 if __name__ == "__main__":
    # Usage: lzp.py <input.txt> <output_data.txt> <output_mask.txt> <pac function.js>
    if len(sys.argv) != 5:
        print("{}: <input.txt> <output_data.txt> <output_mask.txt> <pac function.js>".format(sys.argv[0]))
        sys.exit(1)
    # Compress first, so a failure on the input leaves the output files untouched.
    obytes, masks = encode(sys.argv[1])
    masks_b64 = base64.b64encode(masks).decode()
    # Shrink the base64 mask further by swapping its most frequent 2-character
    # sequences for single characters that never occur in base64.
    masks_sequences = findsequence(masks_b64)
    masks_sequenced = masks_b64
    for k, v in masks_sequences.items():
        masks_sequenced = masks_sequenced.replace(k, v)
    print("masks:", len(masks), " masks_b64:", len(masks_b64), " masks_sequenced:", len(masks_sequenced))
    print("obytes:", len(obytes))
    print("overall:", len(obytes) + len(masks_sequenced))
    # Both payloads are cut into 8192-character lines joined by a backslash
    # plus newline; whitespace is neither collapsed nor dropped.
    wrap_opts = dict(width=8192, expand_tabs=False, replace_whitespace=False, drop_whitespace=False, break_long_words=True, break_on_hyphens=False)
    for path, text in ((sys.argv[2], obytes.decode()), (sys.argv[3], masks_sequenced)):
        # A buffered file object writes the whole payload and is closed on error,
        # unlike a single unchecked os.write() on the raw descriptor.
        with open(path, "wb") as out:
            out.write("\\\n".join(textwrap.wrap(text, **wrap_opts)).encode())
    # The replacement table is written as the dict repr, which generate-pac.sh
    # splices into the PAC file as an object literal.
    with open(sys.argv[4], "w") as pacfile:
        print(masks_sequences, file=pacfile)
--- a/scripts/topsequences.py
+++ b/scripts/topsequences.py
@ -21,7 +21,7 @@ wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
             "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
             "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}",
             "`", ":", ";", "?"]
-wordreplace_big = ["!" + chr(x) for x in range(ord("A"), ord("Z") + 1)]
+wordreplace_big = ["!" + x for x in wordreplace]
 with open(sys.argv[1], "r") as dfile:
    domains = dfile.read().split("\n")