From 4c88eb126430848be97c2c0abefd89a2a4ae8dda Mon Sep 17 00:00:00 2001 From: ValdikSS Date: Sun, 24 Mar 2024 18:32:07 +0700 Subject: [PATCH] LZP compression for domains --- generate-pac.sh | 70 ++++++++++++++++++++- scripts/generate-pac-domains.awk | 33 +++++++--- scripts/lzp.py | 105 +++++++++++++++++++++++++++++++ scripts/topsequences.py | 2 +- 4 files changed, 198 insertions(+), 12 deletions(-) create mode 100755 scripts/lzp.py diff --git a/generate-pac.sh b/generate-pac.sh index f3f3fcb..82105d4 100755 --- a/generate-pac.sh +++ b/generate-pac.sh @@ -18,6 +18,8 @@ echo "// ProstoVPN.AntiZapret PAC-host File " >> "$PACFILE" awk -f scripts/generate-pac-domains.awk result/hostlist_zones.txt >> "$PACFILE" +awk -v lzp=1 -f scripts/generate-pac-domains.awk result/hostlist_zones.txt > temp/domains-oneline.txt +python3 scripts/lzp.py temp/domains-oneline.txt temp/domains-oneline-data.txt temp/domains-oneline-mask.txt temp/domains-oneline-pac.js # Collapse IP list scripts/collapse_blockedbyip_noid2971.py @@ -30,21 +32,72 @@ SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \ awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')" PATTERNS=$(cat temp/pacpatterns.js) +PATTERNS_LZP=$(cat temp/domains-oneline-pac.js) +DOMAINS_LZP=$(cat temp/domains-oneline-data.txt) +MASK_LZP=$(cat temp/domains-oneline-mask.txt) echo "var special = [ $SPECIAL ]; + +// domain name data encoded with LZP, without mask data +var domains_lzp = \"$DOMAINS_LZP\"; + +// LZP mask data, b64+patternreplace +var mask_lzp = \"$MASK_LZP\"; + var az_initialized = 0; // CIDR to netmask, for special function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');} // replace repeating sequences in domain -function patternreplace(s) { +function patternreplace(s, lzpmask) { var patterns = $PATTERNS; + if (lzpmask) + var patterns = $PATTERNS_LZP; for (pattern in patterns) { s = s.split(patterns[pattern]).join(pattern); } return s; } +// LZP as in PPP, different hash func +function unlzp(d, m) { + var TABLE_LEN_BITS = 18; + var HASH_MASK = (1 << TABLE_LEN_BITS) - 1; + var hash = 0, mask = 0, maskpos = 0, dpos = 0, table = Array(1 << TABLE_LEN_BITS), out = Array(8), outpos = 0, outfinal = ''; + + for (;;) { + mask = m[maskpos++]; + if (!mask) + break + mask = mask.charCodeAt(0); + outpos = 0; + for (var i = 0; i < 8; i++) { + if (mask & (1 << i)) { + c = table[hash]; + } else { + c = d[dpos++]; + if (!c) + break + c = c.charCodeAt(0); + table[hash] = c; + } + out[outpos++] = String.fromCharCode(c); + hash = ( (hash << 7) ^ c ) & HASH_MASK + } + if (outpos == 8) + outfinal += out.join(''); + } + if (outpos < 8) + outfinal += out.slice(0, outpos).join(''); + return outfinal; +} + +function a2b(a) { + var b, c, d, e = {}, f = 0, g = 0, h = \"\", i = String.fromCharCode, j = a.length; + for (b = 0; 64 > b; b++) e[\"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/\".charAt(b)] = b; + for (c = 0; j > c; c++) for (b = e[a.charAt(c)], f = (f << 6) + b, g += 6; g >= 8; ) ((d = 255 & f >>> (g -= 8)) || j - 2 > c) && (h += i(d)); + return h; +} function FindProxyForURL(url, host) {" >> "$PACFILE" @@ -75,6 +128,19 @@ echo " if (domains.length < 10) return \"DIRECT\"; // list is broken special[i][1] = nmfc(special[i][1]); } + mask_lzp = patternreplace(mask_lzp, true); + mask_lzp = a2b(mask_lzp); + domains_lzp = unlzp(domains_lzp, mask_lzp); + mask_lzp = 0; + + for (dmn in domains) { + for (dcnt in domains[dmn]) { + dmnl = domains[dmn][dcnt]; + domains[dmn][dcnt] = domains_lzp.slice(0, dmnl); + domains_lzp = domains_lzp.slice(dmnl); + } + } + az_initialized = 1; } @@ -105,7 +171,7 @@ echo " if (!curdomain || !curdomain[2]) {return \"DIRECT\";} var curhost = curdomain[1]; var curzone = curdomain[2]; - curhost = patternreplace(curhost); + curhost = patternreplace(curhost, false); var curarr = []; // dummy empty array if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) { if (typeof domains[curzone][curhost.length] === 'string') { diff --git a/scripts/generate-pac-domains.awk b/scripts/generate-pac-domains.awk index c90df43..4d11165 100644 --- a/scripts/generate-pac-domains.awk +++ b/scripts/generate-pac-domains.awk @@ -19,25 +19,40 @@ function printarray(arrname, arr) { for (domainzone in arr) { if (firsttime_1 == 0) {printf ",\n"} firsttime_1 = 0; - print "\"" domainzone "\":{" + printf "\"" domainzone "\":{" for (domainlength in arr[domainzone]) { - if (firsttime_2 == 0) {printf ",\n"} firsttime_2 = 0; + if (firsttime_2 == 0) {printf ","} firsttime_2 = 0; - printf " %s", "" domainlength ":\"" - for (domainname in arr[domainzone][domainlength]) { - printf "%s", domainname - } - printf "\"" + printf "%s", "" domainlength ":" + printf "%d", length(arr[domainzone][domainlength]) * domainlength + #for (domainname in arr[domainzone][domainlength]) { + # printf "%d", length(domainname) + #} + #printf "\"" } firsttime_2 = 1; - printf "\n}" + printf "}" } print "};" } +function printarray_oneline(arr) { + for (domainzone in arr) { + for (domainlength in arr[domainzone]) { + for (domainname in arr[domainzone][domainlength]) { + printf "%s", domainname + } + } + } +} + # Final function END { - printarray("domains", domainarray) + if (lzp) { + printarray_oneline(domainarray) + } else { + printarray("domains", domainarray) + } } diff --git a/scripts/lzp.py b/scripts/lzp.py new file mode 100755 index 0000000..852872b --- /dev/null +++ b/scripts/lzp.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 + +import sys +import os +import base64 +import textwrap + +''' +This script implements LZP compression for PAC file. +''' + +def encode(inputname): + TABLE_LEN_BITS = 18 + HASH_MASK = (1 << TABLE_LEN_BITS) - 1 + + ifile = open(sys.argv[1], "rb") + table = bytearray(1 << TABLE_LEN_BITS) + masks = bytearray() + obytes = bytearray() + hashed = 0 + + while True: + mask = 0 + buf = b"" + + for i in range(8): + cb = ifile.read(1) + if not cb: + break + c = ord(cb) + + if c == table[hashed]: + mask |= 1 << i; + else: + table[hashed] = c + buf += cb + + hashed = ( (hashed << 7) ^ c ) & HASH_MASK + + masks += mask.to_bytes(1, 'big') + obytes += buf + + if not cb: + break + + ifile.close() + return [obytes, masks] + + +def findsequence(inputstr): + wordreplace=["!", "@", "#", "$", "%", "^", "*", "(", ")", "[", "]", "-", ",", ".", "?"] + patternhit = {} + pattern_found = {} + input_len = len(inputstr) + + for patternlen in (2,): + for round, _ in enumerate(wordreplace): + position = 0 + while position <= input_len: + cut = inputstr[position:position+patternlen] + position += 1 + if len(cut) != patternlen: + continue + if not patternhit.get(cut): + patternhit[cut] = 0 + patternhit[cut] += 1 + #print("Round", round, "patternhit", patternhit) + patternhit = dict(sorted(patternhit.items(), key=lambda x: len(x[0]) * x[1])[-1:]) + inputstr = inputstr.replace(list(patternhit.keys())[0], '') + pattern_found.update(patternhit) + patternhit = {} + print("Round", round, "pattern_found", pattern_found) + + pattern_ret = {} + for i, p in enumerate(pattern_found.keys()): + pattern_ret.update({p: wordreplace[i]}) + return pattern_ret + +if __name__ == "__main__": + if len(sys.argv) != 5: + print("{}: ".format(sys.argv[0])) + sys.exit(1) + + ofile = open(sys.argv[2], "wb") + mfile = open(sys.argv[3], "wb") + + obytes, masks = encode(sys.argv[1]) + masks_b64 = base64.b64encode(masks).decode() + masks_seqences = findsequence(masks_b64) + masks_sequenced = masks_b64 + for k, v in masks_seqences.items(): + #print(k, v) + masks_sequenced = masks_sequenced.replace(k, v) + + print("masks:", len(masks), " masks_b64:", len(masks_b64), " masks_sequenced:", len(masks_sequenced)) + print("obytes:", len(obytes)) + print("overall:", len(obytes) + len(masks_sequenced)) + + os.write(ofile.fileno(), "\\\n".join(textwrap.wrap(obytes.decode(), 8192, expand_tabs=False, replace_whitespace=False, drop_whitespace=False, break_long_words=True, break_on_hyphens=False)).encode()) + os.write(mfile.fileno(), "\\\n".join(textwrap.wrap(masks_sequenced, 8192, expand_tabs=False, replace_whitespace=False, drop_whitespace=False, break_long_words=True, break_on_hyphens=False)).encode()) + ofile.close() + mfile.close() + + with open(sys.argv[4], "w") as pacfile: + print(masks_seqences, file=pacfile) diff --git a/scripts/topsequences.py b/scripts/topsequences.py index 60ed611..b5eda14 100755 --- a/scripts/topsequences.py +++ b/scripts/topsequences.py @@ -21,7 +21,7 @@ wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")", "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}", "`", ":", ";", "?"] -wordreplace_big = ["!" + chr(x) for x in range(ord("A"), ord("Z") + 1)] +wordreplace_big = ["!" + x for x in wordreplace] with open(sys.argv[1], "r") as dfile: domains = dfile.read().split("\n")