mirror of
https://bitbucket.org/anticensority/antizapret-pac-generator-light.git
synced 2024-11-22 05:06:04 +03:00
LZP compression for domains
This commit is contained in:
parent
f95edbfdf2
commit
4c88eb1264
@ -18,6 +18,8 @@ echo "// ProstoVPN.AntiZapret PAC-host File
|
|||||||
" >> "$PACFILE"
|
" >> "$PACFILE"
|
||||||
|
|
||||||
awk -f scripts/generate-pac-domains.awk result/hostlist_zones.txt >> "$PACFILE"
|
awk -f scripts/generate-pac-domains.awk result/hostlist_zones.txt >> "$PACFILE"
|
||||||
|
awk -v lzp=1 -f scripts/generate-pac-domains.awk result/hostlist_zones.txt > temp/domains-oneline.txt
|
||||||
|
python3 scripts/lzp.py temp/domains-oneline.txt temp/domains-oneline-data.txt temp/domains-oneline-mask.txt temp/domains-oneline-pac.js
|
||||||
|
|
||||||
# Collapse IP list
|
# Collapse IP list
|
||||||
scripts/collapse_blockedbyip_noid2971.py
|
scripts/collapse_blockedbyip_noid2971.py
|
||||||
@ -30,21 +32,72 @@ SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \
|
|||||||
awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')"
|
awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')"
|
||||||
|
|
||||||
PATTERNS=$(cat temp/pacpatterns.js)
|
PATTERNS=$(cat temp/pacpatterns.js)
|
||||||
|
PATTERNS_LZP=$(cat temp/domains-oneline-pac.js)
|
||||||
|
DOMAINS_LZP=$(cat temp/domains-oneline-data.txt)
|
||||||
|
MASK_LZP=$(cat temp/domains-oneline-mask.txt)
|
||||||
|
|
||||||
echo "var special = [
|
echo "var special = [
|
||||||
$SPECIAL
|
$SPECIAL
|
||||||
];
|
];
|
||||||
|
|
||||||
|
// domain name data encoded with LZP, without mask data
|
||||||
|
var domains_lzp = \"$DOMAINS_LZP\";
|
||||||
|
|
||||||
|
// LZP mask data, b64+patternreplace
|
||||||
|
var mask_lzp = \"$MASK_LZP\";
|
||||||
|
|
||||||
var az_initialized = 0;
|
var az_initialized = 0;
|
||||||
// CIDR to netmask, for special
|
// CIDR to netmask, for special
|
||||||
function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');}
|
function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');}
|
||||||
// replace repeating sequences in domain
|
// replace repeating sequences in domain
|
||||||
function patternreplace(s) {
|
function patternreplace(s, lzpmask) {
|
||||||
var patterns = $PATTERNS;
|
var patterns = $PATTERNS;
|
||||||
|
if (lzpmask)
|
||||||
|
var patterns = $PATTERNS_LZP;
|
||||||
for (pattern in patterns) {
|
for (pattern in patterns) {
|
||||||
s = s.split(patterns[pattern]).join(pattern);
|
s = s.split(patterns[pattern]).join(pattern);
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
// LZP as in PPP, different hash func
|
||||||
|
// Decode an LZP stream that was split into literal data and prediction masks.
//   d - literal characters the encoder could not predict
//   m - one mask character per group of 8 output characters; bit i set
//       means character i of the group comes from the prediction table
// Returns the reconstructed string.
function unlzp(d, m) {
  var TABLE_LEN_BITS = 18;
  var HASH_MASK = (1 << TABLE_LEN_BITS) - 1;
  var hash = 0, mask = 0, maskpos = 0, dpos = 0, table = Array(1 << TABLE_LEN_BITS), out = Array(8), outpos = 0, outfinal = '';
  // c and i are declared here so they no longer leak into the global scope
  var c, i;

  for (;;) {
    mask = m[maskpos++];
    if (!mask)
      break; // ran past the last mask character
    mask = mask.charCodeAt(0);
    outpos = 0;
    for (i = 0; i < 8; i++) {
      if (mask & (1 << i)) {
        // predicted: take the character remembered for this context
        c = table[hash];
      } else {
        c = d[dpos++];
        if (!c)
          break; // literal data exhausted: last, partial group
        c = c.charCodeAt(0);
        table[hash] = c;
      }
      out[outpos++] = String.fromCharCode(c);
      // same rolling hash as the encoder in scripts/lzp.py
      hash = ((hash << 7) ^ c) & HASH_MASK;
    }
    if (outpos === 8)
      outfinal += out.join('');
  }
  // flush the trailing partial group, if any
  if (outpos < 8)
    outfinal += out.slice(0, outpos).join('');
  return outfinal;
}
|
||||||
|
|
||||||
|
// Base64 decoder (standard alphabet). Returns a binary string with one
// character per decoded byte. Decoding stops at the first '=' padding
// character; characters outside the alphabet are skipped.
function a2b(a) {
  var alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
  var lookup = {}, acc = 0, nbits = 0, decoded = '', pos, ch, val;
  for (pos = 0; pos < 64; pos++) lookup[alphabet.charAt(pos)] = pos;
  for (pos = 0; pos < a.length; pos++) {
    ch = a.charAt(pos);
    if (ch === '=') break;
    val = lookup[ch];
    if (val === undefined) continue;
    // at most 12 bits are ever pending, so older bits can be discarded
    acc = ((acc << 6) | val) & 0xFFF;
    nbits += 6;
    if (nbits >= 8) {
      nbits -= 8;
      // zero bytes are emitted as well: dropping one would lose an LZP
      // mask and shift every mask that follows it
      decoded += String.fromCharCode((acc >>> nbits) & 255);
    }
  }
  return decoded;
}
|
||||||
|
|
||||||
function FindProxyForURL(url, host) {" >> "$PACFILE"
|
function FindProxyForURL(url, host) {" >> "$PACFILE"
|
||||||
|
|
||||||
@ -75,6 +128,19 @@ echo " if (domains.length < 10) return \"DIRECT\"; // list is broken
|
|||||||
special[i][1] = nmfc(special[i][1]);
|
special[i][1] = nmfc(special[i][1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mask_lzp = patternreplace(mask_lzp, true);
|
||||||
|
mask_lzp = a2b(mask_lzp);
|
||||||
|
domains_lzp = unlzp(domains_lzp, mask_lzp);
|
||||||
|
mask_lzp = 0;
|
||||||
|
|
||||||
|
for (dmn in domains) {
|
||||||
|
for (dcnt in domains[dmn]) {
|
||||||
|
dmnl = domains[dmn][dcnt];
|
||||||
|
domains[dmn][dcnt] = domains_lzp.slice(0, dmnl);
|
||||||
|
domains_lzp = domains_lzp.slice(dmnl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
az_initialized = 1;
|
az_initialized = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,7 +171,7 @@ echo "
|
|||||||
if (!curdomain || !curdomain[2]) {return \"DIRECT\";}
|
if (!curdomain || !curdomain[2]) {return \"DIRECT\";}
|
||||||
var curhost = curdomain[1];
|
var curhost = curdomain[1];
|
||||||
var curzone = curdomain[2];
|
var curzone = curdomain[2];
|
||||||
curhost = patternreplace(curhost);
|
curhost = patternreplace(curhost, false);
|
||||||
var curarr = []; // dummy empty array
|
var curarr = []; // dummy empty array
|
||||||
if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) {
|
if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) {
|
||||||
if (typeof domains[curzone][curhost.length] === 'string') {
|
if (typeof domains[curzone][curhost.length] === 'string') {
|
||||||
|
@ -19,25 +19,40 @@ function printarray(arrname, arr) {
|
|||||||
for (domainzone in arr) {
|
for (domainzone in arr) {
|
||||||
if (firsttime_1 == 0) {printf ",\n"} firsttime_1 = 0;
|
if (firsttime_1 == 0) {printf ",\n"} firsttime_1 = 0;
|
||||||
|
|
||||||
print "\"" domainzone "\":{"
|
printf "\"" domainzone "\":{"
|
||||||
|
|
||||||
for (domainlength in arr[domainzone]) {
|
for (domainlength in arr[domainzone]) {
|
||||||
if (firsttime_2 == 0) {printf ",\n"} firsttime_2 = 0;
|
if (firsttime_2 == 0) {printf ","} firsttime_2 = 0;
|
||||||
|
|
||||||
printf " %s", "" domainlength ":\""
|
printf "%s", "" domainlength ":"
|
||||||
for (domainname in arr[domainzone][domainlength]) {
|
printf "%d", length(arr[domainzone][domainlength]) * domainlength
|
||||||
printf "%s", domainname
|
#for (domainname in arr[domainzone][domainlength]) {
|
||||||
}
|
# printf "%d", length(domainname)
|
||||||
printf "\""
|
#}
|
||||||
|
#printf "\""
|
||||||
}
|
}
|
||||||
|
|
||||||
firsttime_2 = 1;
|
firsttime_2 = 1;
|
||||||
printf "\n}"
|
printf "}"
|
||||||
}
|
}
|
||||||
print "};"
|
print "};"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Print every domain name of arr[zone][length][name] back to back, with no
# separators and no trailing newline (input for the LZP compressor).
# The names after "arr" are awk-style local variables, not real parameters;
# callers pass only arr.
# NOTE(review): the PAC init code cuts the decompressed stream into pieces
# using the byte counts printed by printarray(), so both functions presumably
# have to walk arr in the same order. awk leaves "for (k in a)" order
# unspecified -- confirm this holds for the awk used in production.
function printarray_oneline(arr,    domainzone, domainlength, domainname) {
    for (domainzone in arr) {
        for (domainlength in arr[domainzone]) {
            for (domainname in arr[domainzone][domainlength]) {
                printf "%s", domainname
            }
        }
    }
}
|
||||||
|
|
||||||
# Final function
|
# Final function
|
||||||
END {
|
END {
|
||||||
|
if (lzp) {
|
||||||
|
printarray_oneline(domainarray)
|
||||||
|
} else {
|
||||||
printarray("domains", domainarray)
|
printarray("domains", domainarray)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
105
scripts/lzp.py
Executable file
105
scripts/lzp.py
Executable file
@ -0,0 +1,105 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import base64
|
||||||
|
import textwrap
|
||||||
|
|
||||||
|
'''
|
||||||
|
This script implements LZP compression for PAC file.
|
||||||
|
'''
|
||||||
|
|
||||||
|
def encode(inputname):
    """LZP-compress the file at ``inputname``.

    The input is processed in groups of 8 bytes. For each byte a rolling
    hash of the preceding bytes selects a slot in a prediction table; when
    the slot already holds the byte, only a mask bit is set, otherwise the
    byte is stored as a literal and remembered in the slot.

    Args:
        inputname: path of the file to compress.

    Returns:
        ``[obytes, masks]``: two bytearrays holding the literal bytes and
        one mask byte per group (bit ``i`` set means byte ``i`` of the group
        was predicted). A final mask byte is always emitted, even when the
        last group is empty, matching what the decoder has always received.
    """
    TABLE_LEN_BITS = 18
    HASH_MASK = (1 << TABLE_LEN_BITS) - 1

    # Honour the parameter (previously sys.argv[1] was read regardless) and
    # make sure the handle is closed even if reading fails.
    with open(inputname, "rb") as ifile:
        data = ifile.read()

    table = bytearray(1 << TABLE_LEN_BITS)
    masks = bytearray()
    obytes = bytearray()
    hashed = 0

    # "+ 1" yields one extra, empty group when len(data) is a multiple of 8,
    # which reproduces the trailing zero mask of the byte-at-a-time reader.
    for start in range(0, len(data) + 1, 8):
        mask = 0
        for i, c in enumerate(data[start:start + 8]):
            if c == table[hashed]:
                mask |= 1 << i
            else:
                table[hashed] = c
                obytes.append(c)
            # must stay in sync with the hash in the PAC-side unlzp()
            hashed = ((hashed << 7) ^ c) & HASH_MASK
        masks.append(mask)

    return [obytes, masks]
|
||||||
|
|
||||||
|
|
||||||
|
def findsequence(inputstr):
    """Pick frequent 2-character sequences and assign each a 1-character code.

    One sequence is chosen per round, for at most ``len(wordreplace)``
    rounds: the sequence with the highest ``length * occurrences`` score
    (the last one seen wins a tie) is recorded and then removed from the
    working copy of the string before the next round. The search stops
    early when no 2-character sequence is left, instead of raising
    IndexError.

    Args:
        inputstr: text to analyse (the base64-encoded mask data).

    Returns:
        dict mapping each chosen sequence to its replacement character, in
        the order the sequences were found.
    """
    wordreplace = ["!", "@", "#", "$", "%", "^", "*", "(", ")", "[", "]", "-", ",", ".", "?"]
    pattern_found = {}

    for patternlen in (2,):
        for round_no in range(len(wordreplace)):
            # Count every (overlapping) window of the current string.
            patternhit = {}
            for position in range(len(inputstr) - patternlen + 1):
                cut = inputstr[position:position + patternlen]
                patternhit[cut] = patternhit.get(cut, 0) + 1

            if not patternhit:
                # Nothing left to replace; further rounds cannot find more.
                break

            best = None
            best_score = -1
            for cut, hits in patternhit.items():
                score = len(cut) * hits
                # ">=" keeps the last candidate among equal scores
                if score >= best_score:
                    best, best_score = cut, score

            inputstr = inputstr.replace(best, '')
            pattern_found[best] = patternhit[best]
            print("Round", round_no, "pattern_found", pattern_found)

    return {p: wordreplace[i] for i, p in enumerate(pattern_found)}
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Usage: lzp.py <input> <literal data out> <mask data out> <pattern table out>
    if len(sys.argv) != 5:
        print("{}: <input.txt> <output_data.txt> <output_mask.txt> <pac function.js>".format(sys.argv[0]))
        sys.exit(1)

    obytes, masks = encode(sys.argv[1])

    # Masks are binary, so they are base64-encoded and then shortened by
    # replacing the most frequent 2-character sequences with single symbols.
    masks_b64 = base64.b64encode(masks).decode()
    masks_sequences = findsequence(masks_b64)
    masks_sequenced = masks_b64
    for k, v in masks_sequences.items():
        masks_sequenced = masks_sequenced.replace(k, v)

    print("masks:", len(masks), " masks_b64:", len(masks_b64), " masks_sequenced:", len(masks_sequenced))
    print("obytes:", len(obytes))
    print("overall:", len(obytes) + len(masks_sequenced))

    # Both payloads are cut into 8192-character lines joined by a backslash
    # followed by a newline.
    wrap_opts = dict(width=8192, expand_tabs=False, replace_whitespace=False,
                     drop_whitespace=False, break_long_words=True,
                     break_on_hyphens=False)

    # Use the file objects' write(): os.write() on the raw descriptor may
    # write only part of the buffer, and its return value was never checked.
    with open(sys.argv[2], "wb") as ofile:
        ofile.write("\\\n".join(textwrap.wrap(obytes.decode(), **wrap_opts)).encode())
    with open(sys.argv[3], "wb") as mfile:
        mfile.write("\\\n".join(textwrap.wrap(masks_sequenced, **wrap_opts)).encode())

    # The sequence -> symbol table is written as a Python dict repr.
    with open(sys.argv[4], "w") as pacfile:
        print(masks_sequences, file=pacfile)
|
@ -21,7 +21,7 @@ wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J",
|
|||||||
"@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
|
"@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
|
||||||
"=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}",
|
"=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}",
|
||||||
"`", ":", ";", "?"]
|
"`", ":", ";", "?"]
|
||||||
wordreplace_big = ["!" + chr(x) for x in range(ord("A"), ord("Z") + 1)]
|
wordreplace_big = ["!" + x for x in wordreplace]
|
||||||
|
|
||||||
with open(sys.argv[1], "r") as dfile:
|
with open(sys.argv[1], "r") as dfile:
|
||||||
domains = dfile.read().split("\n")
|
domains = dfile.read().split("\n")
|
||||||
|
Loading…
Reference in New Issue
Block a user