diff --git a/dict/google-1000.txt b/dict/google-1000.txt deleted file mode 100644 index 1851ecc..0000000 --- a/dict/google-1000.txt +++ /dev/null @@ -1,1000 +0,0 @@ -the -and -for -that -this -with -you -not -are -from -your -all -have -new -more -was -will -home -can -about -page -has -search -free -but -our -one -other -information -time -they -site -may -what -which -their -news -out -use -any -there -see -only -his -when -contact -here -business -who -web -also -now -help -get -view -online -first -been -would -how -were -services -some -these -click -its -like -service -than -find -price -date -back -top -people -had -list -name -just -over -state -year -day -into -email -two -health -world -next -used -work -last -most -products -music -buy -data -make -them -should -product -system -post -her -city -add -policy -number -such -please -available -copyright -support -message -after -best -software -then -jan -good -video -well -where -info -rights -public -books -high -school -through -each -links -she -review -years -order -very -privacy -book -items -company -read -group -sex -need -many -user -said -does -set -under -general -research -university -january -mail -full -map -reviews -program -life -know -games -way -days -management -part -could -great -united -hotel -real -item -international -center -ebay -must -store -travel -comments -made -development -report -off -member -details -line -terms -before -hotels -did -send -right -type -because -local -those -using -results -office -education -national -car -design -take -posted -internet -address -community -within -states -area -want -phone -dvd -shipping -reserved -subject -between -forum -family -long -based -code -show -even -black -check -special -prices -website -index -being -women -much -sign -file -link -open -today -technology -south -case -project -same -pages -version -section -own -found -sports -house -related -security -both -county -american -photo -game -members -power -while -care -network -down -computer -systems -three -total -place -end -following -download -him -without -per -access -think -north -resources -current -posts -big -media -law -control -water -history -pictures -size -art -personal -since -including -guide -shop -directory -board -location -change -white -text -small -rating -rate -government -children -during -usa -return -students -shopping -account -times -sites -level -digital -profile -previous -form -events -love -old -john -main -call -hours -image -department -title -description -non -insurance -another -why -shall -property -class -still -money -quality -every -listing -content -country -private -little -visit -save -tools -low -reply -customer -december -compare -movies -include -college -value -article -york -man -card -jobs -provide -food -source -author -different -press -learn -sale -around -print -course -job -canada -process -teen -room -stock -training -too -credit -point -join -science -men -categories -advanced -west -sales -look -english -left -team -estate -box -conditions -select -windows -photos -gay -thread -week -category -note -live -large -gallery -table -register -however -june -october -november -market -library -really -action -start -series -model -features -air -industry -plan -human -provided -yes -required -second -hot -accessories -cost -movie -forums -march -september -better -say -questions -july -yahoo -going -medical -test -friend -come -dec -server -study -application -cart -staff -articles -san -feedback -again -play -looking -issues -april -never -users -complete -street -topic -comment -financial -things -working -against -standard -tax -person -below -mobile -less -got -blog -party -payment -equipment -login -student -let -programs -offers -legal -above -recent -park -stores -side -act -problem -red -give -memory -performance -social -august -quote -language -story -sell -options -experience -rates -create -key -body -young -america -important -field -few -east -paper -single -age -activities -club -example -girls -additional -password -latest -something -road -gift -question -changes -night -hard -texas -oct -pay -four -poker -status -browse -issue -range -building -seller -court -february -always -result -audio -light -write -war -nov -offer -blue -groups -easy -given -files -event -release -analysis -request -fax -china -making -picture -needs -possible -might -professional -yet -month -major -star -areas -future -space -committee -hand -sun -cards -problems -london -washington -meeting -rss -become -interest -child -keep -enter -california -porn -share -similar -garden -schools -million -added -reference -companies -listed -baby -learning -energy -run -delivery -net -popular -term -film -stories -put -computers -journal -reports -try -welcome -central -images -president -notice -god -original -head -radio -until -cell -color -self -council -away -includes -track -australia -discussion -archive -once -others -entertainment -agreement -format -least -society -months -log -safety -friends -sure -faq -trade -edition -cars -messages -marketing -tell -further -updated -association -able -having -provides -david -fun -already -green -studies -close -common -drive -specific -several -gold -feb -living -sep -collection -called -short -arts -lot -ask -display -limited -powered -solutions -means -director -daily -beach -past -natural -whether -due -electronics -five -upon -period -planning -database -says -official -weather -mar -land -average -done -technical -window -france -pro -region -island -record -direct -microsoft -conference -environment -records -district -calendar -costs -style -url -front -statement -update -parts -aug -ever -downloads -early -miles -sound -resource -present -applications -either -ago -document -word -works -material -bill -apr -written -talk -federal -hosting -rules -final -adult -tickets -thing -centre -requirements -via -cheap -nude -kids -finance -true -minutes -else -mark -third -rock -gifts -europe -reading -topics -bad -individual -tips -plus -auto -cover -usually -edit -together -videos -percent -fast -function -fact -unit -getting -global -tech -meet -far -economic -player -projects -lyrics -often -subscribe -submit -germany -amount -watch -included -feel -though -bank -risk -thanks -everything -deals -various -words -linux -jul -production -commercial -james -weight -town -heart -advertising -received -choose -treatment -newsletter -archives -points -knowledge -magazine -error -camera -jun -girl -currently -construction -toys -registered -clear -golf -receive -domain -methods -chapter -makes -protection -policies -loan -wide -beauty -manager -india -position -taken -sort -listings -models -michael -known -half -cases -step -engineering -florida -simple -quick -none -wireless -license -paul -friday -lake -whole -annual -published -later -basic -sony -shows -corporate -google -church -method -purchase -customers -active -response -practice -hardware -figure -materials -fire -holiday -chat -enough -designed -along -among -death -writing -speed -html -countries -loss -face -brand -discount -higher -effects -created -remember -standards -oil -bit -yellow -political -increase -advertise -kingdom -base -near -environmental -thought -stuff -french -storage -japan -doing -loans -shoes -entry -stay -nature -orders -availability -africa -summary -turn -mean -growth -notes -agency -king -monday -european -activity -copy -although -drug -pics -western -income -force -cash -employment -overall -bay -river -commission -package -contents -seen -players -engine -port -album -regional -stop -supplies -started -administration -bar -institute -views -plans -double -dog -build -screen -exchange -types -soon -sponsored -lines -electronic -continue -across -benefits -needed -season -apply -someone -held -anything -printer -condition -effective -believe diff --git a/dict/topwords.py b/dict/topwords.py index f9761cd..41ff00d 100755 --- a/dict/topwords.py +++ b/dict/topwords.py @@ -1,48 +1,46 @@ #!/usr/bin/env python3 import sys -from pprint import pprint -wordhit = {} +patternhit = {} wordreplace=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")", - "=", "+", "/", ",", "<", ">", "~"] + "=", "+", "/", ",", "<", ">", "~", "[", "]", "{", "}"] -with open(sys.argv[1], "r") as wfile: - with open(sys.argv[2], "r") as dfile: - domains = dfile.read().split("\n") - words = wfile.read().split("\n") - - new_domains = [] - for domain in domains: - new_domains.append('.'.join(domain.split(".")[:-1])) - domains = new_domains - for word in words: - wordhit[word] = 0 +with open(sys.argv[1], "r") as dfile: + domains = dfile.read().split("\n") - domain_len = len(domains) - for i, domain in enumerate(domains): - if (i % 1000) == 0: - print(i, "/", domain_len, end="\r", file=sys.stderr) - for word in words: - if word in domain: - wordhit[word] += 1 + new_domains = [] + for domain in domains: + new_domains.append('.'.join(domain.split(".")[:-1])) + domains = ''.join(new_domains) -wordhit_c = {} -for word in wordhit: - value = wordhit[word] - if value != 0 and word != '': - wordhit_c[word] = value + domain_len = len(domains) + position = 0 -wordhit_c = dict(sorted(wordhit_c.items(), key=lambda x: x[1])) + while position <= domain_len: + cut = domains[position:position+2] + if not patternhit.get(cut): + patternhit[cut] = 0 + patternhit[cut] += 1 + position += 2 -#print(wordhit_c) -finallist = list(wordhit_c)[-43:] -finallist = sorted(finallist, key=lambda x: 1000 - len(x)) +patternhit = dict(sorted(patternhit.items(), key=lambda x: x[1])) + +#print(patternhit, file=sys.stderr) +finallist = list(patternhit)[-1 * len(wordreplace):] print(finallist, file=sys.stderr) -print("{") -for i, w in enumerate(finallist): - print('gsub(/{}/, "{}", domainname)'.format(w, wordreplace[i])) -print("}") + +with open(sys.argv[2], "w") as awkfile: + print("{", file=awkfile) + for i, w in enumerate(finallist): + print('gsub(/{}/, "{}", domainname)'.format(w, wordreplace[i]), file=awkfile) + print("}", file=awkfile) + +with open(sys.argv[3], "w") as pacfile: + pacdict = {} + for i, w in enumerate(finallist): + pacdict[wordreplace[i].strip('\\')] = w + print(pacdict, file=pacfile) diff --git a/generate-pac.sh b/generate-pac.sh index 51c9154..4d7a24b 100755 --- a/generate-pac.sh +++ b/generate-pac.sh @@ -4,6 +4,8 @@ set -e source config/config.sh echo -n > "$PACFILE" +python3 dict/topwords.py result/hostlist_zones.txt temp/replace-common-words.awk temp/pacpatterns.js + # .pac header echo "// ProstoVPN.AntiZapret PAC-host File // Generated on $(date), by https://bitbucket.org/anticensority/antizapret-pac-generator-light/ @@ -27,12 +29,22 @@ sort -Vu temp/include-ips.txt result/iplist_blockedbyip_noid2971_collapsed.txt | SPECIAL="$(cat result/iplist_special_range.txt | xargs -n1 sipcalc | \ awk 'BEGIN {notfirst=0} /Network address/ {n=$4} /Network mask \(bits\)/ {if (notfirst) {printf ","} printf "[\"%s\", %s]", n, $5; notfirst=1;}')" +PATTERNS=$(cat temp/pacpatterns.js) + echo "var special = [ $SPECIAL ]; var az_initialized = 0; // CIDR to netmask, for special function nmfc(b) {var m=[];for(var i=0;i<4;i++) {var n=Math.min(b,8); m.push(256-Math.pow(2, 8-n)); b-=n;} return m.join('.');} +// replace repeating sequences in domain +function patternreplace(s) { + var patterns = $PATTERNS; + for (pattern in patterns) { + s = s.split(patterns[pattern]).join(pattern); + } + return s; +} function FindProxyForURL(url, host) {" >> "$PACFILE" @@ -93,6 +105,7 @@ echo " if (!curdomain || !curdomain[2]) {return \"DIRECT\";} var curhost = curdomain[1]; var curzone = curdomain[2]; + curhost = patternreplace(curhost); var curarr = []; // dummy empty array if (domains.hasOwnProperty(curzone) && domains[curzone].hasOwnProperty(curhost.length)) { if (typeof domains[curzone][curhost.length] === 'string') { diff --git a/parse.sh b/parse.sh index c83aa5f..a241fca 100755 --- a/parse.sh +++ b/parse.sh @@ -30,8 +30,6 @@ then awk -f scripts/getzones.awk temp/hostlist_original_with_include.txt | grep -v -F -x -f temp/exclude-hosts.txt | sort -u > result/hostlist_zones.txt fi -python dict/topwords.py dict/google-1000.txt result/hostlist_zones.txt > temp/replace-common-words.awk - # Generate a list of IP addresses awk -F';' '$1 ~ /\// {print $1}' temp/list.csv | grep -P '([0-9]{1,3}\.){3}[0-9]{1,3}\/[0-9]{1,2}' -o | sort -Vu > result/iplist_special_range.txt