antizapret-pac-generator-light/dict/topwords.py

#!/usr/bin/env python3

import sys
from pprint import pprint

# Hit counters keyed by dictionary word; starts empty and is filled in once
# the word and domain lists have been read.
wordhit = dict()

# Replacement tokens handed out, in order, to the selected words: the 26
# uppercase ASCII letters followed by 17 punctuation characters (43 total).
# NOTE(review): the "\\\\&" entry is the three characters \\& at runtime;
# presumably the backslashes are doubled so that the consumer of the generated
# gsub() replacement string ends up with a literal "&" -- confirm against the
# awk script that embeds this output.
wordreplace = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ") + [
    "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")",
    "=", "+", "/", ",", "<", ">", "~",
]

def strip_tld(domain):
    """Return *domain* without its last dot-separated label.

    A name with no dot (or an empty string) yields an empty string.
    """
    return domain.rpartition(".")[0]


def count_word_hits(words, domains, progress=None):
    """Count how many of *domains* contain each distinct word as a substring.

    Args:
        words: iterable of candidate words; empty strings are ignored and
            duplicates are collapsed so a repeated dictionary entry cannot
            inflate its own count.
        domains: sequence of domain strings to search in.
        progress: optional writable text stream; when given, a
            ``"<index> / <total>"`` line ending in ``\\r`` is written every
            1000 domains.

    Returns:
        dict mapping each distinct non-empty word to its hit count, in
        first-occurrence order of *words* (words with zero hits included).
    """
    # dict.fromkeys de-duplicates while preserving first-occurrence order,
    # which is what later breaks ties between equally frequent words.
    unique_words = [word for word in dict.fromkeys(words) if word]
    hits = dict.fromkeys(unique_words, 0)

    total = len(domains)
    for index, domain in enumerate(domains):
        if progress is not None and index % 1000 == 0:
            print(index, "/", total, end="\r", file=progress)
        for word in unique_words:
            if word in domain:
                hits[word] += 1
    return hits


def select_top_words(hits, limit):
    """Return up to *limit* of the most frequently hit words, longest first.

    Words with a zero count and the empty word are dropped. Both sorts are
    stable, so ties keep the iteration order of *hits*.
    """
    if limit <= 0:
        # Guard: a slice of [-0:] would return the whole list, not nothing.
        return []
    ranked = sorted(
        (word for word, count in hits.items() if count != 0 and word != ''),
        key=hits.__getitem__,
    )
    # NOTE(review): longest-first presumably ensures a long word is replaced
    # before any shorter word contained in it -- confirm with the consumer.
    return sorted(ranked[-limit:], key=len, reverse=True)


def format_gsub_rules(words, replacements):
    """Build one ``gsub(/word/, "token", domainname)`` line per word.

    Words are paired positionally with *replacements*; surplus entries on
    either side are ignored.
    """
    return [
        'gsub(/{}/, "{}", domainname)'.format(word, token)
        for word, token in zip(words, replacements)
    ]


def main(argv):
    """Run the script: ``argv[1]`` is the word list, ``argv[2]`` the domain list.

    Writes progress and the chosen word list to stderr and the brace-wrapped
    gsub rules to stdout. Returns the process exit status.
    """
    if len(argv) < 3:
        print("usage: topwords.py WORDS_FILE DOMAINS_FILE", file=sys.stderr)
        return 2

    # Read each file fully and close it before the (slow) counting starts.
    with open(argv[1], "r") as wfile:
        words = wfile.read().split("\n")
    with open(argv[2], "r") as dfile:
        domains = [strip_tld(line) for line in dfile.read().split("\n")]

    # Keep the module-level counter table populated with the results.
    wordhit.update(count_word_hits(words, domains, progress=sys.stderr))

    # One word per available replacement token, so the two lists can never
    # get out of step if the token table is edited.
    finallist = select_top_words(wordhit, len(wordreplace))
    print(finallist, file=sys.stderr)

    print("{")
    for rule in format_gsub_rules(finallist, wordreplace):
        print(rule)
    print("}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))