#!/usr/bin/env python3
"""Emit an awk action block that replaces frequent words with one-character codes.

Usage: <script> WORDFILE DOMAINFILE

WORDFILE holds one candidate word per line and DOMAINFILE one domain name per
line.  The final dot-separated label of every domain is dropped, then each
word is scored by the number of domains that contain it as a substring.  The
highest scoring words (at most one per entry of ``wordreplace``) are written
to stdout as ``gsub(/word/, "code", domainname)`` statements wrapped in
``{`` ... ``}``.  Progress and the selected word list go to stderr.
"""
import sys
from pprint import pprint

# Kept at module level for backward compatibility; main() fills it with the
# per-word hit counts of the most recent run.
wordhit = {}

# Replacement codes, assigned to the selected words in output order.
# "\\\\&" is printed as \\& so that awk's gsub() inserts a literal ampersand
# rather than the matched text.
wordreplace = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
               "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
               "!", "@", "#", "$", "%", "^", "\\\\&", "*", "(", ")", "=", "+",
               "/", ",", "<", ">", "~"]

# Characters that are special inside an awk /regex/ literal.
AWK_REGEX_SPECIALS = frozenset("\\/.[](){}*+?|^$")


def strip_tld(domain):
    """Return *domain* without its last dot-separated label.

    A name that contains no dot yields the empty string.
    """
    return domain.rpartition(".")[0]


def count_word_hits(words, domains, report_every=1000, progress_stream=None):
    """Count, for every distinct non-empty word, the domains containing it.

    Args:
        words: iterable of candidate words; duplicates and empty strings are
            ignored so that a word listed twice is not counted twice.
        domains: sequence of domain strings to search.
        report_every: emit a progress line each time this many domains have
            been processed.
        progress_stream: file object receiving ``i / total`` progress lines
            (terminated by a carriage return), or None to stay silent.

    Returns:
        dict mapping each word to its hit count, in first-seen word order.
    """
    hits = dict.fromkeys((word for word in words if word), 0)
    total = len(domains)
    for index, domain in enumerate(domains):
        if progress_stream is not None and index % report_every == 0:
            print(index, "/", total, end="\r", file=progress_stream)
        for word in hits:
            if word in domain:
                hits[word] += 1
    return hits


def select_words(hits, limit):
    """Pick the *limit* most frequent words, ordered longest first.

    Words with a zero count are never selected.  Both sorts are stable, so
    ties keep the order in which the words appear in *hits*.
    """
    if limit <= 0:
        return []
    used = [word for word, count in hits.items() if word and count]
    used.sort(key=hits.__getitem__)
    top = used[-limit:]
    # Longest first -- presumably so that a long word is substituted before
    # any shorter word contained in it; TODO confirm against the awk consumer.
    top.sort(key=lambda word: -len(word))
    return top


def awk_regex_escape(text):
    """Backslash-escape *text* so an awk /regex/ literal matches it literally.

    Words are counted as plain substrings, so the generated pattern must not
    treat characters such as ``.`` or ``/`` as regex syntax.
    """
    return "".join("\\" + ch if ch in AWK_REGEX_SPECIALS else ch for ch in text)


def build_awk_rules(words, replacements):
    """Return the output lines: ``{``, one gsub statement per word, ``}``.

    Words are paired with *replacements* positionally; surplus entries on
    either side are ignored.
    """
    lines = ["{"]
    lines.extend(
        'gsub(/{}/, "{}", domainname)'.format(awk_regex_escape(word), code)
        for word, code in zip(words, replacements)
    )
    lines.append("}")
    return lines


def main(argv=None):
    """Run the generator; *argv* defaults to ``sys.argv[1:]``.

    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        print("usage: {} WORDFILE DOMAINFILE".format(sys.argv[0]),
              file=sys.stderr)
        return 2

    with open(args[0], "r") as wfile, open(args[1], "r") as dfile:
        words = wfile.read().split("\n")
        domains = [strip_tld(domain) for domain in dfile.read().split("\n")]

    hits = count_word_hits(words, domains, progress_stream=sys.stderr)
    wordhit.clear()
    wordhit.update(hits)

    finallist = select_words(hits, len(wordreplace))
    print(finallist, file=sys.stderr)
    print("\n".join(build_awk_rules(finallist, wordreplace)))
    return 0


if __name__ == "__main__":
    sys.exit(main())