2019-12-01 04:58:25 +03:00
|
|
|
@include "config/exclude-regexp-dist.awk"
|
|
|
|
|
|
|
|
# Drop records whose first field is empty (or evaluates as false).
{ if (!$1) next }
|
|
|
|
|
|
|
|
# Drop records that mention duckdns or a *.r.cloudfront.net host.
/duckdns/ || /\.r\.cloudfront\.net/ { next }
|
|
|
|
|
|
|
|
# Drop records that consist solely of a dotted quad (four groups of 1-3 digits).
$0 ~ /^([0-9]{1,3}\.){3}[0-9]{1,3}$/ { next }
|
|
|
|
|
|
|
|
# Strip a leading "www." from the first field.
# substr(..., 5) keeps everything after the four-character prefix.
$1 ~ /^www\./ { $1 = substr($1, 5) }
|
|
|
|
|
|
|
|
# Strip a single trailing dot from the first field.
$1 ~ /\.$/ { $1 = substr($1, 1, length($1) - 1) }
|
|
|
|
|
|
|
|
# Shorten the record to its trailing labels and store the result in $1.
# If the second-to-last label is one of the listed names (ru, co, com, ...),
# the last three labels are kept; otherwise the last two are kept.
# gensub() returns the record unchanged when it has no extra leading labels.
{
    $1 = ($0 ~ /\.(ru|co|cu|com|info|net|org|gov|edu|int|mil|biz|pp|ne|msk|spb|nnov|od|in|ho|cc|dn|i|tut|v|dp|sl|ddns|dyndns|livejournal|herokuapp|azurewebsites|cloudfront|ucoz|3dn|nov|linode|amazonaws|sl-reverse|kiev|beget|kirov|akadns|scaleway|fastly|hldns|appspot|my1|hwcdn|deviantart|wixmp|netdna-ssl|brightcove|berlogovo|edgecastcdn|trafficmanager|pximg|github|hopto|u-stream|google)\.[^.]+$/) \
        ? gensub(/(.+)\.([^.]+\.[^.]+\.[^.]+$)/, "\\2", 1, $0) \
        : gensub(/(.+)\.([^.]+\.[^.]+$)/, "\\2", 1, $0)
}
|
|
|
|
|
|
|
|
# Collect each normalised name as an index of d_other, so duplicates collapse
# into a single entry (no sorting happens here).
# A record such as "www." or "." passes the earlier empty check but becomes
# an empty string after the prefix/suffix stripping; skip it so the output
# does not contain a blank line.
$1 != "" { d_other[$1] = $1 }
|
|
|
|
|
|
|
|
# Print every index of `arr`, one per line.
#   arrname - not used by the body; kept so existing two-argument calls
#             remain valid
#   arr     - array whose indices are printed
#   i       - local loop variable; awk declares locals as extra parameters,
#             so callers do not pass it. Without it the loop would write to
#             a global `i`.
# Indices are printed in whatever order `for (i in arr)` yields them.
function printarray(arrname, arr,    i) {
    for (i in arr) {
        print i
    }
}
|
|
|
|
|
|
|
|
# After the last record: print every collected name.
END {
    printarray("d_other", d_other)
}
|