Add an optional curl-based HTTP/HTTPS reachability filter.

  config/config.sh           new TEST_WEB_AVAILABILITY switch, default "no".
  parse.sh                   when the switch is "yes", run
                             scripts/test-with-curl.sh over
                             result/hostlist_zones.txt with GNU parallel,
                             append the hosts it prints to
                             temp/exclude-hosts.txt and regenerate
                             result/hostlist_zones.txt.
  scripts/test-with-curl.sh  probe one host; print it when both HTTP and
                             HTTPS fail.

Review note: the body of scripts/test-with-curl.sh was revised (HTTPS is
skipped once HTTP succeeded, no global result variables, empty argument is
rejected without output), so the blob id on its "index" line refers to the
pre-review content. Whitespace inside the hunks was reconstructed.

diff --git a/config/config.sh b/config/config.sh
index 780a55d..8004b77 100755
--- a/config/config.sh
+++ b/config/config.sh
@@ -14,3 +14,7 @@ PACFILE_NOSSL="result/proxy-host-nossl.pac"
 
 # Perform DNS resolving to detect and filter non-existent domains
 RESOLVE_NXDOMAIN="no"
+
+# Perform HTTP (port 80)/HTTPS (port 443) reachability test with curl,
+# to detect and filter broken websites
+TEST_WEB_AVAILABILITY="no"
diff --git a/parse.sh b/parse.sh
index 68b656e..3075c35 100755
--- a/parse.sh
+++ b/parse.sh
@@ -31,6 +31,13 @@ then
     awk -f scripts/getzones.awk temp/hostlist_original_with_include.txt | grep -v -F -x -f temp/exclude-hosts.txt | cat - temp/include-hosts.txt | sort -u > result/hostlist_zones.txt
 fi
 
+if [[ "$TEST_WEB_AVAILABILITY" == "yes" ]];
+then
+    timeout 2h parallel --bar -j96 -n1 -a result/hostlist_zones.txt scripts/test-with-curl.sh > temp/web-unreachable-exclude-hosts.txt || true
+    cat temp/web-unreachable-exclude-hosts.txt >> temp/exclude-hosts.txt
+    awk -f scripts/getzones.awk temp/hostlist_original_with_include.txt | grep -v -F -x -f temp/exclude-hosts.txt | cat - temp/include-hosts.txt | sort -u > result/hostlist_zones.txt
+fi
+
 # Generate a list of IP addresses
 awk -F';' '$1 ~ /\// {print $1}' temp/list.csv | grep -P '([0-9]{1,3}\.){3}[0-9]{1,3}\/[0-9]{1,2}' -o | sort -Vu > result/iplist_special_range.txt
 
diff --git a/scripts/test-with-curl.sh b/scripts/test-with-curl.sh
new file mode 100755
index 0000000..3f67265
--- /dev/null
+++ b/scripts/test-with-curl.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#
+# Probe one host over HTTP (port 80) and HTTPS (port 443) with curl.
+# Usage: test-with-curl.sh HOSTNAME
+# Prints HOSTNAME on stdout and exits 1 when neither scheme answers
+# successfully; prints nothing and exits 0 otherwise. Exits 2 without
+# output when HOSTNAME is empty.
+
+USER_AGENT='Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0 antizapret.prostovpn.org anti-censorship tester'
+
+# Fetch one URL quietly; curl's exit status is the result.
+# --fail turns HTTP responses >= 400 into a non-zero status.
+function curl_fetch() {
+    curl --max-time 10 --compressed --fail --fail-early \
+        -H "User-Agent: $USER_AGENT" \
+        "$@" &> /dev/null
+}
+
+# Return 0 if the website answers over HTTP or HTTPS, 1 if both fail.
+function curl_open() {
+    local WEBSITE="$1"
+
+    # HTTPS is only tried when plain HTTP failed: one success is enough,
+    # so the second request (up to 10 s) is skipped for working sites.
+    curl_fetch "http://$WEBSITE" && return 0
+    # -k: certificate problems do not count as "unreachable".
+    curl_fetch -k "https://$WEBSITE" && return 0
+
+    return 1
+}
+
+WEBSITE="$1"
+if [ -z "$WEBSITE" ]; then
+    # Never emit a blank line: stdout is appended to the exclude list.
+    exit 2
+fi
+
+curl_open "$WEBSITE"
+ERRCODE="$?"
+if [ "$ERRCODE" -ne 0 ]; then
+    # stdout carries only the failed hostname; parse.sh redirects it
+    # into temp/web-unreachable-exclude-hosts.txt.
+    echo "$WEBSITE"
+fi
+exit "$ERRCODE"