feat: enforce blocklist (and remove testing statements)

This commit is contained in:
B Tasker 2024-08-18 16:58:11 +01:00
parent 6e1830f87c
commit 3eb2589a8b
No known key found for this signature in database
GPG key ID: 8DC652174C1EBA9B

View file

@@ -1121,6 +1121,10 @@ def can_fetch(user_agent, url):
parsed_uri = urlparse(url)
robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
if parsed_uri.netloc in INSTANCE_BLOCKLIST:
# Never connect to these locations
raise Exception(f"Connecting to {parsed_uri.netloc} is prohibited by the configured blocklist")
robotsTxt = get_robots_from_url(robots_url)
if isinstance(robotsTxt, bool):
return robotsTxt
@@ -1577,9 +1581,6 @@ if __name__ == "__main__":
INSTANCE_BLOCKLIST = arguments.instance_blocklist.split(",")
ROBOTS_TXT = {}
print(INSTANCE_BLOCKLIST)
sys.exit()
seen_urls = OrderedSet([])
if os.path.exists(SEEN_URLS_FILE):
with open(SEEN_URLS_FILE, "r", encoding="utf-8") as f: