From 3eb2589a8be9307c6bca4924a27bee53764f540e Mon Sep 17 00:00:00 2001
From: B Tasker
Date: Sun, 18 Aug 2024 16:58:11 +0100
Subject: [PATCH] feat: enforce blocklist (and remove testing statements)

---
 find_posts.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/find_posts.py b/find_posts.py
index 9e5da6c..a303903 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -1121,6 +1121,10 @@ def can_fetch(user_agent, url):
     parsed_uri = urlparse(url)
     robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
 
+    if parsed_uri.netloc in INSTANCE_BLOCKLIST:
+        # Never connect to these locations
+        raise Exception(f"Connecting to {parsed_uri.netloc} is prohibited by the configured blocklist")
+
     robotsTxt = get_robots_from_url(robots_url)
     if isinstance(robotsTxt, bool):
         return robotsTxt
@@ -1577,9 +1581,6 @@ if __name__ == "__main__":
     INSTANCE_BLOCKLIST = arguments.instance_blocklist.split(",")
     ROBOTS_TXT = {}
 
-    print(INSTANCE_BLOCKLIST)
-    sys.exit()
-
     seen_urls = OrderedSet([])
     if os.path.exists(SEEN_URLS_FILE):
         with open(SEEN_URLS_FILE, "r", encoding="utf-8") as f: