From dd468d59561ce132ac5b7e34cf8da792453dcf58 Mon Sep 17 00:00:00 2001 From: nanos Date: Tue, 25 Jun 2024 16:15:43 +0100 Subject: [PATCH] Use FediFetcher as User Agent to fetch the robots.txt --- find_posts.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/find_posts.py b/find_posts.py index 7bd73c6..ae6ed8f 100644 --- a/find_posts.py +++ b/find_posts.py @@ -1008,11 +1008,23 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5): def can_fetch(user_agent, url): parsed_uri = urlparse(url) robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri) + + try: + # We are getting the robots.txt manually from here, because otherwise we cannot control the User Agent used for the request + robotsTxt = get(robots, ignore_robots_txt=True) + if robotsTxt.status_code in (401, 403): + return False + elif robotsTxt.status_code != 200: + return True + robotsTxt = robotsTxt.text + except Exception as ex: + return True + robotParser = urllib.robotparser.RobotFileParser() - robotParser.set_url(robots) - robotParser.read() + robotParser.parse(robotsTxt.splitlines()) return robotParser.can_fetch(user_agent, url) + def user_agent(): return f"FediFetcher/{VERSION}; +{arguments.server} (https://go.thms.uk/ff)"