Use FediFetcher as User Agent to fetch the robots.txt

This commit is contained in:
nanos 2024-06-25 16:15:43 +01:00
parent e40d61d291
commit dd468d5956

View file

@ -1008,11 +1008,23 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
def can_fetch(user_agent, url):
    """Report whether the host's robots.txt permits ``user_agent`` to fetch ``url``.

    The robots.txt location is derived from the scheme and network location
    of ``url``.

    Args:
        user_agent: User-Agent string matched against the robots.txt rules.
        url: Absolute URL that is about to be requested.

    Returns:
        ``False`` when robots.txt answers with HTTP 401 or 403, or when its
        rules disallow ``url`` for ``user_agent``. ``True`` when the rules
        allow the URL, when robots.txt answers with any other non-200
        status, or when fetching it raises an exception.
    """
    parsed_uri = urlparse(url)
    robots = f"{parsed_uri.scheme}://{parsed_uri.netloc}/robots.txt"

    try:
        # robots.txt is downloaded here through this file's own get() helper
        # rather than via RobotFileParser.read(), which performs its own
        # urllib request.
        # NOTE(review): get() presumably sends the FediFetcher User-Agent,
        # and ignore_robots_txt=True presumably stops get() from consulting
        # robots.txt for the robots.txt request itself -- confirm against get().
        response = get(robots, ignore_robots_txt=True)
        if response.status_code in (401, 403):
            # Access to robots.txt itself is denied: treat everything as disallowed.
            return False
        if response.status_code != 200:
            # No usable robots.txt: nothing restricts the fetch.
            return True
        robots_txt = response.text
    except Exception:
        # Deliberate best effort: a failure to retrieve robots.txt must not
        # block the actual request, so fail open.
        return True

    robot_parser = urllib.robotparser.RobotFileParser()
    robot_parser.parse(robots_txt.splitlines())
    return robot_parser.can_fetch(user_agent, url)
def user_agent():
    """Return the User-Agent string identifying this FediFetcher instance.

    The string combines the module-level ``VERSION`` with the server taken
    from ``arguments.server`` and a fixed project link.
    """
    agent = f"FediFetcher/{VERSION}; +{arguments.server} (https://go.thms.uk/ff)"
    return agent