From 1b4c135f8fb3f20d84c69cdb7dde57201a037654 Mon Sep 17 00:00:00 2001 From: nanos Date: Tue, 25 Jun 2024 10:24:45 +0100 Subject: [PATCH 1/2] respect robots.txt --- find_posts.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/find_posts.py b/find_posts.py index 8f64c53..e5f7930 100644 --- a/find_posts.py +++ b/find_posts.py @@ -14,8 +14,13 @@ import time import argparse import uuid import defusedxml.ElementTree as ET +import urllib.robotparser +from urllib.parse import urlparse logger = logging.getLogger("FediFetcher") +robotParser = urllib.robotparser.RobotFileParser() + +VERSION = "7.0.8" argparser=argparse.ArgumentParser() @@ -1000,12 +1005,25 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5): break return result +def can_fetch(user_agent, url): + parsed_uri = urlparse(url) + robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri) + robotParser = urllib.robotparser.RobotFileParser() + robotParser.set_url(robots) + robotParser.read() + return robotParser.can_fetch(user_agent, url) + +def user_agent(): + return f"FediFetcher/{VERSION}; +{arguments.server} (https://go.thms.uk/ff)" def get(url, headers = {}, timeout = 0, max_tries = 5): """A simple wrapper to make a get request while providing our user agent, and respecting rate limits""" h = headers.copy() if 'User-Agent' not in h: - h['User-Agent'] = f"FediFetcher (+{arguments.server}; https://go.thms.uk/ff)" + h['User-Agent'] = user_agent() + + if not can_fetch(h['User-Agent'], url): + raise Exception(f"Querying {url} prohibited by robots.txt") if timeout == 0: timeout = arguments.http_timeout @@ -1027,8 +1045,11 @@ def post(url, json, headers = {}, timeout = 0, max_tries = 5): """A simple wrapper to make a post request while providing our user agent, and respecting rate limits""" h = headers.copy() if 'User-Agent' not in h: - h['User-Agent'] = 'FediFetcher (https://go.thms.uk/mgr)' + h['User-Agent'] = user_agent() + if 
not can_fetch(h['User-Agent'], url): + raise Exception(f"Querying {url} prohibited by robots.txt") + if timeout == 0: timeout = arguments.http_timeout From 885b84d5984859aea4406f173f4add2a4ce5ebde Mon Sep 17 00:00:00 2001 From: nanos Date: Tue, 25 Jun 2024 10:38:47 +0100 Subject: [PATCH 2/2] ensure callbacks aren't blocked by robots --- find_posts.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/find_posts.py b/find_posts.py index e5f7930..639970b 100644 --- a/find_posts.py +++ b/find_posts.py @@ -1016,13 +1016,13 @@ def can_fetch(user_agent, url): def user_agent(): return f"FediFetcher/{VERSION}; +{arguments.server} (https://go.thms.uk/ff)" -def get(url, headers = {}, timeout = 0, max_tries = 5): +def get(url, headers = {}, timeout = 0, max_tries = 5, ignore_robots_txt = False): """A simple wrapper to make a get request while providing our user agent, and respecting rate limits""" h = headers.copy() if 'User-Agent' not in h: h['User-Agent'] = user_agent() - if not can_fetch(h['User-Agent'], url): + if not ignore_robots_txt and not can_fetch(h['User-Agent'], url): raise Exception(f"Querying {url} prohibited by robots.txt") if timeout == 0: @@ -1334,7 +1334,7 @@ if __name__ == "__main__": if(arguments.on_start != None and arguments.on_start != ''): try: - get(f"{arguments.on_start}?rid={runId}") + get(f"{arguments.on_start}?rid={runId}", ignore_robots_txt = True) except Exception as ex: logger.error(f"Error getting callback url: {ex}") @@ -1356,7 +1356,7 @@ if __name__ == "__main__": logger.critical(f"Lock file age is {datetime.now() - lock_time} - below --lock-hours={arguments.lock_hours} provided.") if(arguments.on_fail != None and arguments.on_fail != ''): try: - get(f"{arguments.on_fail}?rid={runId}") + get(f"{arguments.on_fail}?rid={runId}", ignore_robots_txt = True) except Exception as ex: logger.error(f"Error getting callback url: {ex}") sys.exit(1) @@ -1365,7 +1365,7 @@ if __name__ == "__main__": logger.critical(f"Cannot 
read logfile age - aborting.") if(arguments.on_fail != None and arguments.on_fail != ''): try: - get(f"{arguments.on_fail}?rid={runId}") + get(f"{arguments.on_fail}?rid={runId}", ignore_robots_txt = True) except Exception as ex: logger.error(f"Error getting callback url: {ex}") sys.exit(1) @@ -1549,7 +1549,7 @@ if __name__ == "__main__": if(arguments.on_done != None and arguments.on_done != ''): try: - get(f"{arguments.on_done}?rid={runId}") + get(f"{arguments.on_done}?rid={runId}", ignore_robots_txt = True) except Exception as ex: logger.error(f"Error getting callback url: {ex}") @@ -1560,7 +1560,7 @@ if __name__ == "__main__": logger.error(f"Job failed after {datetime.now() - start}.") if(arguments.on_fail != None and arguments.on_fail != ''): try: - get(f"{arguments.on_fail}?rid={runId}") + get(f"{arguments.on_fail}?rid={runId}", ignore_robots_txt = True) except Exception as ex: logger.error(f"Error getting callback url: {ex}") raise