From 7e8ca17640a3ec33e1c56ae3770753ec05964eb7 Mon Sep 17 00:00:00 2001 From: nanos Date: Wed, 26 Jun 2024 16:41:51 +0100 Subject: [PATCH 1/3] Cache robots.txt for 24 hours on disk to reduce load on servers --- find_posts.py | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/find_posts.py b/find_posts.py index 5e7d574..1d1334b 100644 --- a/find_posts.py +++ b/find_posts.py @@ -1015,19 +1015,31 @@ def can_fetch(user_agent, url): else: robotsTxt = ROBOTS_TXT[robots] else: - try: - # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent - robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True) - if robotsTxt.status_code in (401, 403): - ROBOTS_TXT[robots] = False - return False - elif robotsTxt.status_code != 200: - ROBOTS_TXT[robots] = True - return True - robotsTxt = robotsTxt.text + robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}') + if os.path.exists(robotsCachePath): + with open(robotsCachePath, "r", encoding="utf-8") as f: + logger.debug(f"Getting robots.text file from cache {file_name}") + robotsTxt = f.read() ROBOTS_TXT[robots] = robotsTxt - except Exception as ex: - return True + + else: + try: + # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent + robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True) + if robotsTxt.status_code in (401, 403): + ROBOTS_TXT[robots] = False + return False + elif robotsTxt.status_code != 200: + ROBOTS_TXT[robots] = True + return True + robotsTxt = robotsTxt.text + ROBOTS_TXT[robots] = robotsTxt + + with open(robotsCachePath, "w", encoding="utf-8") as f: + f.write(robotsTxt) + + except Exception as ex: + return True robotParser = urllib.robotparser.RobotFileParser() robotParser.parse(robotsTxt.splitlines()) @@ -1467,6 +1479,15 @@ if __name__ == "__main__": else: seen_hosts = ServerList({}) + # Delete any old robots.txt files so we 
can re-download them + for file_name in os.listdir(arguments.state_dir): + file_path = os.path.join(arguments.state_dir,file_name) + if file_name.startswith('robots-') and os.path.isfile(file_path): + if os.path.getmtime(file_path) < time.time() - 60 * 60 * 24: + logger.debug(f"Removing cached robots.text file {file_name}") + os.remove(file_path) + + if(isinstance(arguments.access_token, str)): setattr(arguments, 'access_token', [arguments.access_token]) From 90988872b7d31d9242407dfeb53239be8ceb5cb3 Mon Sep 17 00:00:00 2001 From: nanos Date: Wed, 26 Jun 2024 16:45:30 +0100 Subject: [PATCH 2/3] Fix robots.txt cache log messages --- find_posts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/find_posts.py b/find_posts.py index 1d1334b..18ac2c7 100644 --- a/find_posts.py +++ b/find_posts.py @@ -1018,7 +1018,7 @@ def can_fetch(user_agent, url): robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}') if os.path.exists(robotsCachePath): with open(robotsCachePath, "r", encoding="utf-8") as f: - logger.debug(f"Getting robots.text file from cache {file_name}") + logger.debug(f"Getting robots.txt file from cache {parsed_uri.netloc}") robotsTxt = f.read() ROBOTS_TXT[robots] = robotsTxt @@ -1484,7 +1484,7 @@ if __name__ == "__main__": file_path = os.path.join(arguments.state_dir,file_name) if file_name.startswith('robots-') and os.path.isfile(file_path): if os.path.getmtime(file_path) < time.time() - 60 * 60 * 24: - logger.debug(f"Removing cached robots.text file {file_name}") + logger.debug(f"Removing cached robots.txt file {file_name}") os.remove(file_path) From 40b624aaff9c7fdce67d4aea882683e7220e7871 Mon Sep 17 00:00:00 2001 From: nanos Date: Thu, 27 Jun 2024 07:56:50 +0100 Subject: [PATCH 3/3] update version --- find_posts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/find_posts.py b/find_posts.py index 18ac2c7..4c5e235 100644 --- a/find_posts.py +++ b/find_posts.py @@ -20,7 +20,7 @@ from urllib.parse import urlparse logger = 
logging.getLogger("FediFetcher") robotParser = urllib.robotparser.RobotFileParser() -VERSION = "7.1.1" +VERSION = "7.1.2" argparser=argparse.ArgumentParser()