Merge pull request #130 from nanos/cache-robots-on-disk

Cache robots.txt for 24 hours on disk to reduce load on servers
2024-06-27 16:46:01 +01:00 · 2024-06-27 16:46:01 +01:00 · e0faafb37a
commit e0faafb37a
parent 009fbe54b4 40b624aaff
1 changed file with 34 additions and 13 deletions
--- a/find_posts.py
+++ b/find_posts.py
@@ -20,7 +20,7 @@ from urllib.parse import urlparse
 logger = logging.getLogger("FediFetcher")
 robotParser = urllib.robotparser.RobotFileParser()
-VERSION = "7.1.1"
+VERSION = "7.1.2"
 argparser=argparse.ArgumentParser()
@@ -1028,19 +1028,31 @@ def can_fetch(user_agent, url):
        else:
            robotsTxt = ROBOTS_TXT[robots]
    else:
-        try:
+        robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}')
-            # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
+        if os.path.exists(robotsCachePath):
-            robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True)
+            with open(robotsCachePath, "r", encoding="utf-8") as f:
-            if robotsTxt.status_code in (401, 403):
+                logger.debug(f"Getting robots.txt file from cache {parsed_uri.netloc}")
-                ROBOTS_TXT[robots] = False
+                robotsTxt = f.read()
                return False
            elif robotsTxt.status_code != 200:
                ROBOTS_TXT[robots] = True
                return True
            robotsTxt = robotsTxt.text
            ROBOTS_TXT[robots] = robotsTxt
-        except Exception as ex:
+
-            return True
+        else:
            try:
                # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
                robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True)
                if robotsTxt.status_code in (401, 403):
                    ROBOTS_TXT[robots] = False
                    return False
                elif robotsTxt.status_code != 200:
                    ROBOTS_TXT[robots] = True
                    return True
                robotsTxt = robotsTxt.text
                ROBOTS_TXT[robots] = robotsTxt
                with open(robotsCachePath, "w", encoding="utf-8") as f:
                    f.write(robotsTxt)
            except Exception as ex:
                return True
    robotParser = urllib.robotparser.RobotFileParser()
    robotParser.parse(robotsTxt.splitlines())
@@ -1480,6 +1492,15 @@ if __name__ == "__main__":
        else:
            seen_hosts = ServerList({})
        # Delete any old robots.txt files so we can re-download them
        for file_name in os.listdir(arguments.state_dir):
            file_path = os.path.join(arguments.state_dir,file_name)
            if file_name.startswith('robots-') and os.path.isfile(file_path):
                if os.path.getmtime(file_path) < time.time() - 60 * 60 * 24:
                    logger.debug(f"Removing cached robots.txt file {file_name}")
                    os.remove(file_path)
        if(isinstance(arguments.access_token, str)):
            setattr(arguments, 'access_token', [arguments.access_token])