Merge pull request #129 from nanos/cache-robots

Cache robots.txt for each run of the script, to reduce load on the server
2024-06-25 16:25:26 +01:00 · 2024-06-25 16:25:26 +01:00 · dec718db76
commit dec718db76
parent ac8044db83 7b9896b5c0
1 changed file with 19 additions and 9 deletions
--- a/find_posts.py
+++ b/find_posts.py
@@ -1009,14 +1009,23 @@ def can_fetch(user_agent, url):
    parsed_uri = urlparse(url)
    robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)

+    if robots in ROBOTS_TXT:
+        if isinstance(ROBOTS_TXT[robots], bool):
+            return ROBOTS_TXT[robots]
+        else:
+            robotsTxt = ROBOTS_TXT[robots]
+    else:
        try:
-        # We are getting the robots.txt manually from here, because otherwise 
+            # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
            robotsTxt = get(robots, ignore_robots_txt=True)
            if robotsTxt.status_code in (401, 403):
+                ROBOTS_TXT[robots] = False
                return False
            elif robotsTxt.status_code != 200:
+                ROBOTS_TXT[robots] = True
                return True
            robotsTxt = robotsTxt.text
+            ROBOTS_TXT[robots] = robotsTxt
        except Exception as ex:
            return True
    
@@ -1394,6 +1403,7 @@ if __name__ == "__main__":
        SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
        RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')

+        ROBOTS_TXT = {}

        seen_urls = OrderedSet([])
        if os.path.exists(SEEN_URLS_FILE):