Merge pull request #138 from nanos/cache-file-names

Use SHA-256 hashes of the robots.txt URL as cache file names
2024-07-02 06:52:51 +01:00 · 2024-07-02 06:52:51 +01:00 · 5f92da7178
commit 5f92da7178
parent e85384a5a6 c58c5b5af0
1 changed file with 46 additions and 32 deletions
--- a/find_posts.py
+++ b/find_posts.py
@ -16,6 +16,7 @@ import uuid
 import defusedxml.ElementTree as ET
 import urllib.robotparser
 from urllib.parse import urlparse
 import hashlib
 logger = logging.getLogger("FediFetcher")
 robotParser = urllib.robotparser.RobotFileParser()
@ -1011,41 +1012,54 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
                break
    return result
-def can_fetch(user_agent, url):
+def get_robots_txt_cache_path(robots_url):
-    parsed_uri = urlparse(url)
+    hash = hashlib.sha256(robots_url.encode('utf-8'))
-    robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
+    return os.path.join(arguments.state_dir, f'robots-{hash.hexdigest()}.txt')
-    if robots in ROBOTS_TXT:
+def get_cached_robots(robots_url):
-        if isinstance(ROBOTS_TXT[robots], bool):
+    ## firstly: check the in-memory cache
-            return ROBOTS_TXT[robots]
+    if robots_url in ROBOTS_TXT:
-        else:
+        return ROBOTS_TXT[robots_url]
-            robotsTxt = ROBOTS_TXT[robots]
+        
-    else:
+    robotsCachePath = get_robots_txt_cache_path(robots_url)
        robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}')
    if os.path.exists(robotsCachePath):
        with open(robotsCachePath, "r", encoding="utf-8") as f:
-                logger.debug(f"Getting robots.txt file from cache {parsed_uri.netloc}")
+            logger.debug(f"Getting robots.txt file from cache for {robots_url}.")
            robotsTxt = f.read()
-            ROBOTS_TXT[robots] = robotsTxt
+            ROBOTS_TXT[robots_url] = robotsTxt
            return robotsTxt
    return None
 def get_robots_from_url(robots_url):
    robotsTxt = get_cached_robots(robots_url)
    if robotsTxt != None:
        return robotsTxt
        else:
    try:
        # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
-                robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True)
+        robotsTxt = get(robots_url, timeout = 2, ignore_robots_txt=True)
        if robotsTxt.status_code in (401, 403):
-                    ROBOTS_TXT[robots] = False
+            robotsTxt = False
-                    return False
+        else:
                elif robotsTxt.status_code != 200:
                    ROBOTS_TXT[robots] = True
                    return True
            robotsTxt = robotsTxt.text
-                ROBOTS_TXT[robots] = robotsTxt
+            with open(get_robots_txt_cache_path(robots_url), "w", encoding="utf-8") as f:
                with open(robotsCachePath, "w", encoding="utf-8") as f:
                f.write(robotsTxt)
    except Exception as ex:
-                return True
+        robotsTxt = True
    ROBOTS_TXT[robots_url] = robotsTxt
    return robotsTxt
 def can_fetch(user_agent, url):
    parsed_uri = urlparse(url)
    robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
    robotsTxt = get_robots_from_url(robots_url)
    if isinstance(robotsTxt, bool):
        return robotsTxt
    robotParser = urllib.robotparser.RobotFileParser()
    robotParser.parse(robotsTxt.splitlines())