Use xxHash to hash robots.txt cache file names

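xxHash (xxh128) is a non-cryptographic hash, so it should be faster and more efficient than the SHA-256 digest used previously. The digest is only used to derive a cache file name from the robots.txt URL, so cryptographic strength is not needed. Note that cache file names change as a result, so files cached under the old SHA-256 names will no longer be found.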
2026-05-30 23:41:26 +00:00 · 2024-08-15 08:00:00 +01:00 · 2024-08-15 08:00:00 +01:00 · 939e775c27
commit 939e775c27
parent df22348eb1
2 changed files with 3 additions and 2 deletions
--- a/find_posts.py
+++ b/find_posts.py
@@ -16,7 +16,7 @@ import uuid
 import defusedxml.ElementTree as ET
 import urllib.robotparser
 from urllib.parse import urlparse
-import hashlib
+import xxhash

 logger = logging.getLogger("FediFetcher")
 robotParser = urllib.robotparser.RobotFileParser()
@@ -1076,7 +1076,7 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
    return result

 def get_robots_txt_cache_path(robots_url):
-    hash = hashlib.sha256(robots_url.encode('utf-8'))
+    hash = xxhash.xxh128(robots_url.encode('utf-8'))
    return os.path.join(arguments.state_dir, f'robots-{hash.hexdigest()}.txt')

 def get_cached_robots(robots_url):
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,4 @@ requests==2.32.0
 six==1.16.0
 smmap==5.0.0
 urllib3==1.26.19
+xxhash==3.4.1