Try to use xxHash to hash robots cache file names

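This should be faster and more efficient than SHA-256: the hash is only used to derive a cache file name, so a fast non-cryptographic hash is sufficient.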
This commit is contained in:
nanos 2024-08-15 08:00:00 +01:00
parent df22348eb1
commit 939e775c27
2 changed files with 3 additions and 2 deletions

View file

@@ -16,7 +16,7 @@ import uuid
 import defusedxml.ElementTree as ET
 import urllib.robotparser
 from urllib.parse import urlparse
-import hashlib
+import xxhash
 logger = logging.getLogger("FediFetcher")
 robotParser = urllib.robotparser.RobotFileParser()
@@ -1076,7 +1076,7 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
     return result
 def get_robots_txt_cache_path(robots_url):
-    hash = hashlib.sha256(robots_url.encode('utf-8'))
+    hash = xxhash.xxh128(robots_url.encode('utf-8'))
     return os.path.join(arguments.state_dir, f'robots-{hash.hexdigest()}.txt')
 def get_cached_robots(robots_url):

View file

@@ -12,3 +12,4 @@ requests==2.32.0
 six==1.16.0
 smmap==5.0.0
 urllib3==1.26.19
+xxhash==3.4.1