Merge pull request #138 from nanos/cache-file-names

use sha hashes to cache file names
This commit is contained in:
Michael 2024-07-02 06:52:51 +01:00 committed by GitHub
commit 5f92da7178
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -16,6 +16,7 @@ import uuid
import defusedxml.ElementTree as ET import defusedxml.ElementTree as ET
import urllib.robotparser import urllib.robotparser
from urllib.parse import urlparse from urllib.parse import urlparse
import hashlib
logger = logging.getLogger("FediFetcher") logger = logging.getLogger("FediFetcher")
robotParser = urllib.robotparser.RobotFileParser() robotParser = urllib.robotparser.RobotFileParser()
@ -1011,41 +1012,54 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
break break
return result return result
def can_fetch(user_agent, url): def get_robots_txt_cache_path(robots_url):
parsed_uri = urlparse(url) hash = hashlib.sha256(robots_url.encode('utf-8'))
robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri) return os.path.join(arguments.state_dir, f'robots-{hash.hexdigest()}.txt')
if robots in ROBOTS_TXT: def get_cached_robots(robots_url):
if isinstance(ROBOTS_TXT[robots], bool): ## firstly: check the in-memory cache
return ROBOTS_TXT[robots] if robots_url in ROBOTS_TXT:
else: return ROBOTS_TXT[robots_url]
robotsTxt = ROBOTS_TXT[robots]
else: robotsCachePath = get_robots_txt_cache_path(robots_url)
robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}')
if os.path.exists(robotsCachePath): if os.path.exists(robotsCachePath):
with open(robotsCachePath, "r", encoding="utf-8") as f: with open(robotsCachePath, "r", encoding="utf-8") as f:
logger.debug(f"Getting robots.txt file from cache {parsed_uri.netloc}") logger.debug(f"Getting robots.txt file from cache for {robots_url}.")
robotsTxt = f.read() robotsTxt = f.read()
ROBOTS_TXT[robots] = robotsTxt ROBOTS_TXT[robots_url] = robotsTxt
return robotsTxt
return None
def get_robots_from_url(robots_url):
robotsTxt = get_cached_robots(robots_url)
if robotsTxt != None:
return robotsTxt
else:
try: try:
# We are getting the robots.txt manually from here, because otherwise we can't change the User Agent # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True) robotsTxt = get(robots_url, timeout = 2, ignore_robots_txt=True)
if robotsTxt.status_code in (401, 403): if robotsTxt.status_code in (401, 403):
ROBOTS_TXT[robots] = False robotsTxt = False
return False else:
elif robotsTxt.status_code != 200:
ROBOTS_TXT[robots] = True
return True
robotsTxt = robotsTxt.text robotsTxt = robotsTxt.text
ROBOTS_TXT[robots] = robotsTxt with open(get_robots_txt_cache_path(robots_url), "w", encoding="utf-8") as f:
with open(robotsCachePath, "w", encoding="utf-8") as f:
f.write(robotsTxt) f.write(robotsTxt)
except Exception as ex: except Exception as ex:
return True robotsTxt = True
ROBOTS_TXT[robots_url] = robotsTxt
return robotsTxt
def can_fetch(user_agent, url):
parsed_uri = urlparse(url)
robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
robotsTxt = get_robots_from_url(robots_url)
if isinstance(robotsTxt, bool):
return robotsTxt
robotParser = urllib.robotparser.RobotFileParser() robotParser = urllib.robotparser.RobotFileParser()
robotParser.parse(robotsTxt.splitlines()) robotParser.parse(robotsTxt.splitlines())