Merge pull request #130 from nanos/cache-robots-on-disk

Cache robots.txt for 24 hours on disk to reduce load on servers
This commit is contained in:
Michael 2024-06-27 16:46:01 +01:00 committed by GitHub
commit e0faafb37a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -20,7 +20,7 @@ from urllib.parse import urlparse
logger = logging.getLogger("FediFetcher") logger = logging.getLogger("FediFetcher")
robotParser = urllib.robotparser.RobotFileParser() robotParser = urllib.robotparser.RobotFileParser()
VERSION = "7.1.1" VERSION = "7.1.2"
argparser=argparse.ArgumentParser() argparser=argparse.ArgumentParser()
@ -1027,6 +1027,14 @@ def can_fetch(user_agent, url):
return ROBOTS_TXT[robots] return ROBOTS_TXT[robots]
else: else:
robotsTxt = ROBOTS_TXT[robots] robotsTxt = ROBOTS_TXT[robots]
else:
robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}')
if os.path.exists(robotsCachePath):
with open(robotsCachePath, "r", encoding="utf-8") as f:
logger.debug(f"Getting robots.txt file from cache {parsed_uri.netloc}")
robotsTxt = f.read()
ROBOTS_TXT[robots] = robotsTxt
else: else:
try: try:
# We are getting the robots.txt manually from here, because otherwise we can't change the User Agent # We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
@ -1039,6 +1047,10 @@ def can_fetch(user_agent, url):
return True return True
robotsTxt = robotsTxt.text robotsTxt = robotsTxt.text
ROBOTS_TXT[robots] = robotsTxt ROBOTS_TXT[robots] = robotsTxt
with open(robotsCachePath, "w", encoding="utf-8") as f:
f.write(robotsTxt)
except Exception as ex: except Exception as ex:
return True return True
@ -1480,6 +1492,15 @@ if __name__ == "__main__":
else: else:
seen_hosts = ServerList({}) seen_hosts = ServerList({})
# Delete any old robots.txt files so we can re-download them.
# Cached copies are stored in the state directory as 'robots-<netloc>' files;
# any of them last modified more than 24 hours ago is treated as stale.
robots_cache_max_age = 60 * 60 * 24  # seconds
# Compute the cutoff once so every file is judged against the same instant.
robots_cache_cutoff = time.time() - robots_cache_max_age
for file_name in os.listdir(arguments.state_dir):
    if not file_name.startswith('robots-'):
        continue
    file_path = os.path.join(arguments.state_dir, file_name)
    try:
        if os.path.isfile(file_path) and os.path.getmtime(file_path) < robots_cache_cutoff:
            logger.debug(f"Removing cached robots.txt file {file_name}")
            os.remove(file_path)
    except OSError as ex:
        # Cache expiry is best-effort housekeeping: a file that disappeared or
        # cannot be removed should not abort the whole run.
        logger.warning(f"Could not remove cached robots.txt file {file_name}: {ex}")
if(isinstance(arguments.access_token, str)): if(isinstance(arguments.access_token, str)):
setattr(arguments, 'access_token', [arguments.access_token]) setattr(arguments, 'access_token', [arguments.access_token])