Cache robots.txt for 24 hours on disk to reduce load on servers
This commit is contained in:
parent
3651d028a6
commit
7e8ca17640
1 changed file with 33 additions and 12 deletions
|
|
@@ -1014,6 +1014,14 @@ def can_fetch(user_agent, url):
|
|||
return ROBOTS_TXT[robots]
|
||||
else:
|
||||
robotsTxt = ROBOTS_TXT[robots]
|
||||
else:
|
||||
robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}')
|
||||
if os.path.exists(robotsCachePath):
|
||||
with open(robotsCachePath, "r", encoding="utf-8") as f:
|
||||
logger.debug(f"Getting robots.text file from cache {file_name}")
|
||||
robotsTxt = f.read()
|
||||
ROBOTS_TXT[robots] = robotsTxt
|
||||
|
||||
else:
|
||||
try:
|
||||
# We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
|
||||
|
|
@@ -1026,6 +1034,10 @@ def can_fetch(user_agent, url):
|
|||
return True
|
||||
robotsTxt = robotsTxt.text
|
||||
ROBOTS_TXT[robots] = robotsTxt
|
||||
|
||||
with open(robotsCachePath, "w", encoding="utf-8") as f:
|
||||
f.write(robotsTxt)
|
||||
|
||||
except Exception as ex:
|
||||
return True
|
||||
|
||||
|
|
@@ -1467,6 +1479,15 @@ if __name__ == "__main__":
|
|||
else:
|
||||
seen_hosts = ServerList({})
|
||||
|
||||
# Delete any old robots.txt files so we can re-download them
|
||||
for file_name in os.listdir(arguments.state_dir):
|
||||
file_path = os.path.join(arguments.state_dir,file_name)
|
||||
if file_name.startswith('robots-') and os.path.isfile(file_path):
|
||||
if os.path.getmtime(file_path) < time.time() - 60 * 60 * 24:
|
||||
logger.debug(f"Removing cached robots.text file {file_name}")
|
||||
os.remove(file_path)
|
||||
|
||||
|
||||
if(isinstance(arguments.access_token, str)):
|
||||
setattr(arguments, 'access_token', [arguments.access_token])
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue