Merge pull request #130 from nanos/cache-robots-on-disk
Cache robots.txt for 24 hours on disk to reduce load on servers
This commit is contained in:
commit
e0faafb37a
1 changed file with 34 additions and 13 deletions
|
|
@ -20,7 +20,7 @@ from urllib.parse import urlparse
|
||||||
logger = logging.getLogger("FediFetcher")
|
logger = logging.getLogger("FediFetcher")
|
||||||
robotParser = urllib.robotparser.RobotFileParser()
|
robotParser = urllib.robotparser.RobotFileParser()
|
||||||
|
|
||||||
VERSION = "7.1.1"
|
VERSION = "7.1.2"
|
||||||
|
|
||||||
argparser=argparse.ArgumentParser()
|
argparser=argparse.ArgumentParser()
|
||||||
|
|
||||||
|
|
@ -1028,19 +1028,31 @@ def can_fetch(user_agent, url):
|
||||||
else:
|
else:
|
||||||
robotsTxt = ROBOTS_TXT[robots]
|
robotsTxt = ROBOTS_TXT[robots]
|
||||||
else:
|
else:
|
||||||
try:
|
robotsCachePath = os.path.join(arguments.state_dir, f'robots-{parsed_uri.netloc}')
|
||||||
# We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
|
if os.path.exists(robotsCachePath):
|
||||||
robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True)
|
with open(robotsCachePath, "r", encoding="utf-8") as f:
|
||||||
if robotsTxt.status_code in (401, 403):
|
logger.debug(f"Getting robots.txt file from cache {parsed_uri.netloc}")
|
||||||
ROBOTS_TXT[robots] = False
|
robotsTxt = f.read()
|
||||||
return False
|
|
||||||
elif robotsTxt.status_code != 200:
|
|
||||||
ROBOTS_TXT[robots] = True
|
|
||||||
return True
|
|
||||||
robotsTxt = robotsTxt.text
|
|
||||||
ROBOTS_TXT[robots] = robotsTxt
|
ROBOTS_TXT[robots] = robotsTxt
|
||||||
except Exception as ex:
|
|
||||||
return True
|
else:
|
||||||
|
try:
|
||||||
|
# We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
|
||||||
|
robotsTxt = get(robots, timeout = 2, ignore_robots_txt=True)
|
||||||
|
if robotsTxt.status_code in (401, 403):
|
||||||
|
ROBOTS_TXT[robots] = False
|
||||||
|
return False
|
||||||
|
elif robotsTxt.status_code != 200:
|
||||||
|
ROBOTS_TXT[robots] = True
|
||||||
|
return True
|
||||||
|
robotsTxt = robotsTxt.text
|
||||||
|
ROBOTS_TXT[robots] = robotsTxt
|
||||||
|
|
||||||
|
with open(robotsCachePath, "w", encoding="utf-8") as f:
|
||||||
|
f.write(robotsTxt)
|
||||||
|
|
||||||
|
except Exception as ex:
|
||||||
|
return True
|
||||||
|
|
||||||
robotParser = urllib.robotparser.RobotFileParser()
|
robotParser = urllib.robotparser.RobotFileParser()
|
||||||
robotParser.parse(robotsTxt.splitlines())
|
robotParser.parse(robotsTxt.splitlines())
|
||||||
|
|
@ -1480,6 +1492,15 @@ if __name__ == "__main__":
|
||||||
else:
|
else:
|
||||||
seen_hosts = ServerList({})
|
seen_hosts = ServerList({})
|
||||||
|
|
||||||
|
# Delete any old robots.txt files so we can re-download them
|
||||||
|
for file_name in os.listdir(arguments.state_dir):
|
||||||
|
file_path = os.path.join(arguments.state_dir,file_name)
|
||||||
|
if file_name.startswith('robots-') and os.path.isfile(file_path):
|
||||||
|
if os.path.getmtime(file_path) < time.time() - 60 * 60 * 24:
|
||||||
|
logger.debug(f"Removing cached robots.txt file {file_name}")
|
||||||
|
os.remove(file_path)
|
||||||
|
|
||||||
|
|
||||||
if(isinstance(arguments.access_token, str)):
|
if(isinstance(arguments.access_token, str)):
|
||||||
setattr(arguments, 'access_token', [arguments.access_token])
|
setattr(arguments, 'access_token', [arguments.access_token])
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue