Cache robots.txt
This commit is contained in:
parent
ac8044db83
commit
7b9896b5c0
1 changed file with 19 additions and 9 deletions
|
|
@ -1009,14 +1009,23 @@ def can_fetch(user_agent, url):
|
||||||
parsed_uri = urlparse(url)
|
parsed_uri = urlparse(url)
|
||||||
robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
|
robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
|
||||||
|
|
||||||
|
if robots in ROBOTS_TXT:
|
||||||
|
if isinstance(ROBOTS_TXT[robots], bool):
|
||||||
|
return ROBOTS_TXT[robots]
|
||||||
|
else:
|
||||||
|
robotsTxt = ROBOTS_TXT[robots]
|
||||||
|
else:
|
||||||
try:
|
try:
|
||||||
# We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
|
# We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
|
||||||
robotsTxt = get(robots, ignore_robots_txt=True)
|
robotsTxt = get(robots, ignore_robots_txt=True)
|
||||||
if robotsTxt.status_code in (401, 403):
|
if robotsTxt.status_code in (401, 403):
|
||||||
|
ROBOTS_TXT[robots] = False
|
||||||
return False
|
return False
|
||||||
elif robotsTxt.status_code != 200:
|
elif robotsTxt.status_code != 200:
|
||||||
|
ROBOTS_TXT[robots] = True
|
||||||
return True
|
return True
|
||||||
robotsTxt = robotsTxt.text
|
robotsTxt = robotsTxt.text
|
||||||
|
ROBOTS_TXT[robots] = robotsTxt
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
@ -1394,6 +1403,7 @@ if __name__ == "__main__":
|
||||||
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
|
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
|
||||||
RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
|
RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
|
||||||
|
|
||||||
|
ROBOTS_TXT = {}
|
||||||
|
|
||||||
seen_urls = OrderedSet([])
|
seen_urls = OrderedSet([])
|
||||||
if os.path.exists(SEEN_URLS_FILE):
|
if os.path.exists(SEEN_URLS_FILE):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue