feat: enforce blocklist (and remove testing statements)
This commit is contained in:
parent
6e1830f87c
commit
3eb2589a8b
1 changed file with 4 additions and 3 deletions
|
|
@@ -1121,6 +1121,10 @@ def can_fetch(user_agent, url):
|
||||||
parsed_uri = urlparse(url)
|
parsed_uri = urlparse(url)
|
||||||
robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
|
robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
|
||||||
|
|
||||||
|
if parsed_uri.netloc in INSTANCE_BLOCKLIST:
|
||||||
|
# Never connect to these locations
|
||||||
|
raise Exception(f"Connecting to {parsed_uri.netloc} is prohibited by the configured blocklist")
|
||||||
|
|
||||||
robotsTxt = get_robots_from_url(robots_url)
|
robotsTxt = get_robots_from_url(robots_url)
|
||||||
if isinstance(robotsTxt, bool):
|
if isinstance(robotsTxt, bool):
|
||||||
return robotsTxt
|
return robotsTxt
|
||||||
|
|
@@ -1577,9 +1581,6 @@ if __name__ == "__main__":
|
||||||
INSTANCE_BLOCKLIST = arguments.instance_blocklist.split(",")
|
INSTANCE_BLOCKLIST = arguments.instance_blocklist.split(",")
|
||||||
ROBOTS_TXT = {}
|
ROBOTS_TXT = {}
|
||||||
|
|
||||||
print(INSTANCE_BLOCKLIST)
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
seen_urls = OrderedSet([])
|
seen_urls = OrderedSet([])
|
||||||
if os.path.exists(SEEN_URLS_FILE):
|
if os.path.exists(SEEN_URLS_FILE):
|
||||||
with open(SEEN_URLS_FILE, "r", encoding="utf-8") as f:
|
with open(SEEN_URLS_FILE, "r", encoding="utf-8") as f:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue