ensure callbacks aren't blocked by robots.txt

This commit is contained in:
nanos 2024-06-25 10:38:47 +01:00
parent 1b4c135f8f
commit 885b84d598

View file

@ -1016,13 +1016,13 @@ def can_fetch(user_agent, url):
def user_agent(): def user_agent():
return f"FediFetcher/{VERSION}; +{arguments.server} (https://go.thms.uk/ff)" return f"FediFetcher/{VERSION}; +{arguments.server} (https://go.thms.uk/ff)"
def get(url, headers = {}, timeout = 0, max_tries = 5): def get(url, headers = {}, timeout = 0, max_tries = 5, ignore_robots_txt = False):
"""A simple wrapper to make a get request while providing our user agent, and respecting rate limits""" """A simple wrapper to make a get request while providing our user agent, and respecting rate limits"""
h = headers.copy() h = headers.copy()
if 'User-Agent' not in h: if 'User-Agent' not in h:
h['User-Agent'] = user_agent() h['User-Agent'] = user_agent()
if not can_fetch(h['User-Agent'], url): if not ignore_robots_txt and not can_fetch(h['User-Agent'], url):
raise Exception(f"Querying {url} prohibited by robots.txt") raise Exception(f"Querying {url} prohibited by robots.txt")
if timeout == 0: if timeout == 0:
@ -1334,7 +1334,7 @@ if __name__ == "__main__":
if(arguments.on_start != None and arguments.on_start != ''): if(arguments.on_start != None and arguments.on_start != ''):
try: try:
get(f"{arguments.on_start}?rid={runId}") get(f"{arguments.on_start}?rid={runId}", ignore_robots_txt = True)
except Exception as ex: except Exception as ex:
logger.error(f"Error getting callback url: {ex}") logger.error(f"Error getting callback url: {ex}")
@ -1356,7 +1356,7 @@ if __name__ == "__main__":
logger.critical(f"Lock file age is {datetime.now() - lock_time} - below --lock-hours={arguments.lock_hours} provided.") logger.critical(f"Lock file age is {datetime.now() - lock_time} - below --lock-hours={arguments.lock_hours} provided.")
if(arguments.on_fail != None and arguments.on_fail != ''): if(arguments.on_fail != None and arguments.on_fail != ''):
try: try:
get(f"{arguments.on_fail}?rid={runId}") get(f"{arguments.on_fail}?rid={runId}", ignore_robots_txt = True)
except Exception as ex: except Exception as ex:
logger.error(f"Error getting callback url: {ex}") logger.error(f"Error getting callback url: {ex}")
sys.exit(1) sys.exit(1)
@ -1365,7 +1365,7 @@ if __name__ == "__main__":
logger.critical(f"Cannot read logfile age - aborting.") logger.critical(f"Cannot read logfile age - aborting.")
if(arguments.on_fail != None and arguments.on_fail != ''): if(arguments.on_fail != None and arguments.on_fail != ''):
try: try:
get(f"{arguments.on_fail}?rid={runId}") get(f"{arguments.on_fail}?rid={runId}", ignore_robots_txt = True)
except Exception as ex: except Exception as ex:
logger.error(f"Error getting callback url: {ex}") logger.error(f"Error getting callback url: {ex}")
sys.exit(1) sys.exit(1)
@ -1549,7 +1549,7 @@ if __name__ == "__main__":
if(arguments.on_done != None and arguments.on_done != ''): if(arguments.on_done != None and arguments.on_done != ''):
try: try:
get(f"{arguments.on_done}?rid={runId}") get(f"{arguments.on_done}?rid={runId}", ignore_robots_txt = True)
except Exception as ex: except Exception as ex:
logger.error(f"Error getting callback url: {ex}") logger.error(f"Error getting callback url: {ex}")
@ -1560,7 +1560,7 @@ if __name__ == "__main__":
logger.error(f"Job failed after {datetime.now() - start}.") logger.error(f"Job failed after {datetime.now() - start}.")
if(arguments.on_fail != None and arguments.on_fail != ''): if(arguments.on_fail != None and arguments.on_fail != ''):
try: try:
get(f"{arguments.on_fail}?rid={runId}") get(f"{arguments.on_fail}?rid={runId}", ignore_robots_txt = True)
except Exception as ex: except Exception as ex:
logger.error(f"Error getting callback url: {ex}") logger.error(f"Error getting callback url: {ex}")
raise raise