respect robots.txt

This commit is contained in:
nanos 2024-06-25 10:24:45 +01:00
parent ed5f0ba3b4
commit 1b4c135f8f

View file

@ -14,8 +14,13 @@ import time
import argparse import argparse
import uuid import uuid
import defusedxml.ElementTree as ET import defusedxml.ElementTree as ET
import urllib.robotparser
from urllib.parse import urlparse
logger = logging.getLogger("FediFetcher") logger = logging.getLogger("FediFetcher")
robotParser = urllib.robotparser.RobotFileParser()
VERSION = "7.0.8"
argparser=argparse.ArgumentParser() argparser=argparse.ArgumentParser()
@ -1000,12 +1005,25 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
break break
return result return result
# Parsed robots.txt files keyed by their robots.txt URL, so that each origin's
# robots.txt is downloaded at most once per run instead of once per request.
ROBOTS_PARSERS = {}


def can_fetch(user_agent, url):
    """Return whether the robots.txt of the host serving `url` permits fetching it.

    The robots.txt location is derived from the scheme and network location of
    `url`. The parsed file is cached per robots.txt URL, so repeated calls for
    the same origin do not trigger another download.

    Args:
        user_agent: The User-Agent string the request will be sent with.
        url: The absolute URL that is about to be requested.

    Returns:
        True if the rules allow `user_agent` to fetch `url`, False otherwise.

    Raises:
        Any exception raised by `RobotFileParser.read()` while downloading
        robots.txt propagates unchanged; nothing is cached in that case, so
        the next call tries again.
    """
    parsed_uri = urlparse(url)
    robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)

    parser = ROBOTS_PARSERS.get(robots)
    if parser is None:
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(robots)
        # NOTE(review): read() is called without an explicit timeout here;
        # confirm whether a socket default timeout is configured elsewhere.
        parser.read()
        # Store only after a successful read so a transient failure is retried.
        ROBOTS_PARSERS[robots] = parser

    return parser.can_fetch(user_agent, url)
def user_agent():
    """Build the User-Agent header value sent with outgoing requests.

    The value combines the product token with the module-level `VERSION`,
    the configured `arguments.server`, and the project's info URL.
    """
    product = f"FediFetcher/{VERSION}"
    return f"{product}; +{arguments.server} (https://go.thms.uk/ff)"
def get(url, headers = {}, timeout = 0, max_tries = 5): def get(url, headers = {}, timeout = 0, max_tries = 5):
"""A simple wrapper to make a get request while providing our user agent, and respecting rate limits""" """A simple wrapper to make a get request while providing our user agent, and respecting rate limits"""
h = headers.copy() h = headers.copy()
if 'User-Agent' not in h: if 'User-Agent' not in h:
h['User-Agent'] = f"FediFetcher (+{arguments.server}; https://go.thms.uk/ff)" h['User-Agent'] = user_agent()
if not can_fetch(h['User-Agent'], url):
raise Exception(f"Querying {url} prohibited by robots.txt")
if timeout == 0: if timeout == 0:
timeout = arguments.http_timeout timeout = arguments.http_timeout
@ -1027,8 +1045,11 @@ def post(url, json, headers = {}, timeout = 0, max_tries = 5):
"""A simple wrapper to make a post request while providing our user agent, and respecting rate limits""" """A simple wrapper to make a post request while providing our user agent, and respecting rate limits"""
h = headers.copy() h = headers.copy()
if 'User-Agent' not in h: if 'User-Agent' not in h:
h['User-Agent'] = 'FediFetcher (https://go.thms.uk/mgr)' h['User-Agent'] = user_agent()
if not can_fetch(h['User-Agent'], url):
raise Exception(f"Querying {url} prohibited by robots.txt")
if timeout == 0: if timeout == 0:
timeout = arguments.http_timeout timeout = arguments.http_timeout