respect robots.txt

2024-06-25 10:24:45 +01:00 · 2024-06-25 10:24:45 +01:00 · 1b4c135f8f
commit 1b4c135f8f
parent ed5f0ba3b4
1 changed files with 23 additions and 2 deletions
--- a/find_posts.py
+++ b/find_posts.py
@ -14,8 +14,13 @@ import time
 import argparse
 import uuid
 import defusedxml.ElementTree as ET
 import urllib.robotparser
 from urllib.parse import urlparse
 logger = logging.getLogger("FediFetcher")
 robotParser = urllib.robotparser.RobotFileParser()
 VERSION = "7.0.8"
 argparser=argparse.ArgumentParser()
@ -1000,12 +1005,25 @@ def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
                break
    return result
 # Parsed robots.txt rules keyed by robots.txt URL, so that each host's file is
 # downloaded once and then reused for later requests to the same host.
 _ROBOTS_PARSERS = {}


 def can_fetch(user_agent, url):
     """Return whether robots.txt permits ``user_agent`` to fetch ``url``.

     The robots.txt location is derived from the scheme and network location
     of ``url``. The parsed rules are kept in ``_ROBOTS_PARSERS`` so repeated
     calls for the same host do not download robots.txt again; rules are
     therefore not refreshed while the process is running.

     Args:
         user_agent: The User-Agent string the request will be sent with.
         url: The absolute URL about to be requested.

     Returns:
         The result of ``RobotFileParser.can_fetch(user_agent, url)``.

     Raises:
         Any exception raised by ``RobotFileParser.read()`` propagates to the
         caller; in that case nothing is cached and the next call retries.
     """
     parsed_uri = urlparse(url)
     robots = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)

     parser = _ROBOTS_PARSERS.get(robots)
     if parser is None:
         parser = urllib.robotparser.RobotFileParser()
         parser.set_url(robots)
         parser.read()
         # Store only after read() returned, so a failed download is retried.
         _ROBOTS_PARSERS[robots] = parser

     return parser.can_fetch(user_agent, url)
 def user_agent():
     """Build the User-Agent header value used for outgoing requests.

     The value is the product token ``FediFetcher/<VERSION>`` followed by the
     configured ``arguments.server`` and the project URL.
     """
     product = f"FediFetcher/{VERSION}"
     return f"{product}; +{arguments.server} (https://go.thms.uk/ff)"
 def get(url, headers = {}, timeout = 0, max_tries = 5):
    """A simple wrapper to make a get request while providing our user agent, and respecting rate limits"""
    h = headers.copy()
    if 'User-Agent' not in h:
-        h['User-Agent'] = f"FediFetcher (+{arguments.server}; https://go.thms.uk/ff)"
+        h['User-Agent'] = user_agent()
    if not can_fetch(h['User-Agent'], url):
        raise Exception(f"Querying {url} prohibited by robots.txt")    
    if timeout == 0:
        timeout = arguments.http_timeout
@ -1027,8 +1045,11 @@ def post(url, json, headers = {}, timeout = 0, max_tries = 5):
    """A simple wrapper to make a post request while providing our user agent, and respecting rate limits"""
    h = headers.copy()
    if 'User-Agent' not in h:
-        h['User-Agent'] = 'FediFetcher (https://go.thms.uk/mgr)'
+        h['User-Agent'] = user_agent()
    if not can_fetch(h['User-Agent'], url):
        raise Exception(f"Querying {url} prohibited by robots.txt")    
    if timeout == 0:
        timeout = arguments.http_timeout