Merge pull request #124 from nanos/rate-limits

Rate limit fetching of context
2024-06-25 08:53:16 +01:00 · 2024-06-25 08:53:16 +01:00 · 721d2fc5bb
commit 721d2fc5bb
parent f965b4f6fc 120008ced0
1 changed files with 61 additions and 13 deletions
--- a/find_posts.py
+++ b/find_posts.py
@ -459,6 +459,24 @@ def get_reply_toots(user_id, server, access_token, seen_urls, reply_since):
        f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}"
    )
 def toot_context_should_be_fetched(toot):
    """Decide whether the context of `toot` should be (re)fetched now.

    Reads and updates the module-level `recently_checked_context` dict, which
    is keyed by toot URI. The first time a URI is seen the toot is recorded
    and True is returned. The caller is expected to stamp the entry's
    'lastSeen' whenever it actually fetches the context.

    For an already known toot the refetch rate depends on its age:
      * up to one hour old: always refetch
      * up to one day old:  at most once every 10 minutes
      * older than a day:   at most once per hour

    Returns True if the context should be fetched, False otherwise.
    """
    def _as_datetime(value):
        # Entries recorded during this run hold the toot as it was handed in,
        # so 'created_at' may still be an ISO 8601 string (presumably straight
        # from the API -- TODO confirm) rather than a datetime.
        if isinstance(value, datetime):
            return value
        if isinstance(value, str):
            # Older Pythons' fromisoformat() does not accept a trailing 'Z'.
            text = value[:-1] + '+00:00' if value.endswith('Z') else value
            try:
                return datetime.fromisoformat(text)
            except ValueError:
                return None
        return None

    uri = toot['uri']
    if uri not in recently_checked_context:
        recently_checked_context[uri] = toot
        return True

    entry = recently_checked_context[uri]
    last_seen = _as_datetime(entry.get('lastSeen'))
    created_at = _as_datetime(entry.get('created_at'))
    if last_seen is None or created_at is None:
        # Without usable timestamps we cannot rate limit; allow the fetch
        # rather than crash or suppress it forever.
        return True

    # Use each value's own tzinfo so naive and aware datetimes are never mixed.
    seconds_since_seen = (datetime.now(last_seen.tzinfo) - last_seen).total_seconds()
    age_in_seconds = (datetime.now(created_at.tzinfo) - created_at).total_seconds()

    if age_in_seconds <= 60 * 60:
        # For the first hour: allow refetching context as desired
        return True
    if age_in_seconds <= 24 * 60 * 60 and seconds_since_seen > 10 * 60:
        # For the next day: once every 10 minutes
        return True
    if seconds_since_seen > 60 * 60:
        # After that: hourly
        return True
    return False
 def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
    """get the context toots of the given toots from their original server"""
@ -468,12 +486,14 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
        if toot_has_parseable_url(toot, parsed_urls):
            url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
            parsed_url = parse_url(url, parsed_urls)
-            context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
+            if(toot_context_should_be_fetched(toot)):
-            if context is not None:
+                recently_checked_context[toot['uri']]['lastSeen'] = datetime.now(datetime.now().astimezone().tzinfo)
-                for item in context:
+                context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
-                    known_context_urls.add(item)
+                if context is not None:
-            else:
+                    for item in context:
-                logger.error(f"Error getting context for toot {url}")
+                        known_context_urls.add(item)
                else:
                    logger.error(f"Error getting context for toot {url}")
    known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
    logger.info(f"Found {len(known_context_urls)} known context toots")
@ -707,13 +727,22 @@ def get_redirect_url(url):
 def get_all_context_urls(server, replied_toot_ids, seen_hosts):
    """get the URLs of the context toots of the given toots

    `replied_toot_ids` is an iterable of (url, (server, toot_id)) tuples. The
    context of every entry is requested through get_toot_context() and the
    resulting URLs are collected. URLs that live on our own `server` are
    dropped from the result.

    The per-toot rate limit of toot_context_should_be_fetched() is not applied
    here: that helper needs a toot dict (it reads 'uri' and 'created_at'),
    while this function only receives URL/id tuples.

    Returns a set of context URLs.
    """
    known_context_urls = set()

    # The per-toot host gets its own name so that it does not overwrite the
    # `server` parameter, which the own-server filter below relies on.
    for (url, (toot_server, toot_id)) in replied_toot_ids:
        context = get_toot_context(toot_server, toot_id, url, seen_hosts)
        if context is not None:
            known_context_urls.update(context)
        else:
            logger.error(f"Error getting context for toot {url}")

    own_prefix = f"https://{server}/"
    known_context_urls = {
        context_url
        for context_url in known_context_urls
        if not context_url.startswith(own_prefix)
    }
    logger.info(f"Found {len(known_context_urls)} known context toots")
    return known_context_urls
 def get_toot_context(server, toot_id, toot_url, seen_hosts):
@ -1324,6 +1353,7 @@ if __name__ == "__main__":
        KNOWN_FOLLOWINGS_FILE = os.path.join(arguments.state_dir, "known_followings")
        RECENTLY_CHECKED_USERS_FILE = os.path.join(arguments.state_dir, "recently_checked_users")
        SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
        RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
        seen_urls = OrderedSet([])
@ -1353,6 +1383,21 @@ if __name__ == "__main__":
            if(userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60):
                recently_checked_users.pop(user)    
        # Load the per-toot bookkeeping used to rate limit context fetching.
        # It maps a toot URL to a dict carrying at least 'lastSeen' and
        # 'created_at'.
        recently_checked_context = {}
        if os.path.exists(RECENTLY_CHECKED_CONTEXTS_FILE):
            with open(RECENTLY_CHECKED_CONTEXTS_FILE, "r", encoding="utf-8") as f:
                recently_checked_context = json.load(f)

        # Remove any toots that we haven't seen in a while, to ensure this doesn't grow indefinitely
        # Iterate over a snapshot of the keys: entries are removed inside the
        # loop, and a dict must not change size while it is being iterated.
        for tootUrl in list(recently_checked_context):
            entry = recently_checked_context[tootUrl]
            # Timestamps come back from JSON as strings; turn them into
            # datetimes again before they are compared.
            entry['lastSeen'] = parser.parse(entry['lastSeen'])
            entry['created_at'] = parser.parse(entry['created_at'])
            sinceLastSeen = datetime.now(entry['lastSeen'].tzinfo) - entry['lastSeen']

            # dont really need to keep track for more than 7 days: if we haven't seen it in 7 days we can refetch content anyway
            if sinceLastSeen.total_seconds() > 7 * 24 * 60 * 60:
                recently_checked_context.pop(tootUrl)
        parsed_urls = {}
        all_known_users = OrderedSet(list(known_followings) + list(recently_checked_users))
@ -1470,6 +1515,9 @@ if __name__ == "__main__":
        with open(SEEN_HOSTS_FILE, "w", encoding="utf-8") as f:
            f.write(seen_hosts.toJSON())
        with open(RECENTLY_CHECKED_CONTEXTS_FILE, "w", encoding="utf-8") as f:
            f.write(json.dumps(recently_checked_context, default=str))
        os.remove(LOCK_FILE)
        if(arguments.on_done != None and arguments.on_done != ''):