Merge pull request #124 from nanos/rate-limits

Rate limit fetching of context
2024-06-25 08:53:16 +01:00 · 2024-06-25 08:53:16 +01:00 · 721d2fc5bb
commit 721d2fc5bb
parent f965b4f6fc 120008ced0
1 changed files with 61 additions and 13 deletions
--- a/find_posts.py
+++ b/find_posts.py
@ -459,6 +459,24 @@ def get_reply_toots(user_id, server, access_token, seen_urls, reply_since):
        f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}"
    )

+def toot_context_should_be_fetched(toot):
+    """Return True if this toot's context is due a (re)fetch; registers unseen toots in recently_checked_context."""
+    entry = recently_checked_context.get(toot['uri'])
+    if entry is None or 'lastSeen' not in entry:
+        # Never fetched so far; the caller stamps 'lastSeen' when it actually fetches.
+        recently_checked_context[toot['uri']] = toot
+        return True
+    last_seen, created_at = entry['lastSeen'], entry['created_at']
+    if isinstance(created_at, str):
+        # Entries registered during this run may still hold the unparsed timestamp string.
+        created_at = entry['created_at'] = parser.parse(created_at)
+    now = datetime.now(last_seen.tzinfo)
+    age_seconds = (now - created_at).total_seconds()
+    seconds_since_fetch = (now - last_seen).total_seconds()
+    # After the first hour: at most every 10 minutes for a day, then at most hourly.
+    min_interval = 10 * 60 if age_seconds <= 24 * 60 * 60 else 60 * 60
+    # During the first hour refetching is always allowed.
+    return age_seconds <= 60 * 60 or seconds_since_fetch > min_interval

 def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
    """get the context toots of the given toots from their original server"""
@ -468,12 +486,14 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
        if toot_has_parseable_url(toot, parsed_urls):
            url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
            parsed_url = parse_url(url, parsed_urls)
-            context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
-            if context is not None:
-                for item in context:
-                    known_context_urls.add(item)
-            else:
-                logger.error(f"Error getting context for toot {url}")
+            if(toot_context_should_be_fetched(toot)):
+                recently_checked_context[toot['uri']]['lastSeen'] = datetime.now(datetime.now().astimezone().tzinfo)
+                context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
+                if context is not None:
+                    for item in context:
+                        known_context_urls.add(item)
+                else:
+                    logger.error(f"Error getting context for toot {url}")
    
    known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
    logger.info(f"Found {len(known_context_urls)} known context toots")
@ -707,13 +727,22 @@ def get_redirect_url(url):

 def get_all_context_urls(server, replied_toot_ids, seen_hosts):
    """get the URLs of the context toots of the given toots"""
-    return filter(
-        lambda url: not url.startswith(f"https://{server}/"),
-        itertools.chain.from_iterable(
-            get_toot_context(server, toot_id, url, seen_hosts)
-            for (url, (server, toot_id)) in replied_toot_ids
-        ),
-    )
+    known_context_urls = set()
+    # Unpack into distinct names: reusing `server` as the loop target would
+    # overwrite the home-server argument that the filter below depends on.
+    for (url, (toot_server, toot_id)) in replied_toot_ids:
+        # Only url/server/id are available here (no toot dict), so the per-toot
+        # rate limit cannot be consulted; the context is fetched unconditionally.
+        context = get_toot_context(toot_server, toot_id, url, seen_hosts)
+        if context is None:
+            logger.error(f"Error getting context for toot {url}")
+            continue
+        known_context_urls.update(context)
+
+    known_context_urls = {url for url in known_context_urls if not url.startswith(f"https://{server}/")}
+    logger.info(f"Found {len(known_context_urls)} known context toots")
+
+    return known_context_urls


 def get_toot_context(server, toot_id, toot_url, seen_hosts):
@ -1324,6 +1353,7 @@ if __name__ == "__main__":
        KNOWN_FOLLOWINGS_FILE = os.path.join(arguments.state_dir, "known_followings")
        RECENTLY_CHECKED_USERS_FILE = os.path.join(arguments.state_dir, "recently_checked_users")
        SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
+        RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')


        seen_urls = OrderedSet([])
@ -1353,6 +1383,21 @@ if __name__ == "__main__":
            if(userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60):
                recently_checked_users.pop(user)    

+        recently_checked_context = {}
+        if os.path.exists(RECENTLY_CHECKED_CONTEXTS_FILE):
+            with open(RECENTLY_CHECKED_CONTEXTS_FILE, "r", encoding="utf-8") as f:
+                recently_checked_context = json.load(f)
+
+        # Parse the stored timestamps and drop toots we haven't seen in a while, to ensure this doesn't grow indefinitely.
+        # Iterate over a snapshot of the keys, because stale entries are removed from the dict inside the loop.
+        for tootUrl in list(recently_checked_context):
+            context_entry = recently_checked_context[tootUrl]
+            lastSeen = context_entry['lastSeen'] = parser.parse(context_entry['lastSeen'])
+            context_entry['created_at'] = parser.parse(context_entry['created_at'])
+            # No need to keep track for more than 7 days: if we haven't seen it in 7 days we can refetch the context anyway.
+            if (datetime.now(lastSeen.tzinfo) - lastSeen).total_seconds() > 7 * 24 * 60 * 60:
+                recently_checked_context.pop(tootUrl)
+
        parsed_urls = {}

        all_known_users = OrderedSet(list(known_followings) + list(recently_checked_users))
@ -1470,6 +1515,9 @@ if __name__ == "__main__":
        with open(SEEN_HOSTS_FILE, "w", encoding="utf-8") as f:
            f.write(seen_hosts.toJSON())

+        with open(RECENTLY_CHECKED_CONTEXTS_FILE, "w", encoding="utf-8") as f:
+            f.write(json.dumps(recently_checked_context, default=str))
+
        os.remove(LOCK_FILE)

        if(arguments.on_done != None and arguments.on_done != ''):