Merge pull request #124 from nanos/rate-limits

Rate limit fetching of context
This commit is contained in:
Michael 2024-06-25 08:53:16 +01:00 committed by GitHub
commit 721d2fc5bb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -459,6 +459,24 @@ def get_reply_toots(user_id, server, access_token, seen_urls, reply_since):
f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}"
)
def toot_context_should_be_fetched(toot):
    """Decide whether the context of ``toot`` is due to be fetched (again).

    The decision uses the module-level ``recently_checked_context`` cache,
    keyed by ``toot['uri']``:

    * a toot that is not cached yet is stored in the cache and is always due;
    * a toot created at most an hour ago is always due;
    * a toot created at most a day ago is due once more than 10 minutes
      have passed since its ``lastSeen`` timestamp;
    * any older toot is due once more than an hour has passed since
      ``lastSeen``.

    This function never sets ``lastSeen`` itself; the cached entry must
    already carry it by the time the same URI is checked again.

    Args:
        toot: dict with at least a ``'uri'`` key. The dict itself becomes the
            cache entry on first sight, so it also needs ``'created_at'``
            for later checks.

    Returns:
        True if the context should be fetched now, False otherwise.
    """
    uri = toot['uri']
    if uri not in recently_checked_context:
        recently_checked_context[uri] = toot
        return True

    cached = recently_checked_context[uri]
    last_seen = cached['lastSeen']
    created_at = cached['created_at']
    # An entry cached earlier in this run is the toot exactly as it was
    # passed in, so its 'created_at' may still be a string rather than a
    # datetime. Parse it locally (without writing it back, so the caller's
    # toot stays untouched) instead of failing on ``datetime - str``.
    if isinstance(created_at, str):
        created_at = parser.parse(created_at)

    seconds_since_seen = (datetime.now(last_seen.tzinfo) - last_seen).total_seconds()
    # Take "now" in created_at's own timezone so the subtraction never mixes
    # naive and aware datetimes.
    age_in_seconds = (datetime.now(created_at.tzinfo) - created_at).total_seconds()

    if age_in_seconds <= 60 * 60:
        # For the first hour: allow refetching context as desired
        return True
    if age_in_seconds <= 24 * 60 * 60 and seconds_since_seen > 10 * 60:
        # For the next day: once every 10 minutes
        return True
    # After that: hourly
    return seconds_since_seen > 60 * 60
def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
"""get the context toots of the given toots from their original server"""
@ -468,12 +486,14 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
if toot_has_parseable_url(toot, parsed_urls):
url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
parsed_url = parse_url(url, parsed_urls)
context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
if context is not None:
for item in context:
known_context_urls.add(item)
else:
logger.error(f"Error getting context for toot {url}")
if(toot_context_should_be_fetched(toot)):
recently_checked_context[toot['uri']]['lastSeen'] = datetime.now(datetime.now().astimezone().tzinfo)
context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
if context is not None:
for item in context:
known_context_urls.add(item)
else:
logger.error(f"Error getting context for toot {url}")
known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
logger.info(f"Found {len(known_context_urls)} known context toots")
@ -707,13 +727,22 @@ def get_redirect_url(url):
def get_all_context_urls(server, replied_toot_ids, seen_hosts):
    """get the URLs of the context toots of the given toots

    Args:
        server: the home server; URLs starting with ``https://{server}/``
            are dropped from the result.
        replied_toot_ids: iterable of ``(url, (toot_server, toot_id))``
            tuples identifying the toots whose context is wanted.
        seen_hosts: passed through unchanged to ``get_toot_context``.

    Returns:
        A set of context URLs that do not belong to ``server``.
    """
    known_context_urls = set()
    # The loop variable is deliberately not named ``server``: rebinding the
    # parameter would make the own-server filter below compare against the
    # server of the last toot instead of the home server.
    for (url, (toot_server, toot_id)) in replied_toot_ids:
        # NOTE(review): toot_context_should_be_fetched() expects a toot dict
        # (it reads 'uri' and 'created_at'), but only the URL and id are
        # available here, so no rate limiting is applied in this function.
        # Confirm whether callers can supply the toot objects instead.
        context = get_toot_context(toot_server, toot_id, url, seen_hosts)
        if context is None:
            logger.error(f"Error getting context for toot {url}")
            continue
        known_context_urls.update(context)

    own_prefix = f"https://{server}/"
    known_context_urls = {
        context_url
        for context_url in known_context_urls
        if not context_url.startswith(own_prefix)
    }
    logger.info(f"Found {len(known_context_urls)} known context toots")
    return known_context_urls
def get_toot_context(server, toot_id, toot_url, seen_hosts):
@ -1324,6 +1353,7 @@ if __name__ == "__main__":
KNOWN_FOLLOWINGS_FILE = os.path.join(arguments.state_dir, "known_followings")
RECENTLY_CHECKED_USERS_FILE = os.path.join(arguments.state_dir, "recently_checked_users")
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
seen_urls = OrderedSet([])
@ -1353,6 +1383,21 @@ if __name__ == "__main__":
if(userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60):
recently_checked_users.pop(user)
# Cache of toots whose context was fetched recently, keyed by toot URL and
# restored from the state file written at the end of the previous run.
recently_checked_context = {}
if os.path.exists(RECENTLY_CHECKED_CONTEXTS_FILE):
    with open(RECENTLY_CHECKED_CONTEXTS_FILE, "r", encoding="utf-8") as f:
        recently_checked_context = json.load(f)

# Remove any toots that we haven't seen in a while, to ensure this doesn't grow indefinitely.
# Iterate over a snapshot of the keys because entries are removed from the
# dict inside the loop.
for tootUrl in list(recently_checked_context):
    entry = recently_checked_context[tootUrl]
    # Timestamps are stored as strings in the state file; turn them back
    # into datetimes for the arithmetic below and for later lookups.
    entry['lastSeen'] = parser.parse(entry['lastSeen'])
    entry['created_at'] = parser.parse(entry['created_at'])
    lastSeen = entry['lastSeen']
    contextAge = datetime.now(lastSeen.tzinfo) - lastSeen
    # don't really need to keep track for more than 7 days: if we haven't seen it in 7 days we can refetch content anyway
    if contextAge.total_seconds() > 7 * 24 * 60 * 60:
        # Prune from the context cache itself (not from recently_checked_users).
        recently_checked_context.pop(tootUrl)
parsed_urls = {}
all_known_users = OrderedSet(list(known_followings) + list(recently_checked_users))
@ -1470,6 +1515,9 @@ if __name__ == "__main__":
with open(SEEN_HOSTS_FILE, "w", encoding="utf-8") as f:
f.write(seen_hosts.toJSON())
with open(RECENTLY_CHECKED_CONTEXTS_FILE, "w", encoding="utf-8") as f:
f.write(json.dumps(recently_checked_context, default=str))
os.remove(LOCK_FILE)
if(arguments.on_done != None and arguments.on_done != ''):