Merge pull request #124 from nanos/rate-limits
Rate limit fetching of context
This commit is contained in:
commit
721d2fc5bb
1 changed files with 61 additions and 13 deletions
|
|
@ -459,6 +459,24 @@ def get_reply_toots(user_id, server, access_token, seen_urls, reply_since):
|
||||||
f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}"
|
f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def toot_context_should_be_fetched(toot):
    """Decide whether the context of ``toot`` should be (re)fetched now.

    Rate limiting is keyed on ``toot['uri']`` and tracked in the module-level
    ``recently_checked_context`` dict:

    * a toot that has not been recorded yet is stored and always fetched;
    * a toot created within the last hour may be refetched as often as desired;
    * a toot created within the last day is refetched at most every 10 minutes;
    * anything older is refetched at most once an hour.

    The time of the last fetch is read from the entry's ``'lastSeen'`` value.
    This function never writes that value itself.

    Args:
        toot: dict with at least the keys ``'uri'`` and ``'created_at'``.

    Returns:
        True if the context should be fetched, False if it was fetched too
        recently.
    """
    uri = toot['uri']
    if uri not in recently_checked_context:
        # First sighting: remember the toot and always fetch its context.
        recently_checked_context[uri] = toot
        return True

    entry = recently_checked_context[uri]
    last_seen = entry.get('lastSeen')
    if last_seen is None:
        # The toot was recorded but no fetch time was ever stored for it, so
        # there is nothing to rate limit against.
        return True
    created_at = entry['created_at']

    # NOTE(review): entries stored above may still carry string timestamps
    # (presumably straight from the API response - confirm); normalise them
    # once so the arithmetic below always works on datetimes.
    if isinstance(last_seen, str):
        last_seen = entry['lastSeen'] = parser.parse(last_seen)
    if isinstance(created_at, str):
        created_at = entry['created_at'] = parser.parse(created_at)

    # Each "now" uses the tzinfo of the value it is compared with, so a naive
    # and an aware datetime are never subtracted from each other.
    seconds_since_last_fetch = (datetime.now(last_seen.tzinfo) - last_seen).total_seconds()
    age_in_seconds = (datetime.now(created_at.tzinfo) - created_at).total_seconds()

    one_hour = 60 * 60
    if age_in_seconds <= one_hour:
        # For the first hour: allow refetching context as desired
        return True
    if age_in_seconds <= 24 * one_hour and seconds_since_last_fetch > 10 * 60:
        # For the next day: once every 10 minutes
        return True
    # After that: hourly
    return seconds_since_last_fetch > one_hour
|
||||||
|
|
||||||
def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
|
def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
|
||||||
"""get the context toots of the given toots from their original server"""
|
"""get the context toots of the given toots from their original server"""
|
||||||
|
|
@ -468,12 +486,14 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
|
||||||
if toot_has_parseable_url(toot, parsed_urls):
|
if toot_has_parseable_url(toot, parsed_urls):
|
||||||
url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
|
url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
|
||||||
parsed_url = parse_url(url, parsed_urls)
|
parsed_url = parse_url(url, parsed_urls)
|
||||||
context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
|
if(toot_context_should_be_fetched(toot)):
|
||||||
if context is not None:
|
recently_checked_context[toot['uri']]['lastSeen'] = datetime.now(datetime.now().astimezone().tzinfo)
|
||||||
for item in context:
|
context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
|
||||||
known_context_urls.add(item)
|
if context is not None:
|
||||||
else:
|
for item in context:
|
||||||
logger.error(f"Error getting context for toot {url}")
|
known_context_urls.add(item)
|
||||||
|
else:
|
||||||
|
logger.error(f"Error getting context for toot {url}")
|
||||||
|
|
||||||
known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
|
known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
|
||||||
logger.info(f"Found {len(known_context_urls)} known context toots")
|
logger.info(f"Found {len(known_context_urls)} known context toots")
|
||||||
|
|
@ -707,13 +727,22 @@ def get_redirect_url(url):
|
||||||
|
|
||||||
def get_all_context_urls(server, replied_toot_ids, seen_hosts):
    """Get the URLs of the context toots of the given toots.

    Args:
        server: host name of the home server; context URLs that start with
            ``https://{server}/`` are dropped from the result.
        replied_toot_ids: iterable of ``(url, (toot_server, toot_id))`` tuples
            identifying the toots whose context should be looked up.
        seen_hosts: passed through unchanged to ``get_toot_context``.

    Returns:
        A set of context toot URLs that are not on the home server.
    """
    # NOTE(review): no per-toot rate limiting is applied here. Only the toot's
    # URL, server and id are available in this function, not the toot dict
    # that toot_context_should_be_fetched() needs ('uri', 'created_at').
    known_context_urls = set()
    # The loop variable must not be called `server`: that would overwrite the
    # home-server parameter used by the filter below.
    for url, (toot_server, toot_id) in replied_toot_ids:
        context = get_toot_context(toot_server, toot_id, url, seen_hosts)
        if context is None:
            logger.error(f"Error getting context for toot {url}")
            continue
        known_context_urls.update(context)

    home_prefix = f"https://{server}/"
    known_context_urls = {
        context_url
        for context_url in known_context_urls
        if not context_url.startswith(home_prefix)
    }
    logger.info(f"Found {len(known_context_urls)} known context toots")
    return known_context_urls
|
||||||
|
|
||||||
|
|
||||||
def get_toot_context(server, toot_id, toot_url, seen_hosts):
|
def get_toot_context(server, toot_id, toot_url, seen_hosts):
|
||||||
|
|
@ -1324,6 +1353,7 @@ if __name__ == "__main__":
|
||||||
KNOWN_FOLLOWINGS_FILE = os.path.join(arguments.state_dir, "known_followings")
|
KNOWN_FOLLOWINGS_FILE = os.path.join(arguments.state_dir, "known_followings")
|
||||||
RECENTLY_CHECKED_USERS_FILE = os.path.join(arguments.state_dir, "recently_checked_users")
|
RECENTLY_CHECKED_USERS_FILE = os.path.join(arguments.state_dir, "recently_checked_users")
|
||||||
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
|
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
|
||||||
|
RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
|
||||||
|
|
||||||
|
|
||||||
seen_urls = OrderedSet([])
|
seen_urls = OrderedSet([])
|
||||||
|
|
@ -1353,6 +1383,21 @@ if __name__ == "__main__":
|
||||||
if(userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60):
|
if(userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60):
|
||||||
recently_checked_users.pop(user)
|
recently_checked_users.pop(user)
|
||||||
|
|
||||||
|
# Load the per-toot context rate-limiting state, if a state file exists.
recently_checked_context = {}
if os.path.exists(RECENTLY_CHECKED_CONTEXTS_FILE):
    with open(RECENTLY_CHECKED_CONTEXTS_FILE, "r", encoding="utf-8") as f:
        recently_checked_context = json.load(f)

# Remove any toots that we haven't seen in a while, to ensure this doesn't grow indefinitely.
# Iterate over a snapshot of the keys, because entries are removed inside the loop.
for tootUrl in list(recently_checked_context):
    # json.load() returns the timestamps as strings; turn them back into datetimes.
    recently_checked_context[tootUrl]['lastSeen'] = parser.parse(recently_checked_context[tootUrl]['lastSeen'])
    recently_checked_context[tootUrl]['created_at'] = parser.parse(recently_checked_context[tootUrl]['created_at'])
    lastSeen = recently_checked_context[tootUrl]['lastSeen']
    timeSinceLastSeen = datetime.now(lastSeen.tzinfo) - lastSeen
    # We don't really need to keep track for more than 7 days: if we haven't seen it in 7 days we can refetch content anyway.
    if timeSinceLastSeen.total_seconds() > 7 * 24 * 60 * 60:
        # Drop the stale entry from the context cache itself (not from recently_checked_users).
        recently_checked_context.pop(tootUrl)
|
||||||
|
|
||||||
parsed_urls = {}
|
parsed_urls = {}
|
||||||
|
|
||||||
all_known_users = OrderedSet(list(known_followings) + list(recently_checked_users))
|
all_known_users = OrderedSet(list(known_followings) + list(recently_checked_users))
|
||||||
|
|
@ -1470,6 +1515,9 @@ if __name__ == "__main__":
|
||||||
with open(SEEN_HOSTS_FILE, "w", encoding="utf-8") as f:
|
with open(SEEN_HOSTS_FILE, "w", encoding="utf-8") as f:
|
||||||
f.write(seen_hosts.toJSON())
|
f.write(seen_hosts.toJSON())
|
||||||
|
|
||||||
|
with open(RECENTLY_CHECKED_CONTEXTS_FILE, "w", encoding="utf-8") as f:
|
||||||
|
f.write(json.dumps(recently_checked_context, default=str))
|
||||||
|
|
||||||
os.remove(LOCK_FILE)
|
os.remove(LOCK_FILE)
|
||||||
|
|
||||||
if(arguments.on_done != None and arguments.on_done != ''):
|
if(arguments.on_done != None and arguments.on_done != ''):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue