diff --git a/find_posts.py b/find_posts.py
index 22204ae..975baac 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -459,6 +459,43 @@ def get_reply_toots(user_id, server, access_token, seen_urls, reply_since):
         f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}"
     )
 
+
+def toot_context_should_be_fetched(toot):
+    """Decide whether the context (thread) of a toot is due to be fetched.
+
+    A toot that is not yet in the module-level ``recently_checked_context``
+    cache is recorded there and always fetched. For a known toot the answer
+    depends on its age and on how long ago its context was last fetched:
+    freely during the first hour, at most every 10 minutes during the first
+    day, and at most hourly after that.
+
+    The caller is expected to stamp ``lastSeen`` on the cache entry whenever
+    this returns True.
+    """
+    if toot['uri'] not in recently_checked_context:
+        recently_checked_context[toot['uri']] = toot
+        return True
+
+    cached = recently_checked_context[toot['uri']]
+    lastSeen = cached['lastSeen']
+    createdAt = cached['created_at']
+    if isinstance(createdAt, str):
+        # Entries added during this run may still hold the raw timestamp
+        # string; only entries loaded from disk are parsed up front.
+        createdAt = parser.parse(createdAt)
+    # Build "now" with each value's own tzinfo so that naive and aware
+    # datetimes are never subtracted from each other.
+    lastSeenInSeconds = (datetime.now(lastSeen.tzinfo) - lastSeen).total_seconds()
+    ageInSeconds = (datetime.now(createdAt.tzinfo) - createdAt).total_seconds()
+    if ageInSeconds <= 60 * 60:
+        # For the first hour: allow refetching context as desired
+        return True
+    if ageInSeconds <= 24 * 60 * 60:
+        # For the next day: once every 10 minutes
+        return lastSeenInSeconds > 10 * 60
+    # After that: hourly
+    return lastSeenInSeconds > 60 * 60
+
 
 def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
     """get the context toots of the given toots from their original server"""
@@ -468,12 +505,14 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
         if toot_has_parseable_url(toot, parsed_urls):
             url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
             parsed_url = parse_url(url, parsed_urls)
-            context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
-            if context is not None:
-                for item in context:
-                    known_context_urls.add(item)
-            else:
-                logger.error(f"Error getting context for toot {url}")
+            if(toot_context_should_be_fetched(toot)):
+                recently_checked_context[toot['uri']]['lastSeen'] = datetime.now(datetime.now().astimezone().tzinfo)
+                context = get_toot_context(parsed_url[0], parsed_url[1], url, seen_hosts)
+                if context is not None:
+                    for item in context:
+                        known_context_urls.add(item)
+                else:
+                    logger.error(f"Error getting context for toot {url}")
 
     known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
     logger.info(f"Found {len(known_context_urls)} known context toots")
@@ -707,13 +746,23 @@ def get_redirect_url(url):
 
 def get_all_context_urls(server, replied_toot_ids, seen_hosts):
     """get the URLs of the context toots of the given toots"""
-    return filter(
-        lambda url: not url.startswith(f"https://{server}/"),
-        itertools.chain.from_iterable(
-            get_toot_context(server, toot_id, url, seen_hosts)
-            for (url, (server, toot_id)) in replied_toot_ids
-        ),
-    )
+    known_context_urls = set()
+    # Only the URL, server and id of each toot are available here (no toot
+    # dict), so toot_context_should_be_fetched() cannot be consulted.
+    # The loop variable is named toot_server so it does not clobber the
+    # `server` parameter, which the home-server filter below still needs.
+    for (url, (toot_server, toot_id)) in replied_toot_ids:
+        context = get_toot_context(toot_server, toot_id, url, seen_hosts)
+        if context is not None:
+            known_context_urls.update(context)
+        else:
+            logger.error(f"Error getting context for toot {url}")
+
+    known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
+
+    logger.info(f"Found {len(known_context_urls)} known context toots")
+
+    return known_context_urls
 
 
 def get_toot_context(server, toot_id, toot_url, seen_hosts):
@@ -1324,6 +1373,7 @@ if __name__ == "__main__":
         KNOWN_FOLLOWINGS_FILE = os.path.join(arguments.state_dir, "known_followings")
         RECENTLY_CHECKED_USERS_FILE = os.path.join(arguments.state_dir, "recently_checked_users")
         SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
+        RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
 
 
         seen_urls = OrderedSet([])
@@ -1353,6 +1403,22 @@ if __name__ == "__main__":
             if(userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60):
                 recently_checked_users.pop(user)
 
+        recently_checked_context = {}
+        if(os.path.exists(RECENTLY_CHECKED_CONTEXTS_FILE)):
+            with open(RECENTLY_CHECKED_CONTEXTS_FILE, "r", encoding="utf-8") as f:
+                recently_checked_context = json.load(f)
+
+        # Remove any toots that we haven't seen in a while, to ensure this doesn't grow indefinitely
+        # Iterate over a snapshot of the keys, because entries are popped inside the loop
+        for tootUrl in list(recently_checked_context):
+            recently_checked_context[tootUrl]['lastSeen'] = parser.parse(recently_checked_context[tootUrl]['lastSeen'])
+            recently_checked_context[tootUrl]['created_at'] = parser.parse(recently_checked_context[tootUrl]['created_at'])
+            lastSeen = recently_checked_context[tootUrl]['lastSeen']
+            userAge = datetime.now(lastSeen.tzinfo) - lastSeen
+            # Don't really need to keep track for more than 7 days: if we haven't seen it in 7 days we can refetch content anyway
+            if(userAge.total_seconds() > 7 * 24 * 60 * 60):
+                recently_checked_context.pop(tootUrl)
+
         parsed_urls = {}
 
         all_known_users = OrderedSet(list(known_followings) + list(recently_checked_users))
@@ -1470,6 +1536,9 @@ if __name__ == "__main__":
         with open(SEEN_HOSTS_FILE, "w", encoding="utf-8") as f:
             f.write(seen_hosts.toJSON())
 
+        with open(RECENTLY_CHECKED_CONTEXTS_FILE, "w", encoding="utf-8") as f:
+            f.write(json.dumps(recently_checked_context, default=str))
+
         os.remove(LOCK_FILE)
 
         if(arguments.on_done != None and arguments.on_done != ''):