mirror of
https://github.com/nicholasr-itsulu/FediFetcher.git
synced 2026-05-30 23:41:26 +00:00
Merge pull request #164 from AndrewKvalheim/trivial
Remove unused code, correct spelling
This commit is contained in:
commit
d6066f34c8
1 changed file with 65 additions and 66 deletions
131
find_posts.py
131
find_posts.py
|
|
@ -1,7 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import string
|
|
||||||
from dateutil import parser
|
from dateutil import parser
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
|
|
@ -53,7 +52,7 @@ argparser.add_argument('--max-list-length', required=False, type=int, default=10
|
||||||
argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.")
|
argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.")
|
||||||
argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)")
|
argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)")
|
||||||
argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format")
|
argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format")
|
||||||
argparser.add_argument('--instance-blocklist', required=False, type=str, default="",help="A comma-seperated array of instances that FediFetcher should never try to connect to")
|
argparser.add_argument('--instance-blocklist', required=False, type=str, default="",help="A comma-separated array of instances that FediFetcher should never try to connect to")
|
||||||
|
|
||||||
def get_notification_users(server, access_token, known_users, max_age):
|
def get_notification_users(server, access_token, known_users, max_age):
|
||||||
since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
|
since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
|
||||||
|
|
@ -100,7 +99,7 @@ def add_user_posts(server, access_token, followings, known_followings, all_known
|
||||||
failed += 1
|
failed += 1
|
||||||
logger.info(f"Added {count} posts for user {user['acct']} with {failed} errors")
|
logger.info(f"Added {count} posts for user {user['acct']} with {failed} errors")
|
||||||
if failed == 0:
|
if failed == 0:
|
||||||
known_followings.add(user['acct'])
|
known_followings.add(user['acct'])
|
||||||
all_known_users.add(user['acct'])
|
all_known_users.add(user['acct'])
|
||||||
|
|
||||||
def add_post_with_context(post, server, access_token, seen_urls, seen_hosts):
|
def add_post_with_context(post, server, access_token, seen_urls, seen_hosts):
|
||||||
|
|
@ -115,7 +114,7 @@ def add_post_with_context(post, server, access_token, seen_urls, seen_hosts):
|
||||||
known_context_urls = get_all_known_context_urls(server, [post],parsed_urls, seen_hosts)
|
known_context_urls = get_all_known_context_urls(server, [post],parsed_urls, seen_hosts)
|
||||||
add_context_urls(server, access_token, known_context_urls, seen_urls)
|
add_context_urls(server, access_token, known_context_urls, seen_urls)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def user_has_opted_out(user):
|
def user_has_opted_out(user):
|
||||||
|
|
@ -126,7 +125,7 @@ def user_has_opted_out(user):
|
||||||
if 'discoverable' in user and not user['discoverable']:
|
if 'discoverable' in user and not user['discoverable']:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def get_user_posts(user, known_followings, server, seen_hosts):
|
def get_user_posts(user, known_followings, server, seen_hosts):
|
||||||
if user_has_opted_out(user):
|
if user_has_opted_out(user):
|
||||||
|
|
@ -138,7 +137,7 @@ def get_user_posts(user, known_followings, server, seen_hosts):
|
||||||
# We are adding it as 'known' anyway, because we won't be able to fix this.
|
# We are adding it as 'known' anyway, because we won't be able to fix this.
|
||||||
known_followings.add(user['acct'])
|
known_followings.add(user['acct'])
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if(parsed_url[0] == server):
|
if(parsed_url[0] == server):
|
||||||
logger.debug(f"{user['acct']} is a local user. Skip")
|
logger.debug(f"{user['acct']} is a local user. Skip")
|
||||||
known_followings.add(user['acct'])
|
known_followings.add(user['acct'])
|
||||||
|
|
@ -157,7 +156,7 @@ def get_user_posts(user, known_followings, server, seen_hosts):
|
||||||
|
|
||||||
if post_server['misskeyApiSupport']:
|
if post_server['misskeyApiSupport']:
|
||||||
return get_user_posts_misskey(parsed_url[1], post_server['webserver'])
|
return get_user_posts_misskey(parsed_url[1], post_server['webserver'])
|
||||||
|
|
||||||
if post_server['peertubeApiSupport']:
|
if post_server['peertubeApiSupport']:
|
||||||
return get_user_posts_peertube(parsed_url[1], post_server['webserver'])
|
return get_user_posts_peertube(parsed_url[1], post_server['webserver'])
|
||||||
|
|
||||||
|
|
@ -219,11 +218,11 @@ def get_user_posts_lemmy(userName, userUrl, webserver):
|
||||||
for post in all_posts:
|
for post in all_posts:
|
||||||
post['url'] = post['ap_id']
|
post['url'] = post['ap_id']
|
||||||
return all_posts
|
return all_posts
|
||||||
|
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.error(f"Error getting user posts for user {userName}: {ex}")
|
logger.error(f"Error getting user posts for user {userName}: {ex}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_user_posts_peertube(userName, webserver):
|
def get_user_posts_peertube(userName, webserver):
|
||||||
try:
|
try:
|
||||||
url = f'https://{webserver}/api/v1/accounts/{userName}/videos'
|
url = f'https://{webserver}/api/v1/accounts/{userName}/videos'
|
||||||
|
|
@ -280,7 +279,7 @@ def get_user_posts_misskey(userName, webserver):
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.error(f"Error getting posts by user {userName} from {webserver}. Exception: {ex}")
|
logger.error(f"Error getting posts by user {userName} from {webserver}. Exception: {ex}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_new_follow_requests(server, access_token, max, known_followings):
|
def get_new_follow_requests(server, access_token, max, known_followings):
|
||||||
"""Get any new follow requests for the specified user, up to the max number provided"""
|
"""Get any new follow requests for the specified user, up to the max number provided"""
|
||||||
|
|
@ -289,11 +288,11 @@ def get_new_follow_requests(server, access_token, max, known_followings):
|
||||||
"Authorization": f"Bearer {access_token}",
|
"Authorization": f"Bearer {access_token}",
|
||||||
})
|
})
|
||||||
|
|
||||||
# Remove any we already know about
|
# Remove any we already know about
|
||||||
new_follow_requests = filter_known_users(follow_requests, known_followings)
|
new_follow_requests = filter_known_users(follow_requests, known_followings)
|
||||||
|
|
||||||
logger.info(f"Got {len(follow_requests)} follow_requests, {len(new_follow_requests)} of which are new")
|
logger.info(f"Got {len(follow_requests)} follow_requests, {len(new_follow_requests)} of which are new")
|
||||||
|
|
||||||
return new_follow_requests
|
return new_follow_requests
|
||||||
|
|
||||||
def filter_known_users(users, known_users):
|
def filter_known_users(users, known_users):
|
||||||
|
|
@ -306,24 +305,24 @@ def get_new_followers(server, user_id, max, known_followers):
|
||||||
"""Get any new followings for the specified user, up to the max number provided"""
|
"""Get any new followings for the specified user, up to the max number provided"""
|
||||||
followers = get_paginated_mastodon(f"https://{server}/api/v1/accounts/{user_id}/followers", max)
|
followers = get_paginated_mastodon(f"https://{server}/api/v1/accounts/{user_id}/followers", max)
|
||||||
|
|
||||||
# Remove any we already know about
|
# Remove any we already know about
|
||||||
new_followers = filter_known_users(followers, known_followers)
|
new_followers = filter_known_users(followers, known_followers)
|
||||||
|
|
||||||
logger.info(f"Got {len(followers)} followers, {len(new_followers)} of which are new")
|
logger.info(f"Got {len(followers)} followers, {len(new_followers)} of which are new")
|
||||||
|
|
||||||
return new_followers
|
return new_followers
|
||||||
|
|
||||||
def get_new_followings(server, user_id, max, known_followings):
|
def get_new_followings(server, user_id, max, known_followings):
|
||||||
"""Get any new followings for the specified user, up to the max number provided"""
|
"""Get any new followings for the specified user, up to the max number provided"""
|
||||||
following = get_paginated_mastodon(f"https://{server}/api/v1/accounts/{user_id}/following", max)
|
following = get_paginated_mastodon(f"https://{server}/api/v1/accounts/{user_id}/following", max)
|
||||||
|
|
||||||
# Remove any we already know about
|
# Remove any we already know about
|
||||||
new_followings = filter_known_users(following, known_followings)
|
new_followings = filter_known_users(following, known_followings)
|
||||||
|
|
||||||
logger.info(f"Got {len(following)} followings, {len(new_followings)} of which are new")
|
logger.info(f"Got {len(following)} followings, {len(new_followings)} of which are new")
|
||||||
|
|
||||||
return new_followings
|
return new_followings
|
||||||
|
|
||||||
|
|
||||||
def get_user_id(server, user = None, access_token = None):
|
def get_user_id(server, user = None, access_token = None):
|
||||||
"""Get the user id from the server, using a username"""
|
"""Get the user id from the server, using a username"""
|
||||||
|
|
@ -339,11 +338,11 @@ def get_user_id(server, user = None, access_token = None):
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
raise Exception('You must supply either a user name or an access token, to get an user ID')
|
raise Exception('You must supply either a user name or an access token, to get an user ID')
|
||||||
|
|
||||||
response = get(url, headers=headers)
|
response = get(url, headers=headers)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
return response.json()['id']
|
return response.json()['id']
|
||||||
elif response.status_code == 404:
|
elif response.status_code == 404:
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"User {user} was not found on server {server}."
|
f"User {user} was not found on server {server}."
|
||||||
|
|
@ -359,7 +358,7 @@ def get_timeline(server, access_token, max):
|
||||||
url = f"https://{server}/api/v1/timelines/home"
|
url = f"https://{server}/api/v1/timelines/home"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
response = get_toots(url, access_token)
|
response = get_toots(url, access_token)
|
||||||
|
|
||||||
if response.status_code == 200:
|
if response.status_code == 200:
|
||||||
|
|
@ -390,7 +389,7 @@ def get_timeline(server, access_token, max):
|
||||||
logger.info(f"Found {len(toots)} toots in timeline")
|
logger.info(f"Found {len(toots)} toots in timeline")
|
||||||
|
|
||||||
return toots
|
return toots
|
||||||
|
|
||||||
def get_toots(url, access_token):
|
def get_toots(url, access_token):
|
||||||
response = get( url, headers={
|
response = get( url, headers={
|
||||||
"Authorization": f"Bearer {access_token}",
|
"Authorization": f"Bearer {access_token}",
|
||||||
|
|
@ -412,7 +411,7 @@ def get_toots(url, access_token):
|
||||||
raise Exception(
|
raise Exception(
|
||||||
f"Error getting URL {url}. Status code: {response.status_code}"
|
f"Error getting URL {url}. Status code: {response.status_code}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_active_user_ids(server, access_token, reply_interval_hours):
|
def get_active_user_ids(server, access_token, reply_interval_hours):
|
||||||
"""get all user IDs on the server that have posted a toot in the given
|
"""get all user IDs on the server that have posted a toot in the given
|
||||||
time interval"""
|
time interval"""
|
||||||
|
|
@ -529,12 +528,12 @@ def toot_context_should_be_fetched(toot):
|
||||||
if(lastSeenInSeconds >= 60 * 60):
|
if(lastSeenInSeconds >= 60 * 60):
|
||||||
# After that: hourly
|
# After that: hourly
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
|
def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
|
||||||
"""get the context toots of the given toots from their original server"""
|
"""get the context toots of the given toots from their original server"""
|
||||||
known_context_urls = set()
|
known_context_urls = set()
|
||||||
|
|
||||||
for toot in reply_toots:
|
for toot in reply_toots:
|
||||||
if toot_has_parseable_url(toot, parsed_urls):
|
if toot_has_parseable_url(toot, parsed_urls):
|
||||||
url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
|
url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
|
||||||
|
|
@ -547,10 +546,10 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls, seen_hosts):
|
||||||
known_context_urls.add(item)
|
known_context_urls.add(item)
|
||||||
else:
|
else:
|
||||||
logger.error(f"Error getting context for toot {url}")
|
logger.error(f"Error getting context for toot {url}")
|
||||||
|
|
||||||
known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
|
known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
|
||||||
logger.info(f"Found {len(known_context_urls)} known context toots")
|
logger.info(f"Found {len(known_context_urls)} known context toots")
|
||||||
|
|
||||||
return known_context_urls
|
return known_context_urls
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -559,7 +558,7 @@ def toot_has_parseable_url(toot,parsed_urls):
|
||||||
if(parsed is None) :
|
if(parsed is None) :
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def get_all_replied_toot_server_ids(
|
def get_all_replied_toot_server_ids(
|
||||||
server, reply_toots, replied_toot_server_ids, parsed_urls
|
server, reply_toots, replied_toot_server_ids, parsed_urls
|
||||||
|
|
@ -610,7 +609,7 @@ def parse_user_url(url):
|
||||||
match = parse_mastodon_profile_url(url)
|
match = parse_mastodon_profile_url(url)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
return match
|
return match
|
||||||
|
|
||||||
match = parse_pleroma_profile_url(url)
|
match = parse_pleroma_profile_url(url)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
return match
|
return match
|
||||||
|
|
@ -629,7 +628,7 @@ def parse_user_url(url):
|
||||||
return match
|
return match
|
||||||
|
|
||||||
logger.error(f"Error parsing Profile URL {url}")
|
logger.error(f"Error parsing Profile URL {url}")
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def parse_url(url, parsed_urls):
|
def parse_url(url, parsed_urls):
|
||||||
|
|
@ -642,7 +641,7 @@ def parse_url(url, parsed_urls):
|
||||||
match = parse_mastodon_uri(url)
|
match = parse_mastodon_uri(url)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
parsed_urls[url] = match
|
parsed_urls[url] = match
|
||||||
|
|
||||||
if url not in parsed_urls:
|
if url not in parsed_urls:
|
||||||
match = parse_pleroma_url(url)
|
match = parse_pleroma_url(url)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
|
|
@ -671,7 +670,7 @@ def parse_url(url, parsed_urls):
|
||||||
if url not in parsed_urls:
|
if url not in parsed_urls:
|
||||||
logger.error(f"Error parsing toot URL {url}")
|
logger.error(f"Error parsing toot URL {url}")
|
||||||
parsed_urls[url] = None
|
parsed_urls[url] = None
|
||||||
|
|
||||||
return parsed_urls[url]
|
return parsed_urls[url]
|
||||||
|
|
||||||
def parse_mastodon_profile_url(url):
|
def parse_mastodon_profile_url(url):
|
||||||
|
|
@ -709,7 +708,7 @@ def parse_pleroma_url(url):
|
||||||
url = get_redirect_url(url)
|
url = get_redirect_url(url)
|
||||||
if url is None:
|
if url is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
match = re.match(r"/notice/(?P<toot_id>[^/]+)", url)
|
match = re.match(r"/notice/(?P<toot_id>[^/]+)", url)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
return (server, match.group("toot_id"))
|
return (server, match.group("toot_id"))
|
||||||
|
|
@ -872,7 +871,7 @@ def get_lemmy_comment_context(webserver, toot_id, toot_url):
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.error(f"Error getting comment {toot_id} from {toot_url}. Exception: {ex}")
|
logger.error(f"Error getting comment {toot_id} from {toot_url}. Exception: {ex}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
try:
|
try:
|
||||||
res = resp.json()
|
res = resp.json()
|
||||||
|
|
@ -929,7 +928,7 @@ def get_peertube_urls(webserver, post_id, toot_url):
|
||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
logger.error(f"Error getting comments on video {post_id} from {toot_url}. Exception: {ex}")
|
logger.error(f"Error getting comments on video {post_id} from {toot_url}. Exception: {ex}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
return [comment['url'] for comment in resp.json()['data']]
|
return [comment['url'] for comment in resp.json()['data']]
|
||||||
|
|
||||||
|
|
@ -1019,7 +1018,7 @@ def add_context_url(url, server, access_token):
|
||||||
f"Error adding url {search_url} to server {server}. Status code: {resp.status_code}"
|
f"Error adding url {search_url} to server {server}. Status code: {resp.status_code}"
|
||||||
)
|
)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
|
def get_paginated_mastodon(url, max, headers = {}, timeout = 0, max_tries = 5):
|
||||||
"""Make a paginated request to mastodon"""
|
"""Make a paginated request to mastodon"""
|
||||||
if(isinstance(max, int)):
|
if(isinstance(max, int)):
|
||||||
|
|
@ -1084,7 +1083,7 @@ def get_cached_robots(robots_url):
|
||||||
## firstly: check the in-memory cache
|
## firstly: check the in-memory cache
|
||||||
if robots_url in ROBOTS_TXT:
|
if robots_url in ROBOTS_TXT:
|
||||||
return ROBOTS_TXT[robots_url]
|
return ROBOTS_TXT[robots_url]
|
||||||
|
|
||||||
robotsCachePath = get_robots_txt_cache_path(robots_url)
|
robotsCachePath = get_robots_txt_cache_path(robots_url)
|
||||||
if os.path.exists(robotsCachePath):
|
if os.path.exists(robotsCachePath):
|
||||||
with open(robotsCachePath, "r", encoding="utf-8") as f:
|
with open(robotsCachePath, "r", encoding="utf-8") as f:
|
||||||
|
|
@ -1092,14 +1091,14 @@ def get_cached_robots(robots_url):
|
||||||
robotsTxt = f.read()
|
robotsTxt = f.read()
|
||||||
ROBOTS_TXT[robots_url] = robotsTxt
|
ROBOTS_TXT[robots_url] = robotsTxt
|
||||||
return robotsTxt
|
return robotsTxt
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_robots_from_url(robots_url):
|
def get_robots_from_url(robots_url):
|
||||||
robotsTxt = get_cached_robots(robots_url)
|
robotsTxt = get_cached_robots(robots_url)
|
||||||
if robotsTxt != None:
|
if robotsTxt != None:
|
||||||
return robotsTxt
|
return robotsTxt
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
|
# We are getting the robots.txt manually from here, because otherwise we can't change the User Agent
|
||||||
robotsTxt = get(robots_url, timeout = 2, ignore_robots_txt=True)
|
robotsTxt = get(robots_url, timeout = 2, ignore_robots_txt=True)
|
||||||
|
|
@ -1110,7 +1109,7 @@ def get_robots_from_url(robots_url):
|
||||||
with open(get_robots_txt_cache_path(robots_url), "w", encoding="utf-8") as f:
|
with open(get_robots_txt_cache_path(robots_url), "w", encoding="utf-8") as f:
|
||||||
f.write(robotsTxt)
|
f.write(robotsTxt)
|
||||||
|
|
||||||
except Exception as ex:
|
except Exception:
|
||||||
robotsTxt = True
|
robotsTxt = True
|
||||||
|
|
||||||
ROBOTS_TXT[robots_url] = robotsTxt
|
ROBOTS_TXT[robots_url] = robotsTxt
|
||||||
|
|
@ -1128,7 +1127,7 @@ def can_fetch(user_agent, url):
|
||||||
robotsTxt = get_robots_from_url(robots_url)
|
robotsTxt = get_robots_from_url(robots_url)
|
||||||
if isinstance(robotsTxt, bool):
|
if isinstance(robotsTxt, bool):
|
||||||
return robotsTxt
|
return robotsTxt
|
||||||
|
|
||||||
robotParser = urllib.robotparser.RobotFileParser()
|
robotParser = urllib.robotparser.RobotFileParser()
|
||||||
robotParser.parse(robotsTxt.splitlines())
|
robotParser.parse(robotsTxt.splitlines())
|
||||||
return robotParser.can_fetch(user_agent, url)
|
return robotParser.can_fetch(user_agent, url)
|
||||||
|
|
@ -1144,11 +1143,11 @@ def get(url, headers = {}, timeout = 0, max_tries = 5, ignore_robots_txt = False
|
||||||
h['User-Agent'] = user_agent()
|
h['User-Agent'] = user_agent()
|
||||||
|
|
||||||
if not ignore_robots_txt and not can_fetch(h['User-Agent'], url):
|
if not ignore_robots_txt and not can_fetch(h['User-Agent'], url):
|
||||||
raise Exception(f"Querying {url} prohibited by robots.txt")
|
raise Exception(f"Querying {url} prohibited by robots.txt")
|
||||||
|
|
||||||
if timeout == 0:
|
if timeout == 0:
|
||||||
timeout = arguments.http_timeout
|
timeout = arguments.http_timeout
|
||||||
|
|
||||||
response = requests.get( url, headers= h, timeout=timeout)
|
response = requests.get( url, headers= h, timeout=timeout)
|
||||||
if response.status_code == 429:
|
if response.status_code == 429:
|
||||||
if max_tries > 0:
|
if max_tries > 0:
|
||||||
|
|
@ -1158,7 +1157,7 @@ def get(url, headers = {}, timeout = 0, max_tries = 5, ignore_robots_txt = False
|
||||||
logger.warning(f"Rate Limit hit requesting {url}. Waiting {wait} sec to retry at {response.headers['x-ratelimit-reset']}")
|
logger.warning(f"Rate Limit hit requesting {url}. Waiting {wait} sec to retry at {response.headers['x-ratelimit-reset']}")
|
||||||
time.sleep(wait)
|
time.sleep(wait)
|
||||||
return get(url, headers, timeout, max_tries - 1)
|
return get(url, headers, timeout, max_tries - 1)
|
||||||
|
|
||||||
raise Exception(f"Maximum number of retries exceeded for rate limited request {url}")
|
raise Exception(f"Maximum number of retries exceeded for rate limited request {url}")
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
@ -1169,8 +1168,8 @@ def post(url, json, headers = {}, timeout = 0, max_tries = 5):
|
||||||
h['User-Agent'] = user_agent()
|
h['User-Agent'] = user_agent()
|
||||||
|
|
||||||
if not can_fetch(h['User-Agent'], url):
|
if not can_fetch(h['User-Agent'], url):
|
||||||
raise Exception(f"Querying {url} prohibited by robots.txt")
|
raise Exception(f"Querying {url} prohibited by robots.txt")
|
||||||
|
|
||||||
if timeout == 0:
|
if timeout == 0:
|
||||||
timeout = arguments.http_timeout
|
timeout = arguments.http_timeout
|
||||||
|
|
||||||
|
|
@ -1200,10 +1199,10 @@ class ServerList:
|
||||||
|
|
||||||
def get(self, key):
|
def get(self, key):
|
||||||
return self._dict[key]
|
return self._dict[key]
|
||||||
|
|
||||||
def pop(self,key):
|
def pop(self,key):
|
||||||
return self._dict.pop(key)
|
return self._dict.pop(key)
|
||||||
|
|
||||||
def __contains__(self, item):
|
def __contains__(self, item):
|
||||||
return item in self._dict
|
return item in self._dict
|
||||||
|
|
||||||
|
|
@ -1212,7 +1211,7 @@ class ServerList:
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self._dict)
|
return len(self._dict)
|
||||||
|
|
||||||
def toJSON(self):
|
def toJSON(self):
|
||||||
return json.dumps(self._dict,default=str)
|
return json.dumps(self._dict,default=str)
|
||||||
|
|
||||||
|
|
@ -1241,7 +1240,7 @@ class OrderedSet:
|
||||||
|
|
||||||
def pop(self, item):
|
def pop(self, item):
|
||||||
self._dict.pop(item)
|
self._dict.pop(item)
|
||||||
|
|
||||||
def get(self, item):
|
def get(self, item):
|
||||||
return self._dict[item]
|
return self._dict[item]
|
||||||
|
|
||||||
|
|
@ -1257,7 +1256,7 @@ class OrderedSet:
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self._dict)
|
return len(self._dict)
|
||||||
|
|
||||||
def toJSON(self):
|
def toJSON(self):
|
||||||
return json.dumps(self._dict,default=str)
|
return json.dumps(self._dict,default=str)
|
||||||
|
|
||||||
|
|
@ -1516,15 +1515,15 @@ if __name__ == "__main__":
|
||||||
if tokens := [token for envvar, token in os.environ.items() if envvar.lower().startswith("ff_access_token")]:
|
if tokens := [token for envvar, token in os.environ.items() if envvar.lower().startswith("ff_access_token")]:
|
||||||
arguments.access_token = tokens
|
arguments.access_token = tokens
|
||||||
|
|
||||||
logger.info(f"Starting FediFetcher")
|
logger.info("Starting FediFetcher")
|
||||||
|
|
||||||
if(arguments.server == None or arguments.access_token == None):
|
if(arguments.server == None or arguments.access_token == None):
|
||||||
logger.critical("You must supply at least a server name and an access token")
|
logger.critical("You must supply at least a server name and an access token")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# in case someone provided the server name as url instead,
|
# in case someone provided the server name as url instead,
|
||||||
setattr(arguments, 'server', re.sub(r"^(https://)?([^/]*)/?$", "\\2", arguments.server))
|
setattr(arguments, 'server', re.sub(r"^(https://)?([^/]*)/?$", "\\2", arguments.server))
|
||||||
|
|
||||||
|
|
||||||
runId = uuid.uuid4()
|
runId = uuid.uuid4()
|
||||||
|
|
||||||
|
|
@ -1545,9 +1544,9 @@ if __name__ == "__main__":
|
||||||
with open(LOCK_FILE, "r", encoding="utf-8") as f:
|
with open(LOCK_FILE, "r", encoding="utf-8") as f:
|
||||||
lock_time = parser.parse(f.read())
|
lock_time = parser.parse(f.read())
|
||||||
|
|
||||||
if (datetime.now() - lock_time).total_seconds() >= arguments.lock_hours * 60 * 60:
|
if (datetime.now() - lock_time).total_seconds() >= arguments.lock_hours * 60 * 60:
|
||||||
os.remove(LOCK_FILE)
|
os.remove(LOCK_FILE)
|
||||||
logger.debug(f"Lock file has expired. Removed lock file.")
|
logger.debug("Lock file has expired. Removed lock file.")
|
||||||
else:
|
else:
|
||||||
logger.critical(f"Lock file age is {datetime.now() - lock_time} - below --lock-hours={arguments.lock_hours} provided.")
|
logger.critical(f"Lock file age is {datetime.now() - lock_time} - below --lock-hours={arguments.lock_hours} provided.")
|
||||||
if(arguments.on_fail != None and arguments.on_fail != ''):
|
if(arguments.on_fail != None and arguments.on_fail != ''):
|
||||||
|
|
@ -1558,7 +1557,7 @@ if __name__ == "__main__":
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.critical(f"Cannot read logfile age - aborting.")
|
logger.critical("Cannot read logfile age - aborting.")
|
||||||
if(arguments.on_fail != None and arguments.on_fail != ''):
|
if(arguments.on_fail != None and arguments.on_fail != ''):
|
||||||
try:
|
try:
|
||||||
get(f"{arguments.on_fail}?rid={runId}", ignore_robots_txt = True)
|
get(f"{arguments.on_fail}?rid={runId}", ignore_robots_txt = True)
|
||||||
|
|
@ -1606,7 +1605,7 @@ if __name__ == "__main__":
|
||||||
lastCheck = recently_checked_users.get(user)
|
lastCheck = recently_checked_users.get(user)
|
||||||
userAge = datetime.now(lastCheck.tzinfo) - lastCheck
|
userAge = datetime.now(lastCheck.tzinfo) - lastCheck
|
||||||
if(userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60):
|
if(userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60):
|
||||||
recently_checked_users.pop(user)
|
recently_checked_users.pop(user)
|
||||||
|
|
||||||
recently_checked_context = {}
|
recently_checked_context = {}
|
||||||
if(os.path.exists(RECENTLY_CHECKED_CONTEXTS_FILE)):
|
if(os.path.exists(RECENTLY_CHECKED_CONTEXTS_FILE)):
|
||||||
|
|
@ -1621,7 +1620,7 @@ if __name__ == "__main__":
|
||||||
userAge = datetime.now(lastSeen.tzinfo) - lastSeen
|
userAge = datetime.now(lastSeen.tzinfo) - lastSeen
|
||||||
# dont really need to keep track for more than 7 days: if we haven't seen it in 7 days we can refetch content anyway
|
# dont really need to keep track for more than 7 days: if we haven't seen it in 7 days we can refetch content anyway
|
||||||
if(userAge.total_seconds() > 7 * 24 * 60 * 60):
|
if(userAge.total_seconds() > 7 * 24 * 60 * 60):
|
||||||
recently_checked_context.pop(tootUrl)
|
recently_checked_context.pop(tootUrl)
|
||||||
|
|
||||||
parsed_urls = {}
|
parsed_urls = {}
|
||||||
|
|
||||||
|
|
@ -1652,7 +1651,7 @@ if __name__ == "__main__":
|
||||||
if os.path.getmtime(file_path) < time.time() - 60 * 60 * 24:
|
if os.path.getmtime(file_path) < time.time() - 60 * 60 * 24:
|
||||||
logger.debug(f"Removing cached robots.txt file {file_name}")
|
logger.debug(f"Removing cached robots.txt file {file_name}")
|
||||||
os.remove(file_path)
|
os.remove(file_path)
|
||||||
|
|
||||||
|
|
||||||
if(isinstance(arguments.access_token, str)):
|
if(isinstance(arguments.access_token, str)):
|
||||||
setattr(arguments, 'access_token', [arguments.access_token])
|
setattr(arguments, 'access_token', [arguments.access_token])
|
||||||
|
|
@ -1692,7 +1691,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
if arguments.home_timeline_length > 0:
|
if arguments.home_timeline_length > 0:
|
||||||
"""Do the same with any toots on the key owner's home timeline """
|
"""Do the same with any toots on the key owner's home timeline """
|
||||||
logger.info(f"Getting context for home timeline")
|
logger.info("Getting context for home timeline")
|
||||||
timeline_toots = get_timeline(arguments.server, token, arguments.home_timeline_length)
|
timeline_toots = get_timeline(arguments.server, token, arguments.home_timeline_length)
|
||||||
fetch_timeline_context(timeline_toots, token, parsed_urls, seen_hosts, seen_urls, all_known_users, recently_checked_users)
|
fetch_timeline_context(timeline_toots, token, parsed_urls, seen_hosts, seen_urls, all_known_users, recently_checked_users)
|
||||||
|
|
||||||
|
|
@ -1701,7 +1700,7 @@ if __name__ == "__main__":
|
||||||
user_id = get_user_id(arguments.server, arguments.user, token)
|
user_id = get_user_id(arguments.server, arguments.user, token)
|
||||||
followings = get_new_followings(arguments.server, user_id, arguments.max_followings, all_known_users)
|
followings = get_new_followings(arguments.server, user_id, arguments.max_followings, all_known_users)
|
||||||
add_user_posts(arguments.server, token, followings, known_followings, all_known_users, seen_urls, seen_hosts)
|
add_user_posts(arguments.server, token, followings, known_followings, all_known_users, seen_urls, seen_hosts)
|
||||||
|
|
||||||
if arguments.max_followers > 0:
|
if arguments.max_followers > 0:
|
||||||
logger.info(f"Getting posts from last {arguments.max_followers} followers")
|
logger.info(f"Getting posts from last {arguments.max_followers} followers")
|
||||||
user_id = get_user_id(arguments.server, arguments.user, token)
|
user_id = get_user_id(arguments.server, arguments.user, token)
|
||||||
|
|
@ -1758,7 +1757,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
logger.info(f"Processing finished in {datetime.now() - start}.")
|
logger.info(f"Processing finished in {datetime.now() - start}.")
|
||||||
|
|
||||||
except Exception as ex:
|
except Exception:
|
||||||
os.remove(LOCK_FILE)
|
os.remove(LOCK_FILE)
|
||||||
logger.error(f"Job failed after {datetime.now() - start}.")
|
logger.error(f"Job failed after {datetime.now() - start}.")
|
||||||
if(arguments.on_fail != None and arguments.on_fail != ''):
|
if(arguments.on_fail != None and arguments.on_fail != ''):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue