Merge pull request #161 from bentasker/instance_banlist

feat: add support for instance banlist
This commit is contained in:
Michael 2024-09-02 17:07:21 +01:00 committed by GitHub
commit fe6ce1af43
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 10 additions and 1 deletions

View file

@ -151,6 +151,7 @@ Option | Required? | Notes |
|:----------------------------------------------------|-----------|:------| |:----------------------------------------------------|-----------|:------|
|`access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called `ACCESS_TOKEN`. If running as a cron job or a container, you can supply this option as array, to [fetch posts for multiple users](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher) on your instance. To set tokens for multiple users using environment variables, define multiple environment variables with `FF_ACCESS_TOKEN` prefix, eg. `FF_ACCESS_TOKEN_USER1=…` and `FF_ACCESS_TOKEN_USER2=…`| |`access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called `ACCESS_TOKEN`. If running as a cron job or a container, you can supply this option as array, to [fetch posts for multiple users](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher) on your instance. To set tokens for multiple users using environment variables, define multiple environment variables with `FF_ACCESS_TOKEN` prefix, eg. `FF_ACCESS_TOKEN_USER1=…` and `FF_ACCESS_TOKEN_USER2=…`|
|`server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. | |`server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. |
|`instance-blocklist` | No | A comma-separated list of instance domains that FediFetcher should never attempt to connect to.
|`home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. Recommended value: `200`. |`home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. Recommended value: `200`.
| `max-bookmarks` | No | Provide to fetch remote replies to any posts you have bookmarked. Determines how many of your bookmarks you want to get replies to. Recommended value: `80`. Requires an access token with `read:bookmarks` scope. | `max-bookmarks` | No | Provide to fetch remote replies to any posts you have bookmarked. Determines how many of your bookmarks you want to get replies to. Recommended value: `80`. Requires an access token with `read:bookmarks` scope.
| `max-favourites` | No | Provide to fetch remote replies to any posts you have favourited. Determines how many of your favourites you want to get replies to. Recommended value: `40`. Requires an access token with `read:favourites` scope. | `max-favourites` | No | Provide to fetch remote replies to any posts you have favourited. Determines how many of your favourites you want to get replies to. Recommended value: `40`. Requires an access token with `read:favourites` scope.

View file

@ -53,6 +53,7 @@ argparser.add_argument('--max-list-length', required=False, type=int, default=10
argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.") argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.")
argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)") argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)")
argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format") argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format")
argparser.add_argument('--instance-blocklist', required=False, type=str, default="",help="A comma-seperated array of instances that FediFetcher should never try to connect to")
def get_notification_users(server, access_token, known_users, max_age): def get_notification_users(server, access_token, known_users, max_age):
since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age) since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
@ -1120,6 +1121,10 @@ def can_fetch(user_agent, url):
parsed_uri = urlparse(url) parsed_uri = urlparse(url)
robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri) robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
if parsed_uri.netloc in INSTANCE_BLOCKLIST:
# Never connect to these locations
raise Exception(f"Connecting to {parsed_uri.netloc} is prohibited by the configured blocklist")
robotsTxt = get_robots_from_url(robots_url) robotsTxt = get_robots_from_url(robots_url)
if isinstance(robotsTxt, bool): if isinstance(robotsTxt, bool):
return robotsTxt return robotsTxt
@ -1501,7 +1506,8 @@ if __name__ == "__main__":
"on_done", "on_done",
"on_fail", "on_fail",
"log_level", "log_level",
"log_format" "log_format",
"instance_blocklist"
]: ]:
value = int(value) value = int(value)
setattr(arguments, envvar, value) setattr(arguments, envvar, value)
@ -1572,6 +1578,7 @@ if __name__ == "__main__":
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts") SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context') RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
INSTANCE_BLOCKLIST = [x.strip() for x in arguments.instance_blocklist.split(",")]
ROBOTS_TXT = {} ROBOTS_TXT = {}
seen_urls = OrderedSet([]) seen_urls = OrderedSet([])

View file

@ -1446,6 +1446,7 @@ def test_can_fetch(mock_robotFileParser, mock_get_robots_from_url):
# Prepare mocks # Prepare mocks
mock_robotsTxt = MagicMock() mock_robotsTxt = MagicMock()
mock_robotParser = MagicMock() mock_robotParser = MagicMock()
find_posts.INSTANCE_BLOCKLIST = []
# Mock return values # Mock return values
mock_get_robots_from_url.return_value = mock_robotsTxt mock_get_robots_from_url.return_value = mock_robotsTxt