From 6e1830f87cab4268f2920f1165d9b5b7c785ccec Mon Sep 17 00:00:00 2001 From: B Tasker Date: Sun, 18 Aug 2024 16:47:56 +0100 Subject: [PATCH 1/5] feat: implement new flag `--instance-blocklist` This can also be set via env var `FF_INSTANCE_BLOCKLIST` It should be a comma-separated list of domains --- find_posts.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/find_posts.py b/find_posts.py index 408f0f5..9e5da6c 100644 --- a/find_posts.py +++ b/find_posts.py @@ -53,6 +53,7 @@ argparser.add_argument('--max-list-length', required=False, type=int, default=10 argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.") argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)") argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format") +argparser.add_argument('--instance-blocklist', required=False, type=str, default="",help="A comma-seperated array of instances that FediFetcher should never try to connect to") def get_notification_users(server, access_token, known_users, max_age): since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age) @@ -1501,7 +1502,8 @@ if __name__ == "__main__": "on_done", "on_fail", "log_level", - "log_format" + "log_format", + "instance_blocklist" ]: value = int(value) setattr(arguments, envvar, value) @@ -1572,8 +1574,12 @@ if __name__ == "__main__": SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts") RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context') + INSTANCE_BLOCKLIST = arguments.instance_blocklist.split(",") ROBOTS_TXT = {} + print(INSTANCE_BLOCKLIST) + sys.exit() + seen_urls = 
OrderedSet([]) if os.path.exists(SEEN_URLS_FILE): with open(SEEN_URLS_FILE, "r", encoding="utf-8") as f: From 3eb2589a8be9307c6bca4924a27bee53764f540e Mon Sep 17 00:00:00 2001 From: B Tasker Date: Sun, 18 Aug 2024 16:58:11 +0100 Subject: [PATCH 2/5] feat: enforce blocklist (and remove testing statements) --- find_posts.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/find_posts.py b/find_posts.py index 9e5da6c..a303903 100644 --- a/find_posts.py +++ b/find_posts.py @@ -1121,6 +1121,10 @@ def can_fetch(user_agent, url): parsed_uri = urlparse(url) robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri) + if parsed_uri.netloc in INSTANCE_BLOCKLIST: + # Never connect to these locations + raise Exception(f"Connecting to {parsed_uri.netloc} is prohibited by the configured blocklist") + robotsTxt = get_robots_from_url(robots_url) if isinstance(robotsTxt, bool): return robotsTxt @@ -1577,9 +1581,6 @@ if __name__ == "__main__": INSTANCE_BLOCKLIST = arguments.instance_blocklist.split(",") ROBOTS_TXT = {} - print(INSTANCE_BLOCKLIST) - sys.exit() - seen_urls = OrderedSet([]) if os.path.exists(SEEN_URLS_FILE): with open(SEEN_URLS_FILE, "r", encoding="utf-8") as f: From b7bd6a92f3e8bbc7b71fe6607d2293f8a1eba659 Mon Sep 17 00:00:00 2001 From: B Tasker Date: Sun, 18 Aug 2024 17:00:36 +0100 Subject: [PATCH 3/5] docs: update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 72e41bd..334a38c 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,7 @@ Option | Required? | Notes | |:----------------------------------------------------|-----------|:------| |`access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called `ACCESS_TOKEN`. If running as a cron job or a container, you can supply this option as array, to [fetch posts for multiple users](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher) on your instance. 
To set tokens for multiple users using environment variables, define multiple environment variables with `FF_ACCESS_TOKEN` prefix, eg. `FF_ACCESS_TOKEN_USER1=…` and `FF_ACCESS_TOKEN_USER2=…`| |`server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. | +|`instance-blocklist` | No | A comma-separated list of instance domains that FediFetcher should never attempt to connect to. |`home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. Recommended value: `200`. | `max-bookmarks` | No | Provide to fetch remote replies to any posts you have bookmarked. Determines how many of your bookmarks you want to get replies to. Recommended value: `80`. Requires an access token with `read:bookmarks` scope. | `max-favourites` | No | Provide to fetch remote replies to any posts you have favourited. Determines how many of your favourites you want to get replies to. Recommended value: `40`. Requires an access token with `read:favourites` scope. 
From c9f6521a613a694ac434b56da6eb234241b9d171 Mon Sep 17 00:00:00 2001 From: B Tasker Date: Sun, 18 Aug 2024 17:03:20 +0100 Subject: [PATCH 4/5] fix: strip whitespace if it's included in the list --- find_posts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/find_posts.py b/find_posts.py index a303903..29339d6 100644 --- a/find_posts.py +++ b/find_posts.py @@ -1578,7 +1578,7 @@ if __name__ == "__main__": SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts") RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context') - INSTANCE_BLOCKLIST = arguments.instance_blocklist.split(",") + INSTANCE_BLOCKLIST = [x.strip() for x in arguments.instance_blocklist.split(",")] ROBOTS_TXT = {} seen_urls = OrderedSet([]) From ed40ff3e70d55bec52c0e04b1f7d78064ab3e7ec Mon Sep 17 00:00:00 2001 From: B Tasker Date: Sun, 18 Aug 2024 17:53:49 +0100 Subject: [PATCH 5/5] chore: fix test The test needed the new attribute adding --- tests/test_find_posts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_find_posts.py b/tests/test_find_posts.py index 4df950d..6e964ae 100644 --- a/tests/test_find_posts.py +++ b/tests/test_find_posts.py @@ -1446,6 +1446,7 @@ def test_can_fetch(mock_robotFileParser, mock_get_robots_from_url): # Prepare mocks mock_robotsTxt = MagicMock() mock_robotParser = MagicMock() + find_posts.INSTANCE_BLOCKLIST = [] # Mock return values mock_get_robots_from_url.return_value = mock_robotsTxt