Merge pull request #161 from bentasker/instance_banlist
feat: add support for instance banlist
This commit is contained in:
commit
fe6ce1af43
3 changed files with 10 additions and 1 deletions
|
|
@ -151,6 +151,7 @@ Option | Required? | Notes |
|
||||||
|:----------------------------------------------------|-----------|:------|
|
|:----------------------------------------------------|-----------|:------|
|
||||||
|`access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called `ACCESS_TOKEN`. If running as a cron job or a container, you can supply this option as array, to [fetch posts for multiple users](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher) on your instance. To set tokens for multiple users using environment variables, define multiple environment variables with `FF_ACCESS_TOKEN` prefix, eg. `FF_ACCESS_TOKEN_USER1=…` and `FF_ACCESS_TOKEN_USER2=…`|
|
|`access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called `ACCESS_TOKEN`. If running as a cron job or a container, you can supply this option as array, to [fetch posts for multiple users](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher) on your instance. To set tokens for multiple users using environment variables, define multiple environment variables with `FF_ACCESS_TOKEN` prefix, eg. `FF_ACCESS_TOKEN_USER1=…` and `FF_ACCESS_TOKEN_USER2=…`|
|
||||||
|`server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. |
|
|`server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. |
|
||||||
|
|`instance-blocklist` | No | A comma-separated list of instance domains that FediFetcher should never attempt to connect to.
|
||||||
|`home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. Recommended value: `200`.
|
|`home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. Recommended value: `200`.
|
||||||
| `max-bookmarks` | No | Provide to fetch remote replies to any posts you have bookmarked. Determines how many of your bookmarks you want to get replies to. Recommended value: `80`. Requires an access token with `read:bookmarks` scope.
|
| `max-bookmarks` | No | Provide to fetch remote replies to any posts you have bookmarked. Determines how many of your bookmarks you want to get replies to. Recommended value: `80`. Requires an access token with `read:bookmarks` scope.
|
||||||
| `max-favourites` | No | Provide to fetch remote replies to any posts you have favourited. Determines how many of your favourites you want to get replies to. Recommended value: `40`. Requires an access token with `read:favourites` scope.
|
| `max-favourites` | No | Provide to fetch remote replies to any posts you have favourited. Determines how many of your favourites you want to get replies to. Recommended value: `40`. Requires an access token with `read:favourites` scope.
|
||||||
|
|
|
||||||
|
|
@ -53,6 +53,7 @@ argparser.add_argument('--max-list-length', required=False, type=int, default=10
|
||||||
argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.")
|
argparser.add_argument('--max-list-accounts', required=False, type=int, default=10, help="Determines how many accounts we'll backfill for in each list. This will be ignored, unless you also provide `from-lists = 1`. Set to `0` if you only want to fetch replies in lists.")
|
||||||
argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)")
|
argparser.add_argument('--log-level', required=False, default="DEBUG", help="Severity of events to log (DEBUG|INFO|WARNING|ERROR|CRITICAL)")
|
||||||
argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format")
|
argparser.add_argument('--log-format', required=False, type=str, default="%(asctime)s: %(message)s",help="Specify the log format")
|
||||||
|
argparser.add_argument('--instance-blocklist', required=False, type=str, default="",help="A comma-seperated array of instances that FediFetcher should never try to connect to")
|
||||||
|
|
||||||
def get_notification_users(server, access_token, known_users, max_age):
|
def get_notification_users(server, access_token, known_users, max_age):
|
||||||
since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
|
since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
|
||||||
|
|
@ -1120,6 +1121,10 @@ def can_fetch(user_agent, url):
|
||||||
parsed_uri = urlparse(url)
|
parsed_uri = urlparse(url)
|
||||||
robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
|
robots_url = '{uri.scheme}://{uri.netloc}/robots.txt'.format(uri=parsed_uri)
|
||||||
|
|
||||||
|
if parsed_uri.netloc in INSTANCE_BLOCKLIST:
|
||||||
|
# Never connect to these locations
|
||||||
|
raise Exception(f"Connecting to {parsed_uri.netloc} is prohibited by the configured blocklist")
|
||||||
|
|
||||||
robotsTxt = get_robots_from_url(robots_url)
|
robotsTxt = get_robots_from_url(robots_url)
|
||||||
if isinstance(robotsTxt, bool):
|
if isinstance(robotsTxt, bool):
|
||||||
return robotsTxt
|
return robotsTxt
|
||||||
|
|
@ -1501,7 +1506,8 @@ if __name__ == "__main__":
|
||||||
"on_done",
|
"on_done",
|
||||||
"on_fail",
|
"on_fail",
|
||||||
"log_level",
|
"log_level",
|
||||||
"log_format"
|
"log_format",
|
||||||
|
"instance_blocklist"
|
||||||
]:
|
]:
|
||||||
value = int(value)
|
value = int(value)
|
||||||
setattr(arguments, envvar, value)
|
setattr(arguments, envvar, value)
|
||||||
|
|
@ -1572,6 +1578,7 @@ if __name__ == "__main__":
|
||||||
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
|
SEEN_HOSTS_FILE = os.path.join(arguments.state_dir, "seen_hosts")
|
||||||
RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
|
RECENTLY_CHECKED_CONTEXTS_FILE = os.path.join(arguments.state_dir, 'recent_context')
|
||||||
|
|
||||||
|
INSTANCE_BLOCKLIST = [x.strip() for x in arguments.instance_blocklist.split(",")]
|
||||||
ROBOTS_TXT = {}
|
ROBOTS_TXT = {}
|
||||||
|
|
||||||
seen_urls = OrderedSet([])
|
seen_urls = OrderedSet([])
|
||||||
|
|
|
||||||
|
|
@ -1446,6 +1446,7 @@ def test_can_fetch(mock_robotFileParser, mock_get_robots_from_url):
|
||||||
# Prepare mocks
|
# Prepare mocks
|
||||||
mock_robotsTxt = MagicMock()
|
mock_robotsTxt = MagicMock()
|
||||||
mock_robotParser = MagicMock()
|
mock_robotParser = MagicMock()
|
||||||
|
find_posts.INSTANCE_BLOCKLIST = []
|
||||||
|
|
||||||
# Mock return values
|
# Mock return values
|
||||||
mock_get_robots_from_url.return_value = mock_robotsTxt
|
mock_get_robots_from_url.return_value = mock_robotsTxt
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue