From bd3f317635ee3a9b734822941ef4e7b29948d735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Bidoul?= Date: Tue, 30 Nov 2021 18:44:45 +0100 Subject: [PATCH] Better error handling in k8s watches An automatic retry in the watch itself could lead to missed events, leading to removed builds remaining in the database. So we raise the error so the controller can reset the database and do a full refresh. --- src/runboat/controller.py | 13 ++++++++++--- src/runboat/k8s.py | 12 +++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/runboat/controller.py b/src/runboat/controller.py index cf0156e..f9bd1d6 100644 --- a/src/runboat/controller.py +++ b/src/runboat/controller.py @@ -15,6 +15,8 @@ _logger = logging.getLogger(__name__) # of the background tasks and the clearing of the wakeup avoids waking up the tasks # too often. EVENT_BUFFERING_DELAY = 1 +# When an exception happens in background tasks, restart them after a delay. +WALKING_DEAD_RESTART_DELAY = 5 class Controller: @@ -256,13 +258,18 @@ class Controller: _logger.info(f"(Re)starting {func.__name__}") try: await func() + except k8s.WatchException as e: + _logger.info( + f"Watch error {e} in {func.__name__}, " + f"restarting in {WALKING_DEAD_RESTART_DELAY} sec." + ) + await asyncio.sleep(WALKING_DEAD_RESTART_DELAY) except Exception: - delay = 5 _logger.exception( f"Unhandled exception in {func.__name__}, " - f"restarting in {delay} sec." + f"restarting in {WALKING_DEAD_RESTART_DELAY} sec." ) - await asyncio.sleep(delay) + await asyncio.sleep(WALKING_DEAD_RESTART_DELAY) for f in ( self.deployment_watcher, diff --git a/src/runboat/k8s.py b/src/runboat/k8s.py index d78062c..641982d 100644 --- a/src/runboat/k8s.py +++ b/src/runboat/k8s.py @@ -4,7 +4,6 @@ import os import shutil import subprocess import tempfile -import time from contextlib import contextmanager from enum import Enum from importlib import resources @@ -80,6 +79,10 @@ def patch_deployment( raise +class WatchException(Exception): + pass + + def _watch( list_method: Callable[..., Any], *args: Any, **kwargs: Any ) -> Generator[tuple[str | None, Any], None, None]: @@ -113,12 +116,7 @@ def _watch( except (urllib3.exceptions.TimeoutError, TimeoutError): continue except Exception as e: - delay = 5 - _logger.info( - f"Error {e} watching {list_method.__name__}. Retrying in {delay} sec." - ) - time.sleep(delay) - continue + raise WatchException(f"{e} in {list_method.__name__}") from e @sync_to_async_iterator