Add database support for measurements and historic prediction data. (#848)

The database supports backend selection, compression, incremental data load, automatic data saving to storage, automatic vaccum and compaction. Make SQLite3 and LMDB database backends available. Update tests for new interface conventions regarding data sequences, data containers, data providers. This includes the measurements provider and the prediction providers. Add database documentation. The fix includes several bug fixes that are not directly related to the database implementation but are necessary to keep EOS running properly and to test and document the changes. * fix: config eos test setup Make the config_eos fixture generate a new instance of the config_eos singleton. Use correct env names to setup data folder path. * fix: startup with no config Make cache and measurements complain about missing data path configuration but do not bail out. * fix: soc data preparation and usage for genetic optimization. Search for soc measurments 48 hours around the optimization start time. Only clamp soc to maximum in battery device simulation. * fix: dashboard bailout on zero value solution display Do not use zero values to calculate the chart values adjustment for display. * fix: openapi generation script Make the script also replace data_folder_path and data_output_path to hide real (test) environment pathes. * feat: add make repeated task function make_repeated_task allows to wrap a function to be repeated cyclically. * chore: removed index based data sequence access Index based data sequence access does not make sense as the sequence can be backed by the database. The sequence is now purely time series data. * chore: refactor eos startup to avoid module import startup Avoid module import initialisation expecially of the EOS configuration. Config mutation, singleton initialization, logging setup, argparse parsing, background task definitions depending on config and environment-dependent behavior is now done at function startup. * chore: introduce retention manager A single long-running background task that owns the scheduling of all periodic server-maintenance jobs (cache cleanup, DB autosave, …) * chore: canonicalize timezone name for UTC Timezone names that are semantically identical to UTC are canonicalized to UTC. * chore: extend config file migration for default value handling Extend the config file migration handling values None or nonexisting values that will invoke a default value generation in the new config file. Also adapt test to handle this situation. * chore: extend datetime util test cases * chore: make version test check for untracked files Check for files that are not tracked by git. Version calculation will be wrong if these files will not be commited. * chore: bump pandas to 3.0.0 Pandas 3.0 now performs inference on the appropriate resolution (a.k.a. unit) for the output dtype which may become datetime64[us] (before it was ns). Also numeric dtype detection is now more strict which needs a different detection for numerics. * chore: bump pydantic-settings to 2.12.0 pydantic-settings 2.12.0 under pytest creates a different behaviour. The tests were adapted and a workaround was introduced. Also ConfigEOS was adapted to allow for fine grain initialization control to be able to switch off certain settings such as file settings during test. * chore: remove sci learn kit from dependencies The sci learn kit is not strictly necessary as long as we have scipy. * chore: add documentation mode guarding for sphinx autosummary Sphinx autosummary excecutes functions. Prevent exceptions in case of pure doc mode. * chore: adapt docker-build CI workflow to stricter GitHub handling Signed-off-by: Bobby Noelte <b0661n0e17e@gmail.com>
2026-05-27 06:36:18 +00:00 · 2026-02-22 14:12:42 +01:00
parent 5f66591d21
commit 6498c7dc32
92 changed files with 12710 additions and 2173 deletions
--- a/src/akkudoktoreos/server/dash/admin.py
+++ b/src/akkudoktoreos/server/dash/admin.py
@@ -402,6 +402,75 @@ def AdminConfig(
    )


+def AdminDatabase(
+    eos_host: str, eos_port: Union[str, int], data: Optional[dict], config: Optional[dict[str, Any]]
+) -> tuple[str, Union[Card, list[Card]]]:
+    """Creates a cache management card.
+
+    Args:
+        eos_host (str): The hostname of the EOS server.
+        eos_port (Union[str, int]): The port of the EOS server.
+        data (Optional[dict]): Incoming data containing action and category for processing.
+
+    Returns:
+        tuple[str, Union[Card, list[Card]]]: A tuple containing the cache category label and the `Card` UI component.
+    """
+    server = f"http://{eos_host}:{eos_port}"
+    eos_hostname = "EOS server"
+    eosdash_hostname = "EOSdash server"
+
+    category = "database"
+
+    status_vacuum = (None,)
+    if data and data.get("category", None) == category:
+        # This data is for us
+        if data["action"] == "vacuum":
+            # Remove old records from database
+            try:
+                result = requests.post(f"{server}/v1/admin/database/vacuum", timeout=30)
+                result.raise_for_status()
+                status_vacuum = Success(
+                    f"Removed old data records from database on '{eos_hostname}'"
+                )
+            except requests.exceptions.HTTPError as e:
+                detail = result.json()["detail"]
+                status_vacuum = Error(
+                    f"Can not remove old data records from database on '{eos_hostname}': {e}, {detail}"
+                )
+            except Exception as e:
+                status_vacuum = Error(
+                    f"Can not remove old data records from database on '{eos_hostname}': {e}"
+                )
+
+    return (
+        category,
+        [
+            Card(
+                Details(
+                    Summary(
+                        Grid(
+                            DivHStacked(
+                                UkIcon(icon="play"),
+                                ConfigButton(
+                                    "Vacuum",
+                                    hx_post=request_url_for("/eosdash/admin"),
+                                    hx_target="#page-content",
+                                    hx_swap="innerHTML",
+                                    hx_vals='{"category": "database", "action": "vacuum"}',
+                                ),
+                                P(f"Remove old data records from database on '{eos_hostname}'"),
+                            ),
+                            status_vacuum,
+                        ),
+                        cls="list-none",
+                    ),
+                    P(f"Remove old data records from database on '{eos_hostname}'."),
+                ),
+            ),
+        ],
+    )
+
+
 def Admin(eos_host: str, eos_port: Union[str, int], data: Optional[dict] = None) -> Div:
    """Generates the administrative dashboard layout.

@@ -450,6 +519,7 @@ def Admin(eos_host: str, eos_port: Union[str, int], data: Optional[dict] = None)
    for category, admin in [
        AdminCache(eos_host, eos_port, data, config),
        AdminConfig(eos_host, eos_port, data, config, config_backup),
+        AdminDatabase(eos_host, eos_port, data, config),
    ]:
        if category != last_category:
            rows.append(H3(category))
--- a/src/akkudoktoreos/server/dash/footer.py
+++ b/src/akkudoktoreos/server/dash/footer.py
@@ -7,7 +7,7 @@ from monsterui.franken import A, ButtonT, DivFullySpaced, P
 from requests.exceptions import RequestException

 import akkudoktoreos.server.dash.eosstatus as eosstatus
-from akkudoktoreos.config.config import get_config
+from akkudoktoreos.core.coreabc import get_config


 def get_alive(eos_host: str, eos_port: Union[str, int]) -> str:
--- a/src/akkudoktoreos/server/dash/plan.py
+++ b/src/akkudoktoreos/server/dash/plan.py
@@ -206,13 +206,20 @@ def SolutionCard(solution: OptimizationSolution, config: SettingsEOS, data: Opti
        else:
            continue
    # Adjust to similar y-axis 0-point
+    values_min_max = [
+        (energy_wh_min, energy_wh_max),
+        (amt_kwh_min, amt_kwh_max),
+        (amt_min, amt_max),
+        (soc_factor_min, soc_factor_max),
+    ]
    # First get the maximum factor for the min value related the maximum value
-    min_max_factor = max(
-        (energy_wh_min * -1.0) / energy_wh_max,
-        (amt_kwh_min * -1.0) / amt_kwh_max,
-        (amt_min * -1.0) / amt_max,
-        (soc_factor_min * -1.0) / soc_factor_max,
-    )
+    min_max_factor = 0.0
+    for value_min, value_max in values_min_max:
+        if value_max > 0:
+            value_factor = (value_min * -1.0) / value_max
+            if value_factor > min_max_factor:
+                min_max_factor = value_factor
+
    # Adapt the min values to have the same relative min/max factor on all y-axis
    energy_wh_min = min_max_factor * energy_wh_max * -1.0
    amt_kwh_min = min_max_factor * amt_kwh_max * -1.0
--- a/src/akkudoktoreos/server/eos.py
+++ b/src/akkudoktoreos/server/eos.py
--- a/src/akkudoktoreos/server/eosdash.py
+++ b/src/akkudoktoreos/server/eosdash.py
@@ -12,7 +12,7 @@ from monsterui.core import FastHTML, Theme
 from starlette.middleware import Middleware
 from starlette.requests import Request

-from akkudoktoreos.config.config import get_config
+from akkudoktoreos.core.coreabc import get_config
 from akkudoktoreos.core.logabc import LOGGING_LEVELS
 from akkudoktoreos.core.logging import logging_track_config
 from akkudoktoreos.core.version import __version__
@@ -39,7 +39,7 @@ from akkudoktoreos.server.server import (
 )
 from akkudoktoreos.utils.stringutil import str2bool

-config_eos = get_config()
+config_eos = get_config(init=True)


 # ------------------------------------
--- a/src/akkudoktoreos/server/rest/cli.py
+++ b/src/akkudoktoreos/server/rest/cli.py
@@ -0,0 +1,149 @@
+import argparse
+
+from loguru import logger
+
+from akkudoktoreos.core.coreabc import get_config
+from akkudoktoreos.core.logabc import LOGGING_LEVELS
+from akkudoktoreos.server.server import get_default_host
+from akkudoktoreos.utils.stringutil import str2bool
+
+
+def cli_argument_parser() -> argparse.ArgumentParser:
+    """Build argument parser for EOS cli."""
+    parser = argparse.ArgumentParser(description="Start EOS server.")
+
+    parser.add_argument(
+        "--host",
+        type=str,
+        help="Host for the EOS server (default: value from config)",
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        help="Port for the EOS server (default: value from config)",
+    )
+    parser.add_argument(
+        "--log_level",
+        type=str,
+        default="none",
+        help='Log level for the server console. Options: "critical", "error", "warning", "info", "debug", "trace" (default: "none")',
+    )
+    parser.add_argument(
+        "--reload",
+        type=str2bool,
+        default=False,
+        help="Enable or disable auto-reload. Useful for development. Options: True or False (default: False)",
+    )
+    parser.add_argument(
+        "--startup_eosdash",
+        type=str2bool,
+        default=None,
+        help="Enable or disable automatic EOSdash startup. Options: True or False (default: value from config)",
+    )
+    parser.add_argument(
+        "--run_as_user",
+        type=str,
+        help="The unprivileged user account the EOS server shall switch to after performing root-level startup tasks.",
+    )
+    return parser
+
+
+def cli_parse_args(
+    argv: list[str] | None = None,
+) -> tuple[argparse.Namespace, list[str]]:
+    """Parse command-line arguments for the EOS CLI.
+
+    This function parses known EOS-specific command-line arguments and
+    returns any remaining unknown arguments unmodified. Unknown arguments
+    can be forwarded to other subsystems (e.g. Uvicorn).
+
+    If ``argv`` is ``None``, arguments are read from ``sys.argv[1:]``.
+    If ``argv`` is provided, it is used instead.
+
+    Args:
+        argv: Optional list of command-line arguments to parse. If omitted,
+            the arguments are taken from ``sys.argv[1:]``.
+
+    Returns:
+        A tuple containing:
+        - A namespace with parsed EOS CLI arguments.
+        - A list of unparsed (unknown) command-line arguments.
+    """
+    args, args_unknown = cli_argument_parser().parse_known_args(argv)
+    return args, args_unknown
+
+
+def cli_apply_args_to_config(args: argparse.Namespace) -> None:
+    """Apply parsed CLI arguments to the EOS configuration.
+
+    This function updates the EOS configuration with values provided via
+    the command line. For each parameter, the precedence is:
+
+        CLI argument > existing config value > default value
+
+    Currently handled arguments:
+
+        - log_level: Updates "logging/console_level" in config.
+        - host: Updates "server/host" in config.
+        - port: Updates "server/port" in config.
+        - startup_eosdash: Updates "server/startup_eosdash" in config.
+        - eosdash_host/port: Initialized if EOSdash is enabled and not already set.
+
+    Args:
+        args: Parsed command-line arguments from argparse.
+    """
+    config_eos = get_config()
+
+    # Setup parameters from args, config_eos and default
+    # Remember parameters in config
+
+    # Setup EOS logging level - first to have the other logging messages logged
+    if args.log_level is not None:
+        log_level = args.log_level.upper()
+        # Ensure log_level from command line is in config settings
+        if log_level in LOGGING_LEVELS:
+            # Setup console logging level using nested value
+            # - triggers logging configuration by logging_track_config
+            config_eos.set_nested_value("logging/console_level", log_level)
+            logger.debug(f"logging/console_level configuration set by argument to {log_level}")
+
+    # Setup EOS server host
+    if args.host:
+        host = args.host
+        logger.debug(f"server/host configuration set by argument to {host}")
+    elif config_eos.server.host:
+        host = config_eos.server.host
+    else:
+        host = get_default_host()
+    # Ensure host from command line is in config settings
+    config_eos.set_nested_value("server/host", host)
+
+    # Setup EOS server port
+    if args.port:
+        port = args.port
+        logger.debug(f"server/port configuration set by argument to {port}")
+    elif config_eos.server.port:
+        port = config_eos.server.port
+    else:
+        port = 8503
+    # Ensure port from command line is in config settings
+    config_eos.set_nested_value("server/port", port)
+
+    # Setup EOSdash startup
+    if args.startup_eosdash is not None:
+        # Ensure startup_eosdash from command line is in config settings
+        config_eos.set_nested_value("server/startup_eosdash", args.startup_eosdash)
+        logger.debug(
+            f"server/startup_eosdash configuration set by argument to {args.startup_eosdash}"
+        )
+
+    if config_eos.server.startup_eosdash:
+        # Ensure EOSdash host and port config settings are at least set to default values
+
+        # Setup EOS server host
+        if config_eos.server.eosdash_host is None:
+            config_eos.set_nested_value("server/eosdash_host", host)
+
+        # Setup EOS server host
+        if config_eos.server.eosdash_port is None:
+            config_eos.set_nested_value("server/eosdash_port", port + 1)
--- a/src/akkudoktoreos/server/rest/starteosdash.py
+++ b/src/akkudoktoreos/server/rest/starteosdash.py
@@ -8,14 +8,12 @@ from typing import Any, MutableMapping

 from loguru import logger

-from akkudoktoreos.config.config import get_config
+from akkudoktoreos.core.coreabc import get_config
 from akkudoktoreos.server.server import (
    validate_ip_or_hostname,
    wait_for_port_free,
 )

-config_eos = get_config()
-
 # Loguru to HA stdout
 logger.add(sys.stdout, format="{time} | {level} | {message}", enqueue=True)

@@ -277,14 +275,18 @@ async def forward_stream(stream: asyncio.StreamReader, prefix: str = "") -> None
                    _emit_drop_warning()


+# Path to eosdash
+eosdash_path = Path(__file__).parent.resolve().joinpath("eosdash.py")
+
+
 async def run_eosdash_supervisor() -> None:
    """Starts EOSdash, pipes its logs, restarts it if it crashes.

    Runs forever.
    """
-    global eosdash_log_queue
+    global eosdash_log_queue, eosdash_path

-    eosdash_path = Path(__file__).parent.resolve().joinpath("eosdash.py")
+    config_eos = get_config()

    while True:
        await asyncio.sleep(5)
--- a/src/akkudoktoreos/server/rest/tasks.py
+++ b/src/akkudoktoreos/server/rest/tasks.py
@@ -90,3 +90,73 @@ def repeat_every(
        return wrapped

    return decorator
+
+
+def make_repeated_task(
+    func: NoArgsNoReturnAnyFuncT,
+    *,
+    seconds: float,
+    wait_first: float | None = None,
+    max_repetitions: int | None = None,
+    on_complete: NoArgsNoReturnAnyFuncT | None = None,
+    on_exception: ExcArgNoReturnAnyFuncT | None = None,
+) -> NoArgsNoReturnAsyncFuncT:
+    """Create a version of the given function that runs periodically.
+
+    This function wraps `func` with the `repeat_every` decorator at runtime,
+    allowing decorator parameters to be determined dynamically rather than at import time.
+
+    Args:
+        func (Callable[[], None] | Callable[[], Coroutine[Any, Any, None]]):
+            The function to execute periodically. Must accept no arguments.
+        seconds (float):
+            Interval in seconds between repeated calls.
+        wait_first (float | None, optional):
+            If provided, the function will wait this many seconds before the first call.
+        max_repetitions (int | None, optional):
+            Maximum number of times to repeat the function. If None, repeats indefinitely.
+        on_complete (Callable[[], None] | Callable[[], Coroutine[Any, Any, None]] | None, optional):
+            Function to call once the repetitions are complete.
+        on_exception (Callable[[Exception], None] | Callable[[Exception], Coroutine[Any, Any, None]] | None, optional):
+            Function to call if an exception is raised by `func`.
+
+    Returns:
+        Callable[[], Coroutine[Any, Any, None]]:
+            An async function that starts the periodic execution when called.
+
+    Usage:
+        .. code-block:: python
+
+            from my_task import my_task
+
+            from akkudoktoreos.core.coreabc import get_config
+            from akkudoktoreos.server.rest.tasks import make_repeated_task
+
+            config = get_config()
+
+            # Create a periodic task using configuration-dependent interval
+            repeated_task = make_repeated_task(
+                my_task,
+                seconds=config.server.poll_interval,
+                wait_first=5,
+                max_repetitions=None
+            )
+
+            # Run the task in the event loop
+            import asyncio
+            asyncio.run(repeated_task())
+
+
+    Notes:
+        - This pattern avoids starting the loop at import time.
+        - Arguments such as `seconds` can be read from runtime sources (config, CLI args, environment variables).
+        - The returned function must be awaited to start the periodic loop.
+    """
+    # Return decorated function
+    return repeat_every(
+        seconds=seconds,
+        wait_first=wait_first,
+        max_repetitions=max_repetitions,
+        on_complete=on_complete,
+        on_exception=on_exception,
+    )(func)
--- a/src/akkudoktoreos/server/retentionmanager.py
+++ b/src/akkudoktoreos/server/retentionmanager.py
@@ -0,0 +1,390 @@
+"""Retention Manager for Akkudoktor-EOS server.
+
+This module provides a single long-running background task that owns the scheduling of all periodic
+server-maintenance jobs (cache cleanup, DB autosave, config reload, …).
+
+Responsibilities:
+    - Run a fast "heartbeat" loop (default 5 s) — the *compaction tick*.
+    - Maintain a registry of ``ManagedJob`` entries, each with its own interval.
+    - Re-read the live configuration on every tick so interval changes take effect
+      immediately without a server restart.
+    - Track per-job state: last run time, last duration, last error, run count.
+    - Expose that state for health-check / metrics endpoints.
+
+Example:
+    Typical usage inside your FastAPI lifespan::
+
+        from akkudoktoreos.core.coreabc import get_config
+        from akkudoktoreos.server.rest.retention_manager import RetentionManager
+        from akkudoktoreos.server.rest.tasks import make_repeated_task
+
+        manager = RetentionManager(get_config().get_nested_value)
+        manager.register("cache_cleanup", cache_cleanup_fn, interval_attr="server/cache_cleanup_interval")
+        manager.register("db_autosave",   db_autosave_fn,   interval_attr="server/db_autosave_interval")
+
+        @asynccontextmanager
+        async def lifespan(app: FastAPI):
+            tick_task = make_repeated_task(manager.tick, seconds=5, wait_first=2)
+            await tick_task()
+            yield
+"""
+
+from __future__ import annotations
+
+import asyncio
+import time
+from dataclasses import dataclass
+from typing import Any, Callable, Coroutine, Optional, Union
+
+from loguru import logger
+from starlette.concurrency import run_in_threadpool
+
+NoArgsNoReturnAnyFuncT = Union[Callable[[], None], Callable[[], Coroutine[Any, Any, None]]]
+ExcArgNoReturnAnyFuncT = Union[
+    Callable[[Exception], None], Callable[[Exception], Coroutine[Any, Any, None]]
+]
+ConfigGetterFuncT = Callable[[str], Any]
+
+
+# ---------------------------------------------------------------------------
+# Job state — one per registered maintenance task
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class JobState:
+    """Runtime state tracked for a single managed job.
+
+    Attributes:
+        name: Unique human-readable job name used in logs and metrics.
+        func: The maintenance callable. Must accept no arguments.
+        interval_attr: Key passed to ``config_getter`` to retrieve the interval in seconds
+            for this job.
+        fallback_interval: Interval in seconds used when the key is not found or returns zero.
+        config_getter: Callable that accepts a string key and returns the corresponding
+            configuration value. Invoked with ``interval_attr`` to obtain the interval
+            in seconds.
+        on_exception: Optional callable invoked with the raised exception whenever
+            ``func`` fails. May be sync or async.
+        last_run_at: Monotonic timestamp of the last completed run; ``0.0`` means never run.
+        last_duration: How long the last run took, in seconds.
+        last_error: String representation of the last exception, or ``None`` if the last run succeeded.
+        run_count: Total number of completed runs (successful or not).
+        is_running: ``True`` while the job coroutine is currently executing.
+    """
+
+    name: str
+    func: NoArgsNoReturnAnyFuncT
+    interval_attr: str  # key passed to config_getter to obtain the interval in seconds
+    fallback_interval: float  # used when the key is not found or returns zero
+    config_getter: ConfigGetterFuncT  # callable(key: str) -> Any; returns interval in seconds
+    on_exception: Optional[ExcArgNoReturnAnyFuncT] = None  # optional cleanup/alerting hook
+
+    # mutable state
+    last_run_at: float = 0.0  # monotonic timestamp; 0.0 means "never run"
+    last_duration: float = 0.0  # seconds the job took
+    last_error: Optional[str] = None
+    run_count: int = 0
+    is_running: bool = False
+
+    def interval(self) -> Optional[float]:
+        """Retrieve the current interval by calling ``config_getter`` with ``interval_attr``.
+
+        Returns ``None`` when the config value is ``None``, which signals that the
+        job is disabled and must never fire. Falls back to ``fallback_interval``
+        when the key is not found.
+
+        Returns:
+            The interval in seconds, or ``None`` if the job is disabled.
+        """
+        try:
+            value = self.config_getter(self.interval_attr)
+            if value is None:
+                return None
+            return float(value) if value else self.fallback_interval
+        except (KeyError, IndexError):
+            logger.warning(
+                "RetentionManager: config key '{}' not found, using fallback {}s",
+                self.interval_attr,
+                self.fallback_interval,
+            )
+            return self.fallback_interval
+
+    def is_due(self) -> bool:
+        """Check whether enough time has elapsed since the last run to execute this job again.
+
+        Returns ``False`` immediately when `interval` returns ``None``
+        (job is disabled), so a disabled job never fires regardless of when it
+        last ran.
+
+        Returns:
+            ``True`` if the job should be executed on this tick, ``False`` otherwise.
+        """
+        interval = self.interval()
+        if interval is None:
+            return False
+        return (time.monotonic() - self.last_run_at) >= interval
+
+    def summary(self) -> dict:
+        """Build a serialisable snapshot of the job's current state.
+
+        Returns:
+            A dictionary suitable for JSON serialisation, containing the job name,
+            interval key, last run timestamp, last duration, last error,
+            run count, and whether the job is currently running.
+        """
+        return {
+            "name": self.name,
+            "interval_attr": self.interval_attr,
+            "interval_s": self.interval(),
+            "last_run_at": self.last_run_at,
+            "last_duration_s": round(self.last_duration, 4),
+            "last_error": self.last_error,
+            "run_count": self.run_count,
+            "is_running": self.is_running,
+        }
+
+
+# ---------------------------------------------------------------------------
+# Retention Manager
+# ---------------------------------------------------------------------------
+
+
+class RetentionManager:
+    """Orchestrates all periodic server-maintenance jobs.
+
+    The manager itself is driven by an external ``make_repeated_task`` heartbeat
+    (the *compaction tick*). A ``config_getter`` callable — accepting a string key
+    and returning the corresponding value — is supplied at initialisation and
+    stored on every registered job, keeping the manager decoupled from any
+    specific config implementation.
+
+    Jobs are launched as independent ``asyncio.Task`` objects so they run
+    concurrently without blocking the tick. Call `shutdown` during
+    application teardown to wait for any in-flight tasks to complete before
+    the event loop closes. A configurable shutdown_timeout prevents the
+    wait from blocking indefinitely; jobs still running after the timeout are
+    reported by name but not cancelled.
+    """
+
+    def __init__(
+        self,
+        config_getter: ConfigGetterFuncT,
+        *,
+        shutdown_timeout: float = 30.0,
+    ) -> None:
+        """Initialise the manager with a configuration accessor.
+
+        Args:
+            config_getter: Callable that accepts a string key and returns the
+                corresponding configuration value. Used by each registered job
+                to look up its interval in seconds.
+            shutdown_timeout: Maximum number of seconds to wait for in-flight
+                jobs to finish during `shutdown`. If the timeout elapses
+                before all tasks complete, an error is logged and the names of
+                the still-running jobs are reported. The tasks are not cancelled
+                so they may continue running until the event loop closes.
+                Defaults to 30.0.
+
+        Example::
+
+            manager = RetentionManager(get_config().get_nested_value, shutdown_timeout=60.0)
+        """
+        self._config_getter = config_getter
+        self._shutdown_timeout = shutdown_timeout
+        self._jobs: dict[str, JobState] = {}
+        self._running_tasks: set[asyncio.Task] = set()
+
+    # ------------------------------------------------------------------
+    # Registration
+    # ------------------------------------------------------------------
+
+    def register(
+        self,
+        name: str,
+        func: NoArgsNoReturnAnyFuncT,
+        *,
+        interval_attr: str,
+        fallback_interval: float = 300.0,
+        on_exception: Optional[ExcArgNoReturnAnyFuncT] = None,
+    ) -> None:
+        """Register a maintenance function with the manager.
+
+        Args:
+            name: Unique human-readable job name used in logs and metrics.
+            func: The maintenance callable. Must accept no arguments.
+            interval_attr: Key passed to ``config_getter`` to retrieve the interval
+                in seconds for this job. When the config value is ``None`` the job
+                is treated as disabled and will never fire.
+            fallback_interval: Seconds to use when the config attribute is missing or zero.
+                Defaults to ``300.0``.
+            on_exception: Optional callable invoked with the raised exception whenever
+                ``func`` fails. Useful for cleanup or alerting. May be sync or async.
+
+        Raises:
+            ValueError: If a job with the given ``name`` is already registered.
+        """
+        if name in self._jobs:
+            raise ValueError(f"RetentionManager: job '{name}' is already registered")
+
+        self._jobs[name] = JobState(
+            name=name,
+            func=func,
+            interval_attr=interval_attr,
+            fallback_interval=fallback_interval,
+            config_getter=self._config_getter,
+            on_exception=on_exception,
+        )
+        logger.info("RetentionManager: registered job '{}' (config: {})", name, interval_attr)
+
+    def unregister(self, name: str) -> None:
+        """Remove a previously registered job from the manager.
+
+        If no job with the given name exists, this is a no-op.
+
+        Args:
+            name: The name of the job to remove.
+        """
+        self._jobs.pop(name, None)
+
+    # ------------------------------------------------------------------
+    # Tick — called by the external heartbeat loop
+    # ------------------------------------------------------------------
+
+    async def tick(self) -> None:
+        """Single compaction tick: check every job and fire those that are due.
+
+        Each job resolves its own interval via the ``config_getter`` captured at
+        registration time. Jobs whose interval is ``None`` are silently skipped
+        (disabled). Due jobs are launched as independent ``asyncio.Task`` objects
+        so they run concurrently without blocking the tick. Each task is tracked
+        in ``_running_tasks`` and removed automatically on completion, allowing
+        `shutdown` to await all of them gracefully.
+
+        Jobs that are still running from a previous tick are skipped to prevent
+        overlapping executions.
+
+        Note:
+            This is the function you pass to ``make_repeated_task``.
+        """
+        due = [job for job in self._jobs.values() if not job.is_running and job.is_due()]
+
+        if not due:
+            return
+
+        logger.debug("RetentionManager: {} job(s) due this tick", len(due))
+        for job in due:
+            task = asyncio.ensure_future(self._run_job(job))
+            task.set_name(job.name)  # used by shutdown() to report timed-out jobs by name
+            self._running_tasks.add(task)
+            task.add_done_callback(self._running_tasks.discard)
+
+    async def shutdown(self) -> None:
+        """Wait for all currently running job tasks to complete.
+
+        Waits up to shutdown_timeout seconds (configured at initialisation)
+        for in-flight tasks to finish. If the timeout elapses before all tasks
+        complete, an error is logged listing the names of the jobs that are still
+        running. Those tasks are **not** cancelled — they continue until the event
+        loop closes — but `shutdown` returns so that application teardown
+        is not blocked indefinitely.
+
+        Returns immediately if no tasks are running.
+
+        Example::
+
+            @asynccontextmanager
+            async def lifespan(app: FastAPI):
+                tick_task = make_repeated_task(manager.tick, seconds=5, wait_first=2)
+                await tick_task()
+
+        Yield:
+                await manager.shutdown()
+        """
+        if not self._running_tasks:
+            return
+
+        logger.info(
+            "RetentionManager: shutdown — waiting up to {}s for {} task(s) to finish",
+            self._shutdown_timeout,
+            len(self._running_tasks),
+        )
+
+        done, pending = await asyncio.wait(self._running_tasks, timeout=self._shutdown_timeout)
+
+        if pending:
+            # Task names were set to the job name when the task was created in tick().
+            pending_names = [t.get_name() for t in pending]
+            logger.error(
+                "RetentionManager: shutdown timed out after {}s — {} job(s) still running: {}",
+                self._shutdown_timeout,
+                len(pending),
+                pending_names,
+            )
+        else:
+            logger.info("RetentionManager: all tasks finished, shutdown complete")
+
+        self._running_tasks.clear()
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    async def _run_job(self, job: JobState) -> None:
+        """Execute a single job and update its state regardless of outcome.
+
+        Handles both async and sync callables for both the main function and the
+        optional ``on_exception`` hook. Exceptions from ``func`` are caught, logged,
+        stored on the job, and forwarded to ``on_exception`` if provided, so a
+        failing job never disrupts other concurrent jobs or future ticks.
+
+        Args:
+            job: The `JobState` instance to execute.
+        """
+        job.is_running = True
+        start = time.monotonic()
+        logger.debug("RetentionManager: starting job '{}'", job.name)
+        try:
+            if asyncio.iscoroutinefunction(job.func):
+                await job.func()
+            else:
+                await run_in_threadpool(job.func)
+
+            job.last_error = None
+            logger.debug(
+                "RetentionManager: job '{}' completed in {:.3f}s",
+                job.name,
+                time.monotonic() - start,
+            )
+
+        except Exception as exc:  # noqa: BLE001
+            job.last_error = str(exc)
+            logger.exception("RetentionManager: job '{}' raised an exception: {}", job.name, exc)
+
+            if job.on_exception is not None:
+                if asyncio.iscoroutinefunction(job.on_exception):
+                    await job.on_exception(exc)
+                else:
+                    await run_in_threadpool(job.on_exception, exc)
+
+        finally:
+            job.last_duration = time.monotonic() - start
+            job.last_run_at = time.monotonic()
+            job.run_count += 1
+            job.is_running = False
+
+    # ------------------------------------------------------------------
+    # Observability
+    # ------------------------------------------------------------------
+
+    def status(self) -> list[dict]:
+        """Return a snapshot of every job's state for health or metrics endpoints.
+
+        Returns:
+            A list of dictionaries, one per registered job, each produced by
+            `JobState.summary`.
+        """
+        return [job.summary() for job in self._jobs.values()]
+
+    def __repr__(self) -> str:  # pragma: no cover
+        return f"<RetentionManager jobs={list(self._jobs)}>"
--- a/src/akkudoktoreos/server/server.py
+++ b/src/akkudoktoreos/server/server.py
@@ -14,6 +14,7 @@ from loguru import logger
 from pydantic import Field, field_validator

 from akkudoktoreos.config.configabc import SettingsBaseModel
+from akkudoktoreos.core.coreabc import get_config


 def get_default_host() -> str:
@@ -258,8 +259,6 @@ def fix_data_directories_permissions(run_as_user: Optional[str] = None) -> None:
        run_as_user (Optional[str]): The user who should own the data directories and files.
            Defaults to current one.
    """
-    from akkudoktoreos.config.config import get_config
-
    config_eos = get_config()

    base_dirs = [