mirror of
https://github.com/Akkudoktor-EOS/EOS.git
synced 2026-03-15 02:56:17 +00:00
The database supports backend selection, compression, incremental data load, automatic data saving to storage, automatic vacuum and compaction. Make SQLite3 and LMDB database backends available. Update tests for new interface conventions regarding data sequences, data containers, data providers. This includes the measurements provider and the prediction providers. Add database documentation. The fix includes several bug fixes that are not directly related to the database implementation but are necessary to keep EOS running properly and to test and document the changes. * fix: config eos test setup Make the config_eos fixture generate a new instance of the config_eos singleton. Use correct env names to setup data folder path. * fix: startup with no config Make cache and measurements complain about missing data path configuration but do not bail out. * fix: soc data preparation and usage for genetic optimization. Search for soc measurements 48 hours around the optimization start time. Only clamp soc to maximum in battery device simulation. * fix: dashboard bailout on zero value solution display Do not use zero values to calculate the chart values adjustment for display. * fix: openapi generation script Make the script also replace data_folder_path and data_output_path to hide real (test) environment paths. * feat: add make repeated task function make_repeated_task allows to wrap a function to be repeated cyclically. * chore: removed index based data sequence access Index based data sequence access does not make sense as the sequence can be backed by the database. The sequence is now purely time series data. * chore: refactor eos startup to avoid module import startup Avoid module import initialisation especially of the EOS configuration. Config mutation, singleton initialization, logging setup, argparse parsing, background task definitions depending on config and environment-dependent behavior is now done at function startup.
* chore: introduce retention manager A single long-running background task that owns the scheduling of all periodic server-maintenance jobs (cache cleanup, DB autosave, …) * chore: canonicalize timezone name for UTC Timezone names that are semantically identical to UTC are canonicalized to UTC. * chore: extend config file migration for default value handling Extend the config file migration handling values None or nonexisting values that will invoke a default value generation in the new config file. Also adapt test to handle this situation. * chore: extend datetime util test cases * chore: make version test check for untracked files Check for files that are not tracked by git. Version calculation will be wrong if these files will not be committed. * chore: bump pandas to 3.0.0 Pandas 3.0 now performs inference on the appropriate resolution (a.k.a. unit) for the output dtype which may become datetime64[us] (before it was ns). Also numeric dtype detection is now more strict which needs a different detection for numerics. * chore: bump pydantic-settings to 2.12.0 pydantic-settings 2.12.0 under pytest creates a different behaviour. The tests were adapted and a workaround was introduced. Also ConfigEOS was adapted to allow for fine grain initialization control to be able to switch off certain settings such as file settings during test. * chore: remove sci learn kit from dependencies The sci learn kit is not strictly necessary as long as we have scipy. * chore: add documentation mode guarding for sphinx autosummary Sphinx autosummary executes functions. Prevent exceptions in case of pure doc mode. * chore: adapt docker-build CI workflow to stricter GitHub handling Signed-off-by: Bobby Noelte <b0661n0e17e@gmail.com>
2195 lines
78 KiB
Python
2195 lines
78 KiB
Python
"""Abstract database interface."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import bisect
|
|
import gzip
|
|
import pickle
|
|
from abc import ABC, abstractmethod
|
|
from enum import Enum, auto
|
|
from pathlib import Path
|
|
from threading import Lock
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Any,
|
|
Final,
|
|
Generic,
|
|
Iterable,
|
|
Iterator,
|
|
Literal,
|
|
Optional,
|
|
Protocol,
|
|
Self,
|
|
Type,
|
|
TypeVar,
|
|
Union,
|
|
)
|
|
|
|
from loguru import logger
|
|
from numpydantic import NDArray, Shape
|
|
|
|
from akkudoktoreos.core.coreabc import (
|
|
ConfigMixin,
|
|
DatabaseMixin,
|
|
SingletonMixin,
|
|
)
|
|
from akkudoktoreos.utils.datetimeutil import (
|
|
DateTime,
|
|
Duration,
|
|
to_datetime,
|
|
to_duration,
|
|
)
|
|
|
|
# Key used to store metadata.
# Metadata is handled separately from data records (see the metadata
# operations of DatabaseABC).
DATABASE_METADATA_KEY: bytes = b"__metadata__"
|
|
|
|
# ==================== Abstract Database Interface ====================
|
|
|
|
|
|
class DatabaseABC(ABC, ConfigMixin):
    """Abstract base class for database.

    All operations accept an optional `namespace` argument. Implementations should
    treat None as the default/root namespace. Concrete implementations can map
    namespace -> native namespace (LMDB DBI) or emulate namespaces (SQLite uses
    a namespace column).

    Keys and values are opaque ``bytes``; keys are expected to be sortable so that
    range iteration over ``[start_key, end_key)`` is meaningful.
    """

    @property
    @abstractmethod
    def is_open(self) -> bool:
        """Return whether the database connection is open."""
        raise NotImplementedError

    @property
    def storage_path(self) -> Path:
        """Storage path for the database.

        Derived from the configured data folder, a ``db`` sub folder and the
        lower-cased class name of the concrete database.
        """
        return self.config.general.data_folder_path / "db" / self.__class__.__name__.lower()

    @property
    def compression_level(self) -> int:
        """Compression level for database record data."""
        return self.config.database.compression_level

    @property
    def compression(self) -> bool:
        """Whether to compress stored values (configured compression level > 0)."""
        return self.config.database.compression_level > 0

    # Lifecycle

    @abstractmethod
    def provider_id(self) -> str:
        """Return the unique identifier for the database provider.

        To be implemented by derived classes.
        """
        raise NotImplementedError

    @abstractmethod
    def open(self, namespace: Optional[str] = None) -> None:
        """Open database connection and optionally set default namespace.

        Args:
            namespace: Optional default namespace to prepare.

        Raises:
            RuntimeError: If the database cannot be opened.
        """
        raise NotImplementedError

    @abstractmethod
    def close(self) -> None:
        """Close the database connection and cleanup resources."""
        raise NotImplementedError

    @abstractmethod
    def flush(self, namespace: Optional[str] = None) -> None:
        """Force synchronization of pending writes to storage (optional per-namespace)."""
        raise NotImplementedError

    # Metadata operations

    @abstractmethod
    def set_metadata(self, metadata: Optional[bytes], *, namespace: Optional[str] = None) -> None:
        """Save metadata for a given namespace.

        Metadata is treated separately from data records and stored as a single object.

        Args:
            metadata (bytes): Arbitrary metadata to save or None to delete metadata.
            namespace (Optional[str]): Optional namespace under which to store metadata.
        """
        raise NotImplementedError

    @abstractmethod
    def get_metadata(self, namespace: Optional[str] = None) -> Optional[bytes]:
        """Load metadata for a given namespace.

        Returns None if no metadata exists.

        Args:
            namespace (Optional[str]): Optional namespace whose metadata to retrieve.

        Returns:
            Optional[bytes]: The loaded metadata, or None if not found.
        """
        raise NotImplementedError

    # Basic record operations

    @abstractmethod
    def save_records(
        self, records: Iterable[tuple[bytes, bytes]], namespace: Optional[str] = None
    ) -> int:
        """Save multiple records into the specified namespace (or default).

        Args:
            records: Iterable providing key, value tuples ordered by key:
                - key: Byte key (sortable) for the record.
                - value: Serialized (and optionally compressed) bytes to store.
            namespace: Optional namespace.

        Returns:
            Number of records saved.

        Raises:
            RuntimeError: If DB not open or write failed.
        """
        raise NotImplementedError

    @abstractmethod
    def delete_records(self, keys: Iterable[bytes], namespace: Optional[str] = None) -> int:
        """Delete multiple records by key from the specified namespace.

        Args:
            keys: Iterable that provides the Byte keys to delete.
            namespace: Optional namespace.

        Returns:
            Number of records actually deleted.
        """
        raise NotImplementedError

    @abstractmethod
    def iterate_records(
        self,
        start_key: Optional[bytes] = None,
        end_key: Optional[bytes] = None,
        namespace: Optional[str] = None,
        reverse: bool = False,
    ) -> Iterator[tuple[bytes, bytes]]:
        """Iterate over records for a namespace with optional bounds.

        Args:
            start_key: Inclusive start key, or None.
            end_key: Exclusive end key, or None.
            namespace: Optional namespace to target.
            reverse: If True iterate in descending key order.

        Yields:
            Tuples of (key, record).
        """
        raise NotImplementedError

    @abstractmethod
    def count_records(
        self,
        start_key: Optional[bytes] = None,
        end_key: Optional[bytes] = None,
        *,
        namespace: Optional[str] = None,
    ) -> int:
        """Count records in [start_key, end_key) excluding metadata in specified namespace.

        Excludes metadata records.
        """
        raise NotImplementedError

    @abstractmethod
    def get_key_range(
        self, namespace: Optional[str] = None
    ) -> tuple[Optional[bytes], Optional[bytes]]:
        """Return (min_key, max_key) in the given namespace or (None, None) if empty."""
        raise NotImplementedError

    @abstractmethod
    def get_backend_stats(self, namespace: Optional[str] = None) -> dict[str, Any]:
        """Get backend-specific statistics; implementations may return namespace-specific data."""
        raise NotImplementedError

    # Compression helpers

    def serialize_data(self, data: bytes) -> bytes:
        """Optionally compress raw pickled data before storage.

        Args:
            data: Raw pickled bytes.

        Returns:
            Possibly compressed bytes. The data is gzip-compressed with the
            configured compression level if compression is enabled, otherwise
            it is returned unchanged.
        """
        if self.compression:
            return gzip.compress(data, compresslevel=self.compression_level)
        return data

    def deserialize_data(self, data: bytes) -> bytes:
        """Optionally decompress stored data.

        Compressed data is detected by the gzip magic bytes, independent of the
        current compression configuration. Data that merely looks like gzip but
        cannot be decompressed is returned unchanged.

        Args:
            data: Stored bytes.

        Returns:
            Raw pickled bytes (decompressed if needed).
        """
        # Local import: only the exception type is needed here.
        import zlib

        # Gzip streams start with the two magic bytes 0x1f 0x8b.
        if data[:2] != b"\x1f\x8b":
            return data
        try:
            return gzip.decompress(data)
        except (gzip.BadGzipFile, EOFError, zlib.error):
            # The magic bytes matched by accident or the stream is incomplete.
            # gzip reports this as BadGzipFile (bad header/CRC), EOFError
            # (truncated stream) or zlib.error (invalid deflate data) - in all
            # cases fall back to the stored bytes.
            return data
|
|
|
|
|
|
class DatabaseBackendABC(DatabaseABC, SingletonMixin):
    """Common base for concrete database backends.

    Provides the runtime state shared by backends: the native connection
    handle, a lock, the open flag and the default namespace.

    As for the database interface, every operation accepts an optional
    `namespace`. None selects the default/root namespace. A backend either maps
    the namespace to a native concept (LMDB DBI) or emulates it (SQLite uses a
    namespace column).
    """

    connection: Any
    lock: Lock
    _is_open: bool
    default_namespace: Optional[str]

    def __init__(self, **kwargs: Any) -> None:
        """Set up the backend state in its closed, unconnected form.

        Args:
            **kwargs: Backend-specific options (ignored by base).
        """
        # Nothing is connected yet - open() of the concrete backend does that.
        self.default_namespace = None
        self._is_open = False
        self.lock = Lock()
        self.connection = None

    @property
    def is_open(self) -> bool:
        """Tell whether the database connection is currently open."""
        return self._is_open
|
|
|
|
|
|
# ==================== Database Record Protocol Mixin ====================
|
|
|
|
|
|
class DataRecordProtocol(Protocol):
    """Structural type of a single data record handled by the database mixin.

    A record carries its timestamp in ``date_time``, can be constructed from
    keyword data, supports item access by key and can be dumped to a dict.
    """

    # Timestamp of the record.
    date_time: DateTime

    def __init__(self, date_time: Any) -> None: ...

    def __getitem__(self, key: str) -> Any: ...

    # Dict representation of the record; used for serialization.
    def model_dump(self) -> dict: ...
|
|
|
|
|
|
# Type variable for record classes; bound to the DataRecordProtocol interface.
T_Record = TypeVar("T_Record", bound=DataRecordProtocol)
|
|
|
|
|
|
class DatabaseTimestamp(str):
    """ISO8601 UTC datetime string used as database timestamp.

    Must always be in UTC and lexicographically sortable.

    Ordering against other timestamps is plain string ordering. Ordering
    against the unbounded sentinels (``UNBOUND_START`` / ``UNBOUND_END``) is
    delegated to the sentinel, so a sentinel compares correctly no matter on
    which side of the operator it stands.

    Example:
        "20241027T123456Z"  # 2024-10-27 12:34:56 UTC
    """

    __slots__ = ()

    @classmethod
    def from_datetime(cls, dt: DateTime) -> "DatabaseTimestamp":
        """Create a database timestamp from a timezone-aware datetime.

        The datetime is converted to UTC and formatted with second resolution;
        any sub-second part is not represented.

        Args:
            dt: Timezone-aware datetime.

        Returns:
            The database timestamp.

        Raises:
            ValueError: If the datetime has no timezone.
        """
        if dt.tz is None:
            raise ValueError("Timezone-aware datetime required")

        return cls(dt.in_timezone("UTC").format("YYYYMMDDTHHmmss[Z]"))

    def to_datetime(self) -> DateTime:
        """Parse the timestamp string back into a datetime (via ``pendulum.parse``)."""
        from pendulum import parse

        return parse(self)

    # Both this class and the sentinel class derive from ``str``. Without the
    # overrides below ``timestamp < UNBOUND_START`` would be answered by
    # ``str.__lt__`` comparing the sentinel's text ("UNBOUND_START"), which
    # sorts *after* every digit-led timestamp. Returning NotImplemented makes
    # Python fall back to the sentinel's reflected comparison.

    def __lt__(self, other: object) -> bool:
        if isinstance(other, _DatabaseTimestampUnbound):
            return NotImplemented
        return super().__lt__(other)  # type: ignore[operator]

    def __le__(self, other: object) -> bool:
        if isinstance(other, _DatabaseTimestampUnbound):
            return NotImplemented
        return super().__le__(other)  # type: ignore[operator]

    def __gt__(self, other: object) -> bool:
        if isinstance(other, _DatabaseTimestampUnbound):
            return NotImplemented
        return super().__gt__(other)  # type: ignore[operator]

    def __ge__(self, other: object) -> bool:
        if isinstance(other, _DatabaseTimestampUnbound):
            return NotImplemented
        return super().__ge__(other)  # type: ignore[operator]


class _DatabaseTimestampUnbound(str):
    """Sentinel type representing an unbounded datetime value for database usage.

    Instances of this class are designed to be totally ordered relative to
    ISO datetime strings:

    - UNBOUND_START is smaller than any other value.
    - UNBOUND_END is greater than any other value.

    This makes the type safe for:
    - sorted lists
    - bisect operations
    - dictionary keys
    - range queries

    The type inherits from `str` to remain maximally efficient for hashing
    and dictionary usage.

    Only the two module level instances ``UNBOUND_START`` and ``UNBOUND_END``
    are meant to exist; copying and pickling resolve to these instances.
    """

    __slots__ = ("_is_start",)

    if TYPE_CHECKING:
        _is_start: bool

    def __new__(cls, value: str, is_start: bool) -> "_DatabaseTimestampUnbound":
        """Create a sentinel.

        Args:
            value: Text of the underlying string (used for hashing/equality).
            is_start: True for the lower sentinel, False for the upper sentinel.
        """
        obj = super().__new__(cls, value)
        obj._is_start = is_start
        return obj

    def __lt__(self, other: object) -> bool:
        if isinstance(other, _DatabaseTimestampUnbound):
            return self._is_start and not other._is_start
        return self._is_start

    def __le__(self, other: object) -> bool:
        if isinstance(other, _DatabaseTimestampUnbound):
            # start <= anything, end <= end. Decided by kind, not by identity.
            return self._is_start or not other._is_start
        return self._is_start

    def __gt__(self, other: object) -> bool:
        if isinstance(other, _DatabaseTimestampUnbound):
            return not self._is_start and other._is_start
        return not self._is_start

    def __ge__(self, other: object) -> bool:
        if isinstance(other, _DatabaseTimestampUnbound):
            # end >= anything, start >= start. Decided by kind, not by identity.
            return not self._is_start or other._is_start
        return not self._is_start

    def __repr__(self) -> str:
        return "UNBOUND_START" if self._is_start else "UNBOUND_END"

    def __reduce__(self) -> str:
        # Returning a string makes pickle/copy refer to the module level
        # sentinel of that name instead of re-creating the object (which would
        # fail, as __new__ requires the extra ``is_start`` argument).
        return "UNBOUND_START" if self._is_start else "UNBOUND_END"


# A concrete database timestamp or one of the unbounded sentinels.
DatabaseTimestampType = Union[DatabaseTimestamp, _DatabaseTimestampUnbound]


# Public sentinels
UNBOUND_START: Final[_DatabaseTimestampUnbound] = _DatabaseTimestampUnbound(
    "UNBOUND_START", is_start=True
)
UNBOUND_END: Final[_DatabaseTimestampUnbound] = _DatabaseTimestampUnbound(
    "UNBOUND_END", is_start=False
)
|
|
|
|
|
|
class _DatabaseTimeWindowUnbound:
    """Sentinel representing an unbounded time window.

    This is distinct from `None`:
    - None → parameter not provided
    - UNBOUND_WINDOW → explicitly infinite duration

    Designed to:
    - be identity-compared (is)
    - be hashable
    - be safe for dict usage
    - avoid accidental equality with other values
    """

    # No per-instance attributes.
    __slots__ = ()

    def __repr__(self) -> str:
        return "UNBOUND_WINDOW"

    def __reduce__(self) -> str:
        # Ensures singleton behavior during pickling: a string return value
        # makes pickle reference the module level name instead of creating a
        # new instance.
        return "UNBOUND_WINDOW"


# A duration, None, or the explicit unbounded-window sentinel.
DatabaseTimeWindowType = Union[Duration, None, _DatabaseTimeWindowUnbound]


# Public sentinel for an explicitly unbounded time window.
UNBOUND_WINDOW: Final[_DatabaseTimeWindowUnbound] = _DatabaseTimeWindowUnbound()
|
|
|
|
|
|
class DatabaseRecordProtocol(Protocol, Generic[T_Record]):
    """Structural interface of a record sequence with database storage support.

    The first part lists what the class has to provide itself, the second
    part the ``db_*`` operations for loading, saving and maintaining records.
    All members are declarations only.
    """

    # ---- derived class required interface ----

    # In-memory records.
    records: list[T_Record]

    def model_post_init(self, __context: Any) -> None: ...

    def model_copy(self, *, deep: bool = False) -> Self: ...

    # record class introspection
    @classmethod
    def record_class(cls) -> Type[T_Record]: ...

    # Duration for which records shall be kept in database storage
    def db_keep_duration(self) -> Optional[Duration]: ...

    # namespace
    def db_namespace(self) -> str: ...

    # ---- public DB interface ----

    def _db_reset_state(self) -> None: ...

    @property
    def db_enabled(self) -> bool: ...

    def db_timestamp_range(self) -> tuple[DatabaseTimestampType, DatabaseTimestampType]: ...

    def db_generate_timestamps(
        self,
        start_timestamp: DatabaseTimestamp,
        values_count: int,
        interval: Optional[Duration] = None,
    ) -> Iterator[DatabaseTimestamp]: ...

    def db_get_record(self, target_timestamp: DatabaseTimestamp) -> Optional[T_Record]: ...

    def db_insert_record(
        self,
        record: T_Record,
        *,
        mark_dirty: bool = True,
    ) -> None: ...

    def db_iterate_records(
        self,
        start_timestamp: Optional[DatabaseTimestampType] = None,
        end_timestamp: Optional[DatabaseTimestampType] = None,
    ) -> Iterator[T_Record]: ...

    def db_load_records(
        self,
        start_timestamp: Optional[DatabaseTimestampType] = None,
        end_timestamp: Optional[DatabaseTimestampType] = None,
    ) -> int: ...

    def db_delete_records(
        self,
        start_timestamp: Optional[DatabaseTimestampType] = None,
        end_timestamp: Optional[DatabaseTimestampType] = None,
    ) -> int: ...

    # ---- dirty tracking ----
    def db_mark_dirty_record(self, record: T_Record) -> None: ...

    def db_save_records(self) -> int: ...

    # ---- autosave ----
    def db_autosave(self) -> int: ...

    # ---- Remove old records from database to free space ----
    def db_vacuum(
        self,
        keep_hours: Optional[int] = None,
        keep_datetime: Optional[DatabaseTimestampType] = None,
    ) -> int: ...

    # ---- statistics about database storage ----
    def db_count_records(self) -> int: ...

    def db_get_stats(self) -> dict: ...
|
|
|
|
|
|
# Type variable for classes that satisfy DatabaseRecordProtocol
# (used to type ``self`` in helpers that return an object of the same type).
T_DatabaseRecordProtocol = TypeVar("T_DatabaseRecordProtocol", bound="DatabaseRecordProtocol")
|
|
|
|
|
|
class DatabaseRecordProtocolLoadPhase(Enum):
    """Database loading phases.

    NONE:
        No records have been loaded from the database.

    INITIAL:
        A limited initial time window has been loaded, typically centered
        around a target datetime.

    FULL:
        All records in the database have been loaded into memory.

    The phase controls whether further calls to ``db_ensure_loaded`` may
    trigger additional database access.
    """

    # Member values are assigned automatically; only the identity matters.
    NONE = auto()  # nothing loaded
    INITIAL = auto()  # initial window loaded
    FULL = auto()  # fully expanded
|
|
|
|
|
|
class DatabaseRecordProtocolMixin(
|
|
ConfigMixin,
|
|
DatabaseMixin,
|
|
Generic[T_Record], # for typing only
|
|
):
|
|
"""Database Record Protocol Mixin.
|
|
|
|
Completely manages in memory records and database storage.
|
|
|
|
Expects records with date_time (DatabaseTimestamp) property and the a record list
|
|
in self.records of the derived class.
|
|
|
|
DatabaseRecordProtocolMixin expects the derived classes to be singletons.
|
|
"""
|
|
|
|
# Tell mypy these attributes exist (will be provided by subclasses)
|
|
if TYPE_CHECKING:
|
|
records: list[T_Record]
|
|
|
|
@classmethod
|
|
def record_class(cls) -> Type[T_Record]: ...
|
|
|
|
@property
|
|
def record_keys_writable(self) -> list[str]: ...
|
|
|
|
def key_to_array(
|
|
self,
|
|
key: str,
|
|
start_datetime: Optional[DateTime] = None,
|
|
end_datetime: Optional[DateTime] = None,
|
|
interval: Optional[Duration] = None,
|
|
fill_method: Optional[str] = None,
|
|
dropna: Optional[bool] = True,
|
|
boundary: Literal["strict", "context"] = "context",
|
|
align_to_interval: bool = False,
|
|
) -> NDArray[Shape["*"], Any]: ...
|
|
|
|
# Database configuration
|
|
|
|
def db_initial_time_window(self) -> Optional[Duration]:
    """Provide the default time window for the initial database load.

    The window is the symmetric time span around a target datetime that is
    loaded from the database when the caller gives no explicit search window.
    It is a loading hint only; callers may widen it if the initial range
    contains no records.

    The base implementation applies no constraint. Subclasses may override
    this method to supply a domain-specific default.

    Returns:
        The initial loading time window as a Duration, or ``None`` when no
        initial window constraint shall be applied.
    """
    # No domain knowledge at this level - leave the window unconstrained.
    return None
|
|
|
|
# -----------------------------------------------------
|
|
# Initialization
|
|
# -----------------------------------------------------
|
|
|
|
def _db_ensure_initialized(self) -> None:
    """Initialize DB runtime state.

    Sets up the in-memory bookkeeping (record index, load phase, dirty
    tracking) once, and - as soon as the database is enabled - loads or
    creates the storage metadata of the namespace.

    Idempotent — safe to call multiple times.
    """
    if not getattr(self, "_db_initialized", None):
        # record datetime to record mapping for fast lookup
        self._db_record_index: dict[DatabaseTimestamp, T_Record] = {}
        self._db_sorted_timestamps: list[DatabaseTimestamp] = []

        # Loading phase tracking
        self._db_load_phase: DatabaseRecordProtocolLoadPhase = (
            DatabaseRecordProtocolLoadPhase.NONE
        )
        # Range of timestamps that was already queried from database storage during load
        self._db_loaded_range: Optional[tuple[DatabaseTimestampType, DatabaseTimestampType]] = (
            None
        )

        # Dirty tracking
        # - dirty records since last save
        self._db_dirty_timestamps: set[DatabaseTimestamp] = set()
        # - records added since last save
        self._db_new_timestamps: set[DatabaseTimestamp] = set()
        # - deleted records since last save
        self._db_deleted_timestamps: set[DatabaseTimestamp] = set()

        self._db_version: int = 1

        # Storage
        self._db_metadata: Optional[dict] = None
        self._db_storage_initialized: bool = False

        self._db_initialized: bool = True

    # Storage setup is retried on every call until the database is enabled.
    if not self._db_storage_initialized and self.db_enabled:
        # Metadata
        existing_metadata = self._db_load_metadata()
        if existing_metadata:
            self._db_metadata = existing_metadata
        else:
            # First use of this namespace - create and persist fresh metadata.
            self._db_metadata = {
                "version": self._db_version,
                "created": to_datetime(as_string=True),
                "provider_id": getattr(self, "provider_id", lambda: "unknown")(),
                "compression": self.database.compression,
                "backend": self.database.__class__.__name__,
            }
            self._db_save_metadata(self._db_metadata)

        logger.info(
            f"Initialized {self.database.__class__.__name__}:{self.db_namespace()} storage at "
            f"{self.database.storage_path} "
            f"(autosave_interval_sec={self.config.database.autosave_interval_sec})"
        )

        self._db_storage_initialized = True
|
|
|
|
def model_post_init(self, __context: Any) -> None:
    """Initialize DB state attributes immediately after Pydantic construction.

    Args:
        __context: Context object, passed on unchanged to the next
            ``model_post_init`` in the class hierarchy.
    """
    # Always call super() first — other mixins may also define model_post_init
    super().model_post_init(__context)  # type: ignore[misc]
    self._db_ensure_initialized()
|
|
|
|
# -----------------------------------------------------
|
|
# Helpers
|
|
# -----------------------------------------------------
|
|
|
|
def _db_key_from_timestamp(self, dt: DatabaseTimestamp) -> bytes:
    """Convert database timestamp to a sortable database backend key.

    The key is the UTF-8 encoding of the timestamp string.
    """
    return dt.encode("utf-8")
|
|
|
|
def _db_key_to_timestamp(self, dbkey: bytes) -> DatabaseTimestamp:
    """Convert database backend key back to database timestamp.

    The key is decoded as UTF-8 and wrapped without further validation.
    """
    return DatabaseTimestamp(dbkey.decode("utf-8"))
|
|
|
|
def _db_timestamp_after(self, timestamp: DatabaseTimestamp) -> DatabaseTimestamp:
    """Return the first possible database timestamp following ``timestamp``.

    The smallest step used here is one second: the timestamp is converted to
    a datetime, advanced by one second and converted back.
    """
    one_second_later = DatabaseTimestamp.to_datetime(timestamp).add(seconds=1)
    return DatabaseTimestamp.from_datetime(one_second_later)
|
|
|
|
def db_previous_timestamp(
    self,
    timestamp: DatabaseTimestamp,
) -> Optional[DatabaseTimestamp]:
    """Find the largest timestamp < given timestamp.

    Search memory-first, then fallback to database if necessary.

    Args:
        timestamp: Reference timestamp; only strictly earlier timestamps match.

    Returns:
        The closest earlier timestamp, or None if there is none.
    """
    self._db_ensure_initialized()

    # Step 1: Memory-first search
    if self._db_sorted_timestamps:
        idx = bisect.bisect_left(self._db_sorted_timestamps, timestamp)
        if idx > 0:
            return self._db_sorted_timestamps[idx - 1]

    # Step 2: Check if DB might contain older keys
    if not self.db_enabled:
        return None

    namespace = self.db_namespace()
    db_min_key, _ = self.database.get_key_range(namespace)
    if db_min_key is None:
        return None

    if timestamp <= self._db_key_to_timestamp(db_min_key):
        return None

    # Step 3: Search the database below `timestamp`.
    # Memory holds nothing earlier than `timestamp` (step 1), so the match -
    # if any - is a record that was not loaded, typically one *before* the
    # loaded range. The search therefore must not be limited to the loaded
    # range. Walking in descending order lets us stop at the first hit
    # instead of scanning every older record.
    for key, _ in self.database.iterate_records(
        start_key=None,
        end_key=self._db_key_from_timestamp(timestamp),
        namespace=namespace,
        reverse=True,
    ):
        ts = self._db_key_to_timestamp(key)
        # Deleted in memory but not yet removed from storage.
        if ts in self._db_deleted_timestamps:
            continue
        return ts  # closest one before `timestamp`

    return None
|
|
|
|
def db_next_timestamp(
    self,
    timestamp: DatabaseTimestamp,
) -> Optional[DatabaseTimestamp]:
    """Find the smallest timestamp > given timestamp.

    Search memory-first, then fallback to database if necessary.

    Args:
        timestamp: Reference timestamp; only strictly later timestamps match.

    Returns:
        The closest later timestamp, or None if there is none.
    """
    self._db_ensure_initialized()

    # Step 1: Memory-first search
    if self._db_sorted_timestamps:
        idx = bisect.bisect_right(self._db_sorted_timestamps, timestamp)
        if idx < len(self._db_sorted_timestamps):
            return self._db_sorted_timestamps[idx]

    # Step 2: Check if DB might contain newer keys
    if not self.db_enabled:
        return None

    namespace = self.db_namespace()
    _, db_max_key = self.database.get_key_range(namespace)
    if db_max_key is None:
        return None

    if timestamp >= self._db_key_to_timestamp(db_max_key):
        return None

    # Step 3: Search the database from `timestamp` upwards.
    # The search always starts at `timestamp` itself: jumping ahead to the end
    # of the loaded range would skip records that lie between `timestamp` and
    # the start of the loaded range when `timestamp` is before that range.
    # Inside the loaded range the only records memory does not hold are
    # deleted ones, which are skipped below.
    timestamp_key = self._db_key_from_timestamp(timestamp)

    for key, _ in self.database.iterate_records(
        start_key=timestamp_key,
        end_key=None,
        namespace=namespace,
    ):
        if key == timestamp_key:
            # start_key is inclusive, but we want strictly later records
            continue

        ts = self._db_key_to_timestamp(key)

        # Check for deleted (only necessary for database - memory already removed)
        if ts in self._db_deleted_timestamps:
            continue

        return ts  # first valid one

    return None
|
|
|
|
def _db_serialize_record(self, record: T_Record) -> bytes:
|
|
"""Serialize a DataRecord to bytes."""
|
|
if self.database is None:
|
|
raise ValueError("Database not defined.")
|
|
data = pickle.dumps(record.model_dump(), protocol=pickle.HIGHEST_PROTOCOL)
|
|
return self.database.serialize_data(data)
|
|
|
|
def _db_deserialize_record(self, data: bytes) -> T_Record:
|
|
"""Deserialize bytes to a DataRecord."""
|
|
if self.database is None:
|
|
raise ValueError("Database not defined.")
|
|
data = self.database.deserialize_data(data)
|
|
record_data = pickle.loads(data) # noqa: S301
|
|
return self.record_class()(**record_data)
|
|
|
|
def _db_save_metadata(self, metadata: dict) -> None:
|
|
"""Save metadata to database."""
|
|
if not self.db_enabled:
|
|
return
|
|
|
|
key = DATABASE_METADATA_KEY
|
|
value = pickle.dumps(metadata)
|
|
self.database.set_metadata(value, namespace=self.db_namespace())
|
|
|
|
def _db_load_metadata(self) -> Optional[dict]:
|
|
"""Load metadata from database."""
|
|
if not self.db_enabled:
|
|
return None
|
|
|
|
try:
|
|
value = self.database.get_metadata(namespace=self.db_namespace())
|
|
return pickle.loads(value) # noqa: S301
|
|
except Exception:
|
|
logger.debug("Can not load metadata.")
|
|
return None
|
|
|
|
def _db_reset_state(self) -> None:
    """Drop the in-memory records and force re-initialization of the DB state.

    Clears the record list, the loaded range and the load phase, and removes
    the initialization marker so that the next ``_db_ensure_initialized()``
    call rebuilds the remaining runtime state.
    """
    self.records = []
    self._db_loaded_range = None
    self._db_load_phase = DatabaseRecordProtocolLoadPhase.NONE
    try:
        del self._db_initialized
    except AttributeError:
        # Marker was never set - nothing to remove.
        logger.debug("_db_reset_state called on uninitialized sequence")
|
|
|
|
def _db_clone_empty(self: T_DatabaseRecordProtocol) -> T_DatabaseRecordProtocol:
|
|
"""Create an empty internal clone for database operations.
|
|
|
|
The clone shares configuration and database access implicitly via
|
|
ConfigMixin and DatabaseMixin, but contains no in-memory records
|
|
or loaded-range state.
|
|
|
|
Internal helper for database workflows only.
|
|
"""
|
|
clone = self.model_copy(deep=True)
|
|
clone._db_reset_state()
|
|
|
|
return clone
|
|
|
|
def _search_window(
    self,
    center_timestamp: Optional[DatabaseTimestampType],
    time_window: DatabaseTimeWindowType,
) -> tuple[DatabaseTimestampType, DatabaseTimestampType]:
    """Build the search interval that lies symmetrically around a center.

    The result is always usable as database boundaries.

    Args:
        center_timestamp: Middle of the interval. The current UTC time is
            used if None. Unbounded timestamp sentinels are not allowed.
        time_window: Full width of the interval; one half is placed before
            and one half after the center.
            - None: no limit.
            - UNBOUND_WINDOW: no limit.
            - Duration: limited, symmetric interval.

    Returns:
        A tuple (start, end) to be read as half-open interval: either two
        concrete timestamps or (UNBOUND_START, UNBOUND_END).

    Raises:
        TypeError: If center_timestamp is an unbounded timestamp sentinel
            (only checked for a limited time window).
        ValueError: If time_window is a negative Duration.
    """
    # No limit requested → whole database range
    unlimited = time_window is None or isinstance(time_window, _DatabaseTimeWindowUnbound)
    if unlimited:
        return UNBOUND_START, UNBOUND_END

    if isinstance(center_timestamp, _DatabaseTimestampUnbound):
        raise TypeError("center_timestamp cannot be of unbounded timestamp type.")

    # Middle of the interval
    center = (
        to_datetime().in_timezone("UTC")
        if center_timestamp is None
        else DatabaseTimestamp.to_datetime(center_timestamp)
    )

    duration = to_duration(time_window)
    if duration.total_seconds() < 0:
        raise ValueError("time_window must be non-negative")

    # Halve the duration itself (not its seconds) to stay clear of float rounding
    half = duration / 2

    window_start = DatabaseTimestamp.from_datetime(center - half)
    window_end = DatabaseTimestamp.from_datetime(center + half)
    return window_start, window_end
|
|
|
|
def _db_range_covered(
    self,
    start_timestamp: DatabaseTimestampType,
    end_timestamp: DatabaseTimestampType,
) -> bool:
    """Return True if [start_timestamp, end_timestamp) is fully covered.

    Args:
        start_timestamp: Inclusive lower boundary of the requested range.
        end_timestamp: Exclusive upper boundary of the requested range.

    Returns:
        True if the requested half-open interval is completely contained
        within the loaded database range.

    Raises:
        TypeError: If start_timestamp or end_timestamp is None or not a
            (string based) timestamp.
    """
    if start_timestamp is None or end_timestamp is None:
        raise TypeError(
            "start_timestamp and end_timestamp must not be None. "
            "Use UNBOUND_START / UNBOUND_END instead."
        )

    if not isinstance(start_timestamp, (str, _DatabaseTimestampUnbound)):
        raise TypeError(
            f"Invalid start_timestamp type: {type(start_timestamp)}. "
            "Must be DatabaseTimestamp or unbound sentinel."
        )

    if not isinstance(end_timestamp, (str, _DatabaseTimestampUnbound)):
        raise TypeError(
            f"Invalid end_timestamp type: {type(end_timestamp)}. "
            "Must be DatabaseTimestamp or unbound sentinel."
        )

    if self._db_loaded_range is None:
        return False

    loaded_start, loaded_end = self._db_loaded_range

    if loaded_start is None or loaded_end is None:
        return False

    def at_or_before(lower: DatabaseTimestampType, upper: DatabaseTimestampType) -> bool:
        # The sentinels are strings too. With a concrete timestamp on the left
        # the comparison could be answered by plain string ordering on the
        # sentinel's text, so let the sentinel decide whenever one is involved.
        if isinstance(upper, _DatabaseTimestampUnbound):
            return upper >= lower
        return lower <= upper

    return at_or_before(loaded_start, start_timestamp) and at_or_before(
        end_timestamp, loaded_end
    )
|
|
|
|
def _db_load_initial_window(
    self,
    center_timestamp: Optional[DatabaseTimestampType] = None,
) -> None:
    """Load the first lazy-loading window of records from the database.

    May only be used while the load phase is ``NONE``. The window is the
    interval returned by ``self._search_window()`` for ``center_timestamp``
    and the configured ``initial_load_window_h``; when that setting is None
    the unbound window is requested instead.

    The interval that was queried is tracked in ``self._db_loaded_range``
    (maintained by ``db_load_records()``). It describes which part of the
    database has been queried, not that memory holds continuous records.

    Args:
        center_timestamp (DatabaseTimestampType): Reference time the window
            is built around. Passed on unchanged to ``_search_window()``.

    Raises:
        RuntimeError: If the load phase is not ``NONE``.

    Side Effects:

        * Loads records from persistent storage into memory.
        * ``self._db_loaded_range`` is updated by ``db_load_records()``.
        * Sets ``self._db_load_phase`` to ``INITIAL``.

    Notes:
        * Does nothing when the database is not enabled.
        * An empty query result still moves the load phase to ``INITIAL``.
    """
    if not self.db_enabled:
        return

    # Redundant guard - callers are expected to check the phase themselves
    if self._db_load_phase is not DatabaseRecordProtocolLoadPhase.NONE:
        raise RuntimeError(
            "_db_load_initial_window() may only be called when load phase is NONE."
        )

    configured_hours = self.config.database.initial_load_window_h
    if configured_hours is None:
        window = UNBOUND_WINDOW
    else:
        window = to_duration(configured_hours * 3600)

    window_start, window_end = self._search_window(center_timestamp, window)
    self.db_load_records(window_start, window_end)

    self._db_load_phase = DatabaseRecordProtocolLoadPhase.INITIAL
|
|
|
|
def _db_load_full(self) -> int:
    """Load all records from the database into memory.

    Calls ``db_load_records()`` without boundaries and afterwards marks the
    load phase as ``FULL``. Records already present in memory are kept
    (``db_load_records()`` skips them).

    **State transitions:**

    * Allowed from the ``NONE`` phase (nothing loaded yet) and from the
      ``INITIAL`` phase (partial window loaded).
    * Any other phase, including ``FULL``, raises ``RuntimeError``.

    Returns:
        int: Number of records loaded from the database during this call,
        or 0 when the database is not enabled.

    Raises:
        RuntimeError: If the load phase is neither ``NONE`` nor ``INITIAL``.
    """
    if not self.db_enabled:
        return 0

    allowed_phases = (
        DatabaseRecordProtocolLoadPhase.NONE,
        DatabaseRecordProtocolLoadPhase.INITIAL,
    )
    if self._db_load_phase not in allowed_phases:
        raise RuntimeError(
            "_db_load_full() may only be called when load phase is NONE or INITIAL."
        )

    # Unbounded load; db_load_records() also updates _db_loaded_range
    count = self.db_load_records()

    self._db_load_phase = DatabaseRecordProtocolLoadPhase.FULL
    return count
|
|
|
|
def _extend_boundaries(
    self,
    start_timestamp: DatabaseTimestampType,
    end_timestamp: DatabaseTimestampType,
) -> tuple[DatabaseTimestampType, DatabaseTimestampType]:
    """Widen a requested range to the nearest stored neighbour records.

    A boundary is only widened when it is bounded, memory already holds
    records, and the boundary lies outside the in-memory span (before the
    first or after the last in-memory timestamp). Tombstoned timestamps
    (``self._db_deleted_timestamps``) and the namespace metadata entry are
    ignored when looking for neighbours.

    Args:
        start_timestamp: Requested start boundary (timestamp or unbound sentinel).
        end_timestamp: Requested end boundary (timestamp or unbound sentinel).

    Returns:
        (new_start, new_end) timestamps covering the requested range including
        the neighbours that were found. A boundary is returned unchanged when
        it was not widened.

    Raises:
        RuntimeError: If a boundary is None.
    """
    if start_timestamp is None or end_timestamp is None:
        # Make mypy happy
        raise RuntimeError(f"timestamps shall be non None: {start_timestamp}, {end_timestamp}")

    new_start, new_end = start_timestamp, end_timestamp

    # Extend start
    if (
        not isinstance(start_timestamp, _DatabaseTimestampUnbound)
        and self._db_sorted_timestamps
        and start_timestamp < self._db_sorted_timestamps[0]
    ):
        # There may be earlier DB records.
        # Reverse iterate to get the nearest smaller key. The open side is
        # passed as None, the same convention db_load_records() uses for
        # unbound boundaries.
        for key, _ in self.database.iterate_records(
            start_key=None,
            end_key=self._db_key_from_timestamp(start_timestamp),
            namespace=self.db_namespace(),
            reverse=True,
        ):
            if key == DATABASE_METADATA_KEY:
                # Not a record - db_load_records() skips this entry as well
                continue

            ts = self._db_key_to_timestamp(key)

            if ts in self._db_deleted_timestamps:
                continue

            if ts < start_timestamp:
                new_start = ts
                break  # first valid record is the nearest

    # Extend end
    if (
        not isinstance(end_timestamp, _DatabaseTimestampUnbound)
        and self._db_sorted_timestamps
        and end_timestamp > self._db_sorted_timestamps[-1]
    ):
        # There may be later DB records
        for key, _ in self.database.iterate_records(
            start_key=self._db_key_from_timestamp(end_timestamp),
            end_key=None,
            namespace=self.db_namespace(),
        ):
            if key == DATABASE_METADATA_KEY:
                # Not a record - db_load_records() skips this entry as well
                continue

            ts = self._db_key_to_timestamp(key)

            if ts in self._db_deleted_timestamps:
                continue

            if ts >= end_timestamp:
                new_end = ts
                break  # first valid record is the nearest

    return new_start, new_end
|
|
|
|
def _db_ensure_loaded(
    self,
    start_timestamp: Optional[DatabaseTimestampType] = None,
    end_timestamp: Optional[DatabaseTimestampType] = None,
    *,
    center_timestamp: Optional[DatabaseTimestampType] = None,
) -> None:
    """Make sure the records of a timestamp range have been loaded into memory.

    Loading is lazy and progresses through the phases NONE -> INITIAL -> FULL:

    1. **NONE**: Nothing loaded yet.

       * Fully unbounded request: load an initial window around
         `center_timestamp`.
       * Otherwise: load the requested range and switch to INITIAL.

    2. **INITIAL**: A partial range is loaded.

       * Request already covered by the loaded range: nothing to do.
       * Fully unbounded request: escalate to FULL.
       * Otherwise: load the missing part to the left and/or right of the
         loaded range.

    3. **FULL**: Everything is loaded. Nothing to do.

    Before the phase handling a shortcut returns early when the in-memory
    timestamps already span the request (a record strictly before the start
    and a record at or after the end). Requests with an unbound side only
    take that shortcut in phase FULL.

    Args:
        start_timestamp (DatabaseTimestampType): Inclusive start of desired range.
            None means unbound.
        end_timestamp (DatabaseTimestampType): Exclusive end of desired range.
            None means unbound.
        center_timestamp (DatabaseTimestampType): Center for the initial window
            if nothing is loaded yet.

    Raises:
        RuntimeError: If the phase is INITIAL but no loaded range is recorded.

    Notes:
        * Only prepares memory for subsequent queries; does not return records.
        * Does nothing when the database is not enabled.
        * `center_timestamp` is only used in phase NONE; afterwards it is
          merely reported in a debug log message.
    """
    if not self.db_enabled:
        return

    # Normalize boundaries immediately (strict DB layer rule)
    if start_timestamp is None:
        start_timestamp = UNBOUND_START
    if end_timestamp is None:
        end_timestamp = UNBOUND_END

    start_unbound = start_timestamp is UNBOUND_START
    end_unbound = end_timestamp is UNBOUND_END
    phase = self._db_load_phase

    # Shortcut: memory already covers the extended range
    if self._db_sorted_timestamps:
        mem_start = self._db_sorted_timestamps[0]
        mem_end = self._db_sorted_timestamps[-1]

        # Case 1: bounded request
        if (
            not start_unbound
            and not end_unbound
            and mem_start < start_timestamp
            and mem_end >= end_timestamp
        ):
            return

        # Case 2: unbounded request only safe if FULL
        if (
            phase is DatabaseRecordProtocolLoadPhase.FULL
            and (start_unbound or mem_start < start_timestamp)
            and (end_unbound or mem_end >= end_timestamp)
        ):
            return

    # Phase 0: NOTHING LOADED
    if phase is DatabaseRecordProtocolLoadPhase.NONE:
        if start_unbound and end_unbound:
            # Sets _db_loaded_range and _db_load_phase
            self._db_load_initial_window(center_timestamp)
        else:
            self.db_load_records(start_timestamp, end_timestamp)
            self._db_load_phase = DatabaseRecordProtocolLoadPhase.INITIAL
        return

    if center_timestamp is not None:
        logger.debug(
            f"Center timestamp parameter '{center_timestamp}' given outside of load phase NONE"
        )

    # Phase 2: FULL - everything already loaded, nothing to do
    if phase is not DatabaseRecordProtocolLoadPhase.INITIAL:
        return

    # Phase 1: INITIAL WINDOW (PARTIAL)
    if self._db_loaded_range is None:
        # Should never happen
        raise RuntimeError("_db_loaded_range shall set when load phase is INITIAL")

    if self._db_range_covered(start_timestamp, end_timestamp):
        return  # already have it

    # Escalate to FULL if no range is specified
    if start_timestamp == UNBOUND_START and end_timestamp == UNBOUND_END:
        self._db_load_full()
        return

    current_start, current_end = self._db_loaded_range
    if current_start is None or current_end is None:
        raise RuntimeError(
            "_db_loaded_range shall not be set to (None, None) when load phase is INITIAL"
        )

    # Left expansion
    if start_timestamp < current_start:
        self.db_load_records(start_timestamp, current_start)

    # Right expansion
    if end_timestamp > current_end:
        self.db_load_records(current_end, end_timestamp)
|
|
|
|
# ---- derived class required interface ----
|
|
|
|
def db_keep_duration(self) -> Optional[Duration]:
    """Retention duration for database records.

    Used when removing old records from database to free space.

    The default is taken from the general database configuration
    (``keep_duration_h``, given in hours). Derived classes may override
    this method.

    Returns:
        Duration or None (keep forever).
    """
    keep_h = self.config.database.keep_duration_h
    # Configuration is in hours, to_duration() is fed with seconds
    return None if keep_h is None else to_duration(keep_h * 3600)
|
|
|
|
def db_namespace(self) -> str:
    """Return the database namespace of this record store.

    Must be implemented by the derived class.

    Raises:
        NotImplementedError: Always, unless overridden.
    """
    raise NotImplementedError()
|
|
|
|
# ---- public DB interface ----
|
|
|
|
@property
def db_enabled(self) -> bool:
    """Whether the underlying database is open."""
    is_open = self.database.is_open
    return is_open
|
|
|
|
def db_timestamp_range(
    self,
) -> tuple[Optional[DatabaseTimestamp], Optional[DatabaseTimestamp]]:
    """Get the timestamp range of the records.

    Combines the key range reported by the database storage with the
    timestamps of the records currently held in memory.

    Returns:
        (min, max) timestamps. When the database is not enabled or reports
        no key range, the range of the in-memory records is returned, which
        is (None, None) if memory is empty.
    """
    # Defensive call - model_post_init() may not have initialized metadata
    self._db_ensure_initialized()

    memory_min: Optional[DatabaseTimestamp] = None
    memory_max: Optional[DatabaseTimestamp] = None
    if self._db_sorted_timestamps:
        memory_min = self._db_sorted_timestamps[0]
        memory_max = self._db_sorted_timestamps[-1]

    if not self.db_enabled:
        return memory_min, memory_max

    db_min_key, db_max_key = self.database.get_key_range(self.db_namespace())
    if db_min_key is None or db_max_key is None:
        return memory_min, memory_max

    # Start from the storage range and widen it by the memory range
    range_min = self._db_key_to_timestamp(db_min_key)
    range_max = self._db_key_to_timestamp(db_max_key)

    if memory_min and memory_min < range_min:
        range_min = memory_min
    if memory_max and memory_max > range_max:
        range_max = memory_max

    return range_min, range_max
|
|
|
|
def db_generate_timestamps(
    self,
    start_timestamp: DatabaseTimestamp,
    values_count: int,
    interval: Optional[Duration] = None,
) -> Iterator[DatabaseTimestamp]:
    """Generate equally spaced database timestamps.

    Starting at ``start_timestamp`` each following timestamp is produced by
    adding a fixed number of seconds to the previous one, so the spacing in
    seconds stays constant.

    Args:
        start_timestamp (DatabaseTimestamp): First database timestamp to yield.
        values_count (int): Number of timestamps to generate.
        interval (Optional[Duration]): Fixed duration between timestamps.
            Defaults to 1 hour if not provided. The interval is truncated
            to whole seconds.

    Yields:
        DatabaseTimestamp: The generated database timestamps.

    Raises:
        ValueError: If values_count is negative. As this is a generator the
            error is raised when iteration starts, not when the method is
            called.
    """
    if values_count < 0:
        raise ValueError("values_count must be non-negative")

    step = Duration(hours=1) if interval is None else interval
    step_seconds = int(step.total_seconds())

    moment = DatabaseTimestamp.to_datetime(start_timestamp)
    for _ in range(values_count):
        yield DatabaseTimestamp.from_datetime(moment)
        moment = moment.add(seconds=step_seconds)
|
|
|
|
def db_get_record(
    self,
    target_timestamp: DatabaseTimestamp,
    *,
    time_window: DatabaseTimeWindowType = None,
) -> Optional[T_Record]:
    """Return the record at, or nearest to, a timestamp.

    The search strategy depends on ``time_window``:

    * None - exact match only.
    * UNBOUND_WINDOW - nearest record, no distance limit.
    * Duration - nearest record, accepted only if it lies within half of
      this duration from target_timestamp (symmetric window of this total
      width).

    The required range is loaded from the database before memory is searched.

    Args:
        target_timestamp: The timestamp to search for.
        time_window: Controls the search strategy (None, UNBOUND_WINDOW, Duration).

    Returns:
        Exact match, nearest record within the window, or None.
    """
    self._db_ensure_initialized()

    if time_window is None:
        # Exact match only — load the minimal range containing this point
        self._db_ensure_loaded(
            target_timestamp,
            self._db_timestamp_after(target_timestamp),
            center_timestamp=target_timestamp,
        )
        return self._db_record_index.get(target_timestamp, None)

    # Load the range that has to be searched
    window_start, window_end = self._search_window(target_timestamp, time_window)
    self._db_ensure_loaded(window_start, window_end, center_timestamp=target_timestamp)

    # An exact hit always wins
    exact = self._db_record_index.get(target_timestamp, None)
    if exact is not None:
        return exact

    # Nearest neighbour: the records right of and left of the insertion point
    position = bisect.bisect_left(self._db_sorted_timestamps, target_timestamp)
    neighbours = []
    if position < len(self._db_sorted_timestamps):
        neighbours.append(self.records[position])
    if position > 0:
        neighbours.append(self.records[position - 1])
    if not neighbours:
        return None

    target_datetime = DatabaseTimestamp.to_datetime(target_timestamp)

    def distance_seconds(candidate: T_Record) -> float:
        """Absolute distance of a record to the target in seconds."""
        return abs((candidate.date_time - target_datetime).total_seconds())

    nearest = min(neighbours, key=distance_seconds)

    # For bounded windows, enforce the distance constraint
    if not isinstance(time_window, _DatabaseTimeWindowUnbound):
        half_seconds = to_duration(time_window).total_seconds() / 2
        if distance_seconds(nearest) > half_seconds:
            return None

    return nearest
|
|
|
|
def db_insert_record(
    self,
    record: T_Record,
    *,
    mark_dirty: bool = True,
) -> None:
    """Insert a record into the in-memory store, keeping it sorted by time.

    The database range around the record's timestamp is loaded first so that
    a record already stored for this timestamp is detected as duplicate. A
    pending deletion (tombstone) for the timestamp is cleared.

    Args:
        record: Record to insert. Its ``date_time`` defines the timestamp.
        mark_dirty: If True the timestamp is registered as dirty and as new,
            so the record is written by the next save.

    Raises:
        ValueError: If a record with the same timestamp is already in memory.
    """
    # Defensive call - model_post_init() may not have initialized metadata
    self._db_ensure_initialized()

    # Database timestamp of the record
    db_record_date_time = DatabaseTimestamp.from_datetime(record.date_time)

    self._db_ensure_loaded(
        start_timestamp=db_record_date_time,
        end_timestamp=db_record_date_time,
    )

    # No duplicates allowed (memory only check)
    if db_record_date_time in self._db_record_index:
        raise ValueError(f"Duplicate timestamp {record.date_time} -> {db_record_date_time}")

    # Re-inserting a deleted timestamp clears its tombstone
    self._db_deleted_timestamps.discard(db_record_date_time)

    # Sorted insert - timestamps and records are kept in parallel
    position = bisect.bisect_left(self._db_sorted_timestamps, db_record_date_time)
    self._db_sorted_timestamps.insert(position, db_record_date_time)
    self.records.insert(position, record)
    self._db_record_index[db_record_date_time] = record

    if mark_dirty:
        self._db_dirty_timestamps.add(db_record_date_time)
        self._db_new_timestamps.add(db_record_date_time)
|
|
|
|
# -----------------------------------------------------
|
|
# Load (range)
|
|
# -----------------------------------------------------
|
|
|
|
def db_load_records(
    self,
    start_timestamp: Optional[DatabaseTimestampType] = None,
    end_timestamp: Optional[DatabaseTimestampType] = None,
) -> int:
    """Load records from database into memory.

    Database records are merged into the in-memory records:

    - Memory is authoritative: a record whose timestamp is already in
      memory is not replaced.
    - Timestamps marked as deleted are not loaded.
    - Sorted order is preserved.

    The requested range is first passed through ``_extend_boundaries()``,
    which may widen it to neighbouring records so that nearest-neighbour
    searches do not need additional database lookups.

    ``_db_loaded_range`` is updated to the union of its previous value and
    the range queried by this call.

    Args:
        start_timestamp: Load records from this timestamp (inclusive).
            None means unbound.
        end_timestamp: Load records until this timestamp (exclusive).
            None means unbound.

    Returns:
        Number of records loaded from database. 0 if the database is not
        enabled.

    Note:
        record.date_time shall be DateTime or None
    """
    # Defensive call - model_post_init() may not have initialized metadata
    self._db_ensure_initialized()

    if not self.db_enabled:
        return 0

    # Normalize boundaries immediately (strict DB layer rule)
    if start_timestamp is None:
        start_timestamp = UNBOUND_START
    if end_timestamp is None:
        end_timestamp = UNBOUND_END

    # Possibly widen the range to the neighbouring records
    query_start, query_end = self._extend_boundaries(start_timestamp, end_timestamp)

    # Unbound sides are passed to the backend as None
    start_key = (
        None
        if isinstance(query_start, _DatabaseTimestampUnbound)
        else self._db_key_from_timestamp(query_start)
    )
    end_key = (
        None
        if isinstance(query_end, _DatabaseTimestampUnbound)
        else self._db_key_from_timestamp(query_end)
    )

    namespace = self.db_namespace()
    loaded_count = 0

    # Iterate DB records (already sorted by key)
    for db_key, value in self.database.iterate_records(
        start_key=start_key,
        end_key=end_key,
        namespace=namespace,
    ):
        if db_key == DATABASE_METADATA_KEY:
            continue

        record = self._db_deserialize_record(value)
        db_record_date_time = DatabaseTimestamp.from_datetime(record.date_time)

        # Do not resurrect explicitly deleted records, and keep what memory
        # already holds (memory is authoritative)
        if (
            db_record_date_time in self._db_deleted_timestamps
            or db_record_date_time in self._db_record_index
        ):
            continue

        # Insert sorted
        # - do not call self.db_insert_record - may call db_load_records recursively
        # - see self.db_insert_record(record, mark_dirty=False)
        position = bisect.bisect_left(self._db_sorted_timestamps, db_record_date_time)
        self._db_sorted_timestamps.insert(position, db_record_date_time)
        self.records.insert(position, record)
        self._db_record_index[db_record_date_time] = record

        loaded_count += 1

    # Update the range of timestamps that was already queried from database storage
    if self._db_loaded_range is None:
        # First load - initialize
        self._db_loaded_range = query_start, query_end
    else:
        known_start, known_end = self._db_loaded_range
        merged_start = query_start if query_start < known_start else known_start
        merged_end = query_end if query_end > known_end else known_end
        self._db_loaded_range = merged_start, merged_end

    return loaded_count
|
|
|
|
# -----------------------------------------------------
|
|
# Delete (range)
|
|
# -----------------------------------------------------
|
|
|
|
def db_delete_records(
    self,
    start_timestamp: Optional[DatabaseTimestampType] = None,
    end_timestamp: Optional[DatabaseTimestampType] = None,
) -> int:
    """Delete the records of a timestamp range.

    The range is loaded from the database first. Matching records are
    removed from memory and their timestamps are remembered as deleted, so
    the deletion reaches storage with the next save. Pending dirty/new
    markers of deleted timestamps are dropped.

    Args:
        start_timestamp: Delete from this timestamp (inclusive). A falsy
            value (e.g. None) means no lower limit.
        end_timestamp: Delete until this timestamp (exclusive). A falsy
            value (e.g. None) means no upper limit.

    Returns:
        Number of timestamps selected for deletion.
    """
    # Defensive call - model_post_init() may not have initialized metadata
    self._db_ensure_initialized()

    # Make sure memory holds the records of the requested range
    self._db_ensure_loaded(
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
    )

    # Select first, mutate afterwards
    doomed: list[DatabaseTimestamp] = [
        ts
        for ts in self._db_sorted_timestamps
        if not (start_timestamp and ts < start_timestamp)
        and not (end_timestamp and ts >= end_timestamp)
    ]

    for ts in doomed:
        removed = self._db_record_index.pop(ts, None)
        if removed is not None:
            position = bisect.bisect_left(self._db_sorted_timestamps, ts)
            if (
                position < len(self._db_sorted_timestamps)
                and self._db_sorted_timestamps[position] == ts
            ):
                self._db_sorted_timestamps.pop(position)
            try:
                self.records.remove(removed)
            except Exception as ex:
                # Best effort - the record list may already lack the record
                logger.debug(f"Failed to remove record: {ex}")

        # Mark for physical deletion
        self._db_deleted_timestamps.add(ts)

        # If it was dirty (new record), cancel the insert instead
        self._db_dirty_timestamps.discard(ts)
        self._db_new_timestamps.discard(ts)

    return len(doomed)
|
|
|
|
# -----------------------------------------------------
|
|
# Iteration from DB (no duplicates)
|
|
# -----------------------------------------------------
|
|
|
|
def db_iterate_records(
    self,
    start_timestamp: Optional[DatabaseTimestampType] = None,
    end_timestamp: Optional[DatabaseTimestampType] = None,
) -> Iterator[T_Record]:
    """Iterate the records of a timestamp range.

    The range is loaded from storage into memory first; afterwards only the
    in-memory records are iterated. Iteration stops at the first record at
    or after ``end_timestamp``.

    Args:
        start_timestamp: Inclusive start. A falsy value (e.g. None) means
            no lower limit.
        end_timestamp: Exclusive end. A falsy value (e.g. None) means no
            upper limit.

    Yields:
        Records within the range whose timestamp is not marked as deleted.
    """
    # Defensive call - model_post_init() may not have initialized metadata
    self._db_ensure_initialized()

    # Ensure memory contains required range
    self._db_ensure_loaded(
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
    )

    for candidate in self.records:
        candidate_timestamp = DatabaseTimestamp.from_datetime(candidate.date_time)

        if start_timestamp and candidate_timestamp < start_timestamp:
            # Not yet in range
            continue

        if end_timestamp and candidate_timestamp >= end_timestamp:
            # Past the range - nothing more to yield
            break

        if candidate_timestamp not in self._db_deleted_timestamps:
            yield candidate
|
|
|
|
# -----------------------------------------------------
|
|
# Dirty tracking
|
|
# -----------------------------------------------------
|
|
|
|
def db_mark_dirty_record(self, record: T_Record) -> None:
    """Register a record as modified so the next save writes it.

    Args:
        record: The modified record. Its ``date_time`` identifies it.
    """
    # Defensive call - model_post_init() may not have initialized metadata
    self._db_ensure_initialized()

    self._db_dirty_timestamps.add(DatabaseTimestamp.from_datetime(record.date_time))
|
|
|
|
# -----------------------------------------------------
|
|
# Bulk save (flush dirty only)
|
|
# -----------------------------------------------------
|
|
|
|
def db_save_records(self) -> int:
    """Flush pending changes (dirty records and deletions) to the database.

    Dirty records are written first, deletions are applied last. Afterwards
    the dirty, new and deleted bookkeeping sets are cleared.

    Returns:
        Number of records written plus number of timestamps deleted.
        0 if the database is not enabled or nothing is pending.
    """
    # Defensive call - model_post_init() may not have initialized metadata
    self._db_ensure_initialized()

    if not self.db_enabled:
        return 0

    if not self._db_dirty_timestamps and not self._db_deleted_timestamps:
        return 0

    namespace = self.db_namespace()

    # safer order: saves first, deletes last

    # --- handle inserts/updates ---
    save_items = []
    for dt in self._db_dirty_timestamps:
        record = self._db_record_index.get(dt)
        # Compare against None: a record type that defines __len__/__bool__
        # may be falsy and must still be saved.
        if record is not None:
            key = self._db_key_from_timestamp(dt)
            value = self._db_serialize_record(record)
            save_items.append((key, value))
    saved_count = len(save_items)
    if saved_count:
        self.database.save_records(save_items, namespace=namespace)
    self._db_dirty_timestamps.clear()
    self._db_new_timestamps.clear()

    # --- handle deletions ---
    # Always defined, also when only dirty records were pending
    deleted_count = len(self._db_deleted_timestamps)
    if deleted_count:
        delete_keys = [self._db_key_from_timestamp(dt) for dt in self._db_deleted_timestamps]
        self.database.delete_records(delete_keys, namespace=namespace)
        self._db_deleted_timestamps.clear()

    return saved_count + deleted_count
|
|
|
|
def db_autosave(self) -> int:
    """Save pending changes; delegates to ``db_save_records()``.

    Returns:
        The value returned by ``db_save_records()``.
    """
    saved = self.db_save_records()
    return saved
|
|
|
|
def db_vacuum(
    self,
    keep_hours: Optional[int] = None,
    keep_timestamp: Optional[DatabaseTimestampType] = None,
) -> int:
    """Remove old records from database to free space.

    Semantics:

    - keep_hours is relative to the maximum timestamp of the records:
      cutoff = max - (keep_hours - 1) hours, and records with
      timestamp < cutoff are deleted. keep_hours <= 0 deletes all records.
    - keep_timestamp is an absolute cutoff; records with timestamp < cutoff
      are deleted (exclusive).
    - If both are given, keep_hours takes precedence.

    Uses self.db_keep_duration() if both keep_hours and keep_timestamp are
    None; its total length in whole hours is used as keep_hours.

    Args:
        keep_hours: Keep only records from the last N hours (relative to the data's max timestamp)
        keep_timestamp: Keep only records from this timestamp on (absolute cutoff)

    Returns:
        Number of records deleted
    """
    # Defensive call - model_post_init() may not have initialized metadata
    self._db_ensure_initialized()

    if keep_hours is None and keep_timestamp is None:
        keep_duration = self.db_keep_duration()
        if keep_duration is None:
            # No vacuum if all is None
            logger.info(
                f"Vacuum requested for database '{self.db_namespace()}' but keep limit is infinite."
            )
            return 0
        # Use the total length of the duration. The `hours` attribute of a
        # duration is only the hours component (remainder after full days),
        # which would turn e.g. 48 hours into 0 and vacuum everything.
        keep_hours = int(keep_duration.total_seconds() // 3600)

    if keep_hours is not None:
        _, db_max = self.db_timestamp_range()
        if db_max is None or isinstance(db_max, _DatabaseTimestampUnbound):
            # No records
            return 0  # nothing to delete
        if keep_hours <= 0:
            # Keep nothing
            db_cutoff_timestamp: DatabaseTimestampType = UNBOUND_END
        else:
            # cutoff = first record we want to keep; everything before is removed
            datetime_max: DateTime = DatabaseTimestamp.to_datetime(db_max)
            db_cutoff_timestamp = DatabaseTimestamp.from_datetime(
                datetime_max.subtract(hours=keep_hours - 1)
            )
    elif keep_timestamp is not None:
        db_cutoff_timestamp = keep_timestamp
    else:
        # Not reachable with the defaulting above; kept for the type checker
        raise ValueError("Must specify either keep_hours or keep_timestamp")

    # Delete records
    deleted_count = self.db_delete_records(end_timestamp=db_cutoff_timestamp)

    # Make the deletion permanent
    self.db_save_records()

    logger.info(
        f"Vacuumed {deleted_count} old records from database '{self.db_namespace()}' "
        f"(before {db_cutoff_timestamp})"
    )
    return deleted_count
|
|
|
|
def db_count_records(self) -> int:
    """Return the total logical number of records.

    Without a database, or once the database is fully loaded, the number of
    in-memory records is the answer. Otherwise the count is derived from
    the storage record count, plus new records not yet saved, minus
    deletions not yet saved.
    """
    # Defensive call - model_post_init() may not have initialized metadata
    self._db_ensure_initialized()

    # Memory is the complete view without database or when fully loaded
    if not self.db_enabled or self._db_load_phase is DatabaseRecordProtocolLoadPhase.FULL:
        return len(self.records)

    in_storage = self.database.count_records(namespace=self.db_namespace())
    unsaved_deletes = len(self._db_deleted_timestamps)
    unsaved_new = len(self._db_new_timestamps)

    return in_storage + unsaved_new - unsaved_deletes
|
|
|
|
def db_get_stats(self) -> dict:
    """Collect statistics about the database storage.

    Returns:
        Dictionary with statistics. Only ``{"enabled": False}`` when the
        database is not enabled. Otherwise general values, merged with the
        backend specific statistics, plus the timestamp range as strings.
    """
    if not self.db_enabled:
        return {"enabled": False}

    ns = self.db_namespace()
    backend = self.database

    stats = {
        "enabled": True,
        "backend": backend.__class__.__name__,
        "path": str(backend.storage_path),
        "memory_records": len(self.records),
        "compression_enabled": backend.compression,
        "keep_duration_h": self.config.database.keep_duration_h,
        "autosave_interval_sec": self.config.database.autosave_interval_sec,
        "total_records": backend.count_records(namespace=ns),
    }

    # Backend-specific values are merged on top of the general ones
    stats.update(backend.get_backend_stats(namespace=ns))

    range_min, range_max = self.db_timestamp_range()
    stats["timestamp_range"] = {"min": str(range_min), "max": str(range_max)}

    return stats
|
|
|
|
# ==================== Tiered Compaction ====================
|
|
|
|
def db_compact_tiers(self) -> list[tuple[Duration, Duration]]:
    """Compaction tiers as (age_threshold, target_interval) pairs.

    Records older than age_threshold are downsampled to target_interval.
    Tiers must be ordered from shortest to longest age threshold.

    Default policy:

    - older than 2 hours → 15 min resolution
    - older than 14 days → 1 hour resolution

    Return an empty list to disable compaction entirely.
    Override in derived classes for domain-specific behaviour.

    Example override to disable:

    .. code-block:: python

        def db_compact_tiers(self):
            return []

    Example override for price data (already at 15 min, skip first tier):

    .. code-block:: python

        def db_compact_tiers(self):
            return [
                (to_duration("2 weeks"), to_duration("1 hour")),
            ]
    """
    recent_tier = (to_duration("2 hours"), to_duration("15 minutes"))
    archive_tier = (to_duration("14 days"), to_duration("1 hour"))
    return [recent_tier, archive_tier]
|
|
|
|
# ------------------------------------------------------------------
|
|
# Compaction state helpers (stored in namespace metadata)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _db_get_compact_state(
    self,
    tier_interval: Duration,
) -> Optional[DatabaseTimestamp]:
    """Read the last compaction cutoff timestamp of a tier from the metadata.

    The tier is identified by its target interval in whole seconds.

    Args:
        tier_interval: The target interval that identifies this tier.

    Returns:
        The last cutoff DatabaseTimestamp, or None if no metadata is
        available or no cutoff is stored for this tier.
    """
    metadata = self._db_metadata
    if metadata is None:
        return None

    key = f"last_compact_cutoff_{int(tier_interval.total_seconds())}"
    cutoff_str = metadata.get(key)
    if not cutoff_str:
        return None
    return DatabaseTimestamp(cutoff_str)
|
|
|
|
def _db_set_compact_state(
    self,
    tier_interval: Duration,
    cutoff_ts: DatabaseTimestamp,
) -> None:
    """Store the last compaction cutoff timestamp of a tier in the metadata.

    The metadata dictionary is created if it does not exist yet and is
    saved after the update.

    Args:
        tier_interval: The target interval that identifies this tier.
        cutoff_ts: The cutoff timestamp to store (stored as string).
    """
    if self._db_metadata is None:
        self._db_metadata = {}

    # One entry per tier, named after the tier interval in whole seconds
    key = f"last_compact_cutoff_{int(tier_interval.total_seconds())}"
    self._db_metadata.update({key: str(cutoff_ts)})

    self._db_save_metadata(self._db_metadata)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Single-tier worker
|
|
# ------------------------------------------------------------------
|
|
|
|
def _db_compact_tier(
    self,
    age_threshold: Duration,
    target_interval: Duration,
) -> int:
    """Downsample records older than age_threshold to target_interval resolution.

    Only processes the window [last_compact_cutoff, new_cutoff) so repeated
    runs are cheap.

    Both window boundaries are floored to UTC epoch-aligned multiples of the
    target interval before processing:

    - ``window_start`` is floored to the interval boundary at or before the
      raw start, so the first bucket sits on a clock-round timestamp
      (e.g. :00/:15/:30/:45 for 15 min).
    - ``window_end`` (the new cutoff stored in metadata) is floored as well.
      Records between the floored cutoff and the raw cutoff
      (``newest - age_threshold``) are left untouched and are picked up by a
      later run once the floored cutoff has advanced past them.

    For a non-empty window one of three things happens:

    1. Sparse and aligned: the record count is at or below the number of
       buckets resampling could produce and every in-memory record in the
       window sits on an interval boundary. Nothing is rewritten; only the
       cutoff advances.
    2. Sparse but misaligned: the window is deleted and one record per
       floored bucket is reinserted. Values are copied, not interpolated;
       per key, the earliest non-None value in a bucket wins.
    3. Dense: every writable key is resampled through ``key_to_array`` and
       the window is replaced by the resampled records.

    Args:
        age_threshold: Records older than (newest - age_threshold) are compacted.
        target_interval: Target resolution after compaction.

    Returns:
        Number of original records deleted (before re-insertion of downsampled
        records). Returns 0 if skipped.
    """
    self._db_ensure_initialized()

    interval_sec = int(target_interval.total_seconds())
    if interval_sec <= 0:
        return 0

    def floor_epoch(epoch: int) -> int:
        """Floor an epoch second count to a multiple of the tier interval."""
        return (epoch // interval_sec) * interval_sec

    # ---- Determine raw new cutoff ------------------------------------
    # One range query supplies both ends; db_min is only validated further
    # down, after the cheap "already compacted" check.
    db_min, db_max = self.db_timestamp_range()
    if db_max is None or isinstance(db_max, _DatabaseTimestampUnbound):
        return 0

    newest_dt = DatabaseTimestamp.to_datetime(db_max)
    raw_cutoff_dt = newest_dt - age_threshold

    # Snap the new cutoff DOWN to the nearest interval boundary.
    # Records in [floored_cutoff, raw_cutoff) are left alone until a later
    # run: they are old enough but belong to a bucket that is not complete.
    new_cutoff_dt = DateTime.fromtimestamp(
        floor_epoch(int(raw_cutoff_dt.timestamp())), tz="UTC"
    )
    new_cutoff_ts = DatabaseTimestamp.from_datetime(new_cutoff_dt)

    # ---- Determine window start (incremental) ------------------------
    last_cutoff_ts = self._db_get_compact_state(target_interval)

    if last_cutoff_ts is not None and last_cutoff_ts >= new_cutoff_ts:
        logger.debug(
            f"Namespace '{self.db_namespace()}' tier {target_interval} already "
            f"compacted up to {new_cutoff_ts}, skipping."
        )
        return 0

    if db_min is None or isinstance(db_min, _DatabaseTimestampUnbound):
        return 0

    # Raw window start: last cutoff or absolute db minimum
    raw_window_start_ts = last_cutoff_ts if last_cutoff_ts is not None else db_min
    if raw_window_start_ts >= new_cutoff_ts:
        return 0

    raw_window_start_dt = DatabaseTimestamp.to_datetime(raw_window_start_ts)

    # Snap window_start DOWN to the nearest interval boundary so the first
    # bucket is clock-aligned. Cutoffs written by this method are already
    # floored to this interval, so in practice this moves the start only when
    # it comes from db_min (first run for the tier).
    window_start_epoch = floor_epoch(int(raw_window_start_dt.timestamp()))
    window_start_dt = DateTime.fromtimestamp(window_start_epoch, tz="UTC")
    window_start_ts = DatabaseTimestamp.from_datetime(window_start_dt)

    window_end_dt = new_cutoff_dt  # exclusive upper bound, already aligned
    window_end_ts = new_cutoff_ts

    # ---- Sparse-data guard -------------------------------------------
    existing_count = self.database.count_records(
        start_key=self._db_key_from_timestamp(window_start_ts),
        end_key=self._db_key_from_timestamp(window_end_ts),
        namespace=self.db_namespace(),
    )

    window_sec = int((window_end_dt - window_start_dt).total_seconds())
    # Maximum number of buckets resampling could produce (ceiling division)
    resampled_count = (window_sec + interval_sec - 1) // interval_sec

    if existing_count == 0:
        # Nothing in window, just advance the cutoff
        self._db_set_compact_state(target_interval, new_cutoff_ts)
        return 0

    if existing_count <= resampled_count:
        # Data is already sparse: check whether timestamps are aligned.
        # If every record already sits on an interval boundary, nothing to do.
        # If any are misaligned, snap them in place without resampling.
        records_in_window = [
            r
            for r in self.records
            if r.date_time is not None and window_start_dt <= r.date_time < window_end_dt
        ]
        all_aligned = all(
            int(r.date_time.timestamp()) % interval_sec == 0 for r in records_in_window
        )
        if all_aligned:
            logger.debug(
                f"Skipping tier {target_interval} compaction for "
                f"namespace '{self.db_namespace()}': "
                f"existing={existing_count} <= resampled={resampled_count} "
                f"and all timestamps already aligned "
                f"(window={window_start_dt}..{window_end_dt})"
            )
            self._db_set_compact_state(target_interval, new_cutoff_ts)
            return 0

        # ---- Sparse but misaligned: full window rewrite -----------------
        # Delete the entire window and reinsert floor-snapped records.
        # Deleting first guarantees no duplicate-timestamp ValueError on
        # reinsert, even when an already-aligned record sits at the same
        # epoch that a misaligned record floors to.
        logger.debug(
            f"Rewriting sparse window in namespace '{self.db_namespace()}' "
            f"tier {target_interval} (existing={existing_count}, "
            f"resampled={resampled_count})"
        )

        # Build snapped buckets from ALL records in window.
        # Process chronologically so the earliest record's values win when
        # multiple records floor to the same bucket.
        snapped_bucket: dict[int, dict[str, Any]] = {}
        for r in sorted(records_in_window, key=lambda x: x.date_time):
            snapped_epoch = floor_epoch(int(r.date_time.timestamp()))
            bucket = snapped_bucket.setdefault(snapped_epoch, {})
            for key in self.record_keys_writable:
                if key == "date_time":
                    continue
                try:
                    val = r[key]
                except KeyError:
                    continue
                if val is not None and bucket.get(key) is None:
                    bucket[key] = val

        # Delete entire window (aligned + misaligned)
        deleted = self.db_delete_records(
            start_timestamp=window_start_ts,
            end_timestamp=window_end_ts,
        )

        # Reinsert one record per bucket. Buckets without any value are
        # skipped, so count what is actually written for the log line.
        reinserted = 0
        for snapped_epoch, values in snapped_bucket.items():
            if not values:
                continue
            snapped_dt = DateTime.fromtimestamp(snapped_epoch, tz="UTC")
            record = self.record_class()(date_time=snapped_dt, **values)
            self.db_insert_record(record, mark_dirty=True)
            reinserted += 1

        self.db_save_records()
        self._db_set_compact_state(target_interval, new_cutoff_ts)
        logger.info(
            f"Rewrote sparse window in namespace '{self.db_namespace()}' "
            f"tier {target_interval}: deleted={deleted}, "
            f"reinserted={reinserted} buckets "
            f"(window={window_start_dt}..{window_end_dt})"
        )
        return deleted

    # ---- Full resampling path ----------------------------------------
    # key_to_array is called with boundary="context" and
    # align_to_interval=True. NOTE(review): the original author's note says
    # "context" lets key_to_array use one record on each side of the window
    # for interpolation and that the result is clipped back to
    # [window_start_dt, window_end_dt); confirm against key_to_array.
    compacted_data: dict[str, Any] = {}
    compacted_timestamps: list[DateTime] = []

    for key in self.record_keys_writable:
        if key == "date_time":
            continue
        try:
            array = self.key_to_array(
                key,
                start_datetime=window_start_dt,
                end_datetime=window_end_dt,
                interval=target_interval,
                fill_method="time",
                boundary="context",
                align_to_interval=True,
            )
        except (KeyError, TypeError, ValueError):
            continue  # non-numeric or missing key, skip silently

        if len(array) == 0:
            continue

        # Build the shared timestamp spine once from the first successful key.
        # window_start_dt is already floored to the interval, so the first
        # bucket is the window start itself; its length follows the array.
        if not compacted_timestamps:
            compacted_timestamps = [
                DateTime.fromtimestamp(window_start_epoch + i * interval_sec, tz="UTC")
                for i in range(len(array))
            ]

        # Guard against length mismatch between keys. The originals in the
        # window are deleted below, so make a dropped key visible in the log.
        if len(array) != len(compacted_timestamps):
            logger.warning(
                f"Dropping key '{key}' while compacting tier {target_interval} in "
                f"namespace '{self.db_namespace()}': resampled length {len(array)} "
                f"does not match expected length {len(compacted_timestamps)}"
            )
            continue
        compacted_data[key] = array

    if not compacted_data or not compacted_timestamps:
        # Nothing to write back, still advance cutoff
        self._db_set_compact_state(target_interval, new_cutoff_ts)
        return 0

    # ---- Delete originals, re-insert downsampled records -------------
    deleted = self.db_delete_records(
        start_timestamp=window_start_ts,
        end_timestamp=window_end_ts,
    )

    # Every array kept above has the spine's length, so indexing by i is safe.
    reinserted = 0
    for i, dt in enumerate(compacted_timestamps):
        values = {
            key: arr[i] for key, arr in compacted_data.items() if arr[i] is not None
        }
        if values:
            record = self.record_class()(date_time=dt, **values)
            self.db_insert_record(record, mark_dirty=True)
            reinserted += 1

    self.db_save_records()

    # Persist the aligned new cutoff for this tier
    self._db_set_compact_state(target_interval, new_cutoff_ts)

    logger.info(
        f"Compacted tier {target_interval}: deleted {deleted} records in "
        f"namespace '{self.db_namespace()}' "
        f"(window={window_start_dt}..{window_end_dt}, "
        f"reinserted={reinserted})"
    )
    return deleted
|
|
|
|
# ------------------------------------------------------------------
|
|
# Public entry point
|
|
# ------------------------------------------------------------------
|
|
|
|
def db_compact(
    self,
    compact_tiers: Optional[list[tuple[Duration, Duration]]] = None,
) -> int:
    """Apply tiered compaction policy to all records in this namespace.

    Tiers are processed coarsest-first (longest age threshold first) to
    avoid compacting fine-grained data that an inner tier would immediately
    re-compact anyway.

    Args:
        compact_tiers: Override tiers for this call. If None, uses
            db_compact_tiers(). Each entry is (age_threshold, target_interval).
            Entries are conventionally ordered shortest to longest age
            threshold, but any order is accepted; they are sorted here.

    Returns:
        Total number of original records deleted across all tiers.
    """
    if compact_tiers is None:
        compact_tiers = self.db_compact_tiers()

    if not compact_tiers:
        return 0

    # Sort by age threshold, longest first, instead of merely reversing the
    # list: reversing is only coarsest-first when the caller's list is already
    # sorted ascending, which an override is not guaranteed to be.
    coarsest_first = sorted(compact_tiers, key=lambda tier: tier[0], reverse=True)

    return sum(
        self._db_compact_tier(age_threshold, target_interval)
        for age_threshold, target_interval in coarsest_first
    )
|