Merge pull request 'feat/minteg' (#39) from feat/minteg into develop
All checks were successful
Pytest / pytest (3.12) (push) Successful in 1m17s
Mypy / mypy (push) Successful in 1m9s
Pytest / pytest (3.13) (push) Successful in 1m9s
Pytest / pytest (3.14) (push) Successful in 1m12s
Ruff / ruff (push) Successful in 59s

Reviewed-on: #39
This commit is contained in:
Justus Kuhlmann 2026-05-06 09:37:35 +02:00
commit 3aba39fd9d
2 changed files with 153 additions and 6 deletions

View file

@ -108,7 +108,7 @@ def find(
), ),
) -> None: ) -> None:
""" """
Find a record in the backlog at hand. Through specifying it's ensemble and the measured correlator. Find a record in the given backlog.
""" """
results = find_record(path, ensemble, corr, code) results = find_record(path, ensemble, corr, code)
if results.empty: if results.empty:
@ -147,6 +147,9 @@ def check(path: Path = typer.Option(
"-d", "-d",
), ),
) -> None: ) -> None:
"""
Check the integrity of the repository.
"""
full_integrity_check(path) full_integrity_check(path)

View file

@ -1,15 +1,34 @@
import datetime as dt import datetime as dt
from pathlib import Path from pathlib import Path
from .tools import get_db_file from .tools import get_db_file, CONFIG_FILENAME
import pandas as pd import pandas as pd
import sqlite3 import sqlite3
from .tracker import get from .tracker import get
import pyerrors.input.json as pj import pyerrors.input.json as pj
import os
from configparser import ConfigParser
from typing import Any from typing import Any
path_opts = ['db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path']
def has_valid_times(result: pd.Series) -> bool: def has_valid_times(result: pd.Series) -> bool:
"""
Check, whether the result at hand has time-stamps that are sensible:
A record is created first, then updated, with both times lying in the past.
Parameters
----------
result: pd.Series
The result to check
Returns
-------
b: bool
True, if the timestamps make sense.
"""
# we expect created_at <= updated_at <= now # we expect created_at <= updated_at <= now
created_at = dt.datetime.fromisoformat(result['created_at']) created_at = dt.datetime.fromisoformat(result['created_at'])
updated_at = dt.datetime.fromisoformat(result['updated_at']) updated_at = dt.datetime.fromisoformat(result['updated_at'])
@ -20,15 +39,41 @@ def has_valid_times(result: pd.Series) -> bool:
return True return True
def are_keys_unique(db: Path, table: str, col: str) -> bool: def are_keys_unique(db: Path, table: str, col: str) -> bool:
"""
Check whether the strings listed in a column of a given table are unique.
Parameters
----------
db: Path
The database to check.
table: str
The table to check.
col: str
The column to be checked for uniqueness.
Returns
-------
b: bool
True, if the strings are unique.
"""
conn = sqlite3.connect(db) conn = sqlite3.connect(db)
c = conn.cursor() c = conn.cursor()
c.execute(f"SELECT COUNT( DISTINCT CAST(path AS nvarchar(4000))), COUNT({col}) FROM {table};") c.execute(f"SELECT COUNT( DISTINCT CAST({col} AS nvarchar(4000))), COUNT({col}) FROM {table};")
results = c.fetchall()[0] results = c.fetchall()[0]
conn.close() conn.close()
return bool(results[0] == results[1]) return bool(results[0] == results[1])
def check_db_integrity(path: Path) -> None: def check_db_integrity(path: Path) -> None:
"""
Check integrity of the database by checking the uniqueness of the record keys used to load the records
and ensuring that the timestamps of each record are sensible. Throws an error, if issues are detected.
Parameters
----------
path: Path
Path to the backlog-library to check.
"""
db = get_db_file(path) db = get_db_file(path)
if not are_keys_unique(path / db, 'backlogs', 'path'): if not are_keys_unique(path / db, 'backlogs', 'path'):
@ -41,11 +86,21 @@ def check_db_integrity(path: Path) -> None:
for _, result in results.iterrows(): for _, result in results.iterrows():
if not has_valid_times(result): if not has_valid_times(result):
raise ValueError(f"Result with id {result[id]} has wrong time signatures.") raise ValueError(f"Result with id {result[id]} has wrong time signatures.")
print("DB:\t")
return return
def _check_db2paths(path: Path, meas_paths: list[str]) -> None: def _check_db2paths(path: Path, meas_paths: list[str]) -> None:
"""
Check whether, for each record given by meas_paths, we can find the data in the file as we expect.
Also check whether there are unreachable records in the files. If either of the issues arises, throws an error.
Parameters
----------
path: Path
Path to the backlog-library to check.
meas_paths: list[str]
List of measurement paths to check.
"""
needed_data: dict[str, list[str]] = {} needed_data: dict[str, list[str]] = {}
for mpath in meas_paths: for mpath in meas_paths:
file = mpath.split("::")[0] file = mpath.split("::")[0]
@ -67,11 +122,19 @@ def _check_db2paths(path: Path, meas_paths: list[str]) -> None:
for key in needed_data[file]: for key in needed_data[file]:
if key not in filedict.keys(): if key not in filedict.keys():
raise ValueError(f"Did not find data for key {key} that should be in file {file}.") raise ValueError(f"Did not find data for key {key} that should be in file {file}.")
print("Links:\t")
return return
def check_db_file_links(path: Path) -> None: def check_db_file_links(path: Path) -> None:
"""
Check whether, for each record in the given correlator library, we can find the data in the file as we expect.
Also check whether there are unreachable records in the files. If either of the issues arises, throws an error.
Parameters
----------
path: Path
Path to the backlog-library to check.
"""
db = get_db_file(path) db = get_db_file(path)
search_expr = "SELECT path FROM 'backlogs'" search_expr = "SELECT path FROM 'backlogs'"
conn = sqlite3.connect(path / db) conn = sqlite3.connect(path / db)
@ -79,9 +142,90 @@ def check_db_file_links(path: Path) -> None:
_check_db2paths(path, list(results)) _check_db2paths(path, list(results))
def check_path_and_config(path: Path) -> None:
    """
    Check whether the given path exists and the configuration file can be found.

    Parameters
    ----------
    path: Path
        Path to the backlog-library to check.

    Raises
    ------
    FileNotFoundError
        If the library path or its configuration file does not exist.
    """
    # Use pathlib directly instead of os.path, consistent with the Path-based API.
    if not path.exists():
        raise FileNotFoundError(f"Corrlib path {path} does not exist.")
    config_path = path / CONFIG_FILENAME
    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file {config_path} not found.")
def check_config_validity(path: Path) -> None:
    """
    Check whether the configuration file of the given corrlib-dataset path is valid.

    A valid configuration contains a 'core' section with the options
    'version', 'tracker' and 'cached', and a 'paths' section with all
    options listed in ``path_opts``.

    Parameters
    ----------
    path: Path
        Path to the backlog-library to check.

    Raises
    ------
    FileNotFoundError
        If the configuration file is missing.
    ValueError
        If a required section or option is missing.
    """
    config = ConfigParser()
    config_path = path / CONFIG_FILENAME
    if os.path.exists(config_path):
        config.read(config_path)
    else:
        raise FileNotFoundError("Configuration file not found.")
    # A completely absent section used to be silently accepted; an
    # integrity check has to reject that as well as missing options.
    if not config.has_section('core'):
        raise ValueError("The 'core' section is missing from the configuration file.")
    core_opts = ['version', 'tracker', 'cached']
    if not all(config.has_option('core', opt) for opt in core_opts):
        raise ValueError("One of the options in the 'core' section ('version', 'tracker', 'cached') is missing.")
    if not config.has_section('paths'):
        raise ValueError("The 'paths' section is missing from the configuration file.")
    if not all(config.has_option('paths', opt) for opt in path_opts):
        raise ValueError("One of the options in the 'path' section ('db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path') is missing.")
def check_paths(path: Path) -> None:
    """
    Check whether all paths demanded by the 'paths' section of the configuration-file exist.

    Parameters
    ----------
    path: Path
        Path to the backlog-library to check.

    Raises
    ------
    FileNotFoundError
        If the configuration file or any configured path is absent.
    """
    parser = ConfigParser()
    config_file = path / CONFIG_FILENAME
    # Guard clause: without a config file there is nothing to verify.
    if not os.path.exists(config_file):
        raise FileNotFoundError("Configuration file not found.")
    parser.read(config_file)
    every_path_present = all(
        os.path.exists(path / parser.get('paths', opt)) for opt in path_opts
    )
    if not every_path_present:
        raise FileNotFoundError("One of the paths specified in the configuration file is not present.")
def full_integrity_check(path: Path) -> None: def full_integrity_check(path: Path) -> None:
"""
Aggregate all checks for easy validation of the backlog-library.
Parameters
----------
path: Path
Path to the backlog-library to check.
"""
print("Run full integrity check...")
check_path_and_config(path)
print("(1/5) Path and config-file exist: ✅")
check_config_validity(path)
print("(2/5) Configuration is valid: ✅")
check_paths(path)
print("(3/5) Needed paths exist: ✅")
check_db_integrity(path) check_db_integrity(path)
print("(4/5) Database is sane: ✅")
check_db_file_links(path) check_db_file_links(path)
print("Full:\t") print("(5/5) DB2File and File2DB-links are sound: ✅")
print("Full integrity check: ✅")