diff --git a/corrlib/cli.py b/corrlib/cli.py index 0e6d96a..bdee36e 100644 --- a/corrlib/cli.py +++ b/corrlib/cli.py @@ -108,7 +108,7 @@ def find( ), ) -> None: """ - Find a record in the backlog at hand. Through specifying it's ensemble and the measured correlator. + Find a record in the given backlog. """ results = find_record(path, ensemble, corr, code) if results.empty: @@ -147,6 +147,9 @@ def check(path: Path = typer.Option( "-d", ), ) -> None: + """ + Check the integrity of the repository. + """ full_integrity_check(path) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index 5f80aa3..f660dfe 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -1,15 +1,34 @@ import datetime as dt from pathlib import Path -from .tools import get_db_file +from .tools import get_db_file, CONFIG_FILENAME import pandas as pd import sqlite3 from .tracker import get import pyerrors.input.json as pj +import os +from configparser import ConfigParser from typing import Any +path_opts = ['db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path'] + + def has_valid_times(result: pd.Series) -> bool: + """ + Check, whether the result at hand has time-stamps that are sensible: + A record is created first, then updated, with both times lying in the past. + + Parameters + ---------- + result: pd.Series + The result to check + + Returns + ------- + b: bool + True, if the timestamps make sense. + """ # we expect created_at <= updated_at <= now created_at = dt.datetime.fromisoformat(result['created_at']) updated_at = dt.datetime.fromisoformat(result['updated_at']) @@ -20,15 +39,41 @@ def has_valid_times(result: pd.Series) -> bool: return True def are_keys_unique(db: Path, table: str, col: str) -> bool: + """ + Check whether the strings listed in a column of a given table are unique. + + Parameters + ---------- + db: Path + The database to check. + table: str + The table to check. + col: str + The column to be checked for uniqueness. 
+ + Returns + ------- + b: bool + True, if the strings are unique. + """ conn = sqlite3.connect(db) c = conn.cursor() - c.execute(f"SELECT COUNT( DISTINCT CAST(path AS nvarchar(4000))), COUNT({col}) FROM {table};") + c.execute(f"SELECT COUNT( DISTINCT CAST({col} AS nvarchar(4000))), COUNT({col}) FROM {table};") results = c.fetchall()[0] conn.close() return bool(results[0] == results[1]) def check_db_integrity(path: Path) -> None: + """ + Check integrity of the database by checking the uniqueness of the record keys used to load the records + and ensuring that the timestamps of each record are sensible. Throws an error, if issues are detected. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ db = get_db_file(path) if not are_keys_unique(path / db, 'backlogs', 'path'): @@ -41,11 +86,21 @@ def check_db_integrity(path: Path) -> None: for _, result in results.iterrows(): if not has_valid_times(result): raise ValueError(f"Result with id {result[id]} has wrong time signatures.") - print("DB:\t✅") return def _check_db2paths(path: Path, meas_paths: list[str]) -> None: + """ + Check whether for each record given by meas_paths, we can find the data in the file as we expect. + Also check, whether there are unreachable records in the files. If either of the issues arises, throws an error. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + meas_paths: list[str] + List of measurement paths to check. 
+ """ needed_data: dict[str, list[str]] = {} for mpath in meas_paths: file = mpath.split("::")[0] @@ -67,11 +122,19 @@ def _check_db2paths(path: Path, meas_paths: list[str]) -> None: for key in needed_data[file]: if key not in filedict.keys(): raise ValueError(f"Did not find data for key {key} that should be in file {file}.") - print("Links:\t✅") return def check_db_file_links(path: Path) -> None: + """ + Check whether for each record in the given correlator library, we can find the data in the file as we expect. + Also check, whether there are unreachable records in the files. If either of the issues arise, throws an error. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ db = get_db_file(path) search_expr = "SELECT path FROM 'backlogs'" conn = sqlite3.connect(path / db) @@ -79,9 +142,90 @@ def check_db_file_links(path: Path) -> None: _check_db2paths(path, list(results)) +def check_path_and_config(path: Path) -> None: + """ + Check whether the given path exists and the cinfigureation file can be found. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ + if not os.path.exists(path): + raise FileNotFoundError(f"Corrlib path {path} does not exist.") + config_path = path / CONFIG_FILENAME + if not os.path.exists(config_path): + raise FileNotFoundError(f"Configuration file {config_path} not found.") + + +def check_config_validity(path: Path) -> None: + """ + Check whether the configuration file of the given corrlib-dataset path is valid. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. 
+ """ + config = ConfigParser() + config_path = path / CONFIG_FILENAME + if os.path.exists(config_path): + config.read(config_path) + else: + raise FileNotFoundError("Configuration file not found.") + + if config.has_section('core'): + core_opts = ['version', 'tracker', 'cached'] + has_core_opts = [config.has_option('core', opt) for opt in core_opts] + if not all(has_core_opts): + raise ValueError("One of the options in the 'core' section ('version', 'tracker', 'cached') is missing.") + + if config.has_section('paths'): + has_path_opts = [config.has_option('paths', opt) for opt in path_opts] + if not all(has_path_opts): + raise ValueError("One of the options in the 'path' section ('db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path') is missing.") + + +def check_paths(path: Path) -> None: + """ + Check whether all paths demanded by the 'paths' section of the configuration-file exist. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ + config = ConfigParser() + config_path = path / CONFIG_FILENAME + if os.path.exists(config_path): + config.read(config_path) + else: + raise FileNotFoundError("Configuration file not found.") + has_paths = [os.path.exists(path / config.get('paths', opt)) for opt in path_opts] + if not all(has_paths): + raise FileNotFoundError("One of the paths specified in the configuration file is not present.") + + def full_integrity_check(path: Path) -> None: + """ + Aggregate all checks for easy validation of the backlog-library. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. 
+ """ + print("Run full integrity check...") + check_path_and_config(path) + print("(1/5) Path and config-file exist: ✅") + check_config_validity(path) + print("(2/5) Configuration is valid: ✅") + check_paths(path) + print("(3/5) Needed paths exist: ✅") check_db_integrity(path) + print("(4/5) Database is sane: ✅") check_db_file_links(path) - print("Full:\t✅") + print("(5/5) DB2File and File2DB-links are sound: ✅") + print("Full integrity check: ✅")