From 656f99a13c2fd0c4fbad292f2a3c145afb567124 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 5 May 2026 16:47:07 +0200 Subject: [PATCH 1/5] add integrity check for the config-file --- corrlib/integrity.py | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index 5f80aa3..74386f4 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -1,10 +1,12 @@ import datetime as dt from pathlib import Path -from .tools import get_db_file +from .tools import get_db_file, CONFIG_FILENAME import pandas as pd import sqlite3 from .tracker import get import pyerrors.input.json as pj +import os +from configparser import ConfigParser from typing import Any @@ -41,7 +43,6 @@ def check_db_integrity(path: Path) -> None: for _, result in results.iterrows(): if not has_valid_times(result): raise ValueError(f"Result with id {result[id]} has wrong time signatures.") - print("DB:\t✅") return @@ -67,7 +68,6 @@ def _check_db2paths(path: Path, meas_paths: list[str]) -> None: for key in needed_data[file]: if key not in filedict.keys(): raise ValueError(f"Did not find data for key {key} that should be in file {file}.") - print("Links:\t✅") return @@ -79,9 +79,44 @@ def check_db_file_links(path: Path) -> None: _check_db2paths(path, list(results)) +def check_path_and_config(path: Path) -> None: + if not os.path.exists(path): + raise FileNotFoundError(f"Corrlib path {path} does not exist.") + config_path = path / CONFIG_FILENAME + if not os.path.exists(config_path): + raise FileNotFoundError(f"Configuration file {config_path} not found.") + + +def check_config_validity(path: Path) -> None: + config = ConfigParser() + config_path = path / CONFIG_FILENAME + if os.path.exists(config_path): + config.read(config_path) + else: + raise FileNotFoundError("Configuration file not found.") + + if config.has_section('core'): + core_opts = ['version', 'tracker', 'cached'] + has_core_opts = 
[config.has_option('core', opt) for opt in core_opts] + if not all(has_core_opts): + raise ValueError("One of the options in the 'core' section ('version', 'tracker', 'cached') is missing.") + + if config.has_section('paths'): + path_opts = ['db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path'] + has_path_opts = [config.has_option('paths', opt) for opt in path_opts] + if not all(has_path_opts): + raise ValueError("One of the options in the 'path' section ('db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path') is missing.") + + def full_integrity_check(path: Path) -> None: + check_path_and_config(path) + print("Path and config-file exist:\t✅") + check_config_validity(path) + print("Configuration is valid:\t✅") check_db_integrity(path) + print("DB:\t✅") check_db_file_links(path) + print("Links:\t✅") print("Full:\t✅") From c3bf36bf52e6f1c9ff4631d75398ad82eff441d9 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 5 May 2026 17:15:16 +0200 Subject: [PATCH 2/5] add docs, add check for needed paths --- corrlib/integrity.py | 90 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index 74386f4..f5b2300 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -12,6 +12,20 @@ from typing import Any def has_valid_times(result: pd.Series) -> bool: + """ + Check, whether the result at hand has time-stamps that are sensible: + A record is created first, then updated, with both times lying in the past. + + Parameters + ---------- + result: pd.Series + The result to check + + Returns + ------- + b: bool + True, if the timestamps make sense. 
+ """ # we expect created_at <= updated_at <= now created_at = dt.datetime.fromisoformat(result['created_at']) updated_at = dt.datetime.fromisoformat(result['updated_at']) @@ -22,15 +36,41 @@ def has_valid_times(result: pd.Series) -> bool: return True def are_keys_unique(db: Path, table: str, col: str) -> bool: + """ + Check whether the strings listed in a column of a given table are unique. + + Parameters + ---------- + db: Path + The database to check. + table: str + The table to check. + col: str + The column to be checked for uniqueness. + + Returns + ------- + b: bool + True, if the strings are unique. + """ conn = sqlite3.connect(db) c = conn.cursor() - c.execute(f"SELECT COUNT( DISTINCT CAST(path AS nvarchar(4000))), COUNT({col}) FROM {table};") + c.execute(f"SELECT COUNT( DISTINCT CAST({col} AS nvarchar(4000))), COUNT({col}) FROM {table};") results = c.fetchall()[0] conn.close() return bool(results[0] == results[1]) def check_db_integrity(path: Path) -> None: + """ + Check integrity of the database by checking the uniqueness of the record keys used to load the records + and ensuring that the timestamps of each record are sensible. Throws an error, if issues are detected. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ db = get_db_file(path) if not are_keys_unique(path / db, 'backlogs', 'path'): @@ -47,6 +87,17 @@ def check_db_integrity(path: Path) -> None: def _check_db2paths(path: Path, meas_paths: list[str]) -> None: + """ + Check whether for each record given by meas_paths, we can find the data in the file as we expect. + Also check, whether there are unreachable records in the files. If either of the issues arise, throws an error. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + meas_paths: list[str] + List of measurement paths to check. 
+ """ needed_data: dict[str, list[str]] = {} for mpath in meas_paths: file = mpath.split("::")[0] @@ -72,6 +123,15 @@ def _check_db2paths(path: Path, meas_paths: list[str]) -> None: def check_db_file_links(path: Path) -> None: + """ + Check whether for each record in the given correlator library, we can find the data in the file as we expect. + Also check, whether there are unreachable records in the files. If either of the issues arise, throws an error. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ db = get_db_file(path) search_expr = "SELECT path FROM 'backlogs'" conn = sqlite3.connect(path / db) @@ -80,6 +140,14 @@ def check_db_file_links(path: Path) -> None: def check_path_and_config(path: Path) -> None: + """ + Check whether the given path exists and the configuration file can be found. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ if not os.path.exists(path): raise FileNotFoundError(f"Corrlib path {path} does not exist.") config_path = path / CONFIG_FILENAME @@ -88,6 +156,14 @@ def check_path_and_config(path: Path) -> None: def check_config_validity(path: Path) -> None: + """ + Check whether the configuration file of the given corrlib-dataset path is valid. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. 
+ """ config = ConfigParser() config_path = path / CONFIG_FILENAME if os.path.exists(config_path): @@ -107,8 +183,20 @@ def check_config_validity(path: Path) -> None: if not all(has_path_opts): raise ValueError("One of the options in the 'path' section ('db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path') is missing.") + has_paths = [os.path.exists(path / config.get('paths', opt)) for opt in path_opts] + if not all(has_paths): + raise FileNotFoundError("one of the paths needed by the configuration file is not present.") + def full_integrity_check(path: Path) -> None: + """ + Aggregate all checks for easy validation of the backlog-library. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ check_path_and_config(path) print("Path and config-file exist:\t✅") check_config_validity(path) From ba4624d8433d0360aded38d196cbdfac39a9e434 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 5 May 2026 17:20:20 +0200 Subject: [PATCH 3/5] restruct: needed paths get extra check --- corrlib/integrity.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index f5b2300..f2a70bd 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -11,6 +11,9 @@ from configparser import ConfigParser from typing import Any +path_opts = ['db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path'] + + def has_valid_times(result: pd.Series) -> bool: """ Check, whether the result at hand has time-stamps that are sensible: @@ -178,14 +181,29 @@ def check_config_validity(path: Path) -> None: raise ValueError("One of the options in the 'core' section ('version', 'tracker', 'cached') is missing.") if config.has_section('paths'): - path_opts = ['db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path'] has_path_opts = [config.has_option('paths', opt) for opt in path_opts] if not all(has_path_opts): 
raise ValueError("One of the options in the 'path' section ('db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path') is missing.") + +def check_paths(path: Path) -> None: + """ + Check whether all paths demanded by the 'paths' section of the configuration-file exist. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ + config = ConfigParser() + config_path = path / CONFIG_FILENAME + if os.path.exists(config_path): + config.read(config_path) + else: + raise FileNotFoundError("Configuration file not found.") has_paths = [os.path.exists(path / config.get('paths', opt)) for opt in path_opts] if not all(has_paths): - raise FileNotFoundError("one of the paths needed by the configuration file is not present.") + raise FileNotFoundError("One of the paths specified in the configuration file is not present.") def full_integrity_check(path: Path) -> None: @@ -201,6 +219,8 @@ def full_integrity_check(path: Path) -> None: print("Path and config-file exist:\t✅") check_config_validity(path) print("Configuration is valid:\t✅") + check_paths(path) + print("Needed paths exist:\t✅") check_db_integrity(path) print("DB:\t✅") check_db_file_links(path) From 3247cdbc40aaccd1c9ff62da2059d407a01d7a51 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 5 May 2026 17:24:09 +0200 Subject: [PATCH 4/5] neater UX --- corrlib/integrity.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index f2a70bd..f660dfe 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -215,16 +215,17 @@ def full_integrity_check(path: Path) -> None: path: Path Path to the backlog-library to check. 
""" + print("Run full integrity check...") check_path_and_config(path) - print("Path and config-file exist:\t✅") + print("(1/5) Path and config-file exist: ✅") check_config_validity(path) - print("Configuration is valid:\t✅") + print("(2/5) Configuration is valid: ✅") check_paths(path) - print("Needed paths exist:\t✅") + print("(3/5) Needed paths exist: ✅") check_db_integrity(path) - print("DB:\t✅") + print("(4/5) Database is sane: ✅") check_db_file_links(path) - print("Links:\t✅") - print("Full:\t✅") + print("(5/5) DB2File and File2DB-links are sound: ✅") + print("Full integrity check: ✅") From a2a3346f51ec574e772a1293038558e351e65b69 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 5 May 2026 22:12:14 +0200 Subject: [PATCH 5/5] provide docstring for repo check --- corrlib/cli.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/corrlib/cli.py b/corrlib/cli.py index d24d8ef..acf4eca 100644 --- a/corrlib/cli.py +++ b/corrlib/cli.py @@ -108,7 +108,7 @@ def find( ), ) -> None: """ - Find a record in the backlog at hand. Through specifying it's ensemble and the measured correlator. + Find a record in the given backlog. """ results = find_record(path, ensemble, corr, code) if results.empty: @@ -147,6 +147,9 @@ def check(path: Path = typer.Option( "-d", ), ) -> None: + """ + Check the integrity of the repository. + """ full_integrity_check(path)