diff --git a/corrlib/integrity.py b/corrlib/integrity.py index 74386f4..f5b2300 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -12,6 +12,20 @@ from typing import Any def has_valid_times(result: pd.Series) -> bool: + """ + Check whether the result at hand has time-stamps that are sensible: + A record is created first, then updated, with both times lying in the past. + + Parameters + ---------- + result: pd.Series + The result to check. + + Returns + ------- + b: bool + True, if the timestamps make sense. + """ # we expect created_at <= updated_at <= now created_at = dt.datetime.fromisoformat(result['created_at']) updated_at = dt.datetime.fromisoformat(result['updated_at']) @@ -22,15 +36,41 @@ def has_valid_times(result: pd.Series) -> bool: return True def are_keys_unique(db: Path, table: str, col: str) -> bool: + """ + Check whether the strings listed in a column of a given table are unique. + + Parameters + ---------- + db: Path + The database to check. + table: str + The table to check. + col: str + The column to be checked for uniqueness. + + Returns + ------- + b: bool + True, if the strings are unique. + """ conn = sqlite3.connect(db) c = conn.cursor() - c.execute(f"SELECT COUNT( DISTINCT CAST(path AS nvarchar(4000))), COUNT({col}) FROM {table};") + c.execute(f"SELECT COUNT( DISTINCT CAST({col} AS nvarchar(4000))), COUNT({col}) FROM {table};") results = c.fetchall()[0] conn.close() return bool(results[0] == results[1]) def check_db_integrity(path: Path) -> None: + """ + Check the integrity of the database by checking the uniqueness of the record keys used to load the records + and ensuring that the timestamps of each record are sensible. Raises an error if issues are detected. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. 
+ """ db = get_db_file(path) if not are_keys_unique(path / db, 'backlogs', 'path'): @@ -47,6 +87,17 @@ def check_db_integrity(path: Path) -> None: def _check_db2paths(path: Path, meas_paths: list[str]) -> None: + """ + Check whether for each record given by meas_paths we can find the data in the file as we expect. + Also check whether there are unreachable records in the files. If either issue arises, raises an error. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + meas_paths: list[str] + List of measurement paths to check. + """ needed_data: dict[str, list[str]] = {} for mpath in meas_paths: file = mpath.split("::")[0] @@ -72,6 +123,15 @@ def _check_db2paths(path: Path, meas_paths: list[str]) -> None: def check_db_file_links(path: Path) -> None: + """ + Check whether for each record in the given correlator library we can find the data in the file as we expect. + Also check whether there are unreachable records in the files. If either issue arises, raises an error. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ db = get_db_file(path) search_expr = "SELECT path FROM 'backlogs'" conn = sqlite3.connect(path / db) @@ -80,6 +140,14 @@ def check_db_file_links(path: Path) -> None: def check_path_and_config(path: Path) -> None: + """ + Check whether the given path exists and the configuration file can be found. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ if not os.path.exists(path): raise FileNotFoundError(f"Corrlib path {path} does not exist.") config_path = path / CONFIG_FILENAME @@ -88,6 +156,14 @@ def check_path_and_config(path: Path) -> None: def check_config_validity(path: Path) -> None: + """ + Check whether the configuration file of the given corrlib-dataset path is valid. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. 
+ """ config = ConfigParser() config_path = path / CONFIG_FILENAME if os.path.exists(config_path): @@ -107,8 +183,20 @@ def check_config_validity(path: Path) -> None: if not all(has_path_opts): raise ValueError("One of the options in the 'path' section ('db', 'projects_path', 'archive_path', 'toml_imports_path', 'import_scripts_path') is missing.") + has_paths = [os.path.exists(path / config.get('paths', opt)) for opt in path_opts] + if not all(has_paths): + raise FileNotFoundError("one of the paths needed by the configuration file is not present.") + def full_integrity_check(path: Path) -> None: + """ + Aggregate all checks for easy validation of the backlog-library. + + Parameters + ---------- + path: Path + Path to the backlog-library to check. + """ check_path_and_config(path) print("Path and config-file exist:\t✅") check_config_validity(path)