From b13136a248f294eb2275da54747cc105473655f3 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Fri, 17 Apr 2026 17:32:22 +0200 Subject: [PATCH 1/6] add check for links to files --- corrlib/integrity.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index dc1216c..63572a9 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -3,6 +3,8 @@ from pathlib import Path from .tools import get_db_file import pandas as pd import sqlite3 +from .tracker import get +import pyerrors.input.json as pj def has_valid_times(result: pd.Series) -> bool: @@ -38,10 +40,45 @@ def check_db_integrity(path: Path) -> None: if not has_valid_times(result): raise ValueError(f"Result with id {result[id]} has wrong time signatures.") print("DB:\t✅") + return + + +def _check_db2paths(path: Path, meas_paths: list[str]) -> None: + needed_data: dict[str, list[str]] = {} + for mpath in meas_paths: + file = mpath.split("::")[0] + if file not in needed_data.keys(): + needed_data[file] = [] + key = mpath.split("::")[1] + needed_data[file].append(key) + + for file in needed_data.keys(): + get(path, Path(file)) + filedict: dict[str, Any] = pj.load_json_dict(str(path / file)) + if not set(filedict.keys()).issubset(needed_data[file]): + for key in filedict.keys(): + if key not in needed_data[file]: + raise ValueError(f"Found unintended key {key} in file {file}.") + elif not set(needed_data[file]).issubset(filedict.keys()): + for key in needed_data[file]: + if key not in filedict.keys(): + raise ValueError(f"Did not find data for key {key} that should be in file {file}.") + print("Links:\t✅") + return + + +def check_db_file_links(path: Path) -> None: + db = get_db_file(path) + search_expr = "SELECT path FROM 'backlogs'" + conn = sqlite3.connect(path / db) + results = pd.read_sql(search_expr, conn)['path'].values + print(results) + _check_db2paths(path, results) def full_integrity_check(path: Path) -> None: 
check_db_integrity(path) + check_db_file_links(path) print("Full:\t✅") From 29ebafc1c44e5e0fe30dcb38fdf22d408f746cec Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Fri, 17 Apr 2026 17:34:53 +0200 Subject: [PATCH 2/6] show progress a little --- corrlib/integrity.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index 63572a9..8722840 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -52,7 +52,9 @@ def _check_db2paths(path: Path, meas_paths: list[str]) -> None: key = mpath.split("::")[1] needed_data[file].append(key) - for file in needed_data.keys(): + totf = len(needed_data.keys()) + for i, file in enumerate(needed_data.keys()): + print(f"Check against file {i}/{totf}: {file}") get(path, Path(file)) filedict: dict[str, Any] = pj.load_json_dict(str(path / file)) if not set(filedict.keys()).issubset(needed_data[file]): From 37ae8185897b62017f5822ec0b727685a905a389 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Fri, 17 Apr 2026 17:37:46 +0200 Subject: [PATCH 3/6] small logic issue --- corrlib/integrity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index 8722840..23fbe0e 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -61,7 +61,7 @@ def _check_db2paths(path: Path, meas_paths: list[str]) -> None: for key in filedict.keys(): if key not in needed_data[file]: raise ValueError(f"Found unintended key {key} in file {file}.") - elif not set(needed_data[file]).issubset(filedict.keys()): + if not set(needed_data[file]).issubset(filedict.keys()): for key in needed_data[file]: if key not in filedict.keys(): raise ValueError(f"Did not find data for key {key} that should be in file {file}.") From 0535e19bf08ebced63834ff5796591a815ae20f0 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Fri, 17 Apr 2026 17:42:47 +0200 Subject: [PATCH 4/6] fix typing --- corrlib/integrity.py | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index 23fbe0e..5f80aa3 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -6,6 +6,8 @@ import sqlite3 from .tracker import get import pyerrors.input.json as pj +from typing import Any + def has_valid_times(result: pd.Series) -> bool: # we expect created_at <= updated_at <= now @@ -74,8 +76,7 @@ def check_db_file_links(path: Path) -> None: search_expr = "SELECT path FROM 'backlogs'" conn = sqlite3.connect(path / db) results = pd.read_sql(search_expr, conn)['path'].values - print(results) - _check_db2paths(path, results) + _check_db2paths(path, list(results)) def full_integrity_check(path: Path) -> None: From 083d7ee3ce6ec2868a2da1b9c4fd73ef22362f50 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Fri, 17 Apr 2026 17:52:18 +0200 Subject: [PATCH 5/6] add dry run for loading data using the integrity functions --- corrlib/meas_io.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/corrlib/meas_io.py b/corrlib/meas_io.py index f4e8a83..731da66 100644 --- a/corrlib/meas_io.py +++ b/corrlib/meas_io.py @@ -11,6 +11,7 @@ from .tracker import get, save, unlock import shutil from typing import Any from pathlib import Path +from .integrity import _check_db2paths CACHE_DIR = ".cache" @@ -153,7 +154,7 @@ def load_record(path: Path, meas_path: str) -> Union[Corr, Obs]: return load_records(path, [meas_path])[0] -def load_records(path: Path, meas_paths: list[str], preloaded: dict[str, Any] = {}) -> list[Union[Corr, Obs]]: +def load_records(path: Path, meas_paths: list[str], preloaded: dict[str, Any] = {}, dry_run: bool = False) -> list[Union[Corr, Obs]]: """ Load a list of records by their paths. @@ -163,14 +164,19 @@ def load_records(path: Path, meas_paths: list[str], preloaded: dict[str, Any] = Path of the correlator library. meas_paths: list[str] A list of the paths to the correlator in the backlog system. 
- perloaded: dict[str, Any] - The data that is already prelaoded. Of interest if data has alread been loaded in the same script. + preloaded: dict[str, Any] + The data that is already preloaded. Of interest if data has already been loaded in the same script. + dry_run: bool + Do not load data, just check whether we can reach the data we are interested in. Returns ------- - retruned_data: list + returned_data: list The loaded records. """ + if dry_run: + _check_db2paths(path, meas_paths) + return needed_data: dict[str, list[str]] = {} for mpath in meas_paths: file = mpath.split("::")[0] From 51ae53aa024365f47436c11bf69bc376184ac6b4 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Fri, 17 Apr 2026 17:53:13 +0200 Subject: [PATCH 6/6] add empty return --- corrlib/meas_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corrlib/meas_io.py b/corrlib/meas_io.py index 731da66..cbd9386 100644 --- a/corrlib/meas_io.py +++ b/corrlib/meas_io.py @@ -176,7 +176,7 @@ def load_records(path: Path, meas_paths: list[str], preloaded: dict[str, Any] = """ if dry_run: _check_db2paths(path, meas_paths) - return + return [] needed_data: dict[str, list[str]] = {} for mpath in meas_paths: file = mpath.split("::")[0]