From bd34b7c3785ddb509f2fbfffbf01ce145ee0463d Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Mon, 23 Mar 2026 22:33:01 +0100 Subject: [PATCH 01/26] write first trivial find test --- tests/find_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/find_test.py b/tests/find_test.py index b63b246..8cc7923 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -10,6 +10,7 @@ def make_sql(path: Path) -> Path: cinit._create_db(db) return db + def test_find_lookup_by_one_alias(tmp_path: Path) -> None: db = make_sql(tmp_path) conn = sqlite3.connect(db) From f8566207e36978af1cefded2ae6c8fc521e732cd Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Mon, 23 Mar 2026 22:43:39 +0100 Subject: [PATCH 02/26] add id lookup test --- tests/find_test.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/find_test.py b/tests/find_test.py index 8cc7923..e0730e9 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -32,3 +32,25 @@ def test_find_lookup_by_one_alias(tmp_path: Path) -> None: with pytest.raises(Exception): assert uuid == find._project_lookup_by_alias(db, "fun_project") conn.close() + + +def test_find_lookup_by_id(tmp_path: Path) -> None: + db = make_sql(tmp_path) + conn = sqlite3.connect(db) + c = conn.cursor() + uuid = "test_uuid" + alias_str = "fun_project" + tag_str = "tt" + owner = "tester" + code = "test_code" + c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (uuid, alias_str, tag_str, owner, code)) + conn.commit() + conn.close() + result = find._project_lookup_by_id(db, uuid)[0] + assert uuid == result[0] + assert alias_str == result[1] + assert tag_str == result[2] + assert owner == result[3] + assert code == result[4] + From d0d5f9aa8775c4ce78f547d7cbe3dea5199743cb Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Mon, 23 Mar 2026 23:37:22 +0100 Subject: [PATCH 03/26] rewrite time filter --- 
corrlib/find.py | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/corrlib/find.py b/corrlib/find.py index 4c51e05..e4ee735 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -8,6 +8,7 @@ from .tools import k2m, get_db_file from .tracker import get from typing import Any, Optional from pathlib import Path +import datetime as dt def _project_lookup_by_alias(db: Path, alias: str) -> str: @@ -62,8 +63,37 @@ def _project_lookup_by_id(db: Path, uuid: str) -> list[tuple[str, str]]: return results -def _db_lookup(db: Path, ensemble: str, correlator_name: str, code: str, project: Optional[str]=None, parameters: Optional[str]=None, - created_before: Optional[str]=None, created_after: Optional[Any]=None, updated_before: Optional[Any]=None, updated_after: Optional[Any]=None) -> pd.DataFrame: +def _time_filter(results: pd.DataFrame, created_before: Optional[str]=None, created_after: Optional[Any]=None, updated_before: Optional[Any]=None, updated_after: Optional[Any]=None) -> pd.DataFrame: + drops = [] + for ind in len(results): + result = results.iloc[ind] + created_at = dt.datetime.fromisoformat(result['created_at']) + updated_at = dt.datetime.fromisoformat(result['updated_at']) + + if created_before is not None: + created_before = dt.datetime.fromisoformat(created_before) + if created_before < created_at: + drops.append(ind) + continue + if created_after is not None: + created_after = dt.datetime.fromisoformat(created_after) + if created_before > created_at: + drops.append(ind) + continue + if updated_before is not None: + updated_before = dt.datetime.fromisoformat(updated_before) + if updated_before < updated_at: + drops.append(ind) + continue + if updated_after is not None: + updated_after = dt.datetime.fromisoformat(updated_after) + if updated_after > updated_at: + drops.append(ind) + continue + return results.drop(drops) + + +def _db_lookup(db: Path, ensemble: str, correlator_name: str, code: str, 
project: Optional[str]=None, parameters: Optional[str]=None) -> pd.DataFrame: """ Look up a correlator record in the database by the data given to the method. @@ -105,14 +135,6 @@ def _db_lookup(db: Path, ensemble: str, correlator_name: str, code: str, project search_expr += f" AND code = '{code}'" if parameters: search_expr += f" AND parameters = '{parameters}'" - if created_before: - search_expr += f" AND created_at < '{created_before}'" - if created_after: - search_expr += f" AND created_at > '{created_after}'" - if updated_before: - search_expr += f" AND updated_at < '{updated_before}'" - if updated_after: - search_expr += f" AND updated_at > '{updated_after}'" conn = sqlite3.connect(db) results = pd.read_sql(search_expr, conn) conn.close() @@ -236,7 +258,9 @@ def find_record(path: Path, ensemble: str, correlator_name: str, code: str, proj if code not in codes: raise ValueError("Code " + code + "unknown, take one of the following:" + ", ".join(codes)) get(path, db_file) - results = _db_lookup(db, ensemble, correlator_name,code, project, parameters=parameters, created_before=created_before, created_after=created_after, updated_before=updated_before, updated_after=updated_after) + results = _db_lookup(db, ensemble, correlator_name,code, project, parameters=parameters) + if Any([created_before, created_after, updated_before, updated_after]): + results = _time_filter(results, created_before, created_after, updated_before, updated_after) if code == "sfcf": results = sfcf_filter(results, **kwargs) elif code == "openQCD": From 29558a734b1522c94979858fb1ed0a12f8ed20d3 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Mon, 23 Mar 2026 23:38:40 +0100 Subject: [PATCH 04/26] add test for db lookup --- tests/find_test.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/find_test.py b/tests/find_test.py index e0730e9..da1bfc1 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -54,3 +54,42 @@ def 
test_find_lookup_by_id(tmp_path: Path) -> None: assert owner == result[3] assert code == result[4] + +def test_db_lookup(tmp_path: Path) -> None: + db = make_sql(tmp_path) + conn = sqlite3.connect(db) + c = conn.cursor() + + corr = "f_A" + ensemble = "SF_A" + code = "openQCD" + meas_path = "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf" + uuid = "Project_A" + pars = "{par_A: 3.0, par_B: 5.0}" + parameter_file = "projects/Project_A/myinput.in" + c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (corr, ensemble, code, meas_path, uuid, pars, parameter_file)) + conn.commit() + conn.close() + + results = find._db_lookup(db, ensemble, corr, code) + assert len(results) == 1 + results = find._db_lookup(db, "SF_B", corr, code) + assert results.empty + results = find._db_lookup(db, ensemble, "g_A", code) + assert results.empty + results = find._db_lookup(db, ensemble, corr, "sfcf") + assert results.empty + results = find._db_lookup(db, ensemble, corr, code, project = "Project_A") + assert len(results) == 1 + results = find._db_lookup(db, ensemble, corr, code, project = "Project_B") + assert results.empty + results = find._db_lookup(db, ensemble, corr, code, parameters = pars) + assert len(results) == 1 + results = find._db_lookup(db, ensemble, corr, code, parameters = "{par_A: 3.0, par_B: 4.0}") + assert results.empty + #results = find._db_lookup(db, ensemble, corr, code, project, parameters, created_before, created_after, updated_before, updated_after) + + #results = find._db_lookup(db, ensemble, corr, code, project, parameters, created_before, created_after, updated_before, updated_after) + + From 402ca07edbecda8bb5828596e98527c9ed2de8a4 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Mon, 23 Mar 2026 23:42:42 +0100 Subject: [PATCH 05/26] linting and hotfix --- corrlib/find.py | 20 ++++++++++---------- 1 file changed, 10 
insertions(+), 10 deletions(-) diff --git a/corrlib/find.py b/corrlib/find.py index e4ee735..3e62344 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -65,32 +65,32 @@ def _project_lookup_by_id(db: Path, uuid: str) -> list[tuple[str, str]]: def _time_filter(results: pd.DataFrame, created_before: Optional[str]=None, created_after: Optional[Any]=None, updated_before: Optional[Any]=None, updated_after: Optional[Any]=None) -> pd.DataFrame: drops = [] - for ind in len(results): + for ind in range(len(results)): result = results.iloc[ind] created_at = dt.datetime.fromisoformat(result['created_at']) updated_at = dt.datetime.fromisoformat(result['updated_at']) if created_before is not None: - created_before = dt.datetime.fromisoformat(created_before) - if created_before < created_at: + date_created_before = dt.datetime.fromisoformat(created_before) + if date_created_before < created_at: drops.append(ind) continue if created_after is not None: - created_after = dt.datetime.fromisoformat(created_after) - if created_before > created_at: + date_created_after = dt.datetime.fromisoformat(created_after) + if date_created_after > created_at: drops.append(ind) continue if updated_before is not None: - updated_before = dt.datetime.fromisoformat(updated_before) - if updated_before < updated_at: + date_updated_before = dt.datetime.fromisoformat(updated_before) + if date_updated_before < updated_at: drops.append(ind) continue if updated_after is not None: - updated_after = dt.datetime.fromisoformat(updated_after) - if updated_after > updated_at: + date_updated_after = dt.datetime.fromisoformat(updated_after) + if date_updated_after > updated_at: drops.append(ind) continue - return results.drop(drops) + return results.drop(drops) def _db_lookup(db: Path, ensemble: str, correlator_name: str, code: str, project: Optional[str]=None, parameters: Optional[str]=None) -> pd.DataFrame: From b50ffc4c6b898de970e8ded6c2287c96a6c6389b Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Mon, 23 
Mar 2026 23:45:22 +0100 Subject: [PATCH 06/26] any hotfix --- corrlib/find.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corrlib/find.py b/corrlib/find.py index 3e62344..14b1772 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -259,7 +259,7 @@ def find_record(path: Path, ensemble: str, correlator_name: str, code: str, proj raise ValueError("Code " + code + "unknown, take one of the following:" + ", ".join(codes)) get(path, db_file) results = _db_lookup(db, ensemble, correlator_name,code, project, parameters=parameters) - if Any([created_before, created_after, updated_before, updated_after]): + if any(arg is not None for arg in [created_before, created_after, updated_before, updated_after]): results = _time_filter(results, created_before, created_after, updated_before, updated_after) if code == "sfcf": results = sfcf_filter(results, **kwargs) From c431145a23764015d52ed6a1fd3da007d554cc3f Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 24 Mar 2026 09:23:30 +0100 Subject: [PATCH 07/26] some more db lookup --- tests/find_test.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/tests/find_test.py b/tests/find_test.py index da1bfc1..e895b85 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -70,7 +70,6 @@ def test_db_lookup(tmp_path: Path) -> None: c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", (corr, ensemble, code, meas_path, uuid, pars, parameter_file)) conn.commit() - conn.close() results = find._db_lookup(db, ensemble, corr, code) assert len(results) == 1 @@ -88,8 +87,38 @@ def test_db_lookup(tmp_path: Path) -> None: assert len(results) == 1 results = find._db_lookup(db, ensemble, corr, code, parameters = "{par_A: 3.0, par_B: 4.0}") assert results.empty - #results = find._db_lookup(db, ensemble, corr, code, project, parameters, 
created_before, created_after, updated_before, updated_after) - #results = find._db_lookup(db, ensemble, corr, code, project, parameters, created_before, created_after, updated_before, updated_after) + corr = "g_A" + ensemble = "SF_A" + code = "openQCD" + meas_path = "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf" + uuid = "Project_A" + pars = "{par_A: 3.0, par_B: 4.0}" + parameter_file = "projects/Project_A/myinput.in" + c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (corr, ensemble, code, meas_path, uuid, pars, parameter_file)) + conn.commit() + corr = "f_A" + results = find._db_lookup(db, ensemble, corr, code) + assert len(results) == 1 + results = find._db_lookup(db, "SF_B", corr, code) + assert results.empty + results = find._db_lookup(db, ensemble, "g_A", code) + assert len(results) == 1 + results = find._db_lookup(db, ensemble, corr, "sfcf") + assert results.empty + results = find._db_lookup(db, ensemble, corr, code, project = "Project_A") + assert len(results) == 1 + results = find._db_lookup(db, ensemble, "g_A", code, project = "Project_A") + assert len(results) == 1 + results = find._db_lookup(db, ensemble, corr, code, project = "Project_B") + assert results.empty + results = find._db_lookup(db, ensemble, "g_A", code, project = "Project_B") + assert results.empty + results = find._db_lookup(db, ensemble, corr, code, parameters = pars) + assert results.empty + results = find._db_lookup(db, ensemble, "g_A", code, parameters = "{par_A: 3.0, par_B: 4.0}") + assert len(results) == 1 + conn.close() From 3fd557f3eebd2a57b9340b727a23f72586f6e68e Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 24 Mar 2026 09:24:12 +0100 Subject: [PATCH 08/26] add customtFilter --- corrlib/find.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/corrlib/find.py b/corrlib/find.py index 
14b1772..8934854 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -6,9 +6,10 @@ import numpy as np from .input.implementations import codes from .tools import k2m, get_db_file from .tracker import get -from typing import Any, Optional +from typing import Any, Optional, Union from pathlib import Path import datetime as dt +from collections.abc import Callable def _project_lookup_by_alias(db: Path, alias: str) -> str: @@ -251,22 +252,31 @@ def sfcf_filter(results: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: return results.drop(drops) +def openQCD_filter(results:pd.DataFrame, **kwargs: Any) -> pd.DataFrame: + return results + + def find_record(path: Path, ensemble: str, correlator_name: str, code: str, project: Optional[str]=None, parameters: Optional[str]=None, - created_before: Optional[str]=None, created_after: Optional[str]=None, updated_before: Optional[str]=None, updated_after: Optional[str]=None, revision: Optional[str]=None, **kwargs: Any) -> pd.DataFrame: + created_before: Optional[str]=None, created_after: Optional[str]=None, updated_before: Optional[str]=None, updated_after: Optional[str]=None, + revision: Optional[str]=None, + customFilter: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None, + **kwargs: Any) -> pd.DataFrame: db_file = get_db_file(path) db = path / db_file if code not in codes: raise ValueError("Code " + code + "unknown, take one of the following:" + ", ".join(codes)) get(path, db_file) results = _db_lookup(db, ensemble, correlator_name,code, project, parameters=parameters) - if any(arg is not None for arg in [created_before, created_after, updated_before, updated_after]): + if any([arg is not None for arg in [created_before, created_after, updated_before, updated_after]]): results = _time_filter(results, created_before, created_after, updated_before, updated_after) + if customFilter is not None: + results = customFilter(results) if code == "sfcf": results = sfcf_filter(results, **kwargs) elif code == "openQCD": - pass + results 
= openQCD_filter(results, **kwargs) else: - raise Exception + raise ValueError(f"Code {code} is not known.") print("Found " + str(len(results)) + " result" + ("s" if len(results)>1 else "")) return results.reset_index() From 3fe8e28a68a58a4cf8bce7a29d43f60286000c81 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 24 Mar 2026 09:25:21 +0100 Subject: [PATCH 09/26] customtFilter after general filters --- corrlib/find.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/corrlib/find.py b/corrlib/find.py index 8934854..e099aea 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -269,14 +269,14 @@ def find_record(path: Path, ensemble: str, correlator_name: str, code: str, proj results = _db_lookup(db, ensemble, correlator_name,code, project, parameters=parameters) if any([arg is not None for arg in [created_before, created_after, updated_before, updated_after]]): results = _time_filter(results, created_before, created_after, updated_before, updated_after) - if customFilter is not None: - results = customFilter(results) if code == "sfcf": results = sfcf_filter(results, **kwargs) elif code == "openQCD": results = openQCD_filter(results, **kwargs) else: raise ValueError(f"Code {code} is not known.") + if customFilter is not None: + results = customFilter(results) print("Found " + str(len(results)) + " result" + ("s" if len(results)>1 else "")) return results.reset_index() From 4516ca3149cac8b2f0420903c41576b471b7ed8f Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 24 Mar 2026 18:39:00 +0100 Subject: [PATCH 10/26] better type annotation fir id lookup --- corrlib/find.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/corrlib/find.py b/corrlib/find.py index e099aea..dd3a9a6 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -40,7 +40,7 @@ def _project_lookup_by_alias(db: Path, alias: str) -> str: return str(results[0][0]) -def _project_lookup_by_id(db: Path, uuid: str) -> list[tuple[str, str]]: +def 
_project_lookup_by_id(db: Path, uuid: str) -> list[tuple[str, ...]]: """ Return the project information available in the database by UUID. From cc14e68b4429a122ee0c9b299555f4e7ca8fef45 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Thu, 26 Mar 2026 17:19:58 +0100 Subject: [PATCH 11/26] add tests for time filter and find project, add a first check for integrity of the database --- corrlib/find.py | 4 ++ corrlib/integrity.py | 5 ++ tests/find_test.py | 116 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 125 insertions(+) create mode 100644 corrlib/integrity.py diff --git a/corrlib/find.py b/corrlib/find.py index dd3a9a6..3cbe09b 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -6,6 +6,7 @@ import numpy as np from .input.implementations import codes from .tools import k2m, get_db_file from .tracker import get +from .integrity import check_time_validity from typing import Any, Optional, Union from pathlib import Path import datetime as dt @@ -70,6 +71,9 @@ def _time_filter(results: pd.DataFrame, created_before: Optional[str]=None, cre result = results.iloc[ind] created_at = dt.datetime.fromisoformat(result['created_at']) updated_at = dt.datetime.fromisoformat(result['updated_at']) + db_times_valid = check_time_validity(created_at=created_at, updated_at=updated_at) + if not db_times_valid: + raise ValueError('Time stamps not valid for result with path', result["path"]) if created_before is not None: date_created_before = dt.datetime.fromisoformat(created_before) diff --git a/corrlib/integrity.py b/corrlib/integrity.py new file mode 100644 index 0000000..bf890db --- /dev/null +++ b/corrlib/integrity.py @@ -0,0 +1,5 @@ +import datetime as dt + + +def check_time_validity(created_at: dt.datetime, updated_at: dt.datetime) -> bool: + return not (created_at > updated_at) diff --git a/tests/find_test.py b/tests/find_test.py index e895b85..573f87e 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -3,6 +3,8 @@ import sqlite3 from pathlib import 
Path import corrlib.initialization as cinit import pytest +import pandas as pd +import datalad.api as dl def make_sql(path: Path) -> Path: @@ -34,6 +36,34 @@ def test_find_lookup_by_one_alias(tmp_path: Path) -> None: conn.close() +def test_find_project(tmp_path: Path) -> None: + cinit.create(tmp_path) + db = tmp_path / "backlogger.db" + dl.unlock(str(db), dataset=str(tmp_path)) + conn = sqlite3.connect(db) + c = conn.cursor() + uuid = "test_uuid" + alias_str = "fun_project" + tag_str = "tt" + owner = "tester" + code = "test_code" + c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (uuid, alias_str, tag_str, owner, code)) + conn.commit() + + assert uuid == find.find_project(tmp_path, "fun_project") + + uuid = "test_uuid2" + alias_str = "fun_project" + c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (uuid, alias_str, tag_str, owner, code)) + conn.commit() + + with pytest.raises(Exception): + assert uuid == find._project_lookup_by_alias(tmp_path, "fun_project") + conn.close() + + def test_find_lookup_by_id(tmp_path: Path) -> None: db = make_sql(tmp_path) conn = sqlite3.connect(db) @@ -122,3 +152,89 @@ def test_db_lookup(tmp_path: Path) -> None: assert len(results) == 1 conn.close() + + +def test_time_filter() -> None: + record_A = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] # only created + record_B = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-04-26 12:55:18.229966'] # created and updated + record_C = ["f_A", "ensA", "sfcf", 
"archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2026-03-26 12:55:18.229966', '2026-05-26 12:55:18.229966'] # created and updated later + record_D = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2026-03-26 12:55:18.229966', '2026-03-27 12:55:18.229966'] + record_E = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2024-03-26 12:55:18.229966', '2024-03-26 12:55:18.229966'] # only created, earlier + record_F = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2026-03-26 12:55:18.229966', '2024-03-26 12:55:18.229966'] # this is invalid... + + data = [record_A, record_B, record_C, record_D, record_E] + cols = ["name", + "ensemble", + "code", + "path", + "project", + "parameters", + "parameter_file", + "created_at", + "updated_at"] + df = pd.DataFrame(data,columns=cols) + + results = find._time_filter(df, created_before='2023-03-26 12:55:18.229966') + assert results.empty + results = find._time_filter(df, created_before='2027-03-26 12:55:18.229966') + assert len(results) == 5 + results = find._time_filter(df, created_before='2026-03-25 12:55:18.229966') + assert len(results) == 3 + results = find._time_filter(df, created_before='2026-03-26 12:55:18.229965') + assert len(results) == 3 + results = find._time_filter(df, created_before='2025-03-04 12:55:18.229965') + assert len(results) == 1 + + results = find._time_filter(df, created_after='2023-03-26 12:55:18.229966') + assert len(results) == 5 + results = find._time_filter(df, created_after='2027-03-26 12:55:18.229966') + assert results.empty + results = find._time_filter(df, created_after='2026-03-25 12:55:18.229966') + assert len(results) == 2 + results = find._time_filter(df, 
created_after='2026-03-26 12:55:18.229965') + assert len(results) == 2 + results = find._time_filter(df, created_after='2025-03-04 12:55:18.229965') + assert len(results) == 4 + + results = find._time_filter(df, updated_before='2023-03-26 12:55:18.229966') + assert results.empty + results = find._time_filter(df, updated_before='2027-03-26 12:55:18.229966') + assert len(results) == 5 + results = find._time_filter(df, updated_before='2026-03-25 12:55:18.229966') + assert len(results) == 3 + results = find._time_filter(df, updated_before='2026-03-26 12:55:18.229965') + assert len(results) == 3 + results = find._time_filter(df, updated_before='2025-03-04 12:55:18.229965') + assert len(results) == 1 + + results = find._time_filter(df, updated_after='2023-03-26 12:55:18.229966') + assert len(results) == 5 + results = find._time_filter(df, updated_after='2027-03-26 12:55:18.229966') + assert results.empty + results = find._time_filter(df, updated_after='2026-03-25 12:55:18.229966') + assert len(results) == 2 + results = find._time_filter(df, updated_after='2026-03-26 12:55:18.229965') + assert len(results) == 2 + results = find._time_filter(df, updated_after='2025-03-04 12:55:18.229965') + assert len(results) == 4 + + data = [record_A, record_B, record_C, record_D, record_F] + cols = ["name", + "ensemble", + "code", + "path", + "project", + "parameters", + "parameter_file", + "created_at", + "updated_at"] + df = pd.DataFrame(data,columns=cols) + + with pytest.raises(ValueError): + results = find._time_filter(df, created_before='2023-03-26 12:55:18.229966') From 81af9579dcad49d0b5c3095b0d467cf49d2282e6 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Thu, 26 Mar 2026 17:25:57 +0100 Subject: [PATCH 12/26] add a docstring for time filter --- corrlib/find.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/corrlib/find.py b/corrlib/find.py index 3cbe09b..cb85130 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -66,6 +66,22 @@ def 
_project_lookup_by_id(db: Path, uuid: str) -> list[tuple[str, ...]]: def _time_filter(results: pd.DataFrame, created_before: Optional[str]=None, created_after: Optional[Any]=None, updated_before: Optional[Any]=None, updated_after: Optional[Any]=None) -> pd.DataFrame: + """ + Filter the results from the database in terms of the creation and update times. + + Parameters + ---------- + results: pd.DataFrame + The dataframe holding the unfilteres results from the database. + created_before: str + Contraint on the creation date in datetime.datetime.isoformat. Note that this is exclusive. The creation date has to be truly before the date and time given. + created_after: str + Contraint on the creation date in datetime.datetime.isoformat. Note that this is exclusive. The creation date has to be truly after the date and time given. + updated_before: str + Contraint on the creation date in datetime.datetime.isoformat. Note that this is exclusive. The date of the last update has to be truly before the date and time given. + updated_after: str + Contraint on the creation date in datetime.datetime.isoformat. Note that this is exclusive. The date of the last update has to be truly after the date and time given. 
+ """ drops = [] for ind in range(len(results)): result = results.iloc[ind] From e8360c88b938cbd3636b8b8cfa30ce0b3375e7ed Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Fri, 27 Mar 2026 11:53:07 +0100 Subject: [PATCH 13/26] add more templates --- tests/find_test.py | 278 +++++++++++++++++++++++++++++---------------- 1 file changed, 180 insertions(+), 98 deletions(-) diff --git a/tests/find_test.py b/tests/find_test.py index 573f87e..944ae5f 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -35,35 +35,6 @@ def test_find_lookup_by_one_alias(tmp_path: Path) -> None: assert uuid == find._project_lookup_by_alias(db, "fun_project") conn.close() - -def test_find_project(tmp_path: Path) -> None: - cinit.create(tmp_path) - db = tmp_path / "backlogger.db" - dl.unlock(str(db), dataset=str(tmp_path)) - conn = sqlite3.connect(db) - c = conn.cursor() - uuid = "test_uuid" - alias_str = "fun_project" - tag_str = "tt" - owner = "tester" - code = "test_code" - c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", - (uuid, alias_str, tag_str, owner, code)) - conn.commit() - - assert uuid == find.find_project(tmp_path, "fun_project") - - uuid = "test_uuid2" - alias_str = "fun_project" - c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", - (uuid, alias_str, tag_str, owner, code)) - conn.commit() - - with pytest.raises(Exception): - assert uuid == find._project_lookup_by_alias(tmp_path, "fun_project") - conn.close() - - def test_find_lookup_by_id(tmp_path: Path) -> None: db = make_sql(tmp_path) conn = sqlite3.connect(db) @@ -85,75 +56,6 @@ def test_find_lookup_by_id(tmp_path: Path) -> None: assert code == result[4] -def test_db_lookup(tmp_path: Path) -> None: - db = make_sql(tmp_path) - conn = sqlite3.connect(db) - c = conn.cursor() - - corr = "f_A" - ensemble = "SF_A" - 
code = "openQCD" - meas_path = "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf" - uuid = "Project_A" - pars = "{par_A: 3.0, par_B: 5.0}" - parameter_file = "projects/Project_A/myinput.in" - c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", - (corr, ensemble, code, meas_path, uuid, pars, parameter_file)) - conn.commit() - - results = find._db_lookup(db, ensemble, corr, code) - assert len(results) == 1 - results = find._db_lookup(db, "SF_B", corr, code) - assert results.empty - results = find._db_lookup(db, ensemble, "g_A", code) - assert results.empty - results = find._db_lookup(db, ensemble, corr, "sfcf") - assert results.empty - results = find._db_lookup(db, ensemble, corr, code, project = "Project_A") - assert len(results) == 1 - results = find._db_lookup(db, ensemble, corr, code, project = "Project_B") - assert results.empty - results = find._db_lookup(db, ensemble, corr, code, parameters = pars) - assert len(results) == 1 - results = find._db_lookup(db, ensemble, corr, code, parameters = "{par_A: 3.0, par_B: 4.0}") - assert results.empty - - corr = "g_A" - ensemble = "SF_A" - code = "openQCD" - meas_path = "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf" - uuid = "Project_A" - pars = "{par_A: 3.0, par_B: 4.0}" - parameter_file = "projects/Project_A/myinput.in" - c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", - (corr, ensemble, code, meas_path, uuid, pars, parameter_file)) - conn.commit() - - corr = "f_A" - results = find._db_lookup(db, ensemble, corr, code) - assert len(results) == 1 - results = find._db_lookup(db, "SF_B", corr, code) - assert results.empty - results = find._db_lookup(db, ensemble, "g_A", code) - assert len(results) == 1 - results = find._db_lookup(db, ensemble, corr, 
"sfcf") - assert results.empty - results = find._db_lookup(db, ensemble, corr, code, project = "Project_A") - assert len(results) == 1 - results = find._db_lookup(db, ensemble, "g_A", code, project = "Project_A") - assert len(results) == 1 - results = find._db_lookup(db, ensemble, corr, code, project = "Project_B") - assert results.empty - results = find._db_lookup(db, ensemble, "g_A", code, project = "Project_B") - assert results.empty - results = find._db_lookup(db, ensemble, corr, code, parameters = pars) - assert results.empty - results = find._db_lookup(db, ensemble, "g_A", code, parameters = "{par_A: 3.0, par_B: 4.0}") - assert len(results) == 1 - - conn.close() - - def test_time_filter() -> None: record_A = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] # only created @@ -238,3 +140,183 @@ def test_time_filter() -> None: with pytest.raises(ValueError): results = find._time_filter(df, created_before='2023-03-26 12:55:18.229966') + + +def test_db_lookup(tmp_path: Path) -> None: + db = make_sql(tmp_path) + conn = sqlite3.connect(db) + c = conn.cursor() + + corr = "f_A" + ensemble = "SF_A" + code = "openQCD" + meas_path = "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf" + uuid = "Project_A" + pars = "{par_A: 3.0, par_B: 5.0}" + parameter_file = "projects/Project_A/myinput.in" + c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (corr, ensemble, code, meas_path, uuid, pars, parameter_file)) + conn.commit() + + results = find._db_lookup(db, ensemble, corr, code) + assert len(results) == 1 + results = find._db_lookup(db, "SF_B", corr, code) + assert results.empty + results = find._db_lookup(db, ensemble, "g_A", code) + assert results.empty + results = find._db_lookup(db, 
ensemble, corr, "sfcf") + assert results.empty + results = find._db_lookup(db, ensemble, corr, code, project = "Project_A") + assert len(results) == 1 + results = find._db_lookup(db, ensemble, corr, code, project = "Project_B") + assert results.empty + results = find._db_lookup(db, ensemble, corr, code, parameters = pars) + assert len(results) == 1 + results = find._db_lookup(db, ensemble, corr, code, parameters = "{par_A: 3.0, par_B: 4.0}") + assert results.empty + + corr = "g_A" + ensemble = "SF_A" + code = "openQCD" + meas_path = "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf" + uuid = "Project_A" + pars = "{par_A: 3.0, par_B: 4.0}" + parameter_file = "projects/Project_A/myinput.in" + c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (corr, ensemble, code, meas_path, uuid, pars, parameter_file)) + conn.commit() + + corr = "f_A" + results = find._db_lookup(db, ensemble, corr, code) + assert len(results) == 1 + results = find._db_lookup(db, "SF_B", corr, code) + assert results.empty + results = find._db_lookup(db, ensemble, "g_A", code) + assert len(results) == 1 + results = find._db_lookup(db, ensemble, corr, "sfcf") + assert results.empty + results = find._db_lookup(db, ensemble, corr, code, project = "Project_A") + assert len(results) == 1 + results = find._db_lookup(db, ensemble, "g_A", code, project = "Project_A") + assert len(results) == 1 + results = find._db_lookup(db, ensemble, corr, code, project = "Project_B") + assert results.empty + results = find._db_lookup(db, ensemble, "g_A", code, project = "Project_B") + assert results.empty + results = find._db_lookup(db, ensemble, corr, code, parameters = pars) + assert results.empty + results = find._db_lookup(db, ensemble, "g_A", code, parameters = "{par_A: 3.0, par_B: 4.0}") + assert len(results) == 1 + + conn.close() + + +def test_sfcf_filter() -> None: + record_0 = 
["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_1 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_2 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_3 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_4 = [] + record_5 = [] + record_6 = [] + record_7 = [] + record_8 = [] + record_9 = [] + data = [ + record_0, + record_1, + record_2, + record_3, + record_4, + record_5, + record_6, + record_7, + record_8, + record_9, + ] + cols = ["name", + "ensemble", + "code", + "path", + "project", + "parameters", + "parameter_file", + "created_at", + "updated_at"] + df = pd.DataFrame(data,columns=cols) + + assert True + + +def test_openQCD_filter() -> None: + assert True + + +def test_find_record() -> None: + assert True + + +def test_find_project(tmp_path: Path) -> None: + cinit.create(tmp_path) + db = tmp_path / "backlogger.db" + dl.unlock(str(db), dataset=str(tmp_path)) + conn = sqlite3.connect(db) + c = conn.cursor() + uuid = "test_uuid" + alias_str = "fun_project" + tag_str = "tt" + owner = "tester" + code = "test_code" + c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (uuid, alias_str, tag_str, owner, code)) + conn.commit() + + assert uuid == find.find_project(tmp_path, "fun_project") + + uuid = "test_uuid2" + alias_str = 
"fun_project" + c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (uuid, alias_str, tag_str, owner, code)) + conn.commit() + + with pytest.raises(Exception): + assert uuid == find._project_lookup_by_alias(tmp_path, "fun_project") + conn.close() + + +def test_list_projects(tmp_path: Path) -> None: + cinit.create(tmp_path) + db = tmp_path / "backlogger.db" + dl.unlock(str(db), dataset=str(tmp_path)) + conn = sqlite3.connect(db) + c = conn.cursor() + uuid = "test_uuid" + alias_str = "fun_project" + tag_str = "tt" + owner = "tester" + code = "test_code" + + c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (uuid, alias_str, tag_str, owner, code)) + uuid = "test_uuid2" + alias_str = "fun_project2" + c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (uuid, alias_str, tag_str, owner, code)) + uuid = "test_uuid3" + alias_str = "fun_project3" + c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (uuid, alias_str, tag_str, owner, code)) + uuid = "test_uuid4" + alias_str = "fun_project4" + c.execute("INSERT INTO projects (id, aliases, customTags, owner, code, created_at, updated_at) VALUES (?, ?, ?, ?, ?, datetime('now'), datetime('now'))", + (uuid, alias_str, tag_str, owner, code)) + conn.commit() + conn.close() + results = find.list_projects(tmp_path) + assert len(results) == 4 + for i in range(4): + assert len(results[i]) == 2 From 1a1ac5121dbd623513bfaca70de0aa829352029c Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Fri, 27 Mar 2026 11:53:39 +0100 Subject: [PATCH 14/26] restructure: make code filter --- corrlib/find.py | 16 ++++++++++------ 1 file 
changed, 10 insertions(+), 6 deletions(-) diff --git a/corrlib/find.py b/corrlib/find.py index cb85130..9b2c201 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -276,6 +276,15 @@ def openQCD_filter(results:pd.DataFrame, **kwargs: Any) -> pd.DataFrame: return results +def _code_filter(results: pd.DataFrame, code: str, **kwargs: Any) -> pd.DataFrame: + if code == "sfcf": + return sfcf_filter(results, **kwargs) + elif code == "openQCD": + return openQCD_filter(results, **kwargs) + else: + raise ValueError(f"Code {code} is not known.") + + def find_record(path: Path, ensemble: str, correlator_name: str, code: str, project: Optional[str]=None, parameters: Optional[str]=None, created_before: Optional[str]=None, created_after: Optional[str]=None, updated_before: Optional[str]=None, updated_after: Optional[str]=None, revision: Optional[str]=None, @@ -289,12 +298,7 @@ def find_record(path: Path, ensemble: str, correlator_name: str, code: str, proj results = _db_lookup(db, ensemble, correlator_name,code, project, parameters=parameters) if any([arg is not None for arg in [created_before, created_after, updated_before, updated_after]]): results = _time_filter(results, created_before, created_after, updated_before, updated_after) - if code == "sfcf": - results = sfcf_filter(results, **kwargs) - elif code == "openQCD": - results = openQCD_filter(results, **kwargs) - else: - raise ValueError(f"Code {code} is not known.") + results = _code_filter(results, code, **kwargs) if customFilter is not None: results = customFilter(results) print("Found " + str(len(results)) + " result" + ("s" if len(results)>1 else "")) From 4673751dc3fd56dcb9776fb8d79e2d5b60f9e4b2 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 7 Apr 2026 11:29:10 +0200 Subject: [PATCH 15/26] add docstrings for openQCD filter --- corrlib/find.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/corrlib/find.py b/corrlib/find.py index 9b2c201..d368973 100644 --- 
a/corrlib/find.py +++ b/corrlib/find.py @@ -273,10 +273,43 @@ def sfcf_filter(results: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: def openQCD_filter(results:pd.DataFrame, **kwargs: Any) -> pd.DataFrame: + """ + Filter for parameters of openQCD. + + Parameters + ---------- + results: pd.DataFrame + The unfiltered list of results from the database. + + Returns + ------- + results: pd.DataFrame + The filtered results. + + """ return results def _code_filter(results: pd.DataFrame, code: str, **kwargs: Any) -> pd.DataFrame: + """ + Abstraction of the filters for the different codes that are available. + At the moment, only openQCD and SFCF are known. + The possible key words for the parameters can be seen in the descriptions of the code-specific filters. + + Parameters + ---------- + results: pd.DataFrame + The unfiltered list of results from the database. + code: str + The name of the code that produced the record at hand. + kwargs: + The keyword args that are handed over to the code-specific filters. + + Returns + ------- + results: pd.DataFrame + The filtered results.
+ """ if code == "sfcf": return sfcf_filter(results, **kwargs) elif code == "openQCD": From 8db8d46a06c76bed244bcd9df374c6060d1886ff Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 7 Apr 2026 11:40:48 +0200 Subject: [PATCH 16/26] add very simple tests or code filter and openQCD filter, fix json par strings --- tests/find_test.py | 127 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 102 insertions(+), 25 deletions(-) diff --git a/tests/find_test.py b/tests/find_test.py index 944ae5f..156e5fe 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -57,17 +57,17 @@ def test_find_lookup_by_id(tmp_path: Path) -> None: def test_time_filter() -> None: - record_A = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + record_A = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] # only created - record_B = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + record_B = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2025-03-26 12:55:18.229966', '2025-04-26 12:55:18.229966'] # created and updated - record_C = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + record_C = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2026-03-26 12:55:18.229966', '2026-05-26 12:55:18.229966'] # created and updated later - record_D = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + 
record_D = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2026-03-26 12:55:18.229966', '2026-03-27 12:55:18.229966'] - record_E = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + record_E = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2024-03-26 12:55:18.229966', '2024-03-26 12:55:18.229966'] # only created, earlier - record_F = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + record_F = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2026-03-26 12:55:18.229966', '2024-03-26 12:55:18.229966'] # this is invalid... data = [record_A, record_B, record_C, record_D, record_E] @@ -172,7 +172,7 @@ def test_db_lookup(tmp_path: Path) -> None: assert results.empty results = find._db_lookup(db, ensemble, corr, code, parameters = pars) assert len(results) == 1 - results = find._db_lookup(db, ensemble, corr, code, parameters = "{par_A: 3.0, par_B: 4.0}") + results = find._db_lookup(db, ensemble, corr, code, parameters = '{"par_A": 3.0, "par_B": 4.0}') assert results.empty corr = "g_A" @@ -180,7 +180,7 @@ def test_db_lookup(tmp_path: Path) -> None: code = "openQCD" meas_path = "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf" uuid = "Project_A" - pars = "{par_A: 3.0, par_B: 4.0}" + pars = '{"par_A": 3.0, "par_B": 4.0}' parameter_file = "projects/Project_A/myinput.in" c.execute("INSERT INTO backlogs (name, ensemble, code, path, project, parameters, parameter_file, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))", (corr, ensemble, code, meas_path, uuid, pars, parameter_file)) @@ 
-205,38 +205,26 @@ def test_db_lookup(tmp_path: Path) -> None: assert results.empty results = find._db_lookup(db, ensemble, corr, code, parameters = pars) assert results.empty - results = find._db_lookup(db, ensemble, "g_A", code, parameters = "{par_A: 3.0, par_B: 4.0}") + results = find._db_lookup(db, ensemble, "g_A", code, parameters = '{"par_A": 3.0, "par_B": 4.0}') assert len(results) == 1 conn.close() def test_sfcf_filter() -> None: - record_0 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + record_0 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_1 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + record_1 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_2 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + record_2 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_3 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{par_A: 5.0, par_B: 5.0}', "projects/SF_A/input.in", + record_3 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_4 = [] - record_5 = [] - record_6 = [] - record_7 = [] - record_8 = [] - record_9 = 
[] data = [ record_0, record_1, record_2, record_3, - record_4, - record_5, - record_6, - record_7, - record_8, - record_9, ] cols = ["name", "ensemble", @@ -253,9 +241,98 @@ def test_sfcf_filter() -> None: def test_openQCD_filter() -> None: + record_0 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_1 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_2 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_3 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + data = [ + record_0, + record_1, + record_2, + record_3, + ] + cols = ["name", + "ensemble", + "code", + "path", + "project", + "parameters", + "parameter_file", + "created_at", + "updated_at"] + df = pd.DataFrame(data,columns=cols) + + find.openQCD_filter(df) assert True +def test_code_filter() -> None: + record_0 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_1 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_2 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', 
"projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_3 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_4 = ["f_A", "ensA", "openQCD", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_5 = ["f_A", "ensA", "openQCD", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_6 = ["f_P", "ensA", "openQCD", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_7 = ["f_P", "ensA", "openQCD", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_8 = ["f_P", "ensA", "openQCD", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + data = [ + record_0, + record_1, + record_2, + record_3, + ] + cols = ["name", + "ensemble", + "code", + "path", + "project", + "parameters", + "parameter_file", + "created_at", + "updated_at"] + df = pd.DataFrame(data,columns=cols) + + res = find._code_filter(df, "sfcf") + assert len(res) == 4 + + data = [ + record_4, + record_5, + record_6, + record_7, + record_8, + ] + cols = ["name", + "ensemble", + "code", + "path", + "project", + "parameters", + "parameter_file", + "created_at", + "updated_at"] + df = pd.DataFrame(data,columns=cols) + + res = find._code_filter(df, "openQCD") + 
assert len(res) == 5 + with pytest.raises(ValueError): + res = find._code_filter(df, "asdf") + + def test_find_record() -> None: assert True From 3a1e41808b00763ec270ef32700a9ba45fcf74ee Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Wed, 8 Apr 2026 17:26:38 +0200 Subject: [PATCH 17/26] correct minor typos in doc --- corrlib/find.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/corrlib/find.py b/corrlib/find.py index d368973..660e4bf 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -179,9 +179,9 @@ def sfcf_filter(results: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: qk2: float, optional Mass parameter $\kappa_2$ of the first quark. qm1: float, optional - Bare quak mass $m_1$ of the first quark. + Bare quark mass $m_1$ of the first quark. qm2: float, optional - Bare quak mass $m_1$ of the first quark. + Bare quark mass $m_2$ of the first quark. quarks_thetas: list[list[float]], optional wf1: optional wf2: optional From e95edcb0932815e352011da5a69f297a92bfedd1 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Thu, 9 Apr 2026 23:27:31 +0200 Subject: [PATCH 18/26] restruct for easier tests, test drop of sfcf params --- corrlib/find.py | 149 +++++++++++++++++++++++---------------------- tests/find_test.py | 93 +++++++++++++--------------- 2 files changed, 116 insertions(+), 126 deletions(-) diff --git a/corrlib/find.py b/corrlib/find.py index 660e4bf..9d07a1c 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -162,6 +162,78 @@ def _db_lookup(db: Path, ensemble: str, correlator_name: str, code: str, project return results +def _sfcf_drop(param, **kwargs): + if 'offset' in kwargs: + if kwargs.get('offset') != param['offset']: + return True + if 'quark_kappas' in kwargs: + kappas = kwargs['quark_kappas'] + if (not np.isclose(kappas[0], param['quarks'][0]['mass']) or not np.isclose(kappas[1], param['quarks'][1]['mass'])): + return True + if 'quark_masses' in kwargs: + masses = kwargs['quark_masses'] + if (not np.isclose(masses[0], 
k2m(param['quarks'][0]['mass'])) or not np.isclose(masses[1], k2m(param['quarks'][1]['mass']))): + return True + if 'qk1' in kwargs: + quark_kappa1 = kwargs['qk1'] + if not isinstance(quark_kappa1, list): + if (not np.isclose(quark_kappa1, param['quarks'][0]['mass'])): + return True + else: + if len(quark_kappa1) == 2: + if (quark_kappa1[0] > param['quarks'][0]['mass']) or (quark_kappa1[1] < param['quarks'][0]['mass']): + return True + else: + raise ValueError("quark_kappa1 has to have length 2") + if 'qk2' in kwargs: + quark_kappa2 = kwargs['qk2'] + if not isinstance(quark_kappa2, list): + if (not np.isclose(quark_kappa2, param['quarks'][1]['mass'])): + return True + else: + if len(quark_kappa2) == 2: + if (quark_kappa2[0] > param['quarks'][1]['mass']) or (quark_kappa2[1] < param['quarks'][1]['mass']): + return True + else: + raise ValueError("quark_kappa2 has to have length 2") + if 'qm1' in kwargs: + quark_mass1 = kwargs['qm1'] + if not isinstance(quark_mass1, list): + if (not np.isclose(quark_mass1, k2m(param['quarks'][0]['mass']))): + return True + else: + if len(quark_mass1) == 2: + if (quark_mass1[0] > k2m(param['quarks'][0]['mass'])) or (quark_mass1[1] < k2m(param['quarks'][0]['mass'])): + return True + else: + raise ValueError("quark_mass1 has to have length 2") + if 'qm2' in kwargs: + quark_mass2 = kwargs['qm2'] + if not isinstance(quark_mass2, list): + if (not np.isclose(quark_mass2, k2m(param['quarks'][1]['mass']))): + return True + else: + if len(quark_mass2) == 2: + if (quark_mass2[0] > k2m(param['quarks'][1]['mass'])) or (quark_mass2[1] < k2m(param['quarks'][1]['mass'])): + return True + else: + raise ValueError("quark_mass2 has to have length 2") + if 'quark_thetas' in kwargs: + quark_thetas = kwargs['quark_thetas'] + if (quark_thetas[0] != param['quarks'][0]['thetas'] and quark_thetas[1] != param['quarks'][1]['thetas']) or (quark_thetas[0] != param['quarks'][1]['thetas'] and quark_thetas[1] != param['quarks'][0]['thetas']): + return True + # 
careful, this is not save, when multiple contributions are present! + if 'wf1' in kwargs: + wf1 = kwargs['wf1'] + if not (np.isclose(wf1[0][0], param['wf1'][0][0], 1e-8) and np.isclose(wf1[0][1][0], param['wf1'][0][1][0], 1e-8) and np.isclose(wf1[0][1][1], param['wf1'][0][1][1], 1e-8)): + return True + if 'wf2' in kwargs: + wf2 = kwargs['wf2'] + if not (np.isclose(wf2[0][0], param['wf2'][0][0], 1e-8) and np.isclose(wf2[0][1][0], param['wf2'][0][1][0], 1e-8) and np.isclose(wf2[0][1][1], param['wf2'][0][1][1], 1e-8)): + return True + return False + + def sfcf_filter(results: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: r""" Filter method for the Database entries holding SFCF calculations. @@ -191,84 +263,13 @@ def sfcf_filter(results: pd.DataFrame, **kwargs: Any) -> pd.DataFrame: results: pd.DataFrame The filtered DataFrame, only holding the records that fit to the parameters given. """ + drops = [] for ind in range(len(results)): result = results.iloc[ind] param = json.loads(result['parameters']) - if 'offset' in kwargs: - if kwargs.get('offset') != param['offset']: - drops.append(ind) - continue - if 'quark_kappas' in kwargs: - kappas = kwargs['quark_kappas'] - if (not np.isclose(kappas[0], param['quarks'][0]['mass']) or not np.isclose(kappas[1], param['quarks'][1]['mass'])): - drops.append(ind) - continue - if 'quark_masses' in kwargs: - masses = kwargs['quark_masses'] - if (not np.isclose(masses[0], k2m(param['quarks'][0]['mass'])) or not np.isclose(masses[1], k2m(param['quarks'][1]['mass']))): - drops.append(ind) - continue - if 'qk1' in kwargs: - quark_kappa1 = kwargs['qk1'] - if not isinstance(quark_kappa1, list): - if (not np.isclose(quark_kappa1, param['quarks'][0]['mass'])): - drops.append(ind) - continue - else: - if len(quark_kappa1) == 2: - if (quark_kappa1[0] > param['quarks'][0]['mass']) or (quark_kappa1[1] < param['quarks'][0]['mass']): - drops.append(ind) - continue - if 'qk2' in kwargs: - quark_kappa2 = kwargs['qk2'] - if not 
isinstance(quark_kappa2, list): - if (not np.isclose(quark_kappa2, param['quarks'][1]['mass'])): - drops.append(ind) - continue - else: - if len(quark_kappa2) == 2: - if (quark_kappa2[0] > param['quarks'][1]['mass']) or (quark_kappa2[1] < param['quarks'][1]['mass']): - drops.append(ind) - continue - if 'qm1' in kwargs: - quark_mass1 = kwargs['qm1'] - if not isinstance(quark_mass1, list): - if (not np.isclose(quark_mass1, k2m(param['quarks'][0]['mass']))): - drops.append(ind) - continue - else: - if len(quark_mass1) == 2: - if (quark_mass1[0] > k2m(param['quarks'][0]['mass'])) or (quark_mass1[1] < k2m(param['quarks'][0]['mass'])): - drops.append(ind) - continue - if 'qm2' in kwargs: - quark_mass2 = kwargs['qm2'] - if not isinstance(quark_mass2, list): - if (not np.isclose(quark_mass2, k2m(param['quarks'][1]['mass']))): - drops.append(ind) - continue - else: - if len(quark_mass2) == 2: - if (quark_mass2[0] > k2m(param['quarks'][1]['mass'])) or (quark_mass2[1] < k2m(param['quarks'][1]['mass'])): - drops.append(ind) - continue - if 'quark_thetas' in kwargs: - quark_thetas = kwargs['quark_thetas'] - if (quark_thetas[0] != param['quarks'][0]['thetas'] and quark_thetas[1] != param['quarks'][1]['thetas']) or (quark_thetas[0] != param['quarks'][1]['thetas'] and quark_thetas[1] != param['quarks'][0]['thetas']): - drops.append(ind) - continue - # careful, this is not save, when multiple contributions are present! 
- if 'wf1' in kwargs: - wf1 = kwargs['wf1'] - if not (np.isclose(wf1[0][0], param['wf1'][0][0], 1e-8) and np.isclose(wf1[0][1][0], param['wf1'][0][1][0], 1e-8) and np.isclose(wf1[0][1][1], param['wf1'][0][1][1], 1e-8)): - drops.append(ind) - continue - if 'wf2' in kwargs: - wf2 = kwargs['wf2'] - if not (np.isclose(wf2[0][0], param['wf2'][0][0], 1e-8) and np.isclose(wf2[0][1][0], param['wf2'][0][1][0], 1e-8) and np.isclose(wf2[0][1][1], param['wf2'][0][1][1], 1e-8)): - drops.append(ind) - continue + if _sfcf_drop(param, **kwargs): + drops.append(ind) return results.drop(drops) diff --git a/tests/find_test.py b/tests/find_test.py index 156e5fe..36d687e 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -211,62 +211,51 @@ def test_db_lookup(tmp_path: Path) -> None: conn.close() -def test_sfcf_filter() -> None: - record_0 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", - '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_1 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", - '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_2 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", - '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_3 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", - '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - data = [ - record_0, - record_1, - record_2, - record_3, - ] - cols = ["name", - "ensemble", - "code", - "path", - "project", - "parameters", - "parameter_file", - "created_at", - "updated_at"] - df = pd.DataFrame(data,columns=cols) +def test_sfcf_drop() -> None: + parameters0 = { + 'offset': 
[0,0,0], + 'quarks': [{'mass': 1, 'thetas': [0,0,0]}, {'mass': 2, 'thetas': [0,0,1]}], # m0s = -3.5, -3.75 + 'wf1': [[1, [0, 0]], [0.5, [1, 0]], [.75, [.5, .5]]], + 'wf2': [[1, [2, 1]], [2, [0.5, -0.5]], [.5, [.75, .72]]], + } - assert True + assert not find._sfcf_drop(parameters0, offset=[0,0,0]) + assert find._sfcf_drop(parameters0, offset=[1,0,0]) + + assert not find._sfcf_drop(parameters0, quark_kappas = [1, 2]) + assert find._sfcf_drop(parameters0, quark_kappas = [-3.1, -3.72]) + + assert not find._sfcf_drop(parameters0, quark_masses = [-3.5, -3.75]) + assert find._sfcf_drop(parameters0, quark_masses = [-3.1, -3.72]) + + assert not find._sfcf_drop(parameters0, qk1 = 1) + assert not find._sfcf_drop(parameters0, qk2 = 2) + assert find._sfcf_drop(parameters0, qk1 = 2) + assert find._sfcf_drop(parameters0, qk2 = 1) + + assert not find._sfcf_drop(parameters0, qk1 = [0.5,1.5]) + assert not find._sfcf_drop(parameters0, qk2 = [1.5,2.5]) + assert find._sfcf_drop(parameters0, qk1 = 2) + assert find._sfcf_drop(parameters0, qk2 = 1) + with pytest.raises(ValueError): + assert not find._sfcf_drop(parameters0, qk1 = [0.5,1,5]) + with pytest.raises(ValueError): + assert not find._sfcf_drop(parameters0, qk2 = [1,5,2.5]) + + assert find._sfcf_drop(parameters0, qm1 = 1.2) + assert find._sfcf_drop(parameters0, qm2 = 2.2) + assert not find._sfcf_drop(parameters0, qm1 = -3.5) + assert not find._sfcf_drop(parameters0, qm2 = -3.75) + + assert find._sfcf_drop(parameters0, qm2 = 1.2) + assert find._sfcf_drop(parameters0, qm1 = 2.2) + with pytest.raises(ValueError): + assert not find._sfcf_drop(parameters0, qm1 = [0.5,1,5]) + with pytest.raises(ValueError): + assert not find._sfcf_drop(parameters0, qm2 = [1,5,2.5]) def test_openQCD_filter() -> None: - record_0 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", - '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_1 = ["f_A", 
"ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", - '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_2 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", - '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - record_3 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", - '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] - data = [ - record_0, - record_1, - record_2, - record_3, - ] - cols = ["name", - "ensemble", - "code", - "path", - "project", - "parameters", - "parameter_file", - "created_at", - "updated_at"] - df = pd.DataFrame(data,columns=cols) - - find.openQCD_filter(df) assert True From 6d1f8f7f1baa7efc26b0964af041af97347cb491 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Fri, 10 Apr 2026 10:28:28 +0200 Subject: [PATCH 19/26] add NotImplemented warning for openQCD filter --- corrlib/find.py | 5 ++++- tests/find_test.py | 28 +++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/corrlib/find.py b/corrlib/find.py index 9d07a1c..1c985e2 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -11,6 +11,7 @@ from typing import Any, Optional, Union from pathlib import Path import datetime as dt from collections.abc import Callable +import warnings def _project_lookup_by_alias(db: Path, alias: str) -> str: @@ -162,7 +163,7 @@ def _db_lookup(db: Path, ensemble: str, correlator_name: str, code: str, project return results -def _sfcf_drop(param, **kwargs): +def _sfcf_drop(param: dict[str, Any], **kwargs: Any) -> bool: if 'offset' in kwargs: if kwargs.get('offset') != param['offset']: return True @@ -288,6 +289,8 @@ def openQCD_filter(results:pd.DataFrame, **kwargs: Any) -> pd.DataFrame: The filtered results. 
""" + warnings.warn("A filter for openQCD parameters is no implemented yet.", Warning) + return results diff --git a/tests/find_test.py b/tests/find_test.py index 36d687e..f512f15 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -256,7 +256,33 @@ def test_sfcf_drop() -> None: def test_openQCD_filter() -> None: - assert True + record_0 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_1 = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_2 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + record_3 = ["f_P", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] + data = [ + record_0, + record_1, + record_2, + record_3, + ] + cols = ["name", + "ensemble", + "code", + "path", + "project", + "parameters", + "parameter_file", + "created_at", + "updated_at"] + df = pd.DataFrame(data,columns=cols) + + with pytest.warns(Warning): + find.openQCD_filter(df, a = "asdf") def test_code_filter() -> None: From 91938c3c5a3f590ad48d471e0a19a8702ba94349 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 14 Apr 2026 14:17:41 +0200 Subject: [PATCH 20/26] add second time integrity check --- corrlib/integrity.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index bf890db..f1459d0 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -2,4 +2,9 @@ import datetime as dt def 
check_time_validity(created_at: dt.datetime, updated_at: dt.datetime) -> bool: - return not (created_at > updated_at) + # we expect created_at <= updated_at <= now + if created_at > updated_at: + return False + if updated_at > dt.datetime.now(): + return False + return True From 0b8c041ee559af903d6aa1526ed1a59753ab775d Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 14 Apr 2026 15:34:05 +0200 Subject: [PATCH 21/26] add wrapper functions to check for the validity of the database --- corrlib/integrity.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index f1459d0..db242f6 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -1,10 +1,32 @@ import datetime as dt +from pathlib import Path +from .tools import get_db_file +import pandas as pd +import sqlite3 -def check_time_validity(created_at: dt.datetime, updated_at: dt.datetime) -> bool: +def has_valid_times(result: pd.DataFrame) -> bool: # we expect created_at <= updated_at <= now + created_at = dt.datetime.fromisoformat(result['created_at']) + updated_at = dt.datetime.fromisoformat(result['updated_at']) if created_at > updated_at: return False if updated_at > dt.datetime.now(): return False return True + + +def check_db_integrity(path: Path) -> None: + db = get_db_file(path) + search_expr = "SELECT * FROM 'backlogs'" + conn = sqlite3.connect(db) + results = pd.read_sql(search_expr, conn) + + for result in results: + if not has_valid_times(result): + raise ValueError(f"Result with id {result[id]} has wrong time signatures.") + + +def full_integrity_check(path: Path) -> None: + check_db_integrity(path) + From 65cd55ec0a8d2afbe5a54159cff393d80da466bd Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 14 Apr 2026 16:36:31 +0200 Subject: [PATCH 22/26] add test on whether paths are indeed unique --- corrlib/integrity.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/corrlib/integrity.py 
b/corrlib/integrity.py index db242f6..70e4694 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -15,6 +15,14 @@ def has_valid_times(result: pd.DataFrame) -> bool: return False return True +def are_keys_unique(db: Path, table: str, col: str) -> bool: + conn = sqlite3.connect(db) + c = conn.cursor() + c.execute(f"SELECT COUNT( DISTINCT CAST(path AS nvarchar(4000))), COUNT({col}) FROM {table};") + results = c.fetchall()[0] + conn.close() + return bool(results[0] == results[1]) + def check_db_integrity(path: Path) -> None: db = get_db_file(path) @@ -27,6 +35,7 @@ def check_db_integrity(path: Path) -> None: raise ValueError(f"Result with id {result[id]} has wrong time signatures.") + def full_integrity_check(path: Path) -> None: check_db_integrity(path) From 85698c377bca7405d69c63d13d3ef918d35aaf1a Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Tue, 14 Apr 2026 16:42:39 +0200 Subject: [PATCH 23/26] use uniqueness for complete db check --- corrlib/integrity.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index 70e4694..8a414bf 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -26,6 +26,10 @@ def are_keys_unique(db: Path, table: str, col: str) -> bool: def check_db_integrity(path: Path) -> None: db = get_db_file(path) + + if not are_keys_unique(db, 'backlogs', 'path'): + raise Exception("The paths the backlog table of the database links are not unique.") + search_expr = "SELECT * FROM 'backlogs'" conn = sqlite3.connect(db) results = pd.read_sql(search_expr, conn) From d8bb9e4080017070bc928d19fadd7e175abeebf0 Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Wed, 15 Apr 2026 10:49:03 +0200 Subject: [PATCH 24/26] fix import --- corrlib/find.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/corrlib/find.py b/corrlib/find.py index 1c985e2..7b07321 100644 --- a/corrlib/find.py +++ b/corrlib/find.py @@ -6,8 +6,8 @@ import numpy as np from .input.implementations 
import codes from .tools import k2m, get_db_file from .tracker import get -from .integrity import check_time_validity -from typing import Any, Optional, Union +from .integrity import has_valid_times +from typing import Any, Optional from pathlib import Path import datetime as dt from collections.abc import Callable @@ -88,7 +88,7 @@ def _time_filter(results: pd.DataFrame, created_before: Optional[str]=None, cre result = results.iloc[ind] created_at = dt.datetime.fromisoformat(result['created_at']) updated_at = dt.datetime.fromisoformat(result['updated_at']) - db_times_valid = check_time_validity(created_at=created_at, updated_at=updated_at) + db_times_valid = has_valid_times(result) if not db_times_valid: raise ValueError('Time stamps not valid for result with path', result["path"]) From dc424c3e18ecdeeda834865b05dc9bfac6e41e5a Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Wed, 15 Apr 2026 11:24:25 +0200 Subject: [PATCH 25/26] fix time tests --- tests/find_test.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/tests/find_test.py b/tests/find_test.py index f512f15..cc455f9 100644 --- a/tests/find_test.py +++ b/tests/find_test.py @@ -5,6 +5,7 @@ import corrlib.initialization as cinit import pytest import pandas as pd import datalad.api as dl +import datetime as dt def make_sql(path: Path) -> Path: @@ -57,18 +58,20 @@ def test_find_lookup_by_id(tmp_path: Path) -> None: def test_time_filter() -> None: - record_A = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + record_A = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf0", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2025-03-26 12:55:18.229966', '2025-03-26 12:55:18.229966'] # only created - record_B = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', 
"projects/SF_A/input.in", + record_B = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf1", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2025-03-26 12:55:18.229966', '2025-04-26 12:55:18.229966'] # created and updated - record_C = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", - '2026-03-26 12:55:18.229966', '2026-05-26 12:55:18.229966'] # created and updated later - record_D = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + record_C = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf2", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2026-03-26 12:55:18.229966', '2026-04-14 12:55:18.229966'] # created and updated later + record_D = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf3", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2026-03-26 12:55:18.229966', '2026-03-27 12:55:18.229966'] - record_E = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + record_E = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf4", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2024-03-26 12:55:18.229966', '2024-03-26 12:55:18.229966'] # only created, earlier - record_F = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + record_F = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf5", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", '2026-03-26 12:55:18.229966', '2024-03-26 12:55:18.229966'] # this is invalid... 
+ record_G = ["f_A", "ensA", "sfcf", "archive/SF_A/f_A/Project_A.json.gz::asdfasdfasdf2", "SF_A", '{"par_A": 5.0, "par_B": 5.0}', "projects/SF_A/input.in", + '2026-03-26 12:55:18.229966', str(dt.datetime.now() + dt.timedelta(days=2, hours=3, minutes=5, seconds=30))] # created and updated later data = [record_A, record_B, record_C, record_D, record_E] cols = ["name", @@ -141,6 +144,21 @@ def test_time_filter() -> None: with pytest.raises(ValueError): results = find._time_filter(df, created_before='2023-03-26 12:55:18.229966') + data = [record_A, record_B, record_C, record_D, record_G] + cols = ["name", + "ensemble", + "code", + "path", + "project", + "parameters", + "parameter_file", + "created_at", + "updated_at"] + df = pd.DataFrame(data,columns=cols) + + with pytest.raises(ValueError): + results = find._time_filter(df, created_before='2023-03-26 12:55:18.229966') + def test_db_lookup(tmp_path: Path) -> None: db = make_sql(tmp_path) From b625bf92438ba3fcae0729bddd57554f68275fdd Mon Sep 17 00:00:00 2001 From: Justus Kuhlmann Date: Wed, 15 Apr 2026 12:02:03 +0200 Subject: [PATCH 26/26] proper row iteration --- corrlib/integrity.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/corrlib/integrity.py b/corrlib/integrity.py index 8a414bf..d865944 100644 --- a/corrlib/integrity.py +++ b/corrlib/integrity.py @@ -5,7 +5,7 @@ import pandas as pd import sqlite3 -def has_valid_times(result: pd.DataFrame) -> bool: +def has_valid_times(result: pd.Series) -> bool: # we expect created_at <= updated_at <= now created_at = dt.datetime.fromisoformat(result['created_at']) updated_at = dt.datetime.fromisoformat(result['updated_at']) @@ -34,7 +34,7 @@ def check_db_integrity(path: Path) -> None: conn = sqlite3.connect(db) results = pd.read_sql(search_expr, conn) - for result in results: + for _, result in results.iterrows(): if not has_valid_times(result): raise ValueError(f"Result with id {result[id]} has wrong time signatures.")