From 42a6dbddd423da096bb4f0472f2f7d814164d42f Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Thu, 30 Jun 2022 14:05:11 +0100 Subject: [PATCH 1/7] feat: dump and load functionality for pandas dataframes containing Obs objects added. --- pyerrors/input/__init__.py | 1 + pyerrors/input/pandas.py | 72 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 pyerrors/input/pandas.py diff --git a/pyerrors/input/__init__.py b/pyerrors/input/__init__.py index 3b585614..e8cfff08 100644 --- a/pyerrors/input/__init__.py +++ b/pyerrors/input/__init__.py @@ -10,4 +10,5 @@ from . import hadrons from . import json from . import misc from . import openQCD +from . import pandas from . import sfcf diff --git a/pyerrors/input/pandas.py b/pyerrors/input/pandas.py new file mode 100644 index 00000000..b54c7617 --- /dev/null +++ b/pyerrors/input/pandas.py @@ -0,0 +1,72 @@ +import warnings +import gzip +import pandas as pd +from ..obs import Obs +from .json import create_json_string, import_json_string + + +def dump_df(df, fname, gz=True): + """Exports a pandas DataFrame containing Obs valued columns to a (gzipped) csv file. + + Before making use of pandas to_csv functionality Obs objects are serialized via the standardized + json format of pyerrors. + + Parameters + ---------- + df : pandas.DataFrame + Dataframe to be dumped to a file. + fname : str + Filename of the output file. + gz : bool + If True, the output is a gzipped csv file. If False, the output is a csv file. 
+ """ + + out = df.copy() + for column in out: + if isinstance(out[column][0], Obs): + out[column] = out[column].transform(lambda x: create_json_string(x, indent=0)) + + if not fname.endswith('.csv'): + fname += '.csv' + + out.to_csv(fname) + if gz is True: + with open(fname, 'rb') as f_in, gzip.open(fname + ".gz", 'wb') as f_out: + f_out.writelines(f_in) + + +def load_df(fname, auto_gamma=False, gz=True): + """Imports a pandas DataFrame from a csv.(gz) file in which Obs objects are serialized as json strings. + + Parameters + ---------- + fname : str + Filename of the input file. + auto_gamma : bool + If True applies the gamma_method to all imported Obs objects with the default parameters for + the error analysis. Default False. + gz : bool + If True, assumes that data is gzipped. If False, assumes an uncompressed csv file. + """ + + if not fname.endswith('.csv') and not fname.endswith('.gz'): + fname += '.csv' + + if gz is True: + if not fname.endswith('.gz'): + fname += '.gz' + with gzip.open(fname) as f: + re_import = pd.read_csv(f) + else: + if fname.endswith('.gz'): + warnings.warn("Trying to read from %s without unzipping!" % fname, UserWarning) + re_import = pd.read_csv(fname) + + for column in re_import.select_dtypes(include="object"): + if isinstance(re_import[column][0], str): + if re_import[column][0][:20] == '{"program":"pyerrors': + re_import[column] = re_import[column].transform(lambda x: import_json_string(x, verbose=False)) + if auto_gamma is True: + re_import[column].apply(Obs.gamma_method) + + return re_import From 29820f8e61a05ed8af3590ef02fa37240438ea6d Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Thu, 30 Jun 2022 14:12:51 +0100 Subject: [PATCH 2/7] fix: don't write index column to csv file in input.pandas.dump_df. 
--- pyerrors/input/pandas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyerrors/input/pandas.py b/pyerrors/input/pandas.py index b54c7617..b3a2d3ef 100644 --- a/pyerrors/input/pandas.py +++ b/pyerrors/input/pandas.py @@ -29,7 +29,7 @@ def dump_df(df, fname, gz=True): if not fname.endswith('.csv'): fname += '.csv' - out.to_csv(fname) + out.to_csv(fname, index=False) if gz is True: with open(fname, 'rb') as f_in, gzip.open(fname + ".gz", 'wb') as f_out: f_out.writelines(f_in) From f980229d5cbb1acd14f293d7ffb6fc10bef2c49d Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Thu, 30 Jun 2022 14:14:59 +0100 Subject: [PATCH 3/7] build: pandas added as dependency. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0c00aad5..33bde5bc 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ setup(name='pyerrors', license="MIT", packages=find_packages(), python_requires='>=3.6.0', - install_requires=['numpy>=1.16', 'autograd>=1.4', 'numdifftools', 'matplotlib>=3.3', 'scipy>=1', 'iminuit>=2', 'h5py>=3', 'lxml>=4', 'python-rapidjson>=1'], + install_requires=['numpy>=1.16', 'autograd>=1.4', 'numdifftools', 'matplotlib>=3.3', 'scipy>=1', 'iminuit>=2', 'h5py>=3', 'lxml>=4', 'python-rapidjson>=1', 'pandas>=1.1'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Science/Research', From feab699162abe12834f8492b023533eb6f892a4e Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Thu, 30 Jun 2022 14:27:10 +0100 Subject: [PATCH 4/7] tests: basic test for pandas DataFrame export and re-import added. 
--- tests/pandas_test.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 tests/pandas_test.py diff --git a/tests/pandas_test.py b/tests/pandas_test.py new file mode 100644 index 00000000..610f980f --- /dev/null +++ b/tests/pandas_test.py @@ -0,0 +1,18 @@ +import numpy as np +import pandas as pd +import pyerrors as pe + +def test_df_export_import(tmp_path): + for gz in [True, False]: + my_df = pd.DataFrame([{"int": 1, + "float": -0.01, + "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"), + "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}]) + + pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz) + reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz) + assert np.all(my_df == reconstructed_df) + + pe.input.pandas.load_df((tmp_path / 'df_output.csv').as_posix(), gz=gz) + + From c7c17256673519d016307e595b5c9819add1bfe1 Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Thu, 30 Jun 2022 14:33:14 +0100 Subject: [PATCH 5/7] tests: pandas io tests extended. 
--- tests/pandas_test.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/pandas_test.py b/tests/pandas_test.py index 610f980f..d4833656 100644 --- a/tests/pandas_test.py +++ b/tests/pandas_test.py @@ -3,14 +3,15 @@ import pandas as pd import pyerrors as pe def test_df_export_import(tmp_path): + my_dict = {"int": 1, + "float": -0.01, + "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"), + "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")} for gz in [True, False]: - my_df = pd.DataFrame([{"int": 1, - "float": -0.01, - "Obs1": pe.pseudo_Obs(87, 21, "test_ensemble"), - "Obs2": pe.pseudo_Obs(-87, 21, "test_ensemble2")}]) + my_df = pd.DataFrame([my_dict] * 10) pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix(), gz=gz) - reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), gz=gz) + reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), auto_gamma=True, gz=gz) assert np.all(my_df == reconstructed_df) pe.input.pandas.load_df((tmp_path / 'df_output.csv').as_posix(), gz=gz) From 153cc795b882399066ec65cf08e8364fe0806186 Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Thu, 30 Jun 2022 15:26:31 +0100 Subject: [PATCH 6/7] feat: pandas DataFrames with Corr columns can now also be imported and exported. 
--- pyerrors/input/pandas.py | 5 +++-- tests/pandas_test.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/pyerrors/input/pandas.py b/pyerrors/input/pandas.py index b3a2d3ef..67bd9bbd 100644 --- a/pyerrors/input/pandas.py +++ b/pyerrors/input/pandas.py @@ -2,6 +2,7 @@ import warnings import gzip import pandas as pd from ..obs import Obs +from ..correlators import Corr from .json import create_json_string, import_json_string @@ -23,7 +24,7 @@ def dump_df(df, fname, gz=True): out = df.copy() for column in out: - if isinstance(out[column][0], Obs): + if isinstance(out[column][0], (Obs, Corr)): out[column] = out[column].transform(lambda x: create_json_string(x, indent=0)) if not fname.endswith('.csv'): @@ -67,6 +68,6 @@ def load_df(fname, auto_gamma=False, gz=True): if re_import[column][0][:20] == '{"program":"pyerrors': re_import[column] = re_import[column].transform(lambda x: import_json_string(x, verbose=False)) if auto_gamma is True: - re_import[column].apply(Obs.gamma_method) + re_import[column].apply(lambda x: x.gamma_method()) return re_import diff --git a/tests/pandas_test.py b/tests/pandas_test.py index d4833656..658f4375 100644 --- a/tests/pandas_test.py +++ b/tests/pandas_test.py @@ -17,3 +17,14 @@ def test_df_export_import(tmp_path): pe.input.pandas.load_df((tmp_path / 'df_output.csv').as_posix(), gz=gz) +def test_df_Corr(tmp_path): + + my_corr = pe.Corr([pe.pseudo_Obs(-0.48, 0.04, "test"), pe.pseudo_Obs(-0.154, 0.03, "test")]) + + my_dict = {"int": 1, + "float": -0.01, + "Corr": my_corr} + my_df = pd.DataFrame([my_dict] * 5) + + pe.input.pandas.dump_df(my_df, (tmp_path / 'df_output').as_posix()) + reconstructed_df = pe.input.pandas.load_df((tmp_path / 'df_output').as_posix(), auto_gamma=True) From c6ec11045c9f7864a9193996aab00b90e7bf57f8 Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Thu, 30 Jun 2022 15:49:40 +0100 Subject: [PATCH 7/7] fix: redundant export of not gzipped pandas Dataframe removed. 
--- pyerrors/input/pandas.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pyerrors/input/pandas.py b/pyerrors/input/pandas.py index 67bd9bbd..caf3e0b6 100644 --- a/pyerrors/input/pandas.py +++ b/pyerrors/input/pandas.py @@ -30,10 +30,12 @@ def dump_df(df, fname, gz=True): if not fname.endswith('.csv'): fname += '.csv' - out.to_csv(fname, index=False) if gz is True: - with open(fname, 'rb') as f_in, gzip.open(fname + ".gz", 'wb') as f_out: - f_out.writelines(f_in) + if not fname.endswith('.gz'): + fname += '.gz' + out.to_csv(fname, index=False, compression='gzip') + else: + out.to_csv(fname, index=False) def load_df(fname, auto_gamma=False, gz=True):