pyerrors.input.pandas

  1import warnings
  2import gzip
  3import sqlite3
  4from contextlib import closing
  5import pandas as pd
  6from ..obs import Obs
  7from ..correlators import Corr
  8from .json import create_json_string, import_json_string
  9import numpy as np
 10
 11
 12def to_sql(df, table_name, db, if_exists='fail', gz=True, **kwargs):
 13    """Write DataFrame including Obs or Corr valued columns to sqlite database.
 14
 15    Parameters
 16    ----------
 17    df : pandas.DataFrame
 18        Dataframe to be written to the database.
 19    table_name : str
 20        Name of the table in the database.
 21    db : str
 22        Path to the sqlite database.
 23    if exists : str
 24        How to behave if table already exists. Options 'fail', 'replace', 'append'.
 25    gz : bool
 26        If True the json strings are gzipped.
 27
 28    Returns
 29    -------
 30    None
 31    """
 32    se_df = _serialize_df(df, gz=gz)
 33    with closing(sqlite3.connect(db)) as con:
 34        se_df.to_sql(table_name, con=con, if_exists=if_exists, index=False, **kwargs)
 35
 36
 37def read_sql(sql, db, auto_gamma=False, **kwargs):
 38    """Execute SQL query on sqlite database and obtain DataFrame including Obs or Corr valued columns.
 39
 40    Parameters
 41    ----------
 42    sql : str
 43        SQL query to be executed.
 44    db : str
 45        Path to the sqlite database.
 46    auto_gamma : bool
 47        If True applies the gamma_method to all imported Obs objects with the default parameters for
 48        the error analysis. Default False.
 49
 50    Returns
 51    -------
 52    data : pandas.DataFrame
 53        Dataframe with the content of the sqlite database.
 54    """
 55    with closing(sqlite3.connect(db)) as con:
 56        extract_df = pd.read_sql(sql, con=con, **kwargs)
 57    return _deserialize_df(extract_df, auto_gamma=auto_gamma)
 58
 59
 60def dump_df(df, fname, gz=True):
 61    """Exports a pandas DataFrame containing Obs valued columns to a (gzipped) csv file.
 62
 63    Before making use of pandas to_csv functionality Obs objects are serialized via the standardized
 64    json format of pyerrors.
 65
 66    Parameters
 67    ----------
 68    df : pandas.DataFrame
 69        Dataframe to be dumped to a file.
 70    fname : str
 71        Filename of the output file.
 72    gz : bool
 73        If True, the output is a gzipped csv file. If False, the output is a csv file.
 74
 75    Returns
 76    -------
 77    None
 78    """
 79    for column in df:
 80        serialize = _need_to_serialize(df[column])
 81        if not serialize:
 82            if all(isinstance(entry, (int, np.integer, float, np.floating)) for entry in df[column]):
 83                if any([np.isnan(entry) for entry in df[column]]):
 84                    warnings.warn("nan value in column " + column + " will be replaced by None", UserWarning)
 85
 86    out = _serialize_df(df, gz=False)
 87
 88    if not fname.endswith('.csv'):
 89        fname += '.csv'
 90
 91    if gz is True:
 92        if not fname.endswith('.gz'):
 93            fname += '.gz'
 94        out.to_csv(fname, index=False, compression='gzip')
 95    else:
 96        out.to_csv(fname, index=False)
 97
 98
 99def load_df(fname, auto_gamma=False, gz=True):
100    """Imports a pandas DataFrame from a csv.(gz) file in which Obs objects are serialized as json strings.
101
102    Parameters
103    ----------
104    fname : str
105        Filename of the input file.
106    auto_gamma : bool
107        If True applies the gamma_method to all imported Obs objects with the default parameters for
108        the error analysis. Default False.
109    gz : bool
110        If True, assumes that data is gzipped. If False, assumes JSON file.
111
112    Returns
113    -------
114    data : pandas.DataFrame
115        Dataframe with the content of the sqlite database.
116    """
117    if not fname.endswith('.csv') and not fname.endswith('.gz'):
118        fname += '.csv'
119
120    if gz is True:
121        if not fname.endswith('.gz'):
122            fname += '.gz'
123        with gzip.open(fname) as f:
124            re_import = pd.read_csv(f, keep_default_na=False)
125    else:
126        if fname.endswith('.gz'):
127            warnings.warn("Trying to read from %s without unzipping!" % fname, UserWarning)
128        re_import = pd.read_csv(fname, keep_default_na=False)
129
130    return _deserialize_df(re_import, auto_gamma=auto_gamma)
131
132
def _serialize_df(df, gz=False):
    """Convert every Obs or Corr valued column into json strings following the pyerrors json specification.

    The input DataFrame is left untouched; a modified copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame to be serialized.
    gz: bool
        gzip the json string representation. Default False.

    Returns
    -------
    out : pandas.DataFrame
        Copy of `df` in which the columns selected by `_need_to_serialize` hold
        json strings (or gzipped bytes if `gz` is True).
    """
    def _to_json(entry):
        # None entries stay None in the json string column.
        if entry is None:
            return None
        return create_json_string(entry, indent=0)

    def _to_gzipped_bytes(text):
        # A missing entry is stored as the gzipped empty string.
        payload = '' if text is None else text
        return gzip.compress(payload.encode('utf-8'))

    out = df.copy()
    for column in out:
        if _need_to_serialize(out[column]) is not True:
            continue
        out[column] = out[column].transform(_to_json)
        if gz is True:
            out[column] = out[column].transform(_to_gzipped_bytes)
    return out
152
153
154def _deserialize_df(df, auto_gamma=False):
155    """Deserializes all pyerrors json strings into Obs or Corr objects according to the pyerrors json specification.
156
157    Parameters
158    ----------
159    df : pandas.DataFrame
160        DataFrame to be deserilized.
161    auto_gamma : bool
162        If True applies the gamma_method to all imported Obs objects with the default parameters for
163        the error analysis. Default False.
164
165    Notes:
166    ------
167    In case any column of the DataFrame is gzipped it is gunzipped in the process.
168    """
169    for column in df.select_dtypes(include="object"):
170        if isinstance(df[column][0], bytes):
171            if df[column][0].startswith(b"\x1f\x8b\x08\x00"):
172                df[column] = df[column].transform(lambda x: gzip.decompress(x).decode('utf-8'))
173
174        if not all([e is None for e in df[column]]):
175            df[column] = df[column].replace({r'^$': None}, regex=True)
176            i = 0
177            while df[column][i] is None:
178                i += 1
179            if isinstance(df[column][i], str):
180                if '"program":' in df[column][i][:20]:
181                    df[column] = df[column].transform(lambda x: import_json_string(x, verbose=False) if x is not None else None)
182                    if auto_gamma is True:
183                        if isinstance(df[column][i], list):
184                            df[column].apply(lambda x: [o.gm() if o is not None else x for o in x])
185                        else:
186                            df[column].apply(lambda x: x.gm() if x is not None else x)
187    return df
188
189
190def _need_to_serialize(col):
191    serialize = False
192    i = 0
193    while i < len(col) and col[i] is None:
194        i += 1
195    if i == len(col):
196        return serialize
197    if isinstance(col[i], (Obs, Corr)):
198        serialize = True
199    elif isinstance(col[i], list):
200        if all(isinstance(o, Obs) for o in col[i]):
201            serialize = True
202    return serialize
def to_sql(df, table_name, db, if_exists='fail', gz=True, **kwargs):
    """Store a DataFrame with Obs or Corr valued columns in a table of an sqlite database.

    The DataFrame is serialized with `_serialize_df` before it is handed to
    `pandas.DataFrame.to_sql`; the DataFrame index is not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to be written to the database.
    table_name : str
        Name of the table in the database.
    db : str
        Path to the sqlite database.
    if_exists : str
        How to behave if table already exists. Options 'fail', 'replace', 'append'.
    gz : bool
        If True the json strings are gzipped.
    **kwargs
        Additional keyword arguments forwarded to `pandas.DataFrame.to_sql`.

    Returns
    -------
    None
    """
    # Serialize first so that a failing serialization never touches the database.
    serialized = _serialize_df(df, gz=gz)
    connection = sqlite3.connect(db)
    with closing(connection):
        serialized.to_sql(table_name, con=connection, if_exists=if_exists, index=False, **kwargs)

Write DataFrame including Obs or Corr valued columns to sqlite database.

Parameters
  • df (pandas.DataFrame): Dataframe to be written to the database.
  • table_name (str): Name of the table in the database.
  • db (str): Path to the sqlite database.
  • if_exists (str): How to behave if table already exists. Options 'fail', 'replace', 'append'.
  • gz (bool): If True the json strings are gzipped.
Returns
  • None
def read_sql(sql, db, auto_gamma=False, **kwargs):
    """Run an SQL query against an sqlite database and return the result as a DataFrame.

    The raw query result is passed through `_deserialize_df`, which turns serialized
    columns back into Obs or Corr valued columns.

    Parameters
    ----------
    sql : str
        SQL query to be executed.
    db : str
        Path to the sqlite database.
    auto_gamma : bool
        If True applies the gamma_method to all imported Obs objects with the default parameters for
        the error analysis. Default False.
    **kwargs
        Additional keyword arguments forwarded to `pandas.read_sql`.

    Returns
    -------
    data : pandas.DataFrame
        Dataframe with the content of the sqlite database.
    """
    connection = sqlite3.connect(db)
    with closing(connection):
        raw = pd.read_sql(sql, con=connection, **kwargs)
    # Deserialization happens after the connection has been closed.
    return _deserialize_df(raw, auto_gamma=auto_gamma)

Execute SQL query on sqlite database and obtain DataFrame including Obs or Corr valued columns.

Parameters
  • sql (str): SQL query to be executed.
  • db (str): Path to the sqlite database.
  • auto_gamma (bool): If True applies the gamma_method to all imported Obs objects with the default parameters for the error analysis. Default False.
Returns
  • data (pandas.DataFrame): Dataframe with the content of the sqlite database.
def dump_df(df, fname, gz=True):
    """Exports a pandas DataFrame containing Obs valued columns to a (gzipped) csv file.

    Before making use of pandas to_csv functionality Obs objects are serialized via the standardized
    json format of pyerrors.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to be dumped to a file.
    fname : str
        Filename of the output file. The extension '.csv' (and '.gz' if `gz` is True) is
        appended if missing; a name that already ends with '.csv.gz' is used unchanged
        when `gz` is True.
    gz : bool
        If True, the output is a gzipped csv file. If False, the output is a csv file.

    Returns
    -------
    None

    Warns
    -----
    UserWarning
        If a purely numeric column contains nan values.
    """
    numeric_types = (int, np.integer, float, np.floating)
    for column in df:
        if _need_to_serialize(df[column]):
            continue
        entries = df[column]
        if all(isinstance(entry, numeric_types) for entry in entries):
            if any(np.isnan(entry) for entry in entries):
                # str() keeps the warning from raising TypeError for non-string column labels.
                warnings.warn("nan value in column " + str(column) + " will be replaced by None", UserWarning)

    out = _serialize_df(df, gz=False)

    # A complete '.csv.gz' name must not be extended to '.csv.gz.csv.gz', otherwise
    # load_df cannot find the file under the name the caller passed in.
    if not (gz is True and fname.endswith('.csv.gz')):
        if not fname.endswith('.csv'):
            fname += '.csv'
        if gz is True:
            fname += '.gz'

    if gz is True:
        out.to_csv(fname, index=False, compression='gzip')
    else:
        out.to_csv(fname, index=False)

Exports a pandas DataFrame containing Obs valued columns to a (gzipped) csv file.

Before making use of pandas to_csv functionality Obs objects are serialized via the standardized json format of pyerrors.

Parameters
  • df (pandas.DataFrame): Dataframe to be dumped to a file.
  • fname (str): Filename of the output file.
  • gz (bool): If True, the output is a gzipped csv file. If False, the output is a csv file.
Returns
  • None
def load_df(fname, auto_gamma=False, gz=True):
    """Read a DataFrame from a csv.(gz) file in which Obs objects are stored as json strings.

    Parameters
    ----------
    fname : str
        Filename of the input file. '.csv' is appended if the name ends with neither
        '.csv' nor '.gz'; '.gz' is appended if `gz` is True and it is missing.
    auto_gamma : bool
        If True applies the gamma_method to all imported Obs objects with the default parameters for
        the error analysis. Default False.
    gz : bool
        If True, assumes that data is gzipped. If False, assumes a plain csv file.

    Returns
    -------
    data : pandas.DataFrame
        Dataframe with the content of the csv file.
    """
    if not fname.endswith(('.csv', '.gz')):
        fname += '.csv'

    if gz is not True:
        if fname.endswith('.gz'):
            warnings.warn("Trying to read from %s without unzipping!" % fname, UserWarning)
        frame = pd.read_csv(fname, keep_default_na=False)
    else:
        if not fname.endswith('.gz'):
            fname += '.gz'
        with gzip.open(fname) as handle:
            # keep_default_na=False leaves empty fields as empty strings.
            frame = pd.read_csv(handle, keep_default_na=False)

    return _deserialize_df(frame, auto_gamma=auto_gamma)

Imports a pandas DataFrame from a csv.(gz) file in which Obs objects are serialized as json strings.

Parameters
  • fname (str): Filename of the input file.
  • auto_gamma (bool): If True applies the gamma_method to all imported Obs objects with the default parameters for the error analysis. Default False.
  • gz (bool): If True, assumes that data is gzipped. If False, assumes a plain csv file.
Returns
  • data (pandas.DataFrame): Dataframe with the content of the csv file.