diff --git a/examples/07_data_management.ipynb b/examples/07_data_management.ipynb new file mode 100644 index 00000000..329d87a9 --- /dev/null +++ b/examples/07_data_management.ipynb @@ -0,0 +1,378 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data management" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pyerrors as pe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the data management example we reuse the data from the correlator example." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data has been written using pyerrors 2.0.0.\n", + "Format version 0.1\n", + "Written by fjosw on 2022-01-06 11:11:19 +0100 on host XPS139305, Linux-5.11.0-44-generic-x86_64-with-glibc2.29\n", + "\n", + "Description: Test data for the correlator example\n" + ] + } + ], + "source": [ + "correlator_data = pe.input.json.load_json(\"./data/correlator_test\")\n", + "my_correlator = pe.Corr(correlator_data)\n", + "my_correlator.gamma_method()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import autograd.numpy as anp\n", + "def func_exp(a, x):\n", + " return a[1] * anp.exp(-a[0] * x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we perform uncorrelated fits of a single exponential function to the correlator and vary the range of the fit. The fit result can be conveniently stored in a pandas DataFrame together with the corresponding metadata." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "rows = []\n", + "for t_start in range(12, 17):\n", + " for t_stop in range(30, 32):\n", + " fr = my_correlator.fit(func_exp, [t_start, t_stop], silent=True)\n", + " fr.gamma_method()\n", + " row = {\"t_start\": t_start,\n", + " \"t_stop\": t_stop,\n", + " \"datapoints\": t_stop - t_start + 1,\n", + " \"chisquare_by_dof\": fr.chisquare_by_dof,\n", + " \"mass\": fr[0]}\n", + " rows.append(row)\n", + "my_df = pd.DataFrame(rows)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
t_startt_stopdatapointschisquare_by_dofmass
01230190.0578720.2218(12)
11231200.0639510.2221(11)
21330180.0515770.2215(12)
31331190.0609010.2219(11)
41430170.0523490.2213(13)
51431180.0636400.2218(13)
61530160.0560880.2213(16)
71531170.0675520.2218(17)
81630150.0599690.2214(21)
91631160.0708740.2220(20)
\n", + "
" + ], + "text/plain": [ + " t_start t_stop datapoints chisquare_by_dof mass\n", + "0 12 30 19 0.057872 0.2218(12)\n", + "1 12 31 20 0.063951 0.2221(11)\n", + "2 13 30 18 0.051577 0.2215(12)\n", + "3 13 31 19 0.060901 0.2219(11)\n", + "4 14 30 17 0.052349 0.2213(13)\n", + "5 14 31 18 0.063640 0.2218(13)\n", + "6 15 30 16 0.056088 0.2213(16)\n", + "7 15 31 17 0.067552 0.2218(17)\n", + "8 16 30 15 0.059969 0.2214(21)\n", + "9 16 31 16 0.070874 0.2220(20)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The content of this pandas DataFrame can be inserted into a relational database, making use of the `JSON` serialization of `pyerrors` objects. In this example we use an SQLite database." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "pe.input.pandas.to_sql(my_df, \"mass_table\", \"my_db.sqlite\", if_exists='fail')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At a later stage of the analysis the content of the database can be reconstructed into a DataFrame via SQL queries.\n", + "In this example we extract `t_start`, `t_stop` and the fitted mass for all fits which start at times larger than 14." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "new_df = pe.input.pandas.read_sql(f\"SELECT t_start, t_stop, mass FROM mass_table WHERE t_start > 14\",\n", + " \"my_db.sqlite\",\n", + " auto_gamma=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
t_startt_stopmass
015300.2213(16)
115310.2218(17)
216300.2214(21)
316310.2220(20)
\n", + "
" + ], + "text/plain": [ + " t_start t_stop mass\n", + "0 15 30 0.2213(16)\n", + "1 15 31 0.2218(17)\n", + "2 16 30 0.2214(21)\n", + "3 16 31 0.2220(20)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The storage of intermediate analysis results in relational databases allows for a convenient and scalable way of splitting up a detailed analysis in multiple independent steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}