From 9a7f5679c9fd7422b6906e6dda6c605aebfed402 Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Thu, 29 Sep 2022 09:46:15 +0100 Subject: [PATCH 1/2] docs: first version of data management example added. --- examples/07_data_management.ipynb | 554 ++++++++++++++++++++++++++++++ 1 file changed, 554 insertions(+) create mode 100644 examples/07_data_management.ipynb diff --git a/examples/07_data_management.ipynb b/examples/07_data_management.ipynb new file mode 100644 index 00000000..d317456a --- /dev/null +++ b/examples/07_data_management.ipynb @@ -0,0 +1,554 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data management" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pyerrors as pe" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data has been written using pyerrors 2.0.0.\n", + "Format version 0.1\n", + "Written by fjosw on 2022-01-06 11:11:19 +0100 on host XPS139305, Linux-5.11.0-44-generic-x86_64-with-glibc2.29\n", + "\n", + "Description: Test data for the correlator example\n" + ] + } + ], + "source": [ + "correlator_data = pe.input.json.load_json(\"./data/correlator_test\")\n", + "my_correlator = pe.Corr(correlator_data)\n", + "my_correlator.gamma_method()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import autograd.numpy as anp\n", + "def func_exp(a, x):\n", + " return a[1] * anp.exp(-a[0] * x)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "rows = []\n", + "for t_start in range(12, 17):\n", + " for t_stop in range(30, 35):\n", + " fr = my_correlator.fit(func_exp, [t_start, t_stop], silent=True)\n", + " fr.gamma_method()\n", + " row = {\"t_start\": 
t_start,\n", + " \"t_stop\": t_stop,\n", + " \"datapoints\": t_stop - t_start + 1,\n", + " \"chisquare_by_dof\": fr.chisquare_by_dof,\n", + " \"mass\": fr[0]}\n", + " rows.append(row)\n", + "my_df = pd.DataFrame(rows)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
t_startt_stopdatapointschisquare_by_dofmass
01230190.0578720.2218(12)
11231200.0639510.2221(11)
21232210.0649600.2223(11)
31233220.0664950.2224(10)
41234230.0666060.2225(10)
51330180.0515770.2215(12)
61331190.0609010.2219(11)
71332200.0635510.2221(12)
81333210.0664060.2223(12)
91334220.0672370.2224(12)
101430170.0523490.2213(13)
111431180.0636400.2218(13)
121432190.0668830.2220(14)
131433200.0700190.2223(15)
141434210.0707750.2224(15)
151530160.0560880.2213(16)
161531170.0675520.2218(17)
171532180.0701700.2221(18)
181533190.0725160.2224(18)
191534200.0725090.2225(18)
201630150.0599690.2214(21)
211631160.0708740.2220(20)
221632170.0724370.2223(21)
231633180.0736840.2225(21)
241634190.0727670.2227(20)
\n", + "
" + ], + "text/plain": [ + " t_start t_stop datapoints chisquare_by_dof mass\n", + "0 12 30 19 0.057872 0.2218(12)\n", + "1 12 31 20 0.063951 0.2221(11)\n", + "2 12 32 21 0.064960 0.2223(11)\n", + "3 12 33 22 0.066495 0.2224(10)\n", + "4 12 34 23 0.066606 0.2225(10)\n", + "5 13 30 18 0.051577 0.2215(12)\n", + "6 13 31 19 0.060901 0.2219(11)\n", + "7 13 32 20 0.063551 0.2221(12)\n", + "8 13 33 21 0.066406 0.2223(12)\n", + "9 13 34 22 0.067237 0.2224(12)\n", + "10 14 30 17 0.052349 0.2213(13)\n", + "11 14 31 18 0.063640 0.2218(13)\n", + "12 14 32 19 0.066883 0.2220(14)\n", + "13 14 33 20 0.070019 0.2223(15)\n", + "14 14 34 21 0.070775 0.2224(15)\n", + "15 15 30 16 0.056088 0.2213(16)\n", + "16 15 31 17 0.067552 0.2218(17)\n", + "17 15 32 18 0.070170 0.2221(18)\n", + "18 15 33 19 0.072516 0.2224(18)\n", + "19 15 34 20 0.072509 0.2225(18)\n", + "20 16 30 15 0.059969 0.2214(21)\n", + "21 16 31 16 0.070874 0.2220(20)\n", + "22 16 32 17 0.072437 0.2223(21)\n", + "23 16 33 18 0.073684 0.2225(21)\n", + "24 16 34 19 0.072767 0.2227(20)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "pe.input.pandas.to_sql(my_df, \"mass_table\", \"my_db.sqlite\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "new_df = df = pe.input.pandas.read_sql(f\"SELECT t_start, t_stop, mass FROM mass_table WHERE t_start > 13\"\n", + " ,\"my_db.sqlite\"\n", + " ,auto_gamma=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
t_startt_stopmass
014300.2213(13)
114310.2218(13)
214320.2220(14)
314330.2223(15)
414340.2224(15)
515300.2213(16)
615310.2218(17)
715320.2221(18)
815330.2224(18)
915340.2225(18)
1016300.2214(21)
1116310.2220(20)
1216320.2223(21)
1316330.2225(21)
1416340.2227(20)
\n", + "
" + ], + "text/plain": [ + " t_start t_stop mass\n", + "0 14 30 0.2213(13)\n", + "1 14 31 0.2218(13)\n", + "2 14 32 0.2220(14)\n", + "3 14 33 0.2223(15)\n", + "4 14 34 0.2224(15)\n", + "5 15 30 0.2213(16)\n", + "6 15 31 0.2218(17)\n", + "7 15 32 0.2221(18)\n", + "8 15 33 0.2224(18)\n", + "9 15 34 0.2225(18)\n", + "10 16 30 0.2214(21)\n", + "11 16 31 0.2220(20)\n", + "12 16 32 0.2223(21)\n", + "13 16 33 0.2225(21)\n", + "14 16 34 0.2227(20)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 6a57264868f343a880e931ce056fae6b3fb19be3 Mon Sep 17 00:00:00 2001 From: Fabian Joswig Date: Thu, 29 Sep 2022 17:11:24 +0100 Subject: [PATCH 2/2] docs: data management example refined. --- examples/07_data_management.ipynb | 328 +++++++----------------------- 1 file changed, 76 insertions(+), 252 deletions(-) diff --git a/examples/07_data_management.ipynb b/examples/07_data_management.ipynb index d317456a..329d87a9 100644 --- a/examples/07_data_management.ipynb +++ b/examples/07_data_management.ipynb @@ -18,6 +18,13 @@ "import pyerrors as pe" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the data management example we reuse the data from the correlator example." 
+ ] + }, { "cell_type": "code", "execution_count": 2, @@ -52,6 +59,13 @@ " return a[1] * anp.exp(-a[0] * x)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we perform uncorrelated fits of a single exponential function to the correlator and vary the range of the fit. The fit result can be conveniently stored in a pandas DataFrame together with the corresponding metadata." + ] + }, { "cell_type": "code", "execution_count": 4, @@ -60,7 +74,7 @@ "source": [ "rows = []\n", "for t_start in range(12, 17):\n", - " for t_stop in range(30, 35):\n", + " for t_stop in range(30, 32):\n", " fr = my_correlator.fit(func_exp, [t_start, t_stop], silent=True)\n", " fr.gamma_method()\n", " row = {\"t_start\": t_start,\n", @@ -124,30 +138,6 @@ " \n", " \n", " 2\n", - " 12\n", - " 32\n", - " 21\n", - " 0.064960\n", - " 0.2223(11)\n", - " \n", - " \n", - " 3\n", - " 12\n", - " 33\n", - " 22\n", - " 0.066495\n", - " 0.2224(10)\n", - " \n", - " \n", - " 4\n", - " 12\n", - " 34\n", - " 23\n", - " 0.066606\n", - " 0.2225(10)\n", - " \n", - " \n", - " 5\n", " 13\n", " 30\n", " 18\n", @@ -155,7 +145,7 @@ " 0.2215(12)\n", " \n", " \n", - " 6\n", + " 3\n", " 13\n", " 31\n", " 19\n", @@ -163,31 +153,7 @@ " 0.2219(11)\n", " \n", " \n", - " 7\n", - " 13\n", - " 32\n", - " 20\n", - " 0.063551\n", - " 0.2221(12)\n", - " \n", - " \n", - " 8\n", - " 13\n", - " 33\n", - " 21\n", - " 0.066406\n", - " 0.2223(12)\n", - " \n", - " \n", - " 9\n", - " 13\n", - " 34\n", - " 22\n", - " 0.067237\n", - " 0.2224(12)\n", - " \n", - " \n", - " 10\n", + " 4\n", " 14\n", " 30\n", " 17\n", @@ -195,7 +161,7 @@ " 0.2213(13)\n", " \n", " \n", - " 11\n", + " 5\n", " 14\n", " 31\n", " 18\n", @@ -203,31 +169,7 @@ " 0.2218(13)\n", " \n", " \n", - " 12\n", - " 14\n", - " 32\n", - " 19\n", - " 0.066883\n", - " 0.2220(14)\n", - " \n", - " \n", - " 13\n", - " 14\n", - " 33\n", - " 20\n", - " 0.070019\n", - " 0.2223(15)\n", - " \n", - " \n", - " 14\n", - " 14\n", - " 34\n", - " 21\n", - " 
0.070775\n", - " 0.2224(15)\n", - " \n", - " \n", - " 15\n", + " 6\n", " 15\n", " 30\n", " 16\n", @@ -235,7 +177,7 @@ " 0.2213(16)\n", " \n", " \n", - " 16\n", + " 7\n", " 15\n", " 31\n", " 17\n", @@ -243,31 +185,7 @@ " 0.2218(17)\n", " \n", " \n", - " 17\n", - " 15\n", - " 32\n", - " 18\n", - " 0.070170\n", - " 0.2221(18)\n", - " \n", - " \n", - " 18\n", - " 15\n", - " 33\n", - " 19\n", - " 0.072516\n", - " 0.2224(18)\n", - " \n", - " \n", - " 19\n", - " 15\n", - " 34\n", - " 20\n", - " 0.072509\n", - " 0.2225(18)\n", - " \n", - " \n", - " 20\n", + " 8\n", " 16\n", " 30\n", " 15\n", @@ -275,68 +193,29 @@ " 0.2214(21)\n", " \n", " \n", - " 21\n", + " 9\n", " 16\n", " 31\n", " 16\n", " 0.070874\n", " 0.2220(20)\n", " \n", - " \n", - " 22\n", - " 16\n", - " 32\n", - " 17\n", - " 0.072437\n", - " 0.2223(21)\n", - " \n", - " \n", - " 23\n", - " 16\n", - " 33\n", - " 18\n", - " 0.073684\n", - " 0.2225(21)\n", - " \n", - " \n", - " 24\n", - " 16\n", - " 34\n", - " 19\n", - " 0.072767\n", - " 0.2227(20)\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " t_start t_stop datapoints chisquare_by_dof mass\n", - "0 12 30 19 0.057872 0.2218(12)\n", - "1 12 31 20 0.063951 0.2221(11)\n", - "2 12 32 21 0.064960 0.2223(11)\n", - "3 12 33 22 0.066495 0.2224(10)\n", - "4 12 34 23 0.066606 0.2225(10)\n", - "5 13 30 18 0.051577 0.2215(12)\n", - "6 13 31 19 0.060901 0.2219(11)\n", - "7 13 32 20 0.063551 0.2221(12)\n", - "8 13 33 21 0.066406 0.2223(12)\n", - "9 13 34 22 0.067237 0.2224(12)\n", - "10 14 30 17 0.052349 0.2213(13)\n", - "11 14 31 18 0.063640 0.2218(13)\n", - "12 14 32 19 0.066883 0.2220(14)\n", - "13 14 33 20 0.070019 0.2223(15)\n", - "14 14 34 21 0.070775 0.2224(15)\n", - "15 15 30 16 0.056088 0.2213(16)\n", - "16 15 31 17 0.067552 0.2218(17)\n", - "17 15 32 18 0.070170 0.2221(18)\n", - "18 15 33 19 0.072516 0.2224(18)\n", - "19 15 34 20 0.072509 0.2225(18)\n", - "20 16 30 15 0.059969 0.2214(21)\n", - "21 16 31 16 0.070874 0.2220(20)\n", - "22 16 32 17 0.072437 
0.2223(21)\n", - "23 16 33 18 0.073684 0.2225(21)\n", - "24 16 34 19 0.072767 0.2227(20)" + " t_start t_stop datapoints chisquare_by_dof mass\n", + "0 12 30 19 0.057872 0.2218(12)\n", + "1 12 31 20 0.063951 0.2221(11)\n", + "2 13 30 18 0.051577 0.2215(12)\n", + "3 13 31 19 0.060901 0.2219(11)\n", + "4 14 30 17 0.052349 0.2213(13)\n", + "5 14 31 18 0.063640 0.2218(13)\n", + "6 15 30 16 0.056088 0.2213(16)\n", + "7 15 31 17 0.067552 0.2218(17)\n", + "8 16 30 15 0.059969 0.2214(21)\n", + "9 16 31 16 0.070874 0.2220(20)" ] }, "execution_count": 5, @@ -348,30 +227,45 @@ "my_df" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The content of this pandas DataFrame can be inserted into a relational database, making use of the `JSON` serialization of `pyerrors` objects. In this example we use an SQLite database." + ] + }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "pe.input.pandas.to_sql(my_df, \"mass_table\", \"my_db.sqlite\")" + "pe.input.pandas.to_sql(my_df, \"mass_table\", \"my_db.sqlite\", if_exists='fail')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At a later stage of the analysis the content of the database can be reconstructed into a DataFrame via SQL queries.\n", + "In this example we extract `t_start`, `t_stop` and the fitted mass for all fits which start at times larger than 14." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "new_df = pe.input.pandas.read_sql(f\"SELECT t_start, t_stop, mass FROM mass_table WHERE t_start > 14\",\n", + " \"my_db.sqlite\",\n", + " auto_gamma=True)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [ - "new_df = df = pe.input.pandas.read_sql(f\"SELECT t_start, t_stop, mass FROM mass_table WHERE t_start > 13\"\n", - " ,\"my_db.sqlite\"\n", - " ,auto_gamma=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, "outputs": [ { "data": { @@ -402,118 +296,41 @@ " \n", " \n", " 0\n", - " 14\n", - " 30\n", - " 0.2213(13)\n", - " \n", - " \n", - " 1\n", - " 14\n", - " 31\n", - " 0.2218(13)\n", - " \n", - " \n", - " 2\n", - " 14\n", - " 32\n", - " 0.2220(14)\n", - " \n", - " \n", - " 3\n", - " 14\n", - " 33\n", - " 0.2223(15)\n", - " \n", - " \n", - " 4\n", - " 14\n", - " 34\n", - " 0.2224(15)\n", - " \n", - " \n", - " 5\n", " 15\n", " 30\n", " 0.2213(16)\n", " \n", " \n", - " 6\n", + " 1\n", " 15\n", " 31\n", " 0.2218(17)\n", " \n", " \n", - " 7\n", - " 15\n", - " 32\n", - " 0.2221(18)\n", - " \n", - " \n", - " 8\n", - " 15\n", - " 33\n", - " 0.2224(18)\n", - " \n", - " \n", - " 9\n", - " 15\n", - " 34\n", - " 0.2225(18)\n", - " \n", - " \n", - " 10\n", + " 2\n", " 16\n", " 30\n", " 0.2214(21)\n", " \n", " \n", - " 11\n", + " 3\n", " 16\n", " 31\n", " 0.2220(20)\n", " \n", - " \n", - " 12\n", - " 16\n", - " 32\n", - " 0.2223(21)\n", - " \n", - " \n", - " 13\n", - " 16\n", - " 33\n", - " 0.2225(21)\n", - " \n", - " \n", - " 14\n", - " 16\n", - " 34\n", - " 0.2227(20)\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " t_start t_stop mass\n", - "0 14 30 0.2213(13)\n", - "1 14 31 0.2218(13)\n", - "2 14 32 0.2220(14)\n", - "3 14 33 0.2223(15)\n", - "4 14 34 0.2224(15)\n", - "5 15 30 0.2213(16)\n", - "6 15 31 0.2218(17)\n", - "7 15 32 0.2221(18)\n", - "8 15 33 0.2224(18)\n", - "9 15 34 
0.2225(18)\n", - "10 16 30 0.2214(21)\n", - "11 16 31 0.2220(20)\n", - "12 16 32 0.2223(21)\n", - "13 16 33 0.2225(21)\n", - "14 16 34 0.2227(20)" + " t_start t_stop mass\n", + "0 15 30 0.2213(16)\n", + "1 15 31 0.2218(17)\n", + "2 16 30 0.2214(21)\n", + "3 16 31 0.2220(20)" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -522,6 +339,13 @@ "new_df" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The storage of intermediate analysis results in relational databases allows for a convenient and scalable way of splitting up a detailed analysis into multiple independent steps." + ] + }, { "cell_type": "code", "execution_count": null,