From cbe4121369257275e0dee15358563413820aff76 Mon Sep 17 00:00:00 2001 From: Eugenio Date: Mon, 26 Jul 2021 12:04:07 -0300 Subject: [PATCH 1/7] Queries examples structure defined --- notebook/examples/queries_examples.ipynb | 283 +++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 notebook/examples/queries_examples.ipynb diff --git a/notebook/examples/queries_examples.ipynb b/notebook/examples/queries_examples.ipynb new file mode 100644 index 0000000..7b8cd54 --- /dev/null +++ b/notebook/examples/queries_examples.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from soam.workflow.time_series_extractor import TimeSeriesExtractor\n", + "from muttlib.dbconn import get_client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sqlite_cfg = {\n", + " \"db_type\": \"sqlite\",\n", + " \"database\": \"soam_quickstart.db\"\n", + "}\n", + "\n", + "sqlite_client = get_client(sqlite_cfg)[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "extractor = TimeSeriesExtractor(db=sqlite_client, table_name='stock')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query 1\n", + "\n", + "Simple query, just retrieving all the data from the database.\n", + "\n", + "Query shape: build_query_kwargs: dict of {str: obj}\n", + " Configuration of the extraction query to be used for the extraction." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "query={\n", + " 'columns': '*'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
iddatesymbolavg_num_tradesavg_price
012021-03-01AAPL80000.0125.0
122021-03-02AAPL70000.0126.0
232021-03-03AAPL80000.0123.0
342021-03-04AAPL70000.0121.0
452021-03-05AAPL80000.0119.0
\n
", + "text/plain": " id date symbol avg_num_trades avg_price\n0 1 2021-03-01 AAPL 80000.0 125.0\n1 2 2021-03-02 AAPL 70000.0 126.0\n2 3 2021-03-03 AAPL 80000.0 123.0\n3 4 2021-03-04 AAPL 70000.0 121.0\n4 5 2021-03-05 AAPL 80000.0 119.0" + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query 2\n", + "Adding some extra conditionals:\n", + "- Filtering data by just retrieving Apple's stock valuations.\n", + "- Querying only a subset of the columns.\n", + "- Renaming some columns with aliases." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "query={\n", + " 'columns': ['date', 'symbol', 'avg_price AS Valuation'],\n", + " 'extra_where_conditions': [\"symbol = 'AAPL'\"]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datesymbolValuation
02021-03-01AAPL125.0
12021-03-02AAPL126.0
22021-03-03AAPL123.0
32021-03-04AAPL121.0
42021-03-05AAPL119.0
\n
", + "text/plain": " date symbol Valuation\n0 2021-03-01 AAPL 125.0\n1 2021-03-02 AAPL 126.0\n2 2021-03-03 AAPL 123.0\n3 2021-03-04 AAPL 121.0\n4 2021-03-05 AAPL 119.0" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query 3\n", + "Adding some extra conditionals:\n", + "- Filtering data by certain days.\n", + "- Ordering results based on their dates." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "query={\n", + " 'columns': ['date', 'symbol', 'avg_price AS Valuation'],\n", + " 'timestamp_col': 'date',\n", + " 'start_date': \"2021-03-01\",\n", + " 'end_date': \"2021-03-20\",\n", + " 'extra_where_conditions': [\"symbol = 'AAPL'\"],\n", + " 'order_by': [\"date ASC\"]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datesymbolValuation
02021-03-01AAPL125.0
12021-03-02AAPL126.0
22021-03-03AAPL123.0
32021-03-04AAPL121.0
42021-03-05AAPL119.0
\n
", + "text/plain": " date symbol Valuation\n0 2021-03-01 AAPL 125.0\n1 2021-03-02 AAPL 126.0\n2 2021-03-03 AAPL 123.0\n3 2021-03-04 AAPL 121.0\n4 2021-03-05 AAPL 119.0" + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query 4\n", + "\n", + "Adding some aggregated data.\n", + "- Multiply the average valuation with the amount of trades to obtain the transactional volume of the day.\n", + "- Group by date and symbol, this logic is implicit in the class, you don't need to specify it.\n", + "- Filter by a certain level of volume by using the having method." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "query={\n", + " 'columns': ['date', 'symbol', 'avg_num_trades * avg_price AS Volume'],\n", + " 'dimensions': ['date','symbol'],\n", + " 'timestamp_col': 'date',\n", + " 'start_date': \"2021-03-01\",\n", + " 'end_date': \"2021-03-20\",\n", + " 'order_by': [\"date ASC\"],\n", + " 'extra_having_conditions': ['Volume > 1000000']\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datesymbolVolume
02021-03-01AAPL10000000.0
12021-03-01TSLA6300000.0
22021-03-02AAPL8820000.0
32021-03-02TSLA6448000.0
42021-03-03AAPL9840000.0
\n
", + "text/plain": " date symbol Volume\n0 2021-03-01 AAPL 10000000.0\n1 2021-03-01 TSLA 6300000.0\n2 2021-03-02 AAPL 8820000.0\n3 2021-03-02 TSLA 6448000.0\n4 2021-03-03 AAPL 9840000.0" + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Query 5\n", + "\n", + "Adding some aggregated data.\n", + "- Retrieve the day with the biggest transactional volume for each company." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "query={\n", + " 'columns': ['date', 'symbol', 'max(avg_num_trades * avg_price) AS Max_Volume'],\n", + " 'dimensions': ['symbol']\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datesymbolMax_Volume
02021-03-22AAPL21300000.0
12021-03-08TSLA10324000.0
\n
", + "text/plain": " date symbol Max_Volume\n0 2021-03-22 AAPL 21300000.0\n1 2021-03-08 TSLA 10324000.0" + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From 677f729674018a49a10c40fab790ac324a486578 Mon Sep 17 00:00:00 2001 From: Eugenio Date: Fri, 30 Jul 2021 17:08:59 -0300 Subject: [PATCH 2/7] Slicer example added --- notebook/examples/queries_examples.ipynb | 898 ++++++++++++++++++++++- 1 file changed, 868 insertions(+), 30 deletions(-) diff --git a/notebook/examples/queries_examples.ipynb b/notebook/examples/queries_examples.ipynb index 7b8cd54..1457376 100644 --- a/notebook/examples/queries_examples.ipynb +++ b/notebook/examples/queries_examples.ipynb @@ -1,5 +1,23 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Time Series Extractor and Slicer Introduction\n", + "\n", + "This notebook intends to show you different queries that can be done with the TimeSeriesExtractor class that soam provides, such as:\n", + "- Simple extract all query.\n", + "- Temporal data filters or conditions.\n", + "- Categorical data filters or conditions.\n", + "- Aggregated fields.\n", + "\n", + "At the same time, we provide a brief introduction to the Slicer class that soam provides to generate slices of your DataFrame.\n", + "\n", + "\n", + "- Poner mas prolijo + storytelling." + ] + }, { "cell_type": "code", "execution_count": 1, @@ -41,13 +59,14 @@ "\n", "Simple query, just retrieving all the data from the database.\n", "\n", - "Query shape: build_query_kwargs: dict of {str: obj}\n", - " Configuration of the extraction query to be used for the extraction." + "Query shape: \n", + "- build_query_kwargs: dict of {str: obj}\n", + " - Configuration of the extraction query to be used for the extraction." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -58,15 +77,92 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
iddatesymbolavg_num_tradesavg_price
012021-03-01AAPL80000.0125.0
122021-03-02AAPL70000.0126.0
232021-03-03AAPL80000.0123.0
342021-03-04AAPL70000.0121.0
452021-03-05AAPL80000.0119.0
\n
", - "text/plain": " id date symbol avg_num_trades avg_price\n0 1 2021-03-01 AAPL 80000.0 125.0\n1 2 2021-03-02 AAPL 70000.0 126.0\n2 3 2021-03-03 AAPL 80000.0 123.0\n3 4 2021-03-04 AAPL 70000.0 121.0\n4 5 2021-03-05 AAPL 80000.0 119.0" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatesymbolavg_num_tradesavg_price
012021-03-01AAPL80000.0125.0
122021-03-02AAPL70000.0126.0
232021-03-03AAPL80000.0123.0
342021-03-04AAPL70000.0121.0
452021-03-05AAPL80000.0119.0
\n", + "
" + ], + "text/plain": [ + " id date symbol avg_num_trades avg_price\n", + "0 1 2021-03-01 AAPL 80000.0 125.0\n", + "1 2 2021-03-02 AAPL 70000.0 126.0\n", + "2 3 2021-03-03 AAPL 80000.0 123.0\n", + "3 4 2021-03-04 AAPL 70000.0 121.0\n", + "4 5 2021-03-05 AAPL 80000.0 119.0" + ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -90,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -102,15 +198,80 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datesymbolValuation
02021-03-01AAPL125.0
12021-03-02AAPL126.0
22021-03-03AAPL123.0
32021-03-04AAPL121.0
42021-03-05AAPL119.0
\n
", - "text/plain": " date symbol Valuation\n0 2021-03-01 AAPL 125.0\n1 2021-03-02 AAPL 126.0\n2 2021-03-03 AAPL 123.0\n3 2021-03-04 AAPL 121.0\n4 2021-03-05 AAPL 119.0" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymbolValuation
02021-03-01AAPL125.0
12021-03-02AAPL126.0
22021-03-03AAPL123.0
32021-03-04AAPL121.0
42021-03-05AAPL119.0
\n", + "
" + ], + "text/plain": [ + " date symbol Valuation\n", + "0 2021-03-01 AAPL 125.0\n", + "1 2021-03-02 AAPL 126.0\n", + "2 2021-03-03 AAPL 123.0\n", + "3 2021-03-04 AAPL 121.0\n", + "4 2021-03-05 AAPL 119.0" + ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -132,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -148,15 +309,80 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datesymbolValuation
02021-03-01AAPL125.0
12021-03-02AAPL126.0
22021-03-03AAPL123.0
32021-03-04AAPL121.0
42021-03-05AAPL119.0
\n
", - "text/plain": " date symbol Valuation\n0 2021-03-01 AAPL 125.0\n1 2021-03-02 AAPL 126.0\n2 2021-03-03 AAPL 123.0\n3 2021-03-04 AAPL 121.0\n4 2021-03-05 AAPL 119.0" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymbolValuation
02021-03-01AAPL125.0
12021-03-02AAPL126.0
22021-03-03AAPL123.0
32021-03-04AAPL121.0
42021-03-05AAPL119.0
\n", + "
" + ], + "text/plain": [ + " date symbol Valuation\n", + "0 2021-03-01 AAPL 125.0\n", + "1 2021-03-02 AAPL 126.0\n", + "2 2021-03-03 AAPL 123.0\n", + "3 2021-03-04 AAPL 121.0\n", + "4 2021-03-05 AAPL 119.0" + ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -180,7 +406,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -197,15 +423,80 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datesymbolVolume
02021-03-01AAPL10000000.0
12021-03-01TSLA6300000.0
22021-03-02AAPL8820000.0
32021-03-02TSLA6448000.0
42021-03-03AAPL9840000.0
\n
", - "text/plain": " date symbol Volume\n0 2021-03-01 AAPL 10000000.0\n1 2021-03-01 TSLA 6300000.0\n2 2021-03-02 AAPL 8820000.0\n3 2021-03-02 TSLA 6448000.0\n4 2021-03-03 AAPL 9840000.0" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymbolVolume
02021-03-01AAPL10000000.0
12021-03-01TSLA6300000.0
22021-03-02AAPL8820000.0
32021-03-02TSLA6448000.0
42021-03-03AAPL9840000.0
\n", + "
" + ], + "text/plain": [ + " date symbol Volume\n", + "0 2021-03-01 AAPL 10000000.0\n", + "1 2021-03-01 TSLA 6300000.0\n", + "2 2021-03-02 AAPL 8820000.0\n", + "3 2021-03-02 TSLA 6448000.0\n", + "4 2021-03-03 AAPL 9840000.0" + ] }, - "execution_count": 41, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -227,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -239,23 +530,571 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymbolMax_Volume
02021-03-22AAPL21300000.0
12021-03-08TSLA10324000.0
\n", + "
" + ], + "text/plain": [ + " date symbol Max_Volume\n", + "0 2021-03-22 AAPL 21300000.0\n", + "1 2021-03-08 TSLA 10324000.0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Slicer\n", + "\n", + "#### First instantiate the class:\n", + "\n", + "Slice the incoming data upon the given dimensions\n", + "\n", + " Parameters\n", + " ----------\n", + " dimensions:\n", + " str or list of str labels of categorical columns to slices\n", + " metrics:\n", + " str or list of str labels of metrics columns to slices\n", + " ds_col:\n", + " str of datetime column\n", + " keeps:\n", + " str or list of str labels of columns to keep.\n", + " \n", + " \n", + " \n", + "#### Then run the method .run:\n", + "\n", + "Slice the given dataframe with the dimensions setted.\n", + "\n", + " Parameters\n", + " ----------\n", + " raw_df\n", + " A pandas DataFrame containing the raw data to slice\n", + "\n", + " Returns\n", + " -------\n", + " list[pd.DataFrame]\n", + " DataFrame containing the sliced dataframes." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from soam.workflow.slicer import Slicer" + ] + }, + { + "cell_type": "code", + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { - "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
datesymbolMax_Volume
02021-03-22AAPL21300000.0
12021-03-08TSLA10324000.0
\n
", - "text/plain": " date symbol Max_Volume\n0 2021-03-22 AAPL 21300000.0\n1 2021-03-08 TSLA 10324000.0" + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatesymbolavg_num_tradesavg_price
012021-03-01AAPL80000.0125.0
122021-03-02AAPL70000.0126.0
232021-03-03AAPL80000.0123.0
342021-03-04AAPL70000.0121.0
452021-03-05AAPL80000.0119.0
\n", + "
" + ], + "text/plain": [ + " id date symbol avg_num_trades avg_price\n", + "0 1 2021-03-01 AAPL 80000.0 125.0\n", + "1 2 2021-03-02 AAPL 70000.0 126.0\n", + "2 3 2021-03-03 AAPL 80000.0 123.0\n", + "3 4 2021-03-04 AAPL 70000.0 121.0\n", + "4 5 2021-03-05 AAPL 80000.0 119.0" + ] }, - "execution_count": 48, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "query={\n", + " 'columns': '*'\n", + "}\n", "df = extractor.run(build_query_kwargs = query)\n", + "\n", "df.head()" ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "slicer = Slicer(metrics=[\"avg_num_trades\", \"avg_price\"], ds_col=\"date\", dimensions=[\"symbol\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "apple_trades, apple_price, tesla_trades, tesla_price = slicer.run(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymbolavg_num_trades
02021-03-01AAPL80000.0
12021-03-02AAPL70000.0
22021-03-03AAPL80000.0
32021-03-04AAPL70000.0
42021-03-05AAPL80000.0
\n", + "
" + ], + "text/plain": [ + " date symbol avg_num_trades\n", + "0 2021-03-01 AAPL 80000.0\n", + "1 2021-03-02 AAPL 70000.0\n", + "2 2021-03-03 AAPL 80000.0\n", + "3 2021-03-04 AAPL 70000.0\n", + "4 2021-03-05 AAPL 80000.0" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "apple_trades.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymbolavg_price
02021-03-01AAPL125.0
12021-03-02AAPL126.0
22021-03-03AAPL123.0
32021-03-04AAPL121.0
42021-03-05AAPL119.0
\n", + "
" + ], + "text/plain": [ + " date symbol avg_price\n", + "0 2021-03-01 AAPL 125.0\n", + "1 2021-03-02 AAPL 126.0\n", + "2 2021-03-03 AAPL 123.0\n", + "3 2021-03-04 AAPL 121.0\n", + "4 2021-03-05 AAPL 119.0" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "apple_price.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymbolavg_num_trades
222021-03-01TSLA60000.0
232021-03-02TSLA62000.0
242021-03-03TSLA64000.0
252021-03-04TSLA69000.0
262021-03-05TSLA80000.0
\n", + "
" + ], + "text/plain": [ + " date symbol avg_num_trades\n", + "22 2021-03-01 TSLA 60000.0\n", + "23 2021-03-02 TSLA 62000.0\n", + "24 2021-03-03 TSLA 64000.0\n", + "25 2021-03-04 TSLA 69000.0\n", + "26 2021-03-05 TSLA 80000.0" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tesla_trades.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymbolavg_price
222021-03-01TSLA105.0
232021-03-02TSLA104.0
242021-03-03TSLA101.0
252021-03-04TSLA108.0
262021-03-05TSLA115.0
\n", + "
" + ], + "text/plain": [ + " date symbol avg_price\n", + "22 2021-03-01 TSLA 105.0\n", + "23 2021-03-02 TSLA 104.0\n", + "24 2021-03-03 TSLA 101.0\n", + "25 2021-03-04 TSLA 108.0\n", + "26 2021-03-05 TSLA 115.0" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tesla_price.head()" + ] } ], "metadata": { @@ -275,9 +1114,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" - }, - "orig_nbformat": 2 + } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} From b3d86ed249c55d50f3984f481d8387aef34a8d57 Mon Sep 17 00:00:00 2001 From: Eugenio Date: Mon, 2 Aug 2021 12:05:44 -0300 Subject: [PATCH 3/7] Better presentation and explanations --- notebook/examples/queries_examples.ipynb | 235 +++++++++++++++-------- 1 file changed, 160 insertions(+), 75 deletions(-) diff --git a/notebook/examples/queries_examples.ipynb b/notebook/examples/queries_examples.ipynb index 1457376..b55de80 100644 --- a/notebook/examples/queries_examples.ipynb +++ b/notebook/examples/queries_examples.ipynb @@ -4,18 +4,29 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Time Series Extractor and Slicer Introduction\n", + "# Time Series Extractor and Slicer Introduction\n", "\n", - "This notebook intends to show you different queries that can be done with the TimeSeriesExtractor class that soam provides, such as:\n", - "- Simple extract all query.\n", - "- Temporal data filters or conditions.\n", + "This notebook intends to show you different queries that can be done with the `TimeSeriesExtractor` class that `soam` provides, such as:\n", + "- Simple extract * query.\n", "- Categorical data filters or conditions.\n", + "- Temporal data filters or conditions.\n", "- Aggregated fields.\n", "\n", - "At the same time, we provide a brief introduction to the Slicer class that soam provides to generate slices of your DataFrame.\n", + "At the same time, we provide a brief introduction to the `Slicer` class that `soam` provides to generate slices of your `DataFrame`.\n", "\n", + "## Database used\n", + "We will be using a database that stores information on Apple's and Tesla's stocks over a period of time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", "\n", - "- Poner mas prolijo + storytelling." + "The setup consists of:\n", + "- Import the needed dependencies.\n", + "- Establish the connection with our database." ] }, { @@ -42,6 +53,17 @@ "sqlite_client = get_client(sqlite_cfg)[1]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Time Series Extractor\n", + "\n", + "- Instantiate our extractor object.\n", + "\n", + "> To build a query you should create a `dictionary` of `{str: obj}` that will be later used for the extraction by executing the `extractor.run` method." + ] + }, { "cell_type": "code", "execution_count": 3, @@ -55,18 +77,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Query 1\n", + "### Simple extract * query\n", "\n", - "Simple query, just retrieving all the data from the database.\n", - "\n", - "Query shape: \n", - "- build_query_kwargs: dict of {str: obj}\n", - " - Configuration of the extraction query to be used for the extraction." + "Simple query, just retrieving all the data from the database." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -77,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -162,14 +180,13 @@ "4 5 2021-03-05 AAPL 80000.0 119.0" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = extractor.run(build_query_kwargs = query)\n", - "\n", "df.head()" ] }, @@ -177,16 +194,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Query 2\n", - "Adding some extra conditionals:\n", - "- Filtering data by just retrieving Apple's stock valuations.\n", + "### Categorical data filters or conditions\n", + "Adding some filters and conditionals:\n", "- Querying only a subset of the columns.\n", - "- Renaming some columns with aliases." + "- Renaming some columns with aliases.\n", + "- Filtering data by just retrieving Apple's stock valuations." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -198,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -271,7 +288,7 @@ "4 2021-03-05 AAPL 119.0" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -285,15 +302,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Query 3\n", - "Adding some extra conditionals:\n", - "- Filtering data by certain days.\n", - "- Ordering results based on their dates." + "### Temporal data filters or conditions\n", + "Adding some extra filters and sorting results:\n", + "- Filtering data by certain dates.\n", + "- Sorting results based on their dates in ascending order." ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -309,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -382,7 +399,7 @@ "4 2021-03-05 AAPL 119.0" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -396,17 +413,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Query 4\n", + "### Aggregated Fields I\n", "\n", "Adding some aggregated data.\n", - "- Multiply the average valuation with the amount of trades to obtain the transactional volume of the day.\n", - "- Group by date and symbol, this logic is implicit in the class, you don't need to specify it.\n", - "- Filter by a certain level of volume by using the having method." + "- Multiply the average valuation with the amount of trades to obtain the transactional volume per day.\n", + "- Group by date and symbol by using the dimensions method.\n", + "- Filter by a certain level of volume by using the having method.\n", + "- Sort results by volume in descending order." ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -416,14 +434,14 @@ " 'timestamp_col': 'date',\n", " 'start_date': \"2021-03-01\",\n", " 'end_date': \"2021-03-20\",\n", - " 'order_by': [\"date ASC\"],\n", + " 'order_by': [\"Volume DESC\"],\n", " 'extra_having_conditions': ['Volume > 1000000']\n", "}" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -455,33 +473,33 @@ " \n", " \n", " 0\n", - " 2021-03-01\n", + " 2021-03-19\n", " AAPL\n", - " 10000000.0\n", + " 16320000.0\n", " \n", " \n", " 1\n", - " 2021-03-01\n", - " TSLA\n", - " 6300000.0\n", + " 2021-03-20\n", + " AAPL\n", + " 16320000.0\n", " \n", " \n", " 2\n", - " 2021-03-02\n", + " 2021-03-17\n", " AAPL\n", - " 8820000.0\n", + " 15180000.0\n", " \n", " \n", " 3\n", - " 2021-03-02\n", - " TSLA\n", - " 6448000.0\n", + " 2021-03-16\n", + " AAPL\n", + " 14280000.0\n", " \n", " \n", " 4\n", - " 2021-03-03\n", + " 2021-03-18\n", " AAPL\n", - " 9840000.0\n", + " 14190000.0\n", " \n", " \n", "\n", @@ -489,14 +507,14 @@ ], "text/plain": [ " date symbol Volume\n", - "0 2021-03-01 AAPL 10000000.0\n", - "1 2021-03-01 TSLA 6300000.0\n", - "2 2021-03-02 AAPL 8820000.0\n", - "3 2021-03-02 TSLA 6448000.0\n", - "4 2021-03-03 AAPL 9840000.0" + "0 2021-03-19 AAPL 16320000.0\n", + "1 2021-03-20 AAPL 16320000.0\n", + "2 2021-03-17 AAPL 15180000.0\n", + "3 2021-03-16 AAPL 14280000.0\n", + "4 2021-03-18 AAPL 14190000.0" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -510,7 +528,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Query 5\n", + "### Aggregated Fields II\n", "\n", "Adding some aggregated data.\n", "- Retrieve the day with the biggest transactional volume for each company." @@ -518,19 +536,20 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "query={\n", " 'columns': ['date', 'symbol', 'max(avg_num_trades * avg_price) AS Max_Volume'],\n", - " 'dimensions': ['symbol']\n", + " 'dimensions': ['symbol'],\n", + " 'order_by': [\"Max_Volume DESC\"],\n", "}" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -582,7 +601,7 @@ "1 2021-03-08 TSLA 10324000.0" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -598,7 +617,11 @@ "source": [ "## Slicer\n", "\n", - "#### First instantiate the class:\n", + "Slice a dataframe upon given dimensions.\n", + "\n", + "### How it works?\n", + "\n", + "#### 1. Instantiate the class:\n", "\n", "Slice the incoming data upon the given dimensions\n", "\n", @@ -615,7 +638,7 @@ " \n", " \n", " \n", - "#### Then run the method .run:\n", + "#### 2. Execute the .run method:\n", "\n", "Slice the given dataframe with the dimensions setted.\n", "\n", @@ -630,9 +653,25 @@ " DataFrame containing the sliced dataframes." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use case\n", + "Imagine we want to generate a unique `DataFrame` for each dimension value with each metric. \n", + "\n", + "In our case, this means 4 dataframes since we have `Apple` and `Tesla` from the `Symbol` dimension and two metrics: `avg_num_trades` and `avg_price`. This will result on the following combinations:\n", + "1. Apple's average amount of trades per day.\n", + "2. Apple's average price per day.\n", + "3. Tesla's average amount of trades per day.\n", + "4. Tesla's average price per day.\n", + "\n", + "*We will be using the same database as before but retrieving all the data from the table to generate our `DataFrame`.*" + ] + }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -641,7 +680,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -726,7 +765,7 @@ "4 5 2021-03-05 AAPL 80000.0 119.0" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -740,18 +779,36 @@ "df.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Instantiate the class:\n", + "\n", + "> - dimensions: `symbol` \n", + "> - metrics: `avg_num_trades` and `avg_price`\n", + "> - ds_col: `date`" + ] + }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "slicer = Slicer(metrics=[\"avg_num_trades\", \"avg_price\"], ds_col=\"date\", dimensions=[\"symbol\"])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Execute the .run method" + ] + }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 17, "metadata": { "scrolled": true }, @@ -760,9 +817,16 @@ "apple_trades, apple_price, tesla_trades, tesla_price = slicer.run(df)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Apple's average amount of trades per day:" + ] + }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -835,7 +899,7 @@ "4 2021-03-05 AAPL 80000.0" ] }, - "execution_count": 28, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -844,9 +908,16 @@ "apple_trades.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Apple's average price per day:" + ] + }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -919,7 +990,7 @@ "4 2021-03-05 AAPL 119.0" ] }, - "execution_count": 29, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -928,9 +999,16 @@ "apple_price.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Tesla's average amount of trades per day:" + ] + }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1003,7 +1081,7 @@ "26 2021-03-05 TSLA 80000.0" ] }, - "execution_count": 30, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1012,9 +1090,16 @@ "tesla_trades.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Tesla's average price per day" + ] + }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1087,7 +1172,7 @@ "26 2021-03-05 TSLA 115.0" ] }, - "execution_count": 31, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } From aee524acc351e6eb1da047b4407bdf5e08220adc Mon Sep 17 00:00:00 2001 From: Eugenio Date: Mon, 2 Aug 2021 12:09:50 -0300 Subject: [PATCH 4/7] nit on a title --- notebook/examples/queries_examples.ipynb | 2 -- 1 file changed, 2 deletions(-) diff --git a/notebook/examples/queries_examples.ipynb b/notebook/examples/queries_examples.ipynb index b55de80..cdcd01b 100644 --- a/notebook/examples/queries_examples.ipynb +++ b/notebook/examples/queries_examples.ipynb @@ -619,8 +619,6 @@ "\n", "Slice a dataframe upon given dimensions.\n", "\n", - "### How it works?\n", - "\n", "#### 1. Instantiate the class:\n", "\n", "Slice the incoming data upon the given dimensions\n", From 5b2289629d0eab00927eb7943141d28c96b2c762 Mon Sep 17 00:00:00 2001 From: Eugenio Date: Mon, 2 Aug 2021 12:27:39 -0300 Subject: [PATCH 5/7] Changelog and version bumped --- .bumpversion.cfg | 2 +- CHANGELOG.md | 6 ++++++ soam/__init__.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e791937..1e78ff8 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.7.2 +current_version = 0.7.3 tag = False [bumpversion:file:soam/__init__.py] diff --git a/CHANGELOG.md b/CHANGELOG.md index 3759d38..4ee952f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.7.3 - 2021-08-02] + +### Added +- Notebook with use cases of the TimeSeriesExtractor and Slicer classes. + ## [0.7.2 - 2021-07-30] ### Removed diff --git a/soam/__init__.py b/soam/__init__.py index ed90d66..2ec4476 100644 --- a/soam/__init__.py +++ b/soam/__init__.py @@ -1,3 +1,3 @@ """Version.""" -__version__ = '0.7.2' +__version__ = '0.7.3' From d18b0e23d90c6433173390d0a6b51f698f595fd7 Mon Sep 17 00:00:00 2001 From: Eugenio Date: Fri, 6 Aug 2021 16:58:26 -0300 Subject: [PATCH 6/7] Store Class Explanation Added --- ...b => extractor_slicer_store_example.ipynb} | 303 ++++++++++++++---- notebook/examples/soam_quickstart.db | Bin 8192 -> 12288 bytes 2 files changed, 245 insertions(+), 58 deletions(-) rename notebook/examples/{queries_examples.ipynb => extractor_slicer_store_example.ipynb} (82%) diff --git a/notebook/examples/queries_examples.ipynb b/notebook/examples/extractor_slicer_store_example.ipynb similarity index 82% rename from notebook/examples/queries_examples.ipynb rename to notebook/examples/extractor_slicer_store_example.ipynb index 6f9bb96..5891be3 100644 --- a/notebook/examples/queries_examples.ipynb +++ b/notebook/examples/extractor_slicer_store_example.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Time Series Extractor and Slicer Introduction\n", + "# Time Series Extractor, Slicer and Store Introduction\n", "\n", "This notebook intends to show you different queries that can be done with the `TimeSeriesExtractor` class that `soam` provides, such as:\n", "- Simple extract * query.\n", @@ -12,7 +12,7 @@ "- Temporal data filters or conditions.\n", "- Aggregated fields.\n", "\n", - "At the same time, we provide a brief introduction to the `Slicer` class that `soam` provides to generate slices of your `DataFrame`.\n", + "At the same time, we provide a brief introduction to the `Slicer` class that `soam` provides to generate slices of your `DataFrame`. Finally, we show how to persist the results by using the `Store` class from the `soam` workflow.\n", "\n", "## Database used\n", "We will be using a database that stores information on Apple's and Tesla's stocks over a period of time." @@ -84,14 +84,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", + "Query generated: \n", " \n", " SELECT *\n", " FROM stock\n", @@ -108,12 +108,13 @@ "query={\n", " 'columns': '*'\n", "}\n", - "print(extractor.build_query(columns= '*')[0])" + "\n", + "print('Query generated: ', extractor.build_query(columns= '*')[0])" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -198,7 +199,7 @@ "4 5 2021-03-05 AAPL 80000.0 119.0" ] }, - "execution_count": 15, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -221,14 +222,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", + "Query generated: \n", " \n", " SELECT date, symbol, avg_price AS Valuation\n", " FROM stock\n", @@ -249,14 +250,13 @@ " 'extra_where_conditions': [\"symbol = 'AAPL'\"]\n", "}\n", "\n", - "print(extractor.build_query(columns = ['date', 'symbol', 'avg_price AS Valuation'],\n", - " extra_where_conditions = [\"symbol = 'AAPL'\"]\n", - " )[0])" + "print('Query generated: ', extractor.build_query(columns = ['date', 'symbol', 'avg_price AS Valuation'],\n", + " extra_where_conditions = [\"symbol = 'AAPL'\"])[0])" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -329,7 +329,7 @@ "4 2021-03-05 AAPL 119.0" ] }, - "execution_count": 17, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -351,14 +351,14 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", + "Query generated: \n", " \n", " SELECT date, symbol, avg_price AS Valuation\n", " FROM stock\n", @@ -385,18 +385,17 @@ " 'order_by': [\"date ASC\"]\n", "}\n", "\n", - "print(extractor.build_query(columns = ['date', 'symbol', 'avg_price AS Valuation'],\n", - " timestamp_col = 'date',\n", - " start_date = \"2021-03-01\",\n", - " end_date = \"2021-03-20\",\n", - " extra_where_conditions = [\"symbol = 'AAPL'\"],\n", - " order_by = [\"date ASC\"]\n", - " )[0])" + "print('Query generated: ', extractor.build_query(columns = ['date', 'symbol', 'avg_price AS Valuation'],\n", + " timestamp_col = 'date',\n", + " start_date = \"2021-03-01\",\n", + " end_date = \"2021-03-20\",\n", + " extra_where_conditions = [\"symbol = 'AAPL'\"],\n", + " order_by = [\"date ASC\"])[0])" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -469,7 +468,7 @@ "4 2021-03-05 AAPL 119.0" ] }, - "execution_count": 24, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -494,14 +493,14 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", + "Query generated: \n", " \n", " SELECT date, symbol, avg_num_trades * avg_price AS Volume\n", " FROM stock\n", @@ -533,19 +532,18 @@ " 'extra_having_conditions': ['Volume > 1000000']\n", "}\n", "\n", - "print(extractor.build_query(columns = ['date', 'symbol', 'avg_num_trades * avg_price AS Volume'],\n", - " dimensions = ['date','symbol'],\n", - " timestamp_col = 'date',\n", - " start_date = \"2021-03-01\",\n", - " end_date = \"2021-03-20\",\n", - " order_by = [\"Volume DESC\"],\n", - " extra_having_conditions = ['Volume > 1000000']\n", - " )[0])" + "print('Query generated: ', extractor.build_query(columns = ['date', 'symbol', 'avg_num_trades * avg_price AS Volume'],\n", + " dimensions = ['date','symbol'],\n", + " timestamp_col = 'date',\n", + " start_date = \"2021-03-01\",\n", + " end_date = \"2021-03-20\",\n", + " order_by = [\"Volume DESC\"],\n", + " extra_having_conditions = ['Volume > 1000000'])[0])" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -618,7 +616,7 @@ "4 2021-03-18 AAPL 14190000.0" ] }, - "execution_count": 11, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -640,14 +638,14 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", + "Query generated: \n", " \n", " SELECT date, symbol, max(avg_num_trades * avg_price) AS Max_Volume\n", " FROM stock\n", @@ -671,15 +669,15 @@ " 'order_by': [\"Max_Volume DESC\"],\n", "}\n", "\n", - "print(extractor.build_query(columns = ['date', 'symbol', 'max(avg_num_trades * avg_price) AS Max_Volume'],\n", - " dimensions = ['symbol'],\n", - " order_by = [\"Max_Volume DESC\"]\n", - " )[0])" + "print('Query generated: ', extractor.build_query(columns = ['date', 'symbol', 'max(avg_num_trades * avg_price) AS Max_Volume'],\n", + " dimensions = ['symbol'],\n", + " order_by = [\"Max_Volume DESC\"]\n", + " )[0])" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -731,7 +729,7 @@ "1 2021-03-08 TSLA 10324000.0" ] }, - "execution_count": 29, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -799,7 +797,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -808,7 +806,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -893,7 +891,7 @@ "4 5 2021-03-05 AAPL 80000.0 119.0" ] }, - "execution_count": 31, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -902,6 +900,7 @@ "query={\n", " 'columns': '*'\n", "}\n", + "\n", "df = extractor.run(build_query_kwargs = query)\n", "\n", "df.head()" @@ -920,7 +919,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -936,7 +935,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 25, "metadata": { "scrolled": true }, @@ -954,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1027,7 +1026,7 @@ "4 2021-03-05 AAPL 80000.0" ] }, - "execution_count": 34, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1045,7 +1044,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1118,7 +1117,7 @@ "4 2021-03-05 AAPL 119.0" ] }, - "execution_count": 35, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1136,7 +1135,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1209,7 +1208,7 @@ "26 2021-03-05 TSLA 80000.0" ] }, - "execution_count": 36, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1227,7 +1226,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1300,7 +1299,7 @@ "26 2021-03-05 TSLA 115.0" ] }, - "execution_count": 37, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1308,6 +1307,194 @@ "source": [ "tesla_price.head()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Store\n", + "A class to store results in a table.\n", + "\n", + "#### 1. Instantiate the class:\n", + "\n", + "Store given data into a DataBase\n", + "\n", + " Parameters\n", + " ----------\n", + " db_cli:\n", + " BaseClient client.\n", + " table:\n", + " str of table to store in.\n", + " extra_insert_args:\n", + " dict extra arguments to insert data.\n", + " \n", + " \n", + "#### 2. Execute the .run method:\n", + "\n", + "Store given DataFrame.\n", + "\n", + " Parameters\n", + " ----------\n", + " df\n", + " A pandas DataFrame to store.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use case\n", + "Imagine we want to persist the `DataFrame` created of Apple's average price per day on a specific table named `apple_stock_price` in the same `DataBase` we are working on." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "from soam.workflow.store import Store" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Instantiate the class:\n", + "\n", + "> - db_cli: `sqlite_client` \n", + "> - table: `apple_stock_price`" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "saver = Store(db_cli=sqlite_client, table='apple_stock_price')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Execute the .run method" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "saver.run(apple_price)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can extract it with the `TimeSeriesExtractor` object to verify it was persisted successfully." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datesymbolavg_price
02021-03-01AAPL125.0
12021-03-02AAPL126.0
22021-03-03AAPL123.0
32021-03-04AAPL121.0
42021-03-05AAPL119.0
\n", + "
" + ], + "text/plain": [ + " date symbol avg_price\n", + "0 2021-03-01 AAPL 125.0\n", + "1 2021-03-02 AAPL 126.0\n", + "2 2021-03-03 AAPL 123.0\n", + "3 2021-03-04 AAPL 121.0\n", + "4 2021-03-05 AAPL 119.0" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extractor = TimeSeriesExtractor(db=sqlite_client, table_name='apple_stock_price')\n", + "query={\n", + " 'columns': '*'\n", + "}\n", + "\n", + "persisted_df = extractor.run(build_query_kwargs = query)\n", + "\n", + "persisted_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila!" + ] } ], "metadata": { diff --git a/notebook/examples/soam_quickstart.db b/notebook/examples/soam_quickstart.db index f65e4b0203a83501899709d6d8f7b8249a8eb939..a80567c2b114683903475ef79450180d341ff860 100644 GIT binary patch delta 718 zcmZY4u};E37{Kwipa{wxcn1zHbwVRZDJVJ^Ee0nNjdox#!GgpPA%HY0K`0mv1{0%$ z@gZ~}J^?SG@fmb;WRYIQ^g@#x`pf0J@88^P9zM*vtqj9h250zjkN)rgmo`3~1L8oR zfi=+YmhXVp;A}I&Alq70m2^Isv(X-BMY$czyQK9z&KIIn4gJ3&;BMaP>7Mgnt{K1V__HH#D?b0)%>jj0Plzj A>i_@% From 4736bc3799c9c282673ee9fedaaee6c8cf982026 Mon Sep 17 00:00:00 2001 From: Eugenio Date: Fri, 6 Aug 2021 17:04:23 -0300 Subject: [PATCH 7/7] CHANGELOG updated --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7c66d2..d3fb0f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.8.1 - 2021-08-06] ### Added -- Notebook with use cases of the TimeSeriesExtractor and Slicer classes. +- Notebook with use cases of the TimeSeriesExtractor, Slicer and Store classes. ## [0.8.0 - 2021-08-02]