diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 8e59a3f..13ee3e1 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.8.0 +current_version = 0.8.1 tag = False [bumpversion:file:soam/__init__.py] diff --git a/CHANGELOG.md b/CHANGELOG.md index 2daa541..d3fb0f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.8.1 - 2021-08-06] + +### Added +- Notebook with use cases of the TimeSeriesExtractor, Slicer and Store classes. + ## [0.8.0 - 2021-08-02] ### Added diff --git a/notebook/examples/extractor_slicer_store_example.ipynb b/notebook/examples/extractor_slicer_store_example.ipynb new file mode 100644 index 0000000..5891be3 --- /dev/null +++ b/notebook/examples/extractor_slicer_store_example.ipynb @@ -0,0 +1,1521 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Time Series Extractor, Slicer and Store Introduction\n", + "\n", + "This notebook intends to show you different queries that can be done with the `TimeSeriesExtractor` class that `soam` provides, such as:\n", + "- Simple extract * query.\n", + "- Categorical data filters or conditions.\n", + "- Temporal data filters or conditions.\n", + "- Aggregated fields.\n", + "\n", + "At the same time, we provide a brief introduction to the `Slicer` class that `soam` provides to generate slices of your `DataFrame`. Finally, we show how to persist the results by using the `Store` class from the `soam` workflow.\n", + "\n", + "## Database used\n", + "We will be using a database that stores information on Apple's and Tesla's stocks over a period of time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "The setup consists of:\n", + "- Import the needed dependencies.\n", + "- Establish the connection with our database." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from soam.workflow.time_series_extractor import TimeSeriesExtractor\n", + "from muttlib.dbconn import get_client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sqlite_cfg = {\n", + " \"db_type\": \"sqlite\",\n", + " \"database\": \"soam_quickstart.db\"\n", + "}\n", + "\n", + "sqlite_client = get_client(sqlite_cfg)[1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Time Series Extractor\n", + "\n", + "- Instantiate our extractor object.\n", + "\n", + "> To build a query you should create a `dictionary` of `{str: obj}` that will be later used for the extraction by executing the `extractor.run` method." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "extractor = TimeSeriesExtractor(db=sqlite_client, table_name='stock')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Simple extract * query\n", + "\n", + "Simple query, just retrieving all the data from the database." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query generated: \n", + " \n", + " SELECT *\n", + " FROM stock\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n" + ] + } + ], + "source": [ + "query={\n", + " 'columns': '*'\n", + "}\n", + "\n", + "print('Query generated: ', extractor.build_query(columns= '*')[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " id date symbol avg_num_trades avg_price\n", + "0 1 2021-03-01 AAPL 80000.0 125.0\n", + "1 2 2021-03-02 AAPL 70000.0 126.0\n", + "2 3 2021-03-03 AAPL 80000.0 123.0\n", + "3 4 2021-03-04 AAPL 70000.0 121.0\n", + "4 5 2021-03-05 AAPL 80000.0 119.0" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Categorical data filters or conditions\n", + "Adding some filters and conditionals:\n", + "- Querying only a subset of the columns.\n", + "- Renaming some columns with aliases.\n", + "- Filtering data by just retrieving Apple's stock valuations." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query generated: \n", + " \n", + " SELECT date, symbol, avg_price AS Valuation\n", + " FROM stock\n", + " \n", + " \n", + " WHERE symbol = 'AAPL'\n", + " \n", + " \n", + " \n", + " \n", + " \n" + ] + } + ], + "source": [ + "query={\n", + " 'columns': ['date', 'symbol', 'avg_price AS Valuation'],\n", + " 'extra_where_conditions': [\"symbol = 'AAPL'\"]\n", + "}\n", + "\n", + "print('Query generated: ', extractor.build_query(columns = ['date', 'symbol', 'avg_price AS Valuation'],\n", + " extra_where_conditions = [\"symbol = 'AAPL'\"])[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " date symbol Valuation\n", + "0 2021-03-01 AAPL 125.0\n", + "1 2021-03-02 AAPL 126.0\n", + "2 2021-03-03 AAPL 123.0\n", + "3 2021-03-04 AAPL 121.0\n", + "4 2021-03-05 AAPL 119.0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Temporal data filters or conditions\n", + "Adding some extra filters and sorting results:\n", + "- Filtering data by certain dates.\n", + "- Sorting results based on their dates in ascending order." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query generated: \n", + " \n", + " SELECT date, symbol, avg_price AS Valuation\n", + " FROM stock\n", + " \n", + " \n", + " WHERE date >= '2021-03-01' AND date <= '2021-03-20' AND symbol = 'AAPL'\n", + " \n", + " \n", + " \n", + " \n", + " ORDER BY date ASC\n", + " \n", + " \n" + ] + } + ], + "source": [ + "query={\n", + " 'columns': ['date', 'symbol', 'avg_price AS Valuation'],\n", + " 'timestamp_col': 'date',\n", + " 'start_date': \"2021-03-01\",\n", + " 'end_date': \"2021-03-20\",\n", + " 'extra_where_conditions': [\"symbol = 'AAPL'\"],\n", + " 'order_by': [\"date ASC\"]\n", + "}\n", + "\n", + "print('Query generated: ', extractor.build_query(columns = ['date', 'symbol', 'avg_price AS Valuation'],\n", + " timestamp_col = 'date',\n", + " start_date = \"2021-03-01\",\n", + " end_date = \"2021-03-20\",\n", + " extra_where_conditions = [\"symbol = 'AAPL'\"],\n", + " order_by = [\"date ASC\"])[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " date symbol Valuation\n", + "0 2021-03-01 AAPL 125.0\n", + "1 2021-03-02 AAPL 126.0\n", + "2 2021-03-03 AAPL 123.0\n", + "3 2021-03-04 AAPL 121.0\n", + "4 2021-03-05 AAPL 119.0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aggregated Fields I\n", + "\n", + "Adding some aggregated data.\n", + "- Multiply the average valuation with the amount of trades to obtain the transactional volume per day.\n", + "- Group by date and symbol by using the dimensions method.\n", + "- Filter by a certain level of volume by using the having method.\n", + "- Sort results by volume in descending order." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query generated: \n", + " \n", + " SELECT date, symbol, avg_num_trades * avg_price AS Volume\n", + " FROM stock\n", + " \n", + " \n", + " WHERE date >= '2021-03-01' AND date <= '2021-03-20'\n", + " \n", + " \n", + " GROUP BY date, symbol\n", + " \n", + " \n", + " HAVING Volume > 1000000\n", + " \n", + " \n", + " ORDER BY Volume DESC\n", + " \n", + " \n" + ] + } + ], + "source": [ + "query={\n", + " 'columns': ['date', 'symbol', 'avg_num_trades * avg_price AS Volume'],\n", + " 'dimensions': ['date','symbol'],\n", + " 'timestamp_col': 'date',\n", + " 'start_date': \"2021-03-01\",\n", + " 'end_date': \"2021-03-20\",\n", + " 'order_by': [\"Volume DESC\"],\n", + " 'extra_having_conditions': ['Volume > 1000000']\n", + "}\n", + "\n", + "print('Query generated: ', extractor.build_query(columns = ['date', 'symbol', 'avg_num_trades * avg_price AS Volume'],\n", + " dimensions = ['date','symbol'],\n", + " timestamp_col = 'date',\n", + " start_date = \"2021-03-01\",\n", + " end_date = \"2021-03-20\",\n", + " order_by = [\"Volume DESC\"],\n", + " extra_having_conditions = ['Volume > 1000000'])[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " date symbol Volume\n", + "0 2021-03-19 AAPL 16320000.0\n", + "1 2021-03-20 AAPL 16320000.0\n", + "2 2021-03-17 AAPL 15180000.0\n", + "3 2021-03-16 AAPL 14280000.0\n", + "4 2021-03-18 AAPL 14190000.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Aggregated Fields II\n", + "\n", + "Adding some aggregated data.\n", + "- Retrieve the day with the biggest transactional volume for each company." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query generated: \n", + " \n", + " SELECT date, symbol, max(avg_num_trades * avg_price) AS Max_Volume\n", + " FROM stock\n", + " \n", + " \n", + " \n", + " GROUP BY symbol\n", + " \n", + " \n", + " \n", + " ORDER BY Max_Volume DESC\n", + " \n", + " \n" + ] + } + ], + "source": [ + "query={\n", + " 'columns': ['date', 'symbol', 'max(avg_num_trades * avg_price) AS Max_Volume'],\n", + " 'dimensions': ['symbol'],\n", + " 'order_by': [\"Max_Volume DESC\"],\n", + "}\n", + "\n", + "print('Query generated: ', extractor.build_query(columns = ['date', 'symbol', 'max(avg_num_trades * avg_price) AS Max_Volume'],\n", + " dimensions = ['symbol'],\n", + " order_by = [\"Max_Volume DESC\"]\n", + " )[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " date symbol Max_Volume\n", + "0 2021-03-22 AAPL 21300000.0\n", + "1 2021-03-08 TSLA 10324000.0" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = extractor.run(build_query_kwargs = query)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Slicer\n", + "\n", + "Slice a dataframe upon given dimensions.\n", + "\n", + "#### 1. Instantiate the class:\n", + "\n", + "Slice the incoming data upon the given dimensions\n", + "\n", + " Parameters\n", + " ----------\n", + " dimensions:\n", + " str or list of str labels of categorical columns to slices\n", + " metrics:\n", + " str or list of str labels of metrics columns to slices\n", + " ds_col:\n", + " str of datetime column\n", + " keeps:\n", + " str or list of str labels of columns to keep.\n", + " \n", + " \n", + " \n", + "#### 2. Execute the .run method:\n", + "\n", + "Slice the given dataframe with the dimensions setted.\n", + "\n", + " Parameters\n", + " ----------\n", + " raw_df\n", + " A pandas DataFrame containing the raw data to slice\n", + "\n", + " Returns\n", + " -------\n", + " list[pd.DataFrame]\n", + " DataFrame containing the sliced dataframes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use case\n", + "Imagine we want to generate a unique `DataFrame` for each dimension value with each metric. \n", + "\n", + "In our case, this means 4 dataframes since we have `Apple` and `Tesla` from the `Symbol` dimension and two metrics: `avg_num_trades` and `avg_price`. This will result on the following combinations:\n", + "1. Apple's average amount of trades per day.\n", + "2. Apple's average price per day.\n", + "3. Tesla's average amount of trades per day.\n", + "4. Tesla's average price per day.\n", + "\n", + "*We will be using the same database as before but retrieving all the data from the table to generate our `DataFrame`.*" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "from soam.workflow.slicer import Slicer" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " id date symbol avg_num_trades avg_price\n", + "0 1 2021-03-01 AAPL 80000.0 125.0\n", + "1 2 2021-03-02 AAPL 70000.0 126.0\n", + "2 3 2021-03-03 AAPL 80000.0 123.0\n", + "3 4 2021-03-04 AAPL 70000.0 121.0\n", + "4 5 2021-03-05 AAPL 80000.0 119.0" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query={\n", + " 'columns': '*'\n", + "}\n", + "\n", + "df = extractor.run(build_query_kwargs = query)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Instantiate the class:\n", + "\n", + "> - dimensions: `symbol` \n", + "> - metrics: `avg_num_trades` and `avg_price`\n", + "> - ds_col: `date`" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "slicer = Slicer(metrics=[\"avg_num_trades\", \"avg_price\"], ds_col=\"date\", dimensions=[\"symbol\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Execute the .run method" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "apple_trades, apple_price, tesla_trades, tesla_price = slicer.run(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Apple's average amount of trades per day:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " date symbol avg_num_trades\n", + "0 2021-03-01 AAPL 80000.0\n", + "1 2021-03-02 AAPL 70000.0\n", + "2 2021-03-03 AAPL 80000.0\n", + "3 2021-03-04 AAPL 70000.0\n", + "4 2021-03-05 AAPL 80000.0" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "apple_trades.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Apple's average price per day:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " date symbol avg_price\n", + "0 2021-03-01 AAPL 125.0\n", + "1 2021-03-02 AAPL 126.0\n", + "2 2021-03-03 AAPL 123.0\n", + "3 2021-03-04 AAPL 121.0\n", + "4 2021-03-05 AAPL 119.0" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "apple_price.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Tesla's average amount of trades per day:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " date symbol avg_num_trades\n", + "22 2021-03-01 TSLA 60000.0\n", + "23 2021-03-02 TSLA 62000.0\n", + "24 2021-03-03 TSLA 64000.0\n", + "25 2021-03-04 TSLA 69000.0\n", + "26 2021-03-05 TSLA 80000.0" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tesla_trades.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Tesla's average price per day" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " date symbol avg_price\n", + "22 2021-03-01 TSLA 105.0\n", + "23 2021-03-02 TSLA 104.0\n", + "24 2021-03-03 TSLA 101.0\n", + "25 2021-03-04 TSLA 108.0\n", + "26 2021-03-05 TSLA 115.0" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tesla_price.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Store\n", + "A class to store results in a table.\n", + "\n", + "#### 1. Instantiate the class:\n", + "\n", + "Store given data into a DataBase\n", + "\n", + " Parameters\n", + " ----------\n", + " db_cli:\n", + " BaseClient client.\n", + " table:\n", + " str of table to store in.\n", + " extra_insert_args:\n", + " dict extra arguments to insert data.\n", + " \n", + " \n", + "#### 2. Execute the .run method:\n", + "\n", + "Store given DataFrame.\n", + "\n", + " Parameters\n", + " ----------\n", + " df\n", + " A pandas DataFrame to store.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use case\n", + "Imagine we want to persist the `DataFrame` created of Apple's average price per day on a specific table named `apple_stock_price` in the same `DataBase` we are working on." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "from soam.workflow.store import Store" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Instantiate the class:\n", + "\n", + "> - db_cli: `sqlite_client` \n", + "> - table: `apple_stock_price`" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "saver = Store(db_cli=sqlite_client, table='apple_stock_price')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Execute the .run method" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "saver.run(apple_price)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can extract it with the `TimeSeriesExtractor` object to verify it was persisted successfully." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " date symbol avg_price\n", + "0 2021-03-01 AAPL 125.0\n", + "1 2021-03-02 AAPL 126.0\n", + "2 2021-03-03 AAPL 123.0\n", + "3 2021-03-04 AAPL 121.0\n", + "4 2021-03-05 AAPL 119.0" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extractor = TimeSeriesExtractor(db=sqlite_client, table_name='apple_stock_price')\n", + "query={\n", + " 'columns': '*'\n", + "}\n", + "\n", + "persisted_df = extractor.run(build_query_kwargs = query)\n", + "\n", + "persisted_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voila!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebook/examples/soam_quickstart.db b/notebook/examples/soam_quickstart.db index f65e4b0..a80567c 100644 Binary files a/notebook/examples/soam_quickstart.db and b/notebook/examples/soam_quickstart.db differ diff --git a/soam/__init__.py b/soam/__init__.py index b9f4c5a..3ef43f9 100644 --- a/soam/__init__.py +++ b/soam/__init__.py @@ -1,3 +1,3 @@ """Version.""" -__version__ = '0.8.0' +__version__ = '0.8.1'