From 5c056456cc1b82146f231a71e6bdeec85629ec23 Mon Sep 17 00:00:00 2001 From: jessiezhang24 Date: Thu, 23 Jan 2025 15:18:14 -0800 Subject: [PATCH] add data_summary to jupyter --- docs/example.ipynb | 123 ++++++++++++++++++++++++++++++++++---- src/pyeda/data_summary.py | 5 ++ 2 files changed, 117 insertions(+), 11 deletions(-) diff --git a/docs/example.ipynb b/docs/example.ipynb index 6d14af2..1d86ba8 100644 --- a/docs/example.ipynb +++ b/docs/example.ipynb @@ -13,11 +13,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import csv\n", + "import pandas as pd\n", "from pyeda.check_csv import check_csv\n", "from pyeda.pymissing_values_summary import missing_values_summary\n", "from pyeda.data_summary import get_summary_statistics" @@ -29,12 +30,12 @@ "source": [ "## Create a csv file\n", "\n", - "We'll first create a csv file to work with." + "We'll first create a csv file to work with. You can find this `sample_data.csv` file [here](https://github.com/UBC-MDS/pyeda/blob/main/docs/sample_data.csv)." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -67,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -86,10 +87,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "sample_df = pd.read_csv(file_name)" + ] }, { "cell_type": "markdown", @@ -97,15 +100,113 @@ "source": [ "## Get data summary\n", "\n", - "Now it's time to use the `get_summary_statistics` method to get the data summary information." + "Now it's time to use the `get_summary_statistics` method to quickly generate the summary statistics of your dataset. You can either specify particular columns to analyze or summarize all columns if no column names are provided. 
\n", + "- For numeric columns, the function calculates the mean, minimum, maximum, median, mode, and range.\n", + "- For non-numeric columns, it provides frequency-based metrics: the number of unique values, the most frequent value, and that value's count." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeCity
mean25.666667NaN
min22.000000NaN
max30.000000NaN
median25.000000NaN
mode22.000000NaN
range8.000000NaN
num_unique_valuesNaN3
most_frequent_valueNaNNew York
frequency_of_most_frequent_valueNaN1
\n", + "
" + ], + "text/plain": [ + " Age City\n", + "mean 25.666667 NaN\n", + "min 22.000000 NaN\n", + "max 30.000000 NaN\n", + "median 25.000000 NaN\n", + "mode 22.000000 NaN\n", + "range 8.000000 NaN\n", + "num_unique_values NaN 3\n", + "most_frequent_value NaN New York\n", + "frequency_of_most_frequent_value NaN 1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_summary_statistics(sample_df)\n", + "get_summary_statistics(sample_df, col=[\"Age\", \"City\"])" + ] } ], "metadata": { @@ -124,7 +225,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.16" + "version": "3.12.8" } }, "nbformat": 4, diff --git a/src/pyeda/data_summary.py b/src/pyeda/data_summary.py index d6792f0..d8fefc1 100644 --- a/src/pyeda/data_summary.py +++ b/src/pyeda/data_summary.py @@ -19,6 +19,11 @@ def get_summary_statistics(df, col = None): ---------- pd.DataFrame A DataFrame with summary statistics for the specified columns, including mean, min, max, median, mode, and range for numeric columns, as well as number of unique values, the most frequent value, and its corresponding frequency) for non-numeric columns. + + Examples + -------- + >>> from pyeda.data_summary import get_summary_statistics + >>> get_summary_statistics(df) """ if col is None: col = df.columns.tolist()