From 5c056456cc1b82146f231a71e6bdeec85629ec23 Mon Sep 17 00:00:00 2001
From: jessiezhang24 <zhangj24@student.ubc.ca>
Date: Thu, 23 Jan 2025 15:18:14 -0800
Subject: [PATCH] add data_summary to jupyter

---
 docs/example.ipynb        | 123 ++++++++++++++++++++++++++++++++++----
 src/pyeda/data_summary.py |   5 ++
 2 files changed, 117 insertions(+), 11 deletions(-)
diff --git a/docs/example.ipynb b/docs/example.ipynb
index 6d14af2..1d86ba8 100644
--- a/docs/example.ipynb
+++ b/docs/example.ipynb
@@ -13,11 +13,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import csv\n",
+    "import pandas as pd\n",
     "from pyeda.check_csv import check_csv\n",
     "from pyeda.pymissing_values_summary import missing_values_summary\n",
     "from pyeda.data_summary import get_summary_statistics"
@@ -29,12 +30,12 @@
    "source": [
     "## Create a csv file\n",
     "\n",
-    "We'll first create a csv file to work with."
+    "We'll first create a csv file to work with. You can find this `sample_data.csv` file [here](https://github.com/UBC-MDS/pyeda/blob/main/docs/sample_data.csv)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -67,7 +68,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -86,10 +87,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "sample_df = pd.read_csv(file_name)"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -97,15 +100,113 @@
    "source": [
     "## Get data summary\n",
     "\n",
-    "Now it's time to use the `get_summary_statistics` method to get the data summary information."
+    "Now it's time to use the `get_summary_statistics` function to quickly generate summary statistics for your dataset. You can either specify particular columns to analyze or, if no column names are provided, summarize all columns.\n",
+    "- For numeric columns, the function calculates the mean, minimum, maximum, median, mode, and range.\n",
+    "- For non-numeric columns, it provides frequency-based metrics: the number of unique values, the most frequent value, and its corresponding count."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Age</th>\n",
+       "      <th>City</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>25.666667</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>22.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>30.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>median</th>\n",
+       "      <td>25.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mode</th>\n",
+       "      <td>22.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>range</th>\n",
+       "      <td>8.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>num_unique_values</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>most_frequent_value</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>New York</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>frequency_of_most_frequent_value</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        Age      City\n",
+       "mean                              25.666667       NaN\n",
+       "min                               22.000000       NaN\n",
+       "max                               30.000000       NaN\n",
+       "median                            25.000000       NaN\n",
+       "mode                              22.000000       NaN\n",
+       "range                              8.000000       NaN\n",
+       "num_unique_values                       NaN         3\n",
+       "most_frequent_value                     NaN  New York\n",
+       "frequency_of_most_frequent_value        NaN         1"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_summary_statistics(sample_df)\n",
+    "get_summary_statistics(sample_df, col=[\"Age\", \"City\"])"
+   ]
   }
  ],
  "metadata": {
@@ -124,7 +225,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.12.8"
   }
  },
  "nbformat": 4,
diff --git a/src/pyeda/data_summary.py b/src/pyeda/data_summary.py
index d6792f0..d8fefc1 100644
--- a/src/pyeda/data_summary.py
+++ b/src/pyeda/data_summary.py
@@ -19,6 +19,11 @@ def get_summary_statistics(df, col = None):
     ----------
     pd.DataFrame
         A DataFrame with summary statistics for the specified columns, including mean, min, max, median, mode, and range for numeric columns, as well as number of unique values, the most frequent value, and its corresponding frequency) for non-numeric columns.
+    
+    Examples
+    --------
+    >>> from pyeda.data_summary import get_summary_statistics
+    >>> get_summary_statistics(df)
     """
     if col is None:
         col = df.columns.tolist()

	Age	City
mean	25.666667	NaN
min	22.000000	NaN
max	30.000000	NaN
median	25.000000	NaN
mode	22.000000	NaN
range	8.000000	NaN
num_unique_values	NaN	3
most_frequent_value	NaN	New York
frequency_of_most_frequent_value	NaN	1