Merge pull request #24 from UBC-MDS/edit_get_summary

add data_summary to jupyter
UBC-MDS · Jan 23, 2025 · b2f0279 · b2f0279
2 parents a75e3fe + 5c05645
commit b2f0279
Show file tree

Hide file tree

Showing 2 changed files with 117 additions and 11 deletions.
diff --git a/docs/example.ipynb b/docs/example.ipynb
@@ -13,11 +13,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import csv\n",
+    "import pandas as pd\n",
     "from pyeda.check_csv import check_csv\n",
     "from pyeda.pymissing_values_summary import missing_values_summary\n",
     "from pyeda.data_summary import get_summary_statistics"
@@ -29,12 +30,12 @@
    "source": [
     "## Create a csv file\n",
     "\n",
-    "We'll first create a csv file to work with."
+    "We'll first create a csv file to work with. You can find this `sample_data.csv` file [here](https://github.com/UBC-MDS/pyeda/blob/main/docs/sample_data.csv)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -67,7 +68,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -86,26 +87,126 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "sample_df = pd.read_csv(file_name)"
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Get data summary\n",
     "\n",
-    "Now it's time to use the `get_summary_statistics` method to get the data summary information."
+    "Now it's time to use the `get_summary_statistics` function to quickly generate summary statistics for your dataset. You can either specify particular columns to analyze or summarize all columns if no column names are provided.\n",
+    "\n",
+    "- For numeric columns, the function calculates the mean, minimum, maximum, median, mode, and range.\n",
+    "- For non-numeric columns, it provides frequency-based metrics: the number of unique values, the most frequent value, and its corresponding count."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Age</th>\n",
+       "      <th>City</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>25.666667</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>22.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>30.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>median</th>\n",
+       "      <td>25.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mode</th>\n",
+       "      <td>22.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>range</th>\n",
+       "      <td>8.000000</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>num_unique_values</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>most_frequent_value</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>New York</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>frequency_of_most_frequent_value</th>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                        Age      City\n",
+       "mean                              25.666667       NaN\n",
+       "min                               22.000000       NaN\n",
+       "max                               30.000000       NaN\n",
+       "median                            25.000000       NaN\n",
+       "mode                              22.000000       NaN\n",
+       "range                              8.000000       NaN\n",
+       "num_unique_values                       NaN         3\n",
+       "most_frequent_value                     NaN  New York\n",
+       "frequency_of_most_frequent_value        NaN         1"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Only the last expression of a cell is rendered automatically, so the\n",
+    "# all-columns summary is shown explicitly with display().\n",
+    "display(get_summary_statistics(sample_df))\n",
+    "\n",
+    "# Restrict the summary to selected columns with `col`.\n",
+    "get_summary_statistics(sample_df, col=[\"Age\", \"City\"])"
+   ]
   }
  ],
  "metadata": {
@@ -124,7 +225,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.12.8"
   }
  },
  "nbformat": 4,

diff --git a/src/pyeda/data_summary.py b/src/pyeda/data_summary.py
@@ -19,6 +19,11 @@ def get_summary_statistics(df, col = None):
     ----------
     pd.DataFrame
         A DataFrame with summary statistics for the specified columns: mean, min, max, median, mode, and range for numeric columns, and the number of unique values, the most frequent value, and its corresponding frequency for non-numeric columns.
+    
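+    Examples
+    --------
+    >>> import pandas as pd
+    >>> from pyeda.data_summary import get_summary_statistics
+    >>> df = pd.DataFrame({"Age": [22, 25, 30], "City": ["New York", "Chicago", "Boston"]})
+    >>> summary_all = get_summary_statistics(df)
+    >>> summary_age = get_summary_statistics(df, col=["Age"])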
     """
     if col is None:
         col = df.columns.tolist()