Skip to content

Commit

Permalink
Merge pull request #24 from UBC-MDS/edit_get_summary
Browse files Browse the repository at this point in the history
add data_summary to jupyter
  • Loading branch information
jessiezhang24 authored Jan 23, 2025
2 parents a75e3fe + 5c05645 commit b2f0279
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 11 deletions.
123 changes: 112 additions & 11 deletions docs/example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import pandas as pd\n",
"from pyeda.check_csv import check_csv\n",
"from pyeda.pymissing_values_summary import missing_values_summary\n",
"from pyeda.data_summary import get_summary_statistics"
Expand All @@ -29,12 +30,12 @@
"source": [
"## Create a csv file\n",
"\n",
"We'll first create a csv file to work with."
"We'll first create a csv file to work with. You can find this `sample_data.csv` file [here](https://github.com/UBC-MDS/pyeda/blob/main/docs/sample_data.csv)."
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -67,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -86,26 +87,126 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": []
"source": [
"sample_df = pd.read_csv(file_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get data summary\n",
"\n",
"Now it's time to use the `get_summary_statistics` method to get the data summary information."
"Now it's time to use the `get_summary_statistics` function to quickly generate summary statistics for your dataset. You can either specify particular columns to analyze or, if no column names are provided, summarize all columns.\n",
"For numeric columns, the function calculates the mean, minimum, maximum, median, mode, and range.\n",
"For non-numeric columns, it provides frequency-based metrics: the number of unique values, the most frequent value, and its corresponding count."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>City</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>25.666667</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>22.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>30.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>median</th>\n",
" <td>25.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mode</th>\n",
" <td>22.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>range</th>\n",
" <td>8.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>num_unique_values</th>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>most_frequent_value</th>\n",
" <td>NaN</td>\n",
" <td>New York</td>\n",
" </tr>\n",
" <tr>\n",
" <th>frequency_of_most_frequent_value</th>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age City\n",
"mean 25.666667 NaN\n",
"min 22.000000 NaN\n",
"max 30.000000 NaN\n",
"median 25.000000 NaN\n",
"mode 22.000000 NaN\n",
"range 8.000000 NaN\n",
"num_unique_values NaN 3\n",
"most_frequent_value NaN New York\n",
"frequency_of_most_frequent_value NaN 1"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_summary_statistics(sample_df)\n",
"get_summary_statistics(sample_df, col=[\"Age\", \"City\"])"
]
}
],
"metadata": {
Expand All @@ -124,7 +225,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
"version": "3.12.8"
}
},
"nbformat": 4,
Expand Down
5 changes: 5 additions & 0 deletions src/pyeda/data_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ def get_summary_statistics(df, col = None):
----------
pd.DataFrame
A DataFrame with summary statistics for the specified columns: mean, min, max, median, mode, and range for numeric columns, and the number of unique values, the most frequent value, and its corresponding frequency for non-numeric columns.
Examples
--------
>>> from pyeda.data_summary import get_summary_statistics
>>> get_summary_statistics(df)
"""
if col is None:
col = df.columns.tolist()
Expand Down

0 comments on commit b2f0279

Please sign in to comment.