#!/usr/local/bin/python3.9
# distbox.py
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Path of the CSV file to analyse (placeholder; point it at a real file).
infile = "path/to/infile.csv"
# Load the file once; the plotting code below reads its columns from `df`.
df = pd.read_csv(infile)
# Spread of continuous variables.
# dist_box() draws the plots; the list of continuous (numeric) columns is built after it.
def dist_box(data):
    """Plot a box plot stacked above a histogram for one continuous variable.

    Used for univariate analysis: the two panels share the x-axis so spread,
    central tendency, dispersion and outliers can be read together.  The mean
    (dashed red line) and the median (solid green line) are drawn on the
    histogram and named in its legend.

    Args:
        data: pandas Series of numeric values.  Its ``name`` is upper-cased
            and used in the figure title.
    """
    # Apply the theme before the figure is created: the style is picked up
    # when the axes are built, so setting it afterwards would only affect
    # figures created later.
    sns.set_theme(style="white")
    sns.set_palette(sns.color_palette("Set1", 8))

    fig, (ax_box, ax_dis) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (.25, .75)},
        figsize=(8, 5),
    )

    # str() keeps non-string column labels (e.g. integers) from raising.
    name = str(data.name).upper()
    fig.suptitle("SPREAD OF DATA FOR " + name, fontsize=18, fontweight='bold')

    # Top panel: horizontal box plot with the mean marked.
    sns.boxplot(x=data, showmeans=True, orient='h', ax=ax_box)
    ax_box.set(xlabel='')
    # Remove the top/right/left spines of this figure's axes explicitly
    # instead of relying on whatever the current figure happens to be.
    sns.despine(fig=fig, top=True, right=True, left=True)

    # Bottom panel: histogram of counts.  histplot replaces the deprecated
    # distplot(kde=False).
    sns.histplot(data, ax=ax_dis)

    mean = data.mean()
    median = data.median()
    # Label the lines themselves so the legend is bound to them and not to
    # whichever artists happen to come first on the axes (the histogram bars).
    ax_dis.axvline(mean, color='r', linestyle='--', linewidth=2, label='Mean')
    ax_dis.axvline(median, color='g', linestyle='-', linewidth=2, label='Median')
    ax_dis.legend()
# Numeric columns are treated as the continuous variables to plot.
list_col = df.select_dtypes(include='number').columns.to_list()
# One figure per numeric column.
for col in list_col:
    dist_box(df[col])
# Spread of continuous variables.
# dist_box() draws the plots; the list of continuous (numeric) columns is built after it.
def dist_box(data):
    """Plot a box plot stacked above a histogram for one continuous variable.

    Used for univariate analysis: the two panels share the x-axis so spread,
    central tendency, dispersion and outliers can be read together.  The mean
    (dashed red line) and the median (solid green line) are drawn on the
    histogram and named in its legend.

    Args:
        data: pandas Series of numeric values.  Its ``name`` is upper-cased
            and used in the figure title.
    """
    # NOTE(review): a function with this same name is already defined earlier
    # in this file; this later definition is the one in effect from here on.

    # Apply the theme before the figure is created: the style is picked up
    # when the axes are built, so setting it afterwards would only affect
    # figures created later.
    sns.set_theme(style="white")
    sns.set_palette(sns.color_palette("Set1", 8))

    fig, (ax_box, ax_dis) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (.25, .75)},
        figsize=(8, 5),
    )

    # str() keeps non-string column labels (e.g. integers) from raising.
    name = str(data.name).upper()
    fig.suptitle("SPREAD OF DATA FOR " + name, fontsize=18, fontweight='bold')

    # Top panel: horizontal box plot with the mean marked.
    sns.boxplot(x=data, showmeans=True, orient='h', ax=ax_box)
    ax_box.set(xlabel='')
    # Remove the top/right/left spines of this figure's axes explicitly
    # instead of relying on whatever the current figure happens to be.
    sns.despine(fig=fig, top=True, right=True, left=True)

    # Bottom panel: histogram of counts.  histplot replaces the deprecated
    # distplot(kde=False).
    sns.histplot(data, ax=ax_dis)

    mean = data.mean()
    median = data.median()
    # Label the lines themselves so the legend is bound to them and not to
    # whichever artists happen to come first on the axes (the histogram bars).
    ax_dis.axvline(mean, color='r', linestyle='--', linewidth=2, label='Mean')
    ax_dis.axvline(median, color='g', linestyle='-', linewidth=2, label='Median')
    ax_dis.legend()
# NOTE(review): the same column list and plotting loop already appear earlier
# in this file, so every numeric column ends up plotted twice.
# Numeric columns are treated as the continuous variables to plot.
list_col = df.select_dtypes(include='number').columns.to_list()
# To leave a column out, drop it from the list first, e.g.:
#     list_col.remove('<column name>')
# One figure per numeric column.
for col in list_col:
    dist_box(df[col])

# Without this the script builds the figures but never displays them when
# run from the command line.
plt.show()