# Source: GitHub copy of "13. Generalized Linear Models, take 2.py" (132 lines, 89 loc, 4.76 KB)
# Generalized Linear Models
#13.2 Logistic Regression
'''When you have a binary response variable, logistic regression is often used to model the data.
Here’s some data from the American Community Survey (ACS) for New York.'''
##### Import #####
# Grouped per PEP 8 (third-party imports, alphabetical); the original imported
# statsmodels.formula.api as smf twice — the duplicate was removed.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from lifelines import KaplanMeierFitter
from sklearn import linear_model
##################
# Load the New York ACS data. NOTE: hard-coded absolute path — adjust for your machine.
acs = pd.read_csv('/Users/russellconte/Documents/Pandas for Everyone/pandas_for_everyone-master/data/acs_ny.csv')
print(acs.head())
acs.columns
print(acs.columns) # the print function does not change the output
acs.info()  # BUG FIX: original had `acs.info` (no parentheses), which only references the method and prints nothing
'''To model these data, we first need to create a binary response variable.
Here we split the FamilyIncome variable into a binary variable.'''
# pd.cut bins FamilyIncome at 150,000: label 0 = (0, 150000], label 1 = above 150,000.
acs['ge150k'] = pd.cut(acs['FamilyIncome'], [0, 150000, acs['FamilyIncome'].max()], labels=[0,1])
# Convert the categorical to a plain int column for use as a model response.
acs['ge150k_i'] = acs['ge150k'].astype(int)
print(acs['ge150k_i'].value_counts())
acs.info()
# 13.2.1 GLM using Statsmodels
# Fit a logistic regression of the binary income indicator on household predictors.
logit_model = smf.logit(
    'ge150k_i ~ HouseCosts + NumWorkers + OwnRent + NumBedrooms + FamilyType',
    data=acs,
)
results = logit_model.fit()
print(results.summary())
# Coefficients are on the log-odds scale; exponentiate to read them as odds ratios.
odds_ratios = np.exp(results.params)
print(odds_ratios)
# Example readings: each additional bedroom multiplies the odds of FamilyIncome
# exceeding 150,000 by ~1.27, and owning the home outright (versus holding a
# mortgage) multiplies those odds by ~1.82.
# 13.2.2 Using Sklearn
# sklearn has no formula interface, so dummy variables must be created manually. UGH!!!
predictors = pd.get_dummies(
    acs[['HouseCosts', 'NumWorkers', 'OwnRent', 'NumBedrooms', 'FamilyType']], drop_first=True
)
lr = linear_model.LogisticRegression()
# Fitting works exactly like fitting a linear regression.
fitted = lr.fit(X=predictors, y=acs['ge150k_i'])  # this returns a warning but not an error
print(fitted.coef_)       # this runs
print(fitted.intercept_)  # this runs
# Assemble intercept + coefficients into a labelled data frame for readability,
# then exponentiate so the values can be read as odds ratios.
coef_table = pd.DataFrame(
    np.append(fitted.intercept_, fitted.coef_),
    index=np.append('intercept', predictors.columns),
    columns=['coef'],
)
coef_table['or'] = np.exp(coef_table['coef'])
print(coef_table)
# 13.3 Poisson Regression
# 13.3.1 Using Statsmodels
# smf.poisson on these data emits warnings and fails to converge (summary is nan);
# kept here to show the symptom before switching to the GLM interface below.
results = smf.poisson(
    'NumChildren ~ FamilyIncome + FamilyType + OwnRent',
    data=acs).fit() # this returns four warnings and does not converge
print(results.summary()) # does not converge, results are nan
# The GLM interface with a Poisson family fits the same model and runs cleanly.
# NOTE: the original also built a model passing sm.genmod.families.links.log
# explicitly, which is deprecated AND was immediately overwritten by the line
# below — that dead construction was removed. sm.families.Poisson() already
# uses the log link by default, so the results are identical.
model = smf.glm('NumChildren ~ FamilyIncome + FamilyType + OwnRent', data = acs,
                family = sm.families.Poisson())
results = model.fit()
print(results.summary())
# 13.3.2 Negative Binomial Regression for Overdispersion
# If our assumptions for Poisson regression are violated—that is, if our data has overdispersion—we can perform a negative binomial regression instead.
model = smf.glm('NumChildren ~ FamilyIncome + FamilyType + OwnRent',
                data=acs,
                family=sm.families.NegativeBinomial())
results = model.fit()  # BUG FIX: the original never fit this model, so the
                       # summary below printed the *previous Poisson* results
print(results.summary())
# 13.5 Survival Analysis
# Survival analysis is used when modeling the time to occurrence of a certain event.
bladder = pd.read_csv('/Users/russellconte/Documents/Pandas for Everyone/pandas_for_everyone-master/data/bladder.csv')
print(bladder.head(15))  # BUG FIX: bare `bladder.head(15)` displays nothing when run as a script
# Counts of the different treatments :)
print(bladder['rx'].value_counts()) # 1: 188, 2: 252
'''Creating the model and fitting the data proceed similarly to how models are fit using sklearn.
The stop variable indicates when an event occurs,
and the event variable signals whether the event of interest (bladder cancer re-occurrence) occurred. '''
kmf = KaplanMeierFitter()
kmf.fit(bladder['stop'], event_observed=bladder['event'])
# Plot the Kaplan-Meier survival curve with matplotlib.
fig, ax = plt.subplots()  # BUG FIX: original unpacked into `fix` — a typo for `fig`
ax = kmf.survival_function_.plot(ax = ax)
# BUG FIX: the original title ("political regimes") was copied from the lifelines
# docs example; this plot shows the bladder-cancer recurrence data.
ax.set_title('Survival function of bladder cancer recurrence')
plt.show()