Skip to content

Commit

Permalink
Merge pull request #42 from shakedzy/v0.5-api-update
Browse files Browse the repository at this point in the history
V0.5 api update
  • Loading branch information
shakedzy authored Apr 17, 2020
2 parents 7a2e07d + 461451e commit 6ef6be9
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 202 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.4.7
0.5.0a0
6 changes: 3 additions & 3 deletions dython/examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def roc_graph_example():
y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

# Plot ROC graphs
roc_graph(y_test, y_score, class_names=iris.target_names)
return roc_graph(y_test, y_score, class_names=iris.target_names)


def associations_iris_example():
Expand All @@ -58,7 +58,7 @@ def associations_iris_example():
df = pd.concat([X, y], axis=1)

# Plot features associations
associations(df)
return associations(df)


def associations_mushrooms_example():
Expand All @@ -75,4 +75,4 @@ def associations_mushrooms_example():
'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population', 'habitat']

# Plot features associations
associations(df, theil_u=True, figsize=(15, 15))
return associations(df, theil_u=True, figsize=(15, 15))
210 changes: 122 additions & 88 deletions dython/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,6 @@
'roc_graph'
]

# ROC graphs defaults
_DEFAULT_FORMAT = '.2f'
_DEFAULT_LINE_WIDTH = 2
_DEFAULT_MARKER_SIZE = 10
_DEFAULT_COLORS = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'darkorange']
_DEFAULT_COLOR = 'darkorange'
_DEFAULT_MICRO_COLOR = 'deeppink'
_DEFAULT_MACRO_COLOR = 'navy'
_DEFAULT_LINE_STYLE = '-'
_DEFAULT_MICRO_MACRO_LINE_STYLE = ':'
_DEFAULT_THRESHOLD_ANNOTATION_OFFSET = (-.027, .03)
_DEFAULT_MARKER = 'o'


def _display_roc_plot(xlim, ylim):
plt.plot([0, 1], [0, 1], color='grey', lw=1, linestyle='--')
Expand All @@ -34,21 +21,22 @@ def _display_roc_plot(xlim, ylim):
plt.show()


def _draw_estimated_optimal_threshold_mark(fpr, tpr, thresholds, color, ms, fmt):
def _draw_estimated_optimal_threshold_mark(fpr, tpr, thresholds, color, ms, fmt, ax):
annotation_offset = (-.027, .03)
a = np.zeros((len(fpr), 2))
a[:, 0] = fpr
a[:, 1] = tpr
dist = lambda row: row[0]**2 + (1 - row[1])**2
amin = np.apply_along_axis(dist, 1, a).argmin()
plt.plot(fpr[amin], tpr[amin], color=color, marker=_DEFAULT_MARKER, ms=ms)
plt.gca().annotate("{th:{fmt}}".format(th=thresholds[amin], fmt=fmt),
xy=(fpr[amin], tpr[amin]), color=color,
xytext=(fpr[amin]+_DEFAULT_THRESHOLD_ANNOTATION_OFFSET[0],
tpr[amin]+_DEFAULT_THRESHOLD_ANNOTATION_OFFSET[1]))
plt.plot(fpr[amin], tpr[amin], color=color, marker='o', ms=ms)
ax.annotate("{th:{fmt}}".format(th=thresholds[amin], fmt=fmt),
xy=(fpr[amin], tpr[amin]), color=color,
xytext=(fpr[amin]+annotation_offset[0],
tpr[amin]+annotation_offset[1]))
return thresholds[amin]


def _plot_macro_roc(fpr, tpr, n, **kwargs):
def _plot_macro_roc(fpr, tpr, n, lw, fmt, ax):
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n):
Expand All @@ -57,18 +45,16 @@ def _plot_macro_roc(fpr, tpr, n, **kwargs):
fpr_macro = all_fpr
tpr_macro = mean_tpr
auc_macro = auc(fpr_macro, tpr_macro)
fmt = kwargs.get('fmt', '.2f')
lw = kwargs.get('lw', 2)
label = 'ROC curve: macro (AUC = {auc:{fmt}})'.format(auc=auc_macro, fmt=fmt)
plt.plot(fpr_macro,
tpr_macro,
label=label,
color=_DEFAULT_MACRO_COLOR,
ls=_DEFAULT_MICRO_MACRO_LINE_STYLE,
lw=lw)
ax.plot(fpr_macro,
tpr_macro,
label=label,
color='navy',
ls=':',
lw=lw)


def _binary_roc_graph(y_true, y_pred, eoptimal, **kwargs):
def _binary_roc_graph(y_true, y_pred, eoptimal, class_label, color, lw, ls, ms, fmt, ax):
y_true = convert(y_true, 'array')
y_pred = convert(y_pred, 'array')
if y_pred.shape != y_true.shape:
Expand All @@ -81,32 +67,45 @@ def _binary_roc_graph(y_true, y_pred, eoptimal, **kwargs):
y_p = [x[1] for x in y_pred]
fpr, tpr, th = roc_curve(y_t, y_p)
auc_score = auc(fpr, tpr)
color = kwargs.get('color', _DEFAULT_COLOR)
lw = kwargs.get('lw', _DEFAULT_LINE_WIDTH)
ls = kwargs.get('ls', _DEFAULT_LINE_STYLE)
fmt = kwargs.get('fmt', _DEFAULT_FORMAT)
if 'class_label' in kwargs and kwargs['class_label'] is not None:
class_label = ': {}'.format(kwargs['class_label'])
if 'class_label' is not None:
class_label = ': ' + class_label
else:
class_label = ''
label = 'ROC curve{class_label} (AUC = {auc:{fmt}}'.format(class_label=class_label, auc=auc_score, fmt=fmt)
if eoptimal:
eopt = _draw_estimated_optimal_threshold_mark(fpr, tpr, th, color, kwargs.get('ms', _DEFAULT_MARKER_SIZE), fmt)
eopt = _draw_estimated_optimal_threshold_mark(fpr, tpr, th, color, ms, fmt, ax)
label += ', eOpT = {th:{fmt}})'.format(th=eopt, fmt=fmt)
else:
eopt = None
label += ')'
plt.plot(fpr,
tpr,
color=color,
lw=lw,
ls=ls,
label=label)
return {'fpr': fpr, 'tpr': tpr, 'thresholds': th}
ax.plot(fpr,
tpr,
color=color,
lw=lw,
ls=ls,
label=label)
return {'fpr': fpr, 'tpr': tpr, 'thresholds': th,
'auc': auc_score, 'eopt': eopt}


def roc_graph(y_true, y_pred, micro=True, macro=True, eoptimal_threshold=True, class_names=None, **kwargs):
def roc_graph(y_true,
y_pred,
micro=True,
macro=True,
eoptimal_threshold=True,
class_names=None,
colors=None,
ax=None,
figsize=None,
xlim=(0.,1.),
ylim=(0.,1.02),
lw=2,
ls='-',
ms=10,
fmt='.2f'
):
"""
Plot a ROC graph of predictor's results (inclusding AUC scores), where each
Plot a ROC graph of predictor's results (including AUC scores), where each
row of y_true and y_pred represent a single example.
If there are 1 or two columns only, the data is treated as a binary
classification (see input example below).
Expand All @@ -117,22 +116,8 @@ def roc_graph(y_true, y_pred, micro=True, macro=True, eoptimal_threshold=True, c
Based on sklearn examples (as was seen on April 2018):
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
**Example:** See `roc_graph_example` under `dython.examples`
**Binary Classification Input Example:**
Consider a data-set of two data-points where the true class of the first line
is class 0, which was predicted with a probability of 0.6, and the second line's
true class is 1, with predicted probability of 0.8.
```python
# First option:
>> binary_roc_graph(y_true=[0,1], y_pred=[0.6,0.8])
# Second option:
>> binary_roc_graph(y_true=[[1,0],[0,1]], y_pred=[[0.6,0.4],[0.2,0.8]])
# Both yield the same result
```
Parameters
----------
Parameters:
-----------
y_true : list / NumPy ndarray
The true classes of the predicted data
y_pred : list / NumPy ndarray
Expand All @@ -152,10 +137,48 @@ def roc_graph(y_true, y_pred, micro=True, macro=True, eoptimal_threshold=True, c
order must match the order of the classes probabilities in the input
data. In a binary classification, can be a string or a list. If a list,
only the last element will be used.
kwargs : any key-value pairs
Different options and configurations. Some possible options: figsize,
color, lw (line-width), ls (line-style), ms (marker-size), fmt (number
format)
colors : list of Matplotlib color strings or None, default = None
List of colors to be used for the plotted curves. If `None`, falls back
to a predefined default.
ax : matplotlib Axes or None, default = None
Matplotlib Axes on which the curves will be plotted. If `None`, a new
figure is created and its current Axes is used.
figsize : (int,int) or None, default = None
A Matplotlib figure-size tuple. If `None`, falls back to Matplotlib's
default. Only used if `ax=None`.
xlim : (float, float), default = (0.,1.)
X-axis limits.
ylim : (float,float), default = (0.,1.02)
Y-axis limits.
lw : int, default = 2
Line-width.
ls : string, default = '-'
Matplotlib line-style string
ms : int, default = 10
Marker-size.
fmt : string, default = '.2f'
String formatting of displayed AUC and threshold numbers.
Returns:
--------
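A dictionary with one key for each class. Each value is another dictionary,
holding that class's AUC and eOpT values under the keys `auc` and `eopt`.
The dictionary also has an `ax` key, holding the Matplotlib Axes on which
the curves were plotted.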
Binary Classification Input Example:
------------------------------------
Consider a data-set of two data-points where the true class of the first line
is class 0, which was predicted with a probability of 0.6, and the second line's
true class is 1, with predicted probability of 0.8.
```python
# First option:
>> roc_graph(y_true=[0,1], y_pred=[0.6,0.8])
# Second option:
>> roc_graph(y_true=[[1,0],[0,1]], y_pred=[[0.6,0.4],[0.2,0.8]])
# Both yield the same result
```
Example:
--------
See `roc_graph_example` under `dython.examples`
"""
all_fpr = list()
all_tpr = list()
Expand All @@ -168,12 +191,20 @@ def roc_graph(y_true, y_pred, micro=True, macro=True, eoptimal_threshold=True, c
class_names = convert(class_names, 'list')
else:
class_names = [class_names]
plt.figure(figsize=kwargs.get('figsize', None))
if ax is None:
plt.figure(figsize=figsize)
ax = plt.gca()
colors = colors or ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'darkorange']
output_dict = dict()
if len(y_pred.shape) == 1 or y_pred.shape[1] <= 2:
class_label = class_names[-1] if class_names is not None else None
_binary_roc_graph(y_true, y_pred, eoptimal=eoptimal_threshold, class_label=class_label, **kwargs)
d = _binary_roc_graph(y_true, y_pred, eoptimal=eoptimal_threshold,
class_label=class_label, color=colors[-1],
lw=lw, ls=ls, ms=ms, fmt=fmt, ax=ax)
class_label = class_label or '0'
output_dict[class_label] = {'auc': d['auc'],
'eopt': d['eopt']}
else:
colors = _DEFAULT_COLORS
n = y_pred.shape[1]
if class_names is not None:
if not isinstance(class_names, list):
Expand All @@ -182,46 +213,49 @@ def roc_graph(y_true, y_pred, micro=True, macro=True, eoptimal_threshold=True, c
raise ValueError('Number of class names does not match input data size.')
for i in range(0, n):
class_label = class_names[i] if class_names is not None else str(i)
pr = _binary_roc_graph(y_true[:, i],
y_pred[:, i],
eoptimal=eoptimal_threshold,
color=colors[i % len(colors)],
class_label=class_label,
**kwargs)
all_fpr.append(pr['fpr'])
all_tpr.append(pr['tpr'])
d = _binary_roc_graph(y_true[:, i],
y_pred[:, i],
eoptimal=eoptimal_threshold,
color=colors[i % len(colors)],
class_label=class_label,
lw=lw, ls=ls, ms=ms, fmt=fmt, ax=ax)
all_fpr.append(d['fpr'])
all_tpr.append(d['tpr'])
output_dict[class_label] = {'auc': d['auc'],
'eopt': d['eopt']}
if micro:
_binary_roc_graph(y_true.ravel(),
y_pred.ravel(),
eoptimal=False,
ls=_DEFAULT_MICRO_MACRO_LINE_STYLE,
color=_DEFAULT_MICRO_COLOR,
ls=':',
color='deeppink',
class_label='micro',
**kwargs)
lw=lw, ms=ms, fmt=fmt, ax=ax)
if macro:
_plot_macro_roc(all_fpr, all_tpr, n, **kwargs)
_display_roc_plot(xlim=kwargs.get('xlim', (0, 1)),
ylim=kwargs.get('ylim', (0, 1.02)))
_plot_macro_roc(all_fpr, all_tpr, n, lw, fmt, ax)
_display_roc_plot(xlim=xlim, ylim=ylim)
output_dict['ax'] = ax
return output_dict


def random_forest_feature_importance(forest, features, **kwargs):
def random_forest_feature_importance(forest, features, precision=4):
"""
Given a trained `sklearn.ensemble.RandomForestClassifier`, return its
features paired with their importance according to the classifier, sorted
from the most important to the least.
Parameters
----------
Parameters:
-----------
forest : sklearn.ensemble.RandomForestClassifier
A trained `RandomForestClassifier`
features : list
A list of the names of the features the classifier was trained on,
ordered by the same order they appeared
in the training data
kwargs : any key-value pairs
Different options and configurations
precision : int, default = 4
Number of decimal places to which each feature importance is rounded
"""
return sorted(zip(
map(lambda x: round(x, kwargs.get('precision', 4)),
map(lambda x: round(x, precision),
forest.feature_importances_), features),
reverse=True)
Loading

0 comments on commit 6ef6be9

Please sign in to comment.