-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNaiveBayes.py
74 lines (61 loc) · 2.8 KB
/
NaiveBayes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import numpy as np
import math
class Outcome:
    """Feature statistics for one outcome (class) of a naive Bayes model.

    Attributes:
        n: number of samples added through ``add_sample``.
        distribution: float array holding, per feature, the sum of the
            (optionally weighted) feature values seen so far.
    """

    def __init__(self, f_count):
        """Create an empty outcome tracking ``f_count`` features."""
        self.n = 0
        self.distribution = np.zeros(f_count)

    def resize_features(self, new_f_count):
        """Grow or shrink the feature array to ``new_f_count`` entries.

        Existing values are kept (truncated when shrinking) and new slots
        are zero-filled. A fresh array is allocated rather than resizing in
        place, because in-place ``ndarray.resize`` raises ``ValueError``
        whenever the array is referenced from anywhere else.
        """
        resized = np.zeros(new_f_count, dtype=self.distribution.dtype)
        keep = min(resized.size, self.distribution.size)
        resized[:keep] = self.distribution.ravel()[:keep]
        self.distribution = resized

    def add_sample(self, f_array, probability=1):
        """Accumulate one sample.

        Args:
            f_array: feature values, one per tracked feature.
            probability: weight applied to every feature value before it is
                added to ``distribution``. ``n`` always grows by exactly 1,
                whatever the weight.
        """
        self.distribution += np.multiply(f_array, probability)
        self.n += 1

    def predict(self, f_array):
        """Return the log-likelihood of ``f_array`` under this outcome.

        For feature ``i`` the smoothed probability of it being present is
        ``(1 + distribution[i]) / (size + n)`` and of it being absent is the
        complement. The logs of the matching probabilities are summed.

        Args:
            f_array: indexable of truthy/falsy values; only the first
                ``distribution.size`` entries are read.

        Returns:
            The summed log-probability as a float. ``0.0`` when no features
            are tracked, and ``-inf`` when any required probability is zero
            (e.g. a single feature that was present in every sample) instead
            of failing with a math domain error.
        """
        size = self.distribution.size
        if size == 0:
            # No features: empty product, i.e. log(1).
            return 0.0

        total = size + self.n
        log_total = math.log(total)
        result = 0.0
        for i, count in enumerate(self.distribution.flat):
            if f_array[i]:
                numerator = 1. + count
            else:
                # Complement of the "present" probability, same denominator.
                numerator = total - 1. - count
            if numerator <= 0:
                # Probability zero: the whole product is zero.
                return float('-inf')
            result += math.log(numerator) - log_total
        return result
class NaiveBayesClassifier:
    """Naive Bayes classifier over a fixed list of ``Outcome`` objects.

    Attributes:
        outcomes: one ``Outcome`` per class, indexed by outcome id.
        sample_size: total number of samples passed to ``fit``.
    """

    def __init__(self, outcome_count, feature_count):
        """Create ``outcome_count`` empty outcomes of ``feature_count`` features."""
        self.outcomes = [Outcome(feature_count) for _ in range(outcome_count)]
        self.sample_size = 0

    def fit(self, feature_vector, outcome_id, prob=1):
        """Add one sample, weighted by ``prob``, to the outcome ``outcome_id``."""
        self.outcomes[outcome_id].add_sample(feature_vector, prob)
        self.sample_size += 1

    def train(self, inputs):
        """Fit every ``(data, outcome, prob)`` triple from ``inputs``."""
        for data, outcome, prob in inputs:
            self.fit(data, outcome, prob)

    def _log_score(self, outcome, data):
        """Return log prior + log likelihood of ``data`` for ``outcome``.

        An outcome without samples (or an untrained classifier) has prior
        zero, so its score is ``-inf`` rather than a math domain error.
        """
        if outcome.n <= 0 or self.sample_size <= 0:
            return float('-inf')
        return math.log(outcome.n) - math.log(self.sample_size) + outcome.predict(data)

    def predict(self, data):
        """Return ``(outcome_id, score)`` for the best-scoring outcome.

        The score is the unnormalised log posterior. Ties keep the lowest
        outcome id. If no outcome has a finite score (no outcomes, or no
        training data), both tuple members are ``-inf``.
        """
        outcome_id = -float('inf')
        outcome_probs = -float('inf')
        for i, outcome in enumerate(self.outcomes):
            i_prob = self._log_score(outcome, data)
            if i_prob > outcome_probs:
                outcome_probs = i_prob
                outcome_id = i
        return (outcome_id, outcome_probs)

    def predict_all(self, data):
        """Return ``[(outcome_id, score), ...]`` for every outcome, in id order.

        Scores are the same unnormalised log posteriors that ``predict``
        compares (log prior + log likelihood). Multiplying the linear prior
        by a log-likelihood, as the pre-log formula did, gives a meaningless
        number.
        """
        return [(i, self._log_score(outcome, data))
                for i, outcome in enumerate(self.outcomes)]

    def set_feature_count(self, new_f_count):
        """Resize every outcome to ``new_f_count`` features.

        An explicit loop is required: on Python 3 ``map`` is lazy, so using
        it purely for side effects would never run the resize.
        """
        for outcome in self.outcomes:
            outcome.resize_features(new_f_count)