-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecommendation_helpers.py
60 lines (51 loc) · 2.27 KB
/
recommendation_helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
# Calculate the mean squared error
def get_mse(pred, actual):
    """Return the mean squared error over the rated entries only.

    Only the positions where ``actual`` is nonzero are compared; every
    position where ``actual`` is zero is left out of the error.

    Args:
        pred: Array of predicted ratings, indexable by the nonzero
            positions of ``actual``.
        actual: Array of true ratings.

    Returns:
        The result of ``mean_squared_error`` on the selected entries.
    """
    # Positions where a true rating exists (nonzero entries of ``actual``).
    rated = actual.nonzero()
    observed_pred = pred[rated].flatten()
    observed_actual = actual[rated].flatten()
    return mean_squared_error(observed_pred, observed_actual)
# Hold-out train/test split for validation
def train_test_split(ratings, n_test=10):
    """Split a ratings matrix into disjoint train and test matrices.

    For every row (user) of ``ratings``, ``n_test`` of that row's nonzero
    entries are drawn at random without replacement. Those entries are
    zeroed in the training copy and copied into the test matrix.

    Sampling uses ``np.random.choice``, so the split is reproducible after
    ``np.random.seed``.

    Args:
        ratings: 2-D NumPy array of ratings, one row per user. Zero entries
            are never selected for the test set.
        n_test: Number of nonzero ratings to hold out per user. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        A ``(train, test)`` tuple. ``train`` is a copy of ``ratings`` with
        the held-out entries set to zero; ``test`` is a zero matrix of the
        same shape holding only the held-out entries.

    Raises:
        ValueError: Raised by ``np.random.choice`` when a user has fewer
            than ``n_test`` nonzero ratings.
    """
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    # ``range`` works on both Python 2 and 3; ``xrange`` exists only on 2.
    for user in range(ratings.shape[0]):
        rated_items = ratings[user, :].nonzero()[0]
        held_out = np.random.choice(rated_items, size=n_test, replace=False)
        train[user, held_out] = 0.
        test[user, held_out] = ratings[user, held_out]
    # Internal sanity check: no entry may be nonzero in both matrices.
    assert np.all((train * test) == 0)
    return train, test
# Cosine similarity between users or between items
def calc_similarity(ratings, kind='user', epsilon=1e-9):
    """Compute a cosine-similarity matrix between users or between items.

    The raw similarity is the matrix of dot products between rows
    (``kind='user'``) or columns (``kind='item'``) of ``ratings``, with
    ``epsilon`` added to every entry. It is then divided by the square
    roots of its own diagonal along both axes, so the diagonal of the
    result is 1.

    Args:
        ratings: 2-D NumPy array of ratings (rows are users, columns are
            items).
        kind: ``'user'`` for a users-by-users matrix, ``'item'`` for an
            items-by-items matrix.
        epsilon: Small constant added to every dot product to avoid
            division by zero for all-zero rows or columns.

    Returns:
        A square similarity matrix.

    Raises:
        ValueError: If ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        # Previously fell through and failed later with an unbound ``sim``.
        raise ValueError(
            "kind must be 'user' or 'item', got {!r}".format(kind))
    # Row vector of vector norms: sqrt of each self dot product.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return sim / norms / norms.T
def predict_simple(ratings, similarity, kind='user'):
    """Predict ratings as a similarity-weighted average over all neighbours.

    Args:
        ratings: 2-D NumPy array of ratings (rows are users, columns are
            items).
        similarity: Square similarity matrix; users-by-users when
            ``kind='user'``, items-by-items when ``kind='item'``.
        kind: ``'user'`` or ``'item'``; selects which axis is averaged.

    Returns:
        An array with the same shape as ``ratings`` holding the weighted
        sums divided by the sum of absolute similarities.

    Raises:
        ValueError: If ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    # Normaliser: total absolute similarity along each row of the matrix.
    weight_totals = np.abs(similarity).sum(axis=1)
    if kind == 'user':
        # Column vector, so each user's row is divided by that user's total.
        return similarity.dot(ratings) / np.array([weight_totals]).T
    if kind == 'item':
        # Row vector, so each item's column is divided by that item's total.
        return ratings.dot(similarity) / np.array([weight_totals])
    # Previously an unknown kind silently returned None.
    raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))
def predict_topk(ratings, similarity, kind='user', k=40):
    """Predict ratings from only the ``k`` most similar users or items.

    For ``kind='user'``, each user's predicted row is the weighted average
    of the rating rows of that user's ``k`` most similar users. For
    ``kind='item'``, each item's predicted column is the weighted average
    of the rating columns of that item's ``k`` most similar items. Weights
    are the similarities; the normaliser is the sum of their absolute
    values.

    Neighbours are ranked using column ``i`` of ``similarity`` while the
    weights are read from row ``i``; the two agree when ``similarity`` is
    symmetric.

    Args:
        ratings: 2-D NumPy array of ratings (rows are users, columns are
            items).
        similarity: Square similarity matrix; users-by-users when
            ``kind='user'``, items-by-items when ``kind='item'``.
        kind: ``'user'`` or ``'item'``.
        k: Number of nearest neighbours to use. If ``k`` exceeds the number
            of candidates, all of them are used.

    Returns:
        An array of predictions with the same shape as ``ratings``.

    Raises:
        ValueError: If ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    if kind not in ('user', 'item'):
        # Previously an unknown kind silently returned an all-zero matrix.
        raise ValueError(
            "kind must be 'user' or 'item', got {!r}".format(kind))

    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in range(ratings.shape[0]):
            # Indices of the k largest similarities, most similar first.
            # Kept as a plain 1-D index array: wrapping it in a list makes
            # current NumPy treat it as a 2-D index and breaks the dot.
            top_k_users = np.argsort(similarity[:, i])[:-k - 1:-1]
            weights = similarity[i, top_k_users]
            # One dot product fills the whole row for user i.
            pred[i, :] = weights.dot(ratings[top_k_users, :])
            pred[i, :] /= np.sum(np.abs(weights))
    else:
        for j in range(ratings.shape[1]):
            top_k_items = np.argsort(similarity[:, j])[:-k - 1:-1]
            weights = similarity[j, top_k_items]
            # One dot product fills the whole column for item j.
            pred[:, j] = ratings[:, top_k_items].dot(weights)
            pred[:, j] /= np.sum(np.abs(weights))
    return pred