-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathDemo_MovieLens.py
224 lines (171 loc) · 6.59 KB
/
Demo_MovieLens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
import time
import numpy as np
from VISolver.Domains.MixtureMean import MixtureMean
from VISolver.Domains.SVDMethod import SVDMethod
from VISolver.Domains.MatrixFactorization import MatrixFactorization
from VISolver.Solvers.HeunEuler import HeunEuler
from VISolver.Solvers.Euler import Euler
from VISolver.Projection import EntropicProjection, BoxProjection
from VISolver.Solver import Solve
from VISolver.Options import (
DescentOptions, Miscellaneous, Reporting, Termination, Initialization)
from VISolver.Log import PrintSimResults, PrintSimStats
from scipy.sparse import coo_matrix
from IPython import embed
def _load_ratings(filename, delimiter, shape):
    """Read a delimited ratings file into a sparse COO matrix.

    The first three columns of each line are read as (row id, column id,
    rating). Ids in the file are 1-based and are shifted to 0-based indices.

    Args:
        filename: path of the text file to read.
        delimiter: column separator; may be longer than one character.
        shape: (rows, cols) of the returned matrix.

    Returns:
        scipy.sparse.coo_matrix of the given shape holding the ratings.
    """
    if len(delimiter) > 1:
        # NumPy >= 1.23 only accepts single-character delimiters in loadtxt,
        # so multi-character separators (e.g. '::') are rewritten to a tab
        # on the fly before parsing.
        with open(filename) as handle:
            data = np.loadtxt((line.replace(delimiter, '\t') for line in handle),
                              usecols=(0, 1, 2), delimiter='\t', ndmin=2)
    else:
        data = np.loadtxt(filename, usecols=(0, 1, 2), delimiter=delimiter,
                          ndmin=2)
    # loadtxt yields floats; sparse indices must be integers.
    rows = data[:, 0].astype(int) - 1
    cols = data[:, 1].astype(int) - 1
    ratings = data[:, 2]
    return coo_matrix((ratings, (rows, cols)), shape=shape)


def Demo(small=True,folds=1):
    """Run the MovieLens matrix-completion demo over a number of folds.

    For each fold k (1-based in the file names) the training file
    ``path + str(k) + ext`` and the test file ``path + str(k) + '.test'``
    are loaded, the SVD method is scored on the test entries, and its RMSE
    is stored in row 4 of a (6, folds) table. The other rows are reserved
    for the baselines that are currently disabled below and stay NaN.
    The table is printed and an IPython shell is opened at the end.

    Args:
        small: if True use the 100k data set (tab separated, 943 x 1682),
            otherwise the 1M data set ('::' separated, 6040 x 3952).
        folds: number of folds to evaluate.
    """
    # __MOVIE_LENS:_MATRIX_COMPLETION__#########################################
    if small:
        path = '/Users/imgemp/Dropbox/690Op/Midterm/data/100/ml-100k/u'
        ext = '.base'
        dlim = '\t'
        sh = (943,1682)
    else:
        path = '/Users/imgemp/Dropbox/690Op/Midterm/data/1M/ml-1m/r'
        ext = '.train'
        dlim = '::'
        sh = (6040,3952)

    # SVT convergence: one array of reported rel_error values per fold
    rel_error = []

    # np.nan (lower case) is the spelling that exists in every NumPy release;
    # the np.NaN alias was removed in NumPy 2.0.
    RMSEs = np.full((6,folds),np.nan)

    np.random.seed(0)

    for k in range(folds):
        fold = str(k+1)

        # Load Training Data
        spdata_train = _load_ratings(path+fold+ext,dlim,sh)

        # Load Testing Data
        spdata_test = _load_ratings(path+fold+'.test',dlim,sh)

        # Dense boolean mask of the entries that carry a test rating
        mask = (spdata_test != 0).toarray()

        # Disabled baselines (rows 0-3 and 5 of RMSEs):
        # RMSEs[0,k] = score_globalmean(spdata_train,spdata_test,mask)
        # RMSEs[1,k] = score_usermean(spdata_train,spdata_test,mask)
        # RMSEs[2,k] = score_moviemean(spdata_train,spdata_test,mask)
        # RMSEs[3,k] = score_mixturemean(spdata_train,spdata_test,mask)
        tau = 5*np.sqrt(np.prod(spdata_train.shape))
        RMSEs[4,k], re = score_svdmethod(spdata_train,spdata_test,mask,tau=tau)
        rel_error.append(np.asarray(re))
        # RMSEs[5,k] = score_matrixfac(spdata_train,spdata_test,mask)

    # NOTE(review): assumed to run once after all folds (the original
    # indentation was lost) - confirm.
    print(RMSEs)

    # Drop into an interactive shell so RMSEs / rel_error can be inspected.
    embed()
def rmse(pred,test,mask):
    """Return the masked root-mean-square error between `pred` and `test`.

    The element-wise difference ``pred - test`` is converted to an ndarray,
    squared, multiplied by `mask`, summed, divided by ``test.nnz`` and
    square-rooted.

    Args:
        pred: predictions, same shape as `test`.
        test: reference ratings; must expose ``nnz`` (e.g. a scipy sparse
            matrix).
        mask: array multiplied into the squared errors to select entries.

    Returns:
        The scalar error value.
    """
    residual = np.asarray(pred - test)
    masked_sq = mask * residual**2.
    total = masked_sq.sum()
    return np.sqrt(total / test.nnz)
def score_globalmean(train,test,mask):
    """Score the baseline that predicts one global mean everywhere.

    The mean is ``train.sum() / train.nnz``; it is multiplied by `mask` to
    form the prediction, which is then scored with `rmse`.

    Args:
        train: training ratings (must expose ``sum()`` and ``nnz``).
        test: test ratings passed on to `rmse`.
        mask: array selecting the test entries.

    Returns:
        The RMSE of the global-mean prediction.
    """
    globalmean = train.sum()/train.nnz
    prediction = globalmean*mask
    return rmse(prediction,test,mask)
def score_usermean(train,test,mask):
    """Score the baseline that predicts each row's mean training rating.

    Row means are the row sums divided by the per-row stored-entry counts
    (``getnnz(axis=1)``). Rows whose mean comes out as NaN receive the
    global mean ``train.sum() / train.nnz`` instead. Rows are presumably
    users, given how the matrices are built in Demo.

    Args:
        train: sparse training ratings.
        test: test ratings passed on to `rmse`.
        mask: array selecting the test entries.

    Returns:
        The RMSE of the row-mean prediction.
    """
    fallback = train.sum()/train.nnz

    row_totals = np.asarray(train.sum(axis=1)).squeeze()
    row_counts = train.getnnz(axis=1)
    per_user = row_totals/row_counts

    # Rows without a usable mean fall back to the global mean.
    missing = np.isnan(per_user)
    per_user[missing] = fallback

    # Transpose so the per-row means broadcast along the last axis, then
    # transpose back to the original orientation.
    prediction = (mask.T*per_user).T
    return rmse(prediction,test,mask)
def score_moviemean(train,test,mask):
    """Score the baseline that predicts each column's mean training rating.

    Column means are the column sums divided by the per-column stored-entry
    counts (``getnnz(axis=0)``). Columns whose mean comes out as NaN receive
    the global mean ``train.sum() / train.nnz`` instead. Columns are
    presumably movies, given how the matrices are built in Demo.

    Args:
        train: sparse training ratings.
        test: test ratings passed on to `rmse`.
        mask: array selecting the test entries.

    Returns:
        The RMSE of the column-mean prediction.
    """
    fallback = train.sum()/train.nnz

    col_totals = np.asarray(train.sum(axis=0)).squeeze()
    col_counts = train.getnnz(axis=0)
    per_movie = col_totals/col_counts

    # Columns without a usable mean fall back to the global mean.
    missing = np.isnan(per_movie)
    per_movie[missing] = fallback

    prediction = mask*per_movie
    return rmse(prediction,test,mask)
def score_mixturemean(train,test,mask,step=-1e-3,iters=100):
    """Fit a MixtureMean domain with the Euler method and score it.

    Args:
        train: training ratings handed to ``MixtureMean(Data=...)``.
        test: test ratings passed on to `rmse`.
        mask: array selecting the test entries.
        step: initial step given to ``Initialization``.
        iters: iteration cap given to ``Termination(MaxIter=...)``.

    Returns:
        The RMSE of ``Domain.predict`` evaluated at the last stored iterate.
    """
    # Problem definition
    domain = MixtureMean(Data=train)

    # Solver: Euler with a box projection onto [0, 1].
    # (Alternatives tried in the past: Euler or HeunEuler with
    # EntropicProjection and a two-component start [.452, .548].)
    method = Euler(Domain=domain,P=BoxProjection(lo=0.,hi=1.))

    # Single-parameter starting point
    start = np.array([.452])

    # Solver options: initial step, iteration cap, values to report
    options = DescentOptions(Initialization(Step=step),
                             Termination(MaxIter=iters),
                             Reporting(Requests=['Step', 'F Evaluations']),
                             Miscellaneous())

    # Report the configuration before solving
    PrintSimStats(domain,method,options)

    # Solve and time the run
    started = time.time()
    results = Solve(start,method,domain,options)
    elapsed = time.time() - started

    # Report the outcome
    PrintSimResults(options,results,method,elapsed)

    # Predict from the last stored iterate
    final_params = np.asarray(results.TempStorage['Data'][-1])
    prediction = domain.predict(final_params)
    return rmse(prediction,test,mask)
def score_svdmethod(train,test,mask,tau=6e3,step=1.9,fixstep=True,iters=250):
    """Run the SVDMethod domain with HeunEuler and score the result.

    Args:
        train: training ratings handed to ``SVDMethod(Data=...)``.
        test: test ratings passed on to `rmse`.
        mask: array selecting the test entries.
        tau: threshold handed to ``SVDMethod(tau=...)``.
        step: initial step given to ``Initialization``.
        fixstep: currently unused; only the disabled fixed-step Euler
            variant referred to it.
        iters: iteration cap given to ``Termination(MaxIter=...)``.

    Returns:
        A pair ``(rmse, history)`` where ``history`` is
        ``Results.PermStorage[Domain.rel_error]``.
    """
    # Problem definition
    domain = SVDMethod(Data=train,tau=tau)

    # Solver: adaptive HeunEuler (a fixed-step Euler variant was used before)
    method = HeunEuler(Domain=domain,Delta0=1e2,MinStep=1e0,MaxStep=1e3)

    # Start from a flat all-zeros vector with one entry per matrix cell
    # (a global-mean start was used before).
    start = np.zeros(int(np.prod(train.shape)))

    # Solver options: stop on the iteration cap or the rel_error tolerance,
    # and report rel_error alongside the step and F-evaluation counts.
    options = DescentOptions(
        Initialization(Step=step),
        Termination(MaxIter=iters,Tols=[(domain.rel_error,0.2)]),
        Reporting(Requests=[domain.rel_error,'Step', 'F Evaluations']),
        Miscellaneous())

    # Report the configuration before solving
    PrintSimStats(domain,method,options)

    # Solve and time the run
    started = time.time()
    results = Solve(start,method,domain,options)
    elapsed = time.time() - started

    # Report the outcome
    PrintSimResults(options,results,method,elapsed)

    # Reshape the last stored iterate to matrix form and shrink it with the
    # domain's own tau to obtain the prediction.
    last_iterate = np.asarray(results.TempStorage['Data'][-1])
    Y = last_iterate.reshape(train.shape)
    prediction = domain.shrink(Y,domain.tau)

    score = rmse(prediction,test,mask)
    return score, results.PermStorage[domain.rel_error]
def score_matrixfac(train,test,mask,step=1e-5,iters=100,k=500):
    """Fit a MatrixFactorization domain with HeunEuler and score it.

    Args:
        train: training ratings handed to ``MatrixFactorization(Data=...)``.
        test: test ratings passed on to `rmse`.
        mask: array selecting the test entries.
        step: initial step given to ``Initialization``.
        iters: iteration cap given to ``Termination(MaxIter=...)``.
        k: number of columns of each factor.

    Returns:
        The RMSE of ``Domain.predict`` evaluated at the last stored iterate.
    """
    # Factor shapes: one (rows x k) and one (cols x k) factor
    rows, cols = train.shape
    sh_P = (rows,k)
    sh_Q = (cols,k)
    domain = MatrixFactorization(Data=train,sh_P=sh_P,sh_Q=sh_Q)

    # Solver: adaptive HeunEuler (a fixed-step Euler variant was used before)
    method = HeunEuler(Domain=domain,Delta0=1e-1,MinStep=1e-7,MaxStep=1e-2)

    # Constant starting factors with every entry sqrt(globalmean / k)
    # (random factors were used before).
    globalmean = train.sum()/train.nnz
    fill = np.sqrt(globalmean/k)
    P0 = np.full(sh_P,fill)
    Q0 = np.full(sh_Q,fill)
    start = np.concatenate((P0.ravel(),Q0.ravel()))

    # Solver options: initial step, iteration cap, values to report
    options = DescentOptions(Initialization(Step=step),
                             Termination(MaxIter=iters),
                             Reporting(Requests=['Step', 'F Evaluations']),
                             Miscellaneous())

    # Report the configuration before solving
    PrintSimStats(domain,method,options)

    # Solve and time the run
    started = time.time()
    results = Solve(start,method,domain,options)
    elapsed = time.time() - started

    # Report the outcome
    PrintSimResults(options,results,method,elapsed)

    # Predict from the last stored iterate
    final_params = np.asarray(results.TempStorage['Data'][-1])
    prediction = domain.predict(final_params)
    return rmse(prediction,test,mask)
# Script entry point: run the demo on the large ('::'-delimited) data set
# with five folds.
if __name__ == '__main__':
    Demo(small=False,folds=5)
    # Alternative: run with the defaults (small data set, one fold).
    # Demo()