-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataSet.py
176 lines (149 loc) · 6.08 KB
/
DataSet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
from numpy import *
import BigMat as bm
import cPickle as cp
from time import time as now
class BatchSet(object):
def __init__(self,X,Y,batchsize):
self._size = m = X.shape[0]
self._X = X
self._Y = Y
self._index = 0
self._batchsize = batchsize
self._blocksize = batchsize * max(1,2048//batchsize)
self._batches = vstack([arange(0 ,m ,batchsize),
arange(batchsize,m+batchsize,batchsize)]).transpose()
self._batches[-1,-1] = m
def __iter__(self):
self._index = 0
return self
def next(self):
if self._index >= len(self._batches):
raise StopIteration
s = slice(*(self._batches[self._index]))
batch = DataFold(self._X[s,:],self._Y[s,:])
self._index += 1
return batch
def __len__(self):
return len(self._batches)
def shuffle(self):
random.shuffle(self._batches)
class DataFold(object):
'''
A simple structure containing a subset of inputs X,
and the corresponding target outputs Y
'''
def __init__(self,X,Y):
assert(X.shape[0] == Y.shape[0])
self.X = X
self.Y = Y
self.size = X.shape[0]
def __iter__(self):
return [self.X,self.Y].__iter__() # let X,Y = data unpack
def __getitem__(self,i):
return DataFold(self.X[i,:],self.Y[i,:])
def make_batches(self,batchsize):
return BatchSet(self.X,self.Y,batchsize)
class DataSet(object):
'''
A simple structure containing three DataFold instances:
a 'train', 'valid', and 'test.
'''
def __init__(self,X,Y,Xshape=None,Yshape=None,Xrange=None,Yrange=None,shuffle=True,max_batchsize=1):
if shuffle:
perm = random.permutation(X.shape[0])
X[:] = X[perm]
Y[:] = Y[perm]
self._X = bm.asarray(X)
self._Y = bm.asarray(Y)
self._size = X.shape[0]
self.max_batchsize = max_batchsize
self.Xshape = Xshape or (1,X.shape[1])
self.Yshape = Yshape or (1,Y.shape[1])
self.Xdim = X.shape[1]
self.Ydim = Y.shape[1]
self.Xrange = Xrange or (X.min(axis=0),X.max(axis=0))
self.Yrange = Yrange or (Y.min(axis=0),Y.max(axis=0))
rs = self._rowslice(0,self._size)
self.train = DataFold(X[rs,:],Y[rs,:])
self.valid = DataFold(X[0:0,:],Y[0:0,:])
self.test = DataFold(X[0:0,:],Y[0:0,:])
def keys(self): return ['train','valid','test']
def values(self): return [self.train,self.valid,self.test]
def items(self): return zip(self.keys(),self.values())
def _rowslice(self,a,b):
# round endpoint down
b = a + self.max_batchsize *((b-a) // self.max_batchsize)
return slice(a,b)
def __getitem__(self,key):
if key == 'train': return self.train
elif key == 'valid': return self.valid
elif key == 'test': return self.test
raise KeyError("invalid key for DataSet fold")
def split(self,trainsplit,validsplit=0,testsplit=0):
assert(trainsplit + validsplit + testsplit <= 100)
trainsize = int(trainsplit * self._size // 100)
validsize = int(validsplit * self._size // 100)
testsize = int(testsplit * self._size // 100)
trs = self._rowslice(0,trainsize)
vas = self._rowslice(trainsize,trainsize+validsize)
tes = self._rowslice(trainsize+validsize,trainsize+validsize+testsize)
self.train.X = self._X[trs,:]
self.train.Y = self._Y[trs,:]
self.train.size = self.train.X.shape[0]
self.valid.X = self._X[vas,:]
self.valid.Y = self._Y[vas,:]
self.valid.size = self.valid.X.shape[0]
self.test.X = self._X[tes,:]
self.test.Y = self._Y[tes,:]
self.test.size = self.test.X.shape[0]
def rescale(self,Xrange,Yrange):
'''
Rescales the entire dataset so that all inputs X lie within (Xrange[0],Xrange[1])
and all targets Y lie within (Yrange[0],Yrange[1]).
The same scaling factor is applied to all folds.
'''
if Xrange != self.Xrange:
for fold in self.values():
fold.X -= self.Xrange[0]
fold.X *= (Xrange[1]-Xrange[0])/maximum(1e-5,self.Xrange[1]-self.Xrange[0])
fold.X += Xrange[0]
self.Xrange = Xrange
if Yrange != self.Yrange:
for fold in self.values():
fold.Y -= self.Yrange[0]
fold.Y *= (Yrange[1]-Yrange[0])/maximum(1e-5,self.Yrange[1]-self.Yrange[0])
fold.Y += Yrange[0]
self.Yrange = Yrange
################################################
def pickle(filename,object):
f = open(filename,'wb')
cp.dump(object,f,-1)
f.close()
def unpickle(filename):
f = open(filename,'rb')
object = cp.load(f)
f.close()
return object
def load_mnist(digits=range(10),split=[50,15,35],xform=[]):
X,Y = [],[]
for d in digits:
for set in ('train','test'):
# Load all N instances of digit 'd' as a Nx768 row vector of inputs,
# and an Nx10 target vector.
Xd,Yd = unpickle("data/mnist/mnist_%s_%i.pkl" % (set,d))
X.append(Xd)
Y.append(Yd[:,digits]) # make the output dimensionality match the number of actual targets, for faster training on subsets of digits
# Apply transformations to the digitis
for f in xform:
Ximg = Xd.reshape((-1,28,28)).copy()
if f[0] > 0: Ximg[:,:,f[0]:] = Ximg[:,:,:-f[0]]; Ximg[:,:,:f[0]] = 0
elif f[0] < 0: Ximg[:,:,:f[0]] = Ximg[:,:,-f[0]:]; Ximg[:,:,f[0]:] = 0
if f[1] > 0: Ximg[:,f[1]:,:] = Ximg[:,:-f[1],:]; Ximg[:,:f[1],:] = 0
elif f[1] < 0: Ximg[:,:f[1],:] = Ximg[:,-f[1]:,:]; Ximg[:,f[1]:,:] = 0
X.append(Ximg.reshape((-1,28*28)))
Y.append(Yd[:,digits])
X = vstack(X)
Y = vstack(Y)
data = DataSet(X,Y,Xshape=(28,28),Xrange=[0.0,255.0],Yrange=[0.0,1.0],shuffle=True)
data.split(*split)
return data