#!/usr/bin/env python3
# deconv_solver.py
# own module
# from memory_controller import *
# from systolic_array import *
# from onchip_buffer import *
# public library
import cv2 as cv
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import minimize
# info for systolic array
A = 16.0 # systolic array dimension
# info for weights
K = 3.0 # kernel size;
K_arr = [] # array of sub-kernel sizes;
# input layer dimension
H = 128.0 # height
W = 256.0 # width
C = 512.0 # channel
# memory bandwidth (modeled as cycles per data element)
B = 1.0
# buffer size
buffer_size = 2.0*1024.0*1024.0
# variables for optimization
# these two are encoded as a vector x = [c_0, h_0xw_0]:
# c_0 # number of channels per batch;
# h_0xw_0 # size of tile per batch;
# calculate the latency for compute and memory;
# l_com = (K*K*c_0*h_0xw_0)/(A*A)
# # if row-major
# l_mem_r = B*(c_0*h_0xw_0 + C*h_0xw_0)
# # if channel-major
# l_mem_c = B*(c_0*h_0xw_0 + C*K*K*h_0xw_0)
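# Illustrative sketch (not part of the original flow): evaluate the latency
# model from the comments above for one candidate tile. The helper name
# estimate_latencies is hypothetical; it simply mirrors l_com, l_mem_r and
# l_mem_c as written above, using the global A, B, C and K.
def estimate_latencies(c_0, h_0xw_0):
    l_com = (K * K * c_0 * h_0xw_0) / (A * A)              # compute latency
    l_mem_r = B * (c_0 * h_0xw_0 + C * h_0xw_0)            # memory latency, row-major
    l_mem_c = B * (c_0 * h_0xw_0 + C * K * K * h_0xw_0)    # memory latency, channel-major
    return l_com, l_mem_r, l_mem_c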
#########################################################
# general process #
#########################################################
def process_parameter(x, row_major, comp_bound):
res = [math.ceil(C/x[0]), C/math.ceil(C/x[0]), \
math.ceil(W*H/x[1]), H*W/math.ceil(W*H/x[1])]
print(math.ceil(C/x[0]), C/math.ceil(C/x[0]))
print(math.ceil(W*H/x[1]), H*W/math.ceil(W*H/x[1]))
x[0] = A*math.floor(x[0]/A)
x[1] = A*math.floor(x[1]/A)
print(math.ceil(C/x[0]), C/math.ceil(C/x[0]))
print(math.ceil(W*H/x[1]), H*W/math.ceil(W*H/x[1]))
if (row_major):
total_transfer = (res[1]*res[3]+res[3]*C)*res[2]*res[0]\
+(res[1]*res[3]+K*K*C*res[3])*res[0]
else:
total_transfer = (res[1]*res[3]+K*K*C*res[1])*res[0]*res[2]\
+(res[1]*res[3]+res[3]*C)*res[2]
if comp_bound:
total_cycle = (res[0]/A*res[1]/A)*(C*K*K)*res[2]*res[3]
else:
total_cycle = total_transfer/B
print("total_transfer", total_transfer)
print("total_cycle", total_cycle)
return [res, total_transfer]
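# Illustrative usage (not part of the original flow): round a hypothetical
# candidate solution x = [c_0, h_0xw_0] to the systolic-array granularity and
# print the resulting transfer volume and cycle count. The sample values and
# the helper name _example_process_parameter are assumptions, not solver output.
def _example_process_parameter():
    x = [48.0, 1024.0]  # 48 channels per batch, 1024-pixel tile
    return process_parameter(x, True, True)  # row-major, compute-bound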
# this function is to verify whether a given hardware
# configuration can be realized within the given hardware
# constraints.
# return the result and the total transfer
# def verifier(x, row_major):
#########################################################
# general constraints #
#########################################################
# the lower bound of the buffer size
def buffer_constraint1(x):
return C*(K_arr[0]*K_arr[0]*x[0] + K_arr[1]*K_arr[1]*x[1]) \
+ (x[0]+x[1])*x[2] + C*x[2]
# the upper bound of the buffer size
def buffer_constraint2(x):
return buffer_size - ((x[0]+x[1])*x[2] + C*x[2] + \
C*(K_arr[0]*K_arr[0]*x[0] + K_arr[1]*K_arr[1]*x[1]))
# the lower bound of the buffer size
def buffer_constraint3(x):
return C*K_arr[1]*K_arr[1]*x[3] + x[3]*x[4] + C*x[4]
# the upper bound of the buffer size
def buffer_constraint4(x):
return buffer_size - (C*K_arr[1]*K_arr[1]*x[3] + x[3]*x[4] + C*x[4])
# make sure x[0] is always greater than x[1]
def var_constraint1(x):
return x[0]-x[1]
# make sure x[1] is always greater than x[2]
def var_constraint2(x):
return x[1]-x[2]
# make sure x[2] is always greater than x[3]
def var_constraint3(x):
return x[2]-x[3]
#########################################################
# row-major constraint solving obj and constraints #
#########################################################
# Attention to the following notes:
# the variables for optimization are always
# encoded as a vector x = [c_0, ..., c_n-1, h_0xw_0];
# the minimization objective of row-major compute-bound:
# since we don't consider boundary effects on the total latency,
# we try to minimize the total number of compute rounds.
def row_major_obj(x):
return (H*W/x[0] + K_arr[0]*K_arr[0]/x[2]) + \
(H*W/x[0] + K_arr[1]*K_arr[1]/x[2])*(x[1]/x[0]) + \
(H*W/x[3] + K_arr[1]*K_arr[1]/x[4])*(1-x[1]/x[0])
# make sure the load for row-major is always less than
# load for channel-major, range : [0, +inf]
def row_major_constraint1(x):
# simplified from sum{K[i]*K[i]*C*x[i]} - C*x[2]
return K_arr[0]*K_arr[0]*x[0]\
+K_arr[1]*K_arr[1]*x[1]-x[2];
# make sure the load for row-major is always less than
# load for channel-major, range : [0, +inf]
def row_major_constraint2(x):
# simplified from K*K*C*x[3] - C*x[4]
return K_arr[1]*K_arr[1]*x[3]-x[4]
# make sure the process is always memory-bound;
# range : [0, +inf]
def row_major_mem_bound_constraint1(x):
return (C+x[0]+x[1])/B - \
(K_arr[0]*K_arr[0]*x[0]+K_arr[1]*K_arr[1]*x[1])*C/(A*A)
# make sure the process is always memory-bound;
# range : [0, +inf]
def row_major_mem_bound_constraint2(x):
return (C+x[3])/B - K_arr[1]*K_arr[1]*C/(A*A)*x[3]
# make sure the process is always compute-bound;
# range : [0, +inf]
def row_major_comp_bound_constraint1(x):
return (K_arr[0]*K_arr[0]*x[0]+K_arr[1]*K_arr[1]*x[1])*C/(A*A)\
- (C+x[0]+x[1])/B
# make sure the process is always compute-bound;
# range : [0, +inf]
def row_major_comp_bound_constraint2(x):
return K_arr[1]*K_arr[1]*C/(A*A)*x[3] - (x[3]+C)/B
# The constraints related to the two sub-kernel optimization routine;
def row_major_dual_constraints():
# for row_major_constraint1
con1 = {'type': 'ineq', 'fun': row_major_constraint1}
# for row_major_comp_bound_constraint1
con2 = {'type': 'ineq', 'fun': row_major_comp_bound_constraint1}
# for the buffer_constraint
con3 = {'type': 'ineq', 'fun': buffer_constraint1}
con4 = {'type': 'ineq', 'fun': buffer_constraint2}
# add additional variable constraints
con5 = {'type': 'ineq', 'fun': var_constraint1}
return [con1, con2, con3, con4, con5]
# The constraints related to the remaining (smaller) sub-kernel optimization routine;
def row_major_rest_constraints():
# for row_major_constraint2
con6 = {'type': 'ineq', 'fun': row_major_constraint2}
# for row_major_comp_bound_constraint2
con7 = {'type': 'ineq', 'fun': row_major_comp_bound_constraint2}
# for the buffer_constraint
con8 = {'type': 'ineq', 'fun': buffer_constraint3}
con9 = {'type': 'ineq', 'fun': buffer_constraint4}
return [con6, con7, con8, con9]
# major routine for optimizing compute-bound row-major sequence;
def opti_comp_row_major():
# set the initial guess;
x0 = [A, A, A, A, A]
# Dual sub-kernel constraint optimization;
[con1, con2, con3, con4, con5] = row_major_dual_constraints();
## start to optimize the rest of the smaller sub-kernel ##
    [con6, con7, con8, con9] = row_major_rest_constraints()
    # summarize all the bounds and constraints
bnds = ((A, C), (A, C), (A, H*W), (A, C), (A, H*W))
cons = ([con1, con2, con3, con4, con5, con6, con7, con8, con9])
# call the external solver to solve the solution
solution = minimize(row_major_obj, x0, method='SLSQP',\
bounds=bnds, constraints=cons)
print("row major", solution.x, row_major_obj(solution.x))
    print(row_major_constraint1(solution.x), row_major_constraint2(solution.x))
print("buffer size", buffer_constraint1(solution.x))
print(buffer_constraint2(solution.x))
    print(row_major_comp_bound_constraint1(solution.x), row_major_comp_bound_constraint2(solution.x))
process_parameter(solution.x, True, True)
def opti_comp_row_major(C_arr=None, K_arr=None):
x0 = []
# check if optimizing regular Conv.
if C_arr is None:
# set the initial guess;
x0 = [A, A]
else:
# set the initial guess;
x0 = [A]*(len(C_arr)+1)
# for row_major_constraint1
    con1 = {'type': 'ineq', 'fun': row_major_constraint1}
    # for comp_bound_constraint
    con2 = {'type': 'ineq', 'fun': row_major_comp_bound_constraint1}
# for the buffer_constraint
con3 = {'type': 'ineq', 'fun': buffer_constraint1}
con4 = {'type': 'ineq', 'fun': buffer_constraint2}
# initial bounds and conditions constraints array.
bnds = []
cons = []
    # summarize all the bounds and constraints
    if len(x0) == 2:
        bnds = ((A, C), (A, H*W))
        cons = ([con1, con2, con3, con4])
    elif len(x0) == 3:
        # add additional variable constraints
        con5 = {'type': 'ineq', 'fun': var_constraint1}
        # summarize all the bounds and constraints
        bnds = ((A, C), (A, C), (A, H*W))
        cons = ([con1, con2, con3, con4, con5])
    elif len(x0) == 4:
        # add additional variable constraints
        con5 = {'type': 'ineq', 'fun': var_constraint1}
        con6 = {'type': 'ineq', 'fun': var_constraint2}
        # summarize all the bounds and constraints
        bnds = ((A, C), (A, C), (A, C), (A, H*W))
        cons = ([con1, con2, con3, con4, con5, con6])
    else:
        # add additional variable constraints
        con5 = {'type': 'ineq', 'fun': var_constraint1}
        con6 = {'type': 'ineq', 'fun': var_constraint2}
        con7 = {'type': 'ineq', 'fun': var_constraint3}
        # summarize all the bounds and constraints
        bnds = ((A, C), (A, C), (A, C), (A, C), (A, H*W))
        cons = ([con1, con2, con3, con4, con5, con6, con7])
# call the external solver to solve the solution
solution = minimize(row_major_obj, x0, method='SLSQP',\
bounds=bnds, constraints=cons)
print("row major", solution.x, row_major_obj(solution.x))
    print(row_major_constraint1(solution.x))
print("buffer size", buffer_constraint1(solution.x))
print(buffer_constraint2(solution.x))
    print(row_major_comp_bound_constraint1(solution.x))
process_parameter(solution.x, True, True)
def opti_mem_row_major(C_arr=None, K_arr=None):
x0 = []
# check if optimizing regular Conv.
if C_arr is None:
# set the initial guess;
x0 = [A,A]
else:
# set the initial guess;
x0 = [A]*(len(C_arr)+1)
# for row_major_constraint1
    con1 = {'type': 'ineq', 'fun': row_major_constraint1}
    # for mem_bound_constraint
    con2 = {'type': 'ineq', 'fun': row_major_mem_bound_constraint1}
# for the buffer_constraint
con3 = {'type': 'ineq', 'fun': buffer_constraint1}
con4 = {'type': 'ineq', 'fun': buffer_constraint2}
# initial bounds and conditions constraints array.
bnds = None
cons = None
    # summarize all the bounds and constraints
    if len(x0) == 2:
        bnds = ((A, C), (A, H*W))
        cons = ([con1, con2, con3, con4])
    elif len(x0) == 3:
        # add additional variable constraints
        con5 = {'type': 'ineq', 'fun': var_constraint1}
        # summarize all the bounds and constraints
        bnds = ((A, C), (A, C), (A, H*W))
        cons = ([con1, con2, con3, con4, con5])
    elif len(x0) == 4:
        # add additional variable constraints
        con5 = {'type': 'ineq', 'fun': var_constraint1}
        con6 = {'type': 'ineq', 'fun': var_constraint2}
        # summarize all the bounds and constraints
        bnds = ((A, C), (A, C), (A, C), (A, H*W))
        cons = ([con1, con2, con3, con4, con5, con6])
    else:
        # add additional variable constraints
        con5 = {'type': 'ineq', 'fun': var_constraint1}
        con6 = {'type': 'ineq', 'fun': var_constraint2}
        con7 = {'type': 'ineq', 'fun': var_constraint3}
        # summarize all the bounds and constraints
        bnds = ((A, C), (A, C), (A, C), (A, C), (A, H*W))
        cons = ([con1, con2, con3, con4, con5, con6, con7])
# call the external solver to solve the solution
solution = minimize(row_major_obj, x0, method='SLSQP',\
bounds=bnds, constraints=cons)
print("row major", solution.x, row_major_obj(solution.x))
    print(row_major_constraint1(solution.x))
print("buffer size", buffer_constraint1(solution.x))
print(buffer_constraint2(solution.x))
    print(row_major_mem_bound_constraint1(solution.x))
process_parameter(solution.x, True, False)
########################################################
# channel-major constraint solving obj and constraints #
########################################################
# the minimization objective of channel-major compute-bound:
# since we don't consider boundary effects on the total latency,
# we try to minimize the total number of compute rounds.
def channel_major_obj(x):
# simplified from H*W*C/x[0] + K*K*C*C*W*H/x[1]
return 1/x[0] + K*K*C/x[-1]
# make sure the load for channel-major is always less than
# load for row-major, range : [0, +inf]
def channel_major_constraint(x):
# simplified from C*x[1] - K*K*C*x[0]
return x[1]-K*K*x[0];
# make sure the process is always memory-bound;
# range : [0, +inf]
def channel_major_mem_bound_constraint(x):
# simplified from (x[0]*x[1] + K^2*C*x[0])/B - (K^2*C*x[1])/A^2
return (x[1]+K*K*C)/B-K*K*C*x[1]/(A*A)
# make sure the process is always compute-bound;
# range : [0, +inf]
def channel_major_comp_bound_constraint(x):
# simplified from (K^2*C*x[1])/A^2 - (x[0]*x[1] + K^2*C*x[0])/B
return K*K*C*x[1]/(A*A)-(x[1]+K*K*C)/B
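# Illustrative sketch (not part of the original flow): classify a channel-major
# tile as memory- or compute-bound by comparing the two sides of the
# constraints above. The helper name channel_major_bound is hypothetical.
def channel_major_bound(x):
    mem_cycles = (x[1] + K * K * C) / B        # transfer term used above
    comp_cycles = K * K * C * x[1] / (A * A)   # compute term used above
    return "memory-bound" if mem_cycles >= comp_cycles else "compute-bound"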
def opti_mem_channel_major(C_arr=None, K_arr=None):
# check if optimizing regular Conv.
if C_arr is None:
# set the initial guess;
x0 = [A,A]
        # for channel_major_constraint
con1 = {'type': 'ineq', 'fun': channel_major_constraint}
# for mem_bound_constraint
con2 = {'type': 'ineq', 'fun': channel_major_mem_bound_constraint}
# for the buffer_constraint
con3 = {'type': 'ineq', 'fun': buffer_constraint1}
con4 = {'type': 'ineq', 'fun': buffer_constraint2}
        # summarize all the bounds and constraints
bnds = ((A, C), (A, H*W))
cons= ([con1, con2, con3, con4])
# call the external solver to solve the solution
solution = minimize(channel_major_obj,x0,method='SLSQP',\
bounds=bnds,constraints=cons)
print("channel major",solution.x, channel_major_obj(solution.x))
print(channel_major_constraint(solution.x))
print("buffer size", buffer_constraint1(solution.x))
print(buffer_constraint2(solution.x))
print(channel_major_mem_bound_constraint(solution.x))
process_parameter(solution.x, False, False)
def opti_comp_channel_major(C_arr=None, K_arr=None):
if C_arr is None:
# set the initial guess;
x0 = [A,A]
        # for channel_major_constraint
        con1 = {'type': 'ineq', 'fun': channel_major_constraint}
        # for comp_bound_constraint
        con2 = {'type': 'ineq', 'fun': channel_major_comp_bound_constraint}
# for the buffer_constraint
con3 = {'type': 'ineq', 'fun': buffer_constraint1}
con4 = {'type': 'ineq', 'fun': buffer_constraint2}
        # summarize all the bounds and constraints
bnds = ((A, C), (A, H*W))
cons= ([con1, con2, con3, con4])
# call the external solver to solve the solution
solution = minimize(channel_major_obj,x0,method='SLSQP',\
bounds=bnds,constraints=cons)
print("channel major",solution.x, channel_major_obj(solution.x))
print(channel_major_constraint(solution.x))
print("buffer size", buffer_constraint1(solution.x))
print(buffer_constraint2(solution.x))
        print(channel_major_comp_bound_constraint(solution.x))
process_parameter(solution.x, False, True)
else:
return
def opti_comp_channel_major(C_arr=None, K_arr=None):
# set the initial guess;
x0 = [A, A, A]
    # for channel_major_constraint
    con1 = {'type': 'ineq', 'fun': channel_major_constraint}
    # for comp_bound_constraint
    con2 = {'type': 'ineq', 'fun': channel_major_comp_bound_constraint}
# for the buffer_constraint
con3 = {'type': 'ineq', 'fun': buffer_constraint1}
con4 = {'type': 'ineq', 'fun': buffer_constraint2}
    # summarize all the bounds and constraints (three variables in x0)
    bnds = ((A, C), (A, C), (A, H*W))
cons= ([con1, con2, con3, con4])
# call the external solver to solve the solution
solution = minimize(channel_major_obj,x0,method='SLSQP',\
bounds=bnds,constraints=cons)
print("channel major",solution.x, channel_major_obj(solution.x))
print(channel_major_constraint(solution.x))
print("buffer size", buffer_constraint1(solution.x))
print(buffer_constraint2(solution.x))
    print(channel_major_comp_bound_constraint(solution.x))
process_parameter(solution.x, False, True)
def opti_mem(C_arr=None, K_arr=None):
print("==================================================================")
print("======================== Memory Bound ==========================")
# optimization for row-major;
opti_mem_row_major(C_arr, K_arr);
# optimization for channel-major;
opti_mem_channel_major(C_arr, K_arr);
print("==================================================================\n")
def opti_comp(C_arr=None, K_arr=None):
print("==================================================================")
print("======================== Compute Bound =========================")
# optimization for row-major;
opti_comp_row_major(C_arr, K_arr);
# optimization for channel-major;
opti_comp_channel_major(C_arr, K_arr);
print("==================================================================\n")
#########################################################
# general routines #
#########################################################
def opti_deconv(K_arr):
    # initialize an array of channel counters with the same length as K_arr
C_arr = [C]*len(K_arr)
res = []
    while (len(C_arr) != 0):
        # optimize on both cases;
        opti_mem(C_arr, K_arr)
        opti_comp(C_arr, K_arr)
        # C_arr is never consumed inside the loop, so stop after one pass
        # to avoid looping forever;
        break
    return
def optimizeLayer(height, width, channel, w_number):
opti_mem()
return
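# Illustrative sketch (not part of the original flow): list the sub-kernel
# sizes that the dispatch below builds for a given kernel size k. The helper
# name sub_kernel_sizes is hypothetical; for k == 3 it reproduces the
# [(2,2), (2,1), (1,2), (1,1)] decomposition used below.
def sub_kernel_sizes(k):
    if k == 3:
        return [(2, 2), (2, 1), (1, 2), (1, 1)]
    if k % 2 == 0:
        return [(k // 2, k // 2)]   # even kernels: one repeated sub-kernel size
    return [((k + 1) // 2, (k + 1) // 2), ((k - 1) // 2, (k - 1) // 2)]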
if K == 3:
K_arr = [(2,2),(2,1),(1,2),(1,1)]
# enter optimize deconv routine;
opti_deconv(K_arr)
# done!!
else:
if K % 2 == 0:
sub_K = K/2 # optimize regular Conv layer
# if it is possible to be memory-bound only;
if (sub_K*sub_K*C)/(A*A) < B or \
B/((sub_K*sub_K*C)/(A*A) - B) > 1:
opti_mem() # only possible to be memory-bound;
else:
# both cases are possible;
opti_mem()
opti_comp()
# done with optimization with same size sub-kernel;
else:
# first split the kernel into two different sub-kernels;
K_arr = [((K+1)/2, (K+1)/2), ((K-1)/2, (K-1)/2)]
# enter optimize deconv routine;
opti_deconv(K_arr)
# done with optimize deconv with different size sub-kernels;
if __name__ == '__main__':
optimizeLayer(H, W, C, C)