loader.py
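"""Build windowed, labeled datasets from benchmark timeseries.

Each fork's timeseries is sliced into sliding windows of WINDOW_SIZE
measurements; every window is standardized and labeled 1 if it starts
after the fork's steady-state point, 0 otherwise. Benchmarks are
processed in parallel and concatenated into a single DataFrame.
"""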
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view

from constants import WINDOW_SIZE
from util import (
    foreach_fork,
    get_benchmark_list,
    get_light_benchmark_list,
    load_classification,
    load_timeseries,
)


# Rebalance the datapoints of a single fork.
def _resample(df):
    # Keep 100 segments per fork (50 per class).
    sample_size_per_class = 50
    idx_per_class = {}
    for class_ in [0, 1]:
        # Spread the sample evenly over the starts_at range of this class.
        starts_at_min = df[df.y == class_].starts_at.min()
        starts_at_max = df[df.y == class_].starts_at.max()
        idx = np.linspace(starts_at_min, starts_at_max, sample_size_per_class, dtype=int)
        idx_per_class[class_] = np.unique(idx)
    for class_, idx in idx_per_class.items():
        # If one class yields fewer than 50 unique indices, take the
        # shortfall from the other class so the fork still contributes
        # 100 segments in total.
        if len(idx) < sample_size_per_class:
            other_class = 1 - class_
            sample_size = 2 * sample_size_per_class - len(idx)
            starts_at_min = df[df.y == other_class].starts_at.min()
            starts_at_max = df[df.y == other_class].starts_at.max()
            idx_per_class[other_class] = np.linspace(starts_at_min, starts_at_max, sample_size, dtype=int)
    # Keep only the rows whose starts_at was selected for its class.
    mask0 = (df.y == 0) & (df.starts_at.isin(idx_per_class[0]))
    mask1 = (df.y == 1) & (df.starts_at.isin(idx_per_class[1]))
    return df[mask0 | mask1]


# Rebalance the dataset by resampling each fork independently.
def resample(df):
    return df.groupby(['benchmark_id', 'no_fork'], group_keys=False).apply(_resample)


def process_fork(i, ts, st):
    # Slice the fork's timeseries into overlapping windows and label each one.
    rows = []
    windows = sliding_window_view(ts, WINDOW_SIZE)
    for starts_at, window in enumerate(windows):
        end_at = starts_at + WINDOW_SIZE - 1
        # A window is labeled 1 when the fork reaches a steady state
        # (st >= 0) and the window starts after that point; otherwise 0.
        clas = 0
        if (0 <= st < len(ts)) and (starts_at > st):
            clas = 1
        # Scale values without losing precision
        window = window * 10**6
        # Standardize data
        window = (window - window.mean()) / window.std()
        # Handle cases where std is equal to 0
        window[np.isnan(window) | np.isinf(window)] = 0
        row = [i, starts_at, end_at, *window.tolist(), clas]
        rows.append(row)
    return rows


def process_benchmark(benchmark, steady_state_only=False):
    print(f'Processing {benchmark} ...')
    timeseries = load_timeseries(benchmark)
    classification = load_classification(benchmark)
    rows = []
    for i, ts, st in foreach_fork(timeseries, classification):
        # Skip forks that never reach a steady state if steady_state_only is set
        if steady_state_only and st == -1:
            continue
        rows += process_fork(i, ts, st)
    x_columns = ['x{}'.format(i) for i in range(WINDOW_SIZE)]
    columns = ['no_fork', 'starts_at', 'end_at'] + x_columns + ['y']
    df = pd.DataFrame(rows, columns=columns)
    df['benchmark_id'] = benchmark
    print(f'Finished processing {benchmark}')
    return df


def load_dataset(steady_state_only=False, sort=False, stratify=False):
    # Process all benchmarks in parallel, one worker task per benchmark.
    with Pool() as pool:
        fn = partial(process_benchmark, steady_state_only=steady_state_only)
        dfs = pool.map(fn, get_benchmark_list())
    df = pd.concat(dfs, ignore_index=True)
    if stratify:
        df = resample(df)
    if sort:
        df.sort_values(['benchmark_id', 'no_fork', 'starts_at'], inplace=True)
    return df


def lightload_dataset(steady_state_only=False, sort=False, stratify=False, cut=10):
    # Same as load_dataset, but restricted to a reduced benchmark list.
    with Pool() as pool:
        fn = partial(process_benchmark, steady_state_only=steady_state_only)
        dfs = pool.map(fn, get_light_benchmark_list(cut=cut))
    df = pd.concat(dfs, ignore_index=True)
    if stratify:
        df = resample(df)
    if sort:
        df.sort_values(['benchmark_id', 'no_fork', 'starts_at'], inplace=True)
    return df
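

# Illustrative usage sketch (an addition, not part of the original module):
# build the reduced dataset, stratified and sorted, then inspect its shape
# and class balance. Assumes the util helpers can locate the benchmark data.
if __name__ == '__main__':
    df = lightload_dataset(steady_state_only=False, sort=True, stratify=True)
    print(df.shape)
    print(df['y'].value_counts())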