-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline.py
107 lines (80 loc) · 3.91 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import pandas as pd
from catboost import CatBoostClassifier
from main import convert_np_types
from parsing import read_data, get_files_in_folder
from preprocess import parse_file
import time
import functools
def timer(func):
    """Decorate *func* so that each call prints its elapsed run time.

    The wrapped function's return value is passed through unchanged, and
    ``functools.wraps`` keeps its name and docstring. The timing line is
    printed only when *func* returns normally; if it raises, the exception
    propagates and nothing is printed.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock
        # is adjusted while the function is running.
        started = time.perf_counter()  # start of the measurement
        result = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - started  # end of the measurement
        print(f"Функция {func.__name__} выполнена за {elapsed_time:.4f} секунд")
        return result
    return wrapper
@functools.lru_cache(maxsize=None)
def _load_catboost_models(models_path):
    """Load every model file found in *models_path*, once per directory.

    Returns a tuple of ``(model_filename, model)`` pairs in the order given
    by ``get_files_in_folder``. The result is cached so that processing many
    input files does not reload the same models from disk for each file.
    """
    loaded = []
    for model_filename in get_files_in_folder(models_path):
        model = CatBoostClassifier()
        model.load_model(os.path.join(models_path, model_filename))
        loaded.append((model_filename, model))
    return tuple(loaded)


def validate_file(
    file_path,
    models_path=r"C:/Users/selin/PycharmProjects/SIAM_Hack/case1/models/catboost",
):
    """Build the feature row for one data file and run every model on it.

    Args:
        file_path: Path of the input file, passed to ``read_data``.
        models_path: Directory holding the CatBoost model files. Defaults to
            the location that was previously hard-coded here.

    Returns:
        A dict mapping each model file name to that model's prediction for
        this file, or ``None`` when the data read from *file_path* has fewer
        than 20 entries or ``parse_file`` returns ``None``.
    """
    df = read_data(file_path)
    if len(df) < 20:
        return None

    parsed = parse_file(df)
    if parsed is None:
        return None
    source = convert_np_types(parsed)

    # Column order matters: it must stay identical to the order in which the
    # features were laid out before (interleaved polynomial coefficients,
    # then both MSE values, then the deltaP chunks, then the p_prime chunks).
    features = {}
    for i in range(4):
        features[f'poly_coefs_deltaP_x{i + 1}'] = source['poly_coefs_deltaP'][i]
        features[f'poly_coefs_p_prime_x{i + 1}'] = source['poly_coefs_p_prime'][i]
    features['mse_deltaP'] = source['mse_deltaP']
    features['mse_pprime'] = source['mse_pprime']
    for group in ('chunks_deltaP', 'chunks_p_prime'):
        for key, values in source[group].items():
            for i, value_i in enumerate(values):
                features[f'{group}_{key}_{i}'] = value_i

    # A one-element list of dicts yields an explicit single-row frame. The
    # previous code only worked because two stray trailing commas turned the
    # MSE values into 1-tuples, which happened to give pandas a row count.
    processed_data = pd.DataFrame([features])

    return {
        model_filename: model.predict(processed_data)[0]
        for model_filename, model in _load_catboost_models(models_path)
    }
@timer
def do_validate(folder_path=r"C:\Users\selin\PycharmProjects\SIAM_Hack\case1\datasets\validation 1\validation 1",
                example_submit_path=r"C:\Users\selin\PycharmProjects\SIAM_Hack\case1\datasets\example_submit.csv"):
    """Run ``validate_file`` on every file in a folder and build a submission table.

    Args:
        folder_path: Folder whose files (as listed by ``get_files_in_folder``)
            are validated one by one.
        example_submit_path: CSV file whose ``file_name`` column defines the
            rows of the returned table.

    Returns:
        A DataFrame containing the ``file_name`` column of the example
        submission, left-merged with the per-file predictions.
    """
    # Row used when validate_file yields nothing for a file: every label is 0
    # except 'Некачественное ГДИС', which is set to 1.
    fallback_labels = {
        'Билинейный режим.cbm': 0, 'Влияние ствола скважины.cbm': 0,
        'Граница непроницаемый разлом.cbm': 0, 'Граница постоянного давления.cbm': 0,
        'Линейный режим.cbm': 0, 'Некачественное ГДИС.cbm': 1, 'Радиальный режим.cbm': 0,
        'Сферический режим.cbm': 0,
    }

    names = get_files_in_folder(folder_path)
    template = pd.read_csv(example_submit_path, sep=',', encoding='utf-8')

    rows = []
    for index, name in enumerate(names):
        # Progress message on every tenth file.
        if index % 10 == 0:
            print(f"Обработка файла {index}")

        scores = validate_file(os.path.join(folder_path, name))
        if scores is None:
            # 'file_name' goes first here, matching the original key order.
            scores = {'file_name': name, **fallback_labels}
        else:
            scores['file_name'] = name

        # Strip the model-file extension so keys become plain label names.
        rows.append({label.replace('.cbm', ''): score for label, score in scores.items()})

    predictions_table = pd.DataFrame(rows)
    return template[['file_name']].merge(
        predictions_table, how='left', left_on='file_name', right_on='file_name'
    )
if __name__ == '__main__':
    import datetime

    # Run the validation first; the timestamp in the output file name is
    # taken afterwards, i.e. when the run has finished.
    submission = do_validate()
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    submission.to_csv(f"submit_{stamp}.csv", index=False, encoding="utf-8")