Commit
Merge branch 'master' into update-python
jrzkaminski authored Oct 11, 2024
2 parents cbed6f1 + 938753c commit 0f1a1ed
Showing 7 changed files with 116 additions and 64 deletions.
1 change: 1 addition & 0 deletions bamt/networks/base.py
@@ -597,6 +597,7 @@ def fit_parameters(self, data: pd.DataFrame, n_jobs: int = 1):
"""
Base function for parameter learning
"""
data = data.copy()
if data.isnull().values.any():
logger_network.error("Dataframe contains NaNs.")
return
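
The added data.copy() means parameter learning no longer mutates the caller's DataFrame. A minimal sketch of the behavior this guards against, with a hypothetical in-place transform standing in for whatever preprocessing follows the NaN check:

    import pandas as pd

    def fit_params_sketch(data: pd.DataFrame) -> pd.DataFrame:
        data = data.copy()                 # same guard as the change above
        data["x"] = data["x"].fillna(0.0)  # hypothetical downstream transform
        return data

    df = pd.DataFrame({"x": [1.0, None]})
    fit_params_sketch(df)
    assert df["x"].isna().any()            # the caller's frame is untouched
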
6 changes: 3 additions & 3 deletions bamt/nodes/conditional_gaussian_node.py
@@ -7,7 +7,7 @@
from pandas import DataFrame
from sklearn import linear_model
from sklearn.base import clone
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import root_mean_squared_error as rmse

from .base import BaseNode
from .schema import CondGaussParams
@@ -51,8 +51,8 @@ def fit_parameters(self, data: DataFrame) -> Dict[str, Dict[str, CondGaussParams
new_data[self.cont_parents].values, new_data[self.name].values
)
predicted_value = model.predict(new_data[self.cont_parents].values)
variance = mse(
new_data[self.name].values, predicted_value, squared=False
variance = rmse(
new_data[self.name].values, predicted_value
)
hycprob[str(key_comb)] = {
"variance": variance,
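
Context for the metric swap here and in gaussian_node.py below: scikit-learn 1.4 added root_mean_squared_error and deprecated mean_squared_error's squared=False flag, so the scikit-learn>=1.2.0 pin in requirements.txt likely needs a matching bump. A quick equivalence check with illustrative values:

    import numpy as np
    from sklearn.metrics import mean_squared_error, root_mean_squared_error

    y_true = np.array([1.0, 2.0, 3.0])
    y_pred = np.array([1.1, 1.9, 3.2])

    old = mean_squared_error(y_true, y_pred) ** 0.5  # former squared=False path
    new = root_mean_squared_error(y_true, y_pred)    # scikit-learn >= 1.4
    assert np.isclose(old, new)
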
4 changes: 2 additions & 2 deletions bamt/nodes/gaussian_node.py
@@ -5,7 +5,7 @@
import numpy as np
from pandas import DataFrame
from sklearn import linear_model
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import root_mean_squared_error as rmse

from .base import BaseNode
from .schema import GaussianParams
@@ -30,7 +30,7 @@ def fit_parameters(self, data: DataFrame, **kwargs) -> GaussianParams:
if parents:
self.regressor.fit(data[parents].values, data[self.name].values, **kwargs)
predicted_value = self.regressor.predict(data[parents].values)
variance = mse(data[self.name].values, predicted_value, squared=False)
variance = rmse(data[self.name].values, predicted_value)
return {
"mean": np.nan,
"regressor_obj": self.regressor,
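
Worth noting for both node types: the value stored under "variance" is the residual RMSE, i.e. a standard deviation. A hedged sketch of how such a value serves as the scale of a node's conditional normal, the same pattern composite_metric uses below; the variable names are illustrative, not BAMT's API:

    import numpy as np
    from scipy.stats import norm

    rng = np.random.default_rng(0)
    mean_prediction = 2.0  # hypothetical regressor output for one row
    residual_rmse = 0.5    # the value stored under "variance"

    sample = rng.normal(loc=mean_prediction, scale=residual_rmse)        # sampling
    loglik = norm.logpdf(2.3, loc=mean_prediction, scale=residual_rmse)  # scoring
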
161 changes: 106 additions & 55 deletions bamt/utils/composite_utils/CompositeGeneticOperators.py
@@ -5,9 +5,9 @@
from golem.core.dag.graph_utils import ordered_subnodes_hierarchy
from numpy import std, mean, log
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

import numpy as np
from .CompositeModel import CompositeModel
from .MLUtils import MlModels

@@ -127,63 +127,114 @@ def custom_mutation_add_model(graph: CompositeModel, **kwargs):
return graph


def composite_metric(graph: CompositeModel, data: pd.DataFrame, percent=0.02):
data_all = data
data_train, data_test = train_test_split(data_all, train_size=0.8, random_state=42)
score, len_data = 0, len(data_train)

def composite_metric(graph: CompositeModel, data: pd.DataFrame):
data_train, data_test = train_test_split(data, train_size=0.8, random_state=42)
score = 0
len_data = len(data_train)

for node in graph.nodes:
data_of_node_train = data_train[node.content["name"]]
data_of_node_test = data_test[node.content["name"]]
if node.nodes_from is None or node.nodes_from == []:
if node.content["type"] == "cont":
mu, sigma = mean(data_of_node_train), std(data_of_node_train)
score += norm.logpdf(
data_of_node_test.values, loc=mu, scale=sigma
).sum()
node_name = node.content["name"]
node_type = node.content["type"]

data_of_node_train = data_train[node_name]
data_of_node_test = data_test[node_name]
index_test_dict = {label: idx for idx, label in enumerate(sorted(data_train[node_name].unique()))}

if not node.nodes_from:
if node_type == "cont":
mu, sigma = data_of_node_train.mean(), data_of_node_train.std()
score += norm.logpdf(data_of_node_test, loc=mu, scale=sigma).sum()
else:
count = data_of_node_train.value_counts()
frequency = log(count / len_data)
index = frequency.index.tolist()
for value in data_of_node_test:
if value in index:
score += frequency[value]
frequency = np.log(count / len_data)
# unseen categories get the log of the probability floor, not the raw floor
score += data_of_node_test.map(frequency).fillna(np.log(1e-7)).sum()
else:
model, columns, target, idx = (
MlModels().dict_models[node.content["parent_model"]](),
[n.content["name"] for n in node.nodes_from],
data_of_node_train.to_numpy(),
data_train.index.to_numpy(),
)
setattr(model, "max_iter", 100000)
features = data_train[columns].to_numpy()
if len(set(target)) == 1:
parent_model = MlModels().dict_models[node.content["parent_model"]]
model = parent_model()
model.max_iter = 100000

columns = [n.content["name"] for n in node.nodes_from]
features_train = data_train[columns].to_numpy()
target_train = data_of_node_train.to_numpy()

if len(set(target_train)) == 1:
continue
fitted_model = model.fit(features, target)

features = data_test[columns].to_numpy()
target = data_of_node_test.to_numpy()
if node.content["type"] == "cont":
predict = fitted_model.predict(features)
mse = mean_squared_error(target, predict, squared=False) + 0.0000001
a = norm.logpdf(target, loc=predict, scale=mse)
score += a.sum()
else:
predict_proba = fitted_model.predict_proba(features)
idx = pd.array(list(range(len(target))))
li = []

for i in idx:
a = predict_proba[i]
try:
b = a[target[i]]
except BaseException:
b = 0.0000001
if b < 0.0000001:
b = 0.0000001
li.append(log(b))
score += sum(li)

edges_count = len(graph.get_edges())
score -= (edges_count * percent) * log10(len_data) * edges_count

fitted_model = model.fit(features_train, target_train)

features_test = data_test[columns].to_numpy()
target_test = data_of_node_test.to_numpy()

if node_type == "cont":
predictions = fitted_model.predict(features_test)
rmse = root_mean_squared_error(target_test, predictions) + 1e-7
score += norm.logpdf(target_test, loc=predictions, scale=rmse).sum()

else:
predict_proba = fitted_model.predict_proba(features_test)
# labels unseen in training fall back to the probability floor instead of raising KeyError
probas = [
predict_proba[i, index_test_dict[x]] if x in index_test_dict else 1e-7
for i, x in enumerate(target_test)
]
score += np.log(np.maximum(probas, 1e-7)).sum()

return -score
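
A minimal sketch of the vectorized root-node scoring used above, assuming pandas Series of train and test labels; the fill value is the log of the same 1e-7 probability floor applied in the discrete-child branch:

    import numpy as np
    import pandas as pd

    train = pd.Series(["a", "a", "b", "c"])
    test = pd.Series(["a", "b", "d"])  # "d" never appears in training

    log_freq = np.log(train.value_counts() / len(train))
    score = test.map(log_freq).fillna(np.log(1e-7)).sum()
    # log(2/4) + log(1/4) + log(1e-7) ~= -18.2
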

# def composite_metric(graph: CompositeModel, data: pd.DataFrame, percent=0.02):
# data_all = data
# data_train, data_test = train_test_split(data_all, train_size=0.8, random_state=42)
# score, len_data = 0, len(data_train)
# for node in graph.nodes:
# data_of_node_train = data_train[node.content["name"]]
# data_of_node_test = data_test[node.content["name"]]
# if node.nodes_from is None or node.nodes_from == []:
# if node.content["type"] == "cont":
# mu, sigma = mean(data_of_node_train), std(data_of_node_train)
# score += norm.logpdf(
# data_of_node_test.values, loc=mu, scale=sigma
# ).sum()
# else:
# count = data_of_node_train.value_counts()
# frequency = log(count / len_data)
# index = frequency.index.tolist()
# for value in data_of_node_test:
# if value in index:
# score += frequency[value]
# else:
# model, columns, target, idx = (
# MlModels().dict_models[node.content["parent_model"]](),
# [n.content["name"] for n in node.nodes_from],
# data_of_node_train.to_numpy(),
# data_train.index.to_numpy(),
# )
# setattr(model, "max_iter", 100000)
# features = data_train[columns].to_numpy()
# if len(set(target)) == 1:
# continue
# fitted_model = model.fit(features, target)

# features = data_test[columns].to_numpy()
# target = data_of_node_test.to_numpy()
# if node.content["type"] == "cont":
# predict = fitted_model.predict(features)
# mse = mean_squared_error(target, predict, squared=False) + 0.0000001
# a = norm.logpdf(target, loc=predict, scale=mse)
# score += a.sum()
# else:
# predict_proba = fitted_model.predict_proba(features)
# idx = pd.array(list(range(len(target))))
# li = []

# for i in idx:
# a = predict_proba[i]
# try:
# b = a[target[i]]
# except BaseException:
# b = 0.0000001
# if b < 0.0000001:
# b = 0.0000001
# li.append(log(b))
# score += sum(li)

# # edges_count = len(graph.get_edges())
# # score -= (edges_count * percent) * log10(len_data) * edges_count

# return -score
2 changes: 1 addition & 1 deletion docs/source/examples/learn_sampling_predict.rst
@@ -13,7 +13,7 @@ Used imports:
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
numpy==1.26.4
matplotlib==3.6.2
pandas==2.0.3
pandas>=2.0.3
gmr==1.6.2
scikit-learn>=1.2.0
scipy>=1.9.3
4 changes: 2 additions & 2 deletions tests/MainTest.py
@@ -82,7 +82,7 @@
# print(cont_bn.weights)
# print(cont_bn2.weights)
# print('RMSE on predicted values with continuous data: ' +
# f'{mse(cont_target, cont_predicted_values, squared=False)}')
# f'{rmse(cont_target, cont_predicted_values)}')
# print(cont_bn.get_info())
# print(cont_bn2.get_info())
# print(synth_cont_data)
@@ -116,7 +116,7 @@
# print(hybrid_bn2.weights)
# print(hybrid_bn3.weights)
# print('RMSE on predicted values with hybrid data: ' +
# f'{mse(hybrid_target, hybrid_predicted_values, squared=False)}')
# f'{rmse(hybrid_target, hybrid_predicted_values)}')
# print(hybrid_bn.get_info())
# print(hybrid_bn2.get_info())
# print(hybrid_bn3.get_info())
