Skip to content

Commit

Permalink
finished decision tree multivariate LSx
Browse files Browse the repository at this point in the history
  • Loading branch information
kaiguender committed Aug 27, 2024
1 parent 765184c commit ebbbb19
Show file tree
Hide file tree
Showing 12 changed files with 137 additions and 124 deletions.
49 changes: 20 additions & 29 deletions _proc/02_levelSetKDEx_multivariate.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,7 @@
"metadata": {
"language": "python"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"outputs": [],
"source": []
},
{
Expand Down Expand Up @@ -244,7 +235,7 @@
"*[`LevelSetKDEx`](https://kaiguender.github.io/dddex/levelsetkdex_univariate.html#levelsetkdex) turns any point forecasting model into an estimator of the underlying conditional density.\n",
"The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts\n",
"as a similarity measure between samples. \n",
"TBD*\n",
"TBD.*\n",
"\n",
"| | **Type** | **Default** | **Details** |\n",
"| -- | -------- | ----------- | ----------- |\n",
Expand All @@ -264,7 +255,7 @@
"*`LevelSetKDEx` turns any point forecasting model into an estimator of the underlying conditional density.\n",
"The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts\n",
"as a similarity measure between samples. \n",
"TBD*\n",
"TBD.*\n",
"\n",
"| | **Type** | **Default** | **Details** |\n",
"| -- | -------- | ----------- | ----------- |\n",
Expand Down Expand Up @@ -303,7 +294,7 @@
"text/markdown": [
"---\n",
"\n",
"[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L612){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L636){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"### LevelSetKDEx_multivariate_bin\n",
"\n",
Expand Down Expand Up @@ -331,7 +322,7 @@
"text/plain": [
"---\n",
"\n",
"[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L612){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L636){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"### LevelSetKDEx_multivariate_bin\n",
"\n",
Expand Down Expand Up @@ -410,9 +401,20 @@
"# unstacked = True,\n",
"# returnXY = True)\n",
"\n",
"# # duplicate XTrain and yTrain\n",
"# XTrain = np.vstack([XTrain, XTrain])\n",
"# yTrain = np.vstack([yTrain, yTrain])"
"# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n",
"# RF.fit(X = XTrain, y = yTrain)\n",
"\n",
"# # Duplicate XTrain and yTrain m times\n",
"# m = 1000\n",
"# XTrain = np.vstack([XTrain for i in range(m)])\n",
"# yTrain = np.vstack([yTrain for i in range(m)])\n",
"\n",
"# print(XTrain.shape)\n",
"# print(yTrain.shape)\n",
"\n",
"# # Add gaussian to XTrain and yTrain\n",
"# XTrain = XTrain + np.random.normal(0, 0.1, XTrain.shape)\n",
"# yTrain = yTrain + np.random.normal(0, 0.1, yTrain.shape)"
]
},
{
Expand All @@ -422,19 +424,8 @@
"metadata": {
"language": "python"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING clustering 1446 points to 100 centroids: please provide at least 3900 training points\n"
]
}
],
"outputs": [],
"source": [
"# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n",
"# RF.fit(X = XTrain, y = yTrain)\n",
"\n",
"# LSKDEx = LevelSetKDEx_multivariate_opt(estimator = RF, nClusters = 100, minClusterSize = 20)\n",
"# LSKDEx.fit(X = XTrain, y = yTrain)\n",
"\n",
Expand Down
Binary file modified dddex/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file modified dddex/__pycache__/_modidx.cpython-39.pyc
Binary file not shown.
Binary file modified dddex/__pycache__/baseClasses.cpython-39.pyc
Binary file not shown.
Binary file modified dddex/__pycache__/crossValidation.cpython-39.pyc
Binary file not shown.
Binary file modified dddex/__pycache__/levelSetKDEx_multivariate.cpython-39.pyc
Binary file not shown.
Binary file modified dddex/__pycache__/levelSetKDEx_univariate.cpython-39.pyc
Binary file not shown.
Binary file modified dddex/__pycache__/loadData.cpython-39.pyc
Binary file not shown.
Binary file modified dddex/__pycache__/utils.cpython-39.pyc
Binary file not shown.
Binary file modified dddex/__pycache__/wSAA.cpython-39.pyc
Binary file not shown.
80 changes: 52 additions & 28 deletions dddex/levelSetKDEx_multivariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ class LevelSetKDEx_DT(BaseWeightsBasedEstimator_multivariate, BaseLSx):
`LevelSetKDEx` turns any point forecasting model into an estimator of the underlying conditional density.
The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts
as a similarity measure between samples.
TBD
TBD.
"""

def __init__(self,
Expand All @@ -505,11 +505,24 @@ def __init__(self,

super(BaseEstimator, self).__init__(estimator = estimator)

# Check if binSize is integer
if not isinstance(binSize, (int, np.int32, np.int64)):
raise ValueError("'binSize' must be an integer!")
# Check if max_depth is integer
if not isinstance(max_depth, (int, np.int32, np.int64)):
raise ValueError("'max_depth' must be an integer!")

# Check if max_depth is bigger than 0
if max_depth <= 0:
raise ValueError("'max_depth' must be bigger than 0!")

# Check if min_samples_leaf is integer or float
if not isinstance(min_samples_leaf, (int, np.int32, np.int64, float, np.float32, np.float64)):
raise ValueError("'min_samples_leaf' must be an integer or float!")

# Check if min_samples_leaf is bigger than 0
if min_samples_leaf <= 0:
raise ValueError("'min_samples_leaf' must be bigger than 0!")

self.binSize = binSize
self.max_depth = max_depth
self.min_samples_leaf = min_samples_leaf

self.yTrain = None
self.yPredTrain = None
Expand All @@ -518,7 +531,7 @@ def __init__(self,

#---

def fit(self: LevelSetKDEx_DRF,
def fit(self: LevelSetKDEx_DT,
X: np.ndarray, # Feature matrix used by `estimator` to predict `y`.
y: np.ndarray, # 1-dimensional target variable corresponding to the feature matrix `X`.
):
Expand All @@ -528,13 +541,17 @@ def fit(self: LevelSetKDEx_DRF,
`binSize` many samples. For details, check out the function `generateBins`, which does the
heavy lifting.
"""

# Check if max_depth is integer
if not isinstance(self.max_depth, (int, np.int32, np.int64)):
raise ValueError("'max_depth' must be an integer!")

# Checks
if not isinstance(self.binSize, (int, np.int32, np.int64)):
raise ValueError("'binSize' must be an integer!")
# Check if min_samples_leaf is integer or float
if not isinstance(self.min_samples_leaf, (int, np.int32, np.int64, float, np.float32, np.float64)):
raise ValueError("'min_samples_leaf' must be an integer or float!")

if self.binSize > y.shape[0]:
raise ValueError("'binSize' mustn't be bigger than the size of 'y'!")
if self.min_samples_leaf > y.shape[0]:
raise ValueError("'min_samples_leaf' mustn't be bigger than the size of 'y'!")

if X.shape[0] != y.shape[0]:
raise ValueError("'X' and 'y' must contain the same number of samples!")
Expand All @@ -554,11 +571,10 @@ def fit(self: LevelSetKDEx_DRF,

#---

yPred = pd.DataFrame(yPred)
y = pd.Series(y)
tree = DecisionTreeRegressor(max_depth = self.max_depth, min_samples_leaf = self.min_samples_leaf)

DRF = drf(min_node_size = self.binSize, num_trees = 100, num_features = 1, honesty = False, sample_fraction = 0.5, response_scaling = False, mtry = 1, num_threads = 16)
DRF.fit(X = yPred, Y = y)
tree.fit(X = yPred, y = y)
leafIndicesTrain = tree.apply(yPred)

#---

Expand All @@ -567,12 +583,13 @@ def fit(self: LevelSetKDEx_DRF,
self.yTrain = y.ravel()

self.yPredTrain = yPred
self.drf = DRF
self.tree = tree
self.leafIndicesTrain = leafIndicesTrain
self.fitted = True

#---

def getWeights(self: LevelSetKDEx_DRF,
def getWeights(self: LevelSetKDEx_DT,
X: np.ndarray, # Feature matrix for which conditional density estimates are computed.
# Specifies structure of the returned density estimates. One of:
# 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'
Expand All @@ -583,7 +600,7 @@ def getWeights(self: LevelSetKDEx_DRF,
) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.

# __annotations__ = BaseWeightsBasedEstimator.getWeights.__annotations__
__doc__ = BaseWeightsBasedEstimator.getWeights.__doc__
__doc__ = BaseWeightsBasedEstimator_multivariate.getWeights.__doc__

if not self.fitted:
raise NotFittedError("This LevelSetKDEx instance is not fitted yet. Call 'fit' with "
Expand All @@ -592,17 +609,24 @@ def getWeights(self: LevelSetKDEx_DRF,
#---

yPred = self.estimator.predict(X)
yPred = pd.DataFrame(yPred)
leafIndicesTest = self.tree.apply(yPred)

weightsDataList = []

for leafIndex in leafIndicesTest:
leafComparison = (self.leafIndicesTrain == leafIndex) * 1
nObsInSameLeaf = np.sum(leafComparison)
weights = leafComparison / nObsInSameLeaf

weightsDataList.append((weights[weights > 0], np.where(weights > 0)[0]))

weightsArray = self.drf.predict(yPred).weights
weightsList = list(weightsArray)
weightsDataList = [(weights[weights > 0], np.where(weights > 0)[0]) for weights in weightsList]
#---

weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList,
outputType = outputType,
y = self.yTrain,
scalingList = scalingList,
equalWeights = True)
weightsDataList = restructureWeightsDataList_multivariate(weightsDataList = weightsDataList,
outputType = outputType,
y = self.yTrain,
scalingList = scalingList,
equalWeights = True)

return weightsDataList

Expand Down Expand Up @@ -633,7 +657,7 @@ def __init__(self,

super(BaseEstimator, self).__init__(estimator = estimator)

# Check if binSize is int
# Check if nBinsPerDim is int
if not isinstance(nBinsPerDim, int):
raise ValueError("'binSize' must be an integer!")

Expand Down
Loading

0 comments on commit ebbbb19

Please sign in to comment.