finished decision tree multivariate LSx

d3group · Aug 27, 2024 · ebbbb19 · ebbbb19
1 parent 765184c
commit ebbbb19
Show file tree

Hide file tree

Showing 12 changed files with 137 additions and 124 deletions.
diff --git a/_proc/02_levelSetKDEx_multivariate.ipynb b/_proc/02_levelSetKDEx_multivariate.ipynb
@@ -28,16 +28,7 @@
    "metadata": {
     "language": "python"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The autoreload extension is already loaded. To reload it, use:\n",
-      "  %reload_ext autoreload\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": []
   },
   {
@@ -244,7 +235,7 @@
        "*[`LevelSetKDEx`](https://kaiguender.github.io/dddex/levelsetkdex_univariate.html#levelsetkdex) turns any point forecasting model into an estimator of the underlying conditional density.\n",
        "The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts\n",
        "as a similarity measure between samples. \n",
-       "TBD*\n",
+       "TBD.*\n",
        "\n",
        "|    | **Type** | **Default** | **Details** |\n",
        "| -- | -------- | ----------- | ----------- |\n",
@@ -264,7 +255,7 @@
        "*`LevelSetKDEx` turns any point forecasting model into an estimator of the underlying conditional density.\n",
        "The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts\n",
        "as a similarity measure between samples. \n",
-       "TBD*\n",
+       "TBD.*\n",
        "\n",
        "|    | **Type** | **Default** | **Details** |\n",
        "| -- | -------- | ----------- | ----------- |\n",
@@ -303,7 +294,7 @@
       "text/markdown": [
        "---\n",
        "\n",
-       "[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L612){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L636){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "### LevelSetKDEx_multivariate_bin\n",
        "\n",
@@ -331,7 +322,7 @@
       "text/plain": [
        "---\n",
        "\n",
-       "[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L612){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
+       "[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L636){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
        "\n",
        "### LevelSetKDEx_multivariate_bin\n",
        "\n",
@@ -410,9 +401,20 @@
     "#                                                  unstacked = True,\n",
     "#                                                  returnXY = True)\n",
     "\n",
-    "# # duplicate XTrain and yTrain\n",
-    "# XTrain = np.vstack([XTrain, XTrain])\n",
-    "# yTrain = np.vstack([yTrain, yTrain])"
+    "# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n",
+    "# RF.fit(X = XTrain, y = yTrain)\n",
+    "\n",
+    "# # Duplicate XTrain and yTrain m times\n",
+    "# m = 1000\n",
+    "# XTrain = np.vstack([XTrain for i in range(m)])\n",
+    "# yTrain = np.vstack([yTrain for i in range(m)])\n",
+    "\n",
+    "# print(XTrain.shape)\n",
+    "# print(yTrain.shape)\n",
+    "\n",
+    "# # Add Gaussian noise to XTrain and yTrain\n",
+    "# XTrain = XTrain + np.random.normal(0, 0.1, XTrain.shape)\n",
+    "# yTrain = yTrain + np.random.normal(0, 0.1, yTrain.shape)"
    ]
   },
   {
@@ -422,19 +424,8 @@
    "metadata": {
     "language": "python"
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "WARNING clustering 1446 points to 100 centroids: please provide at least 3900 training points\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n",
-    "# RF.fit(X = XTrain, y = yTrain)\n",
-    "\n",
     "# LSKDEx = LevelSetKDEx_multivariate_opt(estimator = RF, nClusters = 100, minClusterSize = 20)\n",
     "# LSKDEx.fit(X = XTrain, y = yTrain)\n",
     "\n",

diff --git a/dddex/__pycache__/__init__.cpython-39.pyc b/dddex/__pycache__/__init__.cpython-39.pyc
diff --git a/dddex/__pycache__/_modidx.cpython-39.pyc b/dddex/__pycache__/_modidx.cpython-39.pyc
diff --git a/dddex/__pycache__/baseClasses.cpython-39.pyc b/dddex/__pycache__/baseClasses.cpython-39.pyc
diff --git a/dddex/__pycache__/crossValidation.cpython-39.pyc b/dddex/__pycache__/crossValidation.cpython-39.pyc
diff --git a/dddex/__pycache__/levelSetKDEx_multivariate.cpython-39.pyc b/dddex/__pycache__/levelSetKDEx_multivariate.cpython-39.pyc
diff --git a/dddex/__pycache__/levelSetKDEx_univariate.cpython-39.pyc b/dddex/__pycache__/levelSetKDEx_univariate.cpython-39.pyc
diff --git a/dddex/__pycache__/loadData.cpython-39.pyc b/dddex/__pycache__/loadData.cpython-39.pyc
diff --git a/dddex/__pycache__/utils.cpython-39.pyc b/dddex/__pycache__/utils.cpython-39.pyc
diff --git a/dddex/__pycache__/wSAA.cpython-39.pyc b/dddex/__pycache__/wSAA.cpython-39.pyc
diff --git a/dddex/levelSetKDEx_multivariate.py b/dddex/levelSetKDEx_multivariate.py
@@ -494,7 +494,7 @@ class LevelSetKDEx_DT(BaseWeightsBasedEstimator_multivariate, BaseLSx):
     `LevelSetKDEx` turns any point forecasting model into an estimator of the underlying conditional density.
     The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts
     as a similarity measure between samples. 
-    TBD
+    TBD.
     """
 
     def __init__(self, 
@@ -505,11 +505,24 @@ def __init__(self,
 
         super(BaseEstimator, self).__init__(estimator = estimator)
 
-        # Check if binSize is integer
-        if not isinstance(binSize, (int, np.int32, np.int64)):
-            raise ValueError("'binSize' must be an integer!")
+        # Check if max_depth is integer
+        if not isinstance(max_depth, (int, np.int32, np.int64)):
+            raise ValueError("'max_depth' must be an integer!")
+
+        # Check if max_depth is bigger than 0
+        if max_depth <= 0:
+            raise ValueError("'max_depth' must be bigger than 0!")
+
+        # Check if min_samples_leaf is integer or float
+        if not isinstance(min_samples_leaf, (int, np.int32, np.int64, float, np.float32, np.float64)):
+            raise ValueError("'min_samples_leaf' must be an integer or float!")
+
+        # Check if min_samples_leaf is bigger than 0
+        if min_samples_leaf <= 0:
+            raise ValueError("'min_samples_leaf' must be bigger than 0!")
 
-        self.binSize = binSize
+        self.max_depth = max_depth
+        self.min_samples_leaf = min_samples_leaf
 
         self.yTrain = None
         self.yPredTrain = None
@@ -518,7 +531,7 @@ def __init__(self,
 
     #---
 
-    def fit(self: LevelSetKDEx_DRF, 
+    def fit(self: LevelSetKDEx_DT, 
             X: np.ndarray, # Feature matrix used by `estimator` to predict `y`.
             y: np.ndarray, # 1-dimensional target variable corresponding to the feature matrix `X`.
             ):
@@ -528,13 +541,17 @@ def fit(self: LevelSetKDEx_DRF,
         `binSize` many samples. For details, check out the function `generateBins` which does the
         heavy lifting.
         """
+
+        # Check if max_depth is integer
+        if not isinstance(self.max_depth, (int, np.int32, np.int64)):
+            raise ValueError("'max_depth' must be an integer!")
 
-        # Checks
-        if not isinstance(self.binSize, (int, np.int32, np.int64)):
-            raise ValueError("'binSize' must be an integer!")
+        # Check if min_samples_leaf is integer or float
+        if not isinstance(self.min_samples_leaf, (int, np.int32, np.int64, float, np.float32, np.float64)):
+            raise ValueError("'min_samples_leaf' must be an integer or float!")
 
-        if self.binSize > y.shape[0]:
-            raise ValueError("'binSize' mustn't be bigger than the size of 'y'!")
+        if self.min_samples_leaf > y.shape[0]:
+            raise ValueError("'min_samples_leaf' mustn't be bigger than the size of 'y'!")
 
         if X.shape[0] != y.shape[0]:
             raise ValueError("'X' and 'y' must contain the same number of samples!")
@@ -554,11 +571,10 @@ def fit(self: LevelSetKDEx_DRF,
 
         #---
 
-        yPred = pd.DataFrame(yPred)
-        y = pd.Series(y)
+        tree = DecisionTreeRegressor(max_depth = self.max_depth, min_samples_leaf = self.min_samples_leaf)
 
-        DRF = drf(min_node_size = self.binSize, num_trees = 100, num_features = 1, honesty = False, sample_fraction = 0.5, response_scaling = False, mtry = 1, num_threads = 16)
-        DRF.fit(X = yPred, Y = y)
+        tree.fit(X = yPred, y = y)
+        leafIndicesTrain = tree.apply(yPred)
 
         #---
 
@@ -567,12 +583,13 @@ def fit(self: LevelSetKDEx_DRF,
         self.yTrain = y.ravel()
 
         self.yPredTrain = yPred
-        self.drf = DRF
+        self.tree = tree
+        self.leafIndicesTrain = leafIndicesTrain
         self.fitted = True
 
     #---
 
-    def getWeights(self: LevelSetKDEx_DRF, 
+    def getWeights(self: LevelSetKDEx_DT, 
                    X: np.ndarray, # Feature matrix for which conditional density estimates are computed.
                    # Specifies structure of the returned density estimates. One of: 
                    # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'
@@ -583,7 +600,7 @@ def getWeights(self: LevelSetKDEx_DRF,
                    ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.
 
         # __annotations__ = BaseWeightsBasedEstimator.getWeights.__annotations__
-        __doc__ = BaseWeightsBasedEstimator.getWeights.__doc__
+        __doc__ = BaseWeightsBasedEstimator_multivariate.getWeights.__doc__
 
         if not self.fitted:
             raise NotFittedError("This LevelSetKDEx instance is not fitted yet. Call 'fit' with "
@@ -592,17 +609,24 @@ def getWeights(self: LevelSetKDEx_DRF,
         #---
 
         yPred = self.estimator.predict(X)
-        yPred = pd.DataFrame(yPred)
+        leafIndicesTest = self.tree.apply(yPred)
+
+        weightsDataList = []
+
+        for leafIndex in leafIndicesTest:
+            leafComparison = (self.leafIndicesTrain == leafIndex) * 1
+            nObsInSameLeaf = np.sum(leafComparison)
+            weights = leafComparison / nObsInSameLeaf
+
+            weightsDataList.append((weights[weights > 0], np.where(weights > 0)[0]))
 
-        weightsArray = self.drf.predict(yPred).weights
-        weightsList = list(weightsArray)
-        weightsDataList = [(weights[weights > 0], np.where(weights > 0)[0]) for weights in weightsList]
+        #---
 
-        weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, 
-                                                     outputType = outputType, 
-                                                     y = self.yTrain,
-                                                     scalingList = scalingList,
-                                                     equalWeights = True)
+        weightsDataList = restructureWeightsDataList_multivariate(weightsDataList = weightsDataList, 
+                                                                  outputType = outputType, 
+                                                                  y = self.yTrain,
+                                                                  scalingList = scalingList,
+                                                                  equalWeights = True)
 
         return weightsDataList
 
@@ -633,7 +657,7 @@ def __init__(self,
 
         super(BaseEstimator, self).__init__(estimator = estimator)
 
-        # Check if binSize is int
+        # Check if nBinsPerDim is int
         if not isinstance(nBinsPerDim, int):
             raise ValueError("'binSize' must be an integer!")