From ebbbb196959f5cf950b44436d1681b19f675dc00 Mon Sep 17 00:00:00 2001 From: kaiguender Date: Tue, 27 Aug 2024 20:20:37 +0200 Subject: [PATCH] finished decision tree multivariate LSx --- _proc/02_levelSetKDEx_multivariate.ipynb | 49 +++---- dddex/__pycache__/__init__.cpython-39.pyc | Bin 125 -> 125 bytes dddex/__pycache__/_modidx.cpython-39.pyc | Bin 12104 -> 12104 bytes dddex/__pycache__/baseClasses.cpython-39.pyc | Bin 7672 -> 7672 bytes .../crossValidation.cpython-39.pyc | Bin 18175 -> 18175 bytes .../levelSetKDEx_multivariate.cpython-39.pyc | Bin 20145 -> 20301 bytes .../levelSetKDEx_univariate.cpython-39.pyc | Bin 27436 -> 27436 bytes dddex/__pycache__/loadData.cpython-39.pyc | Bin 1558 -> 1558 bytes dddex/__pycache__/utils.cpython-39.pyc | Bin 5979 -> 5979 bytes dddex/__pycache__/wSAA.cpython-39.pyc | Bin 6858 -> 6858 bytes dddex/levelSetKDEx_multivariate.py | 80 +++++++---- nbs/02_levelSetKDEx_multivariate.ipynb | 132 +++++++++--------- 12 files changed, 137 insertions(+), 124 deletions(-) diff --git a/_proc/02_levelSetKDEx_multivariate.ipynb b/_proc/02_levelSetKDEx_multivariate.ipynb index 451991f..d694e0d 100644 --- a/_proc/02_levelSetKDEx_multivariate.ipynb +++ b/_proc/02_levelSetKDEx_multivariate.ipynb @@ -28,16 +28,7 @@ "metadata": { "language": "python" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [] }, { @@ -244,7 +235,7 @@ "*[`LevelSetKDEx`](https://kaiguender.github.io/dddex/levelsetkdex_univariate.html#levelsetkdex) turns any point forecasting model into an estimator of the underlying conditional density.\n", "The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts\n", "as a similarity measure between samples. \n", - "TBD*\n", + "TBD.*\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -264,7 +255,7 @@ "*`LevelSetKDEx` turns any point forecasting model into an estimator of the underlying conditional density.\n", "The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts\n", "as a similarity measure between samples. \n", - "TBD*\n", + "TBD.*\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -303,7 +294,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L612){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L636){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### LevelSetKDEx_multivariate_bin\n", "\n", @@ -331,7 +322,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L612){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/kaiguender/dddex/blob/main/dddex/levelSetKDEx_multivariate.py#L636){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### LevelSetKDEx_multivariate_bin\n", "\n", @@ -410,9 +401,20 @@ "# unstacked = True,\n", "# returnXY = True)\n", "\n", - "# # duplicate XTrain and yTrain\n", - "# XTrain = np.vstack([XTrain, XTrain])\n", - "# yTrain = np.vstack([yTrain, yTrain])" + "# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n", + "# RF.fit(X = XTrain, y = yTrain)\n", + "\n", + "# # Duplicate XTrain and yTrain m times\n", + "# m = 1000\n", + "# XTrain = np.vstack([XTrain for i in range(m)])\n", + "# yTrain = np.vstack([yTrain for i in range(m)])\n", + "\n", + "# print(XTrain.shape)\n", + "# print(yTrain.shape)\n", + "\n", + "# # Add gaussian to XTrain and yTrain\n", + "# XTrain = XTrain + np.random.normal(0, 0.1, XTrain.shape)\n", + "# yTrain = yTrain + np.random.normal(0, 0.1, yTrain.shape)" ] }, { @@ -422,19 +424,8 @@ "metadata": { "language": "python" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING clustering 1446 points to 100 centroids: please provide at least 3900 training points\n" - ] - } - ], + "outputs": [], "source": [ - "# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n", - "# RF.fit(X = XTrain, y = yTrain)\n", - "\n", "# LSKDEx = LevelSetKDEx_multivariate_opt(estimator = RF, nClusters = 100, minClusterSize = 20)\n", "# LSKDEx.fit(X = XTrain, y = yTrain)\n", "\n", diff --git a/dddex/__pycache__/__init__.cpython-39.pyc b/dddex/__pycache__/__init__.cpython-39.pyc index 8561c0f76a0effcd8b9d7c720d5d90198477c986..ebb7c60ec703c407b3140105dfea1fab40f4c561 100644 GIT binary patch delta 17 Wcmb=eRcOs5Ek(ZZ?0SH>f&u!$k)dv7R^964J delta 20 acmX>RcOs5Ek(ZZ?0SKo4JG+tFRv!RF$OdHq diff --git a/dddex/__pycache__/baseClasses.cpython-39.pyc b/dddex/__pycache__/baseClasses.cpython-39.pyc index beedc9d2a7c3969ce9008ca1285def45bb8217cb..f71a8350a61473aa32960be7332c0b66c39f000f 100644 GIT binary patch delta 20 acmexi{ll6&k(ZZ?0SH>f&u!#>D+>TZbp|H@ delta 20 acmexi{ll6&k(ZZ?0SKo4JG+tlttmx}=iTE)+8mx}=irv5v-k^74q09oD$0RR91 diff --git a/dddex/__pycache__/levelSetKDEx_multivariate.cpython-39.pyc b/dddex/__pycache__/levelSetKDEx_multivariate.cpython-39.pyc index 2b9c05cad481c1248065ef09baa407740fcf282e..9bda18192227a3aa63341eca0934e8229aa6ef4e 100644 GIT binary patch delta 1895 zcmaJ?Yitx%6ux&}J3IU6cDuBeQo36$bqbUMh1N>2;ZYtf6{V4|32dg!Y}vNEUGEM7 zn%iuYB5G`W+z66XZNUeK4{!yI3Gu%{V|@JCKl+zn_(u~-44yL!#RqY2?ls!d8S1gw+VAO3f=a zQ~I_@uRKp*FIBZKS9aIf3Ji5tI!V&h)@~F|k-=JL}rYs<)l(hrr4{PhQ{2-h&l;AK_sH6Cf#G?(AN? z6eOZWTzOQ1 zsk%C#AEx2eq|epvR1#QR8V%TTJsDHj0nmFD<+P$(u*qx^Y+fBFBj7tY7vglDPRV;o zZo5L<@UibH!7&*FYcV)l#j#Da8Hp00#41K`;ltAJBe`$~hPdDcG*Sm~(KUtXv-KHi zO5qClk?;E9%3L*<;ZT)JbJ2NhnHsNsVq5`v5my0gZ9gZgQgyZgjT2y9kQa6y;R1r! zAU*#nrFbS}iY7C%;awmN4Vn+IWytpGgx?0+h42BwMTAxauYez-gw~#q15%LJ1AY2lwBQ>#o zAl6ER6!5|z!CL5yxOlaz!H7s(EGVQCfQPqO!-}7F2Mc3F}R-_4P+|7eRDw`;LsdnN<9I4s3Ol(Y?rh& zPbqnNy&LAxLf{K3g?H7A3Mb&5qYbhm1j1^o5i#>a>;#}0VKwj))+0?#-Vgj3?UxWR z&1Jc8ILp$w%!rrz+ly6y&zMti*>hk^`tN9+qscIen_O#WU0|I*o zz?9fAcnz|%BHh#OZ;oz+>+%oQAP&?wmcHz<)Uc^kZY%a6@cKl&H$h9{yF{Z%^#(z@ zov5QTrRm;?qMic6&*IFM7h0}>R5r#ZjyP=7#NnL%??{pPi@3hES!A~EtVcF$L%8=_ zkZ}_j2UdQLhN}P&s^>Rw&+nZSRQ_&&@+<797x}(U`WZG^6utZo8DZN%7&4czlK8r> zm!1}ZZOa2?{}#)&`WU2DJi6_tmTSQAA~0p|ph`$zpw>G8JN9g8-S)Ud*NCTfUZI^L z*#9KGCQ|+J&}lH1)jWf%o6SR+hv`@1W`9CbWf2`1p=ZV9z!J(!rw2ArdPe-X>m2PC LM+dv*BB}lh1>MhR delta 1723 zcmYjRU2IfE6rQ<1yLbP$?NYmKp`{3V`6+E^ODQD$bz1_Z2(2`1BI`}(Zo4bp*11!q z&21W`ilTjxiGl`oD{25u6gNgheIPz)Vu-vD%|?x{eK5fXA9(QRoZDKxll}6|IcLty zobSxuev4jzlSZvbBuLaTQtDB$Z+@70H!Q$+d8WQQwgEY zBR~WZ)Fo$>yy8+u=P4yZ6`Ih6FYkw+kyXzqPO}Iw*s56zjXjTW|TjO6FQ0@`bRX;C+i*!6z%X24<-;qXCj0SPhHU9rJL{4>b(OrIDM|8# z#yfJTX$(1-|5U84lYbOOEkkPCU4KCe&$ z5s4)qWbk2z6QmGrf%4?#@pTom#fYn7CJ`=1T?LpTVt6j*_v8$M$OcM0Z_H?}2ANgJ z9Kp=4Qj9NPnJd6XGLhzQoZPfs4iqNGGJ@N-Qw}4|+LoB)wv+acju&i4Or>LvH8yd| z&SbeYEOO(8bR66h;{^vY3|psig^{#2Ciql2^Bo2KOCq5 z0sE03M!1S_4dEe#LFx9^j6Mpq9O$-$)x)hZn;%El2M9w5E1knP!_?II7PR6{yaQnZ zAgyA~;Z9+f{g8$Pd-MrCZr`hMXr z8m7ZSp=tG_l&iF*g1g_@;=ry0}Tl;4_Y({Z?U+{7r>GvS>+YwNo|1e^AHY&^s1Fx;&47v_CIas zMn;7rpD`1mA-EOq1@m;Xywz1Jf2vNb8|2Snc(k0{XF2xJQZ!PQazt*-660Kc)Y8y~ z)#jfdcqJ|&!+(u(EodzG&|-qq@)Q& z661gQxA22zJ-;DO_U@q<<=efRbg!cFZtvaow=fwtp{er!w*3mND_wc6^!_-w4H$)#&wL XWu?}`yC{7_7WyyK4q17mlb-$uiW#0G diff --git a/dddex/__pycache__/levelSetKDEx_univariate.cpython-39.pyc b/dddex/__pycache__/levelSetKDEx_univariate.cpython-39.pyc index fec89ceb818299826c7829753d803d3ac4cb00ba..1fe9260e69429379dc30dc544cee8f29a9345239 100644 GIT binary patch delta 22 ccmZ2;jd9I2M(#vjUM>b8Xca%Vky|kv08m^8NdN!< delta 22 ccmZ2;jd9I2M(#vjUM>b8nELPRMsCGy09G*vKL7v# diff --git a/dddex/__pycache__/loadData.cpython-39.pyc b/dddex/__pycache__/loadData.cpython-39.pyc index c7b95f9fdb470b388f0745b65797ad120bc5d4e9..98301bb04fdf5ff7b4e0bbbec7a778fb6666714c 100644 GIT binary patch delta 20 acmbQnGmVEkk(ZZ?0SH>f&u!%9W&;2#as&(j delta 20 acmbQnGmVEkk(ZZ?0SKo4JG+san+*UlM+E`^ diff --git a/dddex/__pycache__/utils.cpython-39.pyc b/dddex/__pycache__/utils.cpython-39.pyc index a5b95671572e60d0f7336ed55294075b32d5fdcd..0033b6455ff2dc54d068b1cce57cfb6d0a065201 100644 GIT binary patch delta 20 acmcbucUzA;k(ZZ?0SH>f&u!!m5C;G|69q5; delta 20 acmcbucUzA;k(ZZ?0SKo4JG+rPKpX%-=msVL diff --git a/dddex/__pycache__/wSAA.cpython-39.pyc b/dddex/__pycache__/wSAA.cpython-39.pyc index 538fa87ccd0ed0b464044e52b1d770634f25a938..b24329a5a9a75ceceed18f1a043f67c3a0e30a9f 100644 GIT binary patch delta 20 acmX?QddieLk(ZZ?0SH>f&u!%1F9iTPY6Xq} delta 20 acmX?QddieLk(ZZ?0SKo4JG+s4zZ3vKKL&&V diff --git a/dddex/levelSetKDEx_multivariate.py b/dddex/levelSetKDEx_multivariate.py index 7963b5a..350513e 100644 --- a/dddex/levelSetKDEx_multivariate.py +++ b/dddex/levelSetKDEx_multivariate.py @@ -494,7 +494,7 @@ class LevelSetKDEx_DT(BaseWeightsBasedEstimator_multivariate, BaseLSx): `LevelSetKDEx` turns any point forecasting model into an estimator of the underlying conditional density. The name 'LevelSet' stems from the fact that this approach interprets the values of the point forecasts as a similarity measure between samples. - TBD + TBD. """ def __init__(self, @@ -505,11 +505,24 @@ def __init__(self, super(BaseEstimator, self).__init__(estimator = estimator) - # Check if binSize is integer - if not isinstance(binSize, (int, np.int32, np.int64)): - raise ValueError("'binSize' must be an integer!") + # Check if max_depth is integer + if not isinstance(max_depth, (int, np.int32, np.int64)): + raise ValueError("'max_depth' must be an integer!") + + # Check if max_depth is bigger than 0 + if max_depth <= 0: + raise ValueError("'max_depth' must be bigger than 0!") + + # Check if min_samples_leaf is integer or float + if not isinstance(min_samples_leaf, (int, np.int32, np.int64, float, np.float32, np.float64)): + raise ValueError("'min_samples_leaf' must be an integer or float!") + + # Check if min_samples_leaf is bigger than 0 + if min_samples_leaf <= 0: + raise ValueError("'min_samples_leaf' must be bigger than 0!") - self.binSize = binSize + self.max_depth = max_depth + self.min_samples_leaf = min_samples_leaf self.yTrain = None self.yPredTrain = None @@ -518,7 +531,7 @@ def __init__(self, #--- - def fit(self: LevelSetKDEx_DRF, + def fit(self: LevelSetKDEx_DT, X: np.ndarray, # Feature matrix used by `estimator` to predict `y`. y: np.ndarray, # 1-dimensional target variable corresponding to the feature matrix `X`. ): @@ -528,13 +541,17 @@ def fit(self: LevelSetKDEx_DRF, `binSize` many samples. For details, checkout the function `generateBins` which does the heavy lifting. """ + + # Check if max_depth is integer + if not isinstance(self.max_depth, (int, np.int32, np.int64)): + raise ValueError("'max_depth' must be an integer!") - # Checks - if not isinstance(self.binSize, (int, np.int32, np.int64)): - raise ValueError("'binSize' must be an integer!") + # Check if min_samples_leaf is integer or float + if not isinstance(self.min_samples_leaf, (int, np.int32, np.int64, float, np.float32, np.float64)): + raise ValueError("'min_samples_leaf' must be an integer or float!") - if self.binSize > y.shape[0]: - raise ValueError("'binSize' mustn't be bigger than the size of 'y'!") + if self.min_samples_leaf > y.shape[0]: + raise ValueError("'min_samples_leaf' mustn't be bigger than the size of 'y'!") if X.shape[0] != y.shape[0]: raise ValueError("'X' and 'y' must contain the same number of samples!") @@ -554,11 +571,10 @@ def fit(self: LevelSetKDEx_DRF, #--- - yPred = pd.DataFrame(yPred) - y = pd.Series(y) + tree = DecisionTreeRegressor(max_depth = self.max_depth, min_samples_leaf = self.min_samples_leaf) - DRF = drf(min_node_size = self.binSize, num_trees = 100, num_features = 1, honesty = False, sample_fraction = 0.5, response_scaling = False, mtry = 1, num_threads = 16) - DRF.fit(X = yPred, Y = y) + tree.fit(X = yPred, y = y) + leafIndicesTrain = tree.apply(yPred) #--- @@ -567,12 +583,13 @@ def fit(self: LevelSetKDEx_DRF, self.yTrain = y.ravel() self.yPredTrain = yPred - self.drf = DRF + self.tree = tree + self.leafIndicesTrain = leafIndicesTrain self.fitted = True #--- - def getWeights(self: LevelSetKDEx_DRF, + def getWeights(self: LevelSetKDEx_DT, X: np.ndarray, # Feature matrix for which conditional density estimates are computed. # Specifies structure of the returned density estimates. One of: # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized' @@ -583,7 +600,7 @@ def getWeights(self: LevelSetKDEx_DRF, ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`. # __annotations__ = BaseWeightsBasedEstimator.getWeights.__annotations__ - __doc__ = BaseWeightsBasedEstimator.getWeights.__doc__ + __doc__ = BaseWeightsBasedEstimator_multivariate.getWeights.__doc__ if not self.fitted: raise NotFittedError("This LevelSetKDEx instance is not fitted yet. Call 'fit' with " @@ -592,17 +609,24 @@ def getWeights(self: LevelSetKDEx_DRF, #--- yPred = self.estimator.predict(X) - yPred = pd.DataFrame(yPred) + leafIndicesTest = self.tree.apply(yPred) + + weightsDataList = [] + + for leafIndex in leafIndicesTest: + leafComparison = (self.leafIndicesTrain == leafIndex) * 1 + nObsInSameLeaf = np.sum(leafComparison) + weights = leafComparison / nObsInSameLeaf + + weightsDataList.append((weights[weights > 0], np.where(weights > 0)[0])) - weightsArray = self.drf.predict(yPred).weights - weightsList = list(weightsArray) - weightsDataList = [(weights[weights > 0], np.where(weights > 0)[0]) for weights in weightsList] + #--- - weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, - outputType = outputType, - y = self.yTrain, - scalingList = scalingList, - equalWeights = True) + weightsDataList = restructureWeightsDataList_multivariate(weightsDataList = weightsDataList, + outputType = outputType, + y = self.yTrain, + scalingList = scalingList, + equalWeights = True) return weightsDataList @@ -633,7 +657,7 @@ def __init__(self, super(BaseEstimator, self).__init__(estimator = estimator) - # Check if binSize is int + # Check if nBinsPerDim is int if not isinstance(nBinsPerDim, int): raise ValueError("'binSize' must be an integer!") diff --git a/nbs/02_levelSetKDEx_multivariate.ipynb b/nbs/02_levelSetKDEx_multivariate.ipynb index c95de5d..8fb03dd 100644 --- a/nbs/02_levelSetKDEx_multivariate.ipynb +++ b/nbs/02_levelSetKDEx_multivariate.ipynb @@ -5,16 +5,7 @@ "execution_count": null, "id": "72025364-2e34-4aba-87de-ff5a8b382900", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ] - } - ], + "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -41,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "5e8cef21-4158-4307-b898-6bc03398a4e5", "metadata": {}, "outputs": [], @@ -52,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "2d95d3a6-a5e0-46de-b191-d1de0281f3ec", "metadata": {}, "outputs": [], @@ -614,9 +605,17 @@ " if not isinstance(max_depth, (int, np.int32, np.int64)):\n", " raise ValueError(\"'max_depth' must be an integer!\")\n", " \n", - " # Check if min_samples_leaf is integer\n", - " if not isinstance(min_samples_leaf, (int, np.int32, np.int64)):\n", - " raise ValueError(\"'min_samples_leaf' must be an integer!\")\n", + " # Check if max_depth is bigger than 0\n", + " if max_depth <= 0:\n", + " raise ValueError(\"'max_depth' must be bigger than 0!\")\n", + " \n", + " # Check if min_samples_leaf is integer or float\n", + " if not isinstance(min_samples_leaf, (int, np.int32, np.int64, float, np.float32, np.float64)):\n", + " raise ValueError(\"'min_samples_leaf' must be an integer or float!\")\n", + " \n", + " # Check if min_samples_leaf is bigger than 0\n", + " if min_samples_leaf <= 0:\n", + " raise ValueError(\"'min_samples_leaf' must be bigger than 0!\")\n", "\n", " self.max_depth = max_depth\n", " self.min_samples_leaf = min_samples_leaf\n", @@ -628,7 +627,7 @@ " \n", " #---\n", " \n", - " def fit(self: LevelSetKDEx_DRF, \n", + " def fit(self: LevelSetKDEx_DT, \n", " X: np.ndarray, # Feature matrix used by `estimator` to predict `y`.\n", " y: np.ndarray, # 1-dimensional target variable corresponding to the feature matrix `X`.\n", " ):\n", @@ -638,13 +637,17 @@ " `binSize` many samples. For details, checkout the function `generateBins` which does the\n", " heavy lifting.\n", " \"\"\"\n", + "\n", + " # Check if max_depth is integer\n", + " if not isinstance(self.max_depth, (int, np.int32, np.int64)):\n", + " raise ValueError(\"'max_depth' must be an integer!\")\n", " \n", - " # Checks\n", - " if not isinstance(self.binSize, (int, np.int32, np.int64)):\n", - " raise ValueError(\"'binSize' must be an integer!\")\n", + " # Check if min_samples_leaf is integer or float\n", + " if not isinstance(self.min_samples_leaf, (int, np.int32, np.int64, float, np.float32, np.float64)):\n", + " raise ValueError(\"'min_samples_leaf' must be an integer or float!\")\n", " \n", - " if self.binSize > y.shape[0]:\n", - " raise ValueError(\"'binSize' mustn't be bigger than the size of 'y'!\")\n", + " if self.min_samples_leaf > y.shape[0]:\n", + " raise ValueError(\"'min_samples_leaf' mustn't be bigger than the size of 'y'!\")\n", " \n", " if X.shape[0] != y.shape[0]:\n", " raise ValueError(\"'X' and 'y' must contain the same number of samples!\")\n", @@ -664,11 +667,10 @@ " \n", " #---\n", " \n", - " yPred = pd.DataFrame(yPred)\n", - " y = pd.Series(y)\n", + " tree = DecisionTreeRegressor(max_depth = self.max_depth, min_samples_leaf = self.min_samples_leaf)\n", "\n", - " DRF = drf(min_node_size = self.binSize, num_trees = 100, num_features = 1, honesty = False, sample_fraction = 0.5, response_scaling = False, mtry = 1, num_threads = 16)\n", - " DRF.fit(X = yPred, Y = y)\n", + " tree.fit(X = yPred, y = y)\n", + " leafIndicesTrain = tree.apply(yPred)\n", " \n", " #---\n", " \n", @@ -677,12 +679,13 @@ " self.yTrain = y.ravel()\n", " \n", " self.yPredTrain = yPred\n", - " self.drf = DRF\n", + " self.tree = tree\n", + " self.leafIndicesTrain = leafIndicesTrain\n", " self.fitted = True\n", " \n", " #---\n", " \n", - " def getWeights(self: LevelSetKDEx_DRF, \n", + " def getWeights(self: LevelSetKDEx_DT, \n", " X: np.ndarray, # Feature matrix for which conditional density estimates are computed.\n", " # Specifies structure of the returned density estimates. One of: \n", " # 'all', 'onlyPositiveWeights', 'summarized', 'cumDistribution', 'cumDistributionSummarized'\n", @@ -693,7 +696,7 @@ " ) -> list: # List whose elements are the conditional density estimates for the samples specified by `X`.\n", " \n", " # __annotations__ = BaseWeightsBasedEstimator.getWeights.__annotations__\n", - " __doc__ = BaseWeightsBasedEstimator.getWeights.__doc__\n", + " __doc__ = BaseWeightsBasedEstimator_multivariate.getWeights.__doc__\n", " \n", " if not self.fitted:\n", " raise NotFittedError(\"This LevelSetKDEx instance is not fitted yet. Call 'fit' with \"\n", @@ -702,17 +705,24 @@ " #---\n", " \n", " yPred = self.estimator.predict(X)\n", - " yPred = pd.DataFrame(yPred)\n", + " leafIndicesTest = self.tree.apply(yPred)\n", + "\n", + " weightsDataList = []\n", + "\n", + " for leafIndex in leafIndicesTest:\n", + " leafComparison = (self.leafIndicesTrain == leafIndex) * 1\n", + " nObsInSameLeaf = np.sum(leafComparison)\n", + " weights = leafComparison / nObsInSameLeaf\n", + "\n", + " weightsDataList.append((weights[weights > 0], np.where(weights > 0)[0]))\n", " \n", - " weightsArray = self.drf.predict(yPred).weights\n", - " weightsList = list(weightsArray)\n", - " weightsDataList = [(weights[weights > 0], np.where(weights > 0)[0]) for weights in weightsList]\n", + " #---\n", "\n", - " weightsDataList = restructureWeightsDataList(weightsDataList = weightsDataList, \n", - " outputType = outputType, \n", - " y = self.yTrain,\n", - " scalingList = scalingList,\n", - " equalWeights = True)\n", + " weightsDataList = restructureWeightsDataList_multivariate(weightsDataList = weightsDataList, \n", + " outputType = outputType, \n", + " y = self.yTrain,\n", + " scalingList = scalingList,\n", + " equalWeights = True)\n", " \n", " return weightsDataList\n", " \n", @@ -953,7 +963,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "c5f055de-89ce-4943-8242-8f434ee8a3f1", "metadata": {}, "outputs": [], @@ -968,7 +978,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "34541b1d-b3a9-4310-ae57-0918cbd18c14", "metadata": {}, "outputs": [], @@ -981,9 +991,20 @@ "# unstacked = True,\n", "# returnXY = True)\n", "\n", - "# # duplicate XTrain and yTrain\n", - "# XTrain = np.vstack([XTrain, XTrain])\n", - "# yTrain = np.vstack([yTrain, yTrain])" + "# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n", + "# RF.fit(X = XTrain, y = yTrain)\n", + "\n", + "# # Duplicate XTrain and yTrain m times\n", + "# m = 1000\n", + "# XTrain = np.vstack([XTrain for i in range(m)])\n", + "# yTrain = np.vstack([yTrain for i in range(m)])\n", + "\n", + "# print(XTrain.shape)\n", + "# print(yTrain.shape)\n", + "\n", + "# # Add gaussian to XTrain and yTrain\n", + "# XTrain = XTrain + np.random.normal(0, 0.1, XTrain.shape)\n", + "# yTrain = yTrain + np.random.normal(0, 0.1, yTrain.shape)" ] }, { @@ -991,19 +1012,8 @@ "execution_count": null, "id": "93d36f76", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING clustering 1446 points to 100 centroids: please provide at least 3900 training points\n" - ] - } - ], + "outputs": [], "source": [ - "# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n", - "# RF.fit(X = XTrain, y = yTrain)\n", - "\n", "# LSKDEx = LevelSetKDEx_multivariate_opt(estimator = RF, nClusters = 100, minClusterSize = 20)\n", "# LSKDEx.fit(X = XTrain, y = yTrain)\n", "\n", @@ -1056,21 +1066,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "python3", "language": "python", "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.18" } }, "nbformat": 4,