From 620d04b20f54eb5f037c9b74eb79a91c7fe904e3 Mon Sep 17 00:00:00 2001
From: kaiguender
Date: Thu, 22 Aug 2024 02:14:22 +0200
Subject: [PATCH] finished asymptotically optimal multivariate LSx class

---
 _proc/02_levelSetKDEx_multivariate.ipynb           |  43 +++--
 dddex/__pycache__/__init__.cpython-38.pyc          | Bin 125 -> 125 bytes
 dddex/__pycache__/__init__.cpython-39.pyc          | Bin 125 -> 125 bytes
 dddex/__pycache__/_modidx.cpython-39.pyc           | Bin 11214 -> 11032 bytes
 dddex/__pycache__/baseClasses.cpython-38.pyc       | Bin 7661 -> 7661 bytes
 dddex/__pycache__/baseClasses.cpython-39.pyc       | Bin 7672 -> 7672 bytes
 dddex/__pycache__/crossValidation.cpython-39.pyc   | Bin 18175 -> 18175 bytes
 .../levelSetKDEx_multivariate.cpython-39.pyc       | Bin 12835 -> 13096 bytes
 .../levelSetKDEx_univariate.cpython-39.pyc         | Bin 27436 -> 27436 bytes
 dddex/__pycache__/loadData.cpython-39.pyc          | Bin 1558 -> 1558 bytes
 dddex/__pycache__/utils.cpython-38.pyc             | Bin 6006 -> 6006 bytes
 dddex/__pycache__/utils.cpython-39.pyc             | Bin 5979 -> 5979 bytes
 dddex/__pycache__/wSAA.cpython-38.pyc              | Bin 6855 -> 6855 bytes
 dddex/__pycache__/wSAA.cpython-39.pyc              | Bin 6858 -> 6858 bytes
 dddex/_modidx.py                                   |   2 -
 dddex/levelSetKDEx_multivariate.py                 | 106 ++++++-------
 nbs/02_levelSetKDEx_multivariate.ipynb             | 149 ++++++++----------
 17 files changed, 140 insertions(+), 160 deletions(-)

diff --git a/_proc/02_levelSetKDEx_multivariate.ipynb b/_proc/02_levelSetKDEx_multivariate.ipynb
index 8f327c4..f8fd9cc 100644
--- a/_proc/02_levelSetKDEx_multivariate.ipynb
+++ b/_proc/02_levelSetKDEx_multivariate.ipynb
@@ -275,7 +275,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "WARNING clustering 1446 points to 723 centroids: please provide at least 28197 training points\n"
+      "WARNING clustering 1446 points to 100 centroids: please provide at least 3900 training points\n"
      ]
     }
    ],
@@ -283,10 +283,13 @@
     "# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n",
     "# RF.fit(X = XTrain, y = yTrain)\n",
     "\n",
-    "# LSKDEx = LevelSetKDEx_multivariate(estimator = RF, binSize = 2, equalBins = False)\n",
+    "# LSKDEx = LevelSetKDEx_multivariate_opt(estimator = RF, nClusters = 100, minClusterSize = 20)\n",
     "# LSKDEx.fit(X = XTrain, y = yTrain)\n",
     "\n",
-    "# weightsDataList = LSKDEx.getWeights(X = XTest, outputType='summarized')"
+    "# yPred = LSKDEx.estimator.predict(XTest).astype(np.float32)\n",
+    "# clusters = LSKDEx.kmeans.assign(yPred)[1]\n",
+    "\n",
+    "# weightsDataList = LSKDEx.getWeights(X = XTest, outputType='onlyPositiveWeights')"
    ]
   },
@@ -295,18 +298,7 @@
    "metadata": {
     "language": "python"
    },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([404, 496, 114, ..., 257, 430, 149])"
-      ]
-     },
-     "execution_count": null,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# centers = LSKDEx.centers\n",
     "# yPred = LSKDEx.yPredTrain\n",
@@ -323,8 +315,25 @@
    "metadata": {
     "language": "python"
    },
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[20 20 20 20 30 28 38 22 30 20 36 36 22 38]\n",
+      "20\n",
+      "48\n"
+     ]
+    }
+   ],
+   "source": [
+    "# nPosValues = np.array([len(weightsDataList[i][0]) for i in range(len(weightsDataList))])\n",
+    "# print(nPosValues)\n",
+    "\n",
+    "# lenIndices = np.array([len(LSKDEx.indicesPerBin[i]) for i in range(len(LSKDEx.indicesPerBin))])\n",
+    "# print(min(lenIndices))\n",
+    "# print(max(lenIndices))"
+   ]
   }
  ],
  "metadata": {
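Note on the recorded faiss warning above: both recorded warnings ask for 39 training points per centroid (39 × 723 = 28197 before this commit, 39 × 100 = 3900 after), so with 1446 fitted predictions and nClusters = 100 the warning is faiss's built-in heuristic firing, not an error. The following is a minimal end-to-end sketch of the new class, assuming the patched module (and faiss) is installed; the synthetic data, sample counts, and parameter values are illustrative only and not part of the patch:

```python
# Minimal usage sketch for LevelSetKDEx_multivariate_opt (synthetic data; values illustrative).
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from dddex.levelSetKDEx_multivariate import LevelSetKDEx_multivariate_opt

# Multivariate regression problem: y has 3 dimensions, as the multivariate class expects.
X, y = make_regression(n_samples = 2000, n_features = 10, n_targets = 3,
                       noise = 1.0, random_state = 42)
XTrain, XTest = X[:1500], X[1500:]
yTrain, yTest = y[:1500], y[1500:]

RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)
RF.fit(X = XTrain, y = yTrain)

# nClusters is the number of k-means cells partitioning the prediction space;
# minClusterSize is the lower bound enforced by the merging step of fit().
LSKDEx = LevelSetKDEx_multivariate_opt(estimator = RF, nClusters = 100, minClusterSize = 20)
LSKDEx.fit(X = XTrain, y = yTrain)

# One (weights, indices) tuple per test sample; weights are uniform over the bin.
weightsDataList = LSKDEx.getWeights(X = XTest, outputType = 'onlyPositiveWeights')
```

Because faiss only warns (and subsamples) rather than fails, the commented-out notebook cell below runs through with these sizes; choosing nClusters so that nTrain / nClusters stays well above minClusterSize keeps the merging step cheap.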
diff --git a/dddex/__pycache__/__init__.cpython-38.pyc b/dddex/__pycache__/__init__.cpython-38.pyc
index 419e0a7..ff4824a 100644
Binary files a/dddex/__pycache__/__init__.cpython-38.pyc and b/dddex/__pycache__/__init__.cpython-38.pyc differ
diff --git a/dddex/__pycache__/__init__.cpython-39.pyc b/dddex/__pycache__/__init__.cpython-39.pyc
Binary files a/dddex/__pycache__/__init__.cpython-39.pyc and b/dddex/__pycache__/__init__.cpython-39.pyc differ
diff --git a/dddex/__pycache__/_modidx.cpython-39.pyc b/dddex/__pycache__/_modidx.cpython-39.pyc
Binary files a/dddex/__pycache__/_modidx.cpython-39.pyc and b/dddex/__pycache__/_modidx.cpython-39.pyc differ
diff --git a/dddex/__pycache__/baseClasses.cpython-38.pyc b/dddex/__pycache__/baseClasses.cpython-38.pyc
index 45a1ae9..4031faf 100644
Binary files a/dddex/__pycache__/baseClasses.cpython-38.pyc and b/dddex/__pycache__/baseClasses.cpython-38.pyc differ
diff --git a/dddex/__pycache__/baseClasses.cpython-39.pyc b/dddex/__pycache__/baseClasses.cpython-39.pyc
index f1a5927..3525273 100644
Binary files a/dddex/__pycache__/baseClasses.cpython-39.pyc and b/dddex/__pycache__/baseClasses.cpython-39.pyc differ
diff --git a/dddex/__pycache__/crossValidation.cpython-39.pyc b/dddex/__pycache__/crossValidation.cpython-39.pyc
Binary files a/dddex/__pycache__/crossValidation.cpython-39.pyc and b/dddex/__pycache__/crossValidation.cpython-39.pyc differ
diff --git a/dddex/__pycache__/levelSetKDEx_multivariate.cpython-39.pyc b/dddex/__pycache__/levelSetKDEx_multivariate.cpython-39.pyc
index 0b03560..95bc270 100644
Binary files a/dddex/__pycache__/levelSetKDEx_multivariate.cpython-39.pyc and b/dddex/__pycache__/levelSetKDEx_multivariate.cpython-39.pyc differ
diff --git a/dddex/__pycache__/levelSetKDEx_univariate.cpython-39.pyc b/dddex/__pycache__/levelSetKDEx_univariate.cpython-39.pyc
index d04889b..88b7216 100644
Binary files a/dddex/__pycache__/levelSetKDEx_univariate.cpython-39.pyc and b/dddex/__pycache__/levelSetKDEx_univariate.cpython-39.pyc differ
diff --git a/dddex/__pycache__/loadData.cpython-39.pyc b/dddex/__pycache__/loadData.cpython-39.pyc
index ae91bbe..d871bc4 100644
Binary files a/dddex/__pycache__/loadData.cpython-39.pyc and b/dddex/__pycache__/loadData.cpython-39.pyc differ
diff --git a/dddex/__pycache__/utils.cpython-38.pyc b/dddex/__pycache__/utils.cpython-38.pyc
index 14de057..4926bd3 100644
Binary files a/dddex/__pycache__/utils.cpython-38.pyc and b/dddex/__pycache__/utils.cpython-38.pyc differ
diff --git a/dddex/__pycache__/utils.cpython-39.pyc b/dddex/__pycache__/utils.cpython-39.pyc
index da588bc..cdd232e 100644
Binary files a/dddex/__pycache__/utils.cpython-39.pyc and b/dddex/__pycache__/utils.cpython-39.pyc differ
diff --git a/dddex/__pycache__/wSAA.cpython-38.pyc b/dddex/__pycache__/wSAA.cpython-38.pyc
index 841803d..c7d7237 100644
Binary files a/dddex/__pycache__/wSAA.cpython-38.pyc and b/dddex/__pycache__/wSAA.cpython-38.pyc differ
diff --git a/dddex/__pycache__/wSAA.cpython-39.pyc b/dddex/__pycache__/wSAA.cpython-39.pyc
index 81c36b9..50e7dd6 100644
Binary files a/dddex/__pycache__/wSAA.cpython-39.pyc and b/dddex/__pycache__/wSAA.cpython-39.pyc differ
diff --git a/dddex/_modidx.py b/dddex/_modidx.py
index b347f28..bac4e3e 100644
--- a/dddex/_modidx.py
+++ b/dddex/_modidx.py
@@ -75,8 +75,6 @@
                                                        'dddex/levelSetKDEx_multivariate.py'),
             'dddex.levelSetKDEx_multivariate.LevelSetKDEx_multivariate_opt.__init__': ( 'levelsetkdex_multivariate.html#levelsetkdex_multivariate_opt.__init__',
                                                                                         'dddex/levelSetKDEx_multivariate.py'),
-            'dddex.levelSetKDEx_multivariate.LevelSetKDEx_multivariate_opt._getEqualSizedClusters': ( 'levelsetkdex_multivariate.html#levelsetkdex_multivariate_opt._getequalsizedclusters',
-                                                                                                      'dddex/levelSetKDEx_multivariate.py'),
             'dddex.levelSetKDEx_multivariate.LevelSetKDEx_multivariate_opt.fit': ( 'levelsetkdex_multivariate.html#levelsetkdex_multivariate_opt.fit',
                                                                                    'dddex/levelSetKDEx_multivariate.py'),
             'dddex.levelSetKDEx_multivariate.LevelSetKDEx_multivariate_opt.getWeights': ( 'levelsetkdex_multivariate.html#levelsetkdex_multivariate_opt.getweights',
                                                                                           'dddex/levelSetKDEx_multivariate.py'),
diff --git a/dddex/levelSetKDEx_multivariate.py b/dddex/levelSetKDEx_multivariate.py
index 2ec1c1b..5f32c2f 100644
--- a/dddex/levelSetKDEx_multivariate.py
+++ b/dddex/levelSetKDEx_multivariate.py
@@ -372,88 +372,74 @@ def fit(self,
         kmeans.train(yPredMod)

         # Get cluster centers created by faiss. IMPORTANT NOTE: not all clusters are used! We will handle that further below.
-        centersAll = kmeans.centroids
+        centers = kmeans.centroids
+        clusters = np.arange(centers.shape[0])

         # Compute the cluster assignment for each sample
-        if self.equalBins:
-            clusterAssignments = self._getEqualSizedClusters(y = yPredMod)
-        else:
-            clusterAssignments = kmeans.assign(yPredMod)[1]
+        clusterAssignments = kmeans.assign(yPredMod)[1]

         # Based on the clusters and cluster assignments, we can now compute the indices belonging to each bin / cluster
-        indicesPerBin = defaultdict(list)
-        binSizes = defaultdict(int)
+        indicesPerBin = [[] for i in range(self.nClusters)]
+        clusterSizes = [0 for i in range(self.nClusters)]

         for index, cluster in enumerate(clusterAssignments):
             indicesPerBin[cluster].append(index)
-            binSizes[cluster] += 1
+            clusterSizes[cluster] += 1

-        #---
+        clusterSizes = np.array(clusterSizes)

-        clustersUsed = np.array(list(indicesPerBin.keys()))
-        clustersOrdered = np.sort(clustersUsed)
-
-        centers = centersAll[clustersOrdered]
-        indicesPerBin = [indicesPerBin[cluster] for cluster in clustersOrdered]
-        binSizes = np.array([binSizes[cluster] for cluster in clustersOrdered])
+        # Just needed for a check in the end
+        maxSizeOfExistingClusters = np.max(clusterSizes)

         #---

-        # Merge clusters that are too small (i.e. contain less than binSize / 2 samples).
         # clustersTooSmall is the array of all clusters that are too small.
-        threshold = self.binSize / 2
-        binsTooSmall = np.where(binSizes < threshold)[0]
+        clustersTooSmall = np.where(np.array(clusterSizes) < self.minClusterSize)[0]

-        if len(binsTooSmall) > 0:
+        if len(clustersTooSmall) > 0:
+
+            indicesPerBinNew = copy.deepcopy(indicesPerBin)

-            # remove all centers from centersOld that are part of clustersTooSmall
-            centersNew = np.delete(centers, binsTooSmall, axis = 0)
-            centersTooSmall = centers[binsTooSmall]
-            centersNew_oldIndices = np.delete(np.arange(len(centers)), binsTooSmall)
+            # We are searching for the closest other cluster for each cluster that is too small
+            # As we don't know how many nearest neighbors we need, we are setting k to the number of clusters
+            nearestClusters = KDTree(centers).query(centers[clustersTooSmall], k = centers.shape[0])[1]

-            KDTreeNew = KDTree(centersNew)
-            clustersToMerge = KDTreeNew.query(centersTooSmall)[1]
+            # sizeNearestClusters is an array of shape (len(clustersTooSmall), self.nClusters)
+            sizeNearestClusters = clusterSizes[nearestClusters]

-            for i, clusterToMerge in enumerate(clustersToMerge):
-                indicesPerBin[centersNew_oldIndices[clusterToMerge]].extend(indicesPerBin[binsTooSmall[i]])
+            # Calculating the cumulative sum of the cluster sizes over each row allows us to find out
+            # which cluster is the first one that is big enough to make the current cluster big enough
+            clusterSizesCumSum = np.cumsum(sizeNearestClusters, axis = 1)

-            # Remove the indices given by clustersTooSmall from indicesPerBin by deleting the list entry
-            indicesPerBin = [np.array(indices) for binIndex, indices in enumerate(indicesPerBin) if binIndex not in binsTooSmall]
-            binSizes = [len(indices) for indices in indicesPerBin]
-            binSizes = pd.Series(binSizes)
+            # argmax returns the first index where the condition is met.
+            necessaryClusters = (clusterSizesCumSum >= self.minClusterSize).argmax(axis = 1)
+
+            # We are now creating the new indicesPerBin list by extending the indices of the clusters that are too small
+            for i, cluster in enumerate(clustersTooSmall):
+                clustersToAdd = nearestClusters[i, 0:necessaryClusters[i] + 1]
+
+                indicesPerBinNew[cluster] = np.concatenate([indicesPerBin[cluster] for cluster in clustersToAdd])
+                clusterSizes[cluster] = len(indicesPerBinNew[cluster])

-            self.centers = centersNew
-            self.binSizes = binSizes
-            self.kmeans = KDTreeNew
+                # Following our intended logic, the resulting clusters can't be bigger than minClusterSize + maxSizeOfExistingClusters
+                if len(indicesPerBinNew[cluster]) > self.minClusterSize + maxSizeOfExistingClusters:
+                    raise Warning("The cluster size is bigger than minClusterSize + maxSizeOfExistingClusters. This should not happen!")
+
+            # indicesPerBin is only turned into a dictionary to be consistent with the other implementations of LevelSetKDEx
+            self.indicesPerBin = {cluster: np.array(indicesPerBinNew[cluster], dtype = 'uintc') for cluster in range(len(indicesPerBinNew))}
+            self.clusterSizes = pd.Series(clusterSizes)

         else:
-            self.centers = centers
-            self.binSizes = pd.Series(binSizes)
-            self.kmeans = KDTree(self.centers)
-
-        # Transform the indices given by indicesPerBin into numpy arrays
-        indicesPerBin = [np.array(indices) for indices in indicesPerBin]
+            self.indicesPerBin = {cluster: np.array(indicesPerBin[cluster], dtype = 'uintc') for cluster in range(len(indicesPerBin))}
+            self.clusterSizes = pd.Series(clusterSizes)

         #---

         self.yTrain = y
         self.yPredTrain = yPred
-        self.indicesPerBin = indicesPerBin
+        self.centers = centers
+        self.kmeans = kmeans
         self.fitted = True
-
-
-    #---
-
-    def _getEqualSizedClusters(self,
-                               y,
-                               ):
-
-        centers = self.centers.reshape(-1, 1, y.shape[-1]).repeat(self.binSize, 1).reshape(-1, y.shape[-1])
-
-        distance_matrix = cdist(y, centers)
-        clusterAssignments = linear_sum_assignment(distance_matrix)[1]//self.binSize
-
-        return clusterAssignments

     #---

@@ -482,16 +468,12 @@ def getWeights(self,
             yPred = yPred.reshape(-1, 1)

         #---
-
-        if self.equalBins:
-            binPerPred = self._getEqualSizedClusters(y = yPred)
-
-        else:
-            binPerPred = self.kmeans.query(yPred)[1]
+
+        clusterPerPred = self.kmeans.assign(yPred)[1]

         #---

-        neighborsList = [self.indicesPerBin[binIndex] for binIndex in binPerPred]
+        neighborsList = [self.indicesPerBin[cluster] for cluster in clusterPerPred]

         weightsDataList = [(np.repeat(1 / len(neighbors), len(neighbors)), np.array(neighbors)) for neighbors in neighborsList]
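The core of the new fit() logic is the vectorised merging of undersized clusters: rank all centroids by distance to each too-small cluster's own center, take the cumulative sum of the neighbours' sizes along each row, and stop at the first position where the running total reaches minClusterSize. Below is a self-contained toy sketch of just that step, using only numpy and scipy's KDTree (which the module already imports); the centers, sizes, and threshold are made-up illustration values, not library state:

```python
# Toy sketch of the merge-small-clusters step from fit() (illustrative numbers only).
import numpy as np
from scipy.spatial import KDTree

minClusterSize = 3
centers = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
clusterSizes = np.array([1, 4, 2, 5])                      # clusters 0 and 2 are too small
indicesPerBin = [[0], [1, 2, 3, 4], [5, 6], [7, 8, 9, 10, 11]]

clustersTooSmall = np.where(clusterSizes < minClusterSize)[0]

# Rank ALL centers by distance to each small cluster's center. k equals the number of
# clusters because we don't know in advance how many neighbours are needed; column 0
# is always the small cluster itself (distance 0).
nearestClusters = KDTree(centers).query(centers[clustersTooSmall], k = centers.shape[0])[1]

# Running total of sizes while walking outwards; argmax picks the first column where
# the cumulative sum reaches minClusterSize.
clusterSizesCumSum = np.cumsum(clusterSizes[nearestClusters], axis = 1)
necessaryClusters = (clusterSizesCumSum >= minClusterSize).argmax(axis = 1)

for i, cluster in enumerate(clustersTooSmall):
    clustersToAdd = nearestClusters[i, 0:necessaryClusters[i] + 1]
    merged = np.concatenate([indicesPerBin[c] for c in clustersToAdd])
    print(f"cluster {cluster}: merged with {clustersToAdd.tolist()} -> indices {merged.tolist()}")
```

Cluster 0 (size 1) absorbs cluster 1 and ends at size 5; cluster 2 (size 2) absorbs cluster 3 and ends at size 7. Because each merge stops at the first cluster that pushes the total past the threshold, a merged bin can never exceed minClusterSize - 1 plus the largest original cluster, which is exactly the sanity check guarded by the raise in the diff above.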
diff --git a/nbs/02_levelSetKDEx_multivariate.ipynb b/nbs/02_levelSetKDEx_multivariate.ipynb
index 0104c6c..e9f04d2 100644
--- a/nbs/02_levelSetKDEx_multivariate.ipynb
+++ b/nbs/02_levelSetKDEx_multivariate.ipynb
@@ -460,88 +460,74 @@
     "    kmeans.train(yPredMod)\n",
     "\n",
     "    # Get cluster centers created by faiss. IMPORTANT NOTE: not all clusters are used! We will handle that further below.\n",
-    "    centersAll = kmeans.centroids\n",
+    "    centers = kmeans.centroids\n",
+    "    clusters = np.arange(centers.shape[0])\n",
     "    \n",
     "    # Compute the cluster assignment for each sample\n",
-    "    if self.equalBins:\n",
-    "        clusterAssignments = self._getEqualSizedClusters(y = yPredMod) \n",
-    "    else:\n",
-    "        clusterAssignments = kmeans.assign(yPredMod)[1]\n",
+    "    clusterAssignments = kmeans.assign(yPredMod)[1]\n",
     "    \n",
     "    # Based on the clusters and cluster assignments, we can now compute the indices belonging to each bin / cluster\n",
-    "    indicesPerBin = defaultdict(list)\n",
-    "    binSizes = defaultdict(int)\n",
+    "    indicesPerBin = [[] for i in range(self.nClusters)]\n",
+    "    clusterSizes = [0 for i in range(self.nClusters)]\n",
     "\n",
     "    for index, cluster in enumerate(clusterAssignments):\n",
     "        indicesPerBin[cluster].append(index)\n",
-    "        binSizes[cluster] += 1\n",
+    "        clusterSizes[cluster] += 1\n",
     "\n",
-    "    #---\n",
+    "    clusterSizes = np.array(clusterSizes)\n",
     "\n",
-    "    clustersUsed = np.array(list(indicesPerBin.keys()))\n",
-    "    clustersOrdered = np.sort(clustersUsed)\n",
-    "\n",
-    "    centers = centersAll[clustersOrdered]\n",
-    "    indicesPerBin = [indicesPerBin[cluster] for cluster in clustersOrdered]\n",
-    "    binSizes = np.array([binSizes[cluster] for cluster in clustersOrdered])\n",
+    "    # Just needed for a check in the end\n",
+    "    maxSizeOfExistingClusters = np.max(clusterSizes)\n",
     "\n",
     "    #---\n",
     "\n",
-    "    # Merge clusters that are too small (i.e. contain less than binSize / 2 samples).\n",
     "    # clustersTooSmall is the array of all clusters that are too small.\n",
-    "    threshold = self.binSize / 2\n",
-    "    binsTooSmall = np.where(binSizes < threshold)[0]\n",
+    "    clustersTooSmall = np.where(np.array(clusterSizes) < self.minClusterSize)[0]\n",
     "    \n",
-    "    if len(binsTooSmall) > 0:\n",
+    "    if len(clustersTooSmall) > 0:\n",
+    "    \n",
+    "        indicesPerBinNew = copy.deepcopy(indicesPerBin)\n",
     "\n",
-    "        # remove all centers from centersOld that are part of clustersTooSmall\n",
-    "        centersNew = np.delete(centers, binsTooSmall, axis = 0)\n",
-    "        centersTooSmall = centers[binsTooSmall]\n",
-    "        centersNew_oldIndices = np.delete(np.arange(len(centers)), binsTooSmall)\n",
+    "        # We are searching for the closest other cluster for each cluster that is too small\n",
+    "        # As we don't know how many nearest neighbors we need, we are setting k to the number of clusters\n",
+    "        nearestClusters = KDTree(centers).query(centers[clustersTooSmall], k = centers.shape[0])[1]\n",
     "\n",
-    "        KDTreeNew = KDTree(centersNew)\n",
-    "        clustersToMerge = KDTreeNew.query(centersTooSmall)[1]\n",
+    "        # sizeNearestClusters is an array of shape (len(clustersTooSmall), self.nClusters)\n",
+    "        sizeNearestClusters = clusterSizes[nearestClusters]\n",
     "\n",
-    "        for i, clusterToMerge in enumerate(clustersToMerge):\n",
-    "            indicesPerBin[centersNew_oldIndices[clusterToMerge]].extend(indicesPerBin[binsTooSmall[i]])\n",
+    "        # Calculating the cumulative sum of the cluster sizes over each row allows us to find out \n",
+    "        # which cluster is the first one that is big enough to make the current cluster big enough\n",
+    "        clusterSizesCumSum = np.cumsum(sizeNearestClusters, axis = 1)\n",
     "\n",
-    "        # Remove the indices given by clustersTooSmall from indicesPerBin by deleting the list entry\n",
-    "        indicesPerBin = [np.array(indices) for binIndex, indices in enumerate(indicesPerBin) if binIndex not in binsTooSmall]\n",
-    "        binSizes = [len(indices) for indices in indicesPerBin]\n",
-    "        binSizes = pd.Series(binSizes)\n",
+    "        # argmax returns the first index where the condition is met.\n",
+    "        necessaryClusters = (clusterSizesCumSum >= self.minClusterSize).argmax(axis = 1)\n",
+    "        \n",
+    "        # We are now creating the new indicesPerBin list by extending the indices of the clusters that are too small\n",
+    "        for i, cluster in enumerate(clustersTooSmall):\n",
+    "            clustersToAdd = nearestClusters[i, 0:necessaryClusters[i] + 1]\n",
+    "            \n",
+    "            indicesPerBinNew[cluster] = np.concatenate([indicesPerBin[cluster] for cluster in clustersToAdd])\n",
+    "            clusterSizes[cluster] = len(indicesPerBinNew[cluster])\n",
     "\n",
-    "        self.centers = centersNew\n",
-    "        self.binSizes = binSizes\n",
-    "        self.kmeans = KDTreeNew\n",
+    "            # Following our intended logic, the resulting clusters can't be bigger than minClusterSize + maxSizeOfExistingClusters\n",
+    "            if len(indicesPerBinNew[cluster]) > self.minClusterSize + maxSizeOfExistingClusters:\n",
+    "                raise Warning(\"The cluster size is bigger than minClusterSize + maxSizeOfExistingClusters. This should not happen!\")\n",
+    "\n",
+    "        # indicesPerBin is only turned into a dictionary to be consistent with the other implementations of LevelSetKDEx\n",
+    "        self.indicesPerBin = {cluster: np.array(indicesPerBinNew[cluster], dtype = 'uintc') for cluster in range(len(indicesPerBinNew))}\n",
+    "        self.clusterSizes = pd.Series(clusterSizes)\n",
     "    \n",
     "    else:\n",
-    "        self.centers = centers\n",
-    "        self.binSizes = pd.Series(binSizes)\n",
-    "        self.kmeans = KDTree(self.centers)\n",
-    "\n",
-    "    # Transform the indices given by indicesPerBin into numpy arrays\n",
-    "    indicesPerBin = [np.array(indices) for indices in indicesPerBin]\n",
+    "        self.indicesPerBin = {cluster: np.array(indicesPerBin[cluster], dtype = 'uintc') for cluster in range(len(indicesPerBin))}\n",
+    "        self.clusterSizes = pd.Series(clusterSizes)\n",
     "    \n",
     "    #---\n",
     "    \n",
     "    self.yTrain = y\n",
     "    self.yPredTrain = yPred\n",
-    "    self.indicesPerBin = indicesPerBin\n",
+    "    self.centers = centers\n",
+    "    self.kmeans = kmeans\n",
     "    self.fitted = True\n",
-    "    \n",
-    "    \n",
-    "    #---\n",
-    "    \n",
-    "    def _getEqualSizedClusters(self,\n",
-    "                               y,\n",
-    "                               ):\n",
-    "        \n",
-    "        centers = self.centers.reshape(-1, 1, y.shape[-1]).repeat(self.binSize, 1).reshape(-1, y.shape[-1])\n",
-    "\n",
-    "        distance_matrix = cdist(y, centers)\n",
-    "        clusterAssignments = linear_sum_assignment(distance_matrix)[1]//self.binSize\n",
-    "\n",
-    "        return clusterAssignments\n",
     "\n",
     "    #---\n",
     "    \n",
@@ -570,16 +556,12 @@
     "        yPred = yPred.reshape(-1, 1)\n",
     "    \n",
     "    #---\n",
-    "    \n",
-    "    if self.equalBins:\n",
-    "        binPerPred = self._getEqualSizedClusters(y = yPred)\n",
-    "    \n",
-    "    else:\n",
-    "        binPerPred = self.kmeans.query(yPred)[1]\n",
+    "\n",
+    "    clusterPerPred = self.kmeans.assign(yPred)[1]\n",
     "    \n",
     "    #---\n",
     "    \n",
-    "    neighborsList = [self.indicesPerBin[binIndex] for binIndex in binPerPred]\n",
+    "    neighborsList = [self.indicesPerBin[cluster] for cluster in clusterPerPred]\n",
     "    \n",
     "    weightsDataList = [(np.repeat(1 / len(neighbors), len(neighbors)), np.array(neighbors)) for neighbors in neighborsList]\n",
     "    \n",
@@ -657,7 +639,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "WARNING clustering 1446 points to 723 centroids: please provide at least 28197 training points\n"
+     "WARNING clustering 1446 points to 100 centroids: please provide at least 3900 training points\n"
     ]
    }
   ],
@@ -665,28 +647,20 @@
    "# RF = RandomForestRegressor(n_estimators = 10, n_jobs = 1)\n",
    "# RF.fit(X = XTrain, y = yTrain)\n",
    "\n",
-   "# LSKDEx = LevelSetKDEx_multivariate(estimator = RF, binSize = 2, equalBins = False)\n",
+   "# LSKDEx = LevelSetKDEx_multivariate_opt(estimator = RF, nClusters = 100, minClusterSize = 20)\n",
    "# LSKDEx.fit(X = XTrain, y = yTrain)\n",
    "\n",
-   "# weightsDataList = LSKDEx.getWeights(X = XTest, outputType='summarized')"
+   "# yPred = LSKDEx.estimator.predict(XTest).astype(np.float32)\n",
+   "# clusters = LSKDEx.kmeans.assign(yPred)[1]\n",
+   "\n",
+   "# weightsDataList = LSKDEx.getWeights(X = XTest, outputType='onlyPositiveWeights')"
   ]
  },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "data": {
-    "text/plain": [
-     "array([404, 496, 114, ..., 257, 430, 149])"
-    ]
-   },
-   "execution_count": null,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "# centers = LSKDEx.centers\n",
   "# yPred = LSKDEx.yPredTrain\n",
@@ -701,8 +675,25 @@
  "execution_count": null,
  "id": "10e14eb7",
  "metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "[20 20 20 20 30 28 38 22 30 20 36 36 22 38]\n",
+    "20\n",
+    "48\n"
+   ]
+  }
+ ],
+ "source": [
+  "# nPosValues = np.array([len(weightsDataList[i][0]) for i in range(len(weightsDataList))])\n",
+  "# print(nPosValues)\n",
+  "\n",
+  "# lenIndices = np.array([len(LSKDEx.indicesPerBin[i]) for i in range(len(LSKDEx.indicesPerBin))])\n",
+  "# print(min(lenIndices))\n",
+  "# print(max(lenIndices))"
+ ]
 }
],
"metadata": {
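The last notebook cell above verifies the invariant the merging step is meant to guarantee: with minClusterSize = 20, every bin holds at least 20 training indices (the recorded run shows min 20, max 48), and getWeights() returns uniform weights over each assigned bin. Continuing the illustrative usage sketch from the top of this patch (LSKDEx and weightsDataList are the hypothetical objects defined there):

```python
# Sketch of the invariant checks from the final notebook cell (continues the earlier sketch).
import numpy as np

# indicesPerBin is a dict keyed 0..nClusters-1 after fit(); every bin must hold
# at least minClusterSize (= 20 in the sketch above) training indices.
lenIndices = np.array([len(LSKDEx.indicesPerBin[i]) for i in range(len(LSKDEx.indicesPerBin))])
assert lenIndices.min() >= 20

# For any test sample, the returned weights are uniform over its bin's indices.
weights, indices = weightsDataList[0]
assert len(weights) == len(indices)
assert np.allclose(weights, 1.0 / len(indices))
```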