diff --git a/Ncv.py b/Ncv.py
index a43ece0..7b042ab 100644
--- a/Ncv.py
+++ b/Ncv.py
@@ -1,19 +1,22 @@
 from PythonKnn import PythonKnn
+import numpy as np # needed for np.random and np.array_split below
 
 def NestedCrossVal(X,y,foldK,nns,dists,mySeed):
+    np.random.seed(mySeed)
-    accuracy_fold=[]
+    accuracy_fold = []
     indices = np.random.permutation(X.shape[0])
-    bins=np.array_split(indices, foldK)
+    bins = np.array_split(indices, foldK)
     sum_matrix = []
-    assert(foldK==len(bins))
+    assert(foldK == len(bins))
+
     for i in range(0,foldK):
         foldTrain=[]
         foldTest=[]
         foldVal=[]
-
         # split X into train, test and validation folds
         foldTest.extend(bins[i])
+
         if(i != foldK-1):
             foldVal.extend(bins[i+1])
         else:
@@ -21,7 +23,7 @@ def NestedCrossVal(X,y,foldK,nns,dists,mySeed):
         # don't add duplicates
         for j in range(0,len(bins)):
-            if(i != j and i+1 != j):
+            if(i != j and i + 1 != j):
                 if(i == foldK-1 and j == 0):
                     continue
                 foldTrain.extend(bins[j])
@@ -35,7 +37,8 @@ def NestedCrossVal(X,y,foldK,nns,dists,mySeed):
         bestAccuracy=-10
         # loop over all available parameters and find the best to train the model
-        for d in range(0, len(dists)):
+        for d in range(0, len(dists)):
+
             for k in nns:
                 knn = PythonKnn(k, dists[d])
                 knn.fit(X[foldTrain], y[foldTrain])
diff --git a/PythonKnn.py b/PythonKnn.py
index d7cdbef..3b8bff5 100644
--- a/PythonKnn.py
+++ b/PythonKnn.py
@@ -1,7 +1,12 @@
 import numpy as np
-class PythonKnn:
+import math
+import operator
+from collections import Counter
+
+class PythonKnn(object):
     # initialize the object and its variables
     def __init__(self, n, metric):
+        self.predicted = []
         self.X = None
         self.y = None
@@ -10,6 +15,7 @@ def __init__(self, n, metric):
 
     # Fit function to store X and y from the IRIS dataset.
     def fit(self, Xx_, yy_):
+
         if(len(Xx_) != len(yy_)):
             raise ValueError("Data not equal in size with labels")
         self.X = Xx_
@@ -17,39 +23,43 @@ def fit(self, Xx_, yy_):
 
     # Get the accuracy of the model and raise errors if the array sizes are wrong
     def accuracy(self, y_test,y_pred):
+
         if(len(y_test) != len(y_pred)):
             raise ValueError("Prediction and labels are not equal in size")
         a = np.where(y_test == y_pred)
         accuracy = (float(len(a[0])) / len(y_test))
+
         return accuracy
 
     # Get the Euclidean distance between two arrays by zipping them, the Pythonic way
     def euclideanDistance(self, in1,in2):
-        import math
+
         return math.sqrt(sum([(a - b) ** 2 for a, b in zip(in1, in2)]))
 
     # Get the absolute distance between two arrays, for experimental purposes
     def absoluteDistance(self, in1, in2):
+
         return float(abs(sum([(a - b) for a, b in zip(in1, in2)])))
 
     # compute the distance from x_ to every training point
     def getNeighbours(self, x_):
+
         dists = []
         X = self.X
         metric = self.metric
-        for i in range(0, X.shape[0]-1):
+        for i in range(0, X.shape[0]): # cover every training point; shape[0]-1 skipped the last one
+
             if metric == 'euclidean':
                 e = (( i, self.euclideanDistance(X[i], x_) ))
             elif metric == 'absolute':
                 e = (( i, self.absoluteDistance(X[i], x_) ))
             else:
                 raise ValueError("I don't understand this metric. Please choose 'euclidean' or 'absolute'")
-# print(X[i], x_, e[1])
             dists.append(e)
         # sort the tuples by their distance value
-        import operator
+
         s = dists.copy()
         s.sort(key=operator.itemgetter(1))
         # get the n neighbours
@@ -59,23 +69,25 @@ def getNeighbours(self, x_):
 
         # append the calculated label to the array
         self.assignLabel(label)
+
         return label
 
     # get the labels from the n neighbours
     def getLabel(self, neighbours):
-        from collections import Counter
+
        indexes = []
        a_labels = []
        labels = []
-
        # check the distance of the first item in the neighbours array.
        # If that distance is zero we have an exact match, so return its label as a 100% confident prediction
        first = neighbours[0]
        if (first[1]) == 0:
+
            label = (self.y[first[0]],0)
        # otherwise take the majority of the labels and return it as the calculated label
        else:
            for i in range(0, len(neighbours)):
+
                index = neighbours[i][0]
                a_labels.append(self.y[index])
@@ -86,23 +98,29 @@ def getLabel(self, neighbours):
 
     # append the calculated label to the predicted array for each index of the test set
     def assignLabel(self, label):
+
         self.predicted.append(label)
+
         return self.predicted
 
     # The main entry point: calls all the helper kNN functions and returns the predicted array.
     # It also checks for empty or size-mismatched arrays
     def predict(self, x_):
+
         if (len(x_) > len(self.X)):
             raise ValueError("Test array is larger than the training data")
         elif (len(x_) == 0):
             raise ValueError("Test array is empty")
         for i in x_:
             self.getNeighbours(i)
+
         return self.predicted
 
     # calculates the confusion matrix according to the prediction.
     def confMat(self, y_test_,y_pred_,classno):
+
-        matrix = np.zeros(9).reshape(classno,classno)
+        matrix = np.zeros((classno, classno)) # size by classno instead of hardcoding 9 entries
         for i in range(0,y_test_.size):
-            matrix[y_test_[i], y_pred_[i]] += 1
+            matrix[y_test_[i], y_pred_[i]] += 1
+        return matrix
\ No newline at end of file
diff --git a/TestKnn.py b/TestKnn.py
index a55e7da..bb1bbc0 100644
--- a/TestKnn.py
+++ b/TestKnn.py
@@ -1,7 +1,8 @@
 from PythonKnn import PythonKnn
 
 # Load the IRIS dataset, as in the labs
-# %matplotlib inline
+# uncomment this if you use jupyter
+# %matplotlib inline
 from sklearn import datasets
 import numpy as np
@@ -18,7 +19,6 @@
 iris = datasets.load_iris()
-
 # view a description of the dataset (uncomment next line to do so)
 #print(iris.DESCR)
@@ -40,10 +40,12 @@
 bins=np.array_split(indices,2) # we just need a training and testing set here
 foldTrain=bins[0]
 foldTest=bins[1]
-knn=PythonKnn(10,'euclidean')
-knn.fit(X[foldTrain],y[foldTrain])
+
+knn=PythonKnn(10, 'euclidean')
+knn.fit(X[foldTrain], y[foldTrain])
 y_pred=knn.predict(X[foldTest])
+
 a = np.where(y_pred != y[foldTest])
-print(knn.accuracy(y[foldTest],y_pred))
-# print(knn.confMat())
+print("accuracy: ", knn.accuracy(y[foldTest],y_pred))
+
+# print(knn.confMat(y[foldTest],y_pred,len(np.unique(y))))
\ No newline at end of file
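
For reference, a minimal sketch of how the refactored pieces might be driven end to end, in the same spirit as TestKnn.py. It is illustrative only: the fold count, the nns neighbour grid, the dists metric list and the seed below are placeholder values rather than anything taken from the diff, and since the hunks above do not show what NestedCrossVal returns, the call is left bare.

# usage sketch -- not part of the patch above
from sklearn import datasets
from Ncv import NestedCrossVal

iris = datasets.load_iris()
X = iris.data
y = iris.target

nns = [1, 3, 5, 10]               # candidate neighbour counts (illustrative)
dists = ['euclidean', 'absolute'] # the two metrics PythonKnn understands

# outer loop over 5 folds; the inner loops try every (k, metric) pair
# on the train/validation split and keep the best-scoring model
NestedCrossVal(X, y, 5, nns, dists, 42)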