Skip to content

Commit

Permalink
improve code readability
Browse files Browse the repository at this point in the history
  • Loading branch information
Pantzan committed Feb 17, 2019
1 parent 6274368 commit 2861fea
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 19 deletions.
15 changes: 9 additions & 6 deletions Ncv.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
from PythonKnn import PythonKnn

def NestedCrossVal(X,y,foldK,nns,dists,mySeed):

np.random.seed(mySeed)
accuracy_fold=[]
accuracy_fold = []
indices = np.random.permutation(X.shape[0])
bins=np.array_split(indices, foldK)
bins = np.array_split(indices, foldK)
sum_matrix = []
assert(foldK==len(bins))
assert(foldK == len(bins))

for i in range(0,foldK):
foldTrain=[]
foldTest=[]
foldVal=[]

# split the X to train, test and validation folds
foldTest.extend(bins[i])

if(i != foldK-1):
foldVal.extend(bins[i+1])
else:
foldVal.extend(bins[0])

# don't add duplicates
for j in range(0,len(bins)):
if(i != j and i+1 != j):
if(i != j and i + 1 != j):
if(i == foldK-1 and j == 0):
continue
foldTrain.extend(bins[j])
Expand All @@ -35,7 +37,8 @@ def NestedCrossVal(X,y,foldK,nns,dists,mySeed):
bestAccuracy=-10

# loop over all available parameters and find the best ones to train the model
for d in range(0, len(dists)):
for d in range(0, len(dists)):

for k in nns:
knn = PythonKnn(k, dists[d])
knn.fit(X[foldTrain], y[foldTrain])
Expand Down
32 changes: 25 additions & 7 deletions PythonKnn.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import numpy as np
class PythonKnn:
import math
import operator
from collections import Counter

class PythonKnn(object):
# initialize the object and its variables
def __init__(self, n, metric):

self.predicted = []
self.X = None
self.y = None
Expand All @@ -10,46 +15,51 @@ def __init__(self, n, metric):

#Fit function to get X and y from IRIS dataset.
def fit(self, Xx_, yy_):
    """Store the training samples and their labels on the instance.

    Xx_ -- training samples (anything supporting ``len``).
    yy_ -- labels, one per sample.

    Raises ValueError when the number of samples and labels differ.
    """
    sample_count, label_count = len(Xx_), len(yy_)
    if sample_count != label_count:
        raise ValueError("Data not equal in size with labels")

    # No copy is made: the instance keeps references to the caller's objects.
    self.X, self.y = Xx_, yy_

# Get the accuracy of the model and raise errors if the array sizes are wrong
def accuracy(self, y_test,y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_test``.

    y_test -- true labels.
    y_pred -- predicted labels, same length as ``y_test``.

    Raises ValueError when the two inputs differ in length or are empty.
    """
    if len(y_test) != len(y_pred):
        raise ValueError("Prediction and labels are not equal in size")
    if len(y_test) == 0:
        # Avoid an opaque ZeroDivisionError further down.
        raise ValueError("Cannot compute accuracy of empty arrays")

    # Coerce to arrays so the comparison is element-wise even when both
    # inputs are plain lists (list == list yields a single bool).
    matches = np.asarray(y_test) == np.asarray(y_pred)

    return float(np.count_nonzero(matches)) / len(y_test)

# Get the Euclidean distance between two arrays by zipping them, the Pythonic way
def euclideanDistance(self, in1,in2):
    """Return the Euclidean (L2) distance between ``in1`` and ``in2``.

    The inputs are walked in lockstep with ``zip``, so any coordinates
    beyond the shorter input are ignored.
    """
    import math

    squared_total = 0
    for left, right in zip(in1, in2):
        squared_total += (left - right) ** 2

    return math.sqrt(squared_total)

# Get the absolute distance from two arrays for experiment purposes
def absoluteDistance(self, in1, in2):
    """Return the sum of absolute coordinate differences (L1 distance).

    The absolute value is taken per coordinate before summing. Taking it
    after summing lets positive and negative differences cancel, which
    reports a distance of 0 for points that are not identical.

    The inputs are walked in lockstep with ``zip``, so any coordinates
    beyond the shorter input are ignored.
    """
    return float(sum(abs(a - b) for a, b in zip(in1, in2)))

# check the distance from all points
def getNeighbours(self, x_):

dists = []
X = self.X
metric = self.metric

for i in range(0, X.shape[0]-1):

if metric == 'euclidean':
e = (( i, self.euclideanDistance(X[i], x_) ))
elif metric == 'absolute':
e = (( i, self.absoluteDistance(X[i], x_) ))
else:
raise ValueError("I don't understand this metric. Please choose 'euclidean' or 'absolute'" )
# print(X[i], x_, e[1])
dists.append(e)

# sort the tuple by the distance index
import operator

s = dists.copy()
s.sort(key=operator.itemgetter(1))
# get the n neighbours
Expand All @@ -59,23 +69,25 @@ def getNeighbours(self, x_):

# append to the array the calculated label
self.assignLabel(label)

return label

# get the labels from n neighbours
def getLabel(self, neighbours):
from collections import Counter

indexes = []
a_labels = []
labels = []

# check the distance of the first item in the array of neighbours.
# If the distance is zero, we hit the jackpot and return this label as a 100% correct prediction
first = neighbours[0]
if (first[1]) == 0:

label = (self.y[first[0]],0)
# get the majority of the labels and return it as the calculated label
else:
for i in range(0, len(neighbours)):

index = neighbours[i][0]
a_labels.append(self.y[index])

Expand All @@ -86,23 +98,29 @@ def getLabel(self, neighbours):

# append to predicted array the correct label for each index of x test
def assignLabel(self, label):
    """Append ``label`` to the stored predictions and return that list.

    The returned object is the instance's own ``predicted`` list, not a copy.
    """
    collected = self.predicted
    collected.append(label)
    return collected

# the main function. Calls all the helper Knn functions and returns the predicted array
# It also checks for an empty test array or one larger than the training data
def predict(self, x_):
    """Predict a label for every sample in ``x_`` and return the predictions.

    x_ -- test samples; must be non-empty and no longer than the training data.

    Raises ValueError when ``x_`` is empty or has more samples than the
    training data stored on the instance.
    """
    if len(x_) > len(self.X):
        raise ValueError("Test array is larger than the training data")
    if len(x_) == 0:
        raise ValueError("Test array is empty")

    # Start from a clean list so a second call does not return the previous
    # call's predictions concatenated with the new ones.
    self.predicted = []

    for sample in x_:
        # getNeighbours records the label for this sample in self.predicted.
        self.getNeighbours(sample)

    return self.predicted

# calculates the confusion matrix according to the prediction.
def confMat(self, y_test_,y_pred_,classno):
    """Build a ``classno`` x ``classno`` confusion matrix.

    y_test_ -- true labels, used as row indices.
    y_pred_ -- predicted labels, used as column indices.
    classno -- number of classes; labels must be valid indices below it.

    Returns a float NumPy array where entry ``[t, p]`` counts the samples
    with true label ``t`` that were predicted as ``p``.
    """
    # Size the matrix from classno instead of a hard-coded 9 cells, which
    # could only be reshaped when classno was exactly 3.
    matrix = np.zeros((classno, classno))

    for true_label, predicted_label in zip(y_test_, y_pred_):
        matrix[true_label, predicted_label] += 1

    return matrix
14 changes: 8 additions & 6 deletions TestKnn.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from PythonKnn import PythonKnn
# Load the IRIS dataset, as in the labs
# %matplotlib inline

# uncomment this if you use jupyter
# %matplotlib inline

from sklearn import datasets
import numpy as np
Expand All @@ -18,7 +19,6 @@


iris = datasets.load_iris()

#view a description of the dataset (uncomment next line to do so)
#print(iris.DESCR)

Expand All @@ -40,10 +40,12 @@
bins=np.array_split(indices,2) # we just need a training and testing set here
foldTrain=bins[0]
foldTest=bins[1]
knn=PythonKnn(10,'euclidean')
knn.fit(X[foldTrain],y[foldTrain])

knn=PythonKnn(10, 'euclidean')
knn.fit(X[foldTrain], y[foldTrain])
y_pred=knn.predict(X[foldTest])

a = np.where(y_pred != y[foldTest])
print(knn.accuracy(y[foldTest],y_pred))
# print(knn.confMat())
print("accuracy: ", knn.accuracy(y[foldTest],y_pred))

# print(knn.confMat(y[foldTest],y_pred,len(np.unique(y))))

0 comments on commit 2861fea

Please sign in to comment.