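"""project2.py

Cuisine predictor / recipe recommender: build TF-IDF + truncated-SVD (LSA)
features from recipe ingredient lists, cluster them with k-means, label each
cluster by majority cuisine, then report the predicted cuisine and the N
nearest recipes for a new ingredient list.
"""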
import argparse
import json
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer


def list_to_sent(lst):
    """Join a list of ingredient strings into one comma-separated string."""
    return ', '.join(lst)


def recommender(N, ingredient):
    # Load the recipe data.
    with open('docs/yummly.json', 'r') as f:
        recipes = json.load(f)
    rec_df = pd.json_normalize(recipes)  # already returns a DataFrame
    X = [list_to_sent(ing) for ing in rec_df['ingredients']]  # documents
    y = list(rec_df['cuisine'])   # cuisine labels
    labs = list(rec_df['id'])     # recipe ids
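    # docs/yummly.json is expected to hold a list of recipe objects with 'id',
    # 'cuisine', and 'ingredients' keys. Each ingredient list is flattened into
    # one comma-separated string so TfidfVectorizer can treat a recipe as a
    # single text document.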
    # Pre-process: TF-IDF vectorize, reduce to 100 dimensions with truncated
    # SVD (latent semantic analysis), then L2-normalize each row.
    lsa = make_pipeline(
        TfidfVectorizer(max_df=0.75, min_df=5, stop_words='english'),
        TruncatedSVD(n_components=100),
        Normalizer(copy=False))
    X_lsa = lsa.fit_transform(X)
    kmeans = KMeans(n_clusters=20, max_iter=100, n_init=1)
    kmeans.fit(X_lsa)
    center_pts = kmeans.cluster_centers_
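    # On L2-normalized rows, Euclidean distance is monotonic in cosine
    # distance, so k-means here approximates cosine-based clustering. Note that
    # n_init=1 with no random_state makes the clustering run-to-run dependent;
    # passing random_state to KMeans would make results reproducible.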
    # Label each cluster with its most common cuisine (majority vote).
    cluster_label = [[] for _ in range(20)]
    cluster_dict = dict()
    for clust, cuis in zip(kmeans.labels_, y):
        cluster_label[clust].append(cuis)
    for clust, cuisines in enumerate(cluster_label):
        value, count = Counter(cuisines).most_common(1)[0]
        cluster_dict[clust] = value
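    # Ties in the vote are broken by first appearance (most_common sorts
    # stably), and scikit-learn's k-means relocates empty clusters during
    # fitting, so every cluster_label list should be non-empty here.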
    # Predict the cluster for the new ingredient list.
    new_recipe = [list_to_sent(ingredient)]
    new_point = lsa.transform(new_recipe)
    pred_clust = kmeans.predict(new_point)
    # Distance from the new point to the centroid of its assigned cluster.
    cent_dist = np.linalg.norm(new_point - center_pts[pred_clust])
    # Find the N closest neighbors within that cluster:
    # indices of the training points assigned to the predicted cluster,
    pts_in_clust = (kmeans.labels_ == pred_clust[0]).nonzero()[0]
    # their LSA coordinates and recipe ids,
    clust_pts = X_lsa[pts_in_clust]
    clust_id = np.array(labs)[pts_in_clust]
    # Euclidean distance from the new point to every point in the cluster.
    distances = np.linalg.norm(clust_pts - new_point, axis=1)
    # argsort gives the indices of the N smallest distances.
    idx = np.argsort(distances)[:int(N)]
    # Pull the actual distances and ids at those indices.
    distances = distances[idx]
    labels = clust_id[idx]
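    # The scores reported below are raw Euclidean distances in LSA space
    # (lower means more similar); they are not normalized to [0, 1].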
    # Assemble the result: predicted cuisine, distance to its centroid, and
    # the N nearest recipes with their distances.
    cuisine = cluster_dict[pred_clust[0]]
    nbr_dist = []
    for rid, dist in zip(labels, distances):
        nbr_dist.append({'id': str(rid), 'score': float(round(dist, 2))})
    final_dict = {'cuisine': str(cuisine),
                  'score': float(round(cent_dist, 2)),
                  'closest': nbr_dist}
    return final_dict
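

# The returned dict serializes to JSON of the form (values illustrative):
# {"cuisine": "italian", "score": 0.45,
#  "closest": [{"id": "12345", "score": 0.31}, ...]}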

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--N', required=True, type=int,
                        help="Number of nearest neighbors to return")
    parser.add_argument('--ingredient', action='append', required=True,
                        help="An ingredient; repeat the flag for each one")
    args = parser.parse_args()
    result = recommender(args.N, args.ingredient)
    formatted = json.dumps(result, indent=4)
    print(formatted)
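# Example invocation (ingredient values are illustrative):
#   python project2.py --N 5 --ingredient paprika --ingredient "chili powder"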