-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculate_similarities.py
77 lines (53 loc) · 2.29 KB
/
calculate_similarities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
Calculate similarities. Mess around with the gensim stuff to implement different similarity algorithms. Change the alg= var from 'main' so those similarities
can be pulled up when requested.
"""
import os
import MySQLdb
from gensim import corpora, models, similarities
def convert_recip(val):
    """Convert a recipient id between its CRP string form and a numeric form.

    The mapping only touches the first character:

    * integer in  -> string out: a leading ``1`` becomes ``N`` and a leading
      ``2`` becomes ``C``; any other leading digit is left unchanged.
    * string in   -> integer out: a leading ``N`` becomes ``1`` and a leading
      ``C`` becomes ``2``; the result is passed through ``int()`` so it can be
      used as a numeric feature id for gensim.

    Args:
        val: either an integer (numeric form) or a string (CRP form).

    Returns:
        ``str`` when ``val`` is an integer, ``int`` when ``val`` is a string.

    Raises:
        ValueError: if a string input is not numeric after the prefix swap
            (including the empty string).
    """
    # Local import keeps the block self-contained; Integral also matches
    # Python 2 ``long`` values, which ``type(val) == int`` would miss.
    import numbers

    if isinstance(val, numbers.Integral):
        recip = str(val)
        if recip.startswith('1'):
            recip = 'N' + recip[1:]
        elif recip.startswith('2'):
            recip = 'C' + recip[1:]
        return recip

    recip = val
    if recip.startswith('N'):
        recip = '1' + recip[1:]
    elif recip.startswith('C'):
        recip = '2' + recip[1:]
    return int(recip)
# Load every row of gensim_2_contrib_recip, ordered by contributor so that all
# rows belonging to one contributor are adjacent.
conn = MySQLdb.connect(db='FEC')
cursor = conn.cursor()
statement = "SELECT * FROM gensim_2_contrib_recip ORDER BY contribidshort"
cursor.execute(statement)
res = cursor.fetchall()

corpus = []  # recipient sets per person, as gensim-style [(feature_id, weight), ...]
contrib_lookup = []  # index of those people's ids in the same order as corpus
recip_lookup = []  # distinct numeric recipient ids, in first-seen order
_seen_recips = set()  # companion set so the de-dup check is O(1), not a list scan

curcontrib = None
currecips = []
# NOTE(review): assumes column 0 is the contributor id and column 1 the
# recipient's CRP id -- the query uses SELECT *, so confirm against the schema.
for r in res:
    if r[0] != curcontrib:
        # Compare against None rather than truthiness so a falsy id (0, '')
        # is still saved as its own contributor.
        if curcontrib is not None:  # done with one guy. save him.
            corpus.append(currecips)
            contrib_lookup.append(curcontrib)
            currecips = []
        curcontrib = r[0]
    recip = convert_recip(r[1])
    # No weighing by date, amount, etc. You either gave or you didn't.
    currecips.append((recip, 1))
    if recip not in _seen_recips:
        _seen_recips.add(recip)
        recip_lookup.append(recip)

# The loop only saves a contributor when the *next* one starts, so the final
# contributor has to be flushed explicitly or it is silently dropped.
if curcontrib is not None:
    corpus.append(currecips)
    contrib_lookup.append(curcontrib)
#statement = "DROP TABLE IF EXISTS gensim_similarity"
#cursor.execute(statement)

# IF NOT EXISTS lets the script be re-run with a different `alg` label and
# append to the same table instead of failing on the second run.
# `rank` is backtick-quoted because it is a reserved word in MySQL 8.
statement = (
    "CREATE TABLE IF NOT EXISTS gensim_similarity "
    "(alg varchar(15), source varchar(15), `rank` int, "
    "target varchar(15), similarity float)"
)
cursor.execute(statement)

alg = 'main'  # label stored with every row; change it per similarity algorithm

#tfidf = models.TfidfModel(corpus)
# NOTE(review): newer gensim releases refuse to build this index without an
# explicit num_features argument -- confirm against the installed version.
similarity = similarities.SparseMatrixSimilarity(corpus,num_best=80)

# Query the index with a document and it returns up to num_best (here 80)
# (document_index, similarity) pairs for the most similar corpus entries.
insert_statement = (
    "INSERT INTO gensim_similarity (alg, source, `rank`, target, similarity) "
    "VALUES (%s, %s, %s, %s, %s)"
)
for i, contrib in enumerate(contrib_lookup):
    print(i)  # progress indicator
    sims = similarity[corpus[i]]
    # Parameterised rows: the driver handles quoting/escaping, which fixes the
    # unquoted `alg` value and avoids spawning a `mysql` process per row.
    rows = [
        (alg, contrib, j, contrib_lookup[target], float(simval))
        for j, (target, simval) in enumerate(sims)
    ]
    if rows:
        cursor.executemany(insert_statement, rows)
        # Commit per contributor so finished work survives an interrupted run.
        conn.commit()