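"""TextProcessor.py

Month-by-month topic modeling of posts with gensim: tokenize and stem each
month's posts, build one global dictionary, convert each month into a TF-IDF
corpus, then fit an LDA model per month, seeding each month's eta prior from
the previous month's topics (weighted by priorweight).

Note: this script targets Python 2 (cPickle, str.decode).
"""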
from __future__ import print_function

import logging
import re

import cPickle  # Python 2 only; the rest of the script also assumes Python 2 strings
import numpy

from gensim import corpora, models
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
def main():
"""Main entry."""
global priorweight
dates = ['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12',
'2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12']
dates = ['2013-01', '2013-02', '2013-03']
numtopics = 40
vocabsize = 2000
priorweight = 0.05
workers = 3
# filterUsers(dates)
createDictionariesFromFiles(dates)
createGlobalDictionaryFromMonthly(dates, vocabsize)
createMonthCorpuses(dates)
#
performTFIDF(dates)
performLDA(dates, numtopics, vocabsize, workers)
# lookupTopics(dates)
def lookatdist(date):
    """Print the topic-word weight matrix (lambda) of a saved per-month LDA model."""
    lda = models.LdaModel.load("ldamodels/" + date + "-lda.model")
    # lda = models.LdaMulticore.load("ldamodels/" + date + "-lda.model")
    print(lda.state.get_lambda())  # lambda lives on the model's state object, not the LdaState class
# run this to keep only the users that appear in every month
def filterUsers(dates):
    """Intersect the per-month user lists and write the result to data/all-month-users.txt."""
    users = None
    for date in dates:
        musers = set()
        for line in open("data/" + str(date) + "-title-users.txt", "r"):
            musers.add(line.strip("\n"))
        if users is None:  # first month seeds the set; an empty intersection must not reseed it
            users = musers
        else:
            users = set.intersection(users, musers)
    ufile = open("data/all-month-users.txt", "w")
    for user in users:
        ufile.write(user + "\n")
    ufile.close()
def readFile(date):
    """Return a dict mapping post id to its raw text for one month."""
    original_sentences = {}
    for line in open("data/" + date + "-posts.tsv"):
        [postID, postDate, postType, score, title, text, tags] = line.split('\t')
        original_sentences[postID] = text
    return original_sentences
def lookupLDATopics(date, docIDs, numTopics):
    """Return the top numTopics topics for each requested document id."""
    tokenized_dictfile = "models/global-tokenized_dict.pdict"
    with open(tokenized_dictfile, 'rb') as f:
        tokenized_dict = cPickle.load(f)
    dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
    lda = models.LdaModel.load("ldamodels/" + date + "-lda.model")
    doctopics = {}
    for docID in docIDs:
        sentence = tokenized_dict[str(docID)]
        bow = dictionary.doc2bow(sentence)
        topics = lda[bow]
        topics_by_value = sorted(topics, key=lambda tup: tup[1], reverse=True)
        doctopics[docID] = topics_by_value[:numTopics]  # collect all docs instead of returning on the first
    return doctopics
def calculateEta(dates, date, numtopics, vocabsize):
    """Build the eta prior for one month's LDA from the previous month's model.

    Each topic's word distribution from the prior month is scaled by how heavily
    that topic was used across the prior month's documents, weighted by priorweight.
    """
    priordate = dates[dates.index(date) - 1]
    tokenized_dictfile = "models/" + priordate + "-monthly-tokenized_dict.pdict"
    with open(tokenized_dictfile, 'rb') as f:
        tokenized_dict = cPickle.load(f)
    dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
    priorlda = models.LdaMulticore.load("ldamodels/" + priordate + "-lda.model")
    # per-topic usage: expected token mass assigned to each topic, scaled by priorweight
    countedwordtopics = [0.0] * numtopics
    for docID in tokenized_dict.keys():
        bow = dictionary.doc2bow(tokenized_dict[docID])
        wordcount = len(bow)
        for topicID, topicshare in priorlda[bow]:
            countedwordtopics[topicID] += wordcount * topicshare * priorweight
    indexes = priorlda.id2word
    priortopics = priorlda.show_topics(num_topics=-1, num_words=vocabsize, formatted=False)
    eta = numpy.zeros((numtopics, vocabsize))
    # invert the id -> word mapping into word -> id
    reverseindexes = dict(zip(indexes.values(), indexes.keys()))
    for topicID, worddist in priortopics:
        for word, value in worddist:
            wordindex = reverseindexes[word]
            eta[topicID][wordindex] = value * countedwordtopics[topicID]
    return eta
def calculateEta2(dates, date, numtopics, vocabsize, minpriorvalue):
    """Alternative eta prior: copy the previous month's topic-word weights directly,
    zeroing any weight below minpriorvalue."""
    priorldafile = "ldamodels/" + dates[dates.index(date) - 1] + "-lda.model"
    logging.info("loading " + priorldafile)
    priorlda = models.LdaMulticore.load(priorldafile)
    eta = numpy.zeros((numtopics, vocabsize))
    topics = priorlda.show_topics(num_topics=-1, num_words=vocabsize, formatted=False)
    indexes = priorlda.id2word
    reverseindexes = dict(zip(indexes.values(), indexes.keys()))
    for topicid, wordlist in topics:
        for word, value in wordlist:
            if value < minpriorvalue:
                value = 0
            index = reverseindexes[word]
            eta[topicid][index] = value
    return eta
def performTFIDF(dates):
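    """Fit a TF-IDF model for each month and save both the model and the transformed corpus."""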
for date in dates:
corpus = corpora.MmCorpus("models/" + date + "-tokenized.mm")
tfidf = models.TfidfModel(corpus)
tfidf.save("models/"+date+"-tfidf.model")
tfidf_corpus = tfidf[corpus]
corpora.MmCorpus.save_corpus("models/"+date+"-tfidf.mm", tfidf_corpus)
def performLDA(dates, numtopics, vocabsize, workers):
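    """Fit one LdaMulticore model per month, seeding eta from the previous month when available."""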
for date in dates:
print("performing lda on " + str(date))
dictionary = corpora.Dictionary.load("models/global-dictionary.dict")
corpus = corpora.MmCorpus("models/" + date + "-tfidf.mm")
if date != dates[0] and priorweight != 0:
logging.info("Calculating eta based on prior month")
eta = calculateEta(dates, date, numtopics, vocabsize)
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=numtopics, workers=workers, eta=eta)
else:
logging.info("Eta weighting factor too low or no prior months")
lda = models.LdaMulticore(corpus, id2word=dictionary, num_topics=numtopics, workers=workers)
lda_corpus = lda[corpus]
corpora.MmCorpus.serialize('ldamodels/' + date + '-lda.mm', lda_corpus)
lda.save('ldamodels/' + date + '-lda.model')
def tokenizeandstemline(text):
    """Tokenize one post: drop stopwords (case-insensitively), short tokens, and
    non-word tokens, then lowercase and Porter-stem the rest."""
    stoplist = STOPWORDS
    stemmer = PorterStemmer()
    tokenized_line = [stemmer.stem(word.lower())
                      for word in word_tokenize(text.decode('utf-8'), language='english')
                      if word.lower() not in stoplist
                      and len(word) > 3
                      and re.match(r'^[\w-]+$', word) is not None]
    return tokenized_line
def writecpicklefile(content, filename):
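    """Pickle content to filename using the highest available protocol."""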
with open(filename, 'wb') as f:
cPickle.dump(content, f, cPickle.HIGHEST_PROTOCOL)
def createGlobalDictionaryFromMonthly(dates, vocabsize):
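    """Merge all monthly token maps and build one filtered global gensim Dictionary."""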
global_tokenized_dict = {}
for date in dates:
monthly_tokenized_dictfile = "models/" + date + "-monthly-tokenized_dict.pdict"
with open(monthly_tokenized_dictfile, 'rb') as f:
logging.info("Opening file %s", monthly_tokenized_dictfile)
global_tokenized_dict = merge_two_dicts(cPickle.load(f), global_tokenized_dict)
logging.info("Creating corpora.Dictionary")
dictionary = corpora.Dictionary(global_tokenized_dict.values())
logging.info("Compressing dictionary of size: %s", len(dictionary))
dictionary.filter_extremes(no_below=200, no_above=0.8, keep_n=vocabsize)
dictionary.compactify()
logging.info("Dictionary size: %s", len(dictionary))
dictionary.save('models/global-dictionary.dict')
def merge_two_dicts(x, y):
"""Given two dicts, merge them into a new dict as a shallow copy."""
z = x.copy()
z.update(y)
return z
def createDictionariesFromFiles(dates):
    """Tokenize each month's posts and pickle the docid, tokenized, and original-text maps."""
    for date in dates:
        print("parsing month: " + date)
        monthly_tokenized_dict = {}
        monthly_original_dict = {}
        docids = {}
        for line in open("data/" + date + "-titles-tags-text.tsv"):
            [postID, userid, postDate, score, title, tags, text] = line.split('\t')
            docids[postID] = (userid, score)
            text = title + " " + tags + " " + text
            monthly_tokenized_dict[postID] = tokenizeandstemline(text)
            monthly_original_dict[postID] = text
        writecpicklefile(docids, "models/" + date + "-docids.pdict")
        writecpicklefile(monthly_tokenized_dict, "models/" + date + "-monthly-tokenized_dict.pdict")
        writecpicklefile(monthly_original_dict, "models/" + date + "-monthly-original_dict.pdict")
def createMonthCorpuses(dates):
    """Convert each month's tokenized documents into a bag-of-words MmCorpus."""
    for date in dates:
        logging.info("Parsing date: %s", date)
        monthly_dict_file = "models/" + date + "-monthly-tokenized_dict.pdict"
        with open(monthly_dict_file, 'rb') as f:
            tokenized_dict = cPickle.load(f)
        dictionary = corpora.Dictionary.load('models/global-dictionary.dict')
        corpus = [dictionary.doc2bow(sentence) for sentence in tokenized_dict.values()]
        corpora.MmCorpus.serialize('models/' + date + '-tokenized.mm', corpus)
if __name__ == '__main__':
main()