# -*- coding: utf-8 -*-
"""
Created on Mon Oct 12 19:29:13 2015
Twitter Account Classifier
This program takes in a Twitter handle, trains a classifier on the given training
data, and then classifies the handle into one of the 10 preset categories.
This program hosts 2 types of classifiers: Multinomial Naive Bayes and Linear SVM.
The various settings for the preprocessing and the classifiers are outlined in
their respective instructions.
TO USE:
1) Modify the SETTINGS FOR CLASSIFIER section appropriately.
[The application can run with default settings once 1) is done. For customization, read on.]
2) Scroll to the bottom; if you want to use Naive Bayes, uncomment the Naive Bayes classifier
command, and likewise for SVM. Important: only uncomment one at a time!
3) Additional settings (ngram, stop words, TF or TF-IDF) can be changed by modifying the inputs
to the preProcess function. If no additional arguments are given, preProcess runs with default
settings. Look at the function for more details.
4) Additional settings for the classifiers can be modified by changing the inputs to the
classifier functions (optional arguments). If these are not given, the classifiers run with
default settings. Look at the classifiers for the optional arguments available.
5) **Optional: the function testClassifier can test the accuracy of the classifier if
you so wish. It prints the accuracy % of the classifier and plots the confusion
matrix in a nice graphical format.
@author: Thiru
"""
import csv,tweepy,random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB #Naive Bayes
from sklearn.linear_model import SGDClassifier #SVM
from sklearn import metrics
from collections import Counter
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize
#######
"""
SETTINGS FOR CLASSIFIER: Modify accordingly!
"""
##Twitter login details, of the format [Client ID, Client Secret, Access Token, Access Secret]
accesstokenlist = []
accesstokenlist.append(['<Insert client id>','<Insert client secret>','<Insert access token>','<Insert access secret>'])
#Interest names/categories as per the CSV file!
names = ['News','Sports','Games','Religious','Celebrity','Food','Music','Finance','Political','Technology']
##CSV file names. Assumed to be in the same directory as this script.
trainingcsv = 'BTAssignmentTraining.csv'
#Only needed if you want to test classifier accuracy
testcsv = 'BTAssignmentValidation.csv'
"""end settings"""
"""
Variables to store the tweets, interests, and the tokenizer and TF-IDF transformer. DO-NOT-TOUCH.
"""
textlst = []
interestlst = []
vectorizer = None
tfidfTransformer = None
""" end variables"""
"""
Description: Function to openCSV. Takes in a csv, outputs a list.
Sample command: openCSV('test.csv')
"""
def openCSV(name):
with open(name,'r') as f:
reader = csv.reader(f)
lst=list(reader)
return lst
"""
preProcess
Description: This function takes in a CSV of form [Tweets,tag],
OPTINAL ARGUMENTS:
ngram_range = (1,1) for uni, (2,2) for bigram etc
stop_words = 'english' or None.
tfidf = True for tfidf, False for TF
It then converts the csv and splits into one list each for tweets and interest tag.
Then, it tokenizes the tweetslst, uses TF/TFIDF to create the term document matrix,
and returns the matrix.
Post Cond: returns term document matrix
Sample commands: preProcess(test.csv), preProcess(test.csv,True,ngram_range=(1,2),stop_words='None')
"""
def preProcess(csvname,tfidf=True,ngram_range=(1,1),stop_words='english',stem=False):
    print('Beginning preprocessing with TFIDF = '+str(tfidf) +
          ', ngrams selected as '+str(ngram_range) + ', stop words selected as '+str(stop_words) +
          ' and stemming set to '+str(stem))
    #Subsidiary function to tokenize words according to settings
    def tokenizer(lst,ngram_range=(1,1),stop_words='english',stem=False):
        global vectorizer
        if stem == False:
            count_vect = CountVectorizer(ngram_range=ngram_range,stop_words=stop_words)
            vectorizer = count_vect
            X_train_counts = count_vect.fit_transform(lst)
            return X_train_counts
        else:
            stemmer = SnowballStemmer('english') #SnowballStemmer requires a language argument
            def stem_tokens(tokens, stemmer):
                stemmed = []
                for item in tokens:
                    stemmed.append(stemmer.stem(item))
                return stemmed
            def tokenizeSnowball(text):
                tokens = word_tokenize(text)
                stems = stem_tokens(tokens, stemmer)
                return stems
            vectorizer = CountVectorizer(tokenizer=tokenizeSnowball,
                                         ngram_range=ngram_range,stop_words=stop_words)
            X_train_counts = vectorizer.fit_transform(lst)
            return X_train_counts
    #Subsidiary function to convert token counts to a term document matrix.
    #use_idf=tfidf so that plain TF is used when tfidf=False, TF-IDF when tfidf=True.
    def TFIDF(tokens,tfidf):
        global tfidfTransformer
        tf_transformer = TfidfTransformer(use_idf=tfidf).fit(tokens)
        tfidfTransformer = tf_transformer
        X_train_tf = tf_transformer.transform(tokens)
        return X_train_tf
    #Open the CSV and split the columns into one list of tweets and one list of interests.
    data = openCSV(csvname)
    for i in range(1,len(data)): #skip the header row
        textlst.append(data[i][1])
        interestlst.append(data[i][2])
    tokens = tokenizer(textlst,ngram_range,stop_words,stem) #pass stem through to the tokenizer
    tfidfdoc = TFIDF(tokens,tfidf)
    print('Preprocessing done!')
    return tfidfdoc
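#Example (a sketch using the same training CSV): bigrams, no stop-word removal,
#plain TF instead of TF-IDF, and stemming enabled:
#tfis = preProcess(trainingcsv,tfidf=False,ngram_range=(2,2),stop_words=None,stem=True)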
""" Mutinomial Naive Bayes classifier
Options: Alpha = x, where x >=0"""
def makeClassifierBayes(tfidf,result,alpha=1.0):
clf = MultinomialNB(alpha=alpha).fit(tfidf, result)
return clf
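#Example (a sketch, assuming preProcess has been run): a more lightly smoothed model:
#classifier = makeClassifierBayes(tfis,interestlst,alpha=0.5)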
""" Descripton: SVM Classifier. Takes in tfidf, and optional arguments are:
loss, penalty,alpha,n_iter,random_state. for more details, see scikit learn library"""
def makeClassifierSVM(tfidf,result,loss='hinge', penalty='l2',alpha=1e-3, n_iter=9, random_state=42):
clf = SGDClassifier(loss=loss, penalty=penalty,alpha=alpha, n_iter=n_iter, random_state=random_state)
clf=clf.fit(tfidf,result)
return clf
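#Example (a sketch, assuming preProcess has been run): a smaller regularization
#constant and more epochs:
#classifier = makeClassifierSVM(tfis,interestlst,alpha=1e-4,n_iter=20)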
#Predicts the interest tag of each tweet in tweettext with the trained classifier,
#returning a list of [tweet, predicted tag] pairs.
def predictTweetClassifier(classifier,tweettext):
    newcounts = vectorizer.transform(tweettext)
    newtfidf = tfidfTransformer.transform(newcounts)
    predicted = classifier.predict(newtfidf)
    predictions = []
    for i in range(len(tweettext)):
        predictions.append([tweettext[i],predicted[i]])
    return predictions
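#Example (a sketch, assuming a trained classifier; the tweets below are made up):
#print(predictTweetClassifier(classifier,['The match ended 2-0 last night','Markets fell sharply today']))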
#Tests the classifier on a random sample drawn from the testing CSV, printing the
#accuracy and classification report and plotting the confusion matrix.
def testClassifier(classifier,testingcsv,sample_size):
    data = openCSV(testingcsv)
    data = data[1:] #remove headers
    lst = []
    for i in range(sample_size):
        lst.append(data[random.randint(0,len(data)-1)]) #sample from the whole file
    newtext = []
    newinterestlst = []
    for i in range(len(lst)):
        newtext.append(lst[i][1])
        newinterestlst.append(lst[i][2])
    newcounts = vectorizer.transform(newtext)
    newtfidf = tfidfTransformer.transform(newcounts)
    predicted = classifier.predict(newtfidf)
    print('Accuracy of classifications = '+ str((np.mean(predicted == newinterestlst))*100)+'%')
    print('Classification report:')
    print(metrics.classification_report(newinterestlst,predicted,target_names=names))
    return plot_confusion_matrix(metrics.confusion_matrix(newinterestlst, predicted))
#Sets the settings for confusion matrix. Does not need modification
def confusion_matrix_settings(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
#Plots the confusion matrix. Does not need modification
def plot_confusion_matrix(cm):
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization:')
    plt.figure()
    confusion_matrix_settings(cm)
    plt.show()
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Normalized confusion matrix')
    confusion_matrix_settings(cm_normalized, title='Normalized confusion matrix')
    plt.show()
"""
Description: This function mines the tweets of the specified user,
then uses the above functions to classify the user and determine their primary
interest.
"""
def classifyUser(user):
print('Processing user: '+user)
global accesstokenlist
currKeyID=0
currentKey=accesstokenlist[currKeyID]
auth = tweepy.auth.OAuthHandler(currentKey[0], currentKey[1])
auth.set_access_token(currentKey[2], currentKey[3])
api = tweepy.API(auth)
#Remove links and formatting of the tweets mined.
def removeLinksandFormatting(lst):
for i in range(0,len(lst)):
text = lst[i]
x = text.find('http')
while x != -1:
text = lst[i][:x] + lst[i][x+22:]
lst[i] = text
x = text.find('http')
lst[i]=lst[i][:-1]
lst[i]=lst[i][2:]
return lst
tweetlst = []
counter=4
print('Mining tweets..')
new_tweets = api.user_timeline(screen_name = user,count=200)
tweetlst.extend(new_tweets)
oldest = tweetlst[-1].id - 1
##Mine 1k tweets
while len(new_tweets) > 0 and counter>0:
counter-=1
new_tweets = api.user_timeline(screen_name = user,count=200,max_id=oldest)
tweetlst.extend(new_tweets)
oldest = tweetlst[-1].id - 1
tweetstxt=[]
##process tweets to list
for i in range(len(tweetlst)):
tweetstxt.append(str(tweetlst[i].text.encode('utf-8')))
print('Formatting tweets..')
tweetstxt=removeLinksandFormatting(tweetstxt)
#Predict the interest of each tweet
taggedtweets=predictTweetClassifier(classifier,tweetstxt)
interests=[]
#find the most occuring interest in the tweets. this is the interest of user
for i in range(len(taggedtweets)):
interests.append(taggedtweets[i][1])
words_to_count = (word for word in interests if word[:1].isupper())
c = Counter(words_to_count)
return (c.most_common(1)[0][0])
##Actual commands
tfis = preProcess(trainingcsv) #For options, check the function above.
classifier = makeClassifierSVM(tfis,interestlst)
#classifier = makeClassifierBayes(tfis,interestlst)
#testClassifier(classifier,testcsv,10000) #uncomment if you want to test the classifier
while True:
    interest = classifyUser(input('Enter a Twitter User: '))
    print("User's interest is "+interest)