forked from tweepy/tweepy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path4_process_tweets.py
125 lines (92 loc) · 3.13 KB
/
4_process_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# process_tweets.py
import psycopg2
import pandas as pd
from nltk.stem import WordNetLemmatizer
# import nltk
# nltk.download('wordnet')
from nltk.corpus import wordnet
import numpy as np
from textblob import TextBlob
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
# Cleaning the tweets
def preprocess(tweet):
# remove links
tweet = re.sub(r'http\S+', '', tweet)
# remove mentions
tweet = re.sub("@\w+","",tweet)
# alphanumeric and hashtags
tweet = re.sub("[^a-zA-Z#]"," ",tweet)
# remove multiple spaces
tweet = re.sub("\s+"," ",tweet)
tweet = tweet.lower()
# Lemmatize
lemmatizer = WordNetLemmatizer()
sent = ' '.join([lemmatizer.lemmatize(w) for w in tweet.split() if len(lemmatizer.lemmatize(w))>3])
return sent
# Connecting to the Database
def DbConnect(query):
conn = psycopg2.connect(host="localhost",database="TwitterDB",port=5432,user='postgres',password='790213Aa')
curr = conn.cursor()
curr.execute(query)
rows = curr.fetchall()
return rows
# Create
data_tweet = DbConnect("SELECT User_Id, Tweet_Id, Tweet FROM TwitterTweet;")
df_tweet = pd.DataFrame(columns=['User_Id','Tweet_Id','Clean_Tweet'])
for data in data_tweet:
index = len(df_tweet)
df_tweet.loc[index,'User_Id'] = data[0]
df_tweet.loc[index,'Tweet_Id'] = data[1]
df_tweet.loc[index,'Clean_Tweet'] = preprocess(data[2])
df_tweet.head()
# Most commomly occuring words
def keywords():
all_words = ' '.join([text for text in df_tweet['Clean_Tweet']])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
keywords()
# Sentiment analysis using Textblob
def sentiment(tweet):
analysis = TextBlob(tweet)
if analysis.sentiment.polarity > 0:
return 1
elif analysis.sentiment.polarity == 0:
return 0
else:
return -1
df_tweet['Sentiment'] = df_tweet['Clean_Tweet'].apply(sentiment)
df_tweet.head(20)
# Querying hashtags from database
data_tags = DbConnect("SELECT Tweet_Id, Hashtag FROM TwitterEntity;")
df_tags = pd.DataFrame(columns=['Tweet_Id','Hashtags'])
for data in data_tags:
index = len(df_tags)
df_tags.loc[index,'Tweet_Id'] = data[0]
df_tags.loc[index,'Hashtags'] = data[1]
df_tags.head(20)
# Unique hashtag counts
table = df_tags.pivot_table(index="Hashtags",values='Tweet_Id',aggfunc=len)
# Convert pivot table to dataframe
df_pivot = pd.DataFrame(table.to_records())
df_pivot.head()
# Plotting hashtags counts
data = df_pivot.nlargest(columns="Tweet_Id", n = 15)
# Creating bar graph
plt.figure(figsize=(16,5))
ax = sns.barplot(data=data, x= "Hashtags", y = "Tweet_Id", palette=("Reds_d"))
# Altering the visual elements
sns.set_context("poster")
ax.set(ylabel = 'Count')
ax.set_xticklabels(labels=ax.get_xticklabels(),rotation=70)
plt.title('Bitcoin #Hashtags')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
# Output plot
plt.show()