forked from sedpeep/Twitter-Scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
124 lines (102 loc) · 4.71 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import ActionChains
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os
import openpyxl
options = Options()
options.add_experimental_option("detach", True)
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("window_size=1280,800")
options.add_argument("--disable-popup-blocking")
options.add_argument("--disable-save-password-bubble")
def save_to_excel(data, filename):
if os.path.exists(filename):
df_existing = pd.read_excel(filename, engine='openpyxl')
df_new = pd.DataFrame(data, columns=["Tweets"])
df_combined = pd.concat([df_existing, df_new], ignore_index=True)
df_combined.to_excel(filename, index=False, engine='openpyxl')
else:
df = pd.DataFrame(data, columns=["Tweets"])
df.to_excel(filename, index=False, engine='openpyxl')
driver = webdriver.Chrome(options=options)
driver.get("https://accounts.google.com/v3/signin/identifier?hl=en_GB&ifkv=AXo7B7VGP4Y_gNfwPri72zV40Ii9kmgYbvLRXoOhOeBNkeBYcMPcPOX_Aolo1vK16FetaA4URMIfUA&flowName=GlifWebSignIn&flowEntry=ServiceLogin&dsh=S-1140670556%3A1692882589574310")
#email:
email = ''
#password:
password = ''
#LOGIN
driver.find_element(By.XPATH,'//*[@id="identifierId"]').send_keys(email)
time.sleep(3)
driver.find_element(By.XPATH,'//*[@id="identifierNext"]/div/button/span').click()
time.sleep(5)
driver.find_element(By.XPATH,'//*[@id="password"]/div[1]/div/div[1]/input').send_keys(password)
driver.find_element(By.XPATH,'//*[@id="passwordNext"]/div/button/span').click()
time.sleep(5)
#GO TO TWITTER
driver.get("https://twitter.com/")
time.sleep(10)
# current_window = driver.current_window_handle
# con = driver.find_element(By.XPATH,'/html/body/div/div/div[2]').click()
# wait = WebDriverWait(driver, 10)
# wait.until(EC.number_of_windows_to_be(2))
#
# for window_handle in driver.window_handles:
# if window_handle != current_window:
# driver.switch_to.window(window_handle)
# break
hashtags = ['wow' , 'funny' ,'go']
unique_texts = []
seen_texts = set()
for hashtag in hashtags:
driver.get("https://twitter.com/explore")
time.sleep(5)
search = driver.find_element(By.XPATH,
'//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[1]/div[1]/div/div/div/div/div/div[1]/div[2]/div/div/div/form/div[1]/div/div/div/label/div[2]/div/input')
search.send_keys(hashtag+" lang:en") #for language filtere, you can remove
search.send_keys(Keys.ENTER)
time.sleep(5)
#
# driver.find_element(By.XPATH,'/html/body/div[1]/div/div/div[2]/main/div/div/div/div/div/div[1]/div[1]/div[2]/nav/div/div[2]/div/div[2]/a/div/div/span').click()
# time.sleep(5)
actions = ActionChains(driver)
previous_num_unique = 0 # Keep track of the number of unique tweets in the previous iteration
while True:
# Scroll 10 times
for _ in range(10):
actions.send_keys(Keys.PAGE_DOWN).perform()
time.sleep(3) # Giving the page some time to load new content
# Fetch tweet data
t_data = driver.find_elements(By.XPATH,
"//div[starts-with(@class,'css-901oao r-18jsvk2 r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-bnwqim r-qvutc0')]")
# Store in set
for i in t_data:
try:
tweet_text = i.text
if tweet_text not in seen_texts:
unique_texts.append(tweet_text)
seen_texts.add(tweet_text)
except StaleElementReferenceException:
# If a stale element exception occurs, break out of the loop
# and re-fetch the tweets
break
if len(unique_texts) == previous_num_unique:
break
previous_num_unique = len(unique_texts)
for text in unique_texts:
print(text)
print(len(unique_texts))
# Save the tweets to the Excel file after processing each hashtag
save_to_excel(list(unique_texts), "tweets.xlsx")
# Print the unique texts
for text in unique_texts:
print(text)