-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfb_scraper.py
182 lines (164 loc) · 6.75 KB
/
fb_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/python3
import pickle
import re
import time
import urllib.request
from pathlib import Path

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
# Configure Chrome: use the 'normal' page-load strategy and send a fixed
# desktop user-agent string.
options = Options()
options.page_load_strategy = 'normal'
headers = 'user-agent=Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
options.add_argument(headers)
driver = webdriver.Chrome(options=options)

# Load the site first, then restore the saved session cookies into it.
driver.get('https://m.facebook.com/')
# NOTE: unpickling can execute arbitrary code. Only load a cookies.pkl that
# you created yourself.
# The context manager closes the file even if unpickling fails.
with open("cookies.pkl", "rb") as cookie_file:
    cookies = pickle.load(cookie_file)
for cookie in cookies:
    driver.add_cookie(cookie)

# Navigate to the page that will be scraped.
driver.get('')  # insert your group member profile url here
def check_element_by_link_text(link_text):
    """Report whether the current page contains a link with this exact text.

    Returns True when the lookup succeeds and False when Selenium raises
    ``NoSuchElementException``. Any other exception propagates.
    """
    try:
        driver.find_element_by_link_text(link_text)
    except NoSuchElementException:
        found = False
    else:
        found = True
    return found
def get_timestamp(i):
    """Return a file-name-safe label built from the i-th ``<abbr>`` element.

    The element's innerHTML is used as the label ("Null" when it is empty).
    Spaces become underscores and every run of characters outside
    ``0-9a-zA-Z`` is collapsed into a single underscore.
    """
    abbr_elements = driver.find_elements_by_tag_name('abbr')
    raw_label = abbr_elements[i].get_attribute('innerHTML')
    label = "Null" if raw_label == "" else raw_label
    return re.sub("[^0-9a-zA-Z]+", "_", label.replace(" ", "_"))
def open_new_tab(url):
    """Open ``url`` in a new browser tab and switch the driver's focus to it.

    The URL is passed to the script as ``arguments[0]`` instead of being
    spliced into the JavaScript source, so quotes or backslashes in the URL
    can neither break the script nor inject code into it.
    """
    driver.execute_script("window.open(arguments[0], '_blank');", url)
    # Focus the last handle in the list, which is expected to be the tab
    # that was just opened.
    newest_handle = driver.window_handles[-1]
    driver.switch_to.window(window_name=newest_handle)
def close_new_tab():
    """Close the currently focused tab and return focus to the first window."""
    driver.close()
    # The first handle is the main window the script started with.
    driver.switch_to.window(window_name=driver.window_handles[0])
def multipic_img_ids():
    """Return the ``fbid`` value from the driver's current URL.

    The value runs from just after ``fbid=`` up to the next ``&`` (or the end
    of the URL). For the usual ``...fbid=<id>&id=<owner>...`` form this is the
    same string the previous implementation produced.

    If the URL has no ``fbid=`` parameter, the whole URL is returned so the
    caller's "have I seen this image already?" check still distinguishes
    pages, instead of comparing an arbitrary slice of the URL.
    """
    current_url = driver.current_url
    _, marker, tail = current_url.partition('fbid=')
    if not marker:
        return current_url
    # Keep only the parameter value and drop any following query parameters.
    return tail.split('&', 1)[0]
def click_next(current_url2):
# Record the id of the image being left, click the link at the hard-coded
# XPath below (presumably the "next image" link -- confirm), then dismiss an
# 'Okay' link if one is present on the resulting page.
# NOTE(review): leading indentation is missing from this listing, so it cannot
# be told from here whether the final sleep runs only after clicking 'Okay'
# or on every call -- confirm against the original file.
# img_id_list is a module-level list that the main loop creates before it
# calls this function.
img_id_list.append(current_url2)
# print(current_url2)
driver.find_element_by_xpath(
'/html/body/div/div/div[2]/div/div[1]/div/div/div[1]/div/div[2]/table/tbody/tr/td[2]/a').click()
# press the 'Okay' link if Facebook shows a blocking prompt
# give the page two seconds to load before looking for the prompt
time.sleep(2)
if check_element_by_link_text('Okay'):
driver.find_element_by_link_text('Okay').click()
time.sleep(2)
# Main scraping loop: walks the posts on the loaded page, downloads their
# images into ./image/, and scrolls for more until the page stops growing.
# NOTE(review): leading indentation is missing from this listing, so the
# nesting described in the comments below is inferred from the statements
# themselves (break / continue / else) and should be confirmed against the
# original file.
# Get scroll height # https://tinyurl.com/uf6z66j2
last_height = driver.execute_script("return document.body.scrollHeight")
# Cursors into the element lists collected on each pass of the loop:
# poster_count indexes `posts`, multipic_count is the number of `_26ih`
# elements already handled, timestamp_count indexes the <abbr> elements.
# NOTE(review): the starting values 1 and 5 are not explained in this file;
# presumably some leading elements are skipped on purpose -- confirm.
poster_count = 1
multipic_count = 5
timestamp_count = 0
# Page title with spaces replaced by underscores; used as the file-name prefix.
title = str(driver.title).replace(" ", "_")
while True:
# NOTE(review): timestamp_elements is assigned but not read again in this
# listing; get_timestamp() performs its own <abbr> lookup.
timestamp_elements = driver.find_elements_by_tag_name('abbr')
# post with single image
posts = driver.find_elements_by_class_name(
"_39pi")
# post with multiple image
multipic_url = driver.find_elements_by_class_name("_26ih")
# Equivalent to: poster_count == len(posts) and
# len(multipic_url) == multipic_count, i.e. both cursors have reached the
# end of what is currently loaded, so scroll to load more.
if (poster_count-1 == len(posts)-1) and (len(multipic_url)-1 == multipic_count-1):
# Scroll down to bottom
driver.execute_script(
"window.scrollTo(0, document.body.scrollHeight);")
# Wait to load page
time.sleep(3)
# Calculate new scroll height and compare with last scroll height
new_height = driver.execute_script("return document.body.scrollHeight")
# An unchanged height means scrolling loaded nothing new: stop.
if new_height == last_height:
print('we\'re done here this is the last pages!')
break
last_height = new_height
continue
# Label for the current post, taken from the <abbr> at timestamp_count.
timestamp = get_timestamp(timestamp_count)
file_path_single = Path(f'./image/{title}_{timestamp}.jpg')
# print('nope it still', len(multipic_url))
# More `_26ih` elements than already handled: treat the newest one as a
# multi-image post.
if len(multipic_url) > multipic_count:
multipic_count = len(multipic_url)
print('found multipic post in', multipic_count)
# click the last `_26ih` element in the list
multipic_url[multipic_count - 1].click()
# wait (30 is the WebDriverWait timeout) until a `_56be` element is displayed
WebDriverWait(driver, 30).until(
lambda x: x.find_element_by_class_name("_56be").is_displayed())
# NOTE(review): the_images is assigned but not read again in this listing.
the_images = driver.find_elements_by_class_name(
"_56be")
current_url = driver.current_url
# print(timestamp, '\n', current_url, '\n\n')
# switch to the mbasic site for this post by rewriting the URL prefix
mbasic_formater = current_url.replace('https://m', 'https://mbasic')
# open the url to new page
open_new_tab(mbasic_formater)
time.sleep(1)
# click the link at this hard-coded XPath (presumably the first image of
# the post -- confirm)
driver.find_element_by_xpath(
'/html/body/div/div/div[2]/div/div[1]/div[1]/a[1]').click()
# ids of images already visited; click_next() appends to this list
img_id_list = list()
img_count = 0
# Step through the post's images until an id repeats.
while True:
file_path_multipic = Path(
f'./image/{title}_{timestamp}_p{img_count}.jpg')
current_url2 = multipic_img_ids()
# A repeated id ends the inner loop.
if current_url2 in img_id_list:
print('last image, done!')
break
# Already downloaded on an earlier run: just advance to the next image.
if file_path_multipic.is_file():
print(f'skipping {file_path_multipic} file exist')
click_next(current_url2)
img_count = img_count + 1
continue
# NOTE(review): click_next() runs before the full-size link is read, so the
# link is looked up on the page shown after the click -- confirm this is
# the intended image for file_path_multipic.
click_next(current_url2)
fullsize_img_url = driver.find_element_by_xpath(
'/html/body/div/div/div[2]/div/div[1]/div/div/div[3]/div[1]/div[2]/span/div/span/a[1]').get_attribute('href')
print('saving multi', file_path_multipic)
# Load the link and read back the driver's resulting URL, which is what
# gets downloaded; then return to the viewer page.
driver.get(fullsize_img_url)
image_url = driver.current_url
# print(image_url) # image url
driver.back()
urllib.request.urlretrieve(
image_url, file_path_multipic) # Download Image
img_count = img_count + 1
time.sleep(0.5)
# Done with this post: close the mbasic tab and go back in the main window.
close_new_tab()
driver.back()
timestamp_count = timestamp_count + 1
else: # post with single image
# Already downloaded on an earlier run: advance both cursors and move on.
if file_path_single.is_file():
print(f'skipping {file_path_single} file exist', timestamp_count)
poster_count = poster_count + 1
timestamp_count = timestamp_count + 1
continue
poster = posts[poster_count] # find the post with single image
url = poster.get_attribute('href') # get the post url
# print(timestamp, '\n', url, '\n\n')
open_new_tab(url)
view_full_size = driver.find_element_by_link_text(
"View Full Size").get_attribute('href') # Get the image URL
driver.get(view_full_size)
# the first <img> on the full-size page supplies the download URL
image_url = driver.find_element_by_tag_name('img').get_attribute('src')
# print(image_url)
urllib.request.urlretrieve(
image_url, file_path_single) # Download Image
poster_count = poster_count + 1 # add 1 to the poster_count counter
print('saving single', file_path_single)
time.sleep(0.5)
close_new_tab()
timestamp_count = timestamp_count + 1