main_scraper.py
import argparse
import contextlib
import json
import os
import random
import re
import time

import bs4 as bs
import requests
# This script is modelled on Althonos's wonderful InstaLooter:
# https://github.com/althonos/InstaLooter
parser = argparse.ArgumentParser(description="""lovInstagram: scrape an Instagram account.\n
This script can be used to perform data analyses.\n
Further implementations will be made for a correct analysis of metadata.\n
To scrape a user:\n
python main_scraper.py -u/--user USERNAME\n""",
                                 epilog="script designed by Stefano Bosisio "
                                        "to gather information from Instagram pics",
                                 prog="lovInsta")
parser.add_argument('-l', '--login', nargs=2, metavar=('USERNAME', 'PASSWORD'),
                    type=str, help='Supply username and password for login.')
parser.add_argument('-u', '--user', nargs="?",
                    help='Supply a username to scrape that profile.')
parser.add_argument('-d', '--directory', nargs="?",
                    help='Supply a directory in which to save all the pics.')
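# With the flags above, a call like `-l myuser mypass -u target -d ./pics`
# (values illustrative) parses to args.login == ['myuser', 'mypass'],
# args.user == 'target' and args.directory == './pics'.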
# Regexes for extracting data from the page HTML and from photo urls
RX_SHARED_DATA = re.compile(r'window\._sharedData = ({[^\n]*});')
NO_RESIZE_RX = re.compile(r'(/[ps][0-9]+x[0-9]+)')
RX_TEMPLATE = re.compile(r'{([a-zA-Z]*)}')
RX_CODE_URL = re.compile(r'p/([^/]*)')
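# What these match, roughly (example values are illustrative): Instagram pages
# of this era embed a script whose body looks like
#   window._sharedData = {"entry_data": {"ProfilePage": [...]}};
# so RX_SHARED_DATA.match(...).group(1) yields the JSON object, while
# NO_RESIZE_RX strips a resize segment such as "/s640x640" from a photo url
# so that the full-resolution image is downloaded.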
def login(username, password, directory, user):
    print("Login...")
    URL_HOME = "https://www.instagram.com/"
    URL_LOGIN = "https://www.instagram.com/accounts/login/ajax/"
    #URL_LOGOUT = "https://www.instagram.com/accounts/logout/"
    # Session setup extracted from InstaLooter
    session = requests.Session()
    session.headers['User-Agent'] = ("Mozilla/5.0 (Windows NT 10.0; WOW64; "
                                     "rv:50.0) Gecko/20100101 Firefox/50.0")
    session.cookies.update({
        'sessionid': '',
        'mid': '',
        'ig_pr': '1',
        'ig_vw': '1920',
        'csrftoken': '',
        's_network': '',
        'ds_user_id': ''
    })
    # Credentials for the login POST
    login_post = {'username': username,
                  'password': password}
    session.headers.update({
        'Origin': URL_HOME,
        'Referer': URL_HOME,
        'X-Instagram-AJAX': '1',
        'X-Requested-With': 'XMLHttpRequest',
    })
    res = session.get(URL_HOME)
    # Copy the csrftoken from the landing page, so the login POST is accepted
    session.headers.update({'X-CSRFToken': res.cookies['csrftoken']})
    # Sleep a little so the requests look less automated
    time.sleep(5 * random.random())
    # Now log in
    login_res = session.post(URL_LOGIN, data=login_post, allow_redirects=True)
    if login_res.status_code != 200:
        raise SystemError("Login error: check your connection")
    # Take the fresh csrf token, otherwise we cannot access anything
    session.headers.update({'X-CSRFToken': login_res.cookies['csrftoken']})
    r = session.get(URL_HOME)
    if r.text.find("%s" % username) == -1:
        raise ValueError('Login error: check your login data')
    print("Login succeeded")
    ## Scrape only users
    if user is not None:
        print("Scraping profile %s..." % user)
        target = user
        page_name = "ProfilePage"
        section_name = "user"
        base_url = "https://www.instagram.com/{}/"
        # Retrieve all the pages with pictures
        retrieve_pages(directory, session, target, page_name, section_name, base_url)
    else:
        print("Please select an option: profile or hashtag")
def retrieve_pages(directory, session, target, page_name, section_name, base_url):
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Collect all the urls to scrape
    photo_urls = []  # list of photo urls, kept for later processing
    current_page = 0
    url = base_url.format(target)
    print("Scraping : %s" % url)  # sanity check
    while True:
        current_page += 1
        # Connect to the url and read all the info
        res = session.get(url)  # get the page of the user
        data = get_shared_data(res)
        try:
            media = data['entry_data'][page_name][0][section_name]['media']
            media_info = media["nodes"]
            count_photos = 0
            for nodes in media_info:
                photo_url = NO_RESIZE_RX.sub('', nodes.get('display_src'))
                # Other fields available in each node: 'code', 'gating_info',
                # 'dimensions', 'caption', 'thumbnail_resources',
                # 'comments_disabled', '__typename', 'comments', 'date',
                # 'media_preview', 'likes', 'owner', 'thumbnail_src',
                # 'is_video', 'id', 'display_src'
                print("Page %d photo %d" % (current_page, count_photos))
                photo_urls.append(photo_url)
                savedfile = os.path.join(directory, nodes["code"] + ".jpg")
                with contextlib.closing(session.get(photo_url, stream=True)) as image:
                    with open(savedfile, "wb") as dest_file:
                        for block in image.iter_content(1024):
                            if block:
                                dest_file.write(block)
                count_photos += 1
        except (KeyError, TypeError):
            break
        if not media['page_info']['has_next_page']:
            break
        url = '{}?max_id={}'.format(base_url.format(target),
                                    media['page_info']["end_cursor"])
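# Pagination sketch, mirroring the loop above (cursor value illustrative):
# each profile page embeds
#   media['page_info'] == {'has_next_page': <bool>, 'end_cursor': <str>}
# and passing end_cursor back as the max_id query parameter returns the next
# batch of posts, e.g. https://www.instagram.com/some_user/?max_id=AQBfCs...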
def get_shared_data(res):
    # Retrieve the sharedData JSON that Instagram embeds in the page
    PARSER = "html.parser"
    soup = bs.BeautifulSoup(res.text, PARSER)
    # The soup contains everything we need to find the pictures
    script = soup.find('body').find('script', {'type': 'text/javascript'})
    # json.loads turns the matched blob into a dictionary; the caller then
    # extracts the values it needs
    media_group = json.loads(RX_SHARED_DATA.match(script.text).group(1))
    # From here we could also tell whether the profile is private or followed
    return media_group
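# For reference, the shared data for a profile page has roughly this shape
# (the key names are the ones used above; the values are illustrative):
# {"entry_data": {"ProfilePage": [{"user": {
#     "media": {"nodes": [{"code": "...", "display_src": "...", ...}],
#               "page_info": {"has_next_page": true, "end_cursor": "..."}}}}]}}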
# MAIN
args = parser.parse_args()
if not args.login:
    parser.error("login credentials are required: -l/--login USERNAME PASSWORD")
# Two inputs expected here: the username and the password
username = args.login[0]
passw = args.login[1]
if args.user:
    print("Username to scrape %s" % args.user)
    user = args.user
else:
    user = None
if args.directory:
    directory = args.directory
else:
    directory = os.getcwd()
login(username, passw, directory, user)
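# Example invocation (credentials, account and path are illustrative):
#   python main_scraper.py -l myuser mypassword -u target_account -d ./pics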