handleCrawl.py
# -*- coding: utf-8 -*-
import os
import re
import sqlite3
import json

import handleUid
import handleDB

lang = "it"
output = "data.json"
outputComment = "comment.json"


def crawl(email, password, page, pagenumbers, date_crawl):
    # delete stale output files left over from a previous run
    if os.path.exists(output):
        os.system("rm -rf {}".format(output))
    if os.path.exists(outputComment):
        os.system("rm -rf {}".format(outputComment))

    # if this (page, date_crawl) pair is already in the DB, skip crawling
    with sqlite3.connect("database.db") as conn:
        cur = conn.execute(
            "SELECT * FROM post WHERE page=? AND date_crawl=?", (page, date_crawl)
        )
        res = cur.fetchall()
        if len(res) > 0:
            conn.commit()
            return

    # crawl posts
    cmd = (
        "scrapy crawl fb -a email=\"{}\" -a password=\"{}\" -a page=\"{}\" "
        "-a max=\"{}\" -a lang=\"{}\" -o {}"
    ).format(email, password, page, pagenumbers, lang, output)
    os.system(cmd)

    datastore = {}
    comment = {}
    with open(output, 'r') as jsonFile:
        datastore = json.load(jsonFile)

    index = 0
    while index < len(datastore):
        url = datastore[index]['url']
        post_id = datastore[index]['post_id']

        # get profile image
        source = datastore[index]['source'][0]
        urlGetImg = 'https://www.facebook.com' + source
        profile_img = handleUid.findUid(email, password, urlGetImg)
        datastore[index]['profile_img'][0] = profile_img

        # find prices, emails, phone numbers in the post text
        if 'text' in datastore[index]:
            text = datastore[index]['text']
            emails = re.findall(r"[\w\.-]+@[a-z0-9\.\-+_]+\.[a-z]+", text)
            phones = re.findall(
                r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})",
                text,
            )
            prices = re.findall(r"giá: \b\d+\b", text.lower())
            if len(prices) == 0:
                prices = re.findall(r"giá:\b\d+\b", text.lower())
            datastore[index]['emails'] = emails
            datastore[index]['phones'] = phones
            datastore[index]['prices'] = prices

        # crawl the comments of this post
        cmd = (
            "scrapy crawl comments -a email=\"{}\" -a password=\"{}\" "
            "-a post=\"{}\" -a lang=\"{}\" -o {}"
        ).format(email, password, url, lang, outputComment)
        os.system(cmd)

        # put comments into the DB
        if os.path.exists(outputComment):
            if os.stat(outputComment).st_size > 0:
                with open(outputComment, 'r') as commentFile:
                    comment = json.load(commentFile)
                for cment in comment:
                    sourceComment = cment['source_url'][0]
                    urlGetImg = 'https://www.facebook.com' + sourceComment
                    print("<<<<<<<<<<{}".format(urlGetImg))
                    profile_imgComment = handleUid.findUid(email, password, urlGetImg)
                    cment['profile_img'][0] = profile_imgComment
                    # find prices, emails, phone numbers in the comment text
                    if 'text' in cment:
                        text = cment['text']
                        emails = re.findall(r"[\w\.-]+@[a-z0-9\.\-+_]+\.[a-z]+", text)
                        phones = re.findall(
                            r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})",
                            text,
                        )
                        prices = re.findall(r"giá: \b\d+\b", text.lower())
                        if len(prices) == 0:
                            prices = re.findall(r"giá:\b\d+\b", text.lower())
                        cment['emails'] = emails
                        cment['phones'] = phones
                        cment['prices'] = prices
                with sqlite3.connect("database.db") as conn:
                    cur = conn.execute(
                        "SELECT * FROM comment WHERE post_id=?", (post_id,)
                    )
                    res = cur.fetchall()
                    if len(res) == 0:
                        data = json.dumps(comment)
                        conn.execute(
                            "INSERT INTO comment (post_id, data) VALUES (?, ?)",
                            (post_id, data),
                        )
                        conn.commit()
            # delete the comment json file before crawling the next post
            os.system("rm -rf {}".format(outputComment))
        index += 1

    # put posts into the DB
    with sqlite3.connect("database.db") as conn:
        index = 0
        while index < len(datastore):
            post_id = datastore[index]['post_id']
            data = json.dumps(datastore[index])
            conn.execute(
                "INSERT INTO post (page, post_id, date_crawl, data) VALUES (?, ?, ?, ?)",
                (page, post_id, date_crawl, data),
            )
            index += 1
        conn.commit()
    os.system("rm -rf {}".format(output))

    # update the number of crawled posts in the DB
    handleDB.updatePostNumbers(date_crawl, index)
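

# --- Usage sketch (illustrative only, not part of the original module) ---
# A minimal example of how crawl() might be invoked, assuming database.db
# already contains the post/comment tables and the scrapy spiders "fb" and
# "comments" are available in this project. All argument values below are
# hypothetical placeholders.
if __name__ == "__main__":
    import datetime

    crawl(
        email="user@example.com",    # placeholder Facebook login
        password="change-me",        # placeholder password
        page="examplepage",          # placeholder page slug to crawl
        pagenumbers=3,               # passed to the spider as "max"
        date_crawl=datetime.date.today().isoformat(),  # crawl date label
    )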