-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscraper.py
executable file
·92 lines (82 loc) · 2.5 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import os
import sys
import json
import requests
import sqlite3
# TODO: use CLI args
# Page ids are scanned downwards by getUrls(), so FROM must be greater than TO.
FROM = 1428885  # Last 1428865
TO = 1420000  # 1347790
# The scrape aborts once more than this many requests have failed.
MAX_ERRORS = 4
# Path to the SQLite database
DB = "fbcomments.db"
def connectDB():
    """Open the SQLite database file named by DB.

    Returns:
        The sqlite3 connection, or None if sqlite3 raised OperationalError
        (the error is printed to stdout in that case).
    """
    try:
        return sqlite3.connect(DB)
    except sqlite3.OperationalError as exc:
        print("SQLite: {0}".format(exc))
    return None
def createTable():
    """Create the ``comments`` table in the database opened by connectDB().

    The table has an auto-incrementing integer ``id``, a unique ``fb_id``
    text column and a ``comment`` text column.  An OperationalError raised
    while creating the table is printed to stdout rather than propagated.
    If connectDB() could not open the database (it returns None and has
    already printed the error) this function does nothing.
    """
    conn = connectDB()
    if conn is None:
        return
    try:
        conn.cursor().execute(
            '''CREATE TABLE comments( id INTEGER PRIMARY KEY AUTOINCREMENT, fb_id text unique, comment text)''')
        conn.commit()
    except sqlite3.OperationalError as err:
        print("SQLite: {0}".format(err))
    finally:
        # Close on the failure path too; previously the connection leaked
        # whenever CREATE TABLE raised.
        conn.close()
def getUrls():
    """Build the comment-API URL for every page id from FROM down to TO + 1.

    Returns:
        A list of URL strings, one per page id, in descending id order.
    """
    template = "https://graph.facebook.com/comments?id=http://abc.com.py/%s.html&limit=500&fields=message"
    return [template % page_id for page_id in range(FROM, TO, -1)]
def dumpText():
    """Append every stored comment, upper-cased, to comentarios.txt, then exit.

    Reads all rows of the ``comments`` table, prints how many were found and
    appends each comment's upper-cased text to ``comentarios.txt`` with no
    separator between comments.  Rows whose comment is NULL are skipped.

    Raises:
        SystemExit: always.  The status is 1 when the database could not be
            opened (connectDB() has already printed the error).
    """
    conn = connectDB()
    if conn is None:
        raise SystemExit(1)
    try:
        rows = conn.cursor().execute('select comment from comments').fetchall()
    finally:
        # The rows are fully fetched, so the connection is no longer needed.
        conn.close()
    print("Parseando %d comentarios" % len(rows))
    if rows:
        # Open the output once instead of once per comment, and pin the
        # encoding so non-ASCII comments do not depend on the platform default.
        with open('comentarios.txt', 'a', encoding='utf-8') as f:
            f.writelines(text.upper() for (text,) in rows if text is not None)
    raise SystemExit
def _log_error(url, detail):
    """Append one ``url: detail`` line to error.log."""
    with open('error.log', 'a', encoding='utf-8') as f:
        f.write(url + ": " + detail + "\n")


def main():
    """Fetch the comments for every URL from getUrls() and store them.

    With ``dumptext`` on the command line, dumpText() is run instead (it
    always exits).  Otherwise the database is created if the DB file does
    not exist, each URL is requested, and every entry of the response's
    ``data`` list is inserted into the ``comments`` table (duplicates of
    ``fb_id`` are ignored).  Failures for a single URL are written to
    error.log and the run continues with the next URL.

    Raises:
        SystemExit: once more than MAX_ERRORS requests have failed, or with
            status 1 if the database could not be opened.
    """
    if 'dumptext' in sys.argv:
        dumpText()
    if not os.path.isfile(DB):
        createTable()
    urls = getUrls()
    print("URLs a escanear: " + str(len(urls)))
    db_conn = connectDB()
    if db_conn is None:
        # connectDB() has already printed the SQLite error.
        raise SystemExit(1)
    error_count = 0
    try:
        cursor = db_conn.cursor()
        for i, url in enumerate(urls):
            if error_count > MAX_ERRORS:
                raise SystemExit
            print(str(i) + " - " + url)
            try:
                data = requests.get(url, timeout=8).text
            except Exception as err:
                # Deliberately broad: one bad URL must not stop the scrape.
                error_count += 1
                _log_error(url, "RequestURL: {0}".format(err))
                # Nothing to parse; previously this fell through and logged
                # a second, empty "{}" entry for the same URL.
                continue
            try:
                comments = json.loads(data)
            except ValueError as err:
                # Non-JSON body (json.JSONDecodeError is a ValueError).
                _log_error(url, "JSON: {0}".format(err))
                continue
            if not isinstance(comments, dict) or 'data' not in comments:
                _log_error(url, str(comments))
                continue
            for entry in comments['data']:
                # Skip malformed entries instead of aborting on KeyError.
                if (not isinstance(entry, dict)
                        or 'id' not in entry or 'message' not in entry):
                    continue
                cursor.execute(
                    'INSERT OR IGNORE INTO comments(fb_id, comment) VALUES(?,?)',
                    (entry['id'], entry['message']))
            db_conn.commit()
    finally:
        # Also runs on the SystemExit abort above, so the connection is
        # always closed.
        db_conn.close()


if __name__ == "__main__":
    main()