-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathcrawler.py
129 lines (109 loc) · 4.49 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
# Written by Jonathon Vogel, 2014
import bs4
import feedparser
import requests
import slack
import slack.chat
import time
import traceback
import urllib.parse
# Phrases searched for in the text of each new post. Matching is done with
# str.find, so it is case-sensitive; when several keywords match at the same
# position, the longest one is reported.
KEYWORDS = ['Storj', 'Storj Labs', 'SJCX', 'Storjcoin X', 'Storjcoin']
PING_TIME = 2 # how many seconds to wait between checking BitcoinTalk
# Wrapper applied to each matched keyword; {} is replaced with the keyword.
KEYWORD_FORMAT = '_*{}*_' # markdown bold, {} is replaced
# Slack message template. The placeholders are filled, in order, with the feed
# entry's title, the feed entry's id (which is also used as the post URL) and
# the newline-joined list of formatted mentions.
MESSAGE_FORMAT = """Someone mentioned your organization on BitcoinTalk!
Thread - {} / {}
{}"""
slack.api_token = '' # get one for your org. at api.slack.com
# Display name and destination channel passed to slack.chat.post_message.
SLACK_USERNAME = 'Bitcoin-Talk-Bot'
SLACK_CHANNEL = '#general'
# RSS feed that is polled for new posts.
# NOTE(review): limit=100 presumably caps the number of entries returned -- confirm.
BITCOIN_TALK_RSS = 'https://bitcointalk.org/index.php?type=rss;action=.xml&limit=100'
def string_find_all(s, needle):
    """Yield the start index of each non-overlapping occurrence of needle in s.

    The search runs left to right and resumes immediately after each match,
    so overlapping matches are not reported ('aa' in 'aaaa' yields 0 and 2).

    An empty needle yields nothing: str.find would otherwise return the same
    index on every call and the generator would never terminate.
    """
    if not needle:
        return
    step = len(needle)
    loc = s.find(needle)
    while loc != -1:
        yield loc
        # Resume after the end of this match so matches never overlap.
        loc = s.find(needle, loc + step)
def check_and_format_string(s, kwds, each_side_context=20):
    """Find keywords in s and return each hit with surrounding context.

    s is the string to check, kwds is an iterable of keywords to look for,
    and each_side_context is the number of characters of context to include
    on each side of the keyword.

    Returns a list of strings of the form
    ``<context before><KEYWORD_FORMAT applied to keyword><context after>``,
    one per distinct match position; the list is empty if no keyword was
    found. When several keywords match at the same position only the
    longest is reported.
    """
    # Map each match position to the longest keyword that starts there.
    longest_at = {}
    for keyword in kwds:
        for loc in string_find_all(s, keyword):
            current = longest_at.get(loc)
            if current is None or len(keyword) > len(current):
                longest_at[loc] = keyword

    formatted = []
    for loc, keyword in longest_at.items():
        end = loc + len(keyword)
        # Clamp the left edge at 0: a negative slice start would wrap around
        # to the end of the string and drop or corrupt the leading context
        # for matches near the beginning of s.
        before = s[max(0, loc - each_side_context):loc]
        after = s[end:end + each_side_context]
        formatted.append(before + KEYWORD_FORMAT.format(keyword) + after)
    return formatted
def check_post_strings(url, kwd=KEYWORDS):
    """Fetch a single post and return the formatted keyword mentions in it.

    We need to do a *little* bit of HTML scraping, as the RSS feed only
    gives us partial summaries of posts. The post is located by finding the
    anchor whose href equals url and then taking the next <div class="post">.

    url is the permalink of the post (the RSS entry id); kwd is the list of
    keywords to look for. Returns a list of formatted strings (see
    check_and_format_string), which is empty when nothing matched.

    Raises RuntimeError when the page does not contain the expected anchor,
    and whatever requests raises on network errors or timeouts.
    """
    # A timeout keeps one stalled connection from hanging the crawler.
    page = requests.get(url, timeout=30).text
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    html = bs4.BeautifulSoup(page, 'html.parser')
    post_id_elem = html.find('a', href=url)
    if post_id_elem is None:  # bitcoin talk returning bad HTML
        print('Bad HTML (503?), bailing...')
        # Show the raw response text; a BeautifulSoup object cannot be sliced.
        print(page[:100])
        raise RuntimeError('Bad HTML, possible 503')
    post = post_id_elem.find_next('div', {'class': 'post'})

    quote_classes = {'quote', 'quoteheader'}

    def walk_post_children(node):
        """Yield every text fragment under node, skipping quoted material."""
        if isinstance(node, str):
            yield str(node)
            return
        if not hasattr(node, 'children'):
            return
        # `in` on a Tag tests its children, not its attributes, and 'class'
        # is a multi-valued attribute (a list), so read it with .get() and
        # compare as a set.
        classes = node.get('class') or []
        if isinstance(classes, str):
            classes = classes.split()
        if quote_classes.intersection(classes):
            # we don't want quotes to double-report things
            return
        for child in node.children:
            yield from walk_post_children(child)

    lines = []
    for text in walk_post_children(post):
        lines.extend(check_and_format_string(text, kwd))
    return lines
def get_post_id(url):
    """Return the integer message id held in the URL fragment.

    The fragment (the part after '#') has every 'msg' substring removed and
    the remainder is converted with int(), so '...#msg123' gives 123.
    """
    fragment = urllib.parse.urlparse(url).fragment
    digits = fragment.replace('msg', '')
    return int(digits)
def check_btc_talk(last_post_checked):
    """Poll the RSS feed once and post any new keyword mentions to Slack.

    last_post_checked is the id (URL) of the newest post already handled, or
    None to handle every entry in the feed. Entries are walked in reverse
    feed order, and entries without an id, or whose message id is not greater
    than that of last_post_checked, are skipped.

    Returns the id of the last post that was checked successfully, to be
    passed back in on the next call. If checking a post or posting to Slack
    fails, the error is printed and the walk stops there, so the next call
    resumes from the post that failed.

    Errors from fetching the feed itself propagate to the caller.
    """
    # A timeout keeps one stalled connection from hanging the crawler.
    # feedparser's `bozo` flag (malformed XML) is deliberately not checked.
    rss_text = requests.get(BITCOIN_TALK_RSS, timeout=30).text
    feed = feedparser.parse(rss_text)
    for entry in reversed(feed['entries']):
        if 'id' not in entry:
            continue
        if (last_post_checked is not None
                and get_post_id(entry['id']) <= get_post_id(last_post_checked)):
            continue
        print(entry['id'])
        try:
            mentions = check_post_strings(entry['id'], KEYWORDS)
            if mentions:
                print('Found a mention, posting to slack...')
                message = MESSAGE_FORMAT.format(
                    entry['title'], entry['id'], '\n'.join(mentions))
                slack.chat.post_message(
                    SLACK_CHANNEL, message, username=SLACK_USERNAME)
            # Advance only after the post has been fully handled.
            last_post_checked = entry['id']
        except Exception:
            # KeyboardInterrupt derives from BaseException, so it is not
            # caught here and propagates to the caller without extra code.
            print('Unhandled exception, retrying feed parse at exception point')
            traceback.print_exc()
            break
        # Pause between page fetches.
        time.sleep(1)
    return last_post_checked
def main():
    """Run the crawler until interrupted.

    Starts from the first entry currently in the feed so that old posts are
    not re-scanned, then polls forever, waiting PING_TIME seconds between
    polls. Unexpected errors during a poll are printed and the loop carries
    on; Ctrl-C (KeyboardInterrupt) ends the loop with a short message.
    """
    try:
        # don't spend a bunch of time parsing old comments
        entries = feedparser.parse(BITCOIN_TALK_RSS)['entries']
        # An empty feed (e.g. the site is briefly unavailable) must not crash
        # start-up; None makes check_btc_talk consider every entry it sees.
        last_post_checked = entries[0]['id'] if entries else None
        while True:
            try:
                last_post_checked = check_btc_talk(last_post_checked)
            except Exception:
                print('Unexpected exception, trying to continue...')
                traceback.print_exc()
            # Sleep after failures as well, so a persistent error does not
            # become a tight retry loop against the site.
            time.sleep(PING_TIME)
    except KeyboardInterrupt:
        # KeyboardInterrupt is not an Exception subclass, so it needs its own
        # handler to be seen at all.
        print('Being killed! Exiting...')
# Start the crawler only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()