forked from wlrd/Snapcatz
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgeturl.py
executable file
·39 lines (29 loc) · 1.05 KB
/
geturl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import re
import urllib2
from bs4 import BeautifulSoup
import urllib
import pickle
def get_url(subreddits=('cats', 'catpictures'), limit=5, out_path="urls.p"):
    """Scrape the top posts of cat subreddits for i.imgur.com image links.

    For each subreddit's "top" listing, collect every anchor whose href
    points at i.imgur.com, de-duplicate the links while preserving
    first-seen order, and pickle the resulting list to *out_path*.

    Args:
        subreddits: iterable of subreddit names to scrape.
        limit: number of top posts to request per subreddit.
        out_path: path the pickled URL list is written to.

    Returns:
        The de-duplicated list of image URLs (also pickled to *out_path*).
    """
    # Reddit rejects the default urllib2 User-Agent, so send our own.
    hdr = {'User-Agent': 'SnapMeCatz scraper'}
    # Compiled once outside the loop; raw string so the backslash escapes
    # reach the regex engine intact.
    imgur_re = re.compile(r'i\.imgur\.com/[a-zA-Z0-9]')
    found = []
    # One loop replaces the original's two copy-pasted scrape passes.
    for sub in subreddits:
        url = 'http://www.reddit.com/r/%s/top?limit=%d' % (sub, limit)
        conn = urllib2.urlopen(urllib2.Request(url, headers=hdr))
        try:
            html = conn.read()
        finally:
            conn.close()  # original leaked the connection
        # Explicit parser avoids bs4's "no parser specified" warning and
        # keeps parsing deterministic across installs.
        soup = BeautifulSoup(html, 'html.parser')
        for elem in soup.findAll('a', href=imgur_re):
            found.append(elem['href'])
    # De-duplicate preserving first-seen order; the seen-set replaces the
    # original O(n^2) "not in list" scan.
    seen = set()
    urllist = []
    for link in found:
        if link not in seen:
            seen.add(link)
            urllist.append(link)
    # "with" guarantees the pickle file is closed (original never closed it).
    with open(out_path, "wb") as fh:
        pickle.dump(urllist, fh)
    return urllist
# Script entry point: scrape and pickle the URL list when run directly.
if __name__ == "__main__":
    get_url()  # dropped the non-idiomatic trailing semicolon