edtcrawl.py
#!/usr/bin/env python3
# Crawl the editorial index of The Hindu, follow each article link, and
# append every editorial's title and body paragraphs to a text file.
from urllib.request import urlopen

from bs4 import BeautifulSoup

OUTFILE = '/home/manishankar/Desktop/editorial.txt'


def printedit(url):
    """Fetch one editorial page and append its title and paragraphs to OUTFILE."""
    content = urlopen(url).read()
    soup = BeautifulSoup(content, "html.parser")
    paragraphs = []
    with open(OUTFILE, 'a', encoding='utf-8') as edfile:
        # The article title sits in an <h1 class="artcl-nm-stky-text"> element.
        topic = soup.find('h1', attrs={"class": "artcl-nm-stky-text"})
        if topic is not None:
            edfile.write(topic.text + '\n')
        # Body paragraphs carry the "drop-caps" class.
        for row in soup.find_all('p', attrs={"class": "drop-caps"}):
            edfile.write(row.text + '\n')
            paragraphs.append(row.text)
        edfile.write('\t\t-------------------------------------\n')
    return paragraphs


if __name__ == '__main__':
    url1 = 'http://newsite.thehindu.com/opinion/editorial/'
    soup1 = BeautifulSoup(urlopen(url1).read(), "html.parser")

    # Each editorial entry on the index page sits in a <div class="dd-slide">.
    alledd = soup1.find_all('div', attrs={"class": "dd-slide"})
    print(len(alledd))

    # Gather the <li> items from every slide, then collect each item's
    # article links.
    items = []
    for slide in alledd:
        items.extend(slide.find_all('li'))

    links = []
    for item in items:
        for anchor in item.find_all('a'):
            href = anchor.get('href')
            if href:
                links.append(href)

    for link in links:
        printedit(link)
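
Usage: a minimal sketch, assuming the script above is saved as edtcrawl.py on the import path and beautifulsoup4 is installed; the article URL below is hypothetical and only illustrates the expected input.

from edtcrawl import printedit

# Hypothetical editorial URL, assumed to use the same page markup as
# the pages the crawler visits.
paragraphs = printedit('http://newsite.thehindu.com/opinion/editorial/sample-editorial.ece')
print(len(paragraphs), 'paragraphs appended to editorial.txt')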