-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCrawler.py
83 lines (66 loc) · 2.14 KB
/
Crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import urllib
import re
from urllib import urlopen
from bs4 import BeautifulSoup
import datetime
import unicodedata
def getData(insideSite,fp):
html=urlopen(insideSite)
bsObj1 = BeautifulSoup(html,"lxml")
dataList=bsObj1.find_all("p")
global s
global s1
s=''
s1=''
for data in dataList:
s=s+data.get_text()
s1=s.encode('utf-8')
print s1
fp.write(s1)
fp.write("\n\n\t\t\t*******************NEW SITE*************************\n\n")
fp.write(insideSite)
def getExternalLinks(bsObj, excludeUrl):
    """Return up to 8 link hrefs in bsObj that start with http/https/www
    and do not contain excludeUrl anywhere in the URL.

    Also stores the result in the module-level global externalLinks,
    matching the original script's side effect.
    """
    global externalLinks
    externalLinks=[]
    # re.escape is required: excludeUrl is a host like "www.bing.com", and an
    # unescaped '.' would match ANY character, wrongly excluding unrelated URLs
    # (and a '?' or '+' in the string would break the pattern entirely).
    pattern = re.compile("^(http|www|https)((?!"+re.escape(excludeUrl)+").)*$")
    for link in bsObj.findAll("a", href=pattern, limit=8):
        href = link.attrs['href']
        if href is not None and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks
def splitAddress(address):
    """Split a URL on '/' after dropping its scheme, so that the first
    element is the host name.

    Generalized to strip both "http://" and "https://" (the original only
    handled https, so an http seed would return "http:" as the host).
    """
    addressParts = re.sub(r"^https?://", "", address).split("/")
    return addressParts
seeds="https://www.bing.com/search?q=demonetisation+of+currency+in+india&qs=AS&pq=demonetisation+of+currency&sk=HS1AS1&sc=8-26&cvid=4E7DC5E5195342EABC7E4D240A47B75E&FORM=QBRE&sp=3"
j=0
externalLinks = []
s=''
s1=''
fp=open("Demonisation_Data.txt","a")
while(j<100):
print seeds
html = urlopen(seeds)
bsObj = BeautifulSoup(html,"lxml")
externalLinks = getExternalLinks(bsObj, splitAddress(seeds)[0])
for i in range(0,len(externalLinks)):
print externalLinks[i]
if(externalLinks[i]=="http://go.microsoft.com/fwlink/?LinkId=521839&CLCID=4009"):
break
else:
getData(externalLinks[i],fp)
#getData(externalLinks[1])
#print externalLinks[1]
nextSeed=''
l=bsObj.findAll("a",{"class":"sb_pagN"})
if (len(l)!=0):
for l1 in l:
nextSeed=l1.attrs['href']
seeds="https://www.bing.com"+nextSeed
j=j+1
print j
else:
print "Work completed"
break
fp.close()
#print("Random external link is: "+externalLink)