-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharea.js
74 lines (70 loc) · 2.16 KB
/
area.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
// Crawls the area (locality) names listed in a city's Practo sitemap.
const rp = require('request-promise');
var request = require('request');
const $ = require('cheerio');
// First page of the locality listing for the city being crawled.
const url = 'https://www.practo.com/sitemap/indore-listings/localities-listings/page-1';

// The city slug is the first path segment after "/sitemap/".
// These were implicit globals before; declaring them keeps the script valid
// under strict mode / ES modules.
const split_city_name1 = url.split('https://www.practo.com/sitemap/');
const split_city_name2 = split_city_name1[1].split('/');
const city = split_city_name2[0]; // To get city name

// Crawl state shared by scrap() and remaining().
const remain = []; // pagination paths collected from the first page
let flag = 0; // 0 until the first page has been processed; gates writes to `remain`
let flag1 = 0; // 0 until remaining() has been triggered; gates the fan-out
const pages = []; // not referenced anywhere else in this file
let loop = true; // not referenced anywhere else in this file

// Start the crawl only after every piece of state it relies on exists.
scrap(url, city);
/**
 * Fetch one sitemap page and sort its links into area slugs and pagination
 * paths.
 *
 * Every anchor whose href starts with `/sitemap/<city>/` is considered:
 *  - paths starting with `localities-listings` are pagination links; they are
 *    appended to the shared `remain` list, but only while `flag` is still 0
 *    (i.e. while processing the first page);
 *  - every other path is treated as an area and collected in a per-page list.
 *
 * Each matching path is logged, followed by the page's area list and the
 * current `remain` list. The first call that completes also hands `remain`
 * to `remaining()` so the other pages get scraped.
 *
 * @param {string} url - Absolute URL of the sitemap page to fetch.
 * @param {string} city - City slug used in the sitemap path (e.g. "indore-listings").
 * @returns {Promise<void>} Settles once the page has been processed; request
 *   or processing errors are logged rather than rejected.
 */
function scrap(url, city) {
  const cityPrefix = '/sitemap/' + city + '/';

  return rp(url)
    .then(function (html) {
      // Query the document once and walk every anchor, instead of re-querying
      // on each of a fixed 200 iterations and catching out-of-range errors.
      const anchors = $('a', html);
      const Urls = [];

      for (let i = 0; i < anchors.length; i++) {
        const attribs = anchors[i].attribs;
        const href = attribs ? attribs.href : undefined;
        if (typeof href !== 'string' || !href.startsWith(cityPrefix)) {
          continue; // anchor without href, or link outside this city's sitemap
        }

        // Strip the prefix rather than splitting on the city slug, so a path
        // that happens to contain the slug again is not truncated.
        const area = href.slice(cityPrefix.length);
        console.log(area);

        if (area.startsWith('localities-listings')) {
          // Pagination link: remember it only while on the first page.
          if (flag === 0) {
            remain.push(area);
          }
        } else {
          Urls.push(area);
        }
      }

      flag++;
      console.log(Urls);
      console.log(remain);

      // Fan out to the other pages exactly once. The counter is bumped before
      // the call so a re-entrant scrap() can never trigger a second fan-out.
      const isFirstCompletion = flag1 === 0;
      flag1++;
      if (isFirstCompletion) {
        remaining(remain);
      }
    })
    .catch(function (err) {
      // Report the failure instead of swallowing it silently.
      console.error('Failed to scrape ' + url, err);
    });
}
/**
 * Start a scrape for each pagination page collected from the first page.
 *
 * Logs how many pages there are, then for each entry builds the absolute
 * sitemap URL from the module-level `city` slug, logs it, and passes it to
 * `scrap()`.
 *
 * @param {string[]} remain - Page paths relative to the city's sitemap root
 *   (e.g. "localities-listings/page-2").
 */
function remaining(remain) {
  console.log(remain.length);
  for (const pagePath of remain) {
    // Declared locally; this used to leak as an implicit global, which throws
    // under strict mode / ES modules.
    const fullRemainLink = `https://www.practo.com/sitemap/${city}/${pagePath}`;
    console.log(fullRemainLink);
    scrap(fullRemainLink, city);
  }
}