-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.js
96 lines (77 loc) · 2.52 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
const puppeteer = require('puppeteer');
const cheerio = require('cheerio');
const path = require('path');
const fs = require('fs');
// Shared accumulator for scraped e-mail addresses; run() reads it once main() has finished.
const allEmails = [];
// Shared accumulator for site origins; getWebsiteLinks() appends to it and returns it.
const allWebsites = [];
// Multiplier getWebsiteLinks() uses to turn a result-page number into the `start` query offset.
const RESULTS_PER_PAGE = 10;
/**
 * Loads `url` in a headless browser and collects the e-mail addresses found
 * in the rendered markup.
 *
 * Candidates come from `mailto:` links, `.email-contact` elements and
 * `span[data-email]` elements. For each element the visible text, the
 * `mailto:` target and the `data-email` attribute are tried in that order and
 * the first value accepted by `isValidEmail` is kept.
 *
 * Failures are logged rather than thrown, and the browser is always closed.
 *
 * @param {string} url - Page to scrape.
 * @returns {Promise<string[]>} Addresses found on the page (duplicates are
 *   kept); an empty array when the page could not be scraped.
 */
const scrapeEmails = async (url) => {
  const emails = [];
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto(url);
    const $ = cheerio.load(await page.content());
    $('a[href^="mailto:"], .email-contact, span[data-email]').each((index, element) => {
      const node = $(element);
      // Not every matched element is a link, so `href` may be absent.
      const href = node.attr('href') || '';
      const candidates = [
        node.text().trim(),
        // Drop the scheme and any "?subject=..." query from the link target.
        href.replace(/^mailto:/i, '').split('?')[0].trim(),
        (node.attr('data-email') || '').trim(),
      ];
      const email = candidates.find((candidate) => isValidEmail(candidate));
      if (email) {
        emails.push(email);
      }
    });
  } catch (error) {
    console.error(`Error scraping ${url}:`, error);
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // A failed shutdown must not discard the addresses already collected.
        console.error(`Error closing browser for ${url}:`, closeError);
      }
    }
  }
  return emails;
};
/**
 * Checks that `email` has the basic `local@domain.tld` shape: exactly one
 * `@`, no whitespace anywhere, and at least one dot after the `@` with
 * characters on both sides of it.
 *
 * @param {string} email - Candidate address.
 * @returns {boolean} `true` when the candidate matches the shape.
 */
const isValidEmail = (email) => /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(email);
/**
 * Scrapes every site in `urls` (the URL itself and its `/contact` page) and
 * appends the addresses found to the module-level `allEmails` array.
 *
 * Sites are processed one after another; the two pages of a single site are
 * fetched concurrently. A failure on one site is logged and does not stop the
 * remaining sites.
 *
 * @param {Iterable<string>} urls - Site base URLs; `/contact` is appended to each.
 * @returns {Promise<void>}
 */
const main = async (urls) => {
  for (const url of urls) {
    try {
      const pages = await Promise.all([
        scrapeEmails(url),
        scrapeEmails(`${url}/contact`),
      ]);
      for (const found of pages) {
        // Array#concat returns a new array and leaves `allEmails` untouched,
        // so results have to be pushed. `|| []` guards against a page that
        // yielded no array at all.
        for (const email of found || []) {
          allEmails.push(email);
        }
      }
    } catch (error) {
      console.error(`Error processing ${url}:`, error);
    }
  }
};
const getWebsiteLinks = async (q, pageCount = 1) => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const start = pageCount === 1 ? 0 : pageCount * RESULTS_PER_PAGE;
const googleUrl = `https://www.google.com/search?q=${encodeURIComponent(q)}&start=${start}`;
console.log(`Fetching websites for ${googleUrl}`);
await page.goto(googleUrl);
const html = await page.content();
const $ = cheerio.load(html);
$('a[data-ved] > br').parent().each((index, element) => {
const link = $(element).attr('href');
if (link) {
const { origin } = new URL(link);
allWebsites.push(origin);
}
});
await browser.close();
if (pageCount === Number(process.argv[3])) {
return allWebsites;
}
return getWebsiteLinks(q, pageCount + 1);
}
/**
 * Command-line entry point.
 *
 * Usage: `node index.js <google query> [result pages]`.
 *
 * Searches Google for the query in `process.argv[2]`, scrapes every site
 * found for e-mail addresses and writes the unique addresses, one per line,
 * to `result.txt` next to this script.
 *
 * @returns {Promise<void>}
 */
const run = async () => {
  const googleQuery = process.argv[2];
  if (!googleQuery) {
    // Without this guard the literal string "undefined" would be searched.
    console.error('Usage: node index.js <google query> [result pages]');
    process.exitCode = 1;
    return;
  }
  // De-duplicate so the count below is accurate and no site is scraped twice.
  const websites = [...new Set(await getWebsiteLinks(googleQuery))];
  console.log(`Found ${websites.length} unique websites to check.`);
  await main(websites);
  const emailAddresses = [...new Set(allEmails.flat())];
  console.log(`Found ${emailAddresses.length} unique email addresses.`);
  // Plain newline separator: the former '\n ' put a leading space on every
  // line after the first.
  fs.writeFileSync(path.join(__dirname, 'result.txt'), emailAddresses.join('\n'));
};

run().catch((error) => {
  console.error('Scrape failed:', error);
  process.exitCode = 1;
});