import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple breadth-first web crawler. Starting from a seed URL, it downloads
 * each page, scans the raw HTML for further URLs with a regular expression,
 * enqueues any it has not seen before, and stops once the requested number
 * of URLs has been scraped.
 *
 * @author Andy Zollner, Zollner Solutions LLC
 */
public class JCrawl {
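    // BFS bookkeeping: urlQueue holds the frontier of pages still to fetch,
    // and visitedURLs records every URL already discovered so that no page
    // is enqueued twice.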
    private Queue<String> urlQueue;
    private List<String> visitedURLs;

    public JCrawl() {
        urlQueue = new LinkedList<>();
        visitedURLs = new ArrayList<>();
    }
    /**
     * Crawls the web breadth-first starting from a seed URL.
     *
     * @param startURL   URL to start crawling from
     * @param breakpoint number of URLs to scrape before stopping
     */
    public void crawl(String startURL, int breakpoint) {
        urlQueue.add(startURL);
        visitedURLs.add(startURL);
        while (!urlQueue.isEmpty()) {
            // remove the next URL from the queue and begin crawling it
            String str = urlQueue.remove();
            StringBuilder htmlTagged = new StringBuilder();
            // try-with-resources closes the stream even if a read fails
            try (BufferedReader urlIn = new BufferedReader(
                    new InputStreamReader(new URL(str).openStream()))) {
                // read the page line by line and append to htmlTagged
                String inLine = urlIn.readLine();
                while (inLine != null) {
                    htmlTagged.append(inLine);
                    inLine = urlIn.readLine();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
            // TODO: refactor?
            // regex matching URLs that start with http:, https:, or www
            String urlRegex = "(www|http:|https:)+[^\\s]+[\\w]";
            Pattern pattern = Pattern.compile(urlRegex);
            Matcher matcher = pattern.matcher(htmlTagged);
            // getBreakpoint decrements the scrape budget; once it reaches
            // zero, exit the outermost BFS loop
            breakpoint = getBreakpoint(breakpoint, matcher);
            if (breakpoint == 0) break;
        }
    }
    /**
     * Scans the matched URLs, printing and enqueueing each unvisited one,
     * and decrements the remaining scrape budget per match processed.
     *
     * @param breakpoint remaining number of URLs to scrape
     * @param matcher    matcher positioned over the fetched page's HTML
     * @return the remaining budget; 0 signals the caller to stop crawling
     */
    private int getBreakpoint(int breakpoint, Matcher matcher) {
        while (matcher.find()) {
            String actualURL = matcher.group();
            if (!visitedURLs.contains(actualURL)) {
                visitedURLs.add(actualURL);
                // TODO: output to file here
                System.out.println("Website found with URL " + actualURL);
                urlQueue.add(actualURL);
            }
            // exit the loop once the budget is exhausted
            if (breakpoint == 0) break;
            breakpoint--;
        }
        return breakpoint;
    }
    public static void main(String[] args) {
        JCrawl crawler = new JCrawl();
        // read the seed URL and the crawl budget from standard input;
        // try-with-resources ensures the Scanner is closed on exit
        try (Scanner sc = new Scanner(System.in)) {
            System.out.println("Enter a URL: ");
            String startURL = sc.nextLine();
            System.out.println("Enter number of links to crawl: ");
            int breakpoint = sc.nextInt();
            crawler.crawl(startURL, breakpoint);
        }
    }
}
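/*
 * A minimal sketch of how one might compile and run the crawler, assuming
 * JCrawl.java sits in the current directory and a JDK is on the PATH. The
 * seed URL and link count below are hypothetical example inputs; the crawler
 * then prints one "Website found with URL ..." line per URL it discovers,
 * so the actual output depends on the pages crawled.
 *
 *   javac JCrawl.java
 *   java JCrawl
 *   Enter a URL:
 *   https://example.com
 *   Enter number of links to crawl:
 *   10
 */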