-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebCrawler.java
More file actions
64 lines (53 loc) · 1.78 KB
/
Copy pathWebCrawler.java
File metadata and controls
64 lines (53 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
package WebCrawler;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Breadth-first web crawler that starts from a root URL, downloads each page,
 * extracts every {@code http://host} reference from its text and prints each
 * newly discovered one to standard output.
 *
 * <p>Only plain {@code http://} references are recognised, and only up to the
 * host name (paths, ports and query strings are not part of the match).
 * Instances keep the set of discovered URLs between calls and are not safe
 * for concurrent use.
 */
public class WebCrawler {

    /** Matches {@code http://} followed by dot-separated word-character labels. */
    private static final Pattern URL_PATTERN = Pattern.compile("http://(\\w+\\.)*(\\w+)");

    /** URLs waiting to be fetched, in discovery (FIFO) order. */
    private final Queue<String> queue;

    /** Every URL seen so far; a set so the duplicate check is O(1), linked to keep discovery order. */
    private final Set<String> discoveredWebsites;

    /** Creates a crawler with no pending and no discovered URLs. */
    public WebCrawler() {
        this.queue = new LinkedList<>();
        this.discoveredWebsites = new LinkedHashSet<>();
    }

    /**
     * Crawls breadth-first from {@code root} until no unvisited URL remains
     * (effectively unbounded: the page limit is {@link Integer#MAX_VALUE}).
     *
     * @param root URL of the first page to fetch
     */
    public void discoverWeb(String root) {
        discoverWeb(root, Integer.MAX_VALUE);
    }

    /**
     * Crawls breadth-first from {@code root}, fetching at most {@code maxPages}
     * pages. Each URL found for the first time is printed as
     * {@code "Website: <url>"} and queued for a later fetch. URLs still queued
     * when the limit is reached are dropped, but remain recorded as discovered.
     *
     * @param root     URL of the first page to fetch
     * @param maxPages maximum number of pages to download; must not be negative
     * @throws IllegalArgumentException if {@code maxPages} is negative
     */
    public void discoverWeb(String root, int maxPages) {
        if (maxPages < 0) {
            throw new IllegalArgumentException("maxPages must not be negative: " + maxPages);
        }
        queue.add(root);
        discoveredWebsites.add(root);

        int fetched = 0;
        while (!queue.isEmpty() && fetched < maxPages) {
            String current = queue.remove();
            fetched++;
            Matcher matcher = URL_PATTERN.matcher(readUrl(current));
            while (matcher.find()) {
                String found = matcher.group();
                // Set.add returns false for an already-known URL, so each site is
                // reported and queued exactly once.
                if (discoveredWebsites.add(found)) {
                    System.out.println("Website: " + found);
                    queue.add(found);
                }
            }
        }
        // Do not let leftovers from a bounded crawl leak into the next call.
        queue.clear();
    }

    /**
     * Downloads the page at {@code v} and returns its text, decoded as UTF-8.
     * Line breaks are normalised to {@code '\n'} and kept, so that a URL ending
     * one line is never fused with the first word of the next line.
     *
     * <p>Reading is best-effort: on any failure the problem is reported on
     * standard error and whatever was read so far (possibly nothing) is returned.
     *
     * @param v URL of the page to read
     * @return the page text, or the part read before a failure occurred
     */
    private String readUrl(String v) {
        StringBuilder rawHtml = new StringBuilder();
        // try-with-resources closes the stream even when reading fails midway.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(v).openStream(), StandardCharsets.UTF_8))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                rawHtml.append(inputLine).append('\n');
            }
        } catch (Exception e) {
            // Deliberately broad: one unreachable or malformed URL must not abort the crawl.
            System.err.println("Failed to read " + v + ": " + e);
        }
        return rawHtml.toString();
    }
}