// Source: java-web-crawler/WebCrawler.java (master branch), OmarElebiary/java-web-crawler on GitHub
package WebCrawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Breadth-first web crawler.
 *
 * <p>Starting from a root URL, each page is downloaded, scanned for plain
 * {@code http://host.name} style links, and every link not seen before is
 * printed to standard output and queued for a later visit.
 *
 * <p>The set of discovered URLs is kept for the lifetime of the instance, so
 * repeated calls to {@link #discoverWeb(String)} never re-report a URL.
 * Instances are not safe for concurrent use.
 */
public class WebCrawler {

    /** Matches {@code http://} followed by dot-separated word segments (host only, no path). */
    private static final Pattern URL_PATTERN = Pattern.compile("http://(\\w+\\.)*(\\w+)");

    /** Connect/read timeout so one unresponsive host cannot stall the whole crawl. */
    private static final int TIMEOUT_MILLIS = 10_000;

    /** URLs waiting to be downloaded, in discovery (FIFO) order. */
    private final Queue<String> queue;

    /** Every URL seen so far; a set keeps the "already discovered?" check constant-time. */
    private final Set<String> discoveredWebsites;

    /** Creates a crawler with an empty frontier and no discovered URLs. */
    public WebCrawler() {
        this.queue = new ArrayDeque<>();
        this.discoveredWebsites = new HashSet<>();
    }

    /**
     * Crawls breadth-first starting at {@code root} until no unvisited URL is left.
     *
     * <p>Each newly discovered URL is printed as {@code "Website: <url>"} on
     * standard output. Pages that cannot be downloaded are reported on standard
     * error and treated as empty, so a single bad link does not stop the crawl.
     *
     * @param root the URL to start from; {@code null} is reported and ignored
     */
    public void discoverWeb(String root) {
        if (root == null) {
            // Nothing can be fetched from a null URL; report it instead of failing.
            System.err.println("Failed to read null: no root URL given");
            return;
        }

        queue.add(root);
        discoveredWebsites.add(root);

        while (!queue.isEmpty()) {
            String current = queue.remove();
            Matcher matcher = URL_PATTERN.matcher(readUrl(current));

            while (matcher.find()) {
                String actualUrl = matcher.group();

                // add() returns false when the URL is already known, so this both
                // checks and records the URL in one step.
                if (discoveredWebsites.add(actualUrl)) {
                    System.out.println("Website: " + actualUrl);
                    queue.add(actualUrl);
                }
            }
        }
    }

    /**
     * Downloads the document at {@code v} and returns its text.
     *
     * <p>Lines are joined with {@code '\n'} so that a URL ending one line is not
     * fused with the text starting the next line. The content is decoded as UTF-8.
     *
     * @param v the URL to download
     * @return the text read so far; empty (or partial) if the URL is malformed or
     *         the download fails
     */
    private String readUrl(String v) {
        StringBuilder rawHtml = new StringBuilder();
        try {
            URLConnection connection = new URL(v).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // try-with-resources closes the stream even when a read fails midway.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    rawHtml.append(inputLine).append('\n');
                }
            }
        } catch (IOException | RuntimeException e) {
            // Best effort: an unreachable or malformed link must not abort the crawl.
            System.err.println("Failed to read " + v + ": " + e);
        }
        return rawHtml.toString();
    }
}