@@ -1,8 +1,6 @@
-from collections import Counter
-from collections import defaultdict
+from collections import Counter, defaultdict
 from urllib.parse import urlsplit
 from xml.dom import minidom
-
 import socket
 
 from .http import http
@@ -19,7 +17,7 @@ def __init__(
         self.analyze_extra_tags = analyze_extra_tags
         self.follow_links = follow_links
         self.crawled_pages = []
-        self.crawled_urls = set([])
+        self.crawled_urls = set()
         self.page_queue = []
         self.wordcount = Counter()
         self.bigrams = Counter()
@@ -29,71 +27,60 @@ def __init__(
     def check_dns(self, url_to_check):
         try:
             o = urlsplit(url_to_check)
-            socket.gethostbyname(o.hostname)
+            socket.gethostbyname_ex(o.hostname)
             return True
-        except:
-            pass
-
-        return False
+        except (socket.herror, socket.gaierror):
+            return False
 
     def get_text_from_xml(self, nodelist):
         """
         Stolen from the minidom documentation
         """
-        rc = []
-
-        for node in nodelist:
-            if node.nodeType == node.TEXT_NODE:
-                rc.append(node.data)
-
-        return "".join(rc)
+        return "".join(node.data for node in nodelist if node.nodeType == node.TEXT_NODE)
 
     def crawl(self):
-        if self.sitemap:
-            page = http.get(self.sitemap)
-            if self.sitemap.endswith("xml"):
-                xmldoc = minidom.parseString(page.data.decode("utf-8"))
-                sitemap_urls = xmldoc.getElementsByTagName("loc")
-                for url in sitemap_urls:
-                    self.page_queue.append(self.get_text_from_xml(url.childNodes))
-            elif self.sitemap.endswith("txt"):
-                sitemap_urls = page.data.decode("utf-8").split("\n")
-                for url in sitemap_urls:
-                    self.page_queue.append(url)
-
-        self.page_queue.append(self.base_url)
-
-        for url in self.page_queue:
-            if url in self.crawled_urls:
-                continue
-
-            page = Page(
-                url=url,
-                base_domain=self.base_url,
-                analyze_headings=self.analyze_headings,
-                analyze_extra_tags=self.analyze_extra_tags,
-            )
-
-            if page.parsed_url.netloc != page.base_domain.netloc:
-                continue
-
-            page.analyze()
-
-            self.content_hashes[page.content_hash].add(page.url)
-
-            for w in page.wordcount:
-                self.wordcount[w] += page.wordcount[w]
-
-            for b in page.bigrams:
-                self.bigrams[b] += page.bigrams[b]
-
-            for t in page.trigrams:
-                self.trigrams[t] += page.trigrams[t]
-
-            self.page_queue.extend(page.links)
-
-            self.crawled_pages.append(page)
-            self.crawled_urls.add(page.url)
-
-            if not self.follow_links:
-                break
+        try:
+            if self.sitemap:
+                page = http.get(self.sitemap)
+                if self.sitemap.endswith("xml"):
+                    xmldoc = minidom.parseString(page.data.decode("utf-8"))
+                    sitemap_urls = xmldoc.getElementsByTagName("loc")
+                    for url in sitemap_urls:
+                        self.page_queue.append(self.get_text_from_xml(url.childNodes))
+                elif self.sitemap.endswith("txt"):
+                    sitemap_urls = page.data.decode("utf-8").split("\n")
+                    for url in sitemap_urls:
+                        self.page_queue.append(url)
+
+            self.page_queue.append(self.base_url)
+
+            for url in self.page_queue:
+                if url in self.crawled_urls:
+                    continue
+
+                page = Page(
+                    url=url,
+                    base_domain=self.base_url,
+                    analyze_headings=self.analyze_headings,
+                    analyze_extra_tags=self.analyze_extra_tags,
+                )
+
+                if page.parsed_url.netloc != page.base_domain.netloc:
+                    continue
+
+                page.analyze()
+
+                self.content_hashes[page.content_hash].add(page.url)
+                self.wordcount.update(page.wordcount)
+                self.bigrams.update(page.bigrams)
+                self.trigrams.update(page.trigrams)
+
+                self.page_queue.extend(page.links)
+
+                self.crawled_pages.append(page)
+                self.crawled_urls.add(page.url)
+
+                if not self.follow_links:
+                    break
+        except Exception as e:
+            print(f"Error occurred during crawling: {e}")
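
A note on the check_dns change: socket.gethostbyname_ex raises socket.gaierror when a name fails to resolve (and socket.herror for host-address errors), so catching those two exceptions covers DNS failure without the old bare except, which also swallowed unrelated errors such as KeyboardInterrupt. A minimal standalone sketch of the same pattern, with resolves as a hypothetical function name:

import socket
from urllib.parse import urlsplit

def resolves(url):
    """Return True if the URL's hostname has a DNS entry (hypothetical helper)."""
    hostname = urlsplit(url).hostname
    try:
        # Returns (canonical_name, alias_list, ip_address_list) on success.
        socket.gethostbyname_ex(hostname)
        return True
    except (socket.herror, socket.gaierror):
        return False

print(resolves("https://example.com/"))           # True on a networked machine
print(resolves("https://no-such-host.invalid/"))  # False: .invalid never resolves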
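The get_text_from_xml rewrite is a behavior-preserving refactor: joining the data of TEXT_NODE children via a generator expression instead of accumulating them in a list. A self-contained sketch of how that one-liner behaves on a sitemap-style document (the XML snippet here is invented for illustration):

from xml.dom import minidom

xml_snippet = """<urlset>
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about/</loc></url>
</urlset>"""

doc = minidom.parseString(xml_snippet)
for loc in doc.getElementsByTagName("loc"):
    # Same expression as the refactored method.
    text = "".join(node.data for node in loc.childNodes if node.nodeType == node.TEXT_NODE)
    print(text)
# https://example.com/
# https://example.com/about/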
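The three per-page accumulation loops collapse to Counter.update because, unlike dict.update, Counter.update adds counts for keys that already exist rather than replacing them. A small illustration:

from collections import Counter

totals = Counter({"seo": 2, "crawl": 1})
page_counts = Counter({"seo": 3, "sitemap": 1})

totals.update(page_counts)  # adds counts key by key, no replacement
print(totals)  # Counter({'seo': 5, 'crawl': 1, 'sitemap': 1})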