Skip to content

Commit 410dffe

Browse files
authored
Update website.py (#105)
1 parent 8710cb4 commit 410dffe

1 file changed

Lines changed: 51 additions & 64 deletions

File tree

pyseoanalyzer/website.py

Lines changed: 51 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
1-
from collections import Counter
2-
from collections import defaultdict
1+
from collections import Counter, defaultdict
32
from urllib.parse import urlsplit
43
from xml.dom import minidom
5-
64
import socket
75

86
from .http import http
@@ -19,7 +17,7 @@ def __init__(
1917
self.analyze_extra_tags = analyze_extra_tags
2018
self.follow_links = follow_links
2119
self.crawled_pages = []
22-
self.crawled_urls = set([])
20+
self.crawled_urls = set()
2321
self.page_queue = []
2422
self.wordcount = Counter()
2523
self.bigrams = Counter()
@@ -29,71 +27,60 @@ def __init__(
2927
def check_dns(self, url_to_check):
    """Return True if the hostname of *url_to_check* resolves in DNS.

    Malformed URLs (which yield no hostname) and resolution failures
    return False instead of raising, so callers can use this as a
    simple reachability predicate.
    """
    hostname = urlsplit(url_to_check).hostname
    if not hostname:
        # urlsplit() returns hostname=None for URLs without a netloc;
        # passing None to gethostbyname_ex() would raise TypeError.
        return False
    try:
        socket.gethostbyname_ex(hostname)
        return True
    except (socket.herror, socket.gaierror, UnicodeError):
        # UnicodeError covers hostname labels too long for IDNA encoding.
        return False
3834

3935
def get_text_from_xml(self, nodelist):
    """Concatenate the text content of every text node in *nodelist*.

    Non-text nodes (elements, comments, etc.) are ignored. Adapted from
    the example in the minidom documentation.
    """
    pieces = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            pieces.append(node.data)
    return "".join(pieces)
5040

5141
def crawl(self):
    """Crawl the site and aggregate per-page analysis results.

    Seeds the page queue from the sitemap (XML or plain-text, when one
    was configured) and the base URL, then analyzes every same-domain
    page, folding each page's word/bigram/trigram counts and content
    hash into the site-wide accumulators. Newly discovered links are
    appended to the queue as pages are analyzed; when ``follow_links``
    is False only the first analyzed page is processed.

    Errors are handled per phase: a broken sitemap does not prevent
    crawling the base URL, and one failing page does not abort the
    rest of the crawl.
    """
    if self.sitemap:
        try:
            page = http.get(self.sitemap)
            if self.sitemap.endswith("xml"):
                xmldoc = minidom.parseString(page.data.decode("utf-8"))
                sitemap_urls = xmldoc.getElementsByTagName("loc")
                for url in sitemap_urls:
                    self.page_queue.append(self.get_text_from_xml(url.childNodes))
            elif self.sitemap.endswith("txt"):
                sitemap_urls = page.data.decode("utf-8").split("\n")
                for url in sitemap_urls:
                    self.page_queue.append(url)
        except Exception as e:
            # Best effort: an unreachable or malformed sitemap should not
            # stop the crawl of the base URL below.
            print(f"Error occurred while processing sitemap: {e}")

    self.page_queue.append(self.base_url)

    # page_queue grows while we iterate (via extend below); Python's list
    # iterator picks up the appended URLs, which is how crawling proceeds.
    for url in self.page_queue:
        if url in self.crawled_urls:
            continue

        try:
            page = Page(
                url=url,
                base_domain=self.base_url,
                analyze_headings=self.analyze_headings,
                analyze_extra_tags=self.analyze_extra_tags,
            )

            # Skip off-domain links so the crawl stays on the target site.
            if page.parsed_url.netloc != page.base_domain.netloc:
                continue

            page.analyze()

            self.content_hashes[page.content_hash].add(page.url)
            self.wordcount.update(page.wordcount)
            self.bigrams.update(page.bigrams)
            self.trigrams.update(page.trigrams)

            self.page_queue.extend(page.links)

            self.crawled_pages.append(page)
            self.crawled_urls.add(page.url)
        except Exception as e:
            # One failing page must not abort the entire crawl; record the
            # error and move on to the next queued URL.
            print(f"Error occurred while crawling {url}: {e}")
            continue

        if not self.follow_links:
            break

0 commit comments

Comments
 (0)