Skip to content

Commit 9a43f31

Browse files
authored
Merge pull request #65 from gbrault/master
Added text sitemap parsing
2 parents a0641a7 + 021dcf9 commit 9a43f31

3 files changed

Lines changed: 57 additions & 5 deletions

File tree

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
# I don't want the python virtual env in github!
2+
venv
3+
4+
# nor visual
5+
.vscode
6+
17
*.py[cod]
28

39
# C extensions

seoanalyzer/website.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,15 @@ def get_text_from_xml(self, nodelist):
4545
def crawl(self):
4646
if self.sitemap:
4747
page = http.get(self.sitemap)
48-
xmldoc = minidom.parseString(page.data.decode('utf-8'))
49-
sitemap_urls = xmldoc.getElementsByTagName('loc')
50-
51-
for url in sitemap_urls:
52-
self.page_queue.append(self.get_text_from_xml(url.childNodes))
48+
if self.sitemap.endswith('xml'):
49+
xmldoc = minidom.parseString(page.data.decode('utf-8'))
50+
sitemap_urls = xmldoc.getElementsByTagName('loc')
51+
for url in sitemap_urls:
52+
self.page_queue.append(self.get_text_from_xml(url.childNodes))
53+
elif self.sitemap.endswith('txt'):
54+
sitemap_urls = page.data.decode('utf-8').split('\n')
55+
for url in sitemap_urls:
56+
self.page_queue.append(url)
5357

5458
self.page_queue.append(self.base_url)
5559

test.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
"""Command-line driver for seoanalyzer.

Analyzes a site (optionally seeded from a sitemap) and emits the report
either as JSON or as HTML rendered through the package's Jinja2 template.
Output goes to disk (test.json / test.html) by default, or to stdout
with ``--disk n``.
"""

import argparse
import inspect
import json
import os

from seoanalyzer import analyze

# Locate the installed seoanalyzer package so we can find its bundled
# 'templates' directory for HTML rendering.
module_path = os.path.dirname(inspect.getfile(analyze))

arg_parser = argparse.ArgumentParser()

arg_parser.add_argument('site', help='URL of the site you are wanting to analyze.')
arg_parser.add_argument('-s', '--sitemap', help='URL of the sitemap to seed the crawler with.')
arg_parser.add_argument('-f', '--output-format', help='Output format.', choices=['json', 'html', ],
                        default='json')
arg_parser.add_argument('-d', '--disk', help='save to disk', choices=['y', 'n', ], default='y')

args = arg_parser.parse_args()

output = analyze(args.site, args.sitemap)

if args.output_format == 'html':
    # Import jinja2 only when HTML output is requested, so the plain
    # JSON path does not require jinja2 to be installed.  (The original
    # script imported these both at module level and here — the
    # duplicate top-level imports have been removed.)
    from jinja2 import Environment
    from jinja2 import FileSystemLoader

    env = Environment(loader=FileSystemLoader(os.path.join(module_path, 'templates')))
    template = env.get_template('index.html')
    output_from_parsed_template = template.render(result=output)
    if args.disk == 'y':
        with open("test.html", "w", encoding='utf-8') as text_file:
            text_file.write(output_from_parsed_template)
    else:
        print(output_from_parsed_template)
elif args.output_format == 'json':
    if args.disk == 'y':
        with open("test.json", "w", encoding='utf-8') as text_file:
            text_file.write(json.dumps(output, indent=4, separators=(',', ': ')))
    else:
        print(json.dumps(output, indent=4, separators=(',', ': ')))

0 commit comments

Comments
 (0)