Skip to content

Commit 012fc22

Browse files
authored
Merge pull request #78 from SummitStha/feature/analyze-extra-tags
Feature/analyze extra tags
2 parents de1855c + b883d4c commit 012fc22

5 files changed

Lines changed: 101 additions & 6 deletions

File tree

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,25 @@ output = analyze(site, sitemap)
5656
print(output)
5757
```
5858

59+
To analyze heading tags (h1-h6) and other additional tags as well, pass the following options to the `analyze` function:
60+
```python
61+
from seoanalyzer import analyze
62+
63+
output = analyze(site, sitemap, analyze_headings=True, analyze_extra_tags=True)
64+
65+
print(output)
66+
```
67+
68+
By default, the `analyze` function also follows and analyzes every internal link it finds, which can be time-consuming.
69+
This default behaviour can be changed to analyze only the provided URL by passing the following option to the `analyze` function:
70+
```python
71+
from seoanalyzer import analyze
72+
73+
output = analyze(site, sitemap, follow_links=False)
74+
75+
print(output)
76+
```
77+
5978
Alternatively, you can run the analysis as a script from the seoanalyzer folder.
6079

6180
```sh

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
beautifulsoup4==4.6.0
22
requests==2.20.0
33
Jinja2==2.11.3
4+
lxml==4.6.4
45
urllib3==1.26.5
56
certifi==2018.11.29

seoanalyzer/analyzer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@
44
from operator import itemgetter
55
from seoanalyzer.website import Website
66

7-
def analyze(url, sitemap_url=None):
7+
def analyze(url, sitemap_url=None, analyze_headings=False, analyze_extra_tags=False, follow_links=True):
88
start_time = time.time()
99

1010
def calc_total_time():
1111
return time.time() - start_time
1212

1313
output = {'pages': [], 'keywords': [], 'errors': [], 'total_time': calc_total_time()}
1414

15-
site = Website(url, sitemap_url)
15+
site = Website(url, sitemap_url, analyze_headings, analyze_extra_tags, follow_links)
1616

1717
site.crawl()
1818

seoanalyzer/page.py

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from bs4 import BeautifulSoup
77
from collections import Counter
8+
import lxml.html as lh
89
from string import punctuation
910
from urllib.parse import urlsplit
1011
from urllib3.exceptions import HTTPError
@@ -60,6 +61,25 @@
6061

6162
TOKEN_REGEX = re.compile(r'(?u)\b\w\w+\b')
6263

64+
# XPath expression for each heading level (h1-h6) inspected when
# heading analysis is enabled.
HEADING_TAGS_XPATHS = {f'h{level}': f'//h{level}' for level in range(1, 7)}
72+
73+
# XPath expressions for SEO-relevant <head> metadata, collected only when
# extra-tag analysis is enabled.
ADDITIONAL_TAGS_XPATHS = dict(
    title='//title/text()',
    meta_desc='//meta[@name="description"]/@content',
    viewport='//meta[@name="viewport"]/@content',
    charset='//meta[@charset]/@charset',
    canonical='//link[@rel="canonical"]/@href',
    alt_href='//link[@rel="alternate"]/@href',
    alt_hreflang='//link[@rel="alternate"]/@hreflang',
)
82+
6383
IMAGE_EXTENSIONS = set(['.img', '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.webp', '.avif',])
6484

6585

@@ -68,14 +88,16 @@ class Page():
6888
Container for each page and the core analyzer.
6989
"""
7090

71-
def __init__(self, url='', base_domain=''):
91+
def __init__(self, url='', base_domain='', analyze_headings=False, analyze_extra_tags=False):
7292
"""
7393
Variables go here, *not* outside of __init__
7494
"""
7595

7696
self.base_domain = urlsplit(base_domain)
7797
self.parsed_url = urlsplit(url)
7898
self.url = url
99+
self.analyze_headings = analyze_headings
100+
self.analyze_extra_tags = analyze_extra_tags
79101
self.title = ''
80102
self.description = ''
81103
self.keywords = {}
@@ -89,12 +111,17 @@ def __init__(self, url='', base_domain=''):
89111
self.stem_to_word = {}
90112
self.content_hash = None
91113

114+
if analyze_headings:
115+
self.headings = {}
116+
if analyze_extra_tags:
117+
self.additional_info = {}
118+
92119
def talk(self):
93120
"""
94121
Returns a dictionary that can be printed
95122
"""
96123

97-
return {
124+
context = {
98125
'url': self.url,
99126
'title': self.title,
100127
'description': self.description,
@@ -106,6 +133,13 @@ def talk(self):
106133
'content_hash': self.content_hash
107134
}
108135

136+
if self.analyze_headings:
137+
context['headings'] = self.headings
138+
if self.analyze_extra_tags:
139+
context['additional_info'] = self.additional_info
140+
141+
return context
142+
109143
def populate(self, bs):
110144
"""
111145
Populates the instance variables from BeautifulSoup
@@ -126,6 +160,34 @@ def populate(self, bs):
126160
if len(keywords) > 0:
127161
self.warn(f'Keywords should be avoided as they are a spam indicator and no longer used by Search Engines: {keywords}')
128162

163+
def analyze_heading_tags(self, bs):
    """
    Analyze the heading tags (h1-h6) and populate ``self.headings``.

    For every XPath in ``HEADING_TAGS_XPATHS`` that matches at least one
    element, stores the list of heading text contents under that tag name.

    :param bs: parsed document to inspect — presumably a BeautifulSoup
        object (it is serialized via ``str()`` before lxml parsing).
    """
    try:
        dom = lh.fromstring(str(bs))
    except ValueError:
        # lh.fromstring can reject certain str inputs with ValueError
        # (e.g. documents carrying an explicit encoding declaration);
        # retry from the raw UTF-8 bytes instead.
        dom = lh.fromstring(bs.encode('utf-8'))
    for tag, xpath in HEADING_TAGS_XPATHS.items():
        headings = [heading.text_content() for heading in dom.xpath(xpath)]
        if headings:
            # Direct item assignment instead of update({...}) — same
            # effect, one fewer throwaway dict.
            self.headings[tag] = headings
176+
177+
def analyze_additional_tags(self, bs):
    """
    Analyze additional <head> tags and populate ``self.additional_info``.

    For every XPath in ``ADDITIONAL_TAGS_XPATHS`` that matches, stores the
    raw list of matched values (text/attribute strings) under that key.

    :param bs: parsed document to inspect — presumably a BeautifulSoup
        object (it is serialized via ``str()`` before lxml parsing).
    """
    try:
        dom = lh.fromstring(str(bs))
    except ValueError:
        # lh.fromstring can reject certain str inputs with ValueError
        # (e.g. documents carrying an explicit encoding declaration);
        # retry from the raw UTF-8 bytes instead.
        dom = lh.fromstring(bs.encode('utf-8'))
    for tag, xpath in ADDITIONAL_TAGS_XPATHS.items():
        values = dom.xpath(xpath)
        if values:
            # Direct item assignment instead of update({...}) — same
            # effect, one fewer throwaway dict.
            self.additional_info[tag] = values
190+
129191
def analyze(self, raw_html=None):
130192
"""
131193
Analyze the page and populate the warnings list
@@ -192,6 +254,11 @@ def analyze(self, raw_html=None):
192254
self.analyze_img_tags(soup_lower)
193255
self.analyze_h1_tags(soup_lower)
194256

257+
if self.analyze_headings:
258+
self.analyze_heading_tags(soup_unmodified)
259+
if self.analyze_extra_tags:
260+
self.analyze_additional_tags(soup_unmodified)
261+
195262
return True
196263

197264
def word_list_freq_dist(self, wordlist):

seoanalyzer/website.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,12 @@
99
from seoanalyzer.page import Page
1010

1111
class Website():
12-
def __init__(self, base_url, sitemap):
12+
def __init__(self, base_url, sitemap, analyze_headings, analyze_extra_tags, follow_links):
1313
self.base_url = base_url
1414
self.sitemap = sitemap
15+
self.analyze_headings = analyze_headings
16+
self.analyze_extra_tags = analyze_extra_tags
17+
self.follow_links = follow_links
1518
self.crawled_pages = []
1619
self.crawled_urls = set([])
1720
self.page_queue = []
@@ -61,7 +64,9 @@ def crawl(self):
6164
if url in self.crawled_urls:
6265
continue
6366

64-
page = Page(url=url, base_domain=self.base_url)
67+
page = Page(url=url, base_domain=self.base_url,
68+
analyze_headings=self.analyze_headings,
69+
analyze_extra_tags=self.analyze_extra_tags)
6570

6671
if page.parsed_url.netloc != page.base_domain.netloc:
6772
continue
@@ -83,3 +88,6 @@ def crawl(self):
8388

8489
self.crawled_pages.append(page)
8590
self.crawled_urls.add(page.url)
91+
92+
if not self.follow_links:
93+
break

0 commit comments

Comments
 (0)