55
66from bs4 import BeautifulSoup
77from collections import Counter
8+ import lxml .html as lh
89from string import punctuation
910from urllib .parse import urlsplit
1011from urllib3 .exceptions import HTTPError
6061
6162TOKEN_REGEX = re .compile (r'(?u)\b\w\w+\b' )
6263
# XPath query for each HTML heading level, keyed by tag name (h1..h6).
HEADING_TAGS_XPATHS = {f'h{level}': f'//h{level}' for level in range(1, 7)}
72+
# XPath queries for SEO-relevant <head> elements, keyed by a short label.
ADDITIONAL_TAGS_XPATHS = dict(
    title='//title/text()',
    meta_desc='//meta[@name="description"]/@content',
    viewport='//meta[@name="viewport"]/@content',
    charset='//meta[@charset]/@charset',
    canonical='//link[@rel="canonical"]/@href',
    alt_href='//link[@rel="alternate"]/@href',
    alt_hreflang='//link[@rel="alternate"]/@hreflang',
)
82+
# File extensions treated as images when inspecting link/img targets.
# Set literal instead of set([...]) — same members, idiomatic form (ruff C405).
IMAGE_EXTENSIONS = {'.img', '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.webp', '.avif'}
6484
6585
@@ -68,14 +88,16 @@ class Page():
6888 Container for each page and the core analyzer.
6989 """
7090
71- def __init__ (self , url = '' , base_domain = '' ):
91+ def __init__ (self , url = '' , base_domain = '' , analyze_headings = False , analyze_extra_tags = False ):
7292 """
7393 Variables go here, *not* outside of __init__
7494 """
7595
7696 self .base_domain = urlsplit (base_domain )
7797 self .parsed_url = urlsplit (url )
7898 self .url = url
99+ self .analyze_headings = analyze_headings
100+ self .analyze_extra_tags = analyze_extra_tags
79101 self .title = ''
80102 self .description = ''
81103 self .keywords = {}
@@ -89,12 +111,17 @@ def __init__(self, url='', base_domain=''):
89111 self .stem_to_word = {}
90112 self .content_hash = None
91113
114+ if analyze_headings :
115+ self .headings = {}
116+ if analyze_extra_tags :
117+ self .additional_info = {}
118+
92119 def talk (self ):
93120 """
94121 Returns a dictionary that can be printed
95122 """
96123
97- return {
124+ context = {
98125 'url' : self .url ,
99126 'title' : self .title ,
100127 'description' : self .description ,
@@ -106,6 +133,13 @@ def talk(self):
106133 'content_hash' : self .content_hash
107134 }
108135
136+ if self .analyze_headings :
137+ context ['headings' ] = self .headings
138+ if self .analyze_extra_tags :
139+ context ['additional_info' ] = self .additional_info
140+
141+ return context
142+
109143 def populate (self , bs ):
110144 """
111145 Populates the instance variables from BeautifulSoup
@@ -126,6 +160,34 @@ def populate(self, bs):
126160 if len (keywords ) > 0 :
127161 self .warn (f'Keywords should be avoided as they are a spam indicator and no longer used by Search Engines: { keywords } ' )
128162
163+ def analyze_heading_tags (self , bs ):
164+ """
165+ Analyze the heading tags and populate the headings
166+ """
167+
168+ try :
169+ dom = lh .fromstring (str (bs ))
170+ except ValueError as _ :
171+ dom = lh .fromstring (bs .encode ('utf-8' ))
172+ for tag , xpath in HEADING_TAGS_XPATHS .items ():
173+ value = [heading .text_content () for heading in dom .xpath (xpath )]
174+ if value :
175+ self .headings .update ({tag : value })
176+
177+ def analyze_additional_tags (self , bs ):
178+ """
179+ Analyze additional tags and populate the additional info
180+ """
181+
182+ try :
183+ dom = lh .fromstring (str (bs ))
184+ except ValueError as _ :
185+ dom = lh .fromstring (bs .encode ('utf-8' ))
186+ for tag , xpath in ADDITIONAL_TAGS_XPATHS .items ():
187+ value = dom .xpath (xpath )
188+ if value :
189+ self .additional_info .update ({tag : value })
190+
129191 def analyze (self , raw_html = None ):
130192 """
131193 Analyze the page and populate the warnings list
@@ -192,6 +254,11 @@ def analyze(self, raw_html=None):
192254 self .analyze_img_tags (soup_lower )
193255 self .analyze_h1_tags (soup_lower )
194256
257+ if self .analyze_headings :
258+ self .analyze_heading_tags (soup_unmodified )
259+ if self .analyze_extra_tags :
260+ self .analyze_additional_tags (soup_unmodified )
261+
195262 return True
196263
197264 def word_list_freq_dist (self , wordlist ):
0 commit comments