55
66from bs4 import BeautifulSoup
77from collections import Counter
8+ import lxml .html as lh
89from string import punctuation
910from urllib .parse import urlsplit
1011from urllib3 .exceptions import HTTPError
6061
6162TOKEN_REGEX = re .compile (r'(?u)\b\w\w+\b' )
6263
# XPath query for each HTML heading level, keyed by tag name (h1..h6).
HEADING_TAGS_XPATHS = {f'h{level}': f'//h{level}' for level in range(1, 7)}
72+
# XPath queries for SEO-relevant <head> elements, keyed by a short label.
ADDITIONAL_TAGS_XPATHS = dict(
    title='//title/text()',
    meta_desc='//meta[@name="description"]/@content',
    viewport='//meta[@name="viewport"]/@content',
    charset='//meta[@charset]/@charset',
    canonical='//link[@rel="canonical"]/@href',
    alt_href='//link[@rel="alternate"]/@href',
    alt_hreflang='//link[@rel="alternate"]/@hreflang',
)
82+
# File extensions treated as images when inspecting link/img targets.
# Set literal instead of set([...]) — same members, idiomatic form (ruff C405).
IMAGE_EXTENSIONS = {'.img', '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg', '.webp', '.avif'}
6484
6585
@@ -68,14 +88,16 @@ class Page():
6888 Container for each page and the core analyzer.
6989 """
7090
71- def __init__ (self , url = '' , base_domain = '' ):
91+ def __init__ (self , url = '' , base_domain = '' , analyze_headings = False , analyze_extra_tags = False ):
7292 """
7393 Variables go here, *not* outside of __init__
7494 """
7595
7696 self .base_domain = urlsplit (base_domain )
7797 self .parsed_url = urlsplit (url )
7898 self .url = url
99+ self .analyze_headings = analyze_headings
100+ self .analyze_extra_tags = analyze_extra_tags
79101 self .title = ''
80102 self .description = ''
81103 self .keywords = {}
@@ -89,12 +111,17 @@ def __init__(self, url='', base_domain=''):
89111 self .stem_to_word = {}
90112 self .content_hash = None
91113
114+ if analyze_headings :
115+ self .headings = {}
116+ if analyze_extra_tags :
117+ self .additional_info = {}
118+
92119 def talk (self ):
93120 """
94121 Returns a dictionary that can be printed
95122 """
96123
97- return {
124+ context = {
98125 'url' : self .url ,
99126 'title' : self .title ,
100127 'description' : self .description ,
@@ -106,6 +133,13 @@ def talk(self):
106133 'content_hash' : self .content_hash
107134 }
108135
136+ if self .analyze_headings :
137+ context ['headings' ] = self .headings
138+ if self .analyze_extra_tags :
139+ context ['additional_info' ] = self .additional_info
140+
141+ return context
142+
109143 def populate (self , bs ):
110144 """
111145 Populates the instance variables from BeautifulSoup
@@ -126,6 +160,34 @@ def populate(self, bs):
126160 if len (keywords ) > 0 :
127161 self .warn (f'Keywords should be avoided as they are a spam indicator and no longer used by Search Engines: { keywords } ' )
128162
163+ def analyze_heading_tags (self , bs ):
164+ """
165+ Analyze the heading tags and populate the headings
166+ """
167+
168+ try :
169+ dom = lh .fromstring (str (bs ))
170+ except ValueError as _ :
171+ dom = lh .fromstring (bs .encode ('utf-8' ))
172+ for tag , xpath in HEADING_TAGS_XPATHS .items ():
173+ value = [heading .text_content () for heading in dom .xpath (xpath )]
174+ if value :
175+ self .headings .update ({tag : value })
176+
177+ def analyze_additional_tags (self , bs ):
178+ """
179+ Analyze additional tags and populate the additional info
180+ """
181+
182+ try :
183+ dom = lh .fromstring (str (bs ))
184+ except ValueError as _ :
185+ dom = lh .fromstring (bs .encode ('utf-8' ))
186+ for tag , xpath in ADDITIONAL_TAGS_XPATHS .items ():
187+ value = dom .xpath (xpath )
188+ if value :
189+ self .additional_info .update ({tag : value })
190+
129191 def analyze (self , raw_html = None ):
130192 """
131193 Analyze the page and populate the warnings list
@@ -192,6 +254,11 @@ def analyze(self, raw_html=None):
192254 self .analyze_img_tags (soup_lower )
193255 self .analyze_h1_tags (soup_lower )
194256
257+ if self .analyze_headings :
258+ self .analyze_heading_tags (soup_unmodified )
259+ if self .analyze_extra_tags :
260+ self .analyze_additional_tags (soup_unmodified )
261+
195262 return True
196263
197264 def word_list_freq_dist (self , wordlist ):
0 commit comments