From 037998bf6731b37f6205299b5f64fd2aef5c0d44 Mon Sep 17 00:00:00 2001 From: lhaarman Date: Tue, 19 May 2026 14:26:14 +0000 Subject: [PATCH 01/26] Add configuration for skippable domains such as social media --- config/config_template.yaml | 1 + src/crawl/HesitantCrawler.py | 11 ++++++++++- src/scrape/__init__.py | 7 ++++++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/config/config_template.yaml b/config/config_template.yaml index c53d1fe..c414ac6 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -14,6 +14,7 @@ requests: input: input_dir: ../input input_files: + skip_domains: urls: urls.txt keywords: keywords.txt url_max: 100 diff --git a/src/crawl/HesitantCrawler.py b/src/crawl/HesitantCrawler.py index 74a9de4..7ae3d74 100644 --- a/src/crawl/HesitantCrawler.py +++ b/src/crawl/HesitantCrawler.py @@ -21,7 +21,8 @@ def __init__( fetcher: HTMLFetcher, target_keywords: List[str], add_sitemapurls: bool = False, - max_depth: int = 1): + max_depth: int = 1, + skip_domains: List[str] = []): """ Depth-limited Search Targeted Crawler Crawler class for obtaining urls from start_url. @@ -64,6 +65,10 @@ def __init__( self.target_keywords = target_keywords logging.info(f"The targeted crawl will look for given keywords: {', '.join(self.target_keywords)}") + # Skip domains + self.skip_domains = skip_domains + logging.info(f"The targeted crawl will skip domains: {', '.join(self.skip_domains)}") + # Excluded URLs which contain: self._unsupported = ( ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps", @@ -91,6 +96,10 @@ def skip_this_url(self, url: str) -> bool: # prevent duplicate crawl from trailing forward slash in URL url = url.rstrip('/') if url.endswith('/') else url + if any([skip_domain in url for skip_domain in self.skip_domains]): + logging.debug(f"Skip {url}, because domain is in skip-list") + return True # skip + # Do not revisit pages if url in self._visited: logging.debug(f"Skip {url}, because we have visited it before") diff --git a/src/scrape/__init__.py b/src/scrape/__init__.py index ded790d..4ebab41 100644 --- a/src/scrape/__init__.py +++ b/src/scrape/__init__.py @@ -16,12 +16,17 @@ def build_webfocusedscraper(user_agent: str) -> IScraper: with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}", 'r', encoding='utf-8') as file_in: target_keywords = [line.rstrip() for line in file_in] + with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}", 'r', encoding='utf-8') as file_in: + skip_domains = [line.rstrip() for line in file_in] + fetcher = HTMLFetcher(user_agent=user_agent) crawler = HesitantCrawler( fetcher=fetcher, target_keywords=target_keywords, add_sitemapurls=CONFIG.crawl.use_sitemap, - max_depth=CONFIG.crawl.max_depth) + max_depth=CONFIG.crawl.max_depth, + skip_domains=skip_domains + ) htmlparser = HTMLBodyParser() return Scraper( From cab73b4427708a16e0c4c20d8b7948a5bdc60cc5 Mon Sep 17 00:00:00 2001 From: lhaarman Date: Thu, 21 May 2026 11:59:09 +0000 Subject: [PATCH 02/26] Basic setup scrapy with multiprocessing; TODO testing --- src/crawl/HesitantCrawler.py | 3 + src/crawl/__init__.py | 3 +- src/crawl/base.py | 3 +- src/crawl/scrapymodules/HesitantSpider.py | 141 ++++++++++++++++++ .../scrapymodules/ScrapyCrawlMiddleware.py | 27 ++++ src/crawl/scrapymodules/ScrapyResult.py | 7 + src/crawl/scrapymodules/__init__.py | 3 + src/main.py | 1 - src/main_scrapy.py | 118 +++++++++++++++ src/util/__init__.py | 3 +- src/util/urls.py | 35 +++++ 11 files changed, 340 insertions(+), 4 deletions(-) create mode 100644 src/crawl/scrapymodules/HesitantSpider.py create mode 100644 src/crawl/scrapymodules/ScrapyCrawlMiddleware.py create mode 100644 src/crawl/scrapymodules/ScrapyResult.py create mode 100644 src/crawl/scrapymodules/__init__.py create mode 100644 src/main_scrapy.py create mode 100644 src/util/urls.py diff --git a/src/crawl/HesitantCrawler.py b/src/crawl/HesitantCrawler.py index 7ae3d74..2703494 100644 --- a/src/crawl/HesitantCrawler.py +++ b/src/crawl/HesitantCrawler.py @@ -96,6 +96,9 @@ def skip_this_url(self, url: str) -> bool: # prevent duplicate crawl from trailing forward slash in URL url = url.rstrip('/') if url.endswith('/') else url + # prevent duplicate crawl from '#' such as '#content', '#main', etc. + url = url.rstrip("#") if url.contains("#") else url + if any([skip_domain in url for skip_domain in self.skip_domains]): logging.debug(f"Skip {url}, because domain is in skip-list") return True # skip diff --git a/src/crawl/__init__.py b/src/crawl/__init__.py index 77999db..6fe4b71 100644 --- a/src/crawl/__init__.py +++ b/src/crawl/__init__.py @@ -1,2 +1,3 @@ from .base import ICrawler, NoCrawler, BaseCrawler, CrawlResult -from .HesitantCrawler import HesitantCrawler \ No newline at end of file +from .HesitantCrawler import HesitantCrawler +from .scrapymodules import ScrapyResult \ No newline at end of file diff --git a/src/crawl/base.py b/src/crawl/base.py index 36b51ca..1a9f576 100644 --- a/src/crawl/base.py +++ b/src/crawl/base.py @@ -2,7 +2,7 @@ from typing import NamedTuple, List import logging from urllib.parse import urlparse - +from scrapy.http import Response from fetch import IFetcher @@ -11,6 +11,7 @@ class CrawlResult(NamedTuple): source: str targeted: bool = None first_keyword_hit: str = None + crawl_depth: int = 0 class ICrawler(ABC): diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/crawl/scrapymodules/HesitantSpider.py new file mode 100644 index 0000000..e535647 --- /dev/null +++ b/src/crawl/scrapymodules/HesitantSpider.py @@ -0,0 +1,141 @@ +from typing import List +import scrapy +import validators +from urllib.parse import urlparse, urljoin +import logging +import re +from .ScrapyResult import ScrapyResult + +class HesitantSpider(scrapy.Spider): + name = "hesitant-spider" + + # Define custom settings as a class attribute + custom_settings = { + "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "AUTOTHROTTLE_ENABLED": True, # Auto throttle to maximize speed without risking blocks + "AUTOTHROTTLE_START_DELAY": 1.0, # Start slow to "warm up" + "AUTOTHROTTLE_MAX_DELAY": 10.0, # Never wait more than 10s + "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0, # Aim for 1 request per worker at a time + "DOWNLOAD_DELAY": 0, # Let Autothrottle handle the delay + } + + def __init__( + self, + start_urls: str, + target_keywords: List[str] = [], + add_sitemap_urls: bool = False, + max_depth: int = 1, + skip_domains: List[str] = [], + *args, **kwargs + ): + super(HesitantSpider, self).__init__(*args, **kwargs) + + self.start_urls = start_urls + self.logger.debug(f"Init start_urls: {self.start_urls}") + self.max_depth = max_depth + self.logger.debug(f"Init max depth: {self.max_depth}") + self.skip_domains = skip_domains + self.logger.debug(f"Init skip domains: {self.skip_domains}") + self.target_keywords = target_keywords + self.logger.debug(f"Init target keywords: {self.target_keywords}") + + self._unsupported = ( + ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps", + ".woff2", ".svg", ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".aiff", ".3gp", ".asf", ".asx", ".avi", ".mp4", + ".woff", ".mpg", ".qt", ".rm", ".swf", ".wmv", ".m4a", ".css", ".pdf", ".doc", ".docx", ".exe", ".bin", ".rss", ".zip", + ".rar", ".msu", ".flv", ".dmg", ".xls", ".xlsx", ".ico", ".mng?download=true", ".pct?download=true", ".bmp?download=true", + ".gif?download=true", ".jpg?download=true", ".jpeg?download=true", ".png?download=true", ".pst?download=true", + ".psp?download=true", ".tif?download=true", ".tiff?download=true", ".ai?download=true", ".drw?download=true", + ".dxf?download=true", ".eps?download=true", ".ps?download=true", ".svg?download=true", ".mp3?download=true", + ".wma?download=true", ".ogg?download=true", ".wav?download=true", ".ra?download=true", ".aac?download=true", + ".mid?download=true", ".au?download=true", ".aiff?download=true", ".3gp?download=true", ".asf?download=true", + ".asx?download=true", ".avi?download=true", ".mov?download=true", ".mp4?download=true", ".mpg?download=true", + ".qt?download=true", ".rm?download=true", ".swf?download=true", ".wmv?download=true", ".m4a?download=true", + ".css?download=true", ".pdf?download=true", ".doc?download=true", ".exe?download=true", ".bin?download=true", + ".rss?download=true", ".zip?download=true", ".rar?download=true", ".msu?download=true", ".flv?download=true", + ".dmg?download=true") + self.logger.debug(f"URLs will be excluded if they contain any in path:{', '.join(self._unsupported)}") + + self.results = [] + self.visited = set() + + if max_depth < 0: + self.logger.debug("Only urls from starting_url can be found, max_depth < 0") + + def url_is_target(self, url: str) -> bool: + for keyword in self.target_keywords: + first_keyword_hit = re.search(keyword, url) + if first_keyword_hit is not None: + return True + + def skip_this_url(self, url: str) -> bool: + """Function to see if we have already visited url""" + + if not validators.url(url): + return True + + if any(ext in url for ext in self._unsupported): + self.logger.debug(f"Skip {url}, because extension is unsupported") + return True + + # prevent duplicate crawl from trailing forward slash in URL + url = url.rstrip('/') if url.endswith('/') else url + + # prevent duplicate crawl from '#' such as '#content', '#main', etc. + url = url.rstrip("#") if "#" in url else url + + if any([skip_domain in url for skip_domain in self.skip_domains]): + self.logger.debug(f"Skip {url}, because domain is in skip-list") + return True # skip + + # Do not revisit pages + if url in self.visited: + self.logger.debug(f"Skip {url}, because we have visited it before") + return True # skip + return False + + async def start(self): + for start_url in self.start_urls: + yield scrapy.Request(url=start_url, callback=self.parse, meta={"depth": 0}) + + def parse(self, response): + self.logger.debug(f"Parsing url: {response.url}") + self.visited.add(response.url) + + yield {"url": response.url, "html":response.text[:10]} + current_depth = response.meta.get("depth", 0) + if not self.url_is_target(response.url) and current_depth >= self.max_depth: + return + + # Process the current page + if self.url_is_target(response.url): + # Add results + self.results.append( + ScrapyResult( + url=response.url, + status=response.status, + text=response.text[:1], + crawl_depth=current_depth + ) + ) + + # Reset current depth because we found target at current page + current_depth = 0 + + # Extract and follow links + for link in response.css("a::attr(href)").getall(): + url = urljoin(response.url, link) + + # Keep crawling restricted to the start domain and avoid skipped domains + if self.skip_this_url(url): + continue + + yield scrapy.Request( + url=url, + callback=self.parse, + meta={"depth": current_depth + 1} + ) + + def closed(self, reason): + """Optional: Scrapy built-in method called when the spider finishes""" + print(f"Spider closed because of: {reason}. Total collected pages: {len(self.results)}") \ No newline at end of file diff --git a/src/crawl/scrapymodules/ScrapyCrawlMiddleware.py b/src/crawl/scrapymodules/ScrapyCrawlMiddleware.py new file mode 100644 index 0000000..7ce42ea --- /dev/null +++ b/src/crawl/scrapymodules/ScrapyCrawlMiddleware.py @@ -0,0 +1,27 @@ +import logging +from scrapy.exceptions import IgnoreRequest + +exceptions = [ + ".txt", + ".xml", + ".rss" +] + + +class TextTypeFilterMiddleware: + """ + Drops any response that isn't HTML or XHTML. + """ + def process_response(self, request, response, spider): + if any([response.url.endswith(exception) for exception in exceptions]): + logging.debug(f"Making exception bypass for url: {response.url}") + return response + content_type = response.headers.get('Content-Type', b'').decode('utf-8').lower() + + # Only allow HTML-based content + if 'text/html' not in content_type and 'application/xhtml+xml' not in content_type and 'application/xml' not in content_type: + logging.info(f"\t\tTextTypeFilterMiddleware: Skipping non-text content: {response.url} ({content_type})") + # Returning None tells Scrapy to drop this response entirely + raise IgnoreRequest("Not Text type response, ignore request") + + return response diff --git a/src/crawl/scrapymodules/ScrapyResult.py b/src/crawl/scrapymodules/ScrapyResult.py new file mode 100644 index 0000000..f1be36b --- /dev/null +++ b/src/crawl/scrapymodules/ScrapyResult.py @@ -0,0 +1,7 @@ +from typing import NamedTuple + +class ScrapyResult(NamedTuple): + url: str + status: str + text: str + crawl_depth: int = 0 diff --git a/src/crawl/scrapymodules/__init__.py b/src/crawl/scrapymodules/__init__.py new file mode 100644 index 0000000..12459fc --- /dev/null +++ b/src/crawl/scrapymodules/__init__.py @@ -0,0 +1,3 @@ +from .HesitantSpider import HesitantSpider +from .ScrapyResult import ScrapyResult +from .ScrapyCrawlMiddleware import TextTypeFilterMiddleware \ No newline at end of file diff --git a/src/main.py b/src/main.py index e7a9b4f..27c7fb3 100644 --- a/src/main.py +++ b/src/main.py @@ -45,4 +45,3 @@ def main(): # CONFIG = setup("../config/config.yaml") # df = pd.read_parquet(f"{CONFIG.output.output_dir}/20260304_080625", engine="pyarrow") # print(df.head()) - diff --git a/src/main_scrapy.py b/src/main_scrapy.py new file mode 100644 index 0000000..a154eb9 --- /dev/null +++ b/src/main_scrapy.py @@ -0,0 +1,118 @@ +import os +import logging +import numpy as np +import multiprocessing +import sys +from datetime import datetime + +from scrapy.crawler import CrawlerProcess + +from util import setup, normalize_url +from crawl.scrapymodules import HesitantSpider + +CONFIG = setup("config/config.yaml") + + +def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, logfile): + print(f"Args: urls: {urls}, keywords: {keywords}, skip_domains: {skip_domains}, process_id: {process_id} ") + print(f"Starting crawling process (PID: {process_id}, OSPID: {os.getpid()}) for {urls}!") + project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) + if project_root not in sys.path: + sys.path.insert(0, project_root) + + process = CrawlerProcess( + settings={ + "ROBOTSTXT_OBEY": True, + "LOG_LEVEL": "INFO", + "LOG_FILE": logfile, + "DOWNLOADER_MIDDLEWARES": { + "src.crawl.scrapymodules.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority + }, + "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"] + } + ) + + root_logger = logging.getLogger() + root_logger.setLevel(log_level) + root_logger.handlers = [] + + fileHandler = logging.FileHandler(logfile) + fileHandler.setLevel(log_level) + root_logger.addHandler(fileHandler) + + # Remove console output + # We get the logger that Scrapy uses and remove all handlers that print to the console + scrapy_logger = logging.getLogger('scrapy') + for handler in scrapy_logger.handlers[:]: + scrapy_logger.removeHandler(handler) + + # (Optional) If you want to be extremely thorough, silence the engine too + logging.getLogger('twisted').handlers = [] + + spiderCrawler = process.create_crawler(HesitantSpider) + + process.crawl( + spiderCrawler, + start_urls=urls, + max_depth=1, + target_keywords=keywords, + skip_domains=skip_domains + ) + + if len(urls) == 0: + return [] + + try: + process.start() + except Exception as e: + print(f"Got here! Error {e}") + + if spiderCrawler.spider is not None: + print(f"Returning results of length for PID {process_id}: {len(spiderCrawler.spider.results)}") + return spiderCrawler.spider.results + + +if __name__ == "__main__": + + # Input URLs + file_urls = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.urls}" + logging.info(f"Reading list of base-urls from file: {file_urls}") + with open(file_urls, 'r', encoding='utf-8') as file_in: + urls = [line.rstrip() for line in file_in] + + # Normalize URLs + urls = [*map(normalize_url, urls)] + + # Keywords + file_keywords = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}" + logging.info(f"Reading list of keywords from file: {file_keywords}") + with open(file_keywords, 'r', encoding='utf-8') as file_in: + target_keywords = [line.rstrip() for line in file_in] + + # Skip domains + file_skip_domains = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}" + logging.info(f"Reading list of skip_domains from file: {file_skip_domains}") + with open(file_skip_domains, 'r', encoding='utf-8') as file_in: + skip_domains = [line.rstrip() for line in file_in] + + num_workers = min([len(urls), 6]) + batch_size = len(urls) // num_workers if len(urls) > num_workers else 1 + url_chunks = np.array_split(urls, num_workers) + + chunked_args = [] + + logfile = f"output/logs/log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log" + + for i in range(0, num_workers): + chunked_args.append( + (url_chunks[i], target_keywords, skip_domains, i, logging.INFO, logfile) + ) + + print("# Workers:", num_workers) + + print("# Cores available:", multiprocessing.cpu_count()) + with multiprocessing.Pool(processes=num_workers) as pool: + results = sum(pool.starmap(spawn_spider_process, chunked_args), []) + + print("Results:", results) + print("#Results:", len(results)) diff --git a/src/util/__init__.py b/src/util/__init__.py index 46cd90e..bb2a8cc 100644 --- a/src/util/__init__.py +++ b/src/util/__init__.py @@ -1 +1,2 @@ -from .setup import setup \ No newline at end of file +from .setup import setup +from .urls import normalize_url \ No newline at end of file diff --git a/src/util/urls.py b/src/util/urls.py new file mode 100644 index 0000000..62cb112 --- /dev/null +++ b/src/util/urls.py @@ -0,0 +1,35 @@ +from urllib.parse import urlparse, urlunparse +import re + + +# Normalize URL to make sure crawler can handle it without issue +def normalize_url(url): + # Handle case where there is no scheme at all + if not re.match(r'^[a-zA-Z]+://', url): + url = 'https://' + url + + parsed = urlparse(url) + + # 2. Force HTTPS + scheme = 'https' + + # 3. Handle the domain (netloc) + netloc = parsed.netloc.lower() + + # Remove existing 'www.' to re-add cleanly + if netloc.startswith('www.'): + netloc = netloc[4:] + + netloc = 'www.' + netloc + + # Reconstruct URL + new_url = urlunparse(( + scheme, + netloc, + parsed.path, + parsed.params, + parsed.query, + parsed.fragment + )) + + return new_url From 2c65ebe78659184e084844d07a7c5b3dbf2020b6 Mon Sep 17 00:00:00 2001 From: lhaarman Date: Thu, 28 May 2026 08:53:49 +0000 Subject: [PATCH 03/26] temp commit; saving result --- src/analysis/analyze_results.py | 2 + src/crawl/HesitantCrawler.py | 4 +- src/crawl/scrapymodules/HesitantSpider.py | 161 ++++++++++++++++++---- src/crawl/scrapymodules/ScrapyResult.py | 6 +- src/main.py | 5 + src/main_scrapy.py | 45 ++++-- src/parse/HTML.py | 2 +- 7 files changed, 183 insertions(+), 42 deletions(-) diff --git a/src/analysis/analyze_results.py b/src/analysis/analyze_results.py index fb52cb1..62b5843 100644 --- a/src/analysis/analyze_results.py +++ b/src/analysis/analyze_results.py @@ -137,6 +137,8 @@ def __iter__(self): logging.debug(f"Total number of base-urls with scraped content: {len(get_baseurls(df=total))}.") logging.debug(f"Total number of pages downloaded: {total.shape[0]}.") + dfs.to_parquet("output/output.parquet") + gr = total.groupby(by='base_url', as_index=False)['url'].count() gr = gr.rename(columns={'url': 'pages', 'base_url': 'count'}) gr = gr.groupby(by='pages', as_index=False).count() diff --git a/src/crawl/HesitantCrawler.py b/src/crawl/HesitantCrawler.py index 2703494..9d07853 100644 --- a/src/crawl/HesitantCrawler.py +++ b/src/crawl/HesitantCrawler.py @@ -97,11 +97,11 @@ def skip_this_url(self, url: str) -> bool: url = url.rstrip('/') if url.endswith('/') else url # prevent duplicate crawl from '#' such as '#content', '#main', etc. - url = url.rstrip("#") if url.contains("#") else url + url = url.rstrip("#") if "#" in url else url if any([skip_domain in url for skip_domain in self.skip_domains]): logging.debug(f"Skip {url}, because domain is in skip-list") - return True # skip + return True # skip # Do not revisit pages if url in self._visited: diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/crawl/scrapymodules/HesitantSpider.py index e535647..e7b35c5 100644 --- a/src/crawl/scrapymodules/HesitantSpider.py +++ b/src/crawl/scrapymodules/HesitantSpider.py @@ -1,18 +1,20 @@ from typing import List import scrapy import validators -from urllib.parse import urlparse, urljoin -import logging +from urllib.parse import urljoin, urlparse import re +import pandas as pd from .ScrapyResult import ScrapyResult +from parse import HTMLBodyParser + class HesitantSpider(scrapy.Spider): name = "hesitant-spider" - + # Define custom settings as a class attribute custom_settings = { "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "AUTOTHROTTLE_ENABLED": True, # Auto throttle to maximize speed without risking blocks + "AUTOTHROTTLE_ENABLED": True, # Auto throttle to maximize speed without risking blocks "AUTOTHROTTLE_START_DELAY": 1.0, # Start slow to "warm up" "AUTOTHROTTLE_MAX_DELAY": 10.0, # Never wait more than 10s "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0, # Aim for 1 request per worker at a time @@ -26,19 +28,37 @@ def __init__( add_sitemap_urls: bool = False, max_depth: int = 1, skip_domains: List[str] = [], + skip_paths: List[str] = [], + allowed_top_level_domains: List[str] = [".com"], + batch_size: int = 100, + output_file: str = "output.parquet", + max_jumps: int = 1, *args, **kwargs ): super(HesitantSpider, self).__init__(*args, **kwargs) - + self.start_urls = start_urls self.logger.debug(f"Init start_urls: {self.start_urls}") self.max_depth = max_depth self.logger.debug(f"Init max depth: {self.max_depth}") self.skip_domains = skip_domains self.logger.debug(f"Init skip domains: {self.skip_domains}") + self.skip_paths = skip_paths + self.logger.debug(f"Init skip domains: {self.skip_paths}") + self.allowed_top_level_domains = allowed_top_level_domains + self.logger.debug(f"Init allowed_top_level_domains: {self.allowed_top_level_domains}") self.target_keywords = target_keywords self.logger.debug(f"Init target keywords: {self.target_keywords}") + self.batch_size = batch_size + self.batch_counter = 0 + self.logger.debug(f"Init batch_size: {self.batch_size}") + self.max_jumps = max_jumps + + self.output_file = output_file + self.logger.debug(f"Init output file: {self.output_file}") + + self._htmlparser = HTMLBodyParser() self._unsupported = ( ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps", ".woff2", ".svg", ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".aiff", ".3gp", ".asf", ".asx", ".avi", ".mp4", @@ -55,73 +75,149 @@ def __init__( ".rss?download=true", ".zip?download=true", ".rar?download=true", ".msu?download=true", ".flv?download=true", ".dmg?download=true") self.logger.debug(f"URLs will be excluded if they contain any in path:{', '.join(self._unsupported)}") - + + self.batch = [] self.results = [] self.visited = set() - + if max_depth < 0: self.logger.debug("Only urls from starting_url can be found, max_depth < 0") + async def start(self): + for start_url in self.start_urls: + yield scrapy.Request( + url=start_url, + callback=self.parse, + meta={ + "base_url": start_url, + "current_start": start_url, + "depth": 0, + "jumps": 0 + } + ) + + def save_batch(self): + if len(self.batch) == 0: + self.logger.debug("Tried to save batch without any results..") + return + + df = pd.DataFrame({ + "base_url": [res.base_url for res in self.batch], + "url": [res.url for res in self.batch], + "first_keyword_hit": [res.first_keyword_hit for res in self.batch], + "content": [res.content for res in self.batch], + "crawl_depth": [res.crawl_depth for res in self.batch], + "schema_indicator": [res.schema_indicator for res in self.batch] # TODO now always false + }) + + df.to_parquet( + self.output_file.replace(".parquet", f"_{self.batch_counter}.parquet") # TODO name + ) + + self.batch_counter += 1 + + self.batch = [] + self.logger.debug("Saved batch to parquet") + def url_is_target(self, url: str) -> bool: + parsed_url = urlparse(url).path for keyword in self.target_keywords: - first_keyword_hit = re.search(keyword, url) + first_keyword_hit = re.search(keyword, parsed_url) if first_keyword_hit is not None: - return True + self.logger.debug(f"For {url} keyword hit: {first_keyword_hit.group(0)}") + return True, first_keyword_hit.group(0) + + return False, None def skip_this_url(self, url: str) -> bool: """Function to see if we have already visited url""" + # Do not revisit pages + if url in self.visited: + self.logger.debug(f"Skip {url}, because we have visited it before") + return True # skip + + # Only visit valid urls if not validators.url(url): return True + # Only visit pages with supported extensions if any(ext in url for ext in self._unsupported): self.logger.debug(f"Skip {url}, because extension is unsupported") return True + # Only visit pages on allowed top-level domains + url_netloc = urlparse(url).netloc.lower() + + if not any([url_netloc.endswith(toplevel_domain) for toplevel_domain in self.allowed_top_level_domains]): + self.logger.debug(f"Skip {url} with netloc {url_netloc}, because top-level domain is not in allowed list") + return True + # prevent duplicate crawl from trailing forward slash in URL url = url.rstrip('/') if url.endswith('/') else url # prevent duplicate crawl from '#' such as '#content', '#main', etc. url = url.rstrip("#") if "#" in url else url + # Skip domains on skip-list if any([skip_domain in url for skip_domain in self.skip_domains]): self.logger.debug(f"Skip {url}, because domain is in skip-list") return True # skip - # Do not revisit pages - if url in self.visited: - self.logger.debug(f"Skip {url}, because we have visited it before") - return True # skip + # skip pre-defined paths + for skip_path in self.skip_paths: + if any([path == skip_path for path in urlparse(url).path.split("/")]): + self.logger.debug(f"Skip {url} because path {urlparse(url).path} contains skip-path: {skip_path}") + return True + return False - async def start(self): - for start_url in self.start_urls: - yield scrapy.Request(url=start_url, callback=self.parse, meta={"depth": 0}) - def parse(self, response): - self.logger.debug(f"Parsing url: {response.url}") self.visited.add(response.url) - yield {"url": response.url, "html":response.text[:10]} current_depth = response.meta.get("depth", 0) - if not self.url_is_target(response.url) and current_depth >= self.max_depth: + + url_is_targeted, first_keyword_hit = self.url_is_target(response.url) + + if not url_is_targeted and current_depth >= self.max_depth: return + jumps = response.meta.get("jumps", 0) + + parsed_url = urlparse(response.url) + current_netloc = parsed_url.netloc.lower().rsplit(".", 1)[0] + meta_netloc = urlparse(response.meta.get("current_start")).netloc.lower().rsplit(".", 1)[0] + if current_netloc != meta_netloc: + self.logger.debug(f"Adding jump from {jumps} to {jumps + 1} going with base url: {meta_netloc} to {current_netloc}") + jumps += 1 + # TODO do not add jump if response is a HTTP 300 redirect + + if jumps > self.max_jumps: + self.logger.debug(f"Ending crawl path due to exceeding jumps ({jumps}/{self.max_jumps}) for {response.url}, base url: {response.meta.get("base_url")}") + return + + self.logger.debug(f"Parsing url: {response.url}, targeted: {url_is_targeted}, depth: {current_depth}, jumps: {jumps}") + # Process the current page - if self.url_is_target(response.url): + if url_is_targeted: # Add results - self.results.append( - ScrapyResult( + result = ScrapyResult( + base_url=str(response.meta.get("base_url")), url=response.url, status=response.status, - text=response.text[:1], + first_keyword_hit=first_keyword_hit, + content=self._htmlparser.parse(html=response.text), crawl_depth=current_depth ) - ) + self.batch.append(result) + self.results.append(result) + + if len(self.batch) >= self.batch_size: + self.save_batch() # Reset current depth because we found target at current page current_depth = 0 - + # Extract and follow links for link in response.css("a::attr(href)").getall(): url = urljoin(response.url, link) @@ -131,11 +227,16 @@ def parse(self, response): continue yield scrapy.Request( - url=url, - callback=self.parse, - meta={"depth": current_depth + 1} + url=url, + callback=self.parse, + meta={ + "base_url": response.meta.get("base_url"), + "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", + "depth": current_depth + 1, + "jumps": jumps + } ) def closed(self, reason): """Optional: Scrapy built-in method called when the spider finishes""" - print(f"Spider closed because of: {reason}. Total collected pages: {len(self.results)}") \ No newline at end of file + print(f"Spider closed because of: {reason}. Total collected pages: {len(self.batch)}") \ No newline at end of file diff --git a/src/crawl/scrapymodules/ScrapyResult.py b/src/crawl/scrapymodules/ScrapyResult.py index f1be36b..bba5c23 100644 --- a/src/crawl/scrapymodules/ScrapyResult.py +++ b/src/crawl/scrapymodules/ScrapyResult.py @@ -1,7 +1,11 @@ from typing import NamedTuple + class ScrapyResult(NamedTuple): + base_url: str url: str + first_keyword_hit: str status: str - text: str + content: str crawl_depth: int = 0 + schema_indicator: bool = False diff --git a/src/main.py b/src/main.py index 27c7fb3..ce2e3f4 100644 --- a/src/main.py +++ b/src/main.py @@ -37,10 +37,15 @@ def main(): logging.info("Config:") logging.info(OmegaConf.to_yaml(CONFIG)) + start_time = time.perf_counter() + main() logging.info("Exiting with no error") + end_time = time.perf_counter() + + print("Runtime: ", end_time - start_time) # # Read the output files by using the following syntax: # CONFIG = setup("../config/config.yaml") # df = pd.read_parquet(f"{CONFIG.output.output_dir}/20260304_080625", engine="pyarrow") diff --git a/src/main_scrapy.py b/src/main_scrapy.py index a154eb9..532b73b 100644 --- a/src/main_scrapy.py +++ b/src/main_scrapy.py @@ -4,6 +4,7 @@ import multiprocessing import sys from datetime import datetime +import time from scrapy.crawler import CrawlerProcess @@ -13,7 +14,7 @@ CONFIG = setup("config/config.yaml") -def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, logfile): +def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, logfile, output_file): print(f"Args: urls: {urls}, keywords: {keywords}, skip_domains: {skip_domains}, process_id: {process_id} ") print(f"Starting crawling process (PID: {process_id}, OSPID: {os.getpid()}) for {urls}!") project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) @@ -54,9 +55,18 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo process.crawl( spiderCrawler, start_urls=urls, - max_depth=1, + max_depth=2, target_keywords=keywords, - skip_domains=skip_domains + skip_domains=skip_domains, + output_file=output_file, + allowed_top_level_domains=[".com", ".nl", ".ai", ".de", ".be", ".eu", ".io"], + skip_paths=[ + "shop", "cart", "clients", "testimonials", "search", + "query", "calendar", "events", "archive", "news", + "blog", "media", "articles", "profile", "legal", + "tos", "products", "winkel", "winkelwagen", "archief", + "nieuws", "artikelen", "producten", "faq" + ] ) if len(urls) == 0: @@ -68,12 +78,12 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo print(f"Got here! Error {e}") if spiderCrawler.spider is not None: - print(f"Returning results of length for PID {process_id}: {len(spiderCrawler.spider.results)}") + spiderCrawler.spider.save_batch() + print(f"Returning results of length for PID {process_id} ({len(urls)} URLs: {urls}): {len(spiderCrawler.spider.results)} ({len(spiderCrawler.spider.visited)} visited)") return spiderCrawler.spider.results if __name__ == "__main__": - # Input URLs file_urls = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.urls}" logging.info(f"Reading list of base-urls from file: {file_urls}") @@ -95,7 +105,8 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo with open(file_skip_domains, 'r', encoding='utf-8') as file_in: skip_domains = [line.rstrip() for line in file_in] - num_workers = min([len(urls), 6]) + max_workers = 16 + num_workers = min([len(urls), max_workers]) batch_size = len(urls) // num_workers if len(urls) > num_workers else 1 url_chunks = np.array_split(urls, num_workers) @@ -103,16 +114,34 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo logfile = f"output/logs/log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log" + time_part = datetime.now().strftime("%Y%m%d_%H%M%S") + if not os.path.exists(f"{CONFIG.output.output_dir}/{time_part}"): + os.makedirs(f"{CONFIG.output.output_dir}/{time_part}") + for i in range(0, num_workers): chunked_args.append( - (url_chunks[i], target_keywords, skip_domains, i, logging.INFO, logfile) + ( + url_chunks[i], + target_keywords, + skip_domains, + i, + logging.DEBUG, + logfile, + f"{CONFIG.output.output_dir}/{time_part}/worker_{i}.parquet" + ) ) print("# Workers:", num_workers) + start_time = time.perf_counter() + print("# Cores available:", multiprocessing.cpu_count()) with multiprocessing.Pool(processes=num_workers) as pool: results = sum(pool.starmap(spawn_spider_process, chunked_args), []) - print("Results:", results) + end_time = time.perf_counter() + + # print("Results:", results) print("#Results:", len(results)) + + print("Runtime: ", end_time - start_time) diff --git a/src/parse/HTML.py b/src/parse/HTML.py index a352f54..3237415 100644 --- a/src/parse/HTML.py +++ b/src/parse/HTML.py @@ -44,7 +44,7 @@ def parse(self, html: str) -> str: for tag in soup(self._disregard): tag.decompose() text = soup.get_text(separator="\n", strip=True) - logging.debug(f"First 100 characters of text extracted: {text[0:100]}") + #logging.debug(f"First 100 characters of text extracted: {text[0:100]}") return text except Exception as e: # Handle exceptions From 56a47192632eda1c153e43fdbe5718c267586d5e Mon Sep 17 00:00:00 2001 From: lhaarman Date: Wed, 3 Jun 2026 13:18:32 +0000 Subject: [PATCH 04/26] Refactoring, timeout, language/country filtering, schema.org in hesitant spider --- src/crawl/__init__.py | 3 +- src/crawl/scrapymodules/HesitantSpider.py | 160 ++++++++++++++++------ src/crawl/scrapymodules/ScrapyResult.py | 9 ++ src/crawl/scrapymodules/__init__.py | 4 +- src/main_scrapy.py | 79 ++++++++--- 5 files changed, 193 insertions(+), 62 deletions(-) diff --git a/src/crawl/__init__.py b/src/crawl/__init__.py index 6fe4b71..77999db 100644 --- a/src/crawl/__init__.py +++ b/src/crawl/__init__.py @@ -1,3 +1,2 @@ from .base import ICrawler, NoCrawler, BaseCrawler, CrawlResult -from .HesitantCrawler import HesitantCrawler -from .scrapymodules import ScrapyResult \ No newline at end of file +from .HesitantCrawler import HesitantCrawler \ No newline at end of file diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/crawl/scrapymodules/HesitantSpider.py index e7b35c5..f016bbf 100644 --- a/src/crawl/scrapymodules/HesitantSpider.py +++ b/src/crawl/scrapymodules/HesitantSpider.py @@ -1,11 +1,17 @@ -from typing import List +import json +import re import scrapy +import time import validators -from urllib.parse import urljoin, urlparse -import re + import pandas as pd -from .ScrapyResult import ScrapyResult + +from scrapy.exceptions import CloseSpider +from typing import List +from urllib.parse import urljoin, urlparse + from parse import HTMLBodyParser +from .ScrapyResult import ScrapyResult class HesitantSpider(scrapy.Spider): @@ -16,27 +22,31 @@ class HesitantSpider(scrapy.Spider): "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "AUTOTHROTTLE_ENABLED": True, # Auto throttle to maximize speed without risking blocks "AUTOTHROTTLE_START_DELAY": 1.0, # Start slow to "warm up" - "AUTOTHROTTLE_MAX_DELAY": 10.0, # Never wait more than 10s + "AUTOTHROTTLE_MAX_DELAY": 30.0, # Never wait more than 10s "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0, # Aim for 1 request per worker at a time "DOWNLOAD_DELAY": 0, # Let Autothrottle handle the delay } def __init__( self, - start_urls: str, - target_keywords: List[str] = [], - add_sitemap_urls: bool = False, - max_depth: int = 1, - skip_domains: List[str] = [], - skip_paths: List[str] = [], - allowed_top_level_domains: List[str] = [".com"], - batch_size: int = 100, - output_file: str = "output.parquet", - max_jumps: int = 1, + start_urls: List[str], # List of starting (base) urls + target_keywords: List[str] = [], # list of keywords to determine targeting of URLs + max_depth: int = 2, # Maximum crawling depth with hesitancy + skip_domains: List[str] = [], # List of domains to skip + skip_paths: List[str] = [], # List of in-website paths to skip + allowed_top_level_domains: List[str] = [".com"], # List of allowed top level domains + batch_size: int = 100, # Output batch size + output_file: str = "output.parquet", # Output file name + max_jumps: int = 1, # Maximum site-to-site jumps + timeout: int = 3600, # max time in seconds + allowed_languages: List[str] = ["en", "en-us", "en-gb", "en-uk"], # Allowed languages within url paths + allowed_countries: List[str] = ["en", "us", "gb", "eu"], # Allowed countries within url paths + schema_keywords: List[str] = [], # Schema.org keywords to look for *args, **kwargs ): super(HesitantSpider, self).__init__(*args, **kwargs) + # Set and log attributes self.start_urls = start_urls self.logger.debug(f"Init start_urls: {self.start_urls}") self.max_depth = max_depth @@ -50,14 +60,25 @@ def __init__( self.target_keywords = target_keywords self.logger.debug(f"Init target keywords: {self.target_keywords}") self.batch_size = batch_size - self.batch_counter = 0 self.logger.debug(f"Init batch_size: {self.batch_size}") - + self.allowed_languages = allowed_languages + self.logger.debug(f"Init allowed languages: {self.allowed_languages}") + self.allowed_countries = allowed_countries + self.logger.debug(f"Init allowed countries: {self.allowed_countries}") + self.schema_keywords = schema_keywords + self.logger.debug(f"Init schema keywords: {self.schema_keywords}") self.max_jumps = max_jumps - + self.logger.debug(f"Init max_jumps: {self.max_jumps}") self.output_file = output_file self.logger.debug(f"Init output file: {self.output_file}") + # Start batch counter + self.batch_counter = 0 + + # Set timeout + self.timeout = timeout + + # Set parser and unsupported endpoints self._htmlparser = HTMLBodyParser() self._unsupported = ( ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps", @@ -76,6 +97,7 @@ def __init__( ".dmg?download=true") self.logger.debug(f"URLs will be excluded if they contain any in path:{', '.join(self._unsupported)}") + # Init batch, results, visited self.batch = [] self.results = [] self.visited = set() @@ -83,7 +105,10 @@ def __init__( if max_depth < 0: self.logger.debug("Only urls from starting_url can be found, max_depth < 0") + # Asynchronous function that starts the crawl async def start(self): + self.start_time = time.time() + # For each start url, start crawling for start_url in self.start_urls: yield scrapy.Request( url=start_url, @@ -96,6 +121,7 @@ async def start(self): } ) + # Save current batch to disk def save_batch(self): if len(self.batch) == 0: self.logger.debug("Tried to save batch without any results..") @@ -107,18 +133,23 @@ def save_batch(self): "first_keyword_hit": [res.first_keyword_hit for res in self.batch], "content": [res.content for res in self.batch], "crawl_depth": [res.crawl_depth for res in self.batch], - "schema_indicator": [res.schema_indicator for res in self.batch] # TODO now always false + "schema_indicator": [res.schema_indicator for res in self.batch] }) df.to_parquet( - self.output_file.replace(".parquet", f"_{self.batch_counter}.parquet") # TODO name + self.output_file.replace(".parquet", f"_{self.batch_counter}.parquet") ) self.batch_counter += 1 + # Add batch to total results + self.results += self.batch + + # Empty batch self.batch = [] - self.logger.debug("Saved batch to parquet") + self.logger.debug(f"Saved batch to parquet, total results: {len(self.results)}") + # Determine whether or not URL is a target def url_is_target(self, url: str) -> bool: parsed_url = urlparse(url).path for keyword in self.target_keywords: @@ -129,13 +160,9 @@ def url_is_target(self, url: str) -> bool: return False, None + # Determine whether or not to skip URL def skip_this_url(self, url: str) -> bool: - """Function to see if we have already visited url""" - - # Do not revisit pages - if url in self.visited: - self.logger.debug(f"Skip {url}, because we have visited it before") - return True # skip + """Function to see if we skip url""" # Only visit valid urls if not validators.url(url): @@ -147,7 +174,8 @@ def skip_this_url(self, url: str) -> bool: return True # Only visit pages on allowed top-level domains - url_netloc = urlparse(url).netloc.lower() + parsed_url = urlparse(url) + url_netloc = parsed_url.netloc.lower() if not any([url_netloc.endswith(toplevel_domain) for toplevel_domain in self.allowed_top_level_domains]): self.logger.debug(f"Skip {url} with netloc {url_netloc}, because top-level domain is not in allowed list") @@ -164,54 +192,102 @@ def skip_this_url(self, url: str) -> bool: self.logger.debug(f"Skip {url}, because domain is in skip-list") return True # skip + # Skip if first path is a country code but not within allowed + paths = urlparse(url).path.split("/") + if len(paths) >= 2: + if len(paths[1]) == 2 and paths[1] not in self.allowed_countries: + self.logger.debug(f"Skip {url} because path /{paths[1]}/ indicates country-page not in allowed countries: {self.allowed_countries}") + return True + # skip pre-defined paths for skip_path in self.skip_paths: - if any([path == skip_path for path in urlparse(url).path.split("/")]): + if any([path == skip_path for path in paths]): self.logger.debug(f"Skip {url} because path {urlparse(url).path} contains skip-path: {skip_path}") return True - + + # Skip pages in unsupported languages + query_params = parsed_url.query.split("&") + if len(self.allowed_languages) > 0: + for query_param in query_params: + if "lang=" in query_param: + lang = query_param.split("lang=")[1] + if lang not in self.allowed_languages: + self.logger.debug(f"Skip {url} due to language parameter 'lang={lang}' not in allowed list: {self.allowed_languages}") + return True + elif "language=" in query_param: + language = query_param.split("language=")[1] + if language not in self.allowed_languages: + self.logger.debug(f"Skip {url} due to language parameter 'language={language}' not in allowed list: {self.allowed_languages}") + return True + return False + # Process request response def parse(self, response): - self.visited.add(response.url) - + # Check if we passed timeout + if time.time() - self.start_time > self.timeout: + print(f"Hit timeout {self.timeout} seconds for spider with start urls: {self.start_urls}!") + self.logger.debug(f"Hit timeout {self.timeout} seconds for spider with start urls: {self.start_urls}!") + raise CloseSpider('bandwidth_exceeded') current_depth = response.meta.get("depth", 0) + # Check if url is tagret url_is_targeted, first_keyword_hit = self.url_is_target(response.url) + # If url is not target and exceeds hesitancy depth, return if not url_is_targeted and current_depth >= self.max_depth: return + # Determine whether we need to add a jump jumps = response.meta.get("jumps", 0) parsed_url = urlparse(response.url) current_netloc = parsed_url.netloc.lower().rsplit(".", 1)[0] meta_netloc = urlparse(response.meta.get("current_start")).netloc.lower().rsplit(".", 1)[0] - if current_netloc != meta_netloc: + if current_netloc != meta_netloc and response.meta.get("redirect_urls") is None: self.logger.debug(f"Adding jump from {jumps} to {jumps + 1} going with base url: {meta_netloc} to {current_netloc}") jumps += 1 - # TODO do not add jump if response is a HTTP 300 redirect - + + # If we exceed jumps, return if jumps > self.max_jumps: self.logger.debug(f"Ending crawl path due to exceeding jumps ({jumps}/{self.max_jumps}) for {response.url}, base url: {response.meta.get("base_url")}") return + # Process response if above skip-conditions not met self.logger.debug(f"Parsing url: {response.url}, targeted: {url_is_targeted}, depth: {current_depth}, jumps: {jumps}") + self.visited.add(response.url) # Process the current page if url_is_targeted: - # Add results + # Determine schema.org indicator + schema_indicator = False + + # Get JSON-LD elements + jsonlds = response.xpath("//script[@type='application/ld+json']/text()").getall() + if jsonlds: + for jsonld in jsonlds: + try: + data = json.loads(jsonld) + if "@type" in data.keys() and data["@type"] in self.schema_keywords: + self.logger.debug(f"Found schema entity {data["@type"]} that is within schema keywords: {self.schema_keywords}") + schema_indicator = True + except json.JSONDecodeError: + pass + + # Add result to batch result = ScrapyResult( base_url=str(response.meta.get("base_url")), url=response.url, status=response.status, first_keyword_hit=first_keyword_hit, content=self._htmlparser.parse(html=response.text), - crawl_depth=current_depth + crawl_depth=current_depth, + schema_indicator=schema_indicator ) + self.batch.append(result) - self.results.append(result) + # Save batch if exceeding batch size if len(self.batch) >= self.batch_size: self.save_batch() @@ -222,7 +298,7 @@ def parse(self, response): for link in response.css("a::attr(href)").getall(): url = urljoin(response.url, link) - # Keep crawling restricted to the start domain and avoid skipped domains + # Only continue with valid crawl paths if self.skip_this_url(url): continue @@ -234,9 +310,11 @@ def parse(self, response): "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", "depth": current_depth + 1, "jumps": jumps - } + }, + dont_filter=False # Skip duplicates ) + # Called when the spider closes cleanly def closed(self, reason): - """Optional: Scrapy built-in method called when the spider finishes""" + self.save_batch() print(f"Spider closed because of: {reason}. Total collected pages: {len(self.batch)}") \ No newline at end of file diff --git a/src/crawl/scrapymodules/ScrapyResult.py b/src/crawl/scrapymodules/ScrapyResult.py index bba5c23..2188046 100644 --- a/src/crawl/scrapymodules/ScrapyResult.py +++ b/src/crawl/scrapymodules/ScrapyResult.py @@ -9,3 +9,12 @@ class ScrapyResult(NamedTuple): content: str crawl_depth: int = 0 schema_indicator: bool = False + + def __eq__(self, other): + if not isinstance(other, ScrapyResult): + return False + + return self.base_url == other.base_url and self.content == other.content + + def __hash__(self): + return hash((self.base_url, self.content)) diff --git a/src/crawl/scrapymodules/__init__.py b/src/crawl/scrapymodules/__init__.py index 12459fc..a0255a5 100644 --- a/src/crawl/scrapymodules/__init__.py +++ b/src/crawl/scrapymodules/__init__.py @@ -1,3 +1,3 @@ from .HesitantSpider import HesitantSpider -from .ScrapyResult import ScrapyResult -from .ScrapyCrawlMiddleware import TextTypeFilterMiddleware \ No newline at end of file +from .ScrapyCrawlMiddleware import TextTypeFilterMiddleware +from .ScrapyResult import ScrapyResult \ No newline at end of file diff --git a/src/main_scrapy.py b/src/main_scrapy.py index 532b73b..9e84032 100644 --- a/src/main_scrapy.py +++ b/src/main_scrapy.py @@ -1,38 +1,63 @@ -import os import logging -import numpy as np import multiprocessing +import os +import re import sys -from datetime import datetime import time +import numpy as np +import pandas as pd + +from datetime import datetime from scrapy.crawler import CrawlerProcess -from util import setup, normalize_url from crawl.scrapymodules import HesitantSpider +from util import setup, normalize_url CONFIG = setup("config/config.yaml") +# Check if string is valid and contains no strange characters +def is_valid_string(s): + if not isinstance(s, str): + return False + if len(s) == 0: + return False + strange_chars = re.findall(r'[\x00-\x08\x0B\x0E-\x1F\x7F]', str(s)) + return not (len(strange_chars) / len(s)) > 0.1 + + +# Concatenates all .parquet files in a dir (and its subdirs) +def read_parquet_dir(parquet_dir): + for root, dirs, files in os.walk(parquet_dir): + for file in files: + if file.endswith('.parquet'): + file_path = os.path.join(root, file) + df = pd.read_parquet(file_path) + yield df[df['content'].apply(is_valid_string)] + + +# Spawn spider crawler process def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, logfile, output_file): - print(f"Args: urls: {urls}, keywords: {keywords}, skip_domains: {skip_domains}, process_id: {process_id} ") + print(f"Args: urls: {urls}, keywords: {keywords}, skip domains: {skip_domains}, log level {log_level}, log file: {logfile}, output file: {output_file}, process_id: {process_id}") print(f"Starting crawling process (PID: {process_id}, OSPID: {os.getpid()}) for {urls}!") project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) if project_root not in sys.path: sys.path.insert(0, project_root) + # Create scrapy CrawlerProcess process = CrawlerProcess( settings={ "ROBOTSTXT_OBEY": True, - "LOG_LEVEL": "INFO", "LOG_FILE": logfile, "DOWNLOADER_MIDDLEWARES": { "src.crawl.scrapymodules.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority }, - "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"] + "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"] # TODO can be removed? } ) + # Configure logging root_logger = logging.getLogger() root_logger.setLevel(log_level) root_logger.handlers = [] @@ -42,16 +67,18 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo root_logger.addHandler(fileHandler) # Remove console output - # We get the logger that Scrapy uses and remove all handlers that print to the console + # Get the logger that Scrapy uses and remove all handlers that print to the console scrapy_logger = logging.getLogger('scrapy') for handler in scrapy_logger.handlers[:]: scrapy_logger.removeHandler(handler) - # (Optional) If you want to be extremely thorough, silence the engine too + # Silence the twisted engine too logging.getLogger('twisted').handlers = [] + # Create crawler from process spiderCrawler = process.create_crawler(HesitantSpider) + # Crawl and configure spider process.crawl( spiderCrawler, start_urls=urls, @@ -65,20 +92,24 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo "query", "calendar", "events", "archive", "news", "blog", "media", "articles", "profile", "legal", "tos", "products", "winkel", "winkelwagen", "archief", - "nieuws", "artikelen", "producten", "faq" - ] + "nieuws", "artikelen", "producten", "faq", "policies", + "downloads", "portfolio" + ], + allowed_languages=["nl", "en", "en-uk", "en-gb"], + allowed_countries=["nl"], + schema_keywords=["JobPosting"] ) + # If worker gets 0 urls, pass (shouldn't happen) if len(urls) == 0: return [] try: process.start() except Exception as e: - print(f"Got here! Error {e}") + print(f"Something went from starting process! Error {e}") if spiderCrawler.spider is not None: - spiderCrawler.spider.save_batch() print(f"Returning results of length for PID {process_id} ({len(urls)} URLs: {urls}): {len(spiderCrawler.spider.results)} ({len(spiderCrawler.spider.visited)} visited)") return spiderCrawler.spider.results @@ -105,6 +136,7 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo with open(file_skip_domains, 'r', encoding='utf-8') as file_in: skip_domains = [line.rstrip() for line in file_in] + # Set amount of parallel workers and prepare chunk-wisem parallel execution max_workers = 16 num_workers = min([len(urls), max_workers]) batch_size = len(urls) // num_workers if len(urls) > num_workers else 1 @@ -112,8 +144,10 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo chunked_args = [] + # All workers write to same log logfile = f"output/logs/log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log" + # Make output dir for specific run time_part = datetime.now().strftime("%Y%m%d_%H%M%S") if not os.path.exists(f"{CONFIG.output.output_dir}/{time_part}"): os.makedirs(f"{CONFIG.output.output_dir}/{time_part}") @@ -127,7 +161,7 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo i, logging.DEBUG, logfile, - f"{CONFIG.output.output_dir}/{time_part}/worker_{i}.parquet" + f"{CONFIG.output.output_dir}/{time_part}/worker_{i}.parquet" # Different output files per werker ) ) @@ -135,13 +169,24 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo start_time = time.perf_counter() - print("# Cores available:", multiprocessing.cpu_count()) with multiprocessing.Pool(processes=num_workers) as pool: - results = sum(pool.starmap(spawn_spider_process, chunked_args), []) + pool.starmap(spawn_spider_process, chunked_args) end_time = time.perf_counter() - # print("Results:", results) + +# Results in tables + dir_parquets = f"{CONFIG.output.output_dir}/{time_part}/" + parquet_dfs = read_parquet_dir(dir_parquets) + + # Analysis + dfs = [] + for df in parquet_dfs: + dfs.append(df) + + if len(dfs) > 0: + results = pd.concat(dfs, ignore_index=True) + print("#Results:", len(results)) print("Runtime: ", end_time - start_time) From ac9527b121978f7240cc3689c289264be1aa436b Mon Sep 17 00:00:00 2001 From: lhaarman Date: Thu, 4 Jun 2026 11:13:09 +0000 Subject: [PATCH 05/26] add logs dir --- output/logs/.keep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 output/logs/.keep diff --git a/output/logs/.keep b/output/logs/.keep new file mode 100644 index 0000000..e69de29 From e5af8e0b8d686d840f92b5dd2cbe1f303842a6f7 Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Tue, 9 Jun 2026 09:42:36 +0000 Subject: [PATCH 06/26] add skip_domains to config template --- config/config_template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config_template.yaml b/config/config_template.yaml index c414ac6..d1802d0 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -14,7 +14,7 @@ requests: input: input_dir: ../input input_files: - skip_domains: + skip_domains: skipdomains.txt urls: urls.txt keywords: keywords.txt url_max: 100 From 3787ac6fd4fcf9544db1ad24b5adce89f02a8f6c Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Tue, 9 Jun 2026 09:42:55 +0000 Subject: [PATCH 07/26] adjust logging setup --- src/main_scrapy.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main_scrapy.py b/src/main_scrapy.py index 9e84032..5d281cd 100644 --- a/src/main_scrapy.py +++ b/src/main_scrapy.py @@ -115,6 +115,22 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo if __name__ == "__main__": + + # Set logging level and create file + # All workers write to same log + logging_level = logging.DEBUG + + dir_log = f"{CONFIG.output.output_dir}/{CONFIG.output.logs}" + if not os.path.exists(dir_log): + os.makedirs(dir_log) + logfile = f"{dir_log}/log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log" + logging.basicConfig( + filename=logfile, + level=logging_level, + format='%(asctime)s - %(levelname)s - %(message)s' + ) + logging.info("Log file created.") + # Input URLs file_urls = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.urls}" logging.info(f"Reading list of base-urls from file: {file_urls}") @@ -144,9 +160,6 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo chunked_args = [] - # All workers write to same log - logfile = f"output/logs/log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log" - # Make output dir for specific run time_part = datetime.now().strftime("%Y%m%d_%H%M%S") if not os.path.exists(f"{CONFIG.output.output_dir}/{time_part}"): @@ -159,7 +172,7 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo target_keywords, skip_domains, i, - logging.DEBUG, + logging_level, logfile, f"{CONFIG.output.output_dir}/{time_part}/worker_{i}.parquet" # Different output files per werker ) @@ -186,7 +199,6 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo if len(dfs) > 0: results = pd.concat(dfs, ignore_index=True) - - print("#Results:", len(results)) + print("#Results:", len(results)) print("Runtime: ", end_time - start_time) From 770bf128fcb023240208c54d6db787a367ab2cfe Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Tue, 9 Jun 2026 09:47:06 +0000 Subject: [PATCH 08/26] add code for sitemap parsing and testing class --- src/crawl/scrapymodules/HesitantSpider.py | 106 +++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/crawl/scrapymodules/HesitantSpider.py index f016bbf..21e7ceb 100644 --- a/src/crawl/scrapymodules/HesitantSpider.py +++ b/src/crawl/scrapymodules/HesitantSpider.py @@ -3,6 +3,7 @@ import scrapy import time import validators +import logging import pandas as pd @@ -42,6 +43,7 @@ def __init__( allowed_languages: List[str] = ["en", "en-us", "en-gb", "en-uk"], # Allowed languages within url paths allowed_countries: List[str] = ["en", "us", "gb", "eu"], # Allowed countries within url paths schema_keywords: List[str] = [], # Schema.org keywords to look for + sitemaps_tocheck: List[str] = ['sitemap.xml'], # path extensions that often lead to sitemaps to check for URL's *args, **kwargs ): super(HesitantSpider, self).__init__(*args, **kwargs) @@ -71,6 +73,8 @@ def __init__( self.logger.debug(f"Init max_jumps: {self.max_jumps}") self.output_file = output_file self.logger.debug(f"Init output file: {self.output_file}") + self.sitemaps_tocheck = sitemaps_tocheck + self.logger.debug(f"Check urls found on (potential) sitemaps: {self.sitemaps_tocheck}") # Start batch counter self.batch_counter = 0 @@ -120,6 +124,20 @@ async def start(self): "jumps": 0 } ) + # next, if desired, check the sitemapurls to augment existing results + parsed_url = urlparse(start_url) + for sitemap in self.sitemaps_tocheck: + url = f"{parsed_url.scheme}://{parsed_url.netloc}/{sitemap}" + yield scrapy.Request( + url=url, + callback=self.parse_sitemap, + meta={ + "base_url": start_url, + "current_start": start_url, + "depth": 0, + "jumps": 0 + } + ) # Save current batch to disk def save_batch(self): @@ -313,8 +331,94 @@ def parse(self, response): }, dont_filter=False # Skip duplicates ) + + def parse_sitemap(self, response): + # Extract all URLs from the sitemap, accounting for namespace + ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'} + urls = response.xpath('//ns:url/ns:loc/text()', namespaces=ns).getall() + + for url in urls: + # Only continue with valid crawl paths + if self.skip_this_url(url): + continue + + parsed_url = urlparse(url) + yield scrapy.Request( + url=url, + callback=self.parse, + meta={ + "base_url": response.meta.get("base_url"), + "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", + "depth": response.meta.get("depth", 0) + 1 + } + ) # Called when the spider closes cleanly def closed(self, reason): self.save_batch() - print(f"Spider closed because of: {reason}. Total collected pages: {len(self.batch)}") \ No newline at end of file + print(f"Spider closed because of: {reason}. Total collected pages: {len(self.results)}") + + +if __name__ == "__main__": + import os + from datetime import datetime + from scrapy.crawler import CrawlerProcess + from util import setup + + CONFIG = setup("config/config.yaml") + + logging_level = logging.DEBUG + + dir_log = f"{CONFIG.output.output_dir}/{CONFIG.output.logs}" + if not os.path.exists(dir_log): + os.makedirs(dir_log) + logfile = f"{dir_log}/log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log" + + # Create scrapy CrawlerProcess + process = CrawlerProcess( + settings={ + "ROBOTSTXT_OBEY": True, + "LOG_FILE": logfile, + "DOWNLOADER_MIDDLEWARES": { + "src.crawl.scrapymodules.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority + }, + "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"] # TODO can be removed? + } + ) + + # Create crawler from process + spiderCrawler = process.create_crawler(HesitantSpider) + + # Crawl and configure spider + # urls = ['https://books.toscrape.com/'] + # target_keywords = ["philosophy"] + + urls = ['https://werkenbijhetcbs.nl/'] + target_keywords = ["enqueteur"] + sitemaps_tocheck = ["sitemap.xml"] + allowed_top_level_domains = [".com", ".nl"] + + # Skip domains + file_skip_domains = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}" + logging.info(f"Reading list of skip_domains from file: {file_skip_domains}") + with open(file_skip_domains, 'r', encoding='utf-8') as file_in: + skip_domains = [line.rstrip() for line in file_in] + + # output + time_part = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = f"{CONFIG.output.output_dir}/{time_part}_output.parquet" + + process.crawl( + spiderCrawler, + start_urls=urls, + target_keywords=target_keywords, + skip_domains=skip_domains, + allowed_top_level_domains=allowed_top_level_domains, + output_file=output_file, + sitemaps_tocheck=sitemaps_tocheck + ) + + try: + process.start() + except Exception as e: + print(f"Something went from starting process! Error {e}") From 7a081702ef92cb993a838239179b5de327f325e0 Mon Sep 17 00:00:00 2001 From: lhaarman Date: Wed, 24 Jun 2026 10:00:15 +0000 Subject: [PATCH 09/26] Split up targeting between netloc/path --- src/crawl/scrapymodules/HesitantSpider.py | 42 +++++++++++++++++------ src/main_scrapy.py | 29 ++++++++++------ 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/crawl/scrapymodules/HesitantSpider.py index 21e7ceb..c8ab72a 100644 --- a/src/crawl/scrapymodules/HesitantSpider.py +++ b/src/crawl/scrapymodules/HesitantSpider.py @@ -12,6 +12,7 @@ from urllib.parse import urljoin, urlparse from parse import HTMLBodyParser +from util import normalize_url from .ScrapyResult import ScrapyResult @@ -31,7 +32,8 @@ class HesitantSpider(scrapy.Spider): def __init__( self, start_urls: List[str], # List of starting (base) urls - target_keywords: List[str] = [], # list of keywords to determine targeting of URLs + target_netloc_keywords: List[str] = [], # List of keywords to determine targeting of URL netlocs + target_path_keywords: List[str] = [], # list of keywords to determine targeting of URL paths max_depth: int = 2, # Maximum crawling depth with hesitancy skip_domains: List[str] = [], # List of domains to skip skip_paths: List[str] = [], # List of in-website paths to skip @@ -59,8 +61,10 @@ def __init__( self.logger.debug(f"Init skip domains: {self.skip_paths}") self.allowed_top_level_domains = allowed_top_level_domains self.logger.debug(f"Init allowed_top_level_domains: {self.allowed_top_level_domains}") - self.target_keywords = target_keywords - self.logger.debug(f"Init target keywords: {self.target_keywords}") + self.target_netloc_keywords = target_netloc_keywords + self.logger.debug(f"Init target netloc keywords: {self.target_netloc_keywords}") + self.target_path_keywords = target_path_keywords + self.logger.debug(f"Init target paths keywords: {self.target_path_keywords}") self.batch_size = batch_size self.logger.debug(f"Init batch_size: {self.batch_size}") self.allowed_languages = allowed_languages @@ -120,6 +124,7 @@ async def start(self): meta={ "base_url": start_url, "current_start": start_url, + "steps_from_target": 0, "depth": 0, "jumps": 0 } @@ -134,6 +139,7 @@ async def start(self): meta={ "base_url": start_url, "current_start": start_url, + "steps_from_target": 0, "depth": 0, "jumps": 0 } @@ -169,12 +175,22 @@ def save_batch(self): # Determine whether or not URL is a target def url_is_target(self, url: str) -> bool: - parsed_url = urlparse(url).path - for keyword in self.target_keywords: - first_keyword_hit = re.search(keyword, parsed_url) + parsed_url = urlparse(url) + # Check netloc + url_netloc = parsed_url.netloc + for keyword in self.target_netloc_keywords: + first_keyword_hit = re.search(keyword, url_netloc) + if first_keyword_hit is not None: + self.logger.debug(f"For {url} keyword hit: {first_keyword_hit.group(0)}") + return True, keyword + + # Check path + url_path = parsed_url.path + for keyword in self.target_path_keywords: + first_keyword_hit = re.search(keyword, url_path) if first_keyword_hit is not None: self.logger.debug(f"For {url} keyword hit: {first_keyword_hit.group(0)}") - return True, first_keyword_hit.group(0) + return True, keyword return False, None @@ -182,6 +198,9 @@ def url_is_target(self, url: str) -> bool: def skip_this_url(self, url: str) -> bool: """Function to see if we skip url""" + if url in self.visited: + return True + # Only visit valid urls if not validators.url(url): return True @@ -248,12 +267,13 @@ def parse(self, response): self.logger.debug(f"Hit timeout {self.timeout} seconds for spider with start urls: {self.start_urls}!") raise CloseSpider('bandwidth_exceeded') current_depth = response.meta.get("depth", 0) + steps_from_target = response.meta.get("steps_from_target", 0) - # Check if url is tagret + # Check if url is target url_is_targeted, first_keyword_hit = self.url_is_target(response.url) # If url is not target and exceeds hesitancy depth, return - if not url_is_targeted and current_depth >= self.max_depth: + if not url_is_targeted and steps_from_target >= self.max_depth: return # Determine whether we need to add a jump @@ -310,7 +330,7 @@ def parse(self, response): self.save_batch() # Reset current depth because we found target at current page - current_depth = 0 + steps_from_target = 0 # Extract and follow links for link in response.css("a::attr(href)").getall(): @@ -327,6 +347,7 @@ def parse(self, response): "base_url": response.meta.get("base_url"), "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", "depth": current_depth + 1, + "steps_from_target": steps_from_target + 1, "jumps": jumps }, dont_filter=False # Skip duplicates @@ -338,6 +359,7 @@ def parse_sitemap(self, response): urls = response.xpath('//ns:url/ns:loc/text()', namespaces=ns).getall() for url in urls: + url = normalize_url(url) # Only continue with valid crawl paths if self.skip_this_url(url): continue diff --git a/src/main_scrapy.py b/src/main_scrapy.py index 5d281cd..701ffa8 100644 --- a/src/main_scrapy.py +++ b/src/main_scrapy.py @@ -37,9 +37,9 @@ def read_parquet_dir(parquet_dir): yield df[df['content'].apply(is_valid_string)] -# Spawn spider crawler process -def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, logfile, output_file): - print(f"Args: urls: {urls}, keywords: {keywords}, skip domains: {skip_domains}, log level {log_level}, log file: {logfile}, output file: {output_file}, process_id: {process_id}") +# Spawn spider crawler process +def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, process_id, log_level, logfile, output_file, schema_keywords): + print(f"Args: urls: {urls}, netloc keywords: {netloc_keywords}, path keywords: {path_keywords}, skip domains: {skip_domains}, log level: {log_level}, log file: {logfile}, output file: {output_file}, process_id: {process_id}") print(f"Starting crawling process (PID: {process_id}, OSPID: {os.getpid()}) for {urls}!") project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) if project_root not in sys.path: @@ -83,7 +83,8 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo spiderCrawler, start_urls=urls, max_depth=2, - target_keywords=keywords, + target_netloc_keywords=netloc_keywords, + target_path_keywords=path_keywords, skip_domains=skip_domains, output_file=output_file, allowed_top_level_domains=[".com", ".nl", ".ai", ".de", ".be", ".eu", ".io"], @@ -97,7 +98,7 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo ], allowed_languages=["nl", "en", "en-uk", "en-gb"], allowed_countries=["nl"], - schema_keywords=["JobPosting"] + schema_keywords=schema_keywords ) # If worker gets 0 urls, pass (shouldn't happen) @@ -141,10 +142,15 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo urls = [*map(normalize_url, urls)] # Keywords - file_keywords = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}" + file_keywords = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.netloc_keywords}" logging.info(f"Reading list of keywords from file: {file_keywords}") with open(file_keywords, 'r', encoding='utf-8') as file_in: - target_keywords = [line.rstrip() for line in file_in] + target_netloc_keywords = [line.rstrip() for line in file_in] + + file_keywords = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.path_keywords}" + logging.info(f"Reading list of keywords from file: {file_keywords}") + with open(file_keywords, 'r', encoding='utf-8') as file_in: + target_path_keywords = [line.rstrip() for line in file_in] # Skip domains file_skip_domains = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}" @@ -169,12 +175,14 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo chunked_args.append( ( url_chunks[i], - target_keywords, + target_netloc_keywords, + target_path_keywords, skip_domains, i, logging_level, logfile, - f"{CONFIG.output.output_dir}/{time_part}/worker_{i}.parquet" # Different output files per werker + f"{CONFIG.output.output_dir}/{time_part}/worker_{i}.parquet", # Different output files per werker + [CONFIG.crawl.schema.keyword] ) ) @@ -187,8 +195,7 @@ def spawn_spider_process(urls, keywords, skip_domains, process_id, log_level, lo end_time = time.perf_counter() - -# Results in tables + # Results in tables dir_parquets = f"{CONFIG.output.output_dir}/{time_part}/" parquet_dfs = read_parquet_dir(dir_parquets) From 29d6a636a3c478b391c199c1316180104b8a3608 Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 08:39:03 +0000 Subject: [PATCH 10/26] add missing config keys --- config/config_template.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/config/config_template.yaml b/config/config_template.yaml index d1802d0..b2ed76c 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -12,16 +12,17 @@ requests: timeout_read: 7 # In seconds max_retries: 3 input: - input_dir: ../input + input_dir: input input_files: skip_domains: skipdomains.txt urls: urls.txt - keywords: keywords.txt + netloc_keywords: keywords.txt + path_keywords: keywords.txt url_max: 100 url_offset: 0 input_variables: output: - output_dir: ../output + output_dir: output batchsize: 100 logs: logs crawl: From 80b1a414e0eb48f9457b9546b93231e3cd934572 Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 08:40:23 +0000 Subject: [PATCH 11/26] adjust testing code of hesitant spider --- src/crawl/scrapymodules/HesitantSpider.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/crawl/scrapymodules/HesitantSpider.py index c8ab72a..f47011b 100644 --- a/src/crawl/scrapymodules/HesitantSpider.py +++ b/src/crawl/scrapymodules/HesitantSpider.py @@ -412,13 +412,17 @@ def closed(self, reason): spiderCrawler = process.create_crawler(HesitantSpider) # Crawl and configure spider - # urls = ['https://books.toscrape.com/'] - # target_keywords = ["philosophy"] - - urls = ['https://werkenbijhetcbs.nl/'] - target_keywords = ["enqueteur"] + urls = ['https://books.toscrape.com/'] + target_keywords = ["philosophy"] sitemaps_tocheck = ["sitemap.xml"] allowed_top_level_domains = [".com", ".nl"] + max_depth = 1 + + # urls = ['https://werkenbijhetcbs.nl/'] + # target_keywords = ["enqueteur"] + # sitemaps_tocheck = ["sitemap.xml"] + # allowed_top_level_domains = [".com", ".nl"] + # max_depth = 2 # Skip domains file_skip_domains = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}" @@ -433,7 +437,9 @@ def closed(self, reason): process.crawl( spiderCrawler, start_urls=urls, - target_keywords=target_keywords, + target_netloc_keywords=target_keywords, + target_path_keywords=target_keywords, + max_depth=max_depth, skip_domains=skip_domains, allowed_top_level_domains=allowed_top_level_domains, output_file=output_file, From a0d2944b318ecb6151d46af1f06daf2c4d940f49 Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 08:45:15 +0000 Subject: [PATCH 12/26] remove unused code --- src/analysis/__init__.py | 0 src/analysis/analyze_results.py | 151 --------------- src/crawl/HesitantCrawler.py | 326 -------------------------------- src/crawl/__init__.py | 2 - src/crawl/base.py | 103 ---------- src/fetch/HTML.py | 196 ------------------- src/fetch/Robots.py | 70 ------- src/fetch/__init__.py | 3 - src/fetch/base.py | 65 ------- src/main.py | 52 ----- src/scrape/__init__.py | 44 ----- src/scrape/base.py | 162 ---------------- 12 files changed, 1174 deletions(-) delete mode 100644 src/analysis/__init__.py delete mode 100644 src/analysis/analyze_results.py delete mode 100644 src/crawl/HesitantCrawler.py delete mode 100644 src/crawl/base.py delete mode 100644 src/fetch/HTML.py delete mode 100644 src/fetch/Robots.py delete mode 100644 src/fetch/__init__.py delete mode 100644 src/fetch/base.py delete mode 100644 src/main.py delete mode 100644 src/scrape/__init__.py delete mode 100644 src/scrape/base.py diff --git a/src/analysis/__init__.py b/src/analysis/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/analysis/analyze_results.py b/src/analysis/analyze_results.py deleted file mode 100644 index 62b5843..0000000 --- a/src/analysis/analyze_results.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -import logging -from typing import Set -import re - -import pandas as pd - -from util import setup - -CONFIG = setup("../config/config.yaml") - - -def is_valid_string(s): - if not isinstance(s, str): - return False - if len(s) == 0: - return False - strange_chars = re.findall(r'[\x00-\x08\x0B\x0E-\x1F\x7F]', str(s)) - return not (len(strange_chars) / len(s)) > 0.1 - - -class ParquetReader(object): - def __init__( - self, - dir_parquets: str, - filter_valid_content: bool = True): - """ - Reader finds (all) parquet files in given folder and yields each as a pd.DataFrame - """ - self._dir_parquets = dir_parquets - logging.info(f"ParquetReader will search for parquet files in: {dir_parquets}.") - self._filter_valid_content = filter_valid_content - logging.info(f"ParquetReader will filter content for valid strings: {filter_valid_content}.") - - def __iter__(self): - cnt = 0 - for root, dirs, files in os.walk(self._dir_parquets): - for file in files: - if file.endswith('.parquet'): - cnt += 1 - file_path = os.path.join(root, file) - logging.debug(f"Yielding parquet file: {file_path}.") - df = pd.read_parquet(file_path) - if not self._filter_valid_content: - yield df - else: - yield df[df['content'].apply(is_valid_string)] - logging.info(f"ParquetReader iterated through {cnt} parquet files in total.") - - -def get_baseurls(df: pd.DataFrame) -> Set: - return set(list(df['base_url'].drop_duplicates())) - - -class LogReader(object): - def __init__( - self, - dir_logs: str): - """ - """ - self._dir_logs = dir_logs - logging.info(f"LogReader will search for log files in: {dir_logs}.") - - def __iter__(self): - cnt = 0 - for root, dirs, files in os.walk(self._dir_logs): - for file in files: - if file.endswith('.log'): - cnt += 1 - file_path = os.path.join(root, file) - logging.debug(f"Yielding log file: {file_path}.") - with open(file_path, 'r', newline='\n') as filelog: - for line in filelog: - yield line - logging.info(f"LogReader iterated through {cnt} log files in total.") - - -if __name__ == "__main__": - - logging.basicConfig(level=logging.INFO) - - # Input URLs - with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.urls}", 'r', encoding='utf-8') as file_in: - urls = [line.rstrip() for line in file_in] - - # Results from log file - dir_logs = f"{CONFIG.output.output_dir}/{CONFIG.output.logs}" - lr = LogReader(dir_logs=dir_logs) - urls_tried = set() - visits = {} - base_url_current = '' - - for logline in lr: - if "Trying to crawl base url: " in logline: - base_url_current = logline.split("Trying to crawl base url: ")[1].split('\n')[0] - if base_url_current not in urls_tried: - urls_tried.add(base_url_current) - visits[base_url_current] = 0 - else: - logging.error(f"This should not have happend. Twice url: {base_url_current}") - - if " visits out of maximum " in logline: - if len(base_url_current) == 0: - logging.error("This should not have happend. Found visits before new url declaration") - visits[base_url_current] = int(logline.split(" visits out of maximum ")[1].split('.')[0]) - base_url_current = '' - - for k in visits: - if k not in urls_tried: - print(k) - - logging.info(f"Processed urls: {len(urls_tried)}, of total given: {len(urls)}.") - visits_none = {k for k, v in visits.items() if v == 0} - logging.info(f"Websites without visits: {len(visits_none)}.") - visits = {k: v for k, v in visits.items() if v > 0} - logging.info(f"Websites with visits: {len(visits)}.") - - # Results in tables - dir_parquets = CONFIG.output.output_dir - pr = ParquetReader(dir_parquets=dir_parquets) - - # Analysis - urls_withcontent = set() - count_content = 0 - dfs = [] - for df in pr: - logging.debug(df.head(2)) - urls_withcontent = urls_withcontent.union(get_baseurls(df=df)) - count_content += df.shape[0] - dfs.append(df) - - logging.info(f"Total number of base-urls tried: {len(urls)}.") - logging.info(f"Total number of base-urls with scraped content: {len(urls_withcontent)}.") - logging.info(f"Total number of pages downloaded: {count_content}.") - - total = pd.concat(dfs, ignore_index=True) - logging.debug(f"Total number of base-urls with scraped content: {len(get_baseurls(df=total))}.") - logging.debug(f"Total number of pages downloaded: {total.shape[0]}.") - - dfs.to_parquet("output/output.parquet") - - gr = total.groupby(by='base_url', as_index=False)['url'].count() - gr = gr.rename(columns={'url': 'pages', 'base_url': 'count'}) - gr = gr.groupby(by='pages', as_index=False).count() - counts_stats = {row['pages']: row['count'] for row in gr.to_dict(orient='records')} - counts_out = pd.DataFrame([{'pages': k, 'counts': v} for k, v in counts_stats.items()]) - # counts_out.to_csv('scraped_pages_count.csv', index=False) - - logging.info(f"Downloaded single page for {counts_stats[1]} base-urls.") - logging.info(f"Maximum number op pages downloaded for a given base-url: {max(counts_stats.keys())}.") - \ No newline at end of file diff --git a/src/crawl/HesitantCrawler.py b/src/crawl/HesitantCrawler.py deleted file mode 100644 index 9d07853..0000000 --- a/src/crawl/HesitantCrawler.py +++ /dev/null @@ -1,326 +0,0 @@ -from typing import List -import time -import logging -import re -import validators - -import numpy as np -from urllib.parse import urlparse, urljoin -from bs4 import BeautifulSoup - -from .base import BaseCrawler, CrawlResult -from fetch import HTMLFetcher -from util import setup - -CONFIG = setup("config/config.yaml") - - -class HesitantCrawler(BaseCrawler): - def __init__( - self, - fetcher: HTMLFetcher, - target_keywords: List[str], - add_sitemapurls: bool = False, - max_depth: int = 1, - skip_domains: List[str] = []): - """ - Depth-limited Search Targeted Crawler - Crawler class for obtaining urls from start_url. - Crawler will look for urls on start_url and append them to list. It will then - look for urls in the next item on the list and append urls to the end of the list. - Once the max_crawl_visits is visited, crawl stops. - - This crawler must be used as a focused crawl where target_keywords are given. - Only URLs found that meet the target keywords are stored in the results. - - Crawler will hesitate: - It will not stop at path that led to no targeted results - as long as it hasn't gone too deep yet. - The hesitancy reflects the idea to crawl on non-relevant pages because they might lead to relevant pages. - - If sitemap is also to be checked for urls, set add_sitemapurls = True - If only starting_page and sitemaps should be used exclusively, set max_crawl_visits = 0 in config file - - :param add_sitemapurls: True if urls from sitemap are added to crawl - :param target_keywords: List of targeting keywords in regex format - :param max_depth: How many steps further do we look beyond non-targeted results, defaults to 1 - """ - logging.info(f"Initializing HesitantCrawler with max_depth={max_depth}") - self.max_depth = max_depth - if max_depth < 0: - logging.debug("Only urls from starting_url (and possibly sitemap if used) can be found, since max_depth<0") - - super(HesitantCrawler, self).__init__(fetcher=fetcher) - - # crawl delay will be overwritten if robots from given domain provides a value # TODO: make sure - self.crawl_delay = 2 - logging.debug(f"Defaul crawl delay is set to {self.crawl_delay}") - - self.max_duration = CONFIG.crawl.max_duration - logging.debug(f"Max duration of crawl set to {self.max_duration} seconds") - - self.max_crawl_visits = CONFIG.crawl.max_visits - logging.debug(f"Max page visits of crawl set to {self.max_crawl_visits}") - - # Targets - self.target_keywords = target_keywords - logging.info(f"The targeted crawl will look for given keywords: {', '.join(self.target_keywords)}") - - # Skip domains - self.skip_domains = skip_domains - logging.info(f"The targeted crawl will skip domains: {', '.join(self.skip_domains)}") - - # Excluded URLs which contain: - self._unsupported = ( - ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps", - ".woff2", ".svg", ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".aiff", ".3gp", ".asf", ".asx", ".avi", ".mp4", - ".woff", ".mpg", ".qt", ".rm", ".swf", ".wmv", ".m4a", ".css", ".pdf", ".doc", ".docx", ".exe", ".bin", ".rss", ".zip", - ".rar", ".msu", ".flv", ".dmg", ".xls", ".xlsx", ".ico", ".mng?download=true", ".pct?download=true", ".bmp?download=true", - ".gif?download=true", ".jpg?download=true", ".jpeg?download=true", ".png?download=true", ".pst?download=true", - ".psp?download=true", ".tif?download=true", ".tiff?download=true", ".ai?download=true", ".drw?download=true", - ".dxf?download=true", ".eps?download=true", ".ps?download=true", ".svg?download=true", ".mp3?download=true", - ".wma?download=true", ".ogg?download=true", ".wav?download=true", ".ra?download=true", ".aac?download=true", - ".mid?download=true", ".au?download=true", ".aiff?download=true", ".3gp?download=true", ".asf?download=true", - ".asx?download=true", ".avi?download=true", ".mov?download=true", ".mp4?download=true", ".mpg?download=true", - ".qt?download=true", ".rm?download=true", ".swf?download=true", ".wmv?download=true", ".m4a?download=true", - ".css?download=true", ".pdf?download=true", ".doc?download=true", ".exe?download=true", ".bin?download=true", - ".rss?download=true", ".zip?download=true", ".rar?download=true", ".msu?download=true", ".flv?download=true", - ".dmg?download=true") - logging.debug(f"URLs will be excluded if they contain any in path:{', '.join(self._unsupported)}") - - self.add_sitemapurls = add_sitemapurls - logging.info(f"Will we check URLs from sitemap? Answer: {add_sitemapurls}") - - def skip_this_url(self, url: str) -> bool: - """Function to see if we have already visited url""" - - # prevent duplicate crawl from trailing forward slash in URL - url = url.rstrip('/') if url.endswith('/') else url - - # prevent duplicate crawl from '#' such as '#content', '#main', etc. - url = url.rstrip("#") if "#" in url else url - - if any([skip_domain in url for skip_domain in self.skip_domains]): - logging.debug(f"Skip {url}, because domain is in skip-list") - return True # skip - - # Do not revisit pages - if url in self._visited: - logging.debug(f"Skip {url}, because we have visited it before") - return True # skip - return False - - def find_urls(self, url: str, html: str) -> str: - """ - Generator that yields a URLs to check for target condition - """ - - soup = BeautifulSoup(html, "html.parser") - - # Extract links - will later be checked if they are internal - for link in soup.find_all("a", href=True): - href = link["href"] - absolute_url = urljoin(url, href) # TODO: find out if necessary - absolute_url = absolute_url.rstrip('/') if absolute_url.endswith('/') else absolute_url - # parsed = urlparse(absolute_url) - - # if parsed.netloc == self.domain and absolute_url not in self._istargeted: - if absolute_url not in self._istargeted: - logging.debug(f"Found a URL to check: {absolute_url}") - yield absolute_url - - def find_target(self, parsed: str) -> str: - """Check if the parsed URL matches the target keywords in subdomain or path""" - - subdomain = parsed.netloc - logging.debug(f"Current URL subdomain is identified as: {subdomain}") - path = parsed.path - logging.debug(f"Current URL path is identified as: {path}") - - # Check for keywords in subdomain - for keyword in self.target_keywords: - first_keyword_hit = re.search(keyword, subdomain) - if first_keyword_hit is not None: - logging.debug(f"Target is met in the subdomain: {subdomain}") - logging.debug(f"Target is met with the following hit: {first_keyword_hit.group(0)}") - return first_keyword_hit.group(0) - - # Check for keywords in path - for keyword in self.target_keywords: - first_keyword_hit = re.search(keyword, path) - if first_keyword_hit is not None: - logging.debug(f"Target is met in the path: {path}") - logging.debug(f"Target is met with the following hit: {first_keyword_hit.group(0)}") - return first_keyword_hit.group(0) - - logging.debug("Target has not been met, no hit") - return '' - - def process_url(self, url: str, parent_url: str, from_sitemap: bool = False): - """check url for target and then add to results and queue""" - - if not validators.url(url): - logging.debug(f"Invalid url: {url}") - return - - if url in self._istargeted: - return - - if any(ext in url for ext in self._unsupported): - self._istargeted[url] = { - 'parent': parent_url, - 'depth': np.inf, - 'is_deadend': True} - logging.debug("Unsupported url, setting depth to infinite and deadend=True, will not be added to queue") - return - - # parse the url - parsed = urlparse(url) - domain = parsed.netloc - - # dead ends for queue - is_deadend = False - if from_sitemap: - is_deadend = True # won't be added to queue if from sitemap tree - - if domain != self.start_domain: - if domain != self._istargeted[parent_url]['domain']: - if self._istargeted[parent_url]['domain'] != self.start_domain: - # In this case we have jumped to a third domain, not allowed at all! - logging.debug("Deviated from domain twice, url is not allowed") - return - else: - # In this case we jumped the first time, allowed and we still like to crawl that site actually - pass - # TODO: check if following should be done? What if on a job board the link goes to vacancies of a different company? - # else: - # # We are still on the domain after the first jump, allowed to be targeted but no more crawl - # is_deadend = True - logging.debug(f"Result of check if the URL is a dead end: {is_deadend}") - - # determine if it is targeted - first_keyword_hit = self.find_target(parsed=parsed) - is_targeted = True if len(first_keyword_hit) > 0 else False - logging.debug(f"Result of check if the URL is targeted: {is_targeted}") - - # keep track of how far wway we've walked from targeted site - depth = 0 if is_targeted else self._istargeted[parent_url]['depth'] + 1 - logging.debug(f"Depth = steps away from a targeted URL: {depth}") - self._istargeted[url] = { - 'domain': parsed.netloc, - 'parent': parent_url, - 'depth': depth, - 'is_deadend': is_deadend} - - # Add to results if targeted - if is_targeted: - logging.info(f"Found a targeted URL: {url}") - logging.debug("Adding the URL to our list with results") - self._results.append(CrawlResult(url=url, source="NoCrawler", targeted=True, first_keyword_hit=first_keyword_hit)) - - # May anyways be added to queue of URLs to visit for more URLS - if (depth <= self.max_depth) and (not is_deadend) and (not from_sitemap): - logging.debug(f"Adding the URL to queue vor visiting with depth={depth} at max_depth={self.max_depth}") - self._queue.append(url) - - def order_queue(self): - """Reorder elements in queue by ascending depth of URL, so that targeted URLs are visited first""" - - if len(self._queue) > 0: - self._queue = sorted(self._queue, key=lambda x: self._istargeted.get(x, {'depth': np.inf})['depth']) - - def crawl(self): - """ - Main crawling function - Results can be otbained by calling get_results() - """ - - if len(self.start_url) == '': - logging.error("No start URL provided for crawler, use reset_with_starturl() to reset crawler") - return {} - logging.info(f"Starting crawl of {self.start_url}..") - - # domain - domain = urlparse(self.start_url).netloc - - # The queue will be updated with found urls and then worked through - # until a maximum number of visits or duration is reached - self._queue = [self.start_url] - start_time = time.time() - duration = 0 - - # for reference, put start_url and domain in dictionary - self._istargeted[self.start_url] = {'depth': 0, 'domain': domain, 'is_deadend': False} - self._istargeted[domain] = {'depth': 0, 'domain': domain, 'is_deadend': False} - - while self._queue and len(self._visited) < self.max_crawl_visits and duration < self.max_duration: - - # Take an element from the queue - visiting_url = self._queue.pop(0) # will start with base url, then whatever will have been added next - - # Check if we already visited URL - logging.debug(f"Check if {visiting_url} can be skipped") - if self.skip_this_url(url=visiting_url): - continue - - # Fetch from visting URL, will check robots if it is allowed (as part of Fetcher class) - try: - visiting_html, schema_indicator = self._fetcher.fetch(url=visiting_url) - self._visited[visiting_url] = visiting_html # even if nothing found, keep track of what we have tried - if len(visiting_html) == 0: # Nothing returned - continue - - for found_url in self.find_urls(url=visiting_url, html=visiting_html): - self.process_url(url=found_url, parent_url=visiting_url) - except Exception: - continue - - # At the end, measure how long we've been busy so far - duration = time.time() - start_time - - # Respect crawl delay - logging.debug("Waiting for delay to pass") - time.sleep(self.crawl_delay) - logging.debug("Delay has passed") - - # order queue by depth, ascending - so that targeted URLs are crawled before the ones further removed - self.order_queue() - - # Crawl stopped - logging.debug(f"Crawl stopped after {np.around(duration, 0)} seconds, with max duration {self.max_duration} seconds") - logging.debug(f"Crawl stopped after {len(self._visited)} page visits, with max {self.max_crawl_visits}") - logging.debug(f"Crawl stopped with {len(self._queue)} urls still in the queue") - - logging.info(f"Crawling from {self.start_url} involved checking {len(self._istargeted)} URLs for meeting the target") - logging.info(f"Crawling from {self.start_url} resulted in {len(self.get_results())} results") - logging.debug(f"Crawling from {self.start_url} results: {self.get_results()}") - - if self.add_sitemapurls: - self.extendcrawl_fromsitemaps(domain=domain) - - def extendcrawl_fromsitemaps(self, domain: str): - sitemap_urls = self._fetcher.robotsfetcher.get_sitemap_urls(domain=domain) - if sitemap_urls: - logging.info(f"Sitemaps of {self.start_url} linked to {len(sitemap_urls)} URLs to check for meeting the target") - for found_url in sitemap_urls: - self.process_url(url=found_url, parent_url=domain, from_sitemap=True) - logging.info(f"Sitemaps of {self.start_url} increased the number of results to {len(self.get_results())}") - logging.info(f"No sitemap URLs found for {self.start_url}") - - -if __name__ == "__main__": - - logging.basicConfig(level=logging.INFO) - - target_keywords = ["vacature"] - fetcher = HTMLFetcher() - - crawler = HesitantCrawler( - fetcher=fetcher, - target_keywords=target_keywords, - max_depth=-1, - add_sitemapurls=True - ) - - # Crawl can start as soon as start url provided - crawler.reset_with_starturl(start_url="https://cbs.nl") - crawler.crawl() diff --git a/src/crawl/__init__.py b/src/crawl/__init__.py index 77999db..e69de29 100644 --- a/src/crawl/__init__.py +++ b/src/crawl/__init__.py @@ -1,2 +0,0 @@ -from .base import ICrawler, NoCrawler, BaseCrawler, CrawlResult -from .HesitantCrawler import HesitantCrawler \ No newline at end of file diff --git a/src/crawl/base.py b/src/crawl/base.py deleted file mode 100644 index 1a9f576..0000000 --- a/src/crawl/base.py +++ /dev/null @@ -1,103 +0,0 @@ -from abc import ABC, abstractmethod -from typing import NamedTuple, List -import logging -from urllib.parse import urlparse -from scrapy.http import Response -from fetch import IFetcher - - -class CrawlResult(NamedTuple): - url: str - source: str - targeted: bool = None - first_keyword_hit: str = None - crawl_depth: int = 0 - - -class ICrawler(ABC): - """ - interface for all crawlers - """ - def __init__(self, fetcher: IFetcher): - logging.info(f"Initializing crawler with fetcher of type: {type(fetcher)}") - self._fetcher = fetcher - - @abstractmethod - def reset_with_starturl(start_url: str): - """Reset crawler and set url from which to start the crawl""" - raise NotImplementedError() - - @abstractmethod - def get_results() -> List[CrawlResult]: - """Return list of crawled URLs""" - return NotImplementedError() - - @abstractmethod - def crawl(): - """Crawl candidate URLs""" - raise NotImplementedError() - - -class BaseCrawler(ICrawler): - """ - Base functionality of all Crawlers - """ - def __init__(self, fetcher: IFetcher): - super(BaseCrawler, self).__init__(fetcher=fetcher) - self.start_url = "" - self.start_domain = "" - self.crawl_delay = 2 - - def reset_results(self): - logging.debug("Crawler is (re)set with empty results") - self._results = [] # for output - self._queue = [] # for next visits - self._visited = dict() # to keep track of visited pages - self._istargeted = dict() # will keep track of urls and if they met targeting conditions - - def reset_with_starturl(self, start_url: str): - """Reset crawler and set url from which to start the crawl""" - self.reset_results() - - logging.debug(f"Crawler start url given as: {start_url}") - if not start_url.startswith('https://') and not start_url.startswith('http://'): - logging.debug("Start URL lacks required http or https prefix") - start_url = f"https://{start_url}" - logging.info(f"Prefix 'https://' added to start URL: {start_url}") - self.start_url = start_url - - self.start_domain = urlparse(start_url).netloc - - def get_results(self) -> List[CrawlResult]: - """Return list of crawled URLs""" - return self._results - - def crawl(): - """Crawl candidate URLs""" - raise NotImplementedError() - - -class NoCrawler(BaseCrawler): - """ - Do nothing Crawler for testing, just put start_url in results - """ - def __init__(self): - from fetch import NoFetcher - logging.info("Initializing NoCrawler which will not crawl") - logging.debug("Since Crawler won't go looking for urls, the NoFetcher is loaded as a dummy") - super(NoCrawler, self).__init__(fetcher=NoFetcher()) - - def crawl(self): - logging.info("Just adding start-url to the results") - result = CrawlResult(url=self.start_url, source="NoCrawler") - self._results.append(result) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.DEBUG) - - crawler = NoCrawler() - crawler.reset_with_starturl(start_url="https://books.toscrape.com") - crawler.crawl() - for r in crawler.get_results(): - print(r) diff --git a/src/fetch/HTML.py b/src/fetch/HTML.py deleted file mode 100644 index 597cc15..0000000 --- a/src/fetch/HTML.py +++ /dev/null @@ -1,196 +0,0 @@ -import requests -from typing import Dict, Optional -import time -import random -import urllib -from urllib.parse import urlparse -import logging - -import extruct -from w3lib.html import get_base_url - -from util import setup -from .base import IFetcher - -CONFIG = setup("config/config.yaml") - - -class HTMLFetcher(IFetcher): - """ - Standard Fetcher - Fetches the HTML content of the given URL with retries and error handling. - Uses a robots fetcher - Returns a dictionary with the URL as key and the HTML content as value. - """ - def __init__( - self, - user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", - headers: Optional[Dict] = None): - logging.info("Initializing HTMLFetcher") - super(HTMLFetcher, self).__init__(user_agent=user_agent) - self.user_agent = user_agent - logging.debug(f"User agent given as: {user_agent}") - - self.timeout = ( - CONFIG.requests.timeout_connect, - CONFIG.requests.timeout_read) - logging.debug(f"Timeout for connection is {CONFIG.requests.timeout_connect} seconds, for reading {CONFIG.requests.timeout_read} seconds") - - self.max_retries = CONFIG.requests.max_retries - logging.debug(f"Maximum retries set to {CONFIG.requests.max_retries}") - - self.headers = headers or { - "User-Agent": self.user_agent, - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9,nl-NL;q=0.8,nl;q=0.7", - "Accept-Encoding": "identity", - "Connection": "keep-alive" - } - - headers_str = ', '.join([f"{k}: {v}" for k, v in self.headers.items()]) - logging.debug(f"Request headers set to {headers_str}") - - # Domain will have to be identified for any given url to fetch, then the corresponding robots file will be checked - # this is handled by RobotsFetcher - from .Robots import RobotsFetcher - self.robotsfetcher = RobotsFetcher(user_agent=user_agent) - self._robots_bydomain = self.robotsfetcher.get_results() - - def resetResults(self): - self.results = {} - return - - def is_allowed(self, url: str) -> bool: - """Will check if robots of given domain allows fetching""" - - # Identify given domain to check corresponding robots file - domain = urlparse(url).netloc # obtain domain from url - logging.debug(f"The domain is identified as {domain}") - - if not self._robots_bydomain.get(domain, False): - self.robotsfetcher.fetch(domain=domain) - self._robots_bydomain[domain].read() - logging.debug(f"A new robots file has been read for domain {domain}") - - # check if allowed - if self._robots_bydomain[domain].can_fetch(useragent=self.user_agent, url=url): - return True - else: - return False - - def fetch(self, url: str) -> str: - """ - Fetches the HTML content of the given URL with retries and error handling. - Returns a dictionary with the URL as key and the HTML content as value. - """ - logging.info(f"Trying to fetch the next URL: {url}") - - # check if allowed - logging.debug("Checking if url is allowed") - if not self.is_allowed(url=url): - logging.debug(f"Given url skipped because it is not allowed: {url}") - return {} - - return self._fetch_with_retries(url) - - def _fetch_with_retries(self, url: str, retries: int = 0): - """ - Internal method that performs the request with retry logic. - """ - try: - response = requests.get(url, headers=self.headers, timeout=self.timeout) - - # Check for HTTP errors - if response.status_code != 200: - logging.warning(f"Exited with response status: {response.status_code}") - return {} - - # Check if content is HTML - if "text/html" not in response.headers.get("Content-Type", ""): - logging.info(f"Non-HTML content received for URL: {url}") - return {} - - # Success - result = response.text - - # Base URL is needed to resolve relative URLs in the metadata - base_url = get_base_url(result, response.url) - # Extract JSON-LD - schema_indicator = False - try: - data = extruct.extract(result, base_url=base_url) - - # Filter out empty formats - found_schema = {k: v for k, v in data.items() if v} - - if found_schema: - for format_type in found_schema: - if format_type == "json-ld": - for el in found_schema["json-ld"]: - if "@context" in el.keys(): - if "@type" in el.keys(): - if el["@type"] == CONFIG.crawl.schema.keyword: - logging.info(f"Element {CONFIG.crawl.schema.keyword} found using schema.org for base url: {base_url}") - print(f"{CONFIG.crawl.schema.keyword} found using schema.org for base url: {base_url}") - schema_indicator = True - if "@graph" in el.keys(): - for graphel in el["@graph"]: - if graphel["@type"] == CONFIG.crawl.schema.keyword: - logging.info(f"Graph element {CONFIG.crawl.schema.keyword} found using schema.org for base url: {base_url}") - print(f"Graph {CONFIG.crawl.schema.keyword} found using schema.org for base url: {base_url}") - schema_indicator = True - except Exception: - pass - - if CONFIG.crawl.schema.keyword in result: - logging.debug(f"Found schema keyword: {CONFIG.crawl.schema.keyword} in HTML for url: {url}, possible schema org?") - - self.results[url] = (result, schema_indicator) - return result, schema_indicator - - except requests.exceptions.RequestException as e: - # Handle exceptions - logging.info(f"Request failed for {url}. Error: {e}") - - if retries < self.max_retries: - wait_time = random.uniform(1, 5) # Random delay between 1 and 5 seconds - logging.info(f"Retrying in {wait_time:.2f} seconds...") - time.sleep(wait_time) - return self._fetch_with_retries(url, retries + 1) - - # Max retries reached - return {} - - except urllib.error.URLError as e: - logging.info(f"Request failed with exception: {e}") - return {} - - def get_results(self) -> Dict[str, str]: - """ - Returns the dictionary of fetched URLs and their HTML content. - """ - return self.results - - -if __name__ == "__main__": - - logging.basicConfig(level=logging.DEBUG) - - user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - fetcher = HTMLFetcher( - user_agent=user_agent - ) - - urls = [ - "https://example.com", - "https://books.toscrape.com", - "https://werkenbijhetcbs.nl/vacature-overzicht-express#/?page=1", - "https://books.toscrape.com/catalogue/category/books/travel_2/index.html" - ] - - for url in urls: - fetcher.fetch(url) - - for url, html in fetcher.get_results().items(): - print(f"\nURL: {url}") - print(f"...{html[:100]}...\n\n") diff --git a/src/fetch/Robots.py b/src/fetch/Robots.py deleted file mode 100644 index 24d18c5..0000000 --- a/src/fetch/Robots.py +++ /dev/null @@ -1,70 +0,0 @@ -from typing import Dict, List -import logging -from urllib.robotparser import RobotFileParser -from usp.tree import sitemap_tree_for_homepage - -from util import setup -from .base import IFetcher - -CONFIG = setup("config/config.yaml") - - -class RobotsFetcher(IFetcher): - """ - Robots Fetcher for accessing information in robots file - """ - def __init__( - self, - user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"): - logging.info("Initializing RobotsFetcher") - super(RobotsFetcher, self).__init__(user_agent=user_agent) - - # keep track of domains for which the robots file has already been fetched - self.results = dict() - - def fetch(self, domain: str) -> RobotFileParser: - """Fetches robots file for given url domain, if not already done""" - - # only download robots in case it hasn't already - self.results.setdefault(domain, RobotFileParser(url=f"https://{domain}/robots.txt")) - return self.results[domain] - - def get_results(self) -> Dict[str, RobotFileParser]: - """ - Returns the dictionary of fetched URLs and their HTML content. - """ - return self.results - - def get_sitemap_urls(self, domain: str) -> List[str]: - """Get a list of sitemaps listed on robots.txt""" - try: - tree = sitemap_tree_for_homepage(f"https://{domain}", use_robots=True, use_known_paths=False) - sitemap_urls = [page.url for page in tree.all_pages()] - logging.debug(f"Found {len(sitemap_urls)} sitemap_urls for domain {domain}") - return sitemap_urls - except Exception as e: - logging.warning(f"Could not fetch sitemap_urls for domain {domain}: {e}") - return [] - - -if __name__ == "__main__": - from urllib.parse import urlparse - - logging.basicConfig(level=logging.DEBUG) - - fetcher = RobotsFetcher() - - urls = ["https://books.toscrape.com", - "https://cbs.nl"] - - domains = [urlparse(url=url).netloc for url in urls] - - for domain in domains: - fetcher.fetch(domain) - - for domain, robotsobject in fetcher.get_results().items(): - print(f"Domain: {domain}") - print(robotsobject.path) - - for domain in domains: - sitemaps = fetcher.get_sitemap_urls(domain=domain) \ No newline at end of file diff --git a/src/fetch/__init__.py b/src/fetch/__init__.py deleted file mode 100644 index 06bfbfa..0000000 --- a/src/fetch/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from fetch.base import IFetcher, NoFetcher -from fetch.Robots import RobotsFetcher -from fetch.HTML import HTMLFetcher \ No newline at end of file diff --git a/src/fetch/base.py b/src/fetch/base.py deleted file mode 100644 index 8202671..0000000 --- a/src/fetch/base.py +++ /dev/null @@ -1,65 +0,0 @@ -import logging -from abc import ABC, abstractmethod -from typing import Dict - - -class IFetcher(ABC): - """ - interface for all fetchers - """ - def __init__( - self, - user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"): - self.user_agent = user_agent - self.results = {} # {url: html_content} - - @abstractmethod - def fetch(self, url: str): - """Fetches content for given url""" - raise NotImplementedError() - - @abstractmethod - def get_results(self) -> Dict: - """Returns the dictionary of fetched URLs and their content""" - return NotImplementedError() - - -class NoFetcher(IFetcher): - """ - Do nothing Fetcher for testing, returns a minimal html doc - """ - def __init__(self): - logging.info("Initializing NoFetcher, returns a minimal default html") - super(NoFetcher, self).__init__() - - self._default_html = (""" - - -Hello -""") - - def fetch(self, url: str) -> str: - """Fetches default minimal html""" - self.results[url] = self._default_html - return self.results[url] - - def get_results(self) -> Dict[str, str]: - """ - Returns the dictionary of fetched URLs and their HTML content. - """ - return self.results - - -if __name__ == "__main__": - - logging.basicConfig(level=logging.DEBUG) - - fetcher = NoFetcher() - - urls = ["https://books.toscrape.com"] - for url in urls: - fetcher.fetch(url) - - for url, html in fetcher.get_results().items(): - print(f"\nURL: {url}") - print(f"...{html[:100]}...\n\n") diff --git a/src/main.py b/src/main.py deleted file mode 100644 index ce2e3f4..0000000 --- a/src/main.py +++ /dev/null @@ -1,52 +0,0 @@ -import os -from omegaconf import OmegaConf -from util import setup -import logging -from datetime import datetime -import time - -from scrape import build_webfocusedscraper - - -CONFIG = setup("config/config.yaml") - - -def main(): - """ - Crawl given urls to fetch relevant content from HTML - """ - - user_agent = CONFIG.requests.useragent - scraper = build_webfocusedscraper(user_agent=user_agent) - scraper.scrape() - - -if __name__ == "__main__": - - LOG_FILE = f"{CONFIG.output.output_dir}/{CONFIG.output.logs}" - if not os.path.exists(LOG_FILE): - os.makedirs(LOG_FILE) - LOG_FILE = f"{LOG_FILE}/{datetime.fromtimestamp(time.time()).strftime('%Y%m%d_%H%M%S')}_offset{CONFIG.input.url_offset}_testing-refactor.log" - logFormatter = logging.Formatter("%(levelname)s %(asctime)s %(processName)s %(message)s") - fileHandler = logging.FileHandler("{0}".format(LOG_FILE)) - fileHandler.setFormatter(logFormatter) - rootLogger = logging.getLogger() - rootLogger.addHandler(fileHandler) - rootLogger.setLevel(logging.INFO) - - logging.info("Config:") - logging.info(OmegaConf.to_yaml(CONFIG)) - - start_time = time.perf_counter() - - main() - - logging.info("Exiting with no error") - - end_time = time.perf_counter() - - print("Runtime: ", end_time - start_time) - # # Read the output files by using the following syntax: - # CONFIG = setup("../config/config.yaml") - # df = pd.read_parquet(f"{CONFIG.output.output_dir}/20260304_080625", engine="pyarrow") - # print(df.head()) diff --git a/src/scrape/__init__.py b/src/scrape/__init__.py deleted file mode 100644 index 4ebab41..0000000 --- a/src/scrape/__init__.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging -from scrape.base import IScraper, Scraper -from util import setup - -CONFIG = setup("config/config.yaml") - - -def build_webfocusedscraper(user_agent: str) -> IScraper: - """ - Build Scraper class with standard settings - """ - from crawl import HesitantCrawler - from fetch import HTMLFetcher - from parse import HTMLBodyParser - - with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.keywords}", 'r', encoding='utf-8') as file_in: - target_keywords = [line.rstrip() for line in file_in] - - with open(f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.skip_domains}", 'r', encoding='utf-8') as file_in: - skip_domains = [line.rstrip() for line in file_in] - - fetcher = HTMLFetcher(user_agent=user_agent) - crawler = HesitantCrawler( - fetcher=fetcher, - target_keywords=target_keywords, - add_sitemapurls=CONFIG.crawl.use_sitemap, - max_depth=CONFIG.crawl.max_depth, - skip_domains=skip_domains - ) - htmlparser = HTMLBodyParser() - - return Scraper( - crawler=crawler, - fetcher=fetcher, - htmlparser=htmlparser) - - -if __name__ == "__main__": - - logging.basicConfig(level=logging.DEBUG) - - user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - scraper = build_webfocusedscraper(user_agent=user_agent) - scraper.scrape() diff --git a/src/scrape/base.py b/src/scrape/base.py deleted file mode 100644 index c313300..0000000 --- a/src/scrape/base.py +++ /dev/null @@ -1,162 +0,0 @@ -import logging -import os -from abc import ABC, abstractmethod -import pandas as pd -import numpy as np -from typing import List -from datetime import datetime -import time -import random - -from util import setup -from fetch import IFetcher -from crawl import ICrawler -from parse import IHTMLParser - -CONFIG = setup("config/config.yaml") - - -class IScraper(ABC): - """ - Interface for all Scrapers - """ - def __init__(self, crawler: ICrawler, fetcher: IFetcher, htmlparser: IHTMLParser): - - # Set crawler and fetcher and parser - self._crawler = crawler - self._fetcher = fetcher - self._htmlparser = htmlparser - - @abstractmethod - def save_batch(self, batch: List, batch_id: int): - raise NotImplementedError() - - @abstractmethod - def scrape(self): - raise NotImplementedError() - - -class Scraper(IScraper): - """ - Interface for all Scrapers - """ - def __init__(self, crawler: ICrawler, fetcher: IFetcher, htmlparser: IHTMLParser): - super(Scraper, self).__init__(crawler=crawler, fetcher=fetcher, htmlparser=htmlparser) - - # All scrapers take base-url input from file - file_urls = f"{CONFIG.input.input_dir}/{CONFIG.input.input_files.urls}" - logging.info(f"Reading list of base-urls from file: {file_urls}") - logging.info(f"Offset is set to {CONFIG.input.url_offset} and maximum number of base-urls is {CONFIG.input.url_max}") - with open(file_urls, 'r', encoding='utf-8') as file_in: - self._base_urls = [line.rstrip() for line in file_in] - self._base_urls = self._base_urls[CONFIG.input.url_offset:CONFIG.input.url_offset + CONFIG.input.url_max] - random.shuffle(self._base_urls) - logging.debug(f"Read list with {len(self._base_urls)} base-urls from file: {file_urls}") - logging.debug(f"Scraper will start with entry {CONFIG.input.url_offset + 1} in the file") - - # create output folder with current datetime and possible url offset - self._dir_out = f"{CONFIG.output.output_dir}/{datetime.now().strftime('%Y%m%d_%H%M%S')}_offset{CONFIG.input.url_offset}" - logging.info(f"Creating output folder: {self._dir_out}") - os.makedirs(self._dir_out, exist_ok=True) - logging.debug("Created output folder") - # TODO instead consider a given folder name and crash-robust resuming of batch iteration - - def save_batch(self, batch: List, batch_id: int): - df = pd.DataFrame(batch) - - # add partition column - df["batch"] = batch_id - - df.to_parquet( - self._dir_out, - engine="pyarrow", - partition_cols=["batch"], - index=False, - compression="snappy" - ) - - def scrape(self): - - # saving data in batches - time_start = time.time() - buffer = [] - batch_id = 0 - - for cnt, base_url in enumerate(self._base_urls): - logging.info(f"Now starting scrape #{cnt + 1} of {len(self._base_urls)} base-urls") - - logging.info(f"Trying to crawl base url: {base_url}") - # Crawl can start as soon as start url provided - self._crawler.reset_with_starturl(start_url=base_url) - self._crawler.crawl() - - # track content by base_url to prevent duplicates - seen_content = set() - - delay = self._crawler.crawl_delay # might be different depending on curren domain - - # After crawl, collect results and parse content of targeted sites - # Some urls will already have their html fetched before during crawl, don't redo this then - for crawlresult in self._crawler.get_results(): - schema_indicator = False - html = self._crawler._visited.get(crawlresult.url, False) - if not html: - logging.debug(f"Downloading html from yet unvisited url {crawlresult.url}") - try: - html, schema_indicator = self._fetcher.fetch(crawlresult.url) - except Exception: - html = "" - # Respect crawl delay if crawler dose that - - logging.debug("Waiting for delay to pass") - time.sleep(delay) - logging.debug("Delay has passed") - - content = self._htmlparser.parse(html=html) - if content is not None and len(content) > 0: - if content in seen_content: # No dupliactes - logging.debug(f"Content from {crawlresult.url} is a duplicate, not added to output") - continue - - seen_content.add(content) - buffer.append({ - "base_url": base_url, - "url": crawlresult.url, - "first_keyword_hit": crawlresult.first_keyword_hit, - "content": content, - "schema_indicator": schema_indicator - }) - else: - logging.debug(f"After parsing no output for url {crawlresult.url}") - - if len(buffer) >= CONFIG.output.batchsize: - self.save_batch(batch=buffer, batch_id=batch_id) - logging.info(f"Saved batch number {batch_id} with {len(buffer)} records") - buffer = [] - batch_id += 1 - - # Remaining rows at the end - if buffer: - self.save_batch(batch=buffer, batch_id=batch_id) - logging.info(f"Saved final batch number {batch_id} with {len(buffer)} records") - - time_duration = (time.time() - time_start) / 60 - logging.info(f"Finished. Running scrape took {int(np.around(time_duration, 0))} minutes.") - - -if __name__ == "__main__": - from crawl import NoCrawler - from fetch import NoFetcher - from parse import EmptystringParser - - logging.basicConfig(level=logging.DEBUG) - - fetcher = NoFetcher() - crawler = NoCrawler() - htmlparser = EmptystringParser() - - scraper = Scraper( - crawler=crawler, - fetcher=fetcher, - htmlparser=htmlparser) - scraper.scrape() From 828077beb7432958f5ad50dfa505cec020a02ded Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 09:08:10 +0000 Subject: [PATCH 13/26] fix imports for -m flag running --- src/crawl/scrapymodules/HesitantSpider.py | 8 ++++---- src/main_scrapy.py | 4 ++-- src/parse/__init__.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/crawl/scrapymodules/HesitantSpider.py index f47011b..1dd92c3 100644 --- a/src/crawl/scrapymodules/HesitantSpider.py +++ b/src/crawl/scrapymodules/HesitantSpider.py @@ -11,9 +11,9 @@ from typing import List from urllib.parse import urljoin, urlparse -from parse import HTMLBodyParser -from util import normalize_url -from .ScrapyResult import ScrapyResult +from src.parse import HTMLBodyParser +from src.util import normalize_url +from src.crawl.scrapymodules import ScrapyResult class HesitantSpider(scrapy.Spider): @@ -385,7 +385,7 @@ def closed(self, reason): import os from datetime import datetime from scrapy.crawler import CrawlerProcess - from util import setup + from src.util import setup CONFIG = setup("config/config.yaml") diff --git a/src/main_scrapy.py b/src/main_scrapy.py index 701ffa8..7828725 100644 --- a/src/main_scrapy.py +++ b/src/main_scrapy.py @@ -11,8 +11,8 @@ from datetime import datetime from scrapy.crawler import CrawlerProcess -from crawl.scrapymodules import HesitantSpider -from util import setup, normalize_url +from src.crawl.scrapymodules import HesitantSpider +from src.util import setup, normalize_url CONFIG = setup("config/config.yaml") diff --git a/src/parse/__init__.py b/src/parse/__init__.py index f6b4b97..6dd5f0f 100644 --- a/src/parse/__init__.py +++ b/src/parse/__init__.py @@ -1 +1 @@ -from parse.HTML import IHTMLParser, HTMLBodyParser, EmptystringParser \ No newline at end of file +from .HTML import IHTMLParser, HTMLBodyParser, EmptystringParser \ No newline at end of file From bf1684d7d7718ed175b98daab63ac9bf2461b05e Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 09:33:08 +0000 Subject: [PATCH 14/26] move scrapy modules into scrape --- src/crawl/__init__.py | 0 src/main_scrapy.py | 2 +- src/{crawl/scrapymodules => scrape}/HesitantSpider.py | 4 ++-- src/{crawl/scrapymodules => scrape}/ScrapyCrawlMiddleware.py | 0 src/{crawl/scrapymodules => scrape}/ScrapyResult.py | 0 src/{crawl/scrapymodules => scrape}/__init__.py | 0 6 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 src/crawl/__init__.py rename src/{crawl/scrapymodules => scrape}/HesitantSpider.py (99%) rename src/{crawl/scrapymodules => scrape}/ScrapyCrawlMiddleware.py (100%) rename src/{crawl/scrapymodules => scrape}/ScrapyResult.py (100%) rename src/{crawl/scrapymodules => scrape}/__init__.py (100%) diff --git a/src/crawl/__init__.py b/src/crawl/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/main_scrapy.py b/src/main_scrapy.py index 7828725..5034fb4 100644 --- a/src/main_scrapy.py +++ b/src/main_scrapy.py @@ -11,7 +11,7 @@ from datetime import datetime from scrapy.crawler import CrawlerProcess -from src.crawl.scrapymodules import HesitantSpider +from src.scrape import HesitantSpider from src.util import setup, normalize_url CONFIG = setup("config/config.yaml") diff --git a/src/crawl/scrapymodules/HesitantSpider.py b/src/scrape/HesitantSpider.py similarity index 99% rename from src/crawl/scrapymodules/HesitantSpider.py rename to src/scrape/HesitantSpider.py index 1dd92c3..4e7a18b 100644 --- a/src/crawl/scrapymodules/HesitantSpider.py +++ b/src/scrape/HesitantSpider.py @@ -13,7 +13,7 @@ from src.parse import HTMLBodyParser from src.util import normalize_url -from src.crawl.scrapymodules import ScrapyResult +from . import ScrapyResult class HesitantSpider(scrapy.Spider): @@ -402,7 +402,7 @@ def closed(self, reason): "ROBOTSTXT_OBEY": True, "LOG_FILE": logfile, "DOWNLOADER_MIDDLEWARES": { - "src.crawl.scrapymodules.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority + "src.scrape.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority }, "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"] # TODO can be removed? } diff --git a/src/crawl/scrapymodules/ScrapyCrawlMiddleware.py b/src/scrape/ScrapyCrawlMiddleware.py similarity index 100% rename from src/crawl/scrapymodules/ScrapyCrawlMiddleware.py rename to src/scrape/ScrapyCrawlMiddleware.py diff --git a/src/crawl/scrapymodules/ScrapyResult.py b/src/scrape/ScrapyResult.py similarity index 100% rename from src/crawl/scrapymodules/ScrapyResult.py rename to src/scrape/ScrapyResult.py diff --git a/src/crawl/scrapymodules/__init__.py b/src/scrape/__init__.py similarity index 100% rename from src/crawl/scrapymodules/__init__.py rename to src/scrape/__init__.py From 8e8703d9940270f8cabecfca85a847bd0d6ad165 Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 09:40:58 +0000 Subject: [PATCH 15/26] move schema parser to parse module --- src/parse/Schema.py | 61 ++++++++++++++++++++++++++++++++++++ src/parse/__init__.py | 3 +- src/scrape/HesitantSpider.py | 22 ++++--------- 3 files changed, 69 insertions(+), 17 deletions(-) diff --git a/src/parse/Schema.py b/src/parse/Schema.py index e69de29..23394e2 100644 --- a/src/parse/Schema.py +++ b/src/parse/Schema.py @@ -0,0 +1,61 @@ +import logging +import json +from abc import ABC, abstractmethod +from typing import List + +from scrapy.http import Response + + +class ISchemaParser(ABC): + """ + Interface class for Schema parser + """ + + @abstractmethod + def parse(self, titles: List[str]) -> str: + raise NotImplementedError("Do not call abstract base class.") + + +class SchemaParser(ISchemaParser): + """ + Parser for detecting specified schema entities within response + """ + def __init__(self, schema_keywords: List[str]): + self.schema_keywords = schema_keywords + logging.info(f"Initializing SchemaParser to detect entities of any of types: {self.schema_keywords}") + + def parse(self, response: Response) -> List[str]: + """ + returns the types that were found of the allowed type + """ + results = [] + jsonlds = response.xpath("//script[@type='application/ld+json']/text()").getall() + if jsonlds: + for jsonld in jsonlds: + try: + data = json.loads(jsonld) + if "@type" in data.keys() and data["@type"] in self.schema_keywords: + logging.debug(f"Found schema entity {data["@type"]} that is within schema keywords: {self.schema_keywords}") + results.append(data["@type"]) + except json.JSONDecodeError: + pass + return results + + +if __name__ == "__main__": + from scrapy.http import TextResponse + + html = """ + + + + + + """ + response = TextResponse(url='http://example.com', body=html.encode('utf-8')) + + parser = SchemaParser(schema_keywords=['Article']) + for found_type in parser.parse(response=response): + print(found_type) diff --git a/src/parse/__init__.py b/src/parse/__init__.py index 6dd5f0f..8fae1c5 100644 --- a/src/parse/__init__.py +++ b/src/parse/__init__.py @@ -1 +1,2 @@ -from .HTML import IHTMLParser, HTMLBodyParser, EmptystringParser \ No newline at end of file +from .HTML import IHTMLParser, HTMLBodyParser, EmptystringParser +from .Schema import SchemaParser \ No newline at end of file diff --git a/src/scrape/HesitantSpider.py b/src/scrape/HesitantSpider.py index 4e7a18b..eec8886 100644 --- a/src/scrape/HesitantSpider.py +++ b/src/scrape/HesitantSpider.py @@ -11,7 +11,7 @@ from typing import List from urllib.parse import urljoin, urlparse -from src.parse import HTMLBodyParser +from src.parse import HTMLBodyParser, SchemaParser from src.util import normalize_url from . import ScrapyResult @@ -71,8 +71,6 @@ def __init__( self.logger.debug(f"Init allowed languages: {self.allowed_languages}") self.allowed_countries = allowed_countries self.logger.debug(f"Init allowed countries: {self.allowed_countries}") - self.schema_keywords = schema_keywords - self.logger.debug(f"Init schema keywords: {self.schema_keywords}") self.max_jumps = max_jumps self.logger.debug(f"Init max_jumps: {self.max_jumps}") self.output_file = output_file @@ -105,6 +103,10 @@ def __init__( ".dmg?download=true") self.logger.debug(f"URLs will be excluded if they contain any in path:{', '.join(self._unsupported)}") + # Set schema parser + self._schemaparser = SchemaParser(schema_keywords=schema_keywords) + self.logger.debug(f"Init schemaparser with keywords: {schema_keywords}") + # Init batch, results, visited self.batch = [] self.results = [] @@ -298,19 +300,7 @@ def parse(self, response): # Process the current page if url_is_targeted: # Determine schema.org indicator - schema_indicator = False - - # Get JSON-LD elements - jsonlds = response.xpath("//script[@type='application/ld+json']/text()").getall() - if jsonlds: - for jsonld in jsonlds: - try: - data = json.loads(jsonld) - if "@type" in data.keys() and data["@type"] in self.schema_keywords: - self.logger.debug(f"Found schema entity {data["@type"]} that is within schema keywords: {self.schema_keywords}") - schema_indicator = True - except json.JSONDecodeError: - pass + schema_indicator = True if self._schemaparser.parse(response=response) else False # Add result to batch result = ScrapyResult( From 1a3dcfaf1adb78f930918384eaaf673c3f16ddca Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 09:42:18 +0000 Subject: [PATCH 16/26] update normalize_url documentation --- src/util/urls.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/util/urls.py b/src/util/urls.py index 62cb112..226d1e7 100644 --- a/src/util/urls.py +++ b/src/util/urls.py @@ -2,9 +2,13 @@ import re -# Normalize URL to make sure crawler can handle it without issue -def normalize_url(url): - # Handle case where there is no scheme at all +def normalize_url(url: str): + """ + Normalize URL to make sure crawler can handle it without issue + :param url: url to normalize + """ + + # Handle case where there is no scheme at all if not re.match(r'^[a-zA-Z]+://', url): url = 'https://' + url From 6cedb3d797264bb72b05ad4c739f7c5267a3da56 Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 09:42:51 +0000 Subject: [PATCH 17/26] remove unused lines --- src/util/setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/util/setup.py b/src/util/setup.py index c51f470..a81ae93 100644 --- a/src/util/setup.py +++ b/src/util/setup.py @@ -4,8 +4,4 @@ def setup(config_path): # Read config object config = OmegaConf.load(config_path) - - # Set OS environment variables or other stuff... - # ... - return config From 7132ee45180d95c10a60a9d18cac6b2429cfc917 Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 09:47:14 +0000 Subject: [PATCH 18/26] rename main scripts --- src/{main_scrapy.py => main.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/{main_scrapy.py => main.py} (100%) diff --git a/src/main_scrapy.py b/src/main.py similarity index 100% rename from src/main_scrapy.py rename to src/main.py From 9f58b1187f1e7d8e00d9e132d402e319f32341d9 Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Thu, 25 Jun 2026 09:48:11 +0000 Subject: [PATCH 19/26] update requirements.txt --- requirements.txt | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/requirements.txt b/requirements.txt index 83eb19f..e8be2d9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,28 +9,35 @@ antlr4-python3-runtime==4.9.3 anyio==4.10.0 asttokens==3.0.0 attrs==25.3.0 +Automat==25.4.16 beautifulsoup4==4.13.4 blinker==1.9.0 botocore==1.39.11 cachetools==5.5.2 certifi==2025.8.3 +cffi==2.0.0 chardet==5.2.0 charset-normalizer==3.4.3 click==8.2.1 cloudpickle==3.1.1 comm==0.2.3 +constantly==23.10.4 contourpy==1.3.3 +cryptography==49.0.0 cssselect==1.3.0 cycler==0.12.1 dask==2025.7.0 databricks-sdk==0.62.0 debugpy==1.8.16 decorator==5.2.1 +defusedxml==0.7.1 docker==7.1.0 duckdb==1.3.2 executing==2.2.0 +extruct==0.18.0 fastapi==0.116.1 fastjsonschema==2.21.1 +filelock==3.29.4 Flask==3.1.1 fonttools==4.59.0 frozendict==2.4.7 @@ -47,11 +54,17 @@ graphql-relay==3.2.0 greenlet==3.2.4 gunicorn==23.0.0 h11==0.16.0 +html-text==0.7.1 +html5lib==1.1 +hyperlink==21.0.0 idna==3.10 importlib_metadata==8.7.0 +Incremental==24.11.0 ipykernel==6.30.1 ipython==9.4.0 ipython_pygments_lexers==1.1.1 +itemadapter==0.13.1 +itemloaders==1.4.0 itsdangerous==2.2.0 jedi==0.19.2 Jinja2==3.1.6 @@ -59,6 +72,7 @@ jmespath==1.0.1 joblib==1.5.1 jsonschema==4.25.0 jsonschema-specifications==2025.4.1 +jstyleson==0.0.2 jupyter_client==8.6.3 jupyter_core==5.8.1 jusText==3.0.2 @@ -73,6 +87,7 @@ MarkupSafe==3.0.2 matplotlib==3.10.5 matplotlib-inline==0.1.7 mdurl==0.1.2 +mf2py==2.0.1 mlflow==3.2.0 mlflow-skinny==3.2.0 mlflow-tracing==3.2.0 @@ -90,6 +105,7 @@ opentelemetry-sdk==1.36.0 opentelemetry-semantic-conventions==0.57b0 packaging==25.0 pandas==2.3.1 +parsel==1.11.0 parso==0.8.4 partd==1.4.2 patsy==1.0.1 @@ -101,6 +117,7 @@ plotly==6.2.0 polars==1.32.2 prompt_toolkit==3.0.51 propcache==0.3.2 +Protego==0.6.1 protobuf==6.31.1 psutil==7.0.0 ptyprocess==0.7.0 @@ -108,28 +125,37 @@ pure_eval==0.2.3 pyarrow==21.0.0 pyasn1==0.6.1 pyasn1_modules==0.4.2 +pycparser==3.0 pydantic==2.11.7 pydantic_core==2.33.2 +PyDispatcher==2.0.7 pyee==13.0.0 Pygments==2.19.2 pyogrio==0.11.1 +pyOpenSSL==26.3.0 pyparsing==3.2.3 pyproj==3.7.1 +pyrdfa3==3.6.5 python-dateutil==2.9.0.post0 pytz==2025.2 PyYAML==6.0.2 pyzmq==27.0.1 +queuelib==1.9.0 +rdflib==7.6.0 readability-lxml==0.8.4.1 referencing==0.36.2 regex==2025.7.34 requests==2.32.4 +requests-file==3.0.1 rich==14.2.0 rpds-py==0.27.0 rsa==4.9.1 s3fs==2025.7.0 scikit-learn==1.7.1 scipy==1.16.1 +Scrapy==2.16.0 seaborn==0.13.2 +service-identity==26.1.0 setuptools==80.9.0 shapely==2.1.1 sitemap==20191121 @@ -143,10 +169,12 @@ stack-data==0.6.3 starlette==0.47.2 statsmodels==0.14.5 threadpoolctl==3.6.0 +tldextract==5.3.1 toolz==1.0.0 tornado==6.5.2 tqdm==4.67.1 traitlets==5.14.3 +Twisted==26.4.0 typing-inspection==0.4.1 typing_extensions==4.14.1 tzdata==2025.2 @@ -154,10 +182,14 @@ ultimate-sitemap-parser==1.6.0 urllib3==2.5.0 uv==0.8.8 uvicorn==0.35.0 +validators==0.35.0 +w3lib==2.4.1 wcwidth==0.2.13 +webencodings==0.5.1 Werkzeug==3.1.3 wheel==0.45.1 wrapt==1.17.2 xgboost==3.0.3 yarl==1.20.1 zipp==3.23.0 +zope.interface==8.5 From b538f0fa806b6054ab44d1c653f7521c0ba7a67a Mon Sep 17 00:00:00 2001 From: dominikblatt <84blatt@gmail.com> Date: Tue, 30 Jun 2026 07:35:49 +0000 Subject: [PATCH 20/26] bug fix: adjust middlewares path to refactored structure --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index 5034fb4..91bcbf7 100644 --- a/src/main.py +++ b/src/main.py @@ -51,7 +51,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro "ROBOTSTXT_OBEY": True, "LOG_FILE": logfile, "DOWNLOADER_MIDDLEWARES": { - "src.crawl.scrapymodules.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority + "src.scrape.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority }, "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"] # TODO can be removed? } From 156fca7062ecdbfb1c5ec777b63e72dbfae2c3c7 Mon Sep 17 00:00:00 2001 From: lhaarman Date: Mon, 29 Jun 2026 13:44:57 +0000 Subject: [PATCH 21/26] Upgrade scrape to support dynamic content and more sitemaps --- .gitignore | 6 +- src/fetch/PlaywrightText.py | 125 +++++++++++++++++++++++++++++++++++ src/fetch/__init__.py | 4 ++ src/scrape/HesitantSpider.py | 107 +++++++++++++++++++++--------- 4 files changed, 211 insertions(+), 31 deletions(-) create mode 100644 src/fetch/PlaywrightText.py create mode 100644 src/fetch/__init__.py diff --git a/.gitignore b/.gitignore index d2bfd51..e1f738e 100644 --- a/.gitignore +++ b/.gitignore @@ -214,4 +214,8 @@ config/config.yaml # input and output files need to be explicitly added input -output \ No newline at end of file +output + +# (Debug) Output file types +*.txt +*.html diff --git a/src/fetch/PlaywrightText.py b/src/fetch/PlaywrightText.py new file mode 100644 index 0000000..1c1c5e6 --- /dev/null +++ b/src/fetch/PlaywrightText.py @@ -0,0 +1,125 @@ +import asyncio +import random +import logging +from typing import Tuple, Union + +from playwright.async_api import async_playwright, Error as PlaywrightError +from .Robots import RobotsFetcher + + +class PlaywrightTextFetcher: + """ + Playwright Text Fetcher + Uses Playwright to load a page until DOM content is loaded, + waits for a random delay, and then extracts the inner text of the body. + """ + def __init__( + self, + user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + max_retries=3, + wait_time=5): + logging.debug("Initializing PlaywrightTextFetcher") + self.user_agent = user_agent + + self.max_retries = max_retries + self.wait_time = wait_time + + # Domain will have to be identified for any given url to fetch, then the corresponding robots file will be checked + self.robotsfetcher = RobotsFetcher(user_agent=user_agent) + self._robots_bydomain = self.robotsfetcher.get_results() + + logging.info("Launching new Chromium browser instance...") + self.setup_playwright = False + + async def setup_playwright_browser(self): + self._playwright = await async_playwright().start() + self._browser = await self._playwright.chromium.launch(headless=True) + self.setup_playwright = True + + async def close(self): + """Properly shuts down the browser and playwright.""" + if self._browser: + await self._browser.close() + self._browser = None + if self._playwright: + await self._playwright.stop() + self._playwright = None + logging.info("Playwright browser closed.") + + async def fetch(self, url: str) -> Union[Tuple[str, bool], dict]: + """ + Asynchronous fetch using Playwright. + Returns a tuple (text_content, schema_indicator) or an empty dict on failure. + """ + logging.info(f"Trying to fetch the next URL with Playwright: {url}") + if not self.setup_playwright: + await self.setup_playwright_browser() + + return await self._fetch_with_retries(url) + + async def _fetch_with_retries(self, url: str, retries: int = 0): + # Create a new context (incognito-like) for every request for isolation + context = await self._browser.new_context(user_agent=self.user_agent) + page = await context.new_page() + + try: + logging.debug(f"Navigating to {url}...") + # Use a timeout to prevent a single slow page from hanging the whole worker + await page.goto(url, wait_until="domcontentloaded", timeout=30000) + + await asyncio.sleep(self.wait_time) + + text_content = await self._extract_clean_text(page) + return text_content + + except (PlaywrightError, Exception) as e: + logging.error(f"Playwright request failed for {url}. Error: {e}") + + if retries < self.max_retries: + wait_time = random.uniform(1, 5) + logging.info(f"Retrying in {wait_time:.2f} seconds...") + await asyncio.sleep(wait_time) + # We don't need to pass context here, the next retry will create its own + return await self._fetch_with_retries(url, retries + 1) + + return {} + finally: + # ALWAYS close the context and page to free up memory, + # even if the request fails or succeeds. + await page.close() + await context.close() + + async def _extract_clean_text(self, page) -> str: + # ... (rest of your existing _extract_clean_text code remains the same) + noise_selectors = ["nav", "footer", "header", "aside"] + for selector in noise_selectors: + try: + await page.evaluate(f'document.querySelectorAll("{selector}").forEach(el => el.remove())') + except Exception: + pass + + text = await page.inner_text("body") + lines = [line.strip() for line in text.splitlines() if line.strip()] + return "\n".join(lines) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.DEBUG) + + async def main(): + user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + fetcher = PlaywrightTextFetcher(user_agent=user_agent) + + urls = [ + "https://example.com", + "https://books.toscrape.com" + ] + + for url in urls: + await fetcher.fetch(url) + + for url, content in fetcher.get_results().items(): + print(f"\nURL: {url}") + print(f"...{content[:100]}...\n\n") + + asyncio.run(main()) \ No newline at end of file diff --git a/src/fetch/__init__.py b/src/fetch/__init__.py new file mode 100644 index 0000000..bddcf5a --- /dev/null +++ b/src/fetch/__init__.py @@ -0,0 +1,4 @@ +from fetch.base import IFetcher, NoFetcher +from fetch.Robots import RobotsFetcher +from fetch.HTML import HTMLFetcher +from fetch.PlaywrightText import PlaywrightTextFetcher \ No newline at end of file diff --git a/src/scrape/HesitantSpider.py b/src/scrape/HesitantSpider.py index eec8886..0908453 100644 --- a/src/scrape/HesitantSpider.py +++ b/src/scrape/HesitantSpider.py @@ -12,8 +12,9 @@ from urllib.parse import urljoin, urlparse from src.parse import HTMLBodyParser, SchemaParser +from src.fetch import PlaywrightTextFetcher +from src.scrape import ScrapyResult from src.util import normalize_url -from . import ScrapyResult class HesitantSpider(scrapy.Spider): @@ -23,10 +24,13 @@ class HesitantSpider(scrapy.Spider): custom_settings = { "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "AUTOTHROTTLE_ENABLED": True, # Auto throttle to maximize speed without risking blocks - "AUTOTHROTTLE_START_DELAY": 1.0, # Start slow to "warm up" - "AUTOTHROTTLE_MAX_DELAY": 30.0, # Never wait more than 10s + "AUTOTHROTTLE_START_DELAY": 5.0, # Start slow to "warm up" + "AUTOTHROTTLE_MAX_DELAY": 10.0, # Never wait more than 10s "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0, # Aim for 1 request per worker at a time + "CONCURRENT_REQUESTS": 4,# Allow more concurrent requests within the single process "DOWNLOAD_DELAY": 0, # Let Autothrottle handle the delay + "DOWNLOAD_TIMEOUT": 5, # CRITICAL: Fail fast (5s) if the site is dead + "RETRY_TIMES": 1, } def __init__( @@ -86,6 +90,7 @@ def __init__( # Set parser and unsupported endpoints self._htmlparser = HTMLBodyParser() + self._fetcher = PlaywrightTextFetcher() self._unsupported = ( ".ics", ".mng", ".pct", ".bmp", ".gif", ".jpg", ".jpeg", ".png", ".pst", ".psp", ".tif", ".tiff", ".drw", ".dxf", ".eps", ".woff2", ".svg", ".mp3", ".wma", ".ogg", ".wav", ".ra", ".aac", ".mid", ".aiff", ".3gp", ".asf", ".asx", ".avi", ".mp4", @@ -111,6 +116,7 @@ def __init__( self.batch = [] self.results = [] self.visited = set() + self.sitemaps_crawled = set() if max_depth < 0: self.logger.debug("Only urls from starting_url can be found, max_depth < 0") @@ -133,6 +139,7 @@ async def start(self): ) # next, if desired, check the sitemapurls to augment existing results parsed_url = urlparse(start_url) + self.sitemaps_crawled.add(parsed_url.netloc) for sitemap in self.sitemaps_tocheck: url = f"{parsed_url.scheme}://{parsed_url.netloc}/{sitemap}" yield scrapy.Request( @@ -262,7 +269,7 @@ def skip_this_url(self, url: str) -> bool: return False # Process request response - def parse(self, response): + async def parse(self, response): # Check if we passed timeout if time.time() - self.start_time > self.timeout: print(f"Hit timeout {self.timeout} seconds for spider with start urls: {self.start_urls}!") @@ -297,30 +304,27 @@ def parse(self, response): self.logger.debug(f"Parsing url: {response.url}, targeted: {url_is_targeted}, depth: {current_depth}, jumps: {jumps}") self.visited.add(response.url) - # Process the current page - if url_is_targeted: - # Determine schema.org indicator - schema_indicator = True if self._schemaparser.parse(response=response) else False - - # Add result to batch - result = ScrapyResult( - base_url=str(response.meta.get("base_url")), - url=response.url, - status=response.status, - first_keyword_hit=first_keyword_hit, - content=self._htmlparser.parse(html=response.text), - crawl_depth=current_depth, - schema_indicator=schema_indicator - ) - - self.batch.append(result) + # Add sitemap discovery + if parsed_url.netloc.lower() not in self.sitemaps_crawled: + self.sitemaps_crawled.add(parsed_url.netloc.lower()) + self.logger.debug(f"New domain detected: {parsed_url.netloc.lower()}. Checking for sitemaps...") - # Save batch if exceeding batch size - if len(self.batch) >= self.batch_size: - self.save_batch() + for sitemap_path in self.sitemaps_tocheck: + # Construct the sitemap URL + sitemap_url = urljoin(f"{parsed_url.scheme}://{parsed_url.netloc}/", sitemap_path) - # Reset current depth because we found target at current page - steps_from_target = 0 + # YIELD the request so Scrapy handles it + yield scrapy.Request( + url=sitemap_url, + callback=self.parse_sitemap, + meta={ + "base_url": response.meta.get("base_url"), + "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", + "depth": current_depth, + "steps_from_target": steps_from_target, + "jumps": jumps + } + ) # Extract and follow links for link in response.css("a::attr(href)").getall(): @@ -342,20 +346,62 @@ def parse(self, response): }, dont_filter=False # Skip duplicates ) + + # Process the current page + if url_is_targeted: + self.logger.debug(f"Found targeted url: {response.url} from base url {response.meta.get("base_url")}") + # Determine schema.org indicator + schema_indicator = True if self._schemaparser.parse(response=response) else False + + # Add result to batch + result = ScrapyResult( + base_url=str(response.meta.get("base_url")), + url=response.url, + status=response.status, + first_keyword_hit=first_keyword_hit, + content= await self._fetcher.fetch(response.url), + crawl_depth=current_depth, + schema_indicator=schema_indicator + ) + + self.batch.append(result) + + # Save batch if exceeding batch size + if len(self.batch) >= self.batch_size: + self.save_batch() + + # Reset current depth because we found target at current page + steps_from_target = 0 def parse_sitemap(self, response): # Extract all URLs from the sitemap, accounting for namespace ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'} - urls = response.xpath('//ns:url/ns:loc/text()', namespaces=ns).getall() + + # Look for both (standard) and (index) + urls = response.xpath('//ns:url/ns:loc/text() | //ns:sitemap/ns:loc/text()', namespaces=ns).getall() for url in urls: url = normalize_url(url) # Only continue with valid crawl paths if self.skip_this_url(url): continue - parsed_url = urlparse(url) - yield scrapy.Request( + + # Check if the discovered URL is itself a sitemap (to allow recursive discovery) + # If it ends in .xml, we should probably call parse_sitemap again + if url.endswith('.xml'): + yield scrapy.Request( + url=url, + callback=self.parse_sitemap, + meta={ + "base_url": response.meta.get("base_url"), + "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", + "depth": response.meta.get("depth", 0) + 1 + } + ) + else: + # Otherwise, it's a regular page + yield scrapy.Request( url=url, callback=self.parse, meta={ @@ -366,8 +412,9 @@ def parse_sitemap(self, response): ) # Called when the spider closes cleanly - def closed(self, reason): + async def closed(self, reason): self.save_batch() + await self._fetcher.close() print(f"Spider closed because of: {reason}. Total collected pages: {len(self.results)}") From 89195e3f32acfaa6b2abdbccc69c7e4418a6d9a9 Mon Sep 17 00:00:00 2001 From: lhaarman Date: Tue, 30 Jun 2026 08:35:36 +0000 Subject: [PATCH 22/26] Integrate changes to scrape with refactor --- src/fetch/Robots.py | 70 ++++++++++++++++++++++++++++++++++++ src/fetch/__init__.py | 7 ++-- src/fetch/base.py | 65 +++++++++++++++++++++++++++++++++ src/main.py | 5 +-- src/scrape/HesitantSpider.py | 7 ++-- 5 files changed, 145 insertions(+), 9 deletions(-) create mode 100644 src/fetch/Robots.py create mode 100644 src/fetch/base.py diff --git a/src/fetch/Robots.py b/src/fetch/Robots.py new file mode 100644 index 0000000..bebb3af --- /dev/null +++ b/src/fetch/Robots.py @@ -0,0 +1,70 @@ +from typing import Dict, List +import logging +from urllib.robotparser import RobotFileParser +from usp.tree import sitemap_tree_for_homepage + +from src.util import setup +from .base import IFetcher + +CONFIG = setup("config/config.yaml") + + +class RobotsFetcher(IFetcher): + """ + Robots Fetcher for accessing information in robots file + """ + def __init__( + self, + user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"): + logging.info("Initializing RobotsFetcher") + super(RobotsFetcher, self).__init__(user_agent=user_agent) + + # keep track of domains for which the robots file has already been fetched + self.results = dict() + + def fetch(self, domain: str) -> RobotFileParser: + """Fetches robots file for given url domain, if not already done""" + + # only download robots in case it hasn't already + self.results.setdefault(domain, RobotFileParser(url=f"https://{domain}/robots.txt")) + return self.results[domain] + + def get_results(self) -> Dict[str, RobotFileParser]: + """ + Returns the dictionary of fetched URLs and their HTML content. + """ + return self.results + + def get_sitemap_urls(self, domain: str) -> List[str]: + """Get a list of sitemaps listed on robots.txt""" + try: + tree = sitemap_tree_for_homepage(f"https://{domain}", use_robots=True, use_known_paths=False) + sitemap_urls = [page.url for page in tree.all_pages()] + logging.debug(f"Found {len(sitemap_urls)} sitemap_urls for domain {domain}") + return sitemap_urls + except Exception as e: + logging.warning(f"Could not fetch sitemap_urls for domain {domain}: {e}") + return [] + + +if __name__ == "__main__": + from urllib.parse import urlparse + + logging.basicConfig(level=logging.DEBUG) + + fetcher = RobotsFetcher() + + urls = ["https://books.toscrape.com", + "https://cbs.nl"] + + domains = [urlparse(url=url).netloc for url in urls] + + for domain in domains: + fetcher.fetch(domain) + + for domain, robotsobject in fetcher.get_results().items(): + print(f"Domain: {domain}") + print(robotsobject.path) + + for domain in domains: + sitemaps = fetcher.get_sitemap_urls(domain=domain) \ No newline at end of file diff --git a/src/fetch/__init__.py b/src/fetch/__init__.py index bddcf5a..c3656b7 100644 --- a/src/fetch/__init__.py +++ b/src/fetch/__init__.py @@ -1,4 +1,3 @@ -from fetch.base import IFetcher, NoFetcher -from fetch.Robots import RobotsFetcher -from fetch.HTML import HTMLFetcher -from fetch.PlaywrightText import PlaywrightTextFetcher \ No newline at end of file +from .base import IFetcher, NoFetcher +from .Robots import RobotsFetcher +from .PlaywrightText import PlaywrightTextFetcher diff --git a/src/fetch/base.py b/src/fetch/base.py new file mode 100644 index 0000000..fa68037 --- /dev/null +++ b/src/fetch/base.py @@ -0,0 +1,65 @@ +import logging +from abc import ABC, abstractmethod +from typing import Dict + + +class IFetcher(ABC): + """ + interface for all fetchers + """ + def __init__( + self, + user_agent: str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"): + self.user_agent = user_agent + self.results = {} # {url: html_content} + + @abstractmethod + def fetch(self, url: str): + """Fetches content for given url""" + raise NotImplementedError() + + @abstractmethod + def get_results(self) -> Dict: + """Returns the dictionary of fetched URLs and their content""" + return NotImplementedError() + + +class NoFetcher(IFetcher): + """ + Do nothing Fetcher for testing, returns a minimal html doc + """ + def __init__(self): + logging.info("Initializing NoFetcher, returns a minimal default html") + super(NoFetcher, self).__init__() + + self._default_html = (""" + + +Hello +""") + + def fetch(self, url: str) -> str: + """Fetches default minimal html""" + self.results[url] = self._default_html + return self.results[url] + + def get_results(self) -> Dict[str, str]: + """ + Returns the dictionary of fetched URLs and their HTML content. + """ + return self.results + + +if __name__ == "__main__": + + logging.basicConfig(level=logging.DEBUG) + + fetcher = NoFetcher() + + urls = ["https://books.toscrape.com"] + for url in urls: + fetcher.fetch(url) + + for url, html in fetcher.get_results().items(): + print(f"\nURL: {url}") + print(f"...{html[:100]}...\n\n") \ No newline at end of file diff --git a/src/main.py b/src/main.py index 91bcbf7..b9b5627 100644 --- a/src/main.py +++ b/src/main.py @@ -40,7 +40,6 @@ def read_parquet_dir(parquet_dir): # Spawn spider crawler process def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, process_id, log_level, logfile, output_file, schema_keywords): print(f"Args: urls: {urls}, netloc keywords: {netloc_keywords}, path keywords: {path_keywords}, skip domains: {skip_domains}, log level: {log_level}, log file: {logfile}, output file: {output_file}, process_id: {process_id}") - print(f"Starting crawling process (PID: {process_id}, OSPID: {os.getpid()}) for {urls}!") project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) if project_root not in sys.path: sys.path.insert(0, project_root) @@ -53,7 +52,8 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro "DOWNLOADER_MIDDLEWARES": { "src.scrape.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority }, - "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"] # TODO can be removed? + "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"], + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", } ) @@ -106,6 +106,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro return [] try: + print(f"Starting crawling process (PID: {process_id}, OSPID: {os.getpid()}) for {urls}!") process.start() except Exception as e: print(f"Something went from starting process! Error {e}") diff --git a/src/scrape/HesitantSpider.py b/src/scrape/HesitantSpider.py index 0908453..a5d7175 100644 --- a/src/scrape/HesitantSpider.py +++ b/src/scrape/HesitantSpider.py @@ -12,8 +12,8 @@ from urllib.parse import urljoin, urlparse from src.parse import HTMLBodyParser, SchemaParser -from src.fetch import PlaywrightTextFetcher -from src.scrape import ScrapyResult +from src.fetch import PlaywrightTextFetcher +from src.scrape.ScrapyResult import ScrapyResult from src.util import normalize_url @@ -352,6 +352,7 @@ async def parse(self, response): self.logger.debug(f"Found targeted url: {response.url} from base url {response.meta.get("base_url")}") # Determine schema.org indicator schema_indicator = True if self._schemaparser.parse(response=response) else False + self.logger.debug(f"Schema indicator: {schema_indicator}") # Add result to batch result = ScrapyResult( @@ -359,7 +360,7 @@ async def parse(self, response): url=response.url, status=response.status, first_keyword_hit=first_keyword_hit, - content= await self._fetcher.fetch(response.url), + content=await self._fetcher.fetch(response.url), crawl_depth=current_depth, schema_indicator=schema_indicator ) From 71886664223c7593911e5dc10b52875e88ad5330 Mon Sep 17 00:00:00 2001 From: lhaarman Date: Tue, 30 Jun 2026 08:55:04 +0000 Subject: [PATCH 23/26] extend logging --- src/main.py | 3 ++- src/scrape/HesitantSpider.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index b9b5627..63b5a23 100644 --- a/src/main.py +++ b/src/main.py @@ -120,7 +120,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro # Set logging level and create file # All workers write to same log - logging_level = logging.DEBUG + logging_level = logging.INFO dir_log = f"{CONFIG.output.output_dir}/{CONFIG.output.logs}" if not os.path.exists(dir_log): @@ -162,6 +162,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro # Set amount of parallel workers and prepare chunk-wisem parallel execution max_workers = 16 num_workers = min([len(urls), max_workers]) + logging.info(f"Will use {num_workers} workers!") batch_size = len(urls) // num_workers if len(urls) > num_workers else 1 url_chunks = np.array_split(urls, num_workers) diff --git a/src/scrape/HesitantSpider.py b/src/scrape/HesitantSpider.py index a5d7175..7b6200c 100644 --- a/src/scrape/HesitantSpider.py +++ b/src/scrape/HesitantSpider.py @@ -301,7 +301,7 @@ async def parse(self, response): return # Process response if above skip-conditions not met - self.logger.debug(f"Parsing url: {response.url}, targeted: {url_is_targeted}, depth: {current_depth}, jumps: {jumps}") + self.logger.debug(f"Parsing url: {response.url}, targeted: {url_is_targeted}, depth: {current_depth}, steps from target: {steps_from_target}, jumps: {jumps}") self.visited.add(response.url) # Add sitemap discovery From c8866aad3f316297a475bba76117a9917831045e Mon Sep 17 00:00:00 2001 From: lhaarman Date: Tue, 30 Jun 2026 14:32:05 +0000 Subject: [PATCH 24/26] small changes/fixes --- README.md | 2 + requirements.txt | 214 +++++++---------------------------- src/fetch/PlaywrightText.py | 2 +- src/main.py | 17 ++- src/scrape/HesitantSpider.py | 1 - 5 files changed, 52 insertions(+), 184 deletions(-) diff --git a/README.md b/README.md index 4849ccf..70c9510 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,8 @@ More info on statistical scraping [here](https://github.com/SNStatComp/SSIG) # Getting started - Install all required packages using > pip install -r requirements.txt + > playwright install + > playwright install-deps - Activate the environment - run the following command to install modules in src as packages for proper import > pip install -e . diff --git a/requirements.txt b/requirements.txt index e8be2d9..4817468 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,195 +1,57 @@ -aiobotocore==2.24.0 -aiohappyeyeballs==2.6.1 -aiohttp==3.12.15 -aioitertools==0.12.0 -aiosignal==1.4.0 -alembic==1.16.4 -annotated-types==0.7.0 antlr4-python3-runtime==4.9.3 -anyio==4.10.0 -asttokens==3.0.0 -attrs==25.3.0 -Automat==25.4.16 -beautifulsoup4==4.13.4 -blinker==1.9.0 -botocore==1.39.11 -cachetools==5.5.2 -certifi==2025.8.3 +attrs==26.1.0 +automat==25.4.16 +beautifulsoup4==4.15.0 +bs4==0.0.2 +build==1.5.0 +certifi==2026.6.17 cffi==2.0.0 -chardet==5.2.0 -charset-normalizer==3.4.3 -click==8.2.1 -cloudpickle==3.1.1 -comm==0.2.3 +charset-normalizer==3.4.7 +click==8.4.2 constantly==23.10.4 -contourpy==1.3.3 cryptography==49.0.0 -cssselect==1.3.0 -cycler==0.12.1 -dask==2025.7.0 -databricks-sdk==0.62.0 -debugpy==1.8.16 -decorator==5.2.1 +cssselect==1.4.0 defusedxml==0.7.1 -docker==7.1.0 -duckdb==1.3.2 -executing==2.2.0 -extruct==0.18.0 -fastapi==0.116.1 -fastjsonschema==2.21.1 filelock==3.29.4 -Flask==3.1.1 -fonttools==4.59.0 -frozendict==2.4.7 -frozenlist==1.7.0 -fsspec==2025.7.0 -GDAL==3.8.4 -geopandas==1.1.1 -gitdb==4.0.12 -GitPython==3.1.45 -google-auth==2.40.3 -graphene==3.4.3 -graphql-core==3.2.6 -graphql-relay==3.2.0 -greenlet==3.2.4 -gunicorn==23.0.0 -h11==0.16.0 -html-text==0.7.1 -html5lib==1.1 +greenlet==3.5.3 hyperlink==21.0.0 -idna==3.10 -importlib_metadata==8.7.0 -Incremental==24.11.0 -ipykernel==6.30.1 -ipython==9.4.0 -ipython_pygments_lexers==1.1.1 +idna==3.18 +incremental==24.11.0 itemadapter==0.13.1 itemloaders==1.4.0 -itsdangerous==2.2.0 -jedi==0.19.2 -Jinja2==3.1.6 -jmespath==1.0.1 -joblib==1.5.1 -jsonschema==4.25.0 -jsonschema-specifications==2025.4.1 -jstyleson==0.0.2 -jupyter_client==8.6.3 -jupyter_core==5.8.1 -jusText==3.0.2 -kiwisolver==1.4.9 -langdetect==1.0.9 -locket==1.0.0 -lxml==6.0.2 -lxml_html_clean==0.4.3 -Mako==1.3.10 -markdown-it-py==4.0.0 -MarkupSafe==3.0.2 -matplotlib==3.10.5 -matplotlib-inline==0.1.7 -mdurl==0.1.2 -mf2py==2.0.1 -mlflow==3.2.0 -mlflow-skinny==3.2.0 -mlflow-tracing==3.2.0 -multidict==6.6.3 -narwhals==2.0.1 -nbclient==0.10.2 -nbformat==5.10.4 -nest-asyncio==1.6.0 -nltk==3.9.1 -numpy==2.3.2 -nvidia-nccl-cu12==2.27.7 -omegaconf==2.3.0 -opentelemetry-api==1.36.0 -opentelemetry-sdk==1.36.0 -opentelemetry-semantic-conventions==0.57b0 -packaging==25.0 -pandas==2.3.1 +jmespath==1.1.0 +lxml==6.1.1 +numpy==2.5.0 +omegaconf==2.3.1 +packaging==26.2 +pandas==3.0.3 parsel==1.11.0 -parso==0.8.4 -partd==1.4.2 -patsy==1.0.1 -pexpect==4.9.0 -pillow==11.3.0 -platformdirs==4.3.8 -playwright==1.58.0 -plotly==6.2.0 -polars==1.32.2 -prompt_toolkit==3.0.51 -propcache==0.3.2 -Protego==0.6.1 -protobuf==6.31.1 -psutil==7.0.0 -ptyprocess==0.7.0 -pure_eval==0.2.3 -pyarrow==21.0.0 -pyasn1==0.6.1 -pyasn1_modules==0.4.2 +pip==26.1.2 +pip-tools==7.5.3 +playwright==1.61.0 +protego==0.6.2 +pyarrow==24.0.0 pycparser==3.0 -pydantic==2.11.7 -pydantic_core==2.33.2 -PyDispatcher==2.0.7 -pyee==13.0.0 -Pygments==2.19.2 -pyogrio==0.11.1 -pyOpenSSL==26.3.0 -pyparsing==3.2.3 -pyproj==3.7.1 -pyrdfa3==3.6.5 +pydispatcher==2.0.7 +pyee==13.0.1 +pyopenssl==26.3.0 +pyproject-hooks==1.2.0 python-dateutil==2.9.0.post0 -pytz==2025.2 -PyYAML==6.0.2 -pyzmq==27.0.1 +pyyaml==6.0.3 queuelib==1.9.0 -rdflib==7.6.0 -readability-lxml==0.8.4.1 -referencing==0.36.2 -regex==2025.7.34 -requests==2.32.4 +requests==2.34.2 requests-file==3.0.1 -rich==14.2.0 -rpds-py==0.27.0 -rsa==4.9.1 -s3fs==2025.7.0 -scikit-learn==1.7.1 -scipy==1.16.1 -Scrapy==2.16.0 -seaborn==0.13.2 +scrapy==2.16.0 service-identity==26.1.0 -setuptools==80.9.0 -shapely==2.1.1 -sitemap==20191121 +setuptools==82.0.1 six==1.17.0 -smmap==5.0.2 -sniffio==1.3.1 -soupsieve==2.7 -SQLAlchemy==2.0.42 -sqlparse==0.5.3 -stack-data==0.6.3 -starlette==0.47.2 -statsmodels==0.14.5 -threadpoolctl==3.6.0 +soupsieve==2.8.4 tldextract==5.3.1 -toolz==1.0.0 -tornado==6.5.2 -tqdm==4.67.1 -traitlets==5.14.3 -Twisted==26.4.0 -typing-inspection==0.4.1 -typing_extensions==4.14.1 -tzdata==2025.2 -ultimate-sitemap-parser==1.6.0 -urllib3==2.5.0 -uv==0.8.8 -uvicorn==0.35.0 +twisted==26.4.0 +typing-extensions==4.15.0 +ultimate-sitemap-parser==1.8.1 +urllib3==2.7.0 validators==0.35.0 w3lib==2.4.1 -wcwidth==0.2.13 -webencodings==0.5.1 -Werkzeug==3.1.3 -wheel==0.45.1 -wrapt==1.17.2 -xgboost==3.0.3 -yarl==1.20.1 -zipp==3.23.0 -zope.interface==8.5 +wheel==0.47.0 +zope-interface==8.5 diff --git a/src/fetch/PlaywrightText.py b/src/fetch/PlaywrightText.py index 1c1c5e6..c5c87e9 100644 --- a/src/fetch/PlaywrightText.py +++ b/src/fetch/PlaywrightText.py @@ -82,7 +82,7 @@ async def _fetch_with_retries(self, url: str, retries: int = 0): # We don't need to pass context here, the next retry will create its own return await self._fetch_with_retries(url, retries + 1) - return {} + return "" finally: # ALWAYS close the context and page to free up memory, # even if the request fails or succeeds. diff --git a/src/main.py b/src/main.py index 63b5a23..f1fca50 100644 --- a/src/main.py +++ b/src/main.py @@ -48,7 +48,6 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro process = CrawlerProcess( settings={ "ROBOTSTXT_OBEY": True, - "LOG_FILE": logfile, "DOWNLOADER_MIDDLEWARES": { "src.scrape.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority }, @@ -66,6 +65,11 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro fileHandler.setLevel(log_level) root_logger.addHandler(fileHandler) + # Explicitly set levels for Scrapy and other noisy loggers + logging.getLogger('scrapy').setLevel(log_level) + logging.getLogger('twisted').setLevel(log_level) + root_logger.setLevel(log_level) + # Remove console output # Get the logger that Scrapy uses and remove all handlers that print to the console scrapy_logger = logging.getLogger('scrapy') @@ -87,7 +91,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro target_path_keywords=path_keywords, skip_domains=skip_domains, output_file=output_file, - allowed_top_level_domains=[".com", ".nl", ".ai", ".de", ".be", ".eu", ".io"], + allowed_top_level_domains=[".com", ".nl", ".ai", ".de", ".be", ".eu", ".io", ".org"], skip_paths=[ "shop", "cart", "clients", "testimonials", "search", "query", "calendar", "events", "archive", "news", @@ -98,7 +102,8 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro ], allowed_languages=["nl", "en", "en-uk", "en-gb"], allowed_countries=["nl"], - schema_keywords=schema_keywords + schema_keywords=schema_keywords, + timeout=36000 ) # If worker gets 0 urls, pass (shouldn't happen) @@ -120,7 +125,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro # Set logging level and create file # All workers write to same log - logging_level = logging.INFO + logging_level = logging.DEBUG dir_log = f"{CONFIG.output.output_dir}/{CONFIG.output.logs}" if not os.path.exists(dir_log): @@ -160,7 +165,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro skip_domains = [line.rstrip() for line in file_in] # Set amount of parallel workers and prepare chunk-wisem parallel execution - max_workers = 16 + max_workers = 32 num_workers = min([len(urls), max_workers]) logging.info(f"Will use {num_workers} workers!") batch_size = len(urls) // num_workers if len(urls) > num_workers else 1 @@ -183,7 +188,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro i, logging_level, logfile, - f"{CONFIG.output.output_dir}/{time_part}/worker_{i}.parquet", # Different output files per werker + f"{CONFIG.output.output_dir}/{time_part}/worker_{i}.parquet", # Different output files per worker [CONFIG.crawl.schema.keyword] ) ) diff --git a/src/scrape/HesitantSpider.py b/src/scrape/HesitantSpider.py index 7b6200c..a1dc028 100644 --- a/src/scrape/HesitantSpider.py +++ b/src/scrape/HesitantSpider.py @@ -352,7 +352,6 @@ async def parse(self, response): self.logger.debug(f"Found targeted url: {response.url} from base url {response.meta.get("base_url")}") # Determine schema.org indicator schema_indicator = True if self._schemaparser.parse(response=response) else False - self.logger.debug(f"Schema indicator: {schema_indicator}") # Add result to batch result = ScrapyResult( From 876d3322a6af49b529ce88202c68ac1761497d35 Mon Sep 17 00:00:00 2001 From: lhaarman Date: Wed, 1 Jul 2026 09:09:58 +0000 Subject: [PATCH 25/26] timestamp in result --- src/main.py | 12 ++++++------ src/scrape/HesitantSpider.py | 7 +++++-- src/scrape/ScrapyResult.py | 1 + 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/main.py b/src/main.py index f1fca50..29efb9b 100644 --- a/src/main.py +++ b/src/main.py @@ -91,19 +91,19 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro target_path_keywords=path_keywords, skip_domains=skip_domains, output_file=output_file, - allowed_top_level_domains=[".com", ".nl", ".ai", ".de", ".be", ".eu", ".io", ".org"], + allowed_top_level_domains=[".com", ".nl", ".ai", ".de", ".be", ".fr", ".eu", ".io", ".org"], skip_paths=[ "shop", "cart", "clients", "testimonials", "search", "query", "calendar", "events", "archive", "news", "blog", "media", "articles", "profile", "legal", "tos", "products", "winkel", "winkelwagen", "archief", - "nieuws", "artikelen", "producten", "faq", "policies", + "nieuws", "artikelen", "artikel", "producten", "faq", "policies", "downloads", "portfolio" ], - allowed_languages=["nl", "en", "en-uk", "en-gb"], + allowed_languages=["nl", "en", "en-uk", "en-gb", "nl-nl", "en-nl", "nl-en"], allowed_countries=["nl"], schema_keywords=schema_keywords, - timeout=36000 + timeout=3600 * 48 # 2 days ) # If worker gets 0 urls, pass (shouldn't happen) @@ -125,7 +125,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro # Set logging level and create file # All workers write to same log - logging_level = logging.DEBUG + logging_level = logging.INFO dir_log = f"{CONFIG.output.output_dir}/{CONFIG.output.logs}" if not os.path.exists(dir_log): @@ -165,7 +165,7 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro skip_domains = [line.rstrip() for line in file_in] # Set amount of parallel workers and prepare chunk-wisem parallel execution - max_workers = 32 + max_workers = 16 num_workers = min([len(urls), max_workers]) logging.info(f"Will use {num_workers} workers!") batch_size = len(urls) // num_workers if len(urls) > num_workers else 1 diff --git a/src/scrape/HesitantSpider.py b/src/scrape/HesitantSpider.py index a1dc028..fccfc23 100644 --- a/src/scrape/HesitantSpider.py +++ b/src/scrape/HesitantSpider.py @@ -4,6 +4,7 @@ import time import validators import logging +from datetime import datetime import pandas as pd @@ -163,10 +164,11 @@ def save_batch(self): df = pd.DataFrame({ "base_url": [res.base_url for res in self.batch], "url": [res.url for res in self.batch], + "timestamp": [res.timestamp for res in self.batch], "first_keyword_hit": [res.first_keyword_hit for res in self.batch], "content": [res.content for res in self.batch], "crawl_depth": [res.crawl_depth for res in self.batch], - "schema_indicator": [res.schema_indicator for res in self.batch] + "schema_indicator": [res.schema_indicator for res in self.batch], }) df.to_parquet( @@ -361,7 +363,8 @@ async def parse(self, response): first_keyword_hit=first_keyword_hit, content=await self._fetcher.fetch(response.url), crawl_depth=current_depth, - schema_indicator=schema_indicator + schema_indicator=schema_indicator, + timestamp=datetime.now().strftime("%Y-%m-%d-%H:%M:%S") ) self.batch.append(result) diff --git a/src/scrape/ScrapyResult.py b/src/scrape/ScrapyResult.py index 2188046..8fb2cb1 100644 --- a/src/scrape/ScrapyResult.py +++ b/src/scrape/ScrapyResult.py @@ -9,6 +9,7 @@ class ScrapyResult(NamedTuple): content: str crawl_depth: int = 0 schema_indicator: bool = False + timestamp: str = 0 def __eq__(self, other): if not isinstance(other, ScrapyResult): From 5a7e453477d0a74f3b9ac1248f67d0344ffdf27d Mon Sep 17 00:00:00 2001 From: lhaarman Date: Wed, 1 Jul 2026 14:05:17 +0000 Subject: [PATCH 26/26] improve logging configuration and add request error handling --- src/main.py | 23 ++++++++++++++++------- src/scrape/HesitantSpider.py | 10 ++++++++++ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/main.py b/src/main.py index 29efb9b..af5b513 100644 --- a/src/main.py +++ b/src/main.py @@ -51,6 +51,8 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro "DOWNLOADER_MIDDLEWARES": { "src.scrape.ScrapyCrawlMiddleware.TextTypeFilterMiddleware": 543 # High priority }, + "LOG_FILE": logfile, + "LOG_LEVEL": log_level, "DOWNLOAD_CONTENT_TYPES": ["text/html", "application/xhtml+xml"], "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", } @@ -63,6 +65,10 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro fileHandler = logging.FileHandler(logfile) fileHandler.setLevel(log_level) + + # This format mimics Scrapy's default look + formatter = logging.Formatter('%(asctime)s %(levelname)s: %(name)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') + fileHandler.setFormatter(formatter) root_logger.addHandler(fileHandler) # Explicitly set levels for Scrapy and other noisy loggers @@ -71,13 +77,16 @@ def spawn_spider_process(urls, netloc_keywords, path_keywords, skip_domains, pro root_logger.setLevel(log_level) # Remove console output - # Get the logger that Scrapy uses and remove all handlers that print to the console - scrapy_logger = logging.getLogger('scrapy') - for handler in scrapy_logger.handlers[:]: - scrapy_logger.removeHandler(handler) - - # Silence the twisted engine too - logging.getLogger('twisted').handlers = [] + for logger_name in ['scrapy', 'twisted', 'sqlalchemy.engine']: + logger = logging.getLogger(logger_name) + logger.setLevel(log_level) + # Remove any handlers that might be printing to console + for handler in logger.handlers[:]: + logger.removeHandler(handler) + # Prevent logs from propagating up to the root logger's console handlers + logger.propagate = True + # Silence the twisted engine too + logging.getLogger('twisted').handlers = [] # Create crawler from process spiderCrawler = process.create_crawler(HesitantSpider) diff --git a/src/scrape/HesitantSpider.py b/src/scrape/HesitantSpider.py index fccfc23..873f061 100644 --- a/src/scrape/HesitantSpider.py +++ b/src/scrape/HesitantSpider.py @@ -130,6 +130,7 @@ async def start(self): yield scrapy.Request( url=start_url, callback=self.parse, + errback=self.handle_error, meta={ "base_url": start_url, "current_start": start_url, @@ -146,6 +147,7 @@ async def start(self): yield scrapy.Request( url=url, callback=self.parse_sitemap, + errback=self.handle_error, meta={ "base_url": start_url, "current_start": start_url, @@ -319,6 +321,7 @@ async def parse(self, response): yield scrapy.Request( url=sitemap_url, callback=self.parse_sitemap, + errback=self.handle_error, meta={ "base_url": response.meta.get("base_url"), "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", @@ -339,6 +342,7 @@ async def parse(self, response): yield scrapy.Request( url=url, callback=self.parse, + errback=self.handle_error, meta={ "base_url": response.meta.get("base_url"), "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", @@ -396,6 +400,7 @@ def parse_sitemap(self, response): yield scrapy.Request( url=url, callback=self.parse_sitemap, + errback=self.handle_error, meta={ "base_url": response.meta.get("base_url"), "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", @@ -407,6 +412,7 @@ def parse_sitemap(self, response): yield scrapy.Request( url=url, callback=self.parse, + errback=self.handle_error, meta={ "base_url": response.meta.get("base_url"), "current_start": f"{parsed_url.scheme}://{parsed_url.netloc}", @@ -414,6 +420,10 @@ def parse_sitemap(self, response): } ) + def handle_error(self, failure): + # TODO pass some specific errors to info? + self.logger.debug(f"Error encountered: {failure}") + # Called when the spider closes cleanly async def closed(self, reason): self.save_batch()