Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
037998b
Add configuration for skippable domains such as social media
lhaarman May 19, 2026
cab73b4
Basic setup scrapy with multiprocessing; TODO testing
lhaarman May 21, 2026
2c65ebe
temp commit; saving result
lhaarman May 28, 2026
56a4719
Refactoring, timeout, language/country filtering, schema.org in hesit…
lhaarman Jun 3, 2026
ac9527b
add logs dir
lhaarman Jun 4, 2026
e5af8e0
add skip_domains to config template
dominikblatt Jun 9, 2026
3787ac6
adjust logging setup
dominikblatt Jun 9, 2026
770bf12
add code for sitemap parsing and testing class
dominikblatt Jun 9, 2026
7a08170
Split up targeting between netloc/path
lhaarman Jun 24, 2026
29d6a63
add missing config keys
dominikblatt Jun 25, 2026
80b1a41
adjust testing code of hesitant spider
dominikblatt Jun 25, 2026
a0d2944
remove unused code
dominikblatt Jun 25, 2026
828077b
fix imports for -m flag running
dominikblatt Jun 25, 2026
bf1684d
move scrapy modules into scrape
dominikblatt Jun 25, 2026
8e8703d
move schema parser to parse module
dominikblatt Jun 25, 2026
1a3dcfa
update normalize_url documentation
dominikblatt Jun 25, 2026
6cedb3d
remove unused lines
dominikblatt Jun 25, 2026
7132ee4
rename main scripts
dominikblatt Jun 25, 2026
9f58b11
update requirements.txt
dominikblatt Jun 25, 2026
b538f0f
bug fix: adjust middlewares path to refactored structure
dominikblatt Jun 30, 2026
156fca7
Upgrade scrape to support dynamic content and more sitemaps
lhaarman Jun 29, 2026
89195e3
Integrate changes to scrape with refactor
lhaarman Jun 30, 2026
7188666
extend logging
lhaarman Jun 30, 2026
c8866aa
small changes/fixes
lhaarman Jun 30, 2026
876d332
timestamp in result
lhaarman Jul 1, 2026
5a7e453
improve logging configuration and add request error handling
lhaarman Jul 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -214,4 +214,8 @@ config/config.yaml

# input and output files need to be explicitly added
input
output
output

# (Debug) Output file types
*.txt
*.html
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ More info on statistical scraping [here](https://github.com/SNStatComp/SSIG)
# Getting started
- Install all required packages using
> pip install -r requirements.txt
> playwright install
> playwright install-deps
- Activate the environment
- Run the following command to install the modules in src as packages so that they can be imported properly
> pip install -e .
Expand Down
8 changes: 5 additions & 3 deletions config/config_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@ requests:
timeout_read: 7 # In seconds
max_retries: 3
input:
input_dir: ../input
input_dir: input
input_files:
skip_domains: skipdomains.txt
urls: urls.txt
keywords: keywords.txt
netloc_keywords: keywords.txt
path_keywords: keywords.txt
url_max: 100
url_offset: 0
input_variables:
output:
output_dir: ../output
output_dir: output
batchsize: 100
logs: logs
crawl:
Expand Down
File renamed without changes.
214 changes: 54 additions & 160 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,163 +1,57 @@
aiobotocore==2.24.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.15
aioitertools==0.12.0
aiosignal==1.4.0
alembic==1.16.4
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.10.0
asttokens==3.0.0
attrs==25.3.0
beautifulsoup4==4.13.4
blinker==1.9.0
botocore==1.39.11
cachetools==5.5.2
certifi==2025.8.3
chardet==5.2.0
charset-normalizer==3.4.3
click==8.2.1
cloudpickle==3.1.1
comm==0.2.3
contourpy==1.3.3
cssselect==1.3.0
cycler==0.12.1
dask==2025.7.0
databricks-sdk==0.62.0
debugpy==1.8.16
decorator==5.2.1
docker==7.1.0
duckdb==1.3.2
executing==2.2.0
fastapi==0.116.1
fastjsonschema==2.21.1
Flask==3.1.1
fonttools==4.59.0
frozendict==2.4.7
frozenlist==1.7.0
fsspec==2025.7.0
GDAL==3.8.4
geopandas==1.1.1
gitdb==4.0.12
GitPython==3.1.45
google-auth==2.40.3
graphene==3.4.3
graphql-core==3.2.6
graphql-relay==3.2.0
greenlet==3.2.4
gunicorn==23.0.0
h11==0.16.0
idna==3.10
importlib_metadata==8.7.0
ipykernel==6.30.1
ipython==9.4.0
ipython_pygments_lexers==1.1.1
itsdangerous==2.2.0
jedi==0.19.2
Jinja2==3.1.6
jmespath==1.0.1
joblib==1.5.1
jsonschema==4.25.0
jsonschema-specifications==2025.4.1
jupyter_client==8.6.3
jupyter_core==5.8.1
jusText==3.0.2
kiwisolver==1.4.9
langdetect==1.0.9
locket==1.0.0
lxml==6.0.2
lxml_html_clean==0.4.3
Mako==1.3.10
markdown-it-py==4.0.0
MarkupSafe==3.0.2
matplotlib==3.10.5
matplotlib-inline==0.1.7
mdurl==0.1.2
mlflow==3.2.0
mlflow-skinny==3.2.0
mlflow-tracing==3.2.0
multidict==6.6.3
narwhals==2.0.1
nbclient==0.10.2
nbformat==5.10.4
nest-asyncio==1.6.0
nltk==3.9.1
numpy==2.3.2
nvidia-nccl-cu12==2.27.7
omegaconf==2.3.0
opentelemetry-api==1.36.0
opentelemetry-sdk==1.36.0
opentelemetry-semantic-conventions==0.57b0
packaging==25.0
pandas==2.3.1
parso==0.8.4
partd==1.4.2
patsy==1.0.1
pexpect==4.9.0
pillow==11.3.0
platformdirs==4.3.8
playwright==1.58.0
plotly==6.2.0
polars==1.32.2
prompt_toolkit==3.0.51
propcache==0.3.2
protobuf==6.31.1
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==21.0.0
pyasn1==0.6.1
pyasn1_modules==0.4.2
pydantic==2.11.7
pydantic_core==2.33.2
pyee==13.0.0
Pygments==2.19.2
pyogrio==0.11.1
pyparsing==3.2.3
pyproj==3.7.1
attrs==26.1.0
automat==25.4.16
beautifulsoup4==4.15.0
bs4==0.0.2
build==1.5.0
certifi==2026.6.17
cffi==2.0.0
charset-normalizer==3.4.7
click==8.4.2
constantly==23.10.4
cryptography==49.0.0
cssselect==1.4.0
defusedxml==0.7.1
filelock==3.29.4
greenlet==3.5.3
hyperlink==21.0.0
idna==3.18
incremental==24.11.0
itemadapter==0.13.1
itemloaders==1.4.0
jmespath==1.1.0
lxml==6.1.1
numpy==2.5.0
omegaconf==2.3.1
packaging==26.2
pandas==3.0.3
parsel==1.11.0
pip==26.1.2
pip-tools==7.5.3
playwright==1.61.0
protego==0.6.2
pyarrow==24.0.0
pycparser==3.0
pydispatcher==2.0.7
pyee==13.0.1
pyopenssl==26.3.0
pyproject-hooks==1.2.0
python-dateutil==2.9.0.post0
pytz==2025.2
PyYAML==6.0.2
pyzmq==27.0.1
readability-lxml==0.8.4.1
referencing==0.36.2
regex==2025.7.34
requests==2.32.4
rich==14.2.0
rpds-py==0.27.0
rsa==4.9.1
s3fs==2025.7.0
scikit-learn==1.7.1
scipy==1.16.1
seaborn==0.13.2
setuptools==80.9.0
shapely==2.1.1
sitemap==20191121
pyyaml==6.0.3
queuelib==1.9.0
requests==2.34.2
requests-file==3.0.1
scrapy==2.16.0
service-identity==26.1.0
setuptools==82.0.1
six==1.17.0
smmap==5.0.2
sniffio==1.3.1
soupsieve==2.7
SQLAlchemy==2.0.42
sqlparse==0.5.3
stack-data==0.6.3
starlette==0.47.2
statsmodels==0.14.5
threadpoolctl==3.6.0
toolz==1.0.0
tornado==6.5.2
tqdm==4.67.1
traitlets==5.14.3
typing-inspection==0.4.1
typing_extensions==4.14.1
tzdata==2025.2
ultimate-sitemap-parser==1.6.0
urllib3==2.5.0
uv==0.8.8
uvicorn==0.35.0
wcwidth==0.2.13
Werkzeug==3.1.3
wheel==0.45.1
wrapt==1.17.2
xgboost==3.0.3
yarl==1.20.1
zipp==3.23.0
soupsieve==2.8.4
tldextract==5.3.1
twisted==26.4.0
typing-extensions==4.15.0
ultimate-sitemap-parser==1.8.1
urllib3==2.7.0
validators==0.35.0
w3lib==2.4.1
wheel==0.47.0
zope-interface==8.5
Loading