diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..2f46b09 --- /dev/null +++ b/.env.example @@ -0,0 +1,3 @@ +GEONAMES_USERNAME="your-geonames-username--do-not-use-demo" +GEORESOLVER_USER_AGENT="georesolver/0.2 (+https://your-project.example; contact: you@example.org)" +GEORESOLVER_USER_AGENT_CONTACT="you@example.org" \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 11c704d..39a964a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v0.2.4] - 2026-05-26 +- Fixed documentation url in `pyproject.toml` [Fix issue [#4](https://github.com/jairomelo/GeoResolver/issues/4)] +- Added a shared requests Session with default headers: + - Custom User-Agent + - Accept: application/json + - Accept-Language +- Added a conservative retry adapter for transient statuses (429/5xx). +- Routed BaseQuery HTTP calls through that session. +- Added timeout and optional per-call header override support in the shared GET helper. +- Added a configurable User-Agent builder. + +--- + ## [v0.2.2] - 2025-07-14 ### Added @@ -88,5 +101,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Previous stable release. See git history for details of earlier versions. 
-[Unreleased]: https://github.com/jairomelo/georesolver/compare/v0.1.4...HEAD +[Unreleased]: https://github.com/jairomelo/georesolver/compare/v0.2.4...HEAD +[v0.2.4]: https://github.com/jairomelo/georesolver/releases/tag/v0.2.4 [v0.1.4]: https://github.com/jairomelo/georesolver/releases/tag/v0.1.4 diff --git a/README.md b/README.md index 3ed6d1e..60e7122 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![CI](https://github.com/jairomelo/GeoResolver/actions/workflows/ci.yml/badge.svg) ![License](https://img.shields.io/pypi/l/georesolver) ![Downloads](https://static.pepy.tech/badge/georesolver) -[![Documentation](https://img.shields.io/badge/docs-online-blue)](https://jairomelo.com/Georesolver/) +[![Documentation](https://img.shields.io/badge/docs-online-blue)](https://jairomelo.com/GeoResolver/georesolver.html) [![Issues](https://img.shields.io/github/issues/jairomelo/Georesolver)](https://github.com/jairomelo/Georesolver/issues) @@ -265,6 +265,21 @@ Each service-specific list should contain valid place type codes or labels expec This library queries the Wikidata MediaWiki API via the endpoint: `https://www.wikidata.org/w/api.php` +### Request identification (recommended) + +Some public APIs (including Wikidata and, in some scenarios, WHG) may reject requests sent with the default Python user agent. +GeoResolver sends an identifiable `User-Agent` by default, and you can customize it with environment variables: + +```bash +# Full override +GEORESOLVER_USER_AGENT="georesolver/0.2 (+https://your-project.example; contact: you@example.org)" + +# Or append contact info to the default GeoResolver user agent +GEORESOLVER_USER_AGENT_CONTACT="you@example.org" +``` + +For production pipelines, it is recommended to provide a contact email or URL to align with service policies and simplify troubleshooting with providers. + It does not use the SPARQL endpoint (`https://query.wikidata.org/sparql`), as this approach is faster and more reliable for simple place lookups. 
The library performs entity searches by name and retrieves coordinates, country (P17), and administrative data from the entity information. **Enhanced in v0.2.0**: WikidataQuery now provides better country and administrative entity data retrieval, with improved matching against the BaseQuery interface for consistency across all services. diff --git a/pyproject.toml b/pyproject.toml index 9d433c6..376023a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,13 +4,13 @@ build-backend = "setuptools.build_meta" [project] name = "georesolver" -version = "0.2.2" +version = "0.2.4" description = "Multi-source place name to coordinates resolver using TGN, WHG, GeoNames, and Wikidata" authors = [ {name="Jairo Antonio Melo Florez", email="jairoantoniomelo@gmail.com"} ] readme = {file = "README.md", content-type = "text/markdown"} -license = {text = "GPL-3.0-only"} +license = "GPL-3.0-only" requires-python = ">=3.9" dependencies = [ "SPARQLWrapper~=2.0.0", @@ -34,7 +34,6 @@ keywords = [ classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", - "License :: OSI Approved :: MIT License", "Natural Language :: English", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", @@ -51,7 +50,7 @@ classifiers = [ [project.urls] Homepage = "https://github.com/jairomelo/Georesolver" Issues = "https://github.com/jairomelo/Georesolver/issues" -Documentation = "https://jairomelo.com/Georesolver/" +Documentation = "https://jairomelo.com/GeoResolver/georesolver.html" [tool.setuptools.package-data] "georesolver" = ["data/mappings/places_map.json"] diff --git a/src/georesolver/base.py b/src/georesolver/base.py index 6717a4a..27ed66a 100644 --- a/src/georesolver/base.py +++ b/src/georesolver/base.py @@ -3,6 +3,9 @@ from ratelimit import limits, sleep_and_retry import requests import requests_cache +import os +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry from georesolver.utils.LoggerHandler import 
setup_logger class BaseQuery(ABC): @@ -11,6 +14,22 @@ class BaseQuery(ABC): Handles caching, rate limiting, and basic GET requests. """ + @staticmethod + def _build_default_user_agent() -> str: + """ + Build an identifiable User-Agent required by some public APIs. + Users can fully override it with GEORESOLVER_USER_AGENT. + """ + configured_ua = os.getenv("GEORESOLVER_USER_AGENT") + if configured_ua: + return configured_ua + + contact = os.getenv("GEORESOLVER_USER_AGENT_CONTACT", "").strip() + base_ua = "georesolver/0.2 (+https://pypi.org/project/georesolver)" + if contact: + return f"{base_ua}; contact: {contact}" + return base_ua + def __init__( self, base_url: str, @@ -24,6 +43,27 @@ def __init__( self.base_url = base_url.rstrip("/") self.calls, self.period = rate_limit + # A non-default User-Agent is required by some services (e.g., Wikidata/WHG). + custom_ua = self._build_default_user_agent() + self.default_headers = { + "User-Agent": custom_ua, + "Accept": "application/json", + "Accept-Language": "en-US,en;q=0.9" + } + + self.session = requests.Session() + self.session.headers.update(self.default_headers) + + retry = Retry( + total=3, + backoff_factor=0.5, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods={"GET"} + ) + adapter = HTTPAdapter(max_retries=retry) + self.session.mount("https://", adapter) + self.session.mount("http://", adapter) + if enable_cache: requests_cache.install_cache(cache_name, expire_after=cache_expiry) self.logger.info(f"Installed cache '{cache_name}' (expires after {cache_expiry}s)") @@ -32,13 +72,18 @@ def __init__( @limits(calls=30, period=1) def _limited_get(self, url: str, - params: Optional[Dict[str, Any]] = None) -> requests.Response: + params: Optional[Dict[str, Any]] = None, + headers: Optional[Dict[str, str]] = None, + timeout: int = 20) -> requests.Response: """ Internal method to perform a GET request with rate limiting. 
""" full_url = f"{self.base_url}{url}" if not url.startswith("http") else url try: - response = requests.get(full_url, params=params) + merged_headers = self.default_headers.copy() + if headers: + merged_headers.update(headers) + response = self.session.get(full_url, params=params, headers=merged_headers, timeout=timeout) response.raise_for_status() if getattr(response, "from_cache", False): self.logger.info(f"[CACHE HIT] {response.url}")