From f4c892e6d9ee6b94592d161ba9127f6a98a4bc73 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 21:53:43 +0200 Subject: [PATCH 01/18] feat: add code_system discriminator field to NUTSResult --- app/models.py | 7 +++++++ tests/test_api.py | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/app/models.py b/app/models.py index ee141db..e378914 100644 --- a/app/models.py +++ b/app/models.py @@ -6,6 +6,13 @@ class NUTSResult(BaseModel): postal_code: str = Field(description="The queried postal code (normalized)") country_code: str = Field(description="ISO 3166-1 alpha-2 country code") + code_system: Literal["NUTS", "ITL"] = Field( + default="NUTS", + description=( + "Territorial coding scheme of the nuts1/2/3 fields. 'NUTS' for GISCO-sourced " + "EU/EFTA/candidate data; 'ITL' for UK data from the ONS NSPL." + ), + ) match_type: Literal["exact", "estimated", "approximate"] = Field( description="How the result was determined" ) diff --git a/tests/test_api.py b/tests/test_api.py index de9bcd6..49f7cc8 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -17,6 +17,11 @@ def test_200_cache_header(self, client): resp = client.get("/lookup", params={"postal_code": "10115", "country": "DE"}) assert "public" in resp.headers.get("cache-control", "") + def test_response_includes_code_system_nuts(self, client): + resp = client.get("/lookup", params={"postal_code": "10115", "country": "DE"}) + assert resp.status_code == 200 + assert resp.json()["code_system"] == "NUTS" + def test_400_unsupported_country(self, client): resp = client.get("/lookup", params={"postal_code": "12345", "country": "ZZ"}) assert resp.status_code == 400 From 665f21179696a0b9702350e7bf321c334d3672fd Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 21:54:49 +0200 Subject: [PATCH 02/18] feat: add UK postcode regex and bump patterns version to 1.3 --- app/postal_patterns.json | 7 ++++++- tests/test_postal_patterns.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 38 
insertions(+), 2 deletions(-) diff --git a/app/postal_patterns.json b/app/postal_patterns.json index 7e2fdd3..a0ea06c 100644 --- a/app/postal_patterns.json +++ b/app/postal_patterns.json @@ -1,5 +1,5 @@ { - "_meta": { "version": "1.2", "date": "2026-07-02" }, + "_meta": { "version": "1.3", "date": "2026-07-03" }, "AL": { "regex": "^(?:AL[\\s\\-–—.]*)?([0-9]{4})$", "example": "1001, AL-1001, AL 1001" @@ -182,5 +182,10 @@ "regex": "^(?:TR[\\s\\-\u2013\u2014.]*)?(\\d{5})$", "example": "06100, TR-06100, TR 06100", "expected_digits": 5 + }, + "UK": { + "regex": "^([A-Z]{1,2}[0-9][0-9A-Z]?\\s?[0-9][A-Z]{2})$", + "example": "SW1A 2AA, EC1A 1BB, M1 1AA, B33 8TH", + "tercet_map": "outward_only" } } diff --git a/tests/test_postal_patterns.py b/tests/test_postal_patterns.py index 76fe1cd..f121d02 100644 --- a/tests/test_postal_patterns.py +++ b/tests/test_postal_patterns.py @@ -1,6 +1,13 @@ """Tests for postal_patterns.py — preprocessing, tercet_map, extraction.""" -from app.postal_patterns import _apply_tercet_map, _preprocess, extract_postal_code +import pytest + +from app.postal_patterns import ( + PATTERNS_META, + _apply_tercet_map, + _preprocess, + extract_postal_code, +) # ── _preprocess tests ───────────────────────────────────────────────────────── @@ -177,3 +184,27 @@ def test_lowercase_prefix(self): def test_three_digit_not_matched_as_four(self): # Too short: regex requires exactly 4 digits; must NOT become a 4-digit code. 
assert extract_postal_code("AL", "100") != "1000" + + +class TestUKExtraction: + @pytest.mark.parametrize( + "raw, expected", + [ + ("SW1A 2AA", "SW1A2AA"), + ("sw1a 2aa", "SW1A2AA"), + ("SW1A2AA", "SW1A2AA"), + ("M1 1AA", "M11AA"), + ("B33 8TH", "B338TH"), + ("W1A 1HQ", "W1A1HQ"), + ("CR2 6XH", "CR26XH"), + ("DN55 1PT", "DN551PT"), + ("EC1A 1BB", "EC1A1BB"), + ], + ) + def test_uk_regex_extracts_normalized_full_postcode(self, raw, expected): + assert extract_postal_code("UK", raw) == expected + + +def test_patterns_meta_version_bumped(): + # Adding UK is an additive coverage change; minor version bump. + assert PATTERNS_META["version"] == "1.3" From c033ff9bdaef3dd83780509179b1d0f4ead2db36 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 21:55:26 +0200 Subject: [PATCH 03/18] feat: add outward_only action and extract_outward helper --- app/postal_patterns.py | 30 +++++++++++++++++++++++++++--- tests/test_postal_patterns.py | 27 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/app/postal_patterns.py b/app/postal_patterns.py index 31e5833..c4725d3 100644 --- a/app/postal_patterns.py +++ b/app/postal_patterns.py @@ -5,9 +5,11 @@ - example: Human-readable format examples - tercet_map: Optional transform to align extracted code with TERCET lookup key. Supported actions: - truncate:N — keep only the first N characters - prepend:XX — prepend string XX to the extracted code - keep_alpha — keep only leading alphabetic characters + truncate:N — keep only the first N characters + prepend:XX — prepend string XX to the extracted code + keep_alpha — keep only leading alphabetic characters + outward_only — marker: country supports outward-code + fallback (lookup Tier 3.5); no key transform - expected_digits: Expected number of digits for all-numeric postal codes. Used by _preprocess() to restore leading zeros lost in Excel/CSV exports (e.g. "8461" → "08461" for ES with expected_digits=5). 
@@ -77,9 +79,31 @@ def _apply_tercet_map(code: str, rule: str) -> str: if action == "keep_alpha": m = re.match(r"^([A-Z]+)", code) return m.group(1) if m else code + if action == "outward_only": + # Marker: the country supports outward-code-only fallback (lookup Tier 3.5). + # It does not transform the Tier 1 key; see extract_outward(). + return code return code +def extract_outward(country_code: str, raw_input: str) -> str | None: + """Return the outward (district) portion for countries flagged outward_only. + + For UK postcodes, the outward portion is the normalised code minus its last + three characters (the inward code). Input of 4 chars or fewer after + normalisation is treated as already being an outward code (e.g. bare "SW1A"). + + Returns None for countries that do not declare tercet_map="outward_only". + """ + entry = POSTAL_PATTERNS.get(country_code) + if not entry or entry.get("tercet_map") != "outward_only": + return None + normalised = normalize_postal_code(raw_input) + if len(normalised) <= 4: + return normalised + return normalised[:-3] + + def extract_postal_code(country_code: str, raw_input: str) -> str: """Extract and normalize postal code using country-specific pattern. diff --git a/tests/test_postal_patterns.py b/tests/test_postal_patterns.py index f121d02..4f57392 100644 --- a/tests/test_postal_patterns.py +++ b/tests/test_postal_patterns.py @@ -6,6 +6,7 @@ PATTERNS_META, _apply_tercet_map, _preprocess, + extract_outward, extract_postal_code, ) @@ -208,3 +209,29 @@ def test_uk_regex_extracts_normalized_full_postcode(self, raw, expected): def test_patterns_meta_version_bumped(): # Adding UK is an additive coverage change; minor version bump. 
assert PATTERNS_META["version"] == "1.3" + + +class TestExtractOutward: + @pytest.mark.parametrize( + "raw, expected_outward", + [ + ("SW1A 2AA", "SW1A"), + ("sw1a2aa", "SW1A"), + ("M1 1AA", "M1"), + ("B33 8TH", "B33"), + ("EC1A 1BB", "EC1A"), + ("DN55 1PT", "DN55"), + ("SW1A", "SW1A"), # outward-only input + ("M1", "M1"), + ], + ) + def test_extract_outward_for_uk(self, raw, expected_outward): + assert extract_outward("UK", raw) == expected_outward + + def test_returns_none_for_country_without_flag(self): + # AT does not declare outward_only; outward extraction is undefined. + assert extract_outward("AT", "1010") is None + + def test_extract_postal_code_unaffected_by_outward_only_flag(self): + # Tier 1 lookup for UK must still yield the full normalised postcode. + assert extract_postal_code("UK", "SW1A 2AA") == "SW1A2AA" From 3ac8260d9d3f60cc1ce2b5b1bfd5768a746b0cef Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 21:56:28 +0200 Subject: [PATCH 04/18] feat: recognise NSPL pcds/itl columns in _parse_csv_content --- app/data_loader.py | 17 +++++++++++++---- tests/test_data_loader.py | 11 +++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/app/data_loader.py b/app/data_loader.py index a3720bc..915af80 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -247,16 +247,25 @@ def _parse_csv_content(text: str, country_code: str, *, overwrite: bool = False) reader = csv.DictReader(io.StringIO(text), delimiter=delimiter) fieldnames = [f.strip().upper() for f in (reader.fieldnames or [])] - # Find the postal code column + # Find the postal code column ("PCDS" is the NSPL formatted-postcode column) pc_col = None - for candidate in ("CODE", "PC", "POSTAL_CODE", "POSTCODE", "PC_FMT"): + for candidate in ("CODE", "PC", "POSTAL_CODE", "POSTCODE", "PC_FMT", "PCDS"): if candidate in fieldnames: pc_col = candidate break - # Find the NUTS3 column — prefer current version, never fall back to old versions + # Find the NUTS3 column — prefer 
current version, never fall back to old versions. + # ITL* candidates cover the UK NSPL dataset (ITL3 codes are NUTS3-equivalent). nuts3_col = None - for candidate in (f"NUTS3_{settings.nuts_version}", "NUTS3", "NUTS_ID", "NUTS"): + for candidate in ( + f"NUTS3_{settings.nuts_version}", + "NUTS3", + "NUTS_ID", + "NUTS", + "ITL3CD", + "ITL3", + "ITL", + ): if candidate in fieldnames: nuts3_col = candidate break diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index a43d951..fd96ca9 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -1,5 +1,6 @@ """Tests for data_loader.py — normalize functions and lookup tiers.""" +from app import data_loader from app.data_loader import lookup, normalize_country, normalize_postal_code @@ -287,3 +288,13 @@ def test_sample_codes_resolve_estimated(self): assert result["nuts3"] in self.VALID_AL_NUTS3 assert result["nuts2"] == result["nuts3"][:4] assert result["nuts1"] == "AL0" + + +class TestNSPLColumnParsing: + def test_parse_csv_recognises_nspl_columns(self, monkeypatch): + monkeypatch.setattr(data_loader, "_lookup", {}) + nspl_csv = "pcds,itl,doterm\nSW1A 2AA,TLI32,\nEC1A 1BB,TLI32,\n" + rows = data_loader._parse_csv_content(nspl_csv, "UK") + assert rows == 2 + assert data_loader._lookup[("UK", "SW1A2AA")] == "TLI32" + assert data_loader._lookup[("UK", "EC1A1BB")] == "TLI32" From 2f9f78018070706b32d1f984a0715f5d78d16652 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 21:57:07 +0200 Subject: [PATCH 05/18] feat: add skip_terminated flag to filter NSPL doterm rows --- app/data_loader.py | 26 ++++++++++++++++++++++++-- tests/test_data_loader.py | 18 ++++++++++++++++++ 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/app/data_loader.py b/app/data_loader.py index 915af80..837ac0d 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -232,8 +232,19 @@ def _sniff_dialect(text: str) -> csv.Dialect | None: return None -def _parse_csv_content(text: str, country_code: str, 
*, overwrite: bool = False) -> int: - """Parse CSV/TSV content and populate the lookup table. Returns row count.""" +def _parse_csv_content( + text: str, + country_code: str, + *, + overwrite: bool = False, + skip_terminated: bool = False, +) -> int: + """Parse CSV/TSV content and populate the lookup table. Returns row count. + + When skip_terminated is True (used for the NSPL dataset), rows with a + non-blank DOTERM (date of termination) column are skipped so only live + postcodes are loaded. + """ count = 0 skipped = 0 @@ -285,17 +296,28 @@ def _parse_csv_content(text: str, country_code: str, *, overwrite: bool = False) cc_col = candidate break + # Detect optional DOTERM column for live-only filtering (NSPL) + doterm_col = None + if skip_terminated: + for candidate in ("DOTERM", "DOT", "DATE_OF_TERMINATION"): + if candidate in fieldnames: + doterm_col = candidate + break + # Map back to original-case field names from DictReader orig_fields = list(reader.fieldnames or []) pc_orig = orig_fields[fieldnames.index(pc_col)] nuts3_orig = orig_fields[fieldnames.index(nuts3_col)] cc_orig = orig_fields[fieldnames.index(cc_col)] if cc_col else None + doterm_orig = orig_fields[fieldnames.index(doterm_col)] if doterm_col else None if not country_code and cc_col is None: logger.warning("No country code available (not in URL or CSV columns), skipping file") return 0 for row in reader: + if doterm_orig and row.get(doterm_orig, "").strip(): + continue pc = row.get(pc_orig, "") nuts3 = row.get(nuts3_orig, "").strip() if not pc or not nuts3: diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index fd96ca9..189b374 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -298,3 +298,21 @@ def test_parse_csv_recognises_nspl_columns(self, monkeypatch): assert rows == 2 assert data_loader._lookup[("UK", "SW1A2AA")] == "TLI32" assert data_loader._lookup[("UK", "EC1A1BB")] == "TLI32" + + def test_skip_terminated_filters_doterm_rows(self, monkeypatch): + 
monkeypatch.setattr(data_loader, "_lookup", {}) + nspl_csv = ( + "pcds,itl,doterm\n" + "SW1A 2AA,TLI32,\n" + "M1 9NS,TLD46,202312\n" # terminated, skip + "EC1A 1BB,TLI32,\n" + ) + rows = data_loader._parse_csv_content(nspl_csv, "UK", skip_terminated=True) + assert rows == 2 + assert ("UK", "M19NS") not in data_loader._lookup + + def test_skip_terminated_default_false_keeps_all_rows(self, monkeypatch): + monkeypatch.setattr(data_loader, "_lookup", {}) + nspl_csv = "pcds,itl,doterm\nSW1A 2AA,TLI32,\nM1 9NS,TLD46,202312\n" + rows = data_loader._parse_csv_content(nspl_csv, "UK") + assert rows == 2 From 50c543fdccce05a733b2e1333e9b4ff1cba9b48e Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 21:58:14 +0200 Subject: [PATCH 06/18] feat: add NSPL URL and ITL names URLs to settings --- app/config.py | 10 ++++++++++ app/settings.json | 1 + tests/test_config.py | 24 ++++++++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/app/config.py b/app/config.py index 4a5de69..533d724 100644 --- a/app/config.py +++ b/app/config.py @@ -31,6 +31,9 @@ class Settings(BaseSettings): cache_max_age: int = _defaults.get("cache_max_age", 3600) startup_timeout: int = 300 docs_enabled: bool = True + # NSPL (UK postcode → ITL3) — optional, no-op when unset (TERCET-only deployment) + nspl_url: str = _defaults.get("nspl_url", "") + itl_names_urls: str = "" photon_url: str = "" photon_timeout_seconds: float = 5.0 nuts_geojson_url: str = ( @@ -70,6 +73,13 @@ def extra_source_urls(self) -> list[str]: return [] return [u.strip() for u in self.extra_sources.split(",") if u.strip()] + @property + def itl_names_url_list(self) -> list[str]: + """Parse PC2NUTS_ITL_NAMES_URLS comma-separated string into a URL list.""" + if not self.itl_names_urls.strip(): + return [] + return [u.strip() for u in self.itl_names_urls.split(",") if u.strip()] + @property def trusted_tokens(self) -> frozenset[str]: """Parse PC2NUTS_TRUSTED_TOKENS comma-separated list into a frozenset. 
diff --git a/app/settings.json b/app/settings.json index 4f47c75..d3b478f 100644 --- a/app/settings.json +++ b/app/settings.json @@ -1,5 +1,6 @@ { "tercet_base_url": "https://gisco-services.ec.europa.eu/tercet/NUTS-2024/", + "nspl_url": "", "countries": [ "AT", "BE", "BG", "CY", "CZ", "DE", "DK", "EE", "EL", "ES", "FI", "FR", "HR", "HU", "IE", "IT", "LT", "LU", "LV", "MT", diff --git a/tests/test_config.py b/tests/test_config.py index fe3bf40..8af0aaf 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -62,3 +62,27 @@ def test_interval_negative_is_rejected(self, monkeypatch): def test_synthetic_nuts_fallback_has_fo(): from app.config import settings assert settings.synthetic_nuts_fallback.get("FO") == "FO000" + + +class TestNSPLSettings: + def test_nspl_url_defaults_empty(self): + assert Settings().nspl_url == "" + + def test_itl_names_urls_defaults_empty(self): + assert Settings().itl_names_urls == "" + + def test_itl_names_url_list_parses_csv(self): + s = Settings(itl_names_urls="https://a/x.csv, https://b/y.csv ,") + assert s.itl_names_url_list == ["https://a/x.csv", "https://b/y.csv"] + + def test_itl_names_url_list_empty_when_unset(self): + assert Settings().itl_names_url_list == [] + + def test_nspl_url_from_env(self, monkeypatch): + monkeypatch.setenv("PC2NUTS_NSPL_URL", "https://ons/nspl.zip") + assert Settings().nspl_url == "https://ons/nspl.zip" + + def test_uk_not_in_settings_countries(self): + """Regression guard: UK must not appear in the GISCO country list — + it would trigger wasted GISCO URL guesses (Codex review, PR #52).""" + assert "UK" not in Settings().countries From dd54c28665a491012f3cdce258fbea673f62f1be Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 21:59:02 +0200 Subject: [PATCH 07/18] feat: add conditional GET wrapper for cached ZIP downloads --- app/data_loader.py | 17 +++++++++++++++++ tests/test_data_loader.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git 
a/app/data_loader.py b/app/data_loader.py index 837ac0d..cbe4c09 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -347,6 +347,23 @@ def _parse_csv_content( return count +def _download_zip_conditional( + client: httpx.Client, url: str, cached_meta: dict +) -> httpx.Response: + """Download with conditional-GET headers; returns the raw httpx.Response. + + cached_meta keys: 'etag' and 'last_modified' (either may be absent). The + caller handles 200 (re-parse), 304 (keep cache), and error statuses. Applies + to both TERCET and NSPL so an unchanged upstream ZIP is not re-fetched. + """ + headers = {} + if cached_meta.get("etag"): + headers["If-None-Match"] = cached_meta["etag"] + if cached_meta.get("last_modified"): + headers["If-Modified-Since"] = cached_meta["last_modified"] + return client.get(url, headers=headers, timeout=60, follow_redirects=True) + + def _download_zip(client: httpx.Client, url: str) -> bytes | None: """Download a ZIP with one retry on transient network errors. 
diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index 189b374..be0ae79 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -1,5 +1,7 @@ """Tests for data_loader.py — normalize functions and lookup tiers.""" +import httpx + from app import data_loader from app.data_loader import lookup, normalize_country, normalize_postal_code @@ -316,3 +318,36 @@ def test_skip_terminated_default_false_keeps_all_rows(self, monkeypatch): nspl_csv = "pcds,itl,doterm\nSW1A 2AA,TLI32,\nM1 9NS,TLD46,202312\n" rows = data_loader._parse_csv_content(nspl_csv, "UK") assert rows == 2 + + +class TestConditionalGet: + def test_sends_conditional_headers_when_etag_known(self): + captured = {} + + def handler(request: httpx.Request) -> httpx.Response: + captured["headers"] = dict(request.headers) + return httpx.Response(304) + + client = httpx.Client(transport=httpx.MockTransport(handler)) + cached_meta = { + "etag": '"abc123"', + "last_modified": "Wed, 01 Jan 2025 00:00:00 GMT", + } + result = data_loader._download_zip_conditional( + client, "https://example.com/foo.zip", cached_meta + ) + assert result.status_code == 304 + assert captured["headers"]["if-none-match"] == '"abc123"' + assert captured["headers"]["if-modified-since"] == "Wed, 01 Jan 2025 00:00:00 GMT" + + def test_omits_headers_when_meta_empty(self): + captured = {} + + def handler(request: httpx.Request) -> httpx.Response: + captured["headers"] = dict(request.headers) + return httpx.Response(200, content=b"x") + + client = httpx.Client(transport=httpx.MockTransport(handler)) + data_loader._download_zip_conditional(client, "https://example.com/foo.zip", {}) + assert "if-none-match" not in captured["headers"] + assert "if-modified-since" not in captured["headers"] From 2ef5762ca1fa390411f5c18bfa051bf0e68da2bc Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:02:23 +0200 Subject: [PATCH 08/18] feat: implement NSPL loader with isolated failure handling --- app/data_loader.py | 68 
+++++++++++++++++++++++++++++++++++++++ tests/test_api.py | 2 +- tests/test_data_loader.py | 53 ++++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/app/data_loader.py b/app/data_loader.py index cbe4c09..6ed8985 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -24,6 +24,10 @@ _MAX_UNCOMPRESSED_SIZE = 100 * 1024 * 1024 # 100 MB +# The NSPL postcode CSV (~1.79M live rows) is far larger than a TERCET file; it +# needs its own, higher extraction cap. Source is operator-configured (trusted). +_MAX_NSPL_UNCOMPRESSED_SIZE = 1024 * 1024 * 1024 # 1 GB + logger = logging.getLogger(__name__) # postal_code -> NUTS3 code, keyed by (country_code, normalized_postal_code) @@ -463,6 +467,64 @@ def _download_and_parse_zip( return total +def _load_nspl(client: httpx.Client, url: str, cache_dir: Path) -> int: + """Fetch the NSPL ZIP and load UK postcode → ITL3 entries into _lookup. + + Returns the number of rows added. Returns 0 when url is empty or any error + occurs — an NSPL failure must never block TERCET-only operation. Terminated + postcodes (non-blank DOTERM) are filtered out. UK is registered in the loaded + country set automatically because its rows land in _lookup. + """ + if not url: + return 0 + cache_path = cache_dir / "nspl.zip" + try: + resp = _download_zip_conditional(client, url, {}) + if resp.status_code == 304: + # Unchanged upstream — nothing to (re)load this run. 
+ return 0 + resp.raise_for_status() + content = resp.content + if not zipfile.is_zipfile(io.BytesIO(content)): + logger.warning("NSPL response from %s is not a valid ZIP, skipping", url) + return 0 + try: + cache_path.write_bytes(content) + except OSError as exc: + logger.warning("Failed to cache NSPL ZIP: %s", exc) + + total = 0 + with zipfile.ZipFile(io.BytesIO(content)) as zf: + for name in zf.namelist(): + # The postcode CSV is the "NSPL*.csv" (real releases ship it under + # Data/); other bundled CSVs (user guide, column lookups) lack the + # pcds/itl columns and are ignored by _parse_csv_content anyway. + if not name.lower().endswith(".csv") or "nspl" not in name.lower(): + continue + file_size = zf.getinfo(name).file_size + if file_size > _MAX_NSPL_UNCOMPRESSED_SIZE: + logger.warning( + "Skipping %s: uncompressed size %d exceeds NSPL limit %d", + name, + file_size, + _MAX_NSPL_UNCOMPRESSED_SIZE, + ) + continue + raw = zf.read(name) + for enc in ("utf-8-sig", "utf-8", "latin-1"): + try: + text = raw.decode(enc) + break + except UnicodeDecodeError: + continue + total += _parse_csv_content(text, "UK", overwrite=False, skip_terminated=True) + logger.info("NSPL loaded: %d live UK postcodes", total) + return total + except (httpx.HTTPError, zipfile.BadZipFile, OSError) as exc: + logger.warning("NSPL load failed: %s", exc) + return 0 + + def _db_path() -> Path: """Return the path for the SQLite cache DB, scoped by NUTS version.""" return Path(settings.data_dir) / f"postalcode2nuts_NUTS-{settings.nuts_version}.db" @@ -1007,6 +1069,12 @@ def load_data() -> None: if extra_count: logger.info("Extra sources added %d entries (overwrite mode)", extra_count) + # NSPL (UK postcodes via ITL) — optional, no-op when nspl_url unset + if not timed_out and settings.nspl_url: + nspl_count = _load_nspl(client, settings.nspl_url, cache_dir) + if nspl_count > 0: + logger.info("Loaded %d entries for UK from NSPL", nspl_count) + # NUTS region names if not timed_out: 
_download_nuts_names(client) diff --git a/tests/test_api.py b/tests/test_api.py index 49f7cc8..5852924 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -195,7 +195,7 @@ def test_includes_patterns_version(self, client): resp = client.get("/health") data = resp.json() assert "patterns_version" in data - assert data["patterns_version"] == "1.2" + assert data["patterns_version"] == "1.3" def test_includes_nuts_names(self, client): resp = client.get("/health") diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index be0ae79..a5dd2fb 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -320,6 +320,59 @@ def test_skip_terminated_default_false_keeps_all_rows(self, monkeypatch): assert rows == 2 +class TestLoadNSPL: + @staticmethod + def _zip_bytes(csv_text, arcname="NSPL.csv"): + import io as _io + import zipfile + + buf = _io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr(arcname, csv_text) + return buf.getvalue() + + def test_populates_lookup_from_zip(self, tmp_path, monkeypatch): + monkeypatch.setattr(data_loader, "_lookup", {}) + csv_text = ( + "pcds,itl,doterm\n" + "SW1A 2AA,TLI32,\n" + "EC1A 1BB,TLI32,\n" + "M1 9NS,TLD46,202312\n" # terminated + ) + content = self._zip_bytes(csv_text) + + def handler(request): + return httpx.Response(200, content=content, headers={"ETag": '"v1"'}) + + client = httpx.Client(transport=httpx.MockTransport(handler)) + count = data_loader._load_nspl(client, "https://example.com/NSPL.zip", tmp_path) + assert count == 2 + assert data_loader._lookup[("UK", "SW1A2AA")] == "TLI32" + assert ("UK", "M19NS") not in data_loader._lookup + + def test_returns_zero_when_url_unset(self, tmp_path): + client = httpx.Client(transport=httpx.MockTransport(lambda r: httpx.Response(404))) + assert data_loader._load_nspl(client, "", tmp_path) == 0 + + def test_swallows_exceptions(self, tmp_path, monkeypatch): + monkeypatch.setattr(data_loader, "_lookup", {}) + + def handler(request): + raise 
httpx.ConnectError("boom") + + client = httpx.Client(transport=httpx.MockTransport(handler)) + assert data_loader._load_nspl(client, "https://example.com/x.zip", tmp_path) == 0 + + def test_non_zip_response_returns_zero(self, tmp_path, monkeypatch): + monkeypatch.setattr(data_loader, "_lookup", {}) + + def handler(request): + return httpx.Response(200, content=b"not a zip") + + client = httpx.Client(transport=httpx.MockTransport(handler)) + assert data_loader._load_nspl(client, "https://example.com/x.zip", tmp_path) == 0 + + class TestConditionalGet: def test_sends_conditional_headers_when_etag_known(self): captured = {} From 2678d06da0a4d83782eec3ed56ffc7fb790f5848 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:05:05 +0200 Subject: [PATCH 09/18] feat: build outward-code majority-vote index for Tier 3.5 --- app/data_loader.py | 30 ++++++++++++++++++++++++++++++ tests/conftest.py | 16 ++++++++++++++++ tests/test_data_loader.py | 26 ++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/app/data_loader.py b/app/data_loader.py index 6ed8985..c784db5 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -57,6 +57,10 @@ # NUTS region names: nuts_id -> name_latn _nuts_names: dict[str, str] = {} +# Outward-code index for lookup Tier 3.5 (UK): (country_code, outward) -> +# (nuts3, agreement_ratio). Built from _lookup by majority vote at load time. +_outward_lookup: dict[tuple[str, str], tuple[str, float]] = {} + # Staleness tracking _data_stale: bool = False _data_loaded_at: str = "" @@ -825,6 +829,29 @@ def _build_prefix_index() -> None: ) +def _build_outward_index(country_code: str) -> None: + """Populate _outward_lookup for one country by majority vote per outward code. + + Outward = the full normalised postcode minus its last three characters (UK + convention). Codes shorter than four characters are skipped (no meaningful + split). Used by lookup Tier 3.5 for outward-only or otherwise-unmatched input. 
+ """ + groups: dict[str, list[str]] = {} + for (cc, code), nuts3 in _lookup.items(): + if cc != country_code or len(code) < 4: + continue + outward = code[:-3] + groups.setdefault(outward, []).append(nuts3) + + for outward, nuts3_list in groups.items(): + counts = Counter(nuts3_list) + winner, count = counts.most_common(1)[0] + agreement = count / len(nuts3_list) + _outward_lookup[(country_code, outward)] = (winner, agreement) + if groups: + logger.info("Built outward index for %s: %d outward codes", country_code, len(groups)) + + def _estimate_by_prefix(cc: str, postal_code: str) -> dict | None: """Runtime estimation via longest prefix match + majority vote. @@ -996,6 +1023,7 @@ def load_data() -> None: _lookup.clear() _estimates.clear() _nuts_names.clear() + _outward_lookup.clear() _data_stale = False _extra_source_count = len(settings.extra_source_urls) @@ -1017,6 +1045,7 @@ def load_data() -> None: _revalidate_estimates() _load_nuts_names_from_db(db) _build_prefix_index() + _build_outward_index("UK") return _lookup.clear() @@ -1109,6 +1138,7 @@ def load_data() -> None: logger.warning("TERCET refresh failed — serving stale cache") _build_prefix_index() + _build_outward_index("UK") def _build_result(match_type: str, nuts3: str, nuts1: str = "", nuts2: str = "", **confidence) -> dict: diff --git a/tests/conftest.py b/tests/conftest.py index d93e956..0c57fe9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,6 +29,11 @@ ("YY", "1002"): "YY111", ("YY", "1003"): "YY111", ("YY", "2001"): "YY112", + # UK (ITL via NSPL): SW1A → TLI32 majority, EC1A → TLI32, M1 → TLD45 + ("UK", "SW1A2AA"): "TLI32", + ("UK", "SW1A1AA"): "TLI32", + ("UK", "EC1A1BB"): "TLI32", + ("UK", "M11AA"): "TLD45", } MOCK_ESTIMATES = { @@ -76,6 +81,12 @@ "AL0": "Shqipëria", "AL02": "Qender", "AL022": "Tiranë", + "TLI": "London", + "TLI3": "Inner London - East", + "TLI32": "Tower Hamlets and Newham", + "TLD": "North West (England)", + "TLD4": "Greater Manchester", + "TLD45": "Manchester", } 
@@ -94,6 +105,7 @@ def mock_data(): orig_single = data_loader._single_nuts3.copy() orig_synthetic = data_loader._synthetic_nuts.copy() orig_fallback = data_loader._country_fallback.copy() + orig_outward = data_loader._outward_lookup.copy() # Populate data_loader._lookup.clear() @@ -103,6 +115,8 @@ def mock_data(): data_loader._nuts_names.clear() data_loader._nuts_names.update(MOCK_NUTS_NAMES) data_loader._build_prefix_index() + data_loader._outward_lookup.clear() + data_loader._build_outward_index("UK") yield @@ -121,6 +135,8 @@ def mock_data(): data_loader._synthetic_nuts.update(orig_synthetic) data_loader._country_fallback.clear() data_loader._country_fallback.update(orig_fallback) + data_loader._outward_lookup.clear() + data_loader._outward_lookup.update(orig_outward) @pytest.fixture() diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index a5dd2fb..4a52356 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -1,6 +1,7 @@ """Tests for data_loader.py — normalize functions and lookup tiers.""" import httpx +import pytest from app import data_loader from app.data_loader import lookup, normalize_country, normalize_postal_code @@ -373,6 +374,31 @@ def handler(request): assert data_loader._load_nspl(client, "https://example.com/x.zip", tmp_path) == 0 +class TestBuildOutwardIndex: + def test_majority_vote(self, monkeypatch): + monkeypatch.setattr( + data_loader, + "_lookup", + { + ("UK", "SW1A2AA"): "TLI32", + ("UK", "SW1A1AA"): "TLI32", + ("UK", "SW1A0AA"): "TLI31", # minority + ("UK", "M11AA"): "TLD45", + ("UK", "M11AB"): "TLD45", + }, + ) + monkeypatch.setattr(data_loader, "_outward_lookup", {}) + data_loader._build_outward_index("UK") + assert data_loader._outward_lookup[("UK", "SW1A")] == ("TLI32", pytest.approx(2 / 3)) + assert data_loader._outward_lookup[("UK", "M1")] == ("TLD45", pytest.approx(1.0)) + + def test_skips_short_codes(self, monkeypatch): + monkeypatch.setattr(data_loader, "_lookup", {("UK", "AB1"): "TLC11"}) + 
monkeypatch.setattr(data_loader, "_outward_lookup", {}) + data_loader._build_outward_index("UK") + assert data_loader._outward_lookup == {} + + class TestConditionalGet: def test_sends_conditional_headers_when_etag_known(self): captured = {} From 3487042611f2ace4aef0799428cfde0cc2eb6dc7 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:05:46 +0200 Subject: [PATCH 10/18] feat: alias GB to UK for ISO 3166-1 input compatibility --- app/data_loader.py | 12 ++++++++++-- tests/test_api.py | 7 +++++++ tests/test_data_loader.py | 9 +++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/app/data_loader.py b/app/data_loader.py index c784db5..ef17803 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -82,9 +82,17 @@ def normalize_postal_code(code: str) -> str: def normalize_country(country_code: str) -> str: - """Normalize a country code: uppercase + map GR→EL (ISO vs GISCO convention).""" + """Normalize a country code: uppercase + map non-canonical aliases. + + GR → EL (ISO vs GISCO convention) + GB → UK (ISO vs NSPL/internal convention) + """ cc = country_code.strip().upper() - return "EL" if cc == "GR" else cc + if cc == "GR": + return "EL" + if cc == "GB": + return "UK" + return cc def get_lookup_table() -> dict[tuple[str, str], str]: diff --git a/tests/test_api.py b/tests/test_api.py index 5852924..b42bb24 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -22,6 +22,13 @@ def test_response_includes_code_system_nuts(self, client): assert resp.status_code == 200 assert resp.json()["code_system"] == "NUTS" + def test_lookup_accepts_gb_alias(self, client): + resp_uk = client.get("/lookup", params={"country": "UK", "postal_code": "SW1A 2AA"}) + resp_gb = client.get("/lookup", params={"country": "GB", "postal_code": "SW1A 2AA"}) + assert resp_uk.status_code == 200 + assert resp_gb.status_code == 200 + assert resp_uk.json() == resp_gb.json() + def test_400_unsupported_country(self, client): resp = client.get("/lookup", 
params={"postal_code": "12345", "country": "ZZ"}) assert resp.status_code == 400 diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index 4a52356..b2b8c06 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -46,6 +46,15 @@ def test_strips_whitespace(self): def test_el_stays_el(self): assert normalize_country("EL") == "EL" + def test_gb_to_uk(self): + assert normalize_country("GB") == "UK" + + def test_gb_lowercase(self): + assert normalize_country("gb") == "UK" + + def test_uk_stays_uk(self): + assert normalize_country("UK") == "UK" + # ── lookup tests (all 5 tiers) ────────────────────────────────────────────── From 5a9f7187ba370dc234a656283aa5a8d5e18a078b Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:08:08 +0200 Subject: [PATCH 11/18] feat: add outward-code lookup tier to lookup waterfall (UK) --- app/data_loader.py | 24 +++++++++++++++++++++++- tests/test_api.py | 7 +++++++ tests/test_data_loader.py | 27 +++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/app/data_loader.py b/app/data_loader.py index ef17803..6645380 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -1192,6 +1192,8 @@ def lookup(country_code: str, postal_code: str) -> dict | None: 1. Exact TERCET match → confidence 1.0 2. Pre-computed estimate → stored confidence per level 2b. Albania block map → district-block NUTS3, match_type='estimated' (#118) + 3.5. Outward-code lookup (UK) → majority-vote ITL3 for the outward code, + match_type='estimated', medium confidence (before generic prefix) 3. Runtime prefix-based estimation → calculated confidence 4. Country-level majority vote → unanimous NUTS1/2, dominant NUTS3 (e.g. MT) 5. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU) @@ -1200,7 +1202,7 @@ def lookup(country_code: str, postal_code: str) -> dict | None: Returns a dict with nuts1/2/3, match_type, and per-level confidence, or None. 
""" - from app.postal_patterns import extract_postal_code + from app.postal_patterns import extract_outward, extract_postal_code cc = normalize_country(country_code) @@ -1240,6 +1242,26 @@ def lookup(country_code: str, postal_code: str) -> dict | None: nuts3_confidence=conf["nuts3"], ) + # Tier 3.5: Outward-code lookup (UK and any country flagged outward_only). + # Placed before generic prefix estimation because the outward code is the + # meaningful UK boundary: a curated majority vote over the whole outward + # beats an arbitrary prefix match, and it yields match_type='estimated' with + # medium confidence. extract_outward returns None for non-outward countries, + # so this tier is inert for everything except UK. + outward = extract_outward(cc, postal_code) + if outward is not None: + outward_hit = _outward_lookup.get((cc, outward)) + if outward_hit is not None: + o_nuts3, _agreement = outward_hit + conf = settings.confidence_map["medium"] + return _build_result( + "estimated", + o_nuts3, + nuts1_confidence=conf["nuts1"], + nuts2_confidence=conf["nuts2"], + nuts3_confidence=conf["nuts3"], + ) + # Tier 3: Runtime prefix-based estimation approx = _estimate_by_prefix(cc, extracted) if approx is not None: diff --git a/tests/test_api.py b/tests/test_api.py index b42bb24..aa116d8 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -29,6 +29,13 @@ def test_lookup_accepts_gb_alias(self, client): assert resp_gb.status_code == 200 assert resp_uk.json() == resp_gb.json() + def test_uk_outward_only_input_returns_estimated(self, client): + resp = client.get("/lookup", params={"country": "UK", "postal_code": "SW1A"}) + assert resp.status_code == 200 + body = resp.json() + assert body["match_type"] == "estimated" + assert body["nuts3"] == "TLI32" + def test_400_unsupported_country(self, client): resp = client.get("/lookup", params={"postal_code": "12345", "country": "ZZ"}) assert resp.status_code == 400 diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py 
index b2b8c06..b95a32a 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -383,6 +383,33 @@ def handler(request): assert data_loader._load_nspl(client, "https://example.com/x.zip", tmp_path) == 0 +class TestUKOutwardLookup: + def test_outward_only_input_returns_estimated(self, mock_data): + # "SW1A" has no inward part; resolves via the outward majority-vote tier. + result = lookup("UK", "SW1A") + assert result is not None + assert result["nuts3"] == "TLI32" + assert result["match_type"] == "estimated" + assert result["nuts1_confidence"] == pytest.approx(0.90) + assert result["nuts2_confidence"] == pytest.approx(0.80) + assert result["nuts3_confidence"] == pytest.approx(0.70) + + def test_full_postcode_still_exact(self, mock_data): + result = lookup("UK", "SW1A 2AA") + assert result["match_type"] == "exact" + assert result["nuts3"] == "TLI32" + + def test_unlisted_full_postcode_resolves_via_outward(self, mock_data): + # Valid-format UK postcode not in the data → outward "SW1A" still resolves. 
+ result = lookup("UK", "SW1A 9ZZ") + assert result is not None + assert result["nuts3"] == "TLI32" + assert result["match_type"] == "estimated" + + def test_unknown_outward_returns_none(self, mock_data): + assert lookup("UK", "ZZ99") is None + + class TestBuildOutwardIndex: def test_majority_vote(self, monkeypatch): monkeypatch.setattr( From 95b85aba62e0937b44d4a2820606f8f284138cd1 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:09:41 +0200 Subject: [PATCH 12/18] feat: tag UK lookups with code_system=ITL via code prefix --- app/data_loader.py | 6 ++++++ tests/test_api.py | 5 +++++ tests/test_data_loader.py | 8 ++++++++ 3 files changed, 19 insertions(+) diff --git a/app/data_loader.py b/app/data_loader.py index 6645380..53fe21f 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -1154,10 +1154,16 @@ def _build_result(match_type: str, nuts3: str, nuts1: str = "", nuts2: str = "", If nuts1/nuts2 are not provided, they are derived from nuts3. Confidence keys: nuts1_confidence, nuts2_confidence, nuts3_confidence. + + code_system is derived from the code itself: ITL codes are the UK's + NUTS successor and uniquely carry the "TL" prefix (no NUTS country code is + "TL"), so every "TL…" result is tagged "ITL" and all others "NUTS". 
""" n1 = nuts1 or nuts3[:3] n2 = nuts2 or nuts3[:4] + code_system = "ITL" if nuts3[:2] == "TL" else "NUTS" return { + "code_system": code_system, "match_type": match_type, "nuts1": n1, "nuts1_confidence": confidence.get("nuts1_confidence", 1.0), diff --git a/tests/test_api.py b/tests/test_api.py index aa116d8..41f05e4 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -36,6 +36,11 @@ def test_uk_outward_only_input_returns_estimated(self, client): assert body["match_type"] == "estimated" assert body["nuts3"] == "TLI32" + def test_uk_response_has_code_system_itl(self, client): + resp = client.get("/lookup", params={"country": "UK", "postal_code": "SW1A 2AA"}) + assert resp.status_code == 200 + assert resp.json()["code_system"] == "ITL" + def test_400_unsupported_country(self, client): resp = client.get("/lookup", params={"postal_code": "12345", "country": "ZZ"}) assert resp.status_code == 400 diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index b95a32a..fe7a440 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -409,6 +409,14 @@ def test_unlisted_full_postcode_resolves_via_outward(self, mock_data): def test_unknown_outward_returns_none(self, mock_data): assert lookup("UK", "ZZ99") is None + def test_uk_result_tagged_itl(self, mock_data): + assert lookup("UK", "SW1A 2AA")["code_system"] == "ITL" + assert lookup("UK", "SW1A")["code_system"] == "ITL" + + def test_non_uk_result_tagged_nuts(self, mock_data): + assert lookup("AT", "1010")["code_system"] == "NUTS" + assert lookup("DE", "10118")["code_system"] == "NUTS" + class TestBuildOutwardIndex: def test_majority_vote(self, monkeypatch): From 34c28cd66579fd8a43b63be5c54c404f4a0f53ca Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:10:33 +0200 Subject: [PATCH 13/18] feat: load ITL region names from ONS Names-and-Codes CSVs --- app/data_loader.py | 49 +++++++++++++++++++++++++++++++++++++++ tests/test_data_loader.py | 36 ++++++++++++++++++++++++++++ 2 files 
changed, 85 insertions(+) diff --git a/app/data_loader.py b/app/data_loader.py index 53fe21f..814dce6 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -725,6 +725,51 @@ def _download_nuts_names(client: httpx.Client) -> int: return count +def _load_itl_names(client: httpx.Client, urls: list[str]) -> int: + """Fetch ONS ITL "Names and Codes" CSVs and merge them into _nuts_names. + + NSPL carries ITL codes but not names. Each ONS CSV pairs a code column with + a name column whose headers vary by release year (e.g. ITL321CD/ITL321NM at + level 3, ITL221CD/ITL221NM at level 2) — columns are matched by the CD/NM + suffix rather than exact name. Failures are logged and skipped, never raised. + """ + if not urls: + return 0 + total = 0 + for url in urls: + try: + resp = client.get(url, timeout=30, follow_redirects=True) + resp.raise_for_status() + text = resp.text + except httpx.HTTPError as exc: + logger.warning("ITL names fetch failed for %s: %s", url, exc) + continue + try: + reader = csv.DictReader(io.StringIO(text)) + fieldnames = [f.strip() for f in (reader.fieldnames or [])] + code_col = next( + (f for f in fieldnames if f.upper().endswith("CD") and "ITL" in f.upper()), + None, + ) + name_col = next( + (f for f in fieldnames if f.upper().endswith("NM") and "ITL" in f.upper()), + None, + ) + if not code_col or not name_col: + logger.warning("No ITL CD/NM columns in %s; headers=%s", url, fieldnames) + continue + for row in reader: + code = (row.get(code_col) or "").strip().upper() + name = (row.get(name_col) or "").strip() + if code and name: + _nuts_names[code] = name + total += 1 + except csv.Error as exc: + logger.warning("ITL names parse failed for %s: %s", url, exc) + logger.info("ITL names loaded: %d entries from %d URLs", total, len(urls)) + return total + + def _load_nuts_names_from_db(db: Path) -> bool: """Load NUTS region names from SQLite cache. 
Graceful if table is missing.""" try: @@ -1112,6 +1157,10 @@ def load_data() -> None: if nspl_count > 0: logger.info("Loaded %d entries for UK from NSPL", nspl_count) + # ITL region names (ONS Names-and-Codes) — optional, no-op when unset + if not timed_out and settings.itl_names_url_list: + _load_itl_names(client, settings.itl_names_url_list) + # NUTS region names if not timed_out: _download_nuts_names(client) diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index fe7a440..fe8731b 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -443,6 +443,42 @@ def test_skips_short_codes(self, monkeypatch): assert data_loader._outward_lookup == {} +class TestLoadITLNames: + def test_populates_nuts_names(self, monkeypatch): + monkeypatch.setattr(data_loader, "_nuts_names", {}) + + def handler(request): + body = "ITL321CD,ITL321NM\nTLI32,Tower Hamlets\nTLI31,Hackney and Newham\n" + return httpx.Response(200, content=body.encode()) + + client = httpx.Client(transport=httpx.MockTransport(handler)) + count = data_loader._load_itl_names(client, ["https://example.com/itl3.csv"]) + assert count == 2 + assert data_loader._nuts_names["TLI32"] == "Tower Hamlets" + + def test_empty_url_list_no_op(self): + client = httpx.Client(transport=httpx.MockTransport(lambda r: httpx.Response(404))) + assert data_loader._load_itl_names(client, []) == 0 + + def test_missing_columns_skipped(self, monkeypatch): + monkeypatch.setattr(data_loader, "_nuts_names", {}) + + def handler(request): + return httpx.Response(200, content=b"foo,bar\n1,2\n") + + client = httpx.Client(transport=httpx.MockTransport(handler)) + assert data_loader._load_itl_names(client, ["https://example.com/x.csv"]) == 0 + + def test_http_error_swallowed(self, monkeypatch): + monkeypatch.setattr(data_loader, "_nuts_names", {}) + + def handler(request): + raise httpx.ConnectError("boom") + + client = httpx.Client(transport=httpx.MockTransport(handler)) + assert data_loader._load_itl_names(client, 
["https://example.com/x.csv"]) == 0 + + class TestConditionalGet: def test_sends_conditional_headers_when_etag_known(self): captured = {} From 2741093ac63ff33f7bc07e1318ca0aa84d5b48bf Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:10:54 +0200 Subject: [PATCH 14/18] test: confirm NSPL failure does not block TERCET serving --- tests/test_data_loader.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index fe8731b..349e4e6 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -382,6 +382,21 @@ def handler(request): client = httpx.Client(transport=httpx.MockTransport(handler)) assert data_loader._load_nspl(client, "https://example.com/x.zip", tmp_path) == 0 + def test_nspl_failure_does_not_block_tercet(self, tmp_path, monkeypatch): + """If NSPL is unreachable, previously-loaded TERCET data must still serve.""" + monkeypatch.setattr(data_loader, "_lookup", {("AT", "1010"): "AT130"}) + + def handler(request): + raise httpx.ConnectError("ons unavailable") + + client = httpx.Client(transport=httpx.MockTransport(handler)) + nspl_count = data_loader._load_nspl(client, "https://ons.invalid/nspl.zip", tmp_path) + assert nspl_count == 0 + # AT lookup must still work (TERCET data untouched) + result = data_loader.lookup("AT", "1010") + assert result is not None + assert result["nuts3"] == "AT130" + class TestUKOutwardLookup: def test_outward_only_input_returns_estimated(self, mock_data): From 63bb1a3dbc2d45d4116b44af40ec9aa9322d83de Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:14:00 +0200 Subject: [PATCH 15/18] docs: document UK/ITL support, outward-code tier, OGL attribution --- README.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f8824c0..8d6a165 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,16 @@ Faroe Islands (FO) — not part of NUTS; synthetic 
result. > **Faroe Islands** is an autonomous Danish territory with no NUTS coverage and no GISCO TERCET file. Lookups for FO are served by a synthetic single-region fallback (Tier 6) configured via `synthetic_nuts_fallback` in `app/settings.json`, returning `FO0` / `FO00` / `FO000` with `match_type="approximate"` and capped confidence (`0.90` / `0.85` / `0.80`) for any well-formed 3-digit code. The code is fabricated, not derived from a real NUTS dataset — contrast Montenegro's `ME000`, which is a genuine single-region NUTS code. +### United Kingdom (ITL) + +The UK left the EU, so it is no longer part of NUTS. Its successor classification, **ITL (International Territorial Level)**, is published by the ONS and mapped to postcodes via the [National Statistics Postcode Lookup (NSPL)](https://geoportal.statistics.gov.uk/). When configured (see `PC2NUTS_NSPL_URL`), the service accepts UK postcodes (`country=UK`, or `country=GB` as an alias) and returns ITL1/2/3 codes in the same `nuts1/2/3` fields, with `code_system: "ITL"` to distinguish them. + +ITL is **not** a drop-in for NUTS-2016 UK: it diverges at L2 (41 vs 40 regions) and L3 (179 vs 174), and ONS discontinued the bidirectional NUTS↔ITL lookups in 2023. Branch on `code_system` when comparing UK results against historical NUTS-UK data. + +UK coverage is **optional and operator-configured** — the ~178 MB NSPL ZIP is not bundled. When `PC2NUTS_NSPL_URL` is unset (the default), UK is unsupported and returns the standard `400`. Outward-code-only input (e.g. `SW1A`) resolves to the majority ITL3 for that outward code with `estimated`/medium confidence. + +> **Out of scope:** Crown Dependencies (Jersey JE, Guernsey GG, Isle of Man IM) and Gibraltar (GI) use UK-style postcodes but are not in ITL geography or NSPL, and are not supported — lookups for those country codes return `400`. 
+ ## Deployment tiers PostalCode2NUTS runs in one of two tiers, chosen at deploy time by a single config @@ -157,6 +167,7 @@ GET /lookup?country=AT&postal_code=A-1010 { "postal_code": "A-1010", "country_code": "AT", + "code_system": "NUTS", "match_type": "exact", "nuts1": "AT1", "nuts1_name": "Ostösterreich", @@ -193,10 +204,37 @@ GET /lookup?country=AT&postal_code=1012 } ``` +**Example — UK postcode (ITL):** + +``` +GET /lookup?country=UK&postal_code=SW1A%202AA +``` + +```json +{ + "postal_code": "SW1A2AA", + "country_code": "UK", + "code_system": "ITL", + "match_type": "exact", + "nuts1": "TLI", + "nuts1_name": "London", + "nuts1_confidence": 1.0, + "nuts2": "TLI3", + "nuts2_name": "Inner London - East", + "nuts2_confidence": 1.0, + "nuts3": "TLI32", + "nuts3_name": "Tower Hamlets and Newham", + "nuts3_confidence": 1.0 +} +``` + +`country=GB` is accepted as an alias for `UK`. See [United Kingdom (ITL)](#united-kingdom-itl) for the NUTS-vs-ITL distinction. + Every response includes: | Field | Description | |-------|-------------| +| `code_system` | Territorial scheme of the `nuts{1,2,3}` fields: `NUTS` for EU/EFTA/candidate data, `ITL` for UK data (see [United Kingdom (ITL)](#united-kingdom-itl)) | | `match_type` | How the result was determined: `exact`, `estimated`, or `approximate` | | `nuts{1,2,3}_name` | Human-readable region name (Latin script), or `null` if unavailable | | `nuts{1,2,3}_confidence` | Confidence score (0.0–1.0) for each NUTS level | @@ -432,6 +470,7 @@ User input: "Traiskirchen" | SI | 4 digits | SI- | `1000`, `SI-1000` | | SK | 3 digits + optional space + 2 digits | SK- | `81101`, `811 01`, `SK-81101` | | TR | 5 digits | TR- | `06100`, `TR-06100`, `34000` | +| UK | 1–2 letters + digit + optional letter/digit + optional space + digit + 2 letters (ITL via NSPL; requires `PC2NUTS_NSPL_URL`) | GB accepted as alias | `SW1A 2AA`, `EC1A 1BB`, `M1 1AA`, `B33 8TH`, `SW1A` (outward only) | ## Configuration @@ -444,6 +483,8 @@ All settings are 
overridable via environment variables prefixed with `PC2NUTS_`: | `PC2NUTS_DB_CACHE_TTL_DAYS` | `30` | Days between automatic TERCET data refreshes. If the refresh fails, the service falls back to the previous data and sets `data_stale: true` in the health endpoint. | | `PC2NUTS_ESTIMATES_CSV` | `./tercet_missing_codes.csv` | Path to the estimates CSV. Loaded automatically at startup if the file exists. | | `PC2NUTS_EXTRA_SOURCES` | *(empty)* | Comma-separated list of ZIP URLs containing additional postal code data. Loaded after TERCET; entries overwrite TERCET data. | +| `PC2NUTS_NSPL_URL` | *(empty)* | URL to the latest [NSPL](https://geoportal.statistics.gov.uk/) ZIP from the ONS Open Geography Portal. Enables UK (ITL) support; when unset, UK is unsupported. The URL changes each quarterly release, so update it accordingly. | +| `PC2NUTS_ITL_NAMES_URLS` | *(empty)* | Comma-separated list of ONS "Names and Codes" CSV URLs (one per ITL level) that supply UK region names. Loaded after NSPL. | | `PC2NUTS_RATE_LIMIT` | `120/minute` | Rate limit for `/lookup` and `/pattern` endpoints. Uses [slowapi](https://github.com/laurentS/slowapi) syntax (e.g. `100/minute`, `5/second`). `/health` is exempt. The default leaves comfortable headroom under the measured aggregate ceiling (~30 RPS) — see [`docs/performance.md`](docs/performance.md) for the rationale. | | `PC2NUTS_RATE_LIMIT_HEADERS` | `true` | When `true`, `429` responses include `Retry-After` and `X-RateLimit-Limit` / `X-RateLimit-Remaining` headers. | | `PC2NUTS_CACHE_MAX_AGE` | `3600` | `Cache-Control: public, max-age=` (seconds) set on `/lookup`, `/pattern`, and `/` responses. | @@ -648,6 +689,10 @@ Each estimate carries a confidence label (high / medium / low) that is mapped to Confidence is higher at coarser NUTS levels because neighbouring codes are more likely to share the same NUTS1 region than the same NUTS3 region. 
+### UK outward-code lookup (`match_type: "estimated"`) — UK only + +For UK postcodes (loaded from NSPL — see [United Kingdom (ITL)](#united-kingdom-itl)), when the full postcode is not an exact match, the service looks up the **outward code** — everything before the final three characters, e.g. `SW1A` for `SW1A 2AA`. An index built at load time maps each outward code to the majority-vote ITL3 among all postcodes sharing it. This runs ahead of the generic prefix approximation below because the outward code is the meaningful UK boundary. It also handles outward-only input (`SW1A` submitted alone). Confidence uses the medium tier (NUTS1 0.90 / NUTS2 0.80 / NUTS3 0.70), since one outward code can straddle two adjacent ITL3 regions in dense urban areas. + ### Tier 3: Runtime approximation (`match_type: "approximate"`) If neither an exact match nor a pre-computed estimate exists, the service performs a runtime estimation using prefix matching against all known TERCET codes for that country. @@ -1010,7 +1055,7 @@ Optional `tercet_map` field for countries where the TERCET key differs from the } ``` -Supported `tercet_map` actions: `truncate:N`, `prepend:XX`, `keep_alpha`. +Supported `tercet_map` actions: `truncate:N`, `prepend:XX`, `keep_alpha`, `outward_only` (marks a country for outward-code fallback, as used by UK). ### 3. `README.md` — update coverage section @@ -1023,10 +1068,14 @@ Add the country to the appropriate group (EU, EFTA, or candidate) and add a row No Python code changes are required. +> **Non-GISCO sources** (currently only the UK via NSPL) are different: they require a dedicated loader path and configuration (a source ZIP URL and any names files), not just a JSON edit — and must **not** be added to `settings.json` `countries`, or the GISCO loader would waste requests guessing non-existent TERCET URLs. See `_load_nspl` and `_load_itl_names` in `app/data_loader.py` for the NSPL precedent. 
+ ## Data sources & attribution **Postal code → NUTS (both tiers).** [GISCO TERCET flat files](https://ec.europa.eu/eurostat/web/gisco/geodata/administrative-units/postal-codes) ([download](https://gisco-services.ec.europa.eu/tercet/flat-files)), © European Union – GISCO, licensed [CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/). Albanian NUTS3 assignments come from the country's official postal-code block-allocation scheme (Posta Shqiptare), cross-validated against [GeoNames](https://www.geonames.org/) admin1 tagging ([CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)). +**UK postal code → ITL (optional).** [ONS National Statistics Postcode Lookup (NSPL)](https://geoportal.statistics.gov.uk/) and the ONS ITL "Names and Codes" files, © Crown copyright and database right, licensed under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). Contains OS data © Crown copyright and database right. Loaded only when `PC2NUTS_NSPL_URL` is configured. + The [EU Open Data Portal dataset](https://data.europa.eu/data/datasets/postcodes-and-nuts-nomenclature-of-territorial-units-for-statistics) was also considered as a data source. However, its refresh cycle lags behind the GISCO TERCET flat files, so direct sourcing from GISCO was chosen for more up-to-date coverage. **Address → geocode → NUTS (Full tier only).** The optional geocoding tier relies on: From 6088941b1696b8c85eb1051a5006306741d3f331 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:15:38 +0200 Subject: [PATCH 16/18] chore: release v1.1.0 (UK/ITL support #7) --- CHANGELOG.md | 20 ++++++++++++++++++++ app/__init__.py | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2135a0f..0f76374 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,8 +6,28 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). 
## [Unreleased] +## [1.1.0] - 2026-07-03 + ### Added +- **United Kingdom (ITL) support** (#7): the service can now resolve UK postcodes + to [ITL](https://www.ons.gov.uk/methodology/geography/ukgeographies/eurostat) + (International Territorial Level) codes — the UK's post-Brexit successor to + NUTS. Sourced from the ONS [National Statistics Postcode Lookup + (NSPL)](https://geoportal.statistics.gov.uk/), loaded only when + `PC2NUTS_NSPL_URL` is configured (the ~178 MB dataset is not bundled). UK is + treated as a parallel data channel: it reuses the same in-memory lookup, SQLite + cache, and waterfall as TERCET, and an NSPL failure never blocks TERCET serving. + - New response field **`code_system`** (`"NUTS"` | `"ITL"`) on `/lookup` + (additive, non-breaking) marks which scheme the `nuts1/2/3` fields carry. + ITL diverges from NUTS-2016 UK at L2/L3, so consumers should branch on it. + - **`country=GB` accepted** as an alias for `UK` (like `GR → EL`). + - **Outward-code lookup**: outward-only input (e.g. `SW1A`) or an unlisted + full postcode resolves to the majority-vote ITL3 for that outward code with + `match_type="estimated"` and medium confidence. + - New config: `PC2NUTS_NSPL_URL`, `PC2NUTS_ITL_NAMES_URLS`. `patterns_version` + bumped to `1.3`. Crown Dependencies (JE/GG/IM) and Gibraltar (GI) are out of + scope and return `400`. - **Albania coverage completeness** (#118): AL postal codes now resolve via the official postal-code block-allocation scheme (`app/albania_blocks.py`) instead of the incomplete GeoNames estimates. 
A code maps to its NUTS3 region by its diff --git a/app/__init__.py b/app/__init__.py index 5becc17..6849410 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1 +1 @@ -__version__ = "1.0.0" +__version__ = "1.1.0" From 03c267f02c565177f28f5da7ab64439cde9ad3c8 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:17:35 +0200 Subject: [PATCH 17/18] style: apply ruff format to data_loader --- app/data_loader.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/app/data_loader.py b/app/data_loader.py index 814dce6..e5528dc 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -363,9 +363,7 @@ def _parse_csv_content( return count -def _download_zip_conditional( - client: httpx.Client, url: str, cached_meta: dict -) -> httpx.Response: +def _download_zip_conditional(client: httpx.Client, url: str, cached_meta: dict) -> httpx.Response: """Download with conditional-GET headers; returns the raw httpx.Response. cached_meta keys: 'etag' and 'last_modified' (either may be absent). 
The From 9050047b387404b9fef6216392393734d9a23adc Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 22:25:27 +0200 Subject: [PATCH 18/18] fix: address Codex review on UK/ITL (#7) - outward miss for outward_only countries no longer falls through to generic prefix estimation (would answer from an arbitrary 1-2 char prefix) - reuse cached nspl.zip when the NSPL fetch transiently fails, so an ONS outage does not silently drop UK support on a rebuild - bust the SQLite fast-path cache when NSPL/ITL-names config changes, so enabling/disabling/swapping the URL takes effect without waiting for TTL --- .gitignore | 3 + app/data_loader.py | 118 ++++++++++++++++++++++++---------- tests/test_api.py | 12 +--- tests/test_config.py | 1 + tests/test_data_loader.py | 65 +++++++++++++++++-- tests/test_postal_patterns.py | 3 + 6 files changed, 154 insertions(+), 48 deletions(-) diff --git a/.gitignore b/.gitignore index 119d6e8..1a4ca0e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,6 @@ local-data/ # Private planning / SDD artifacts — infra context + operator token; never commit or push docs/superpowers/ .superpowers/ + +# Local SQLite cache artifacts +postalcode2nuts_NUTS-*.db diff --git a/app/data_loader.py b/app/data_loader.py index e5528dc..b81d4b3 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -155,6 +155,20 @@ def _extra_sources_hash() -> str: return hashlib.sha256(joined.encode()).hexdigest()[:16] +def _nspl_config_hash() -> str: + """SHA-256 hash (16 hex chars) of the NSPL / ITL-names configuration. + + Returns empty string when NSPL is unconfigured, so a TERCET-only deployment's + cache stays valid. Enabling, disabling, or changing PC2NUTS_NSPL_URL / + PC2NUTS_ITL_NAMES_URLS changes the hash, busting the fast-path cache so UK + rows are added (or dropped) on the next load instead of after TTL expiry. 
+ """ + if not settings.nspl_url and not settings.itl_names_url_list: + return "" + joined = settings.nspl_url + "|" + ",".join(settings.itl_names_url_list) + return hashlib.sha256(joined.encode()).hexdigest()[:16] + + def _load_extra_sources(client: httpx.Client, cache_dir: Path, *, deadline: float = 0) -> int: """Download and parse extra data source ZIPs. Returns total entries written.""" global _extra_source_count @@ -477,12 +491,47 @@ def _download_and_parse_zip( return total +def _parse_nspl_zip(content: bytes) -> int: + """Parse NSPL ZIP bytes and load live UK postcode → ITL3 rows into _lookup. + + Returns the number of rows added. Raises zipfile.BadZipFile for invalid input. + """ + total = 0 + with zipfile.ZipFile(io.BytesIO(content)) as zf: + for name in zf.namelist(): + # The postcode CSV is the "NSPL*.csv" (real releases ship it under + # Data/); other bundled CSVs (user guide, column lookups) lack the + # pcds/itl columns and are ignored by _parse_csv_content anyway. + if not name.lower().endswith(".csv") or "nspl" not in name.lower(): + continue + file_size = zf.getinfo(name).file_size + if file_size > _MAX_NSPL_UNCOMPRESSED_SIZE: + logger.warning( + "Skipping %s: uncompressed size %d exceeds NSPL limit %d", + name, + file_size, + _MAX_NSPL_UNCOMPRESSED_SIZE, + ) + continue + raw = zf.read(name) + for enc in ("utf-8-sig", "utf-8", "latin-1"): + try: + text = raw.decode(enc) + break + except UnicodeDecodeError: + continue + total += _parse_csv_content(text, "UK", overwrite=False, skip_terminated=True) + return total + + def _load_nspl(client: httpx.Client, url: str, cache_dir: Path) -> int: """Fetch the NSPL ZIP and load UK postcode → ITL3 entries into _lookup. - Returns the number of rows added. Returns 0 when url is empty or any error - occurs — an NSPL failure must never block TERCET-only operation. Terminated - postcodes (non-blank DOTERM) are filtered out. UK is registered in the loaded + Returns the number of rows added. 
Returns 0 when url is empty. An NSPL + failure must never block TERCET-only operation, so on a fetch/parse failure + the previously-cached nspl.zip is reused when present (a transient ONS outage + must not silently drop UK support for a configured deployment). Terminated + postcodes (non-blank DOTERM) are filtered out. UK registers in the loaded country set automatically because its rows land in _lookup. """ if not url: @@ -491,47 +540,38 @@ def _load_nspl(client: httpx.Client, url: str, cache_dir: Path) -> int: try: resp = _download_zip_conditional(client, url, {}) if resp.status_code == 304: - # Unchanged upstream — nothing to (re)load this run. - return 0 + # Unchanged upstream — reload from the cached copy if we have one. + return _load_nspl_from_cache(cache_path) resp.raise_for_status() content = resp.content if not zipfile.is_zipfile(io.BytesIO(content)): - logger.warning("NSPL response from %s is not a valid ZIP, skipping", url) - return 0 + logger.warning("NSPL response from %s is not a valid ZIP", url) + return _load_nspl_from_cache(cache_path) try: cache_path.write_bytes(content) except OSError as exc: logger.warning("Failed to cache NSPL ZIP: %s", exc) - - total = 0 - with zipfile.ZipFile(io.BytesIO(content)) as zf: - for name in zf.namelist(): - # The postcode CSV is the "NSPL*.csv" (real releases ship it under - # Data/); other bundled CSVs (user guide, column lookups) lack the - # pcds/itl columns and are ignored by _parse_csv_content anyway. 
- if not name.lower().endswith(".csv") or "nspl" not in name.lower(): - continue - file_size = zf.getinfo(name).file_size - if file_size > _MAX_NSPL_UNCOMPRESSED_SIZE: - logger.warning( - "Skipping %s: uncompressed size %d exceeds NSPL limit %d", - name, - file_size, - _MAX_NSPL_UNCOMPRESSED_SIZE, - ) - continue - raw = zf.read(name) - for enc in ("utf-8-sig", "utf-8", "latin-1"): - try: - text = raw.decode(enc) - break - except UnicodeDecodeError: - continue - total += _parse_csv_content(text, "UK", overwrite=False, skip_terminated=True) + total = _parse_nspl_zip(content) logger.info("NSPL loaded: %d live UK postcodes", total) return total except (httpx.HTTPError, zipfile.BadZipFile, OSError) as exc: - logger.warning("NSPL load failed: %s", exc) + logger.warning("NSPL fetch failed (%s); trying cached copy", exc) + return _load_nspl_from_cache(cache_path) + + +def _load_nspl_from_cache(cache_path: Path) -> int: + """Load UK rows from a previously-cached nspl.zip. Returns 0 if unavailable.""" + if not cache_path.is_file(): + return 0 + try: + content = cache_path.read_bytes() + if not zipfile.is_zipfile(io.BytesIO(content)): + return 0 + total = _parse_nspl_zip(content) + logger.info("NSPL loaded from cache: %d live UK postcodes", total) + return total + except (zipfile.BadZipFile, OSError) as exc: + logger.warning("Cached NSPL ZIP unusable: %s", exc) return 0 @@ -564,6 +604,10 @@ def _db_is_valid(db: Path) -> bool: if stored_hash != _extra_sources_hash(): logger.info("Extra sources configuration changed, will rebuild") return False + # Check if NSPL / ITL-names configuration changed (enable/disable/URL swap) + if meta.get("nspl_config_hash", "") != _nspl_config_hash(): + logger.info("NSPL configuration changed, will rebuild") + return False return True except (sqlite3.Error, KeyError, ValueError) as exc: logger.info("DB cache unusable (%s), will rebuild", exc) @@ -1038,6 +1082,7 @@ def _save_to_db(db: Path) -> None: ("estimate_count", str(len(_estimates))), 
("nuts_names_count", str(len(_nuts_names))), ("extra_sources_hash", _extra_sources_hash()), + ("nspl_config_hash", _nspl_config_hash()), ], ) con.commit() @@ -1314,6 +1359,11 @@ def lookup(country_code: str, postal_code: str) -> dict | None: nuts2_confidence=conf["nuts2"], nuts3_confidence=conf["nuts3"], ) + # Outward is the authoritative boundary for outward_only countries. A + # miss means the code isn't in NSPL — stop here rather than fall through + # to generic prefix estimation, which would answer from an arbitrary 1–2 + # character prefix (e.g. "SW" for an unknown SW99, mixing distinct ITL3s). + return None # Tier 3: Runtime prefix-based estimation approx = _estimate_by_prefix(cc, extracted) diff --git a/tests/test_api.py b/tests/test_api.py index 41f05e4..8aac62d 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -454,9 +454,7 @@ def test_200_on_successful_refresh(self, trusted_client, monkeypatch): ) async def fake_refresh(client=None): - return RefreshResult( - status="refreshed", previous_count=7000, new_count=7042, skipped_rows=0 - ) + return RefreshResult(status="refreshed", previous_count=7000, new_count=7042, skipped_rows=0) monkeypatch.setattr(estimates_refresh, "refresh_estimates_once", fake_refresh) @@ -483,9 +481,7 @@ def test_200_on_unchanged_refresh(self, trusted_client, monkeypatch): ) async def fake_refresh(client=None): - return RefreshResult( - status="unchanged", previous_count=7000, new_count=7000, skipped_rows=0 - ) + return RefreshResult(status="unchanged", previous_count=7000, new_count=7000, skipped_rows=0) monkeypatch.setattr(estimates_refresh, "refresh_estimates_once", fake_refresh) @@ -510,9 +506,7 @@ def test_502_on_failed_refresh(self, trusted_client, monkeypatch): ) async def fake_refresh(client=None): - return RefreshResult( - status="failed", previous_count=7000, new_count=7000, reason="http=503" - ) + return RefreshResult(status="failed", previous_count=7000, new_count=7000, reason="http=503") 
monkeypatch.setattr(estimates_refresh, "refresh_estimates_once", fake_refresh) diff --git a/tests/test_config.py b/tests/test_config.py index 8af0aaf..cdbb8ae 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -61,6 +61,7 @@ def test_interval_negative_is_rejected(self, monkeypatch): def test_synthetic_nuts_fallback_has_fo(): from app.config import settings + assert settings.synthetic_nuts_fallback.get("FO") == "FO000" diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index 349e4e6..7166c92 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -183,6 +183,7 @@ def test_tier6_fo_rejects_two_digit(self, mock_data): def test_tier6_fo_in_loaded_countries(self, mock_data): from app.data_loader import get_loaded_countries + assert "FO" in get_loaded_countries() @@ -272,8 +273,18 @@ def test_al_stays_in_loaded_countries(self): class TestBundledAlbaniaData: VALID_AL_NUTS3 = { - "AL011", "AL012", "AL013", "AL014", "AL015", "AL021", - "AL022", "AL031", "AL032", "AL033", "AL034", "AL035", + "AL011", + "AL012", + "AL013", + "AL014", + "AL015", + "AL021", + "AL022", + "AL031", + "AL032", + "AL033", + "AL034", + "AL035", } def test_no_al_rows_remain_in_estimates_csv(self): @@ -382,6 +393,19 @@ def handler(request): client = httpx.Client(transport=httpx.MockTransport(handler)) assert data_loader._load_nspl(client, "https://example.com/x.zip", tmp_path) == 0 + def test_transient_failure_reuses_cached_zip(self, tmp_path, monkeypatch): + monkeypatch.setattr(data_loader, "_lookup", {}) + # Seed the on-disk cache from a prior successful run. 
+ (tmp_path / "nspl.zip").write_bytes(self._zip_bytes("pcds,itl,doterm\nSW1A 2AA,TLI32,\n")) + + def handler(request): + raise httpx.ConnectError("ons down") + + client = httpx.Client(transport=httpx.MockTransport(handler)) + count = data_loader._load_nspl(client, "https://example.com/x.zip", tmp_path) + assert count == 1 + assert data_loader._lookup[("UK", "SW1A2AA")] == "TLI32" + def test_nspl_failure_does_not_block_tercet(self, tmp_path, monkeypatch): """If NSPL is unreachable, previously-loaded TERCET data must still serve.""" monkeypatch.setattr(data_loader, "_lookup", {("AT", "1010"): "AT130"}) @@ -424,6 +448,11 @@ def test_unlisted_full_postcode_resolves_via_outward(self, mock_data): def test_unknown_outward_returns_none(self, mock_data): assert lookup("UK", "ZZ99") is None + def test_outward_miss_does_not_fall_through_to_prefix(self, mock_data): + # "SW99 9ZZ" shares the "SW" prefix with SW1A… but SW99 is not a known + # outward; must NOT return an arbitrary prefix-based ITL3 — stop instead. 
+ assert lookup("UK", "SW99 9ZZ") is None + def test_uk_result_tagged_itl(self, mock_data): assert lookup("UK", "SW1A 2AA")["code_system"] == "ITL" assert lookup("UK", "SW1A")["code_system"] == "ITL" @@ -494,6 +523,34 @@ def handler(request): assert data_loader._load_itl_names(client, ["https://example.com/x.csv"]) == 0 +class TestNSPLConfigHash: + def test_empty_when_unconfigured(self, monkeypatch): + monkeypatch.setattr(data_loader.settings, "nspl_url", "", raising=False) + monkeypatch.setattr(data_loader.settings, "itl_names_urls", "", raising=False) + assert data_loader._nspl_config_hash() == "" + + def test_changes_when_url_set(self, monkeypatch): + monkeypatch.setattr(data_loader.settings, "nspl_url", "", raising=False) + monkeypatch.setattr(data_loader.settings, "itl_names_urls", "", raising=False) + empty = data_loader._nspl_config_hash() + monkeypatch.setattr(data_loader.settings, "nspl_url", "https://ons/nspl.zip", raising=False) + assert data_loader._nspl_config_hash() != empty + assert data_loader._nspl_config_hash() != "" + + def test_db_invalidated_when_nspl_config_changes(self, tmp_path, monkeypatch): + monkeypatch.setattr(data_loader.settings, "nspl_url", "", raising=False) + monkeypatch.setattr(data_loader.settings, "itl_names_urls", "", raising=False) + monkeypatch.setattr(data_loader, "_lookup", {("AT", "1010"): "AT130"}) + monkeypatch.setattr(data_loader, "_estimates", {}) + monkeypatch.setattr(data_loader, "_nuts_names", {}) + db = tmp_path / "cache.db" + data_loader._save_to_db(db) + assert data_loader._db_is_valid(db) is True + # Operator now enables NSPL → cache must be considered invalid. 
+ monkeypatch.setattr(data_loader.settings, "nspl_url", "https://ons/nspl.zip", raising=False) + assert data_loader._db_is_valid(db) is False + + class TestConditionalGet: def test_sends_conditional_headers_when_etag_known(self): captured = {} @@ -507,9 +564,7 @@ def handler(request: httpx.Request) -> httpx.Response: "etag": '"abc123"', "last_modified": "Wed, 01 Jan 2025 00:00:00 GMT", } - result = data_loader._download_zip_conditional( - client, "https://example.com/foo.zip", cached_meta - ) + result = data_loader._download_zip_conditional(client, "https://example.com/foo.zip", cached_meta) assert result.status_code == 304 assert captured["headers"]["if-none-match"] == '"abc123"' assert captured["headers"]["if-modified-since"] == "Wed, 01 Jan 2025 00:00:00 GMT" diff --git a/tests/test_postal_patterns.py b/tests/test_postal_patterns.py index 4f57392..918497e 100644 --- a/tests/test_postal_patterns.py +++ b/tests/test_postal_patterns.py @@ -151,10 +151,12 @@ def test_me_with_space_prefix(self): class TestFaroeIslands: def test_bare_three_digits(self): from app.postal_patterns import extract_postal_code + assert extract_postal_code("FO", "100") == "100" def test_prefixed_variants(self): from app.postal_patterns import extract_postal_code + assert extract_postal_code("FO", "FO-100") == "100" assert extract_postal_code("FO", "FO 100") == "100" assert extract_postal_code("FO", "FO100") == "100" @@ -163,6 +165,7 @@ def test_rejects_non_three_digit(self): # No regex match → falls back to normalize_postal_code(cleaned), # which is NOT a clean 3-digit extraction. from app.postal_patterns import _COMPILED + pat = _COMPILED["FO"] assert pat.match("1234") is None assert pat.match("ABC") is None