From dece7f5646f575d51869369b328fce941334ef62 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 21:37:36 +0200 Subject: [PATCH] fix: reject unallocated AL postal-code prefixes instead of range-filling (#118) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The block resolver used a continuous bisect, so any 4-digit code >=1000 resolved to its enclosing district block — including codes in prefixes that belong to no district (e.g. 1900, 9999), which were returned as high-confidence regions despite not existing. Key on the allocated 2-digit district prefix instead: real codes and every #118 gap code still resolve identically (the golden test is unchanged), but non-existent codes now return not-found rather than a fabricated region. Flagged by Codex review on #133. --- CHANGELOG.md | 13 +++++++------ README.md | 2 +- app/albania_blocks.py | 35 ++++++++++++++++------------------- tests/test_albania_blocks.py | 20 ++++++++++++++++---- 4 files changed, 40 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9643e5f..2135a0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,12 +10,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). - **Albania coverage completeness** (#118): AL postal codes now resolve via the official postal-code block-allocation scheme (`app/albania_blocks.py`) instead - of the incomplete GeoNames estimates. Every well-formed 4-digit AL code maps to - its NUTS3 region by district block — codes GeoNames omitted (e.g. Tirana 1055, - and whole districts like Gramsh 33xx / Peqin 35xx / Tepelenë 63xx / Përmet - 64xx) no longer 404. Validated to reproduce all 489 previously-shipped codes - identically. Because the map is code, not data, AL coverage is now immune to - the `PC2NUTS_ESTIMATES_REFRESH_URL` full-replace clobber. + of the incomplete GeoNames estimates. A code maps to its NUTS3 region by its + allocated district prefix — codes GeoNames omitted (e.g. 
Tirana 1055, and whole + districts like Gramsh 33xx / Peqin 35xx / Tepelenë 63xx / Përmet 64xx) no + longer 404, while codes whose prefix belongs to no district (non-existent) return + not-found rather than a fabricated region. Validated to reproduce all 489 + previously-shipped codes identically. Because the map is code, not data, AL + coverage is now immune to the `PC2NUTS_ESTIMATES_REFRESH_URL` full-replace clobber. ## [1.0.0] - 2026-07-03 diff --git a/README.md b/README.md index e43d0ee..f8824c0 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Albania (AL), North Macedonia (MK), Montenegro (ME), Serbia (RS), Türkiye (TR) > **Montenegro** is treated by Eurostat as a single nationwide unit at every NUTS level (`ME0` / `ME00` / `ME000`), and GISCO does not currently publish a TERCET file for it. Lookups for ME are served by the single-NUTS3 fallback (Tier 5) configured via `single_nuts3_fallback` in `app/settings.json`, returning `ME000` for any valid 5-digit code starting with `8`. -> **Albania** has a full NUTS hierarchy (`AL0`; `AL01` / `AL02` / `AL03`; 12 NUTS3 counties `AL011`–`AL035`) but Eurostat publishes no GISCO TERCET file for it. Coverage is provided by an authoritative postal-code **block resolver** (`app/albania_blocks.py`): Albanian codes are block-allocated by district — the first two digits identify one of ~33 postal districts, each belonging to one of the 12 NUTS3 qarks — so **any** well-formed 4-digit code resolves to its qark via the block it falls into. Lookups return `match_type="estimated"` with `high` confidence — see [Estimates](#estimates). +> **Albania** has a full NUTS hierarchy (`AL0`; `AL01` / `AL02` / `AL03`; 12 NUTS3 counties `AL011`–`AL035`) but Eurostat publishes no GISCO TERCET file for it. 
Coverage is provided by an authoritative postal-code **block resolver** (`app/albania_blocks.py`): Albanian codes are block-allocated by district — the first two digits identify one of ~33 postal districts, each belonging to one of the 12 NUTS3 qarks — so a code resolves to its qark by its district prefix. Codes whose prefix belongs to no district (i.e. that don't exist) return not-found rather than a fabricated region. Lookups return `match_type="estimated"` with `high` confidence — see [Estimates](#estimates). **Other territories** (1): Faroe Islands (FO) — not part of NUTS; synthetic result. diff --git a/app/albania_blocks.py b/app/albania_blocks.py index 3e4af78..0a2c875 100644 --- a/app/albania_blocks.py +++ b/app/albania_blocks.py @@ -2,10 +2,11 @@ Albania has no Eurostat TERCET file. Its postal codes are block-allocated by district: the first two digits identify one of ~33 postal districts, and each -district sits in exactly one of the 12 qarks (= NUTS3). A range map keyed on the -district-center codes resolves ANY well-formed 4-digit code to its NUTS3 by the -block it falls into — covering the gaps GeoNames leaves (issue #118) by -construction, at NUTS3 granularity. +district sits in exactly one of the 12 qarks (= NUTS3). Keying on that allocated +2-digit prefix resolves any code in a real district to its NUTS3 — covering the +gaps GeoNames leaves (issue #118) by construction, at NUTS3 granularity — while a +code whose prefix belongs to no district returns None rather than a fabricated +region. Source: official Posta Shqiptare allocation, cross-checked vs. Wikipedia "Postal codes in Albania" and the UPU addressing PDF. The district->qark->NUTS3 mapping @@ -17,14 +18,12 @@ from __future__ import annotations -from bisect import bisect_right - SUPPORTED: frozenset[str] = frozenset({"AL"}) -# (district-center code, NUTS3, district name). Ascending by code. Each code is -# the LOWER bound of that district's block; a block runs to the next code. 
-# 1700 "Transit" / 1800 "EMS" are non-geographic service codes folded into -# Tirana (AL022), matching how GeoNames tags the 17xx/18xx prefixes. +# (district-center code, NUTS3, district name). Ascending by code. The first two +# digits of each code are the district's allocated prefix; the last two identify +# a postal office within it. 1700 "Transit" / 1800 "EMS" are non-geographic +# service codes folded into Tirana (AL022), matching how GeoNames tags 17xx/18xx. BLOCKS: list[tuple[int, str, str]] = [ (1000, "AL022", "Tirana"), (1500, "AL012", "Kruje"), @@ -63,20 +62,18 @@ (9700, "AL035", "Sarande"), ] -_STARTS = [b[0] for b in BLOCKS] -_NUTS3 = [b[1] for b in BLOCKS] +# Allocated 2-digit district prefix -> NUTS3. Each district owns a distinct +# prefix, so keys never collide: one entry per block (many-to-one onto NUTS3). +_PREFIX_TO_NUTS3: dict[str, str] = {str(code)[:2]: nuts3 for code, nuts3, _ in BLOCKS} def resolve_al_block(postal_code: str) -> str | None: """NUTS3 code for a well-formed 4-digit AL postal code, else None. - Any code >= 1000 maps to its enclosing district block (incl. 9800-9999 -> - Sarande/AL035 as best-effort). Codes < 1000, wrong length, or non-numeric - return None. + A code resolves only when its first two digits are an allocated district + prefix; codes in unallocated prefixes (no such district) return None rather + than a fabricated region. Wrong length or non-numeric input also returns None. 
""" if not (len(postal_code) == 4 and postal_code.isdigit()): return None - n = int(postal_code) - if n < _STARTS[0]: - return None - return _NUTS3[bisect_right(_STARTS, n) - 1] + return _PREFIX_TO_NUTS3.get(postal_code[:2]) diff --git a/tests/test_albania_blocks.py b/tests/test_albania_blocks.py index d5a45f1..9cb315c 100644 --- a/tests/test_albania_blocks.py +++ b/tests/test_albania_blocks.py @@ -41,14 +41,26 @@ def test_service_codes_fold_into_tirana(): assert resolve_al_block("1800") == "AL022" # EMS -def test_top_open_range_maps_to_sarande(): - assert resolve_al_block("9800") == "AL035" - assert resolve_al_block("9999") == "AL035" +def test_unallocated_prefixes_return_none(): + # A code whose 2-digit district prefix is not allocated to any district does + # not exist — return None rather than inventing a confident region (#118). + assert resolve_al_block("1900") is None # prefix 19 — no district + assert resolve_al_block("2100") is None # prefix 21 — between Durres(20)/Kavaje(25) + assert resolve_al_block("1250") is None # prefix 12 — between Tirana(10)/Kruje(15) + assert resolve_al_block("9800") is None # prefix 98 — above Sarande(97) + assert resolve_al_block("9999") is None # prefix 99 — no district + + +def test_within_district_range_still_resolves(): + # Any code inside an allocated district's 2-digit space resolves to that + # district — the block scheme is authoritative at district granularity. + assert resolve_al_block("1099") == "AL022" # still prefix 10 (Tirana) + assert resolve_al_block("7099") == "AL034" # still prefix 70 (Korce) def test_malformed_and_out_of_range_return_none(): assert resolve_al_block("100") is None # too short assert resolve_al_block("10011") is None # too long assert resolve_al_block("10AB") is None # non-digit - assert resolve_al_block("0999") is None # below the lowest block + assert resolve_al_block("0999") is None # prefix 09 — no district assert resolve_al_block("") is None