From e719705570875feae7ae816c168be8d66ab64e7e Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 19:14:46 +0200 Subject: [PATCH 1/7] feat: add Albania postal-code block resolver (#118) --- app/albania_blocks.py | 82 ++++++++++++++++++++++++++++++++++++ tests/test_albania_blocks.py | 54 ++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 app/albania_blocks.py create mode 100644 tests/test_albania_blocks.py diff --git a/app/albania_blocks.py b/app/albania_blocks.py new file mode 100644 index 0000000..3e4af78 --- /dev/null +++ b/app/albania_blocks.py @@ -0,0 +1,82 @@ +"""Authoritative Albania (AL) NUTS3 resolver from the postal-code BLOCK scheme. + +Albania has no Eurostat TERCET file. Its postal codes are block-allocated by +district: the first two digits identify one of ~33 postal districts, and each +district sits in exactly one of the 12 qarks (= NUTS3). A range map keyed on the +district-center codes resolves ANY well-formed 4-digit code to its NUTS3 by the +block it falls into — covering the gaps GeoNames leaves (issue #118) by +construction, at NUTS3 granularity. + +Source: official Posta Shqiptare allocation, cross-checked vs. Wikipedia "Postal +codes in Albania" and the UPU addressing PDF. The district->qark->NUTS3 mapping +reuses the GISCO-verified qark codes; the two non-obvious assignments (Kruje +15xx -> AL012, Kavaje 25xx -> AL022) are confirmed by GeoNames' own 15xx/25xx +tagging. Validated 100% against the 489 previously-shipped GeoNames codes (see +tests/test_albania_golden.py). +""" + +from __future__ import annotations + +from bisect import bisect_right + +SUPPORTED: frozenset[str] = frozenset({"AL"}) + +# (district-center code, NUTS3, district name). Ascending by code. Each code is +# the LOWER bound of that district's block; a block runs to the next code. +# 1700 "Transit" / 1800 "EMS" are non-geographic service codes folded into +# Tirana (AL022), matching how GeoNames tags the 17xx/18xx prefixes. +BLOCKS: list[tuple[int, str, str]] = [ + (1000, "AL022", "Tirana"), + (1500, "AL012", "Kruje"), + (1700, "AL022", "Transit (service)"), + (1800, "AL022", "EMS Office (service)"), + (2000, "AL012", "Durres"), + (2500, "AL022", "Kavaje"), + (3000, "AL021", "Elbasan"), + (3300, "AL021", "Gramsh"), + (3400, "AL021", "Librazhd"), + (3500, "AL021", "Peqin"), + (4000, "AL015", "Shkoder"), + (4300, "AL015", "Malesi e Madhe"), + (4400, "AL015", "Puke"), + (4500, "AL014", "Lezhe"), + (4600, "AL014", "Mirdite"), + (4700, "AL014", "Kurbin"), + (5000, "AL031", "Berat"), + (5300, "AL031", "Kucove"), + (5400, "AL031", "Skrapar"), + (6000, "AL033", "Gjirokaster"), + (6300, "AL033", "Tepelene"), + (6400, "AL033", "Permet"), + (7000, "AL034", "Korce"), + (7300, "AL034", "Pogradec"), + (7400, "AL034", "Kolonje"), + (8000, "AL011", "Mat"), + (8300, "AL011", "Diber"), + (8400, "AL011", "Bulqize"), + (8500, "AL013", "Kukes"), + (8600, "AL013", "Has"), + (8700, "AL013", "Tropoje"), + (9000, "AL032", "Lushnje"), + (9300, "AL032", "Fier"), + (9400, "AL035", "Vlore"), + (9700, "AL035", "Sarande"), +] + +_STARTS = [b[0] for b in BLOCKS] +_NUTS3 = [b[1] for b in BLOCKS] + + +def resolve_al_block(postal_code: str) -> str | None: + """NUTS3 code for a well-formed 4-digit AL postal code, else None. + + Any code >= 1000 maps to its enclosing district block (incl. 9800-9999 -> + Sarande/AL035 as best-effort). Codes < 1000, wrong length, or non-numeric + return None. + """ + if not (len(postal_code) == 4 and postal_code.isdigit()): + return None + n = int(postal_code) + if n < _STARTS[0]: + return None + return _NUTS3[bisect_right(_STARTS, n) - 1] diff --git a/tests/test_albania_blocks.py b/tests/test_albania_blocks.py new file mode 100644 index 0000000..d5a45f1 --- /dev/null +++ b/tests/test_albania_blocks.py @@ -0,0 +1,54 @@ +"""Tests for app/albania_blocks.py.""" + +from app.albania_blocks import BLOCKS, SUPPORTED, resolve_al_block + + +def test_supported_is_al_only(): + assert SUPPORTED == frozenset({"AL"}) + + +def test_blocks_sorted_and_valid(): + codes = [c for c, _, _ in BLOCKS] + assert codes == sorted(codes), "BLOCKS must be ascending by code" + for _, nuts3, name in BLOCKS: + assert len(nuts3) == 5 and nuts3.startswith("AL"), nuts3 + assert name + + +def test_known_district_centers_resolve(): + # District-center codes map to their qark's NUTS3. + assert resolve_al_block("1001") == "AL022" # Tirana + assert resolve_al_block("2001") == "AL012" # Durres + assert resolve_al_block("9401") == "AL035" # Vlore + + +def test_non_obvious_blocks(): + # The two assignments confirmed against GeoNames' own tagging. + assert resolve_al_block("1501") == "AL012" # Kruje -> Durres qark + assert resolve_al_block("2501") == "AL022" # Kavaje -> Tirana qark + + +def test_gap_codes_resolve_not_none(): + # Codes GeoNames omits (issue #118) still resolve via their block. + assert resolve_al_block("1055") == "AL022" + assert resolve_al_block("1065") == "AL022" + assert resolve_al_block("3350") == "AL021" # Gramsh block (GeoNames has none) + assert resolve_al_block("6450") == "AL033" # Permet block (GeoNames has none) + + +def test_service_codes_fold_into_tirana(): + assert resolve_al_block("1700") == "AL022" # Transit + assert resolve_al_block("1800") == "AL022" # EMS + + +def test_top_open_range_maps_to_sarande(): + assert resolve_al_block("9800") == "AL035" + assert resolve_al_block("9999") == "AL035" + + +def test_malformed_and_out_of_range_return_none(): + assert resolve_al_block("100") is None # too short + assert resolve_al_block("10011") is None # too long + assert resolve_al_block("10AB") is None # non-digit + assert resolve_al_block("0999") is None # below the lowest block + assert resolve_al_block("") is None From 03fef0861cbfd6c7354b70de8d9a9b2d9d89717a Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 19:17:47 +0200 Subject: [PATCH 2/7] test: golden regression locking block resolver to shipped AL codes (#118) --- tests/fixtures/albania_geonames_golden.csv | 490 +++++++++++++++++++++ tests/test_albania_golden.py | 27 ++ 2 files changed, 517 insertions(+) create mode 100644 tests/fixtures/albania_geonames_golden.csv create mode 100644 tests/test_albania_golden.py diff --git a/tests/fixtures/albania_geonames_golden.csv b/tests/fixtures/albania_geonames_golden.csv new file mode 100644 index 0000000..828edbf --- /dev/null +++ b/tests/fixtures/albania_geonames_golden.csv @@ -0,0 +1,490 @@ +postal_code,nuts3 +1001,AL022 +1002,AL022 +1003,AL022 +1004,AL022 +1005,AL022 +1006,AL022 +1007,AL022 +1008,AL022 +1009,AL022 +1010,AL022 +1011,AL022 +1012,AL022 +1013,AL022 +1014,AL022 +1015,AL022 +1016,AL022 +1017,AL022 +1018,AL022 +1019,AL022 +1020,AL022 +1021,AL022 +1022,AL022 +1023,AL022 +1024,AL022 +1025,AL022 +1026,AL022 +1027,AL022 +1028,AL022 +1029,AL022 +1030,AL022 +1031,AL022 +1032,AL022 +1033,AL022 +1034,AL022 +1035,AL022 +1036,AL022 +1037,AL022 +1038,AL022 +1039,AL022 +1040,AL022 +1041,AL022 +1042,AL022 +1043,AL022 +1044,AL022 +1045,AL022 +1046,AL022 +1047,AL022 +1048,AL022 +1049,AL022 +1050,AL022 +1051,AL022 +1052,AL022 +1053,AL022 +1054,AL022 +1501,AL012 +1502,AL012 +1503,AL012 +1504,AL012 +1505,AL012 +1506,AL012 +1507,AL012 +1508,AL012 +1700,AL022 +1800,AL022 +2001,AL012 +2002,AL012 +2003,AL012 +2004,AL012 +2005,AL012 +2006,AL012 +2007,AL012 +2008,AL012 +2009,AL012 +2010,AL012 +2011,AL012 +2012,AL012 +2013,AL012 +2014,AL012 +2015,AL012 +2016,AL012 +2017,AL012 +2018,AL012 +2019,AL012 +2020,AL012 +2021,AL012 +2022,AL012 +2501,AL022 +2502,AL022 +2503,AL022 +2504,AL022 +2505,AL022 +2506,AL022 +2507,AL022 +2508,AL022 +2509,AL022 +2510,AL022 +2511,AL022 +3001,AL021 +3002,AL021 +3003,AL021 +3004,AL021 +3005,AL021 +3006,AL021 +3007,AL021 +3008,AL021 +3009,AL021 +3010,AL021 +3011,AL021 +3012,AL021 +3013,AL021 +3014,AL021 +3015,AL021 +3016,AL021 +3017,AL021 +3018,AL021 +3019,AL021 +3020,AL021 +3021,AL021 +3022,AL021 +3023,AL021 +3024,AL021 +3025,AL021 +3026,AL021 +3027,AL021 +3401,AL021 +3402,AL021 +3403,AL021 +3404,AL021 +3405,AL021 +3406,AL021 +3407,AL021 +3408,AL021 +3409,AL021 +3410,AL021 +3411,AL021 +3412,AL021 +3413,AL021 +4001,AL015 +4002,AL015 +4003,AL015 +4004,AL015 +4005,AL015 +4006,AL015 +4007,AL015 +4008,AL015 +4009,AL015 +4010,AL015 +4011,AL015 +4012,AL015 +4013,AL015 +4014,AL015 +4015,AL015 +4016,AL015 +4017,AL015 +4018,AL015 +4019,AL015 +4020,AL015 +4021,AL015 +4022,AL015 +4023,AL015 +4024,AL015 +4025,AL015 +4026,AL015 +4027,AL015 +4028,AL015 +4029,AL015 +4030,AL015 +4301,AL015 +4302,AL015 +4303,AL015 +4304,AL015 +4305,AL015 +4306,AL015 +4401,AL015 +4402,AL015 +4403,AL015 +4404,AL015 +4405,AL015 +4406,AL015 +4407,AL015 +4408,AL015 +4409,AL015 +4501,AL014 +4502,AL014 +4503,AL014 +4504,AL014 +4505,AL014 +4506,AL014 +4507,AL014 +4508,AL014 +4509,AL014 +4510,AL014 +4511,AL014 +4601,AL014 +4602,AL014 +4603,AL014 +4604,AL014 +4605,AL014 +4606,AL014 +4607,AL014 +4608,AL014 +4701,AL014 +4702,AL014 +4703,AL014 +4704,AL014 +4705,AL014 +4706,AL014 +5001,AL031 +5002,AL031 +5003,AL031 +5004,AL031 +5005,AL031 +5006,AL031 +5007,AL031 +5008,AL031 +5009,AL031 +5010,AL031 +5011,AL031 +5012,AL031 +5013,AL031 +5014,AL031 +5015,AL031 +5016,AL031 +5017,AL031 +5018,AL031 +5019,AL031 +5020,AL031 +5021,AL031 +5022,AL031 +5301,AL031 +5302,AL031 +5303,AL031 +5304,AL031 +5305,AL031 +5306,AL031 +5307,AL031 +5308,AL031 +5401,AL031 +5402,AL031 +5403,AL031 +5404,AL031 +5405,AL031 +5406,AL031 +5407,AL031 +5408,AL031 +5409,AL031 +5410,AL031 +5411,AL031 +5412,AL031 +5413,AL031 +6001,AL033 +6002,AL033 +6003,AL033 +6004,AL033 +6005,AL033 +6006,AL033 +6007,AL033 +6008,AL033 +6009,AL033 +6010,AL033 +6011,AL033 +6012,AL033 +6013,AL033 +6014,AL033 +6015,AL033 +6016,AL033 +6017,AL033 +6018,AL033 +6019,AL033 +7001,AL034 +7002,AL034 +7003,AL034 +7004,AL034 +7005,AL034 +7006,AL034 +7007,AL034 +7008,AL034 +7009,AL034 +7010,AL034 +7011,AL034 +7012,AL034 +7013,AL034 +7014,AL034 +7015,AL034 +7016,AL034 +7017,AL034 +7018,AL034 +7019,AL034 +7020,AL034 +7021,AL034 +7022,AL034 +7023,AL034 +7024,AL034 +7025,AL034 +7026,AL034 +7027,AL034 +7028,AL034 +7029,AL034 +7301,AL034 +7302,AL034 +7303,AL034 +7304,AL034 +7305,AL034 +7306,AL034 +7307,AL034 +7308,AL034 +7309,AL034 +7310,AL034 +7401,AL034 +7402,AL034 +7403,AL034 +7404,AL034 +7405,AL034 +7406,AL034 +8001,AL011 +8002,AL011 +8003,AL011 +8004,AL011 +8005,AL011 +8006,AL011 +8007,AL011 +8008,AL011 +8009,AL011 +8010,AL011 +8011,AL011 +8012,AL011 +8013,AL011 +8014,AL011 +8301,AL011 +8302,AL011 +8303,AL011 +8304,AL011 +8305,AL011 +8306,AL011 +8307,AL011 +8308,AL011 +8309,AL011 +8310,AL011 +8311,AL011 +8312,AL011 +8313,AL011 +8314,AL011 +8315,AL011 +8401,AL011 +8402,AL011 +8403,AL011 +8404,AL011 +8405,AL011 +8406,AL011 +8407,AL011 +8408,AL011 +8409,AL011 +8501,AL013 +8502,AL013 +8503,AL013 +8504,AL013 +8505,AL013 +8506,AL013 +8507,AL013 +8508,AL013 +8509,AL013 +8510,AL013 +8511,AL013 +8512,AL013 +8513,AL013 +8514,AL013 +8515,AL013 +8516,AL013 +8517,AL013 +8518,AL013 +8519,AL013 +8520,AL013 +8601,AL013 +8602,AL013 +8603,AL013 +8604,AL013 +8605,AL013 +8701,AL013 +8702,AL013 +8703,AL013 +8704,AL013 +8705,AL013 +8706,AL013 +8707,AL013 +9001,AL032 +9002,AL032 +9003,AL032 +9004,AL032 +9005,AL032 +9006,AL032 +9007,AL032 +9008,AL032 +9009,AL032 +9010,AL032 +9011,AL032 +9012,AL032 +9013,AL032 +9014,AL032 +9015,AL032 +9016,AL032 +9017,AL032 +9018,AL032 +9019,AL032 +9020,AL032 +9021,AL032 +9022,AL032 +9301,AL032 +9302,AL032 +9303,AL032 +9304,AL032 +9305,AL032 +9306,AL032 +9307,AL032 +9308,AL032 +9309,AL032 +9310,AL032 +9311,AL032 +9312,AL032 +9313,AL032 +9314,AL032 +9315,AL032 +9316,AL032 +9317,AL032 +9318,AL032 +9319,AL032 +9320,AL032 +9321,AL032 +9322,AL032 +9323,AL032 +9324,AL032 +9325,AL032 +9326,AL032 +9327,AL032 +9328,AL032 +9329,AL032 +9330,AL032 +9331,AL032 +9332,AL032 +9333,AL032 +9334,AL032 +9335,AL032 +9401,AL035 +9402,AL035 +9403,AL035 +9404,AL035 +9405,AL035 +9406,AL035 +9407,AL035 +9408,AL035 +9409,AL035 +9410,AL035 +9411,AL035 +9412,AL035 +9413,AL035 +9414,AL035 +9415,AL035 +9416,AL035 +9417,AL035 +9418,AL035 +9419,AL035 +9420,AL035 +9421,AL035 +9422,AL035 +9423,AL035 +9424,AL035 +9425,AL035 +9426,AL035 +9427,AL035 +9701,AL035 +9702,AL035 +9703,AL035 +9704,AL035 +9705,AL035 +9706,AL035 +9707,AL035 +9708,AL035 +9709,AL035 +9710,AL035 +9711,AL035 +9712,AL035 +9713,AL035 +9714,AL035 +9715,AL035 +9716,AL035 +9717,AL035 +9718,AL035 +9719,AL035 +9720,AL035 +9721,AL035 diff --git a/tests/test_albania_golden.py b/tests/test_albania_golden.py new file mode 100644 index 0000000..c5c4146 --- /dev/null +++ b/tests/test_albania_golden.py @@ -0,0 +1,27 @@ +"""Golden regression: the block resolver must reproduce every NUTS3 that the +retired GeoNames generator assigned to the 489 shipped AL codes.""" + +import csv +from pathlib import Path + +from app.albania_blocks import resolve_al_block + +GOLDEN = Path(__file__).parent / "fixtures" / "albania_geonames_golden.csv" + + +def _golden_rows(): + with open(GOLDEN, encoding="utf-8", newline="") as f: + return [(r["postal_code"], r["nuts3"]) for r in csv.DictReader(f)] + + +def test_golden_fixture_is_populated(): + assert len(_golden_rows()) >= 480 + + +def test_block_resolver_reproduces_every_geonames_code(): + mismatches = [ + (pc, geo, resolve_al_block(pc)) + for pc, geo in _golden_rows() + if resolve_al_block(pc) != geo + ] + assert mismatches == [], f"block map disagrees with GeoNames on: {mismatches[:10]}" From 7e69728f81112f29a5f6454d826e37226c3d7f73 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 19:20:28 +0200 Subject: [PATCH 3/7] feat: resolve Albania via the block map in lookup() (#118) --- app/data_loader.py | 18 ++++++++++++++++++ tests/test_data_loader.py | 27 +++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/app/data_loader.py b/app/data_loader.py index 00028ee..e753443 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -16,6 +16,8 @@ import httpx +from app.albania_blocks import SUPPORTED as AL_SUPPORTED +from app.albania_blocks import resolve_al_block from app.config import settings _NUTS3_RE = re.compile(r"^[A-Z]{2}[A-Z0-9]{1,3}$") @@ -96,6 +98,7 @@ def get_loaded_countries() -> set[str]: | {cc for cc, _ in _estimates} | set(_single_nuts3.keys()) | set(_synthetic_nuts.keys()) + | set(AL_SUPPORTED) ) @@ -1067,6 +1070,21 @@ def lookup(country_code: str, postal_code: str) -> dict | None: nuts3_confidence=est["nuts3_confidence"], ) + # Tier 2b: Albania authoritative block map (#118). AL has no TERCET and no + # estimate rows; the official postal-district block scheme resolves any + # well-formed 4-digit code to its NUTS3. + if cc in AL_SUPPORTED: + al_nuts3 = resolve_al_block(extracted) + if al_nuts3 is not None: + conf = settings.confidence_map["high"] + return _build_result( + "estimated", + al_nuts3, + nuts1_confidence=conf["nuts1"], + nuts2_confidence=conf["nuts2"], + nuts3_confidence=conf["nuts3"], + ) + # Tier 3: Runtime prefix-based estimation approx = _estimate_by_prefix(cc, extracted) if approx is not None: diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index 34496b7..7a78cad 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -230,6 +230,33 @@ def test_albania_resolves_via_estimates(self, mock_data): assert result["nuts3_name"] == "Tiranë" +class TestAlbaniaBlockTier: + def test_gap_code_resolves_via_block(self, mock_data): + # 1055 is absent from the GeoNames estimates (the #118 gap) but the + # block tier resolves it to the Tirana qark. + from app.data_loader import lookup + + result = lookup("AL", "1055") + assert result is not None + assert result["match_type"] == "estimated" + assert result["nuts3"] == "AL022" + assert result["nuts1"] == "AL0" + assert result["nuts3_confidence"] == 0.9 + + def test_district_geonames_omits_resolves(self, mock_data): + from app.data_loader import lookup + + # Peqin (35xx) — GeoNames has no such codes at all. + result = lookup("AL", "3550") + assert result is not None + assert result["nuts3"] == "AL021" + + def test_al_stays_in_loaded_countries(self): + from app.data_loader import get_loaded_countries + + assert "AL" in get_loaded_countries() + + class TestBundledAlbaniaData: VALID_AL_NUTS3 = { "AL011", "AL012", "AL013", "AL014", "AL015", "AL021", From 138597431c256963e11da8e5c1de49a3a5660319 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 19:24:35 +0200 Subject: [PATCH 4/7] refactor: drop GeoNames AL estimates in favor of the block resolver (#118) --- scripts/build_albania_estimates.py | 107 ------- tercet_missing_codes.csv | 489 ----------------------------- tests/test_albania_estimates.py | 83 ----- tests/test_data_loader.py | 26 +- 4 files changed, 18 insertions(+), 687 deletions(-) delete mode 100644 scripts/build_albania_estimates.py delete mode 100644 tests/test_albania_estimates.py diff --git a/scripts/build_albania_estimates.py b/scripts/build_albania_estimates.py deleted file mode 100644 index 3938fe5..0000000 --- a/scripts/build_albania_estimates.py +++ /dev/null @@ -1,107 +0,0 @@ -"""Generate Albania (AL) postal-code → NUTS3 estimates from GeoNames. - -Albania has a full NUTS hierarchy but no Eurostat TERCET correspondence file, -so we derive per-postal-code NUTS3 codes from GeoNames' admin1 (qark) tagging. -The 12 qarks map 1:1 to the 12 NUTS3 counties. Output is merged into -tercet_missing_codes.csv (Tier-2 estimates), leaving other countries untouched. - -Run from the repo root: python scripts/build_albania_estimates.py -""" - -import io -import zipfile -from pathlib import Path - -import httpx - -GEONAMES_URL = "https://download.geonames.org/export/zip/AL.zip" -CSV_PATH = Path("tercet_missing_codes.csv") -CONFIDENCE = "high" - -# GeoNames admin1 (qark) name -> NUTS3 code (NUTS 2024). Verified against GISCO. -QARK_TO_NUTS3 = { - "Qarku i Dibrës": "AL011", - "Qarku i Durrësit": "AL012", - "Qarku i Kukësit": "AL013", - "Qarku i Lezhës": "AL014", - "Qarku i Shkodrës": "AL015", - "Qarku i Elbasanit": "AL021", - "Tirana": "AL022", - "Qarku i Beratit": "AL031", - "Qarku i Fierit": "AL032", - "Qarku i Gjirokastrës": "AL033", - "Qarku i Korçës": "AL034", - "Qarku i Vlorës": "AL035", -} - - -def rows_from_geonames(records: list[list[str]]) -> list[dict]: - """Convert parsed GeoNames records into deduped, sorted AL estimate rows.""" - seen: dict[str, str] = {} - for rec in records: - if len(rec) < 4: - continue - pc = rec[1].strip() - qark = rec[3].strip() - if not (len(pc) == 4 and pc.isdigit()): - continue - nuts3 = QARK_TO_NUTS3.get(qark) - if nuts3 is None: - raise ValueError(f"Unmapped GeoNames admin1 (qark): {qark!r}") - seen[pc] = nuts3 - rows: list[dict] = [] - for pc in sorted(seen): - nuts3 = seen[pc] - rows.append( - { - "COUNTRY_CODE": "AL", - "POSTAL_CODE": pc, - "ESTIMATED_NUTS3": nuts3, - "ESTIMATED_NUTS2": nuts3[:4], - "ESTIMATED_NUTS1": nuts3[:3], - "CONFIDENCE": CONFIDENCE, - } - ) - return rows - - -def merge_into_csv(csv_path: Path, al_rows: list[dict]) -> None: - """Rewrite csv_path: header + AL rows + existing non-AL rows (original order). - - Preserves the file's existing line terminator (CRLF or LF) so regenerating - does not churn every existing line's ending. - """ - # newline="" disables universal-newline translation so we can detect the - # file's actual line terminator instead of always seeing "\n" - # (Path.read_text() has no newline= param until Python 3.13). - with open(csv_path, encoding="utf-8", newline="") as f: - raw = f.read() - newline = "\r\n" if "\r\n" in raw else "\n" - lines = raw.splitlines() - header = lines[0] - kept = [ln for ln in lines[1:] if ln and not ln.startswith("AL,")] - al_lines = [ - f"{r['COUNTRY_CODE']},{r['POSTAL_CODE']},{r['ESTIMATED_NUTS3']}," - f"{r['ESTIMATED_NUTS2']},{r['ESTIMATED_NUTS1']},{r['CONFIDENCE']}" - for r in al_rows - ] - # newline="" disables write-side translation so the detected terminator is - # written verbatim (else newline=None would translate the "\n" inside each - # "\r\n" to os.linesep on non-POSIX runners). write_text gained newline= in 3.10. - csv_path.write_text(newline.join([header, *al_lines, *kept]) + newline, encoding="utf-8", newline="") - - -def main() -> None: - with httpx.Client(follow_redirects=True) as client: - resp = client.get(GEONAMES_URL, timeout=60) - resp.raise_for_status() - with zipfile.ZipFile(io.BytesIO(resp.content)) as zf: - text = zf.read("AL.txt").decode("utf-8") - records = [line.split("\t") for line in text.splitlines() if line.strip()] - rows = rows_from_geonames(records) - merge_into_csv(CSV_PATH, rows) - print(f"Wrote {len(rows)} Albania estimate rows to {CSV_PATH}") - - -if __name__ == "__main__": - main() diff --git a/tercet_missing_codes.csv b/tercet_missing_codes.csv index a2532ce..c9cc317 100644 --- a/tercet_missing_codes.csv +++ b/tercet_missing_codes.csv @@ -1,493 +1,4 @@ COUNTRY_CODE,POSTAL_CODE,ESTIMATED_NUTS3,ESTIMATED_NUTS2,ESTIMATED_NUTS1,CONFIDENCE -AL,1001,AL022,AL02,AL0,high -AL,1002,AL022,AL02,AL0,high -AL,1003,AL022,AL02,AL0,high -AL,1004,AL022,AL02,AL0,high -AL,1005,AL022,AL02,AL0,high -AL,1006,AL022,AL02,AL0,high -AL,1007,AL022,AL02,AL0,high -AL,1008,AL022,AL02,AL0,high -AL,1009,AL022,AL02,AL0,high -AL,1010,AL022,AL02,AL0,high -AL,1011,AL022,AL02,AL0,high -AL,1012,AL022,AL02,AL0,high -AL,1013,AL022,AL02,AL0,high -AL,1014,AL022,AL02,AL0,high -AL,1015,AL022,AL02,AL0,high -AL,1016,AL022,AL02,AL0,high -AL,1017,AL022,AL02,AL0,high -AL,1018,AL022,AL02,AL0,high -AL,1019,AL022,AL02,AL0,high -AL,1020,AL022,AL02,AL0,high -AL,1021,AL022,AL02,AL0,high -AL,1022,AL022,AL02,AL0,high -AL,1023,AL022,AL02,AL0,high -AL,1024,AL022,AL02,AL0,high -AL,1025,AL022,AL02,AL0,high -AL,1026,AL022,AL02,AL0,high -AL,1027,AL022,AL02,AL0,high -AL,1028,AL022,AL02,AL0,high -AL,1029,AL022,AL02,AL0,high -AL,1030,AL022,AL02,AL0,high -AL,1031,AL022,AL02,AL0,high -AL,1032,AL022,AL02,AL0,high -AL,1033,AL022,AL02,AL0,high -AL,1034,AL022,AL02,AL0,high -AL,1035,AL022,AL02,AL0,high -AL,1036,AL022,AL02,AL0,high -AL,1037,AL022,AL02,AL0,high -AL,1038,AL022,AL02,AL0,high -AL,1039,AL022,AL02,AL0,high -AL,1040,AL022,AL02,AL0,high -AL,1041,AL022,AL02,AL0,high -AL,1042,AL022,AL02,AL0,high -AL,1043,AL022,AL02,AL0,high -AL,1044,AL022,AL02,AL0,high -AL,1045,AL022,AL02,AL0,high -AL,1046,AL022,AL02,AL0,high -AL,1047,AL022,AL02,AL0,high -AL,1048,AL022,AL02,AL0,high -AL,1049,AL022,AL02,AL0,high -AL,1050,AL022,AL02,AL0,high -AL,1051,AL022,AL02,AL0,high -AL,1052,AL022,AL02,AL0,high -AL,1053,AL022,AL02,AL0,high -AL,1054,AL022,AL02,AL0,high -AL,1501,AL012,AL01,AL0,high -AL,1502,AL012,AL01,AL0,high -AL,1503,AL012,AL01,AL0,high -AL,1504,AL012,AL01,AL0,high -AL,1505,AL012,AL01,AL0,high -AL,1506,AL012,AL01,AL0,high -AL,1507,AL012,AL01,AL0,high -AL,1508,AL012,AL01,AL0,high -AL,1700,AL022,AL02,AL0,high -AL,1800,AL022,AL02,AL0,high -AL,2001,AL012,AL01,AL0,high -AL,2002,AL012,AL01,AL0,high -AL,2003,AL012,AL01,AL0,high -AL,2004,AL012,AL01,AL0,high -AL,2005,AL012,AL01,AL0,high -AL,2006,AL012,AL01,AL0,high -AL,2007,AL012,AL01,AL0,high -AL,2008,AL012,AL01,AL0,high -AL,2009,AL012,AL01,AL0,high -AL,2010,AL012,AL01,AL0,high -AL,2011,AL012,AL01,AL0,high -AL,2012,AL012,AL01,AL0,high -AL,2013,AL012,AL01,AL0,high -AL,2014,AL012,AL01,AL0,high -AL,2015,AL012,AL01,AL0,high -AL,2016,AL012,AL01,AL0,high -AL,2017,AL012,AL01,AL0,high -AL,2018,AL012,AL01,AL0,high -AL,2019,AL012,AL01,AL0,high -AL,2020,AL012,AL01,AL0,high -AL,2021,AL012,AL01,AL0,high -AL,2022,AL012,AL01,AL0,high -AL,2501,AL022,AL02,AL0,high -AL,2502,AL022,AL02,AL0,high -AL,2503,AL022,AL02,AL0,high -AL,2504,AL022,AL02,AL0,high -AL,2505,AL022,AL02,AL0,high -AL,2506,AL022,AL02,AL0,high -AL,2507,AL022,AL02,AL0,high -AL,2508,AL022,AL02,AL0,high -AL,2509,AL022,AL02,AL0,high -AL,2510,AL022,AL02,AL0,high -AL,2511,AL022,AL02,AL0,high -AL,3001,AL021,AL02,AL0,high -AL,3002,AL021,AL02,AL0,high -AL,3003,AL021,AL02,AL0,high -AL,3004,AL021,AL02,AL0,high -AL,3005,AL021,AL02,AL0,high -AL,3006,AL021,AL02,AL0,high -AL,3007,AL021,AL02,AL0,high -AL,3008,AL021,AL02,AL0,high -AL,3009,AL021,AL02,AL0,high -AL,3010,AL021,AL02,AL0,high -AL,3011,AL021,AL02,AL0,high -AL,3012,AL021,AL02,AL0,high -AL,3013,AL021,AL02,AL0,high -AL,3014,AL021,AL02,AL0,high -AL,3015,AL021,AL02,AL0,high -AL,3016,AL021,AL02,AL0,high -AL,3017,AL021,AL02,AL0,high -AL,3018,AL021,AL02,AL0,high -AL,3019,AL021,AL02,AL0,high -AL,3020,AL021,AL02,AL0,high -AL,3021,AL021,AL02,AL0,high -AL,3022,AL021,AL02,AL0,high -AL,3023,AL021,AL02,AL0,high -AL,3024,AL021,AL02,AL0,high -AL,3025,AL021,AL02,AL0,high -AL,3026,AL021,AL02,AL0,high -AL,3027,AL021,AL02,AL0,high -AL,3401,AL021,AL02,AL0,high -AL,3402,AL021,AL02,AL0,high -AL,3403,AL021,AL02,AL0,high -AL,3404,AL021,AL02,AL0,high -AL,3405,AL021,AL02,AL0,high -AL,3406,AL021,AL02,AL0,high -AL,3407,AL021,AL02,AL0,high -AL,3408,AL021,AL02,AL0,high -AL,3409,AL021,AL02,AL0,high -AL,3410,AL021,AL02,AL0,high -AL,3411,AL021,AL02,AL0,high -AL,3412,AL021,AL02,AL0,high -AL,3413,AL021,AL02,AL0,high -AL,4001,AL015,AL01,AL0,high -AL,4002,AL015,AL01,AL0,high -AL,4003,AL015,AL01,AL0,high -AL,4004,AL015,AL01,AL0,high -AL,4005,AL015,AL01,AL0,high -AL,4006,AL015,AL01,AL0,high -AL,4007,AL015,AL01,AL0,high -AL,4008,AL015,AL01,AL0,high -AL,4009,AL015,AL01,AL0,high -AL,4010,AL015,AL01,AL0,high -AL,4011,AL015,AL01,AL0,high -AL,4012,AL015,AL01,AL0,high -AL,4013,AL015,AL01,AL0,high -AL,4014,AL015,AL01,AL0,high -AL,4015,AL015,AL01,AL0,high -AL,4016,AL015,AL01,AL0,high -AL,4017,AL015,AL01,AL0,high -AL,4018,AL015,AL01,AL0,high -AL,4019,AL015,AL01,AL0,high -AL,4020,AL015,AL01,AL0,high -AL,4021,AL015,AL01,AL0,high -AL,4022,AL015,AL01,AL0,high -AL,4023,AL015,AL01,AL0,high -AL,4024,AL015,AL01,AL0,high -AL,4025,AL015,AL01,AL0,high -AL,4026,AL015,AL01,AL0,high -AL,4027,AL015,AL01,AL0,high -AL,4028,AL015,AL01,AL0,high -AL,4029,AL015,AL01,AL0,high -AL,4030,AL015,AL01,AL0,high -AL,4301,AL015,AL01,AL0,high -AL,4302,AL015,AL01,AL0,high -AL,4303,AL015,AL01,AL0,high -AL,4304,AL015,AL01,AL0,high -AL,4305,AL015,AL01,AL0,high -AL,4306,AL015,AL01,AL0,high -AL,4401,AL015,AL01,AL0,high -AL,4402,AL015,AL01,AL0,high -AL,4403,AL015,AL01,AL0,high -AL,4404,AL015,AL01,AL0,high -AL,4405,AL015,AL01,AL0,high -AL,4406,AL015,AL01,AL0,high -AL,4407,AL015,AL01,AL0,high -AL,4408,AL015,AL01,AL0,high -AL,4409,AL015,AL01,AL0,high -AL,4501,AL014,AL01,AL0,high -AL,4502,AL014,AL01,AL0,high -AL,4503,AL014,AL01,AL0,high -AL,4504,AL014,AL01,AL0,high -AL,4505,AL014,AL01,AL0,high -AL,4506,AL014,AL01,AL0,high -AL,4507,AL014,AL01,AL0,high -AL,4508,AL014,AL01,AL0,high -AL,4509,AL014,AL01,AL0,high -AL,4510,AL014,AL01,AL0,high -AL,4511,AL014,AL01,AL0,high -AL,4601,AL014,AL01,AL0,high -AL,4602,AL014,AL01,AL0,high -AL,4603,AL014,AL01,AL0,high -AL,4604,AL014,AL01,AL0,high -AL,4605,AL014,AL01,AL0,high -AL,4606,AL014,AL01,AL0,high -AL,4607,AL014,AL01,AL0,high -AL,4608,AL014,AL01,AL0,high -AL,4701,AL014,AL01,AL0,high -AL,4702,AL014,AL01,AL0,high -AL,4703,AL014,AL01,AL0,high -AL,4704,AL014,AL01,AL0,high -AL,4705,AL014,AL01,AL0,high -AL,4706,AL014,AL01,AL0,high -AL,5001,AL031,AL03,AL0,high -AL,5002,AL031,AL03,AL0,high -AL,5003,AL031,AL03,AL0,high -AL,5004,AL031,AL03,AL0,high -AL,5005,AL031,AL03,AL0,high -AL,5006,AL031,AL03,AL0,high -AL,5007,AL031,AL03,AL0,high -AL,5008,AL031,AL03,AL0,high -AL,5009,AL031,AL03,AL0,high -AL,5010,AL031,AL03,AL0,high -AL,5011,AL031,AL03,AL0,high -AL,5012,AL031,AL03,AL0,high -AL,5013,AL031,AL03,AL0,high -AL,5014,AL031,AL03,AL0,high -AL,5015,AL031,AL03,AL0,high -AL,5016,AL031,AL03,AL0,high -AL,5017,AL031,AL03,AL0,high -AL,5018,AL031,AL03,AL0,high -AL,5019,AL031,AL03,AL0,high -AL,5020,AL031,AL03,AL0,high -AL,5021,AL031,AL03,AL0,high -AL,5022,AL031,AL03,AL0,high -AL,5301,AL031,AL03,AL0,high -AL,5302,AL031,AL03,AL0,high -AL,5303,AL031,AL03,AL0,high -AL,5304,AL031,AL03,AL0,high -AL,5305,AL031,AL03,AL0,high -AL,5306,AL031,AL03,AL0,high -AL,5307,AL031,AL03,AL0,high -AL,5308,AL031,AL03,AL0,high -AL,5401,AL031,AL03,AL0,high -AL,5402,AL031,AL03,AL0,high -AL,5403,AL031,AL03,AL0,high -AL,5404,AL031,AL03,AL0,high -AL,5405,AL031,AL03,AL0,high -AL,5406,AL031,AL03,AL0,high -AL,5407,AL031,AL03,AL0,high -AL,5408,AL031,AL03,AL0,high -AL,5409,AL031,AL03,AL0,high -AL,5410,AL031,AL03,AL0,high -AL,5411,AL031,AL03,AL0,high -AL,5412,AL031,AL03,AL0,high -AL,5413,AL031,AL03,AL0,high -AL,6001,AL033,AL03,AL0,high -AL,6002,AL033,AL03,AL0,high -AL,6003,AL033,AL03,AL0,high -AL,6004,AL033,AL03,AL0,high -AL,6005,AL033,AL03,AL0,high -AL,6006,AL033,AL03,AL0,high -AL,6007,AL033,AL03,AL0,high -AL,6008,AL033,AL03,AL0,high -AL,6009,AL033,AL03,AL0,high -AL,6010,AL033,AL03,AL0,high -AL,6011,AL033,AL03,AL0,high -AL,6012,AL033,AL03,AL0,high -AL,6013,AL033,AL03,AL0,high -AL,6014,AL033,AL03,AL0,high -AL,6015,AL033,AL03,AL0,high -AL,6016,AL033,AL03,AL0,high -AL,6017,AL033,AL03,AL0,high -AL,6018,AL033,AL03,AL0,high -AL,6019,AL033,AL03,AL0,high -AL,7001,AL034,AL03,AL0,high -AL,7002,AL034,AL03,AL0,high -AL,7003,AL034,AL03,AL0,high -AL,7004,AL034,AL03,AL0,high -AL,7005,AL034,AL03,AL0,high -AL,7006,AL034,AL03,AL0,high -AL,7007,AL034,AL03,AL0,high -AL,7008,AL034,AL03,AL0,high -AL,7009,AL034,AL03,AL0,high -AL,7010,AL034,AL03,AL0,high -AL,7011,AL034,AL03,AL0,high -AL,7012,AL034,AL03,AL0,high -AL,7013,AL034,AL03,AL0,high -AL,7014,AL034,AL03,AL0,high -AL,7015,AL034,AL03,AL0,high -AL,7016,AL034,AL03,AL0,high -AL,7017,AL034,AL03,AL0,high -AL,7018,AL034,AL03,AL0,high -AL,7019,AL034,AL03,AL0,high -AL,7020,AL034,AL03,AL0,high -AL,7021,AL034,AL03,AL0,high -AL,7022,AL034,AL03,AL0,high -AL,7023,AL034,AL03,AL0,high -AL,7024,AL034,AL03,AL0,high -AL,7025,AL034,AL03,AL0,high -AL,7026,AL034,AL03,AL0,high -AL,7027,AL034,AL03,AL0,high -AL,7028,AL034,AL03,AL0,high -AL,7029,AL034,AL03,AL0,high -AL,7301,AL034,AL03,AL0,high -AL,7302,AL034,AL03,AL0,high -AL,7303,AL034,AL03,AL0,high -AL,7304,AL034,AL03,AL0,high -AL,7305,AL034,AL03,AL0,high -AL,7306,AL034,AL03,AL0,high -AL,7307,AL034,AL03,AL0,high -AL,7308,AL034,AL03,AL0,high -AL,7309,AL034,AL03,AL0,high -AL,7310,AL034,AL03,AL0,high -AL,7401,AL034,AL03,AL0,high -AL,7402,AL034,AL03,AL0,high -AL,7403,AL034,AL03,AL0,high -AL,7404,AL034,AL03,AL0,high -AL,7405,AL034,AL03,AL0,high -AL,7406,AL034,AL03,AL0,high -AL,8001,AL011,AL01,AL0,high -AL,8002,AL011,AL01,AL0,high -AL,8003,AL011,AL01,AL0,high -AL,8004,AL011,AL01,AL0,high -AL,8005,AL011,AL01,AL0,high -AL,8006,AL011,AL01,AL0,high -AL,8007,AL011,AL01,AL0,high -AL,8008,AL011,AL01,AL0,high -AL,8009,AL011,AL01,AL0,high -AL,8010,AL011,AL01,AL0,high -AL,8011,AL011,AL01,AL0,high -AL,8012,AL011,AL01,AL0,high -AL,8013,AL011,AL01,AL0,high -AL,8014,AL011,AL01,AL0,high -AL,8301,AL011,AL01,AL0,high -AL,8302,AL011,AL01,AL0,high -AL,8303,AL011,AL01,AL0,high -AL,8304,AL011,AL01,AL0,high -AL,8305,AL011,AL01,AL0,high -AL,8306,AL011,AL01,AL0,high -AL,8307,AL011,AL01,AL0,high -AL,8308,AL011,AL01,AL0,high -AL,8309,AL011,AL01,AL0,high -AL,8310,AL011,AL01,AL0,high -AL,8311,AL011,AL01,AL0,high -AL,8312,AL011,AL01,AL0,high -AL,8313,AL011,AL01,AL0,high -AL,8314,AL011,AL01,AL0,high -AL,8315,AL011,AL01,AL0,high -AL,8401,AL011,AL01,AL0,high -AL,8402,AL011,AL01,AL0,high -AL,8403,AL011,AL01,AL0,high -AL,8404,AL011,AL01,AL0,high -AL,8405,AL011,AL01,AL0,high -AL,8406,AL011,AL01,AL0,high -AL,8407,AL011,AL01,AL0,high -AL,8408,AL011,AL01,AL0,high -AL,8409,AL011,AL01,AL0,high -AL,8501,AL013,AL01,AL0,high -AL,8502,AL013,AL01,AL0,high -AL,8503,AL013,AL01,AL0,high -AL,8504,AL013,AL01,AL0,high -AL,8505,AL013,AL01,AL0,high -AL,8506,AL013,AL01,AL0,high -AL,8507,AL013,AL01,AL0,high -AL,8508,AL013,AL01,AL0,high -AL,8509,AL013,AL01,AL0,high -AL,8510,AL013,AL01,AL0,high -AL,8511,AL013,AL01,AL0,high -AL,8512,AL013,AL01,AL0,high -AL,8513,AL013,AL01,AL0,high -AL,8514,AL013,AL01,AL0,high -AL,8515,AL013,AL01,AL0,high -AL,8516,AL013,AL01,AL0,high -AL,8517,AL013,AL01,AL0,high -AL,8518,AL013,AL01,AL0,high -AL,8519,AL013,AL01,AL0,high -AL,8520,AL013,AL01,AL0,high -AL,8601,AL013,AL01,AL0,high -AL,8602,AL013,AL01,AL0,high -AL,8603,AL013,AL01,AL0,high -AL,8604,AL013,AL01,AL0,high -AL,8605,AL013,AL01,AL0,high -AL,8701,AL013,AL01,AL0,high -AL,8702,AL013,AL01,AL0,high -AL,8703,AL013,AL01,AL0,high -AL,8704,AL013,AL01,AL0,high -AL,8705,AL013,AL01,AL0,high -AL,8706,AL013,AL01,AL0,high -AL,8707,AL013,AL01,AL0,high -AL,9001,AL032,AL03,AL0,high -AL,9002,AL032,AL03,AL0,high -AL,9003,AL032,AL03,AL0,high -AL,9004,AL032,AL03,AL0,high -AL,9005,AL032,AL03,AL0,high -AL,9006,AL032,AL03,AL0,high -AL,9007,AL032,AL03,AL0,high -AL,9008,AL032,AL03,AL0,high -AL,9009,AL032,AL03,AL0,high -AL,9010,AL032,AL03,AL0,high -AL,9011,AL032,AL03,AL0,high -AL,9012,AL032,AL03,AL0,high -AL,9013,AL032,AL03,AL0,high -AL,9014,AL032,AL03,AL0,high -AL,9015,AL032,AL03,AL0,high -AL,9016,AL032,AL03,AL0,high -AL,9017,AL032,AL03,AL0,high -AL,9018,AL032,AL03,AL0,high -AL,9019,AL032,AL03,AL0,high -AL,9020,AL032,AL03,AL0,high -AL,9021,AL032,AL03,AL0,high -AL,9022,AL032,AL03,AL0,high -AL,9301,AL032,AL03,AL0,high -AL,9302,AL032,AL03,AL0,high -AL,9303,AL032,AL03,AL0,high -AL,9304,AL032,AL03,AL0,high -AL,9305,AL032,AL03,AL0,high -AL,9306,AL032,AL03,AL0,high -AL,9307,AL032,AL03,AL0,high -AL,9308,AL032,AL03,AL0,high -AL,9309,AL032,AL03,AL0,high -AL,9310,AL032,AL03,AL0,high -AL,9311,AL032,AL03,AL0,high -AL,9312,AL032,AL03,AL0,high -AL,9313,AL032,AL03,AL0,high -AL,9314,AL032,AL03,AL0,high -AL,9315,AL032,AL03,AL0,high -AL,9316,AL032,AL03,AL0,high -AL,9317,AL032,AL03,AL0,high -AL,9318,AL032,AL03,AL0,high -AL,9319,AL032,AL03,AL0,high -AL,9320,AL032,AL03,AL0,high -AL,9321,AL032,AL03,AL0,high -AL,9322,AL032,AL03,AL0,high -AL,9323,AL032,AL03,AL0,high -AL,9324,AL032,AL03,AL0,high -AL,9325,AL032,AL03,AL0,high -AL,9326,AL032,AL03,AL0,high -AL,9327,AL032,AL03,AL0,high -AL,9328,AL032,AL03,AL0,high -AL,9329,AL032,AL03,AL0,high -AL,9330,AL032,AL03,AL0,high -AL,9331,AL032,AL03,AL0,high -AL,9332,AL032,AL03,AL0,high -AL,9333,AL032,AL03,AL0,high -AL,9334,AL032,AL03,AL0,high -AL,9335,AL032,AL03,AL0,high -AL,9401,AL035,AL03,AL0,high -AL,9402,AL035,AL03,AL0,high -AL,9403,AL035,AL03,AL0,high -AL,9404,AL035,AL03,AL0,high -AL,9405,AL035,AL03,AL0,high -AL,9406,AL035,AL03,AL0,high -AL,9407,AL035,AL03,AL0,high -AL,9408,AL035,AL03,AL0,high -AL,9409,AL035,AL03,AL0,high -AL,9410,AL035,AL03,AL0,high -AL,9411,AL035,AL03,AL0,high -AL,9412,AL035,AL03,AL0,high -AL,9413,AL035,AL03,AL0,high -AL,9414,AL035,AL03,AL0,high -AL,9415,AL035,AL03,AL0,high -AL,9416,AL035,AL03,AL0,high -AL,9417,AL035,AL03,AL0,high -AL,9418,AL035,AL03,AL0,high -AL,9419,AL035,AL03,AL0,high -AL,9420,AL035,AL03,AL0,high -AL,9421,AL035,AL03,AL0,high -AL,9422,AL035,AL03,AL0,high -AL,9423,AL035,AL03,AL0,high -AL,9424,AL035,AL03,AL0,high -AL,9425,AL035,AL03,AL0,high -AL,9426,AL035,AL03,AL0,high -AL,9427,AL035,AL03,AL0,high -AL,9701,AL035,AL03,AL0,high -AL,9702,AL035,AL03,AL0,high -AL,9703,AL035,AL03,AL0,high -AL,9704,AL035,AL03,AL0,high -AL,9705,AL035,AL03,AL0,high -AL,9706,AL035,AL03,AL0,high -AL,9707,AL035,AL03,AL0,high -AL,9708,AL035,AL03,AL0,high -AL,9709,AL035,AL03,AL0,high -AL,9710,AL035,AL03,AL0,high -AL,9711,AL035,AL03,AL0,high -AL,9712,AL035,AL03,AL0,high -AL,9713,AL035,AL03,AL0,high -AL,9714,AL035,AL03,AL0,high -AL,9715,AL035,AL03,AL0,high -AL,9716,AL035,AL03,AL0,high -AL,9717,AL035,AL03,AL0,high -AL,9718,AL035,AL03,AL0,high -AL,9719,AL035,AL03,AL0,high -AL,9720,AL035,AL03,AL0,high -AL,9721,AL035,AL03,AL0,high AT,1012,AT130,AT13,AT1,high AT,1017,AT130,AT13,AT1,high AT,1034,AT130,AT13,AT1,high diff --git a/tests/test_albania_estimates.py b/tests/test_albania_estimates.py deleted file mode 100644 index 75dd4ea..0000000 --- a/tests/test_albania_estimates.py +++ /dev/null @@ -1,83 +0,0 @@ -"""Tests for scripts/build_albania_estimates.py (pure transform).""" - -import importlib.util -from pathlib import Path - -_spec = importlib.util.spec_from_file_location( - "build_albania_estimates", - Path(__file__).resolve().parent.parent / "scripts" / "build_albania_estimates.py", -) -build = importlib.util.module_from_spec(_spec) -_spec.loader.exec_module(build) - - -# GeoNames columns: country, postalcode, place, admin1name, admin1code, ... -SAMPLE = [ - ["AL", "1001", "Tirane", "Tirana", "40"], - ["AL", "1001", "Tirane dup", "Tirana", "40"], # duplicate PC, same qark - ["AL", "5001", "Qender Berat", "Qarku i Beratit", "40"], - ["AL", "9401", "Vlore", "Qarku i Vlorës", "50"], -] - - -def test_maps_qark_to_nuts3_and_derives_levels(): - rows = build.rows_from_geonames(SAMPLE) - by_pc = {r["POSTAL_CODE"]: r for r in rows} - assert by_pc["1001"]["ESTIMATED_NUTS3"] == "AL022" - assert by_pc["1001"]["ESTIMATED_NUTS2"] == "AL02" - assert by_pc["1001"]["ESTIMATED_NUTS1"] == "AL0" - assert by_pc["5001"]["ESTIMATED_NUTS3"] == "AL031" - assert by_pc["9401"]["ESTIMATED_NUTS3"] == "AL035" - assert all(r["COUNTRY_CODE"] == "AL" for r in rows) - assert all(r["CONFIDENCE"] == "high" for r in rows) - - -def test_dedupes_by_postal_code(): - rows = build.rows_from_geonames(SAMPLE) - pcs = [r["POSTAL_CODE"] for r in rows] - assert len(pcs) == len(set(pcs)) == 3 - - -def test_sorted_by_postal_code(): - rows = build.rows_from_geonames(SAMPLE) - pcs = [r["POSTAL_CODE"] for r in rows] - assert pcs == sorted(pcs) - - -def test_unmapped_qark_raises(): - import pytest - - with pytest.raises(ValueError): - build.rows_from_geonames([["AL", "1001", "X", "Unknown County", "40"]]) - - -def test_skips_non_four_digit(): - rows = build.rows_from_geonames([["AL", "100", "X", "Tirana", "40"]]) - assert rows == [] - - -def test_merge_preserves_crlf(tmp_path): - csv_path = tmp_path / "est.csv" - csv_path.write_bytes( - b"COUNTRY_CODE,POSTAL_CODE,ESTIMATED_NUTS3,ESTIMATED_NUTS2,ESTIMATED_NUTS1,CONFIDENCE\r\n" - b"AT,1010,AT130,AT13,AT1,high\r\n" - ) - al_rows = build.rows_from_geonames([["AL", "1001", "T", "Tirana", "40"]]) - build.merge_into_csv(csv_path, al_rows) - data = csv_path.read_bytes() - assert b"\r\nAL,1001,AL022,AL02,AL0,high\r\n" in data - assert data.endswith(b"AT,1010,AT130,AT13,AT1,high\r\n") - assert data.count(b"\r\n") == 3 - - -def test_merge_preserves_lf(tmp_path): - csv_path = tmp_path / "est.csv" - csv_path.write_bytes( - b"COUNTRY_CODE,POSTAL_CODE,ESTIMATED_NUTS3,ESTIMATED_NUTS2,ESTIMATED_NUTS1,CONFIDENCE\n" - b"AT,1010,AT130,AT13,AT1,high\n" - ) - al_rows = build.rows_from_geonames([["AL", "1001", "T", "Tirana", "40"]]) - build.merge_into_csv(csv_path, al_rows) - data = csv_path.read_bytes() - assert b"\r\n" not in data - assert b"AL,1001,AL022,AL02,AL0,high\n" in data diff --git a/tests/test_data_loader.py b/tests/test_data_loader.py index 7a78cad..a43d951 100644 --- a/tests/test_data_loader.py +++ b/tests/test_data_loader.py @@ -263,17 +263,27 @@ class TestBundledAlbaniaData: "AL022", "AL031", "AL032", "AL033", "AL034", "AL035", } - def test_albania_rows_present_and_valid(self): + def test_no_al_rows_remain_in_estimates_csv(self): from pathlib import Path from app.data_loader import parse_estimates_from_text text = Path("tercet_missing_codes.csv").read_text(encoding="utf-8") parsed, _ = parse_estimates_from_text(text) - al = {pc: est for (cc, pc), est in parsed.items() if cc == "AL"} - assert len(al) >= 480 - for pc, est in al.items(): - assert pc.isdigit() and len(pc) == 4 - assert est["nuts3"] in self.VALID_AL_NUTS3 - assert est["nuts2"] == est["nuts3"][:4] - assert est["nuts1"] == "AL0" + assert not any(cc == "AL" for cc, _ in parsed), "AL now resolves via the block map, not estimates" + + def test_block_map_covers_all_twelve_nuts3(self): + from app.albania_blocks import BLOCKS + + assert {nuts3 for _, nuts3, _ in BLOCKS} == self.VALID_AL_NUTS3 + + def test_sample_codes_resolve_estimated(self): + from app.data_loader import lookup + + for pc in ("1001", "1055", "5001", "9401", "3550"): + result = lookup("AL", pc) + assert result is not None + assert result["match_type"] == "estimated" + assert result["nuts3"] in self.VALID_AL_NUTS3 + assert result["nuts2"] == result["nuts3"][:4] + assert result["nuts1"] == "AL0" From 9952d0d1f85254cae4c1a4a56552f15c6e68e318 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 19:28:36 +0200 Subject: [PATCH 5/7] docs: changelog for Albania block resolver (#118) --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2222564..9643e5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/). ## [Unreleased] +### Added + +- **Albania coverage completeness** (#118): AL postal codes now resolve via the + official postal-code block-allocation scheme (`app/albania_blocks.py`) instead + of the incomplete GeoNames estimates. Every well-formed 4-digit AL code maps to + its NUTS3 region by district block — codes GeoNames omitted (e.g. Tirana 1055, + and whole districts like Gramsh 33xx / Peqin 35xx / Tepelenë 63xx / Përmet + 64xx) no longer 404. Validated to reproduce all 489 previously-shipped codes + identically. Because the map is code, not data, AL coverage is now immune to + the `PC2NUTS_ESTIMATES_REFRESH_URL` full-replace clobber. + ## [1.0.0] - 2026-07-03 ### Added From 388076de53b1b6d921c1ffe9b3d4679f99528877 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 19:34:51 +0200 Subject: [PATCH 6/7] docs: align README and lookup() docstring with the Albania block resolver (#118) --- README.md | 10 +++++----- app/data_loader.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 5da6cbb..ac181be 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Albania (AL), North Macedonia (MK), Montenegro (ME), Serbia (RS), Türkiye (TR) > **Montenegro** is treated by Eurostat as a single nationwide unit at every NUTS level (`ME0` / `ME00` / `ME000`), and GISCO does not currently publish a TERCET file for it. Lookups for ME are served by the single-NUTS3 fallback (Tier 5) configured via `single_nuts3_fallback` in `app/settings.json`, returning `ME000` for any valid 5-digit code starting with `8`. -> **Albania** has a full NUTS hierarchy (`AL0`; `AL01` / `AL02` / `AL03`; 12 NUTS3 counties `AL011`–`AL035`) but Eurostat publishes no GISCO TERCET file for it. Coverage is provided through the Tier-2 estimates layer: each of ~489 Albanian 4-digit postal codes is mapped to its NUTS3 county (qark) via GeoNames' admin1 tagging, which corresponds 1:1 to the NUTS3 regions. Lookups return `match_type="estimated"` with `high` confidence — see [Estimates](#estimates). +> **Albania** has a full NUTS hierarchy (`AL0`; `AL01` / `AL02` / `AL03`; 12 NUTS3 counties `AL011`–`AL035`) but Eurostat publishes no GISCO TERCET file for it. Coverage is provided by an authoritative postal-code **block resolver** (`app/albania_blocks.py`): Albanian codes are block-allocated by district — the first two digits identify one of ~33 postal districts, each belonging to one of the 12 NUTS3 qarks — so **any** well-formed 4-digit code resolves to its qark via the block it falls into. Lookups return `match_type="estimated"` with `high` confidence — see [Estimates](#estimates). **Other territories** (1): Faroe Islands (FO) — not part of NUTS; synthetic result. @@ -746,7 +746,7 @@ The estimates file contains **7,632 entries** across 33 countries, with the foll | medium | 1,439 | 18.9% | | low | 447 | 5.9% | -Countries with the most estimates: TR (1,778), LT (1,231), FR (526), DE (500), AL (489), EL (387), CZ (361), RO (358). +Countries with the most estimates: TR (1,778), LT (1,231), FR (526), DE (500), EL (387), CZ (361), RO (358). ### Revalidation @@ -847,7 +847,7 @@ docker build -t postalcode2nuts . docker run -p 8000:8000 postalcode2nuts ``` -On first start the service downloads TERCET data for the 34 countries with GISCO coverage (~2-5 minutes depending on network); Montenegro (single-NUTS3 fallback) and Albania (estimates-only, bundled in `tercet_missing_codes.csv`) need no download. After that everything is cached in a SQLite database for instant restarts. +On first start the service downloads TERCET data for the 34 countries with GISCO coverage (~2-5 minutes depending on network); Montenegro (single-NUTS3 fallback) and Albania (resolved in-code via the postal-code block map) need no download. After that everything is cached in a SQLite database for instant restarts. ### Persistent data volume @@ -939,7 +939,7 @@ tests/ ├── test_nuts_pip.py ├── test_auth.py ├── test_token_db.py -└── ... # full suite also covers estimates refresh, rate limiting, Albania estimates, etc. +└── ... # full suite also covers estimates refresh, rate limiting, the Albania block resolver, etc. scripts/ ├── import_estimates.py # CLI: import pre-computed estimates into SQLite DB └── tokens.py # CLI: manage trusted-token DB (init/add/list/revoke) @@ -1025,7 +1025,7 @@ No Python code changes are required. ## Data sources & attribution -**Postal code → NUTS (both tiers).** [GISCO TERCET flat files](https://ec.europa.eu/eurostat/web/gisco/geodata/administrative-units/postal-codes) ([download](https://gisco-services.ec.europa.eu/tercet/flat-files)), © European Union – GISCO, licensed [CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/). Albanian estimates are derived from [GeoNames](https://www.geonames.org/) admin1 tagging, licensed [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/). +**Postal code → NUTS (both tiers).** [GISCO TERCET flat files](https://ec.europa.eu/eurostat/web/gisco/geodata/administrative-units/postal-codes) ([download](https://gisco-services.ec.europa.eu/tercet/flat-files)), © European Union – GISCO, licensed [CC-BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/). Albanian NUTS3 assignments come from the country's official postal-code block-allocation scheme (Posta Shqiptare), cross-validated against [GeoNames](https://www.geonames.org/) admin1 tagging ([CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)). The [EU Open Data Portal dataset](https://data.europa.eu/data/datasets/postcodes-and-nuts-nomenclature-of-territorial-units-for-statistics) was also considered as a data source. However, its refresh cycle lags behind the GISCO TERCET flat files, so direct sourcing from GISCO was chosen for more up-to-date coverage. diff --git a/app/data_loader.py b/app/data_loader.py index e753443..a3720bc 100644 --- a/app/data_loader.py +++ b/app/data_loader.py @@ -1034,9 +1034,10 @@ def _matches_pattern(cc: str, raw: str) -> bool: def lookup(country_code: str, postal_code: str) -> dict | None: """Look up NUTS codes for a given country + postal code. - Six-tier fall-through: + Tiered fall-through: 1. Exact TERCET match → confidence 1.0 2. Pre-computed estimate → stored confidence per level + 2b. Albania block map → district-block NUTS3, match_type='estimated' (#118) 3. Runtime prefix-based estimation → calculated confidence 4. Country-level majority vote → unanimous NUTS1/2, dominant NUTS3 (e.g. MT) 5. Single-NUTS3 country fallback → confidence 1.0 (e.g. LI, CY, LU) From ec2290baa2c520cb176030b809533987aaa126b8 Mon Sep 17 00:00:00 2001 From: bk86a Date: Fri, 3 Jul 2026 19:36:24 +0200 Subject: [PATCH 7/7] docs: refresh estimates statistics after Albania rows removed (#118) --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ac181be..e43d0ee 100644 --- a/README.md +++ b/README.md @@ -738,13 +738,13 @@ These labels map to numerical confidence scores per NUTS level. Coarser levels r ### Current coverage -The estimates file contains **7,632 entries** across 33 countries, with the following confidence distribution: +The estimates file contains **7,143 entries** across 32 countries, with the following confidence distribution: | Confidence | Count | Share | |------------|-------|-------| -| high | 5,746 | 75.3% | -| medium | 1,439 | 18.9% | -| low | 447 | 5.9% | +| high | 5,257 | 73.6% | +| medium | 1,439 | 20.1% | +| low | 447 | 6.3% | Countries with the most estimates: TR (1,778), LT (1,231), FR (526), DE (500), EL (387), CZ (361), RO (358).