From 21d74c8e648a63f6715b5f292b107d1805ee5d76 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Thu, 21 May 2026 10:42:33 -0600 Subject: [PATCH 01/19] a chunked pyogrio generator --- pyproject.toml | 15 ++++++- scripts/bench_engines.py | 92 ++++++++++++++++++++++++++++++++++++++++ src/rasterstats/io.py | 90 +++++++++++++++++++++++++++++++++++++-- src/rasterstats/main.py | 8 +++- tests/test_io.py | 54 +++++++++++++++++++++++ 5 files changed, 252 insertions(+), 7 deletions(-) create mode 100644 scripts/bench_engines.py diff --git a/pyproject.toml b/pyproject.toml index 23c76b4..fa6e5b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,8 @@ classifiers = [ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Topic :: Utilities", "Topic :: Scientific/Engineering :: GIS", ] @@ -31,7 +33,10 @@ dependencies = [ "affine", "click >7.1, !=8.2.1", "cligj >=0.4", - "fiona", + # TODO either fiona OR pyogrio is required, which should be the default? + # fiona -> backward compatible but won't have support for 3.14+ binaries + # pyogrio -> initial tests show poor performance (3x slower), need to tune, + # otherwise would introduce a bad performance regression. "numpy >=1.9", "rasterio >=1.0", "simplejson", @@ -42,6 +47,12 @@ dependencies = [ progress = [ "tqdm" ] +fiona = [ + "fiona" +] +pyogrio = [ + "pyogrio" +] docs = [ "numpydoc", "sphinx", @@ -56,7 +67,7 @@ test = [ "simplejson", ] dev = [ - "rasterstats[test]", + "rasterstats[test,fiona,pyogrio]", "ruff", "twine", ] diff --git a/scripts/bench_engines.py b/scripts/bench_engines.py new file mode 100644 index 0000000..e924207 --- /dev/null +++ b/scripts/bench_engines.py @@ -0,0 +1,92 @@ +"""Benchmark fiona vs pyogrio as read backends for rasterstats. + +Generates N random point features inside the extent of tests/data/slope.tif, +writes them to a temporary GeoJSON file, then times how long each engine takes +to iterate over every feature via ``read_features``. + +Usage +----- + uv run python scripts/bench_engines.py +""" + +import json +import random +import sys +import tempfile +import time +from pathlib import Path + +import rasterio + +# Resolve the repo root so the script works from any cwd +REPO_ROOT = Path(__file__).parent.parent +SLOPE_TIF = REPO_ROOT / "tests" / "data" / "slope.tif" +DEFAULT_N = 100_000 + + +def generate_geojson(path: Path, n: int) -> None: + with rasterio.open(SLOPE_TIF) as src: + left, bottom, right, top = src.bounds + + features = [ + { + "type": "Feature", + "properties": {"id": i}, + "geometry": { + "type": "Point", + "coordinates": [ + random.uniform(left, right), + random.uniform(bottom, top), + ], + }, + } + for i in range(n) + ] + fc = {"type": "FeatureCollection", "features": features} + with open(path, "w") as f: + json.dump(fc, f) + + +def time_engine(path: Path, engine: str, n: int) -> float: + # Import here so the benchmark reflects real-world import cost only once + from rasterstats.io import read_features + + t0 = time.perf_counter() + count = sum(1 for _ in read_features(str(path), engine=engine)) + elapsed = time.perf_counter() - t0 + assert count == n, f"Expected {n} features, got {count}" + return elapsed + + +def main() -> None: + n = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_N + + print(f"Generating {n:,} random point features over {SLOPE_TIF.name} …") + with tempfile.NamedTemporaryFile(suffix=".geojson", delete=False) as tmp: + tmp_path = Path(tmp.name) + + try: + generate_geojson(tmp_path, n) + file_mb = tmp_path.stat().st_size / 1024 / 1024 + print(f"Wrote {file_mb:.1f} MB → {tmp_path}\n") + + fiona_secs = time_engine(tmp_path, "fiona", n) + print(f"fiona : {fiona_secs:.3f}s ({n / fiona_secs:,.0f} feat/s)") + + try: + import pyogrio # noqa: F401 + + pyogrio_secs = time_engine(tmp_path, "pyogrio", n) + print(f"pyogrio: {pyogrio_secs:.3f}s ({n / pyogrio_secs:,.0f} feat/s)") + ratio = fiona_secs / pyogrio_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1/ratio):.2f}x faster") + except ImportError: + print("pyogrio not installed - skipping pyogrio benchmark") + print("Install with: pip install rasterstats[pyogrio]") + finally: + tmp_path.unlink(missing_ok=True) + + +if __name__ == "__main__": + main() diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index 7ae8d87..e07aa7c 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -29,22 +29,104 @@ "MultiPolygon", ] +# Fiona backend + try: # Fiona 1.9+ import fiona.model - def fiona_generator(obj, layer=0): + def _fiona_generator(obj, layer=0): with fiona.open(obj, "r", layer=layer) as src: for feat in src: yield fiona.model.to_dict(feat) except ModuleNotFoundError: # Fiona <1.9 - def fiona_generator(obj, layer=0): + def _fiona_generator(obj, layer=0): with fiona.open(obj, "r", layer=layer) as src: yield from src +# pyogrio backend + + +def _pyogrio_generator(obj, layer=0, chunk_size=24_000): + """Yield GeoJSON-like Feature dicts using pyogrio, reading in chunks.""" + try: + import pyogrio + import pyogrio.raw + except ImportError as e: + raise ImportError( + "pyogrio is required for engine='pyogrio'. " + "Install it with: pip install rasterstats[pyogrio]" + ) from e + + info = pyogrio.read_info(obj, layer=layer) + total = info["features"] + field_names = list(info["fields"]) + + skip = 0 + while skip < total: + _meta, _fids, geom_wkb, field_data = pyogrio.raw.read( + obj, + layer=layer, + skip_features=skip, + max_features=chunk_size, + ) + batch_size = len(geom_wkb) + for i in range(batch_size): + geom = wkb.loads(geom_wkb[i]) + props = { + name: ( + field_data[j][i].item() + if hasattr(field_data[j][i], "item") + else field_data[j][i] + ) + for j, name in enumerate(field_names) + } + yield { + "type": "Feature", + "geometry": geom.__geo_interface__, + "properties": props, + } + skip += batch_size + if batch_size < chunk_size: + break + + +# Public dispatcher + + +def feature_generator(obj, layer=0, engine=None): + """Yield GeoJSON-like Feature dicts from a file-based vector source. + + Parameters + ---------- + obj : str or PathLike + Path to a vector data source supported by fiona or pyogrio. + layer : int or str, optional + Layer index or name (default: 0). + engine : {"fiona", "pyogrio"} or None, optional + Backend to use for reading. ``None`` and ``"fiona"`` both select + the fiona backend. Pass ``"pyogrio"`` explicitly to use pyogrio. + + Yields + ------ + dict + GeoJSON-like Feature dicts. + """ + if engine is None or engine == "fiona": + yield from _fiona_generator(obj, layer=layer) + elif engine == "pyogrio": + yield from _pyogrio_generator(obj, layer=layer) + else: + raise ValueError(f"Unknown engine {engine!r}. Choose 'fiona' or 'pyogrio'.") + + +# Backward-compatible alias +fiona_generator = feature_generator + + def wrap_geom(geom): """Wraps a geometry dict in an GeoJSON Feature""" return {"type": "Feature", "properties": {}, "geometry": geom} @@ -89,7 +171,7 @@ def parse_feature(obj): raise ValueError(f"Can't parse {obj} as a geojson Feature object") -def read_features(obj, layer=0): +def read_features(obj, layer=0, engine=None): features_iter = None if isinstance(obj, (str, PathLike)): obj = str(obj) @@ -98,7 +180,7 @@ def read_features(obj, layer=0): with fiona.open(obj, "r", layer=layer) as src: assert len(src) > 0 - features_iter = fiona_generator(obj, layer) + features_iter = feature_generator(obj, layer, engine=engine) except ( AssertionError, DriverError, diff --git a/src/rasterstats/main.py b/src/rasterstats/main.py index 43d9dc1..2f10f6a 100644 --- a/src/rasterstats/main.py +++ b/src/rasterstats/main.py @@ -71,6 +71,7 @@ def gen_zonal_stats( prefix=None, geojson_out=False, boundless=True, + engine=None, **kwargs, ): """Zonal statistics of raster values aggregated to vector geometries. @@ -145,6 +146,11 @@ def gen_zonal_stats( Allow features that extend beyond the raster dataset's extent, default: True Cells outside dataset extents are treated as nodata. + engine : {"fiona", "pyogrio"} or None, optional + Backend to use when reading file-based vector sources. + ``None`` or ``"fiona"`` (the default) both select the fiona backend. + Pass ``"pyogrio"`` to use pyogrio explicitly. + Returns ------- generator of dicts (if geojson_out is False) @@ -179,7 +185,7 @@ def gen_zonal_stats( band = band_num with Raster(raster, affine, nodata, band) as rast: - features_iter = read_features(vectors, layer) + features_iter = read_features(vectors, layer, engine=engine) for _, feat in enumerate(features_iter): geom = shape(feat["geometry"]) diff --git a/tests/test_io.py b/tests/test_io.py index 3199658..5ad22aa 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -11,6 +11,7 @@ Raster, boundless_array, bounds_window, + feature_generator, fiona_generator, read_featurecollection, read_features, @@ -370,7 +371,60 @@ def next(self): assert list(read_features(geothing)) == features +# feature_generator / engine tests + + +def test_feature_generator_fiona_default(): + """feature_generator with no engine= uses fiona and matches target.""" + result = list(feature_generator(polygons)) + assert result == target_features + + +def test_feature_generator_explicit_fiona(): + """feature_generator with engine='fiona' matches target.""" + result = list(feature_generator(polygons, engine="fiona")) + assert result == target_features + + +def test_feature_generator_pyogrio(): + """feature_generator with engine='pyogrio' yields same geometries.""" + pytest.importorskip("pyogrio") + result = list(feature_generator(polygons, engine="pyogrio")) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) + + +def test_feature_generator_pyogrio_properties(): + """pyogrio engine preserves feature properties.""" + pytest.importorskip("pyogrio") + result = list(feature_generator(polygons, engine="pyogrio")) + assert all("properties" in f for f in result) + assert all("id" in f["properties"] for f in result) + + +def test_feature_generator_invalid_engine(): + """An unrecognised engine name raises ValueError.""" + with pytest.raises(ValueError, match="Unknown engine"): + list(feature_generator(polygons, engine="gdal")) + + +def test_fiona_generator_alias(): + """fiona_generator is still importable and works as a backward-compat alias.""" + result = list(fiona_generator(polygons)) + assert result == target_features + + +def test_read_features_pyogrio_engine(): + """read_features forwards engine= to the underlying generator.""" + pytest.importorskip("pyogrio") + result = list(read_features(polygons, engine="pyogrio")) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) + + # Optional tests + + def test_geodataframe(): gpd = pytest.importorskip("geopandas") From 6d945d84abdbc65088b3e4e0c3a41447d69959e6 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Thu, 21 May 2026 12:28:03 -0600 Subject: [PATCH 02/19] improve readability, increase chunk size --- src/rasterstats/io.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index e07aa7c..bf06ec9 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -8,6 +8,7 @@ import fiona import numpy as np import rasterio +import shapely from affine import Affine from fiona.errors import DriverError from rasterio.enums import MaskFlags @@ -50,7 +51,7 @@ def _fiona_generator(obj, layer=0): # pyogrio backend -def _pyogrio_generator(obj, layer=0, chunk_size=24_000): +def _pyogrio_generator(obj, layer=0, chunk_size=65535): """Yield GeoJSON-like Feature dicts using pyogrio, reading in chunks.""" try: import pyogrio @@ -67,26 +68,24 @@ def _pyogrio_generator(obj, layer=0, chunk_size=24_000): skip = 0 while skip < total: - _meta, _fids, geom_wkb, field_data = pyogrio.raw.read( + _meta, _fids, geometries, field_data = pyogrio.raw.read( obj, layer=layer, skip_features=skip, max_features=chunk_size, + use_arrow=True, ) - batch_size = len(geom_wkb) + batch_size = len(geometries) + geoms = shapely.from_wkb(geometries) + cols = [col.tolist() for col in field_data] for i in range(batch_size): - geom = wkb.loads(geom_wkb[i]) - props = { - name: ( - field_data[j][i].item() - if hasattr(field_data[j][i], "item") - else field_data[j][i] - ) - for j, name in enumerate(field_names) - } + props = {name: cols[j][i] for j, name in enumerate(field_names)} yield { "type": "Feature", - "geometry": geom.__geo_interface__, + # converting to geom dict, only to convert back later? + # slight performance hit but it keeps the API identical. + # TODO optimize + "geometry": geoms[i].__geo_interface__, "properties": props, } skip += batch_size From 5c6a6e85c4278fcae0608e1af6e8fc11345296e4 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Thu, 21 May 2026 12:40:29 -0600 Subject: [PATCH 03/19] no arrow, benchmark geopackages --- scripts/bench_engines.py | 58 +++++++++++++++++++++++++++++++++++----- src/rasterstats/io.py | 5 ++-- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/scripts/bench_engines.py b/scripts/bench_engines.py index e924207..1623b0a 100644 --- a/scripts/bench_engines.py +++ b/scripts/bench_engines.py @@ -1,8 +1,9 @@ """Benchmark fiona vs pyogrio as read backends for rasterstats. Generates N random point features inside the extent of tests/data/slope.tif, -writes them to a temporary GeoJSON file, then times how long each engine takes -to iterate over every feature via ``read_features``. +writes them to a temporary GeoJSON file, converts it to a GeoPackage via +ogr2ogr, then times how long each engine takes to iterate over every feature +via ``read_features``. Usage ----- @@ -11,6 +12,7 @@ import json import random +import subprocess import sys import tempfile import time @@ -47,12 +49,30 @@ def generate_geojson(path: Path, n: int) -> None: json.dump(fc, f) -def time_engine(path: Path, engine: str, n: int) -> float: +GPKG_LAYER = "features" + + +def generate_gpkg(geojson_path: Path, gpkg_path: Path) -> None: + """Convert an existing GeoJSON file to GeoPackage using ogr2ogr.""" + subprocess.run( + [ + "ogr2ogr", + "-f", "GPKG", + "-nln", GPKG_LAYER, + str(gpkg_path), + str(geojson_path), + ], + check=True, + capture_output=True, + ) + + +def time_engine(path: Path, engine: str, n: int, layer=0) -> float: # Import here so the benchmark reflects real-world import cost only once from rasterstats.io import read_features t0 = time.perf_counter() - count = sum(1 for _ in read_features(str(path), engine=engine)) + count = sum(1 for _ in read_features(str(path), layer=layer, engine=engine)) elapsed = time.perf_counter() - t0 assert count == n, f"Expected {n} features, got {count}" return elapsed @@ -64,12 +84,20 @@ def main() -> None: print(f"Generating {n:,} random point features over {SLOPE_TIF.name} …") with tempfile.NamedTemporaryFile(suffix=".geojson", delete=False) as tmp: tmp_path = Path(tmp.name) + gpkg_path = Path(tempfile.mktemp(suffix=".gpkg")) try: generate_geojson(tmp_path, n) file_mb = tmp_path.stat().st_size / 1024 / 1024 - print(f"Wrote {file_mb:.1f} MB → {tmp_path}\n") + print(f"Wrote GeoJSON {file_mb:.1f} MB → {tmp_path}") + print("Converting to GeoPackage via ogr2ogr …") + generate_gpkg(tmp_path, gpkg_path) + gpkg_mb = gpkg_path.stat().st_size / 1024 / 1024 + print(f"Wrote GeoPackage {gpkg_mb:.1f} MB → {gpkg_path}\n") + + # --- GeoJSON benchmark --- + print("=== GeoJSON ===") fiona_secs = time_engine(tmp_path, "fiona", n) print(f"fiona : {fiona_secs:.3f}s ({n / fiona_secs:,.0f} feat/s)") @@ -80,12 +108,30 @@ def main() -> None: print(f"pyogrio: {pyogrio_secs:.3f}s ({n / pyogrio_secs:,.0f} feat/s)") ratio = fiona_secs / pyogrio_secs faster = "pyogrio" if ratio > 1 else "fiona" - print(f"\n{faster} is {max(ratio, 1/ratio):.2f}x faster") + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (GeoJSON)") except ImportError: print("pyogrio not installed - skipping pyogrio benchmark") print("Install with: pip install rasterstats[pyogrio]") + + # --- GeoPackage benchmark --- + print("\n=== GeoPackage ===") + fiona_gpkg_secs = time_engine(gpkg_path, "fiona", n, layer=GPKG_LAYER) + print(f"fiona : {fiona_gpkg_secs:.3f}s ({n / fiona_gpkg_secs:,.0f} feat/s)") + + try: + import pyogrio # noqa: F401 + + pyogrio_gpkg_secs = time_engine(gpkg_path, "pyogrio", n, layer=GPKG_LAYER) + print(f"pyogrio: {pyogrio_gpkg_secs:.3f}s ({n / pyogrio_gpkg_secs:,.0f} feat/s)") + ratio = fiona_gpkg_secs / pyogrio_gpkg_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (GeoPackage)") + except ImportError: + pass # already reported above in the GeoJSON block + finally: tmp_path.unlink(missing_ok=True) + gpkg_path.unlink(missing_ok=True) if __name__ == "__main__": diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index bf06ec9..548ba81 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -51,7 +51,7 @@ def _fiona_generator(obj, layer=0): # pyogrio backend -def _pyogrio_generator(obj, layer=0, chunk_size=65535): +def _pyogrio_generator(obj, layer=0, chunk_size=65536): """Yield GeoJSON-like Feature dicts using pyogrio, reading in chunks.""" try: import pyogrio @@ -73,7 +73,8 @@ def _pyogrio_generator(obj, layer=0, chunk_size=65535): layer=layer, skip_features=skip, max_features=chunk_size, - use_arrow=True, + # Note do not use_arrow=True, reads all records into mem + # https://pyogrio.readthedocs.io/en/latest/about.html#how-it-works ) batch_size = len(geometries) geoms = shapely.from_wkb(geometries) From 445b3afe2d5015737d594211a63c3abb7e301bec Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Thu, 21 May 2026 16:05:19 -0600 Subject: [PATCH 04/19] optimize --- src/rasterstats/io.py | 7 +++---- src/rasterstats/main.py | 5 ++++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index 548ba81..9cff315 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -83,10 +83,9 @@ def _pyogrio_generator(obj, layer=0, chunk_size=65536): props = {name: cols[j][i] for j, name in enumerate(field_names)} yield { "type": "Feature", - # converting to geom dict, only to convert back later? - # slight performance hit but it keeps the API identical. - # TODO optimize - "geometry": geoms[i].__geo_interface__, + # cheat/optimize: return the shapely geometry + # instead of a true geojson-like dict + "geometry": geoms[i], "properties": props, } skip += batch_size diff --git a/src/rasterstats/main.py b/src/rasterstats/main.py index 2f10f6a..20812b9 100644 --- a/src/rasterstats/main.py +++ b/src/rasterstats/main.py @@ -4,6 +4,7 @@ import numpy as np from affine import Affine +from shapely import Geometry from shapely.geometry import shape from rasterstats.io import Raster, read_features @@ -187,7 +188,9 @@ def gen_zonal_stats( with Raster(raster, affine, nodata, band) as rast: features_iter = read_features(vectors, layer, engine=engine) for _, feat in enumerate(features_iter): - geom = shape(feat["geometry"]) + geom = feat["geometry"] + if not isinstance(geom, Geometry): + geom = shape(geom) if "Point" in geom.geom_type: geom = boxify_points(geom, rast) From 7aafff85eb9d478eecd358894eb4150b2c4ec776 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Thu, 21 May 2026 17:32:42 -0600 Subject: [PATCH 05/19] additional formats --- scripts/bench_engines.py | 90 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/scripts/bench_engines.py b/scripts/bench_engines.py index 1623b0a..549daf2 100644 --- a/scripts/bench_engines.py +++ b/scripts/bench_engines.py @@ -1,9 +1,9 @@ """Benchmark fiona vs pyogrio as read backends for rasterstats. Generates N random point features inside the extent of tests/data/slope.tif, -writes them to a temporary GeoJSON file, converts it to a GeoPackage via -ogr2ogr, then times how long each engine takes to iterate over every feature -via ``read_features``. +writes them to a temporary GeoJSON file, converts it to GeoPackage, Shapefile, +and Parquet via ogr2ogr, then times how long each engine takes to iterate over +every feature via ``read_features``. Usage ----- @@ -12,6 +12,7 @@ import json import random +import shutil import subprocess import sys import tempfile @@ -50,6 +51,7 @@ def generate_geojson(path: Path, n: int) -> None: GPKG_LAYER = "features" +SHP_LAYER = "features" def generate_gpkg(geojson_path: Path, gpkg_path: Path) -> None: @@ -67,6 +69,39 @@ def generate_gpkg(geojson_path: Path, gpkg_path: Path) -> None: ) +def generate_shp(geojson_path: Path, shp_dir: Path) -> None: + """Convert an existing GeoJSON file to ESRI Shapefile using ogr2ogr. + + ogr2ogr writes a directory of sidecar files (.shp, .dbf, .shx, .prj) + so ``shp_dir`` must be a directory path that does not yet exist. + """ + subprocess.run( + [ + "ogr2ogr", + "-f", "ESRI Shapefile", + "-nln", SHP_LAYER, + str(shp_dir), + str(geojson_path), + ], + check=True, + capture_output=True, + ) + + +def generate_parquet(geojson_path: Path, parquet_path: Path) -> None: + """Convert an existing GeoJSON file to Parquet using ogr2ogr.""" + subprocess.run( + [ + "ogr2ogr", + "-f", "Parquet", + str(parquet_path), + str(geojson_path), + ], + check=True, + capture_output=True, + ) + + def time_engine(path: Path, engine: str, n: int, layer=0) -> float: # Import here so the benchmark reflects real-world import cost only once from rasterstats.io import read_features @@ -85,6 +120,8 @@ def main() -> None: with tempfile.NamedTemporaryFile(suffix=".geojson", delete=False) as tmp: tmp_path = Path(tmp.name) gpkg_path = Path(tempfile.mktemp(suffix=".gpkg")) + shp_dir = Path(tempfile.mkdtemp()) + parquet_path = Path(tempfile.mktemp(suffix=".parquet")) try: generate_geojson(tmp_path, n) @@ -94,7 +131,17 @@ def main() -> None: print("Converting to GeoPackage via ogr2ogr …") generate_gpkg(tmp_path, gpkg_path) gpkg_mb = gpkg_path.stat().st_size / 1024 / 1024 - print(f"Wrote GeoPackage {gpkg_mb:.1f} MB → {gpkg_path}\n") + print(f"Wrote GeoPackage {gpkg_mb:.1f} MB → {gpkg_path}") + + print("Converting to Shapefile via ogr2ogr …") + generate_shp(tmp_path, shp_dir) + shp_mb = sum(f.stat().st_size for f in shp_dir.rglob("*") if f.is_file()) / 1024 / 1024 + print(f"Wrote Shapefile {shp_mb:.1f} MB → {shp_dir}") + + print("Converting to Parquet via ogr2ogr …") + generate_parquet(tmp_path, parquet_path) + parquet_mb = parquet_path.stat().st_size / 1024 / 1024 + print(f"Wrote Parquet {parquet_mb:.1f} MB → {parquet_path}\n") # --- GeoJSON benchmark --- print("=== GeoJSON ===") @@ -129,9 +176,44 @@ def main() -> None: except ImportError: pass # already reported above in the GeoJSON block + # --- Shapefile benchmark --- + shp_path = shp_dir / (SHP_LAYER + ".shp") + print("\n=== Shapefile ===") + fiona_shp_secs = time_engine(shp_path, "fiona", n, layer=SHP_LAYER) + print(f"fiona : {fiona_shp_secs:.3f}s ({n / fiona_shp_secs:,.0f} feat/s)") + + try: + import pyogrio # noqa: F401 + + pyogrio_shp_secs = time_engine(shp_path, "pyogrio", n, layer=SHP_LAYER) + print(f"pyogrio: {pyogrio_shp_secs:.3f}s ({n / pyogrio_shp_secs:,.0f} feat/s)") + ratio = fiona_shp_secs / pyogrio_shp_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (Shapefile)") + except ImportError: + pass # already reported above in the GeoJSON block + + # --- Parquet benchmark --- + print("\n=== Parquet ===") + fiona_parquet_secs = time_engine(parquet_path, "fiona", n, layer=0) + print(f"fiona : {fiona_parquet_secs:.3f}s ({n / fiona_parquet_secs:,.0f} feat/s)") + + try: + import pyogrio # noqa: F401 + + pyogrio_parquet_secs = time_engine(parquet_path, "pyogrio", n, layer=0) + print(f"pyogrio: {pyogrio_parquet_secs:.3f}s ({n / pyogrio_parquet_secs:,.0f} feat/s)") + ratio = fiona_parquet_secs / pyogrio_parquet_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (Parquet)") + except ImportError: + pass # already reported above in the GeoJSON block + finally: tmp_path.unlink(missing_ok=True) gpkg_path.unlink(missing_ok=True) + shutil.rmtree(shp_dir, ignore_errors=True) + parquet_path.unlink(missing_ok=True) if __name__ == "__main__": From 5491ca4a99e7a138a3da8f2d9e1387a6d0ae2d89 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 09:30:44 -0600 Subject: [PATCH 06/19] ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0de38f5..e07e7a6 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ Vagrantfile venv .eggs .cache +.python-version # no uv lockfile until this is fixed: # https://github.com/astral-sh/uv/issues/10845 From 9571d3ff49c104739cff1b09dbcae980de7421c5 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 09:48:07 -0600 Subject: [PATCH 07/19] ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index e07e7a6..286c44b 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ venv .eggs .cache .python-version +*.lprof # no uv lockfile until this is fixed: # https://github.com/astral-sh/uv/issues/10845 From 5bff0c857102d5180dd4a6dff85642acb1f41810 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 09:48:48 -0600 Subject: [PATCH 08/19] format, make pyogrio the default --- pyproject.toml | 120 +++++++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 68 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fa6e5b6..cc60b4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,4 @@ +# Note: code auto-formatted with `taplo fmt` [build-system] requires = ["hatchling"] build-backend = "hatchling.build" @@ -5,72 +6,59 @@ build-backend = "hatchling.build" [project] name = "rasterstats" description = "Summarize geospatial raster datasets based on vector geometries" -authors = [ - {name = "Matthew Perry", email = "perrygeo@gmail.com"}, -] +authors = [{ name = "Matthew Perry", email = "perrygeo@gmail.com" }] readme = "README.rst" -keywords = ["gis", "geospatial", "geographic", "raster", "vector", "zonal statistics"] +keywords = [ + "gis", + "geospatial", + "geographic", + "raster", + "vector", + "zonal statistics", +] dynamic = ["version"] -license = {text = "BSD-3-Clause"} +license = { text = "BSD-3-Clause" } classifiers = [ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: BSD License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - "Programming Language :: Python :: 3.14", - "Topic :: Utilities", - "Topic :: Scientific/Engineering :: GIS", + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Utilities", + "Topic :: Scientific/Engineering :: GIS", ] requires-python = ">=3.9" dependencies = [ - "affine", - "click >7.1, !=8.2.1", - "cligj >=0.4", - # TODO either fiona OR pyogrio is required, which should be the default? - # fiona -> backward compatible but won't have support for 3.14+ binaries - # pyogrio -> initial tests show poor performance (3x slower), need to tune, - # otherwise would introduce a bad performance regression. - "numpy >=1.9", - "rasterio >=1.0", - "simplejson", - "shapely", + "affine", + "click >7.1, !=8.2.1", + "cligj >=0.4", + "numpy >=1.9", + "pyogrio", + "rasterio >=1.0", + "simplejson", + "shapely", ] [project.optional-dependencies] -progress = [ - "tqdm" -] -fiona = [ - "fiona" -] -pyogrio = [ - "pyogrio" -] -docs = [ - "numpydoc", - "sphinx", - "sphinx-rtd-theme", -] +progress = ["tqdm"] +fiona = ["fiona"] +docs = ["numpydoc", "sphinx", "sphinx-rtd-theme"] test = [ - "coverage", - "geopandas", - "pyshp >=1.1.4", - "pytest >=4.6", - "pytest-cov >=2.2.0", - "simplejson", -] -dev = [ - "rasterstats[test,fiona,pyogrio]", - "ruff", - "twine", + "coverage", + "geopandas", + "pyshp >=1.1.4", + "pytest >=4.6", + "pytest-cov >=2.2.0", + "simplejson", ] +dev = ["rasterstats[test,fiona]", "ruff", "twine"] [project.entry-points."rasterio.rio_plugins"] zonalstats = "rasterstats.cli:zonalstats" @@ -84,31 +72,27 @@ Documentation = "https://pythonhosted.org/rasterstats/" only-include = ["src", "tests"] [tool.pytest.ini_options] -filterwarnings = [ - "error", - "ignore::UserWarning", -] +filterwarnings = ["error", "ignore::UserWarning"] testpaths = ["tests"] -# addopts = "--verbose -rf --ipdb --maxfail=1" +addopts = "--verbose -rf --cov rasterstats --cov-report html" [tool.setuptools.dynamic] -version = {attr = "rasterstats._version.__version__"} +version = { attr = "rasterstats._version.__version__" } [tool.hatch.version] path = "src/rasterstats/_version.py" [tool.ruff.lint] select = [ - "E", # pycodestyle - "F", # Pyflakes - "I", # isort - "RUF", # Ruff-specific rules - "UP", # pyupgrade + "E", # pycodestyle + "F", # Pyflakes + "I", # isort + "RUF", # Ruff-specific rules + "UP", # pyupgrade ] ignore = [ - "RUF005", # Consider iterable unpacking instead of concatenation + "RUF005", # Disable: Consider iterable unpacking instead of concatenation ] [tool.ruff] -# TODO: files in docs/notebooks/ use old versions and are incompatible with modern tools extend-exclude = ["*.ipynb"] From 9cd509a9a259856dc372f47d5e35fe19dfabb801 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 09:51:33 -0600 Subject: [PATCH 09/19] test against feature_generator --- tests/test_io.py | 79 ++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/tests/test_io.py b/tests/test_io.py index 5ad22aa..fb6d4a8 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1,7 +1,6 @@ import json from pathlib import Path -import fiona import numpy as np import pytest import rasterio @@ -29,7 +28,7 @@ eps = 1e-6 -target_features = [f for f in fiona_generator(polygons)] +target_features = list(feature_generator(polygons)) target_geoms = [shape(f["geometry"]) for f in target_features] @@ -53,16 +52,24 @@ def _test_read_features_single(indata): def test_fiona_path(): - assert list(read_features(polygons)) == target_features + """read_features on a path uses pyogrio default; geometries match.""" + features = list(read_features(polygons)) + geoms = [shape(f["geometry"]) for f in features] + _compare_geomlists(geoms, target_geoms) def test_layer_index(): + fiona = pytest.importorskip("fiona") layer = fiona.listlayers(data_dir).index("polygons") - assert list(read_features(data_dir, layer=layer)) == target_features + result = list(read_features(data_dir, layer=layer)) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) def test_layer_name(): - assert list(read_features(data_dir, layer="polygons")) == target_features + result = list(read_features(data_dir, layer="polygons")) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) def test_path_unicode(): @@ -71,75 +78,78 @@ def test_path_unicode(): except NameError: # python3, it's already unicode upolygons = polygons - assert list(read_features(upolygons)) == target_features + result = list(read_features(upolygons)) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) def test_featurecollection(): - assert ( - read_featurecollection(polygons)["features"] - == list(read_features(polygons)) - == target_features - ) + fc_features = read_featurecollection(polygons)["features"] + rf_features = list(read_features(polygons)) + geoms_fc = [shape(f["geometry"]) for f in fc_features] + geoms_rf = [shape(f["geometry"]) for f in rf_features] + _compare_geomlists(geoms_fc, target_geoms) + _compare_geomlists(geoms_rf, target_geoms) def test_shapely(): - indata = [shape(f["geometry"]) for f in fiona_generator(polygons)] + indata = [shape(f["geometry"]) for f in feature_generator(polygons)] _test_read_features(indata) _test_read_features_single(indata[0]) def test_wkt(): - indata = [shape(f["geometry"]).wkt for f in fiona_generator(polygons)] + indata = [shape(f["geometry"]).wkt for f in feature_generator(polygons)] _test_read_features(indata) _test_read_features_single(indata[0]) def test_wkb(): - indata = [shape(f["geometry"]).wkb for f in fiona_generator(polygons)] + indata = [shape(f["geometry"]).wkb for f in feature_generator(polygons)] _test_read_features(indata) _test_read_features_single(indata[0]) def test_mapping_features(): # list of Features - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) _test_read_features(indata) def test_mapping_feature(): # list of Features - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) _test_read_features(indata[0]) def test_mapping_geoms(): - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) _test_read_features(indata[0]["geometry"]) def test_mapping_collection(): indata = {"type": "FeatureCollection"} - indata["features"] = [f for f in fiona_generator(polygons)] + indata["features"] = list(feature_generator(polygons)) _test_read_features(indata) def test_jsonstr(): # Feature str - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) indata = json.dumps(indata[0]) _test_read_features(indata) def test_jsonstr_geom(): # geojson geom str - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) indata = json.dumps(indata[0]["geometry"]) _test_read_features(indata) def test_jsonstr_collection(): indata = {"type": "FeatureCollection"} - indata["features"] = [f for f in fiona_generator(polygons)] + indata["features"] = list(feature_generator(polygons)) indata = json.dumps(indata) _test_read_features(indata) @@ -164,19 +174,19 @@ def __init__(self, f): def test_geo_interface(): - indata = [MockGeoInterface(f) for f in fiona_generator(polygons)] + indata = [MockGeoInterface(f) for f in feature_generator(polygons)] _test_read_features(indata) def test_geo_interface_geom(): - indata = [MockGeoInterface(f["geometry"]) for f in fiona_generator(polygons)] + indata = [MockGeoInterface(f["geometry"]) for f in feature_generator(polygons)] _test_read_features(indata) def test_geo_interface_collection(): # geointerface for featurecollection? indata = {"type": "FeatureCollection"} - indata["features"] = [f for f in fiona_generator(polygons)] + indata["features"] = list(feature_generator(polygons)) indata = MockGeoInterface(indata) _test_read_features(indata) @@ -374,16 +384,19 @@ def next(self): # feature_generator / engine tests -def test_feature_generator_fiona_default(): - """feature_generator with no engine= uses fiona and matches target.""" +def test_feature_generator_pyogrio_default(): + """feature_generator with no engine= uses pyogrio (new default).""" result = list(feature_generator(polygons)) - assert result == target_features + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) def test_feature_generator_explicit_fiona(): - """feature_generator with engine='fiona' matches target.""" + """feature_generator with engine='fiona' yields correct geometries.""" + pytest.importorskip("fiona") result = list(feature_generator(polygons, engine="fiona")) - assert result == target_features + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) def test_feature_generator_pyogrio(): @@ -409,9 +422,11 @@ def test_feature_generator_invalid_engine(): def test_fiona_generator_alias(): - """fiona_generator is still importable and works as a backward-compat alias.""" - result = list(fiona_generator(polygons)) - assert result == target_features + """fiona_generator backward-compat alias still works when fiona is installed.""" + pytest.importorskip("fiona") + result = list(fiona_generator(polygons, engine="fiona")) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) def test_read_features_pyogrio_engine(): From 2910f8a0435e86abdd2fec71df82fb9c1cb4cb54 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 09:52:14 -0600 Subject: [PATCH 10/19] removed fiona imports, make pyogrio the default vector engine --- src/rasterstats/io.py | 84 ++++++++++++++++++++++++----------------- src/rasterstats/main.py | 7 ++-- 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index 9cff315..43883b2 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -5,12 +5,10 @@ from json import JSONDecodeError from os import PathLike -import fiona import numpy as np import rasterio import shapely from affine import Affine -from fiona.errors import DriverError from rasterio.enums import MaskFlags from rasterio.transform import guard_transform from shapely import wkb, wkt @@ -32,18 +30,27 @@ # Fiona backend -try: - # Fiona 1.9+ - import fiona.model - def _fiona_generator(obj, layer=0): +def _fiona_generator(obj, layer=0): + """Yield GeoJSON-like Feature dicts using fiona (optional engine). + + Raises ImportError with a helpful message if fiona is not installed. + """ + try: + import fiona + except ImportError: + raise ImportError( + "fiona is required for engine='fiona'. " + "Install it with: pip install rasterstats[fiona]" + ) + try: + import fiona.model + with fiona.open(obj, "r", layer=layer) as src: for feat in src: yield fiona.model.to_dict(feat) - -except ModuleNotFoundError: - # Fiona <1.9 - def _fiona_generator(obj, layer=0): + except ImportError: + # fiona < 1.9 — no fiona.model with fiona.open(obj, "r", layer=layer) as src: yield from src @@ -83,9 +90,7 @@ def _pyogrio_generator(obj, layer=0, chunk_size=65536): props = {name: cols[j][i] for j, name in enumerate(field_names)} yield { "type": "Feature", - # cheat/optimize: return the shapely geometry - # instead of a true geojson-like dict - "geometry": geoms[i], + "geometry": geoms[i].__geo_interface__, "properties": props, } skip += batch_size @@ -95,6 +100,8 @@ def _pyogrio_generator(obj, layer=0, chunk_size=65536): # Public dispatcher +DEFAULT_ENGINE = "pyogrio" + def feature_generator(obj, layer=0, engine=None): """Yield GeoJSON-like Feature dicts from a file-based vector source. @@ -102,24 +109,26 @@ def feature_generator(obj, layer=0, engine=None): Parameters ---------- obj : str or PathLike - Path to a vector data source supported by fiona or pyogrio. + Path to a vector data source supported by pyogrio (default) or fiona. layer : int or str, optional Layer index or name (default: 0). - engine : {"fiona", "pyogrio"} or None, optional - Backend to use for reading. ``None`` and ``"fiona"`` both select - the fiona backend. Pass ``"pyogrio"`` explicitly to use pyogrio. + engine : {"pyogrio", "fiona"} or None, optional + Backend to use for reading. ``None`` selects the default engine + (``"pyogrio"``). Pass ``"fiona"`` to opt in to the fiona backend + (requires ``pip install rasterstats[fiona]``). Yields ------ dict GeoJSON-like Feature dicts. """ - if engine is None or engine == "fiona": - yield from _fiona_generator(obj, layer=layer) - elif engine == "pyogrio": + resolved = engine if engine is not None else DEFAULT_ENGINE + if resolved == "pyogrio": yield from _pyogrio_generator(obj, layer=layer) + elif resolved == "fiona": + yield from _fiona_generator(obj, layer=layer) else: - raise ValueError(f"Unknown engine {engine!r}. Choose 'fiona' or 'pyogrio'.") + raise ValueError(f"Unknown engine {resolved!r}. Choose 'pyogrio' or 'fiona'.") # Backward-compatible alias @@ -170,24 +179,31 @@ def parse_feature(obj): raise ValueError(f"Can't parse {obj} as a geojson Feature object") +def _is_vector_file(path, layer): + """Return True if ``path`` is a readable vector file with at least one feature. + + Uses pyogrio.read_info for the probe — no fiona required. + Returns False for invalid paths, non-vector files, and genuinely empty sources. + A feature count of -1 (unknown, e.g. some GeoJSON drivers) is treated as + non-empty (True). + """ + try: + import pyogrio + + info = pyogrio.read_info(path, layer=layer) + # -1 means the driver cannot report a count; treat as non-empty + return info["features"] != 0 + except Exception: + return False + + def read_features(obj, layer=0, engine=None): features_iter = None if isinstance(obj, (str, PathLike)): obj = str(obj) - try: - # test it as fiona data source - with fiona.open(obj, "r", layer=layer) as src: - assert len(src) > 0 - + if _is_vector_file(obj, layer): features_iter = feature_generator(obj, layer, engine=engine) - except ( - AssertionError, - DriverError, - OSError, - TypeError, - UnicodeDecodeError, - ValueError, - ): + else: try: mapping = json.loads(obj) if "type" in mapping and mapping["type"] == "FeatureCollection": diff --git a/src/rasterstats/main.py b/src/rasterstats/main.py index 20812b9..533a90c 100644 --- a/src/rasterstats/main.py +++ b/src/rasterstats/main.py @@ -147,10 +147,11 @@ def gen_zonal_stats( Allow features that extend beyond the raster dataset's extent, default: True Cells outside dataset extents are treated as nodata. - engine : {"fiona", "pyogrio"} or None, optional + engine : {"pyogrio", "fiona"} or None, optional Backend to use when reading file-based vector sources. - ``None`` or ``"fiona"`` (the default) both select the fiona backend. - Pass ``"pyogrio"`` to use pyogrio explicitly. + ``None`` selects the default engine (``"pyogrio"``). + Pass ``"fiona"`` to opt in to the fiona backend + (requires ``pip install rasterstats[fiona]``). Returns ------- From 44c081cd31349654c247c22ada42425bab272d8c Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 11:10:13 -0600 Subject: [PATCH 11/19] improve benchmark --- pyproject.toml | 1 + scripts/bench_engines.py | 171 ++++++++++++++++++++++++--------------- 2 files changed, 105 insertions(+), 67 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cc60b4f..dbd35eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,6 +91,7 @@ select = [ "UP", # pyupgrade ] ignore = [ + "E501", # Disable: line too long "RUF005", # Disable: Consider iterable unpacking instead of concatenation ] diff --git a/scripts/bench_engines.py b/scripts/bench_engines.py index 549daf2..d662d1a 100644 --- a/scripts/bench_engines.py +++ b/scripts/bench_engines.py @@ -2,7 +2,7 @@ Generates N random point features inside the extent of tests/data/slope.tif, writes them to a temporary GeoJSON file, converts it to GeoPackage, Shapefile, -and Parquet via ogr2ogr, then times how long each engine takes to iterate over +and other formats via ogr2ogr, then times how long each engine takes to iterate over every feature via ``read_features``. Usage @@ -21,6 +21,9 @@ import rasterio +from rasterstats.io import read_features +from rasterstats.main import gen_zonal_stats + # Resolve the repo root so the script works from any cwd REPO_ROOT = Path(__file__).parent.parent SLOPE_TIF = REPO_ROOT / "tests" / "data" / "slope.tif" @@ -59,8 +62,10 @@ def generate_gpkg(geojson_path: Path, gpkg_path: Path) -> None: subprocess.run( [ "ogr2ogr", - "-f", "GPKG", - "-nln", GPKG_LAYER, + "-f", + "GPKG", + "-nln", + GPKG_LAYER, str(gpkg_path), str(geojson_path), ], @@ -78,8 +83,10 @@ def generate_shp(geojson_path: Path, shp_dir: Path) -> None: subprocess.run( [ "ogr2ogr", - "-f", "ESRI Shapefile", - "-nln", SHP_LAYER, + "-f", + "ESRI Shapefile", + "-nln", + SHP_LAYER, str(shp_dir), str(geojson_path), ], @@ -88,12 +95,28 @@ def generate_shp(geojson_path: Path, shp_dir: Path) -> None: ) +def generate_flatgeobuf(geojson_path: Path, fgb_path: Path) -> None: + """Convert an existing GeoJSON file to FlatGeobuf using ogr2ogr.""" + subprocess.run( + [ + "ogr2ogr", + "-f", + "FlatGeobuf", + str(fgb_path), + str(geojson_path), + ], + check=True, + capture_output=True, + ) + + def generate_parquet(geojson_path: Path, parquet_path: Path) -> None: """Convert an existing GeoJSON file to Parquet using ogr2ogr.""" subprocess.run( [ "ogr2ogr", - "-f", "Parquet", + "-f", + "Parquet", str(parquet_path), str(geojson_path), ], @@ -103,9 +126,7 @@ def generate_parquet(geojson_path: Path, parquet_path: Path) -> None: def time_engine(path: Path, engine: str, n: int, layer=0) -> float: - # Import here so the benchmark reflects real-world import cost only once - from rasterstats.io import read_features - + "Times how long each engine takes to iterate through all features" t0 = time.perf_counter() count = sum(1 for _ in read_features(str(path), layer=layer, engine=engine)) elapsed = time.perf_counter() - t0 @@ -113,6 +134,21 @@ def time_engine(path: Path, engine: str, n: int, layer=0) -> float: return elapsed +def time_zonal_stats(path: Path, engine: str, n: int, layer=0) -> float: + """Full e2e test running all stats againts the geotiff. + Typically 100x slower than just reading the features and discarding them. + IOW this is NOT typically IO bound for local files. + """ + t0 = time.perf_counter() + count = sum( + 1 + for _ in gen_zonal_stats(str(path), str(SLOPE_TIF), layer=layer, engine=engine) + ) + elapsed = time.perf_counter() - t0 + assert count == n, f"Expected {n} features, got {count}" + return elapsed + + def main() -> None: n = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_N @@ -121,7 +157,8 @@ def main() -> None: tmp_path = Path(tmp.name) gpkg_path = Path(tempfile.mktemp(suffix=".gpkg")) shp_dir = Path(tempfile.mkdtemp()) - parquet_path = Path(tempfile.mktemp(suffix=".parquet")) + fgb_path = Path(tempfile.mktemp(suffix=".fgb")) + # parquet_path = Path(tempfile.mktemp(suffix=".parquet")) try: generate_geojson(tmp_path, n) @@ -135,85 +172,85 @@ def main() -> None: print("Converting to Shapefile via ogr2ogr …") generate_shp(tmp_path, shp_dir) - shp_mb = sum(f.stat().st_size for f in shp_dir.rglob("*") if f.is_file()) / 1024 / 1024 + shp_mb = ( + sum(f.stat().st_size for f in shp_dir.rglob("*") if f.is_file()) + / 1024 + / 1024 + ) print(f"Wrote Shapefile {shp_mb:.1f} MB → {shp_dir}") - print("Converting to Parquet via ogr2ogr …") - generate_parquet(tmp_path, parquet_path) - parquet_mb = parquet_path.stat().st_size / 1024 / 1024 - print(f"Wrote Parquet {parquet_mb:.1f} MB → {parquet_path}\n") + print("Converting to FlatGeobuf via ogr2ogr …") + generate_flatgeobuf(tmp_path, fgb_path) + fgb_mb = fgb_path.stat().st_size / 1024 / 1024 + print(f"Wrote FlatGeobuf {fgb_mb:.1f} MB → {fgb_path}") + + # print("Converting to Parquet via ogr2ogr …") + # generate_parquet(tmp_path, parquet_path) + # parquet_mb = parquet_path.stat().st_size / 1024 / 1024 + # print(f"Wrote Parquet {parquet_mb:.1f} MB → {parquet_path}\n") - # --- GeoJSON benchmark --- print("=== GeoJSON ===") fiona_secs = time_engine(tmp_path, "fiona", n) print(f"fiona : {fiona_secs:.3f}s ({n / fiona_secs:,.0f} feat/s)") - try: - import pyogrio # noqa: F401 - - pyogrio_secs = time_engine(tmp_path, "pyogrio", n) - print(f"pyogrio: {pyogrio_secs:.3f}s ({n / pyogrio_secs:,.0f} feat/s)") - ratio = fiona_secs / pyogrio_secs - faster = "pyogrio" if ratio > 1 else "fiona" - print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (GeoJSON)") - except ImportError: - print("pyogrio not installed - skipping pyogrio benchmark") - print("Install with: pip install rasterstats[pyogrio]") + pyogrio_secs = time_engine(tmp_path, "pyogrio", n) + print(f"pyogrio: {pyogrio_secs:.3f}s ({n / pyogrio_secs:,.0f} feat/s)") + ratio = fiona_secs / pyogrio_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (GeoJSON)") - # --- GeoPackage benchmark --- print("\n=== GeoPackage ===") fiona_gpkg_secs = time_engine(gpkg_path, "fiona", n, layer=GPKG_LAYER) print(f"fiona : {fiona_gpkg_secs:.3f}s ({n / fiona_gpkg_secs:,.0f} feat/s)") - try: - import pyogrio # noqa: F401 + pyogrio_gpkg_secs = time_engine(gpkg_path, "pyogrio", n, layer=GPKG_LAYER) + print( + f"pyogrio: {pyogrio_gpkg_secs:.3f}s ({n / pyogrio_gpkg_secs:,.0f} feat/s)" + ) + ratio = fiona_gpkg_secs / pyogrio_gpkg_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (GeoPackage)") - pyogrio_gpkg_secs = time_engine(gpkg_path, "pyogrio", n, layer=GPKG_LAYER) - print(f"pyogrio: {pyogrio_gpkg_secs:.3f}s ({n / pyogrio_gpkg_secs:,.0f} feat/s)") - ratio = fiona_gpkg_secs / pyogrio_gpkg_secs - faster = "pyogrio" if ratio > 1 else "fiona" - print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (GeoPackage)") - except ImportError: - pass # already reported above in the GeoJSON block - - # --- Shapefile benchmark --- - shp_path = shp_dir / (SHP_LAYER + ".shp") print("\n=== Shapefile ===") + shp_path = shp_dir / (SHP_LAYER + ".shp") fiona_shp_secs = time_engine(shp_path, "fiona", n, layer=SHP_LAYER) print(f"fiona : {fiona_shp_secs:.3f}s ({n / fiona_shp_secs:,.0f} feat/s)") - try: - import pyogrio # noqa: F401 - - pyogrio_shp_secs = time_engine(shp_path, "pyogrio", n, layer=SHP_LAYER) - print(f"pyogrio: {pyogrio_shp_secs:.3f}s ({n / pyogrio_shp_secs:,.0f} feat/s)") - ratio = fiona_shp_secs / pyogrio_shp_secs - faster = "pyogrio" if ratio > 1 else "fiona" - print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (Shapefile)") - except ImportError: - pass # already reported above in the GeoJSON block - - # --- Parquet benchmark --- - print("\n=== Parquet ===") - fiona_parquet_secs = time_engine(parquet_path, "fiona", n, layer=0) - print(f"fiona : {fiona_parquet_secs:.3f}s ({n / fiona_parquet_secs:,.0f} feat/s)") - - try: - import pyogrio # noqa: F401 - - pyogrio_parquet_secs = time_engine(parquet_path, "pyogrio", n, layer=0) - print(f"pyogrio: {pyogrio_parquet_secs:.3f}s ({n / pyogrio_parquet_secs:,.0f} feat/s)") - ratio = fiona_parquet_secs / pyogrio_parquet_secs - faster = "pyogrio" if ratio > 1 else "fiona" - print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (Parquet)") - except ImportError: - pass # already reported above in the GeoJSON block + pyogrio_shp_secs = time_engine(shp_path, "pyogrio", n, layer=SHP_LAYER) + print(f"pyogrio: {pyogrio_shp_secs:.3f}s ({n / pyogrio_shp_secs:,.0f} feat/s)") + ratio = fiona_shp_secs / pyogrio_shp_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (Shapefile)") + + print("\n=== FlatGeobuf ===") + print("fiona : skip") # fiona wheels not necessarily built with flatgeobuf + # fiona_fgb_secs = time_engine(fgb_path, "fiona", n, layer=None) + # print(f"fiona : {fiona_fgb_secs:.3f}s ({n / fiona_fgb_secs:,.0f} feat/s)") + + pyogrio_fgb_secs = time_engine(fgb_path, "pyogrio", n, layer=None) + print(f"pyogrio: {pyogrio_fgb_secs:.3f}s ({n / pyogrio_fgb_secs:,.0f} feat/s)") + + # The pyogrio binary wheels don't support parquet yet + # print(pyogrio.list_drivers()) + # print("\n=== Parquet ===") + # fiona_parquet_secs = time_engine(parquet_path, "fiona", n, layer=0) + # print( + # f"fiona : {fiona_parquet_secs:.3f}s ({n / fiona_parquet_secs:,.0f} feat/s)" + # ) + # + # pyogrio_parquet_secs = time_engine(parquet_path, "pyogrio", n, layer=0) + # print( + # f"pyogrio: {pyogrio_parquet_secs:.3f}s ({n / pyogrio_parquet_secs:,.0f} feat/s)" + # ) + # ratio = fiona_parquet_secs / pyogrio_parquet_secs + # faster = "pyogrio" if ratio > 1 else "fiona" + # print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (Parquet)") finally: tmp_path.unlink(missing_ok=True) gpkg_path.unlink(missing_ok=True) shutil.rmtree(shp_dir, ignore_errors=True) - parquet_path.unlink(missing_ok=True) + fgb_path.unlink(missing_ok=True) if __name__ == "__main__": From 797e0e8191b9e7b00ab537b79ac1ef1f95e5d6a1 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 11:11:49 -0600 Subject: [PATCH 12/19] clear control flow, imports --- src/rasterstats/io.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index 43883b2..5097dfa 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -6,6 +6,7 @@ from os import PathLike import numpy as np +import pyogrio import rasterio import shapely from affine import Affine @@ -188,13 +189,10 @@ def _is_vector_file(path, layer): non-empty (True). """ try: - import pyogrio - info = pyogrio.read_info(path, layer=layer) - # -1 means the driver cannot report a count; treat as non-empty - return info["features"] != 0 except Exception: return False + return info["features"] != 0 def read_features(obj, layer=0, engine=None): From 341c76b8704273ea9a26ea464e4c3fcf32571a9a Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 15:16:18 -0600 Subject: [PATCH 13/19] docs --- docs/index.rst | 4 ++++ docs/installation.rst | 34 ++++++++++++++++++++++++++-------- docs/manual.rst | 9 ++++++--- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index ca44d47..fd8e6d8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,10 @@ Quickstart Install:: + uv add rasterstats + +Or with pip:: + pip install rasterstats diff --git a/docs/installation.rst b/docs/installation.rst index feba805..e117727 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,19 +1,37 @@ Installation ============ -Depends on libgdal, rasterio, fiona, shapely and numpy +Depends on libgdal, rasterio, fiona, shapely and numpy. -Using Ubuntu 14.04:: +Using ``uv`` (recommended):: - sudo apt-get install python-numpy libgdal1h gdal-bin libgdal-dev - pip install rasterstats + uv add rasterstats -Or homebrew on OS X:: +Or with pip:: - brew install gdal pip install rasterstats -For Windows, follow the `rasterio installation `_ and then run:: +Platform-specific GDAL setup +----------------------------- - pip install rasterstats +**Ubuntu**:: + + sudo apt-get install libgdal-dev gdal-bin + +then install rasterstats as above. + +**macOS** (Homebrew):: + + brew install gdal + +then install rasterstats as above. + +**Windows**: follow the `rasterio installation `_, +then install rasterstats as above. + +Tests +----- + +To run the python unit tests + uv run pytest diff --git a/docs/manual.rst b/docs/manual.rst index d508d74..b4dbcf7 100644 --- a/docs/manual.rst +++ b/docs/manual.rst @@ -41,7 +41,7 @@ while ``point_query`` gives us a list of raster values corresponding to each inp Vector Data Sources ------------------- The most common use case is having vector data sources in a file such as an ESRI Shapefile or any -other format supported by ``fiona``. The path to the file can be passed in directly as the first argument:: +other format supported by ``pyogrio``. The path to the file can be passed in directly as the first argument:: >>> zs = zonal_stats('tests/data/polygons.shp', 'tests/data/slope.tif') @@ -49,6 +49,11 @@ If you have multi-layer sources, you can specify the ``layer`` by either name or >>> zs = zonal_stats('tests/data', 'tests/data/slope.tif', layer="polygons") +If you have ``fiona`` installed and want to use it as the vector iteration engine +instead of the default ``pyogrio``, you can optionally add ``engine='fiona'`` + + >>> zs = zonal_stats("tests/data/polygons.shp", "tests/data/slope.tif", engine="fiona") + In addition to the basic usage above, rasterstats supports other mechanisms of specifying vector geometries. @@ -294,5 +299,3 @@ command line tool, as well as GRASS's `r.statistics `_. - - From a2957df4ffe6f2659ef725273d1d84fb1739d717 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 16:10:35 -0600 Subject: [PATCH 14/19] engine --- src/rasterstats/point.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/rasterstats/point.py b/src/rasterstats/point.py index f262c6c..fc182d5 100644 --- a/src/rasterstats/point.py +++ b/src/rasterstats/point.py @@ -109,6 +109,7 @@ def gen_point_query( property_name="value", geojson_out=False, boundless=True, + engine=None, ): """ Given a set of vector features and a raster, @@ -161,6 +162,12 @@ def gen_point_query( Allow features that extend beyond the raster dataset's extent, default: True Cells outside dataset extents are treated as nodata. + engine : {"pyogrio", "fiona"} or None, optional + Backend to use when reading file-based vector sources. + ``None`` selects the default engine (``"pyogrio"``). + Pass ``"fiona"`` to opt in to the fiona backend + (requires ``pip install rasterstats[fiona]``). + Returns ------- generator of arrays (if ``geojson_out`` is False) @@ -169,7 +176,7 @@ def gen_point_query( if interpolate not in ["nearest", "bilinear"]: raise ValueError("interpolate must be nearest or bilinear") - features_iter = read_features(vectors, layer) + features_iter = read_features(vectors, layer, engine=engine) with Raster(raster, nodata=nodata, affine=affine, band=band) as rast: for feat in features_iter: From bccec2cdab1f2123a1a74b4eda0b14dc7351d297 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Fri, 22 May 2026 16:11:56 -0600 Subject: [PATCH 15/19] account for features = -1 --- src/rasterstats/io.py | 24 ++++++---------- tests/test_io.py | 67 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 66 insertions(+), 25 deletions(-) diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index 5097dfa..229c76d 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -7,6 +7,7 @@ import numpy as np import pyogrio +import pyogrio.raw import rasterio import shapely from affine import Affine @@ -61,21 +62,15 @@ def _fiona_generator(obj, layer=0): def _pyogrio_generator(obj, layer=0, chunk_size=65536): """Yield GeoJSON-like Feature dicts using pyogrio, reading in chunks.""" - try: - import pyogrio - import pyogrio.raw - except ImportError as e: - raise ImportError( - "pyogrio is required for engine='pyogrio'. " - "Install it with: pip install rasterstats[pyogrio]" - ) from e - info = pyogrio.read_info(obj, layer=layer) - total = info["features"] field_names = list(info["fields"]) + # Some sources advertise `info["features"]" == -1` (WFS, certain VRT sources). + # ie they don't support *fast* feature counting, thus return -1 + # But they *do* support iteration, which is all we need. + skip = 0 - while skip < total: + while True: _meta, _fids, geometries, field_data = pyogrio.raw.read( obj, layer=layer, @@ -89,9 +84,10 @@ def _pyogrio_generator(obj, layer=0, chunk_size=65536): cols = [col.tolist() for col in field_data] for i in range(batch_size): props = {name: cols[j][i] for j, name in enumerate(field_names)} + geom = geoms[i] yield { "type": "Feature", - "geometry": geoms[i].__geo_interface__, + "geometry": geom.__geo_interface__ if geom is not None else None, "properties": props, } skip += batch_size @@ -132,10 +128,6 @@ def feature_generator(obj, layer=0, engine=None): raise ValueError(f"Unknown engine {resolved!r}. Choose 'pyogrio' or 'fiona'.") -# Backward-compatible alias -fiona_generator = feature_generator - - def wrap_geom(geom): """Wraps a geometry dict in an GeoJSON Feature""" return {"type": "Feature", "properties": {}, "geometry": geom} diff --git a/tests/test_io.py b/tests/test_io.py index fb6d4a8..1100f00 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -11,7 +11,6 @@ boundless_array, bounds_window, feature_generator, - fiona_generator, read_featurecollection, read_features, rowcol, @@ -421,14 +420,6 @@ def test_feature_generator_invalid_engine(): list(feature_generator(polygons, engine="gdal")) -def test_fiona_generator_alias(): - """fiona_generator backward-compat alias still works when fiona is installed.""" - pytest.importorskip("fiona") - result = list(fiona_generator(polygons, engine="fiona")) - geoms = [shape(f["geometry"]) for f in result] - _compare_geomlists(geoms, target_geoms) - - def test_read_features_pyogrio_engine(): """read_features forwards engine= to the underlying generator.""" pytest.importorskip("pyogrio") @@ -437,6 +428,64 @@ def test_read_features_pyogrio_engine(): _compare_geomlists(geoms, target_geoms) +def test_pyogrio_null_geometry(tmp_path): + """_pyogrio_generator yields None geometry for features with null geometry + rather than raising AttributeError.""" + geojson = { + "type": "FeatureCollection", + "features": [ + {"type": "Feature", "geometry": {"type": "Point", "coordinates": [0, 0]}, "properties": {"n": 1}}, + {"type": "Feature", "geometry": None, "properties": {"n": 2}}, + {"type": "Feature", "geometry": {"type": "Point", "coordinates": [1, 1]}, "properties": {"n": 3}}, + ], + } + p = tmp_path / "null_geom.geojson" + p.write_text(json.dumps(geojson)) + + results = list(feature_generator(str(p), engine="pyogrio")) + assert len(results) == 3 + assert results[0]["geometry"] is not None + assert results[1]["geometry"] is None + assert results[2]["geometry"] is not None + assert results[1]["properties"]["n"] == 2 + + +def test_pyogrio_unknown_feature_count(tmp_path, monkeypatch): + """_pyogrio_generator reads all features when read_info returns features=-1. + + Some GDAL drivers (e.g. WFS) cannot report feature count ahead of time and + return -1. The old ``while skip < total`` loop silently yielded nothing in + this case. The new unconditional loop must still yield all features. + """ + import pyogrio + + geojson = { + "type": "FeatureCollection", + "features": [ + {"type": "Feature", "geometry": {"type": "Point", "coordinates": [float(i), 0.0]}, "properties": {"n": i}} + for i in range(5) + ], + } + p = tmp_path / "unknown_count.geojson" + p.write_text(json.dumps(geojson)) + + real_read_info = pyogrio.read_info + + def fake_read_info(path, **kwargs): + info = real_read_info(path, **kwargs) + info["features"] = -1 + return info + + monkeypatch.setattr(pyogrio, "read_info", fake_read_info) + # Also patch the name used inside the io module + import rasterstats.io as io_mod + monkeypatch.setattr(io_mod, "pyogrio", pyogrio) + + results = list(feature_generator(str(p), engine="pyogrio")) + assert len(results) == 5 + assert [f["properties"]["n"] for f in results] == list(range(5)) + + # Optional tests From b01819d5a044a32682e23baaf94353bb085da3e4 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Sat, 23 May 2026 13:57:02 -0600 Subject: [PATCH 16/19] add fid, catch only DataSourceError, test --- CHANGELOG.txt | 5 +++++ src/rasterstats/io.py | 8 ++++++-- tests/test_io.py | 43 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 51 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 7e9d042..0722fff 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,8 @@ +0.21.0 +- [Breaking] Replaces fiona with pyogrio as the default vector io engine #314 + - Should give better performance for most use cases. + - Pass `engine=fiona` to use the old code path + 0.20.0 - Progress bar for interactive use (#300) - Fixes to support Fiona 1.10 (#301) diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index 229c76d..d6b7a01 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -71,11 +71,12 @@ def _pyogrio_generator(obj, layer=0, chunk_size=65536): skip = 0 while True: - _meta, _fids, geometries, field_data = pyogrio.raw.read( + _meta, fids, geometries, field_data = pyogrio.raw.read( obj, layer=layer, skip_features=skip, max_features=chunk_size, + return_fids=True, # Note do not use_arrow=True, reads all records into mem # https://pyogrio.readthedocs.io/en/latest/about.html#how-it-works ) @@ -85,8 +86,10 @@ def _pyogrio_generator(obj, layer=0, chunk_size=65536): for i in range(batch_size): props = {name: cols[j][i] for j, name in enumerate(field_names)} geom = geoms[i] + fid = int(fids[i]) yield { "type": "Feature", + "id": fid, "geometry": geom.__geo_interface__ if geom is not None else None, "properties": props, } @@ -182,7 +185,8 @@ def _is_vector_file(path, layer): """ try: info = pyogrio.read_info(path, layer=layer) - except Exception: + # important: don't catch DataLayerErrors, let those bubble up directly + except pyogrio.errors.DataSourceError: return False return info["features"] != 0 diff --git a/tests/test_io.py b/tests/test_io.py index 1100f00..45e1aad 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -2,12 +2,14 @@ from pathlib import Path import numpy as np +import pyogrio import pytest import rasterio from shapely.geometry import shape from rasterstats.io import ( # todo parse_feature Raster, + _is_vector_file, boundless_array, bounds_window, feature_generator, @@ -434,9 +436,17 @@ def test_pyogrio_null_geometry(tmp_path): geojson = { "type": "FeatureCollection", "features": [ - {"type": "Feature", "geometry": {"type": "Point", "coordinates": [0, 0]}, "properties": {"n": 1}}, + { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [0, 0]}, + "properties": {"n": 1}, + }, {"type": "Feature", "geometry": None, "properties": {"n": 2}}, - {"type": "Feature", "geometry": {"type": "Point", "coordinates": [1, 1]}, "properties": {"n": 3}}, + { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [1, 1]}, + "properties": {"n": 3}, + }, ], } p = tmp_path / "null_geom.geojson" @@ -462,7 +472,11 @@ def test_pyogrio_unknown_feature_count(tmp_path, monkeypatch): geojson = { "type": "FeatureCollection", "features": [ - {"type": "Feature", "geometry": {"type": "Point", "coordinates": [float(i), 0.0]}, "properties": {"n": i}} + { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [float(i), 0.0]}, + "properties": {"n": i}, + } for i in range(5) ], } @@ -479,6 +493,7 @@ def fake_read_info(path, **kwargs): monkeypatch.setattr(pyogrio, "read_info", fake_read_info) # Also patch the name used inside the io module import rasterstats.io as io_mod + monkeypatch.setattr(io_mod, "pyogrio", pyogrio) results = list(feature_generator(str(p), engine="pyogrio")) @@ -486,6 +501,28 @@ def fake_read_info(path, **kwargs): assert [f["properties"]["n"] for f in results] == list(range(5)) +# Test detection of vector files +# Private function but deserves unit tests + + +def test_is_vector_file(): + assert _is_vector_file(polygons, "polygons") + + +def test_wrong_vector_layer(): + "Data source is fine. DataLayerErrors pass through to fail fast." + with pytest.raises(pyogrio.errors.DataLayerError): + _is_vector_file(polygons, "something-else") + + +def test_isnt_vector_file(): + assert not _is_vector_file(raster, "raster") + + +def test_is_absent_file(): + assert not _is_vector_file("/tmp/sdfjaohgoadfknkjdsfj", "dnf") + + # Optional tests From b43a69d79f542d09ad4bfc4a9d2f6240c9b5bbfd Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Sat, 23 May 2026 14:24:29 -0600 Subject: [PATCH 17/19] changes --- CHANGELOG.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 0722fff..0b82682 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,7 +1,8 @@ 0.21.0 -- [Breaking] Replaces fiona with pyogrio as the default vector io engine #314 - - Should give better performance for most use cases. +- [Breaking] Replace fiona with pyogrio as the default vector io engine #314 + - Should give better performance for most use cases. See benchmarks in PR. - Pass `engine=fiona` to use the old code path + - Vector file detection is now done with pyogrio, driver discovery etc may differ accordingly. 0.20.0 - Progress bar for interactive use (#300) From cae08cc4634aa8351347f774695ee221270d2b70 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Sat, 23 May 2026 14:25:49 -0600 Subject: [PATCH 18/19] expose chunk_size parameter in public interfaces --- src/rasterstats/io.py | 21 +++++++++++++++------ src/rasterstats/main.py | 12 ++++++++++-- src/rasterstats/point.py | 12 ++++++++++-- tests/test_io.py | 30 ++++++++++++++++++++++++++++++ tests/test_zonal.py | 8 ++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index d6b7a01..93e9015 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -59,8 +59,10 @@ def _fiona_generator(obj, layer=0): # pyogrio backend +DEFAULT_CHUNK_SIZE = 65536 -def _pyogrio_generator(obj, layer=0, chunk_size=65536): + +def _pyogrio_generator(obj, layer=0, chunk_size=DEFAULT_CHUNK_SIZE): """Yield GeoJSON-like Feature dicts using pyogrio, reading in chunks.""" info = pyogrio.read_info(obj, layer=layer) field_names = list(info["fields"]) @@ -86,7 +88,7 @@ def _pyogrio_generator(obj, layer=0, chunk_size=65536): for i in range(batch_size): props = {name: cols[j][i] for j, name in enumerate(field_names)} geom = geoms[i] - fid = int(fids[i]) + fid = str(fids[i]) # matches engine=fiona behavior yield { "type": "Feature", "id": fid, @@ -103,7 +105,7 @@ def _pyogrio_generator(obj, layer=0, chunk_size=65536): DEFAULT_ENGINE = "pyogrio" -def feature_generator(obj, layer=0, engine=None): +def feature_generator(obj, layer=0, engine=None, chunk_size=DEFAULT_CHUNK_SIZE): """Yield GeoJSON-like Feature dicts from a file-based vector source. Parameters @@ -116,6 +118,11 @@ def feature_generator(obj, layer=0, engine=None): Backend to use for reading. ``None`` selects the default engine (``"pyogrio"``). Pass ``"fiona"`` to opt in to the fiona backend (requires ``pip install rasterstats[fiona]``). + chunk_size : int, optional + Number of features to read per batch when using the pyogrio engine. + Reduce this value to lower peak memory usage for datasets with + large, vertex-dense geometries. Default: ``DEFAULT_CHUNK_SIZE``. + Has no effect when ``engine='fiona'``. Yields ------ @@ -124,7 +131,7 @@ def feature_generator(obj, layer=0, engine=None): """ resolved = engine if engine is not None else DEFAULT_ENGINE if resolved == "pyogrio": - yield from _pyogrio_generator(obj, layer=layer) + yield from _pyogrio_generator(obj, layer=layer, chunk_size=chunk_size) elif resolved == "fiona": yield from _fiona_generator(obj, layer=layer) else: @@ -191,12 +198,14 @@ def _is_vector_file(path, layer): return info["features"] != 0 -def read_features(obj, layer=0, engine=None): +def read_features(obj, layer=0, engine=None, chunk_size=DEFAULT_CHUNK_SIZE): features_iter = None if isinstance(obj, (str, PathLike)): obj = str(obj) if _is_vector_file(obj, layer): - features_iter = feature_generator(obj, layer, engine=engine) + features_iter = feature_generator( + obj, layer, engine=engine, chunk_size=chunk_size + ) else: try: mapping = json.loads(obj) diff --git a/src/rasterstats/main.py b/src/rasterstats/main.py index 533a90c..0db7918 100644 --- a/src/rasterstats/main.py +++ b/src/rasterstats/main.py @@ -7,7 +7,7 @@ from shapely import Geometry from shapely.geometry import shape -from rasterstats.io import Raster, read_features +from rasterstats.io import DEFAULT_CHUNK_SIZE, Raster, read_features from rasterstats.utils import ( boxify_points, check_stats, @@ -73,6 +73,7 @@ def gen_zonal_stats( geojson_out=False, boundless=True, engine=None, + chunk_size=DEFAULT_CHUNK_SIZE, **kwargs, ): """Zonal statistics of raster values aggregated to vector geometries. @@ -153,6 +154,13 @@ def gen_zonal_stats( Pass ``"fiona"`` to opt in to the fiona backend (requires ``pip install rasterstats[fiona]``). + chunk_size : int, optional + Number of features to read per batch when using the pyogrio engine. + Reduce this value to lower peak memory usage for datasets with + large, vertex-dense geometries. Default: ``DEFAULT_CHUNK_SIZE``. + Has no effect when ``engine='fiona'`` or when ``vectors`` is not a + file path. + Returns ------- generator of dicts (if geojson_out is False) @@ -187,7 +195,7 @@ def gen_zonal_stats( band = band_num with Raster(raster, affine, nodata, band) as rast: - features_iter = read_features(vectors, layer, engine=engine) + features_iter = read_features(vectors, layer, engine=engine, chunk_size=chunk_size) for _, feat in enumerate(features_iter): geom = feat["geometry"] if not isinstance(geom, Geometry): diff --git a/src/rasterstats/point.py b/src/rasterstats/point.py index fc182d5..aa2e20d 100644 --- a/src/rasterstats/point.py +++ b/src/rasterstats/point.py @@ -2,7 +2,7 @@ from shapely.geometry import shape from shapely.ops import transform -from rasterstats.io import Raster, read_features +from rasterstats.io import DEFAULT_CHUNK_SIZE, Raster, read_features def point_window_unitxy(x, y, affine): @@ -110,6 +110,7 @@ def gen_point_query( geojson_out=False, boundless=True, engine=None, + chunk_size=DEFAULT_CHUNK_SIZE, ): """ Given a set of vector features and a raster, @@ -168,6 +169,13 @@ def gen_point_query( Pass ``"fiona"`` to opt in to the fiona backend (requires ``pip install rasterstats[fiona]``). + chunk_size : int, optional + Number of features to read per batch when using the pyogrio engine. + Reduce this value to lower peak memory usage for datasets with + large, vertex-dense geometries. Default: ``DEFAULT_CHUNK_SIZE``. + Has no effect when ``engine='fiona'`` or when ``vectors`` is not a + file path. + Returns ------- generator of arrays (if ``geojson_out`` is False) @@ -176,7 +184,7 @@ def gen_point_query( if interpolate not in ["nearest", "bilinear"]: raise ValueError("interpolate must be nearest or bilinear") - features_iter = read_features(vectors, layer, engine=engine) + features_iter = read_features(vectors, layer, engine=engine, chunk_size=chunk_size) with Raster(raster, nodata=nodata, affine=affine, band=band) as rast: for feat in features_iter: diff --git a/tests/test_io.py b/tests/test_io.py index 45e1aad..8242452 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -8,6 +8,7 @@ from shapely.geometry import shape from rasterstats.io import ( # todo parse_feature + DEFAULT_CHUNK_SIZE, Raster, _is_vector_file, boundless_array, @@ -535,6 +536,35 @@ def test_geodataframe(): assert list(read_features(df)) +def test_feature_generator_chunk_size(tmp_path): + """Reading with default engine, fiona engine, and alternative chunk sizes + all result in equivalent feature dicts.""" + geojson = { + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [float(i), 0.0]}, + "properties": {"n": i}, + } + for i in range(6) + ], + } + p = tmp_path / "chunked.geojson" + p.write_text(json.dumps(geojson)) + + default_results = list(feature_generator(polygons)) + small_chunk_results = list(feature_generator(polygons, chunk_size=2)) + fiona_results = list(feature_generator(polygons, engine="fiona", chunk_size=2)) + + assert default_results == small_chunk_results + # cannot compare them directly since they differ in tuples vs list representation for some reason + # let json roundtrip normalize it + assert json.loads(json.dumps(default_results)) == json.loads( + json.dumps(fiona_results) + ) + + # TODO # io.parse_features on a feature-only geo_interface # TODO # io.parse_features on a feature-only geojson-like object # TODO # io.read_features on a feature-only diff --git a/tests/test_zonal.py b/tests/test_zonal.py index c906a53..d39ed59 100644 --- a/tests/test_zonal.py +++ b/tests/test_zonal.py @@ -558,6 +558,14 @@ def test_nan_counts(): assert "nan" not in res +def test_zonal_stats_chunk_size_stats(): + """chunk_size= flows all the way through to _pyogrio_generator.""" + polygons = data_dir / "polygons.shp" + default_stats = zonal_stats(polygons, raster) + chunked_stats = zonal_stats(polygons, raster, chunk_size=1) + assert default_stats == chunked_stats + + # Optional tests def test_geodataframe_zonal(): gpd = pytest.importorskip("geopandas") From 77b2218be456e957692c8aa00205d2d26db7e587 Mon Sep 17 00:00:00 2001 From: Matthew Perry Date: Sat, 23 May 2026 14:47:33 -0600 Subject: [PATCH 19/19] bump --- CHANGELOG.txt | 11 +++++++---- src/rasterstats/_version.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 0b82682..cf1e175 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,8 +1,11 @@ 0.21.0 -- [Breaking] Replace fiona with pyogrio as the default vector io engine #314 - - Should give better performance for most use cases. See benchmarks in PR. - - Pass `engine=fiona` to use the old code path - - Vector file detection is now done with pyogrio, driver discovery etc may differ accordingly. +- Replace fiona with pyogrio as the default vector io engine #314 + - Binary wheel support for python 3.14 + - Promises better performance. See improvements in PR. + - Pass `engine=fiona` to use the old code path. + - Vector file detection now also uses pyogrio, so driver discovery etc may have changed for edge cases. +- fix bug in progress bar #304 +- build, tooling, and test improvements #306 #308 #309 #310 0.20.0 - Progress bar for interactive use (#300) diff --git a/src/rasterstats/_version.py b/src/rasterstats/_version.py index 5f4bb0b..6a726d8 100644 --- a/src/rasterstats/_version.py +++ b/src/rasterstats/_version.py @@ -1 +1 @@ -__version__ = "0.20.0" +__version__ = "0.21.0"