diff --git a/.gitignore b/.gitignore index 0de38f5..286c44b 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,8 @@ Vagrantfile venv .eggs .cache +.python-version +*.lprof # no uv lockfile until this is fixed: # https://github.com/astral-sh/uv/issues/10845 diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 7e9d042..cf1e175 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,12 @@ +0.21.0 +- Replace fiona with pyogrio as the default vector io engine #314 + - Binary wheel support for python 3.14 + - Promises better performance. See improvements in PR. + - Pass `engine=fiona` to use the old code path. + - Vector file detection now also uses pyogrio, so driver discovery etc may have changed for edge cases. +- fix bug in progress bar #304 +- build, tooling, and test improvements #306 #308 #309 #310 + 0.20.0 - Progress bar for interactive use (#300) - Fixes to support Fiona 1.10 (#301) diff --git a/docs/index.rst b/docs/index.rst index ca44d47..fd8e6d8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,10 @@ Quickstart Install:: + uv add rasterstats + +Or with pip:: + pip install rasterstats diff --git a/docs/installation.rst b/docs/installation.rst index feba805..e117727 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,19 +1,37 @@ Installation ============ -Depends on libgdal, rasterio, fiona, shapely and numpy +Depends on libgdal, rasterio, fiona, shapely and numpy. -Using Ubuntu 14.04:: +Using ``uv`` (recommended):: - sudo apt-get install python-numpy libgdal1h gdal-bin libgdal-dev - pip install rasterstats + uv add rasterstats -Or homebrew on OS X:: +Or with pip:: - brew install gdal pip install rasterstats -For Windows, follow the `rasterio installation `_ and then run:: +Platform-specific GDAL setup +----------------------------- - pip install rasterstats +**Ubuntu**:: + + sudo apt-get install libgdal-dev gdal-bin + +then install rasterstats as above. + +**macOS** (Homebrew):: + + brew install gdal + +then install rasterstats as above. + +**Windows**: follow the `rasterio installation `_, +then install rasterstats as above. + +Tests +----- + +To run the python unit tests + uv run pytest diff --git a/docs/manual.rst b/docs/manual.rst index d508d74..b4dbcf7 100644 --- a/docs/manual.rst +++ b/docs/manual.rst @@ -41,7 +41,7 @@ while ``point_query`` gives us a list of raster values corresponding to each inp Vector Data Sources ------------------- The most common use case is having vector data sources in a file such as an ESRI Shapefile or any -other format supported by ``fiona``. The path to the file can be passed in directly as the first argument:: +other format supported by ``pyogrio``. The path to the file can be passed in directly as the first argument:: >>> zs = zonal_stats('tests/data/polygons.shp', 'tests/data/slope.tif') @@ -49,6 +49,11 @@ If you have multi-layer sources, you can specify the ``layer`` by either name or >>> zs = zonal_stats('tests/data', 'tests/data/slope.tif', layer="polygons") +If you have ``fiona`` installed and want to use it as the vector iteration engine +instead of the default ``pyogrio``, you can optionally add ``engine='fiona'`` + + >>> zs = zonal_stats("tests/data/polygons.shp", "tests/data/slope.tif", engine="fiona") + In addition to the basic usage above, rasterstats supports other mechanisms of specifying vector geometries. @@ -294,5 +299,3 @@ command line tool, as well as GRASS's `r.statistics `_. - - diff --git a/pyproject.toml b/pyproject.toml index 23c76b4..dbd35eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,4 @@ +# Note: code auto-formatted with `taplo fmt` [build-system] requires = ["hatchling"] build-backend = "hatchling.build" @@ -5,61 +6,59 @@ build-backend = "hatchling.build" [project] name = "rasterstats" description = "Summarize geospatial raster datasets based on vector geometries" -authors = [ - {name = "Matthew Perry", email = "perrygeo@gmail.com"}, -] +authors = [{ name = "Matthew Perry", email = "perrygeo@gmail.com" }] readme = "README.rst" -keywords = ["gis", "geospatial", "geographic", "raster", "vector", "zonal statistics"] +keywords = [ + "gis", + "geospatial", + "geographic", + "raster", + "vector", + "zonal statistics", +] dynamic = ["version"] -license = {text = "BSD-3-Clause"} +license = { text = "BSD-3-Clause" } classifiers = [ - "Development Status :: 4 - Beta", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: BSD License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Topic :: Utilities", - "Topic :: Scientific/Engineering :: GIS", + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Utilities", + "Topic :: Scientific/Engineering :: GIS", ] requires-python = ">=3.9" dependencies = [ - "affine", - "click >7.1, !=8.2.1", - "cligj >=0.4", - "fiona", - "numpy >=1.9", - "rasterio >=1.0", - "simplejson", - "shapely", + "affine", + "click >7.1, !=8.2.1", + "cligj >=0.4", + "numpy >=1.9", + "pyogrio", + "rasterio >=1.0", + "simplejson", + "shapely", ] [project.optional-dependencies] -progress = [ - "tqdm" -] -docs = [ - "numpydoc", - "sphinx", - "sphinx-rtd-theme", -] +progress = ["tqdm"] +fiona = ["fiona"] +docs = ["numpydoc", "sphinx", "sphinx-rtd-theme"] test = [ - "coverage", - "geopandas", - "pyshp >=1.1.4", - "pytest >=4.6", - "pytest-cov >=2.2.0", - "simplejson", -] -dev = [ - "rasterstats[test]", - "ruff", - "twine", + "coverage", + "geopandas", + "pyshp >=1.1.4", + "pytest >=4.6", + "pytest-cov >=2.2.0", + "simplejson", ] +dev = ["rasterstats[test,fiona]", "ruff", "twine"] [project.entry-points."rasterio.rio_plugins"] zonalstats = "rasterstats.cli:zonalstats" @@ -73,31 +72,28 @@ Documentation = "https://pythonhosted.org/rasterstats/" only-include = ["src", "tests"] [tool.pytest.ini_options] -filterwarnings = [ - "error", - "ignore::UserWarning", -] +filterwarnings = ["error", "ignore::UserWarning"] testpaths = ["tests"] -# addopts = "--verbose -rf --ipdb --maxfail=1" +addopts = "--verbose -rf --cov rasterstats --cov-report html" [tool.setuptools.dynamic] -version = {attr = "rasterstats._version.__version__"} +version = { attr = "rasterstats._version.__version__" } [tool.hatch.version] path = "src/rasterstats/_version.py" [tool.ruff.lint] select = [ - "E", # pycodestyle - "F", # Pyflakes - "I", # isort - "RUF", # Ruff-specific rules - "UP", # pyupgrade + "E", # pycodestyle + "F", # Pyflakes + "I", # isort + "RUF", # Ruff-specific rules + "UP", # pyupgrade ] ignore = [ - "RUF005", # Consider iterable unpacking instead of concatenation + "E501", # Disable: line too long + "RUF005", # Disable: Consider iterable unpacking instead of concatenation ] [tool.ruff] -# TODO: files in docs/notebooks/ use old versions and are incompatible with modern tools extend-exclude = ["*.ipynb"] diff --git a/scripts/bench_engines.py b/scripts/bench_engines.py new file mode 100644 index 0000000..d662d1a --- /dev/null +++ b/scripts/bench_engines.py @@ -0,0 +1,257 @@ +"""Benchmark fiona vs pyogrio as read backends for rasterstats. + +Generates N random point features inside the extent of tests/data/slope.tif, +writes them to a temporary GeoJSON file, converts it to GeoPackage, Shapefile, +and other formats via ogr2ogr, then times how long each engine takes to iterate over +every feature via ``read_features``. + +Usage +----- + uv run python scripts/bench_engines.py +""" + +import json +import random +import shutil +import subprocess +import sys +import tempfile +import time +from pathlib import Path + +import rasterio + +from rasterstats.io import read_features +from rasterstats.main import gen_zonal_stats + +# Resolve the repo root so the script works from any cwd +REPO_ROOT = Path(__file__).parent.parent +SLOPE_TIF = REPO_ROOT / "tests" / "data" / "slope.tif" +DEFAULT_N = 100_000 + + +def generate_geojson(path: Path, n: int) -> None: + with rasterio.open(SLOPE_TIF) as src: + left, bottom, right, top = src.bounds + + features = [ + { + "type": "Feature", + "properties": {"id": i}, + "geometry": { + "type": "Point", + "coordinates": [ + random.uniform(left, right), + random.uniform(bottom, top), + ], + }, + } + for i in range(n) + ] + fc = {"type": "FeatureCollection", "features": features} + with open(path, "w") as f: + json.dump(fc, f) + + +GPKG_LAYER = "features" +SHP_LAYER = "features" + + +def generate_gpkg(geojson_path: Path, gpkg_path: Path) -> None: + """Convert an existing GeoJSON file to GeoPackage using ogr2ogr.""" + subprocess.run( + [ + "ogr2ogr", + "-f", + "GPKG", + "-nln", + GPKG_LAYER, + str(gpkg_path), + str(geojson_path), + ], + check=True, + capture_output=True, + ) + + +def generate_shp(geojson_path: Path, shp_dir: Path) -> None: + """Convert an existing GeoJSON file to ESRI Shapefile using ogr2ogr. + + ogr2ogr writes a directory of sidecar files (.shp, .dbf, .shx, .prj) + so ``shp_dir`` must be a directory path that does not yet exist. + """ + subprocess.run( + [ + "ogr2ogr", + "-f", + "ESRI Shapefile", + "-nln", + SHP_LAYER, + str(shp_dir), + str(geojson_path), + ], + check=True, + capture_output=True, + ) + + +def generate_flatgeobuf(geojson_path: Path, fgb_path: Path) -> None: + """Convert an existing GeoJSON file to FlatGeobuf using ogr2ogr.""" + subprocess.run( + [ + "ogr2ogr", + "-f", + "FlatGeobuf", + str(fgb_path), + str(geojson_path), + ], + check=True, + capture_output=True, + ) + + +def generate_parquet(geojson_path: Path, parquet_path: Path) -> None: + """Convert an existing GeoJSON file to Parquet using ogr2ogr.""" + subprocess.run( + [ + "ogr2ogr", + "-f", + "Parquet", + str(parquet_path), + str(geojson_path), + ], + check=True, + capture_output=True, + ) + + +def time_engine(path: Path, engine: str, n: int, layer=0) -> float: + "Times how long each engine takes to iterate through all features" + t0 = time.perf_counter() + count = sum(1 for _ in read_features(str(path), layer=layer, engine=engine)) + elapsed = time.perf_counter() - t0 + assert count == n, f"Expected {n} features, got {count}" + return elapsed + + +def time_zonal_stats(path: Path, engine: str, n: int, layer=0) -> float: + """Full e2e test running all stats againts the geotiff. + Typically 100x slower than just reading the features and discarding them. + IOW this is NOT typically IO bound for local files. + """ + t0 = time.perf_counter() + count = sum( + 1 + for _ in gen_zonal_stats(str(path), str(SLOPE_TIF), layer=layer, engine=engine) + ) + elapsed = time.perf_counter() - t0 + assert count == n, f"Expected {n} features, got {count}" + return elapsed + + +def main() -> None: + n = int(sys.argv[1]) if len(sys.argv) > 1 else DEFAULT_N + + print(f"Generating {n:,} random point features over {SLOPE_TIF.name} …") + with tempfile.NamedTemporaryFile(suffix=".geojson", delete=False) as tmp: + tmp_path = Path(tmp.name) + gpkg_path = Path(tempfile.mktemp(suffix=".gpkg")) + shp_dir = Path(tempfile.mkdtemp()) + fgb_path = Path(tempfile.mktemp(suffix=".fgb")) + # parquet_path = Path(tempfile.mktemp(suffix=".parquet")) + + try: + generate_geojson(tmp_path, n) + file_mb = tmp_path.stat().st_size / 1024 / 1024 + print(f"Wrote GeoJSON {file_mb:.1f} MB → {tmp_path}") + + print("Converting to GeoPackage via ogr2ogr …") + generate_gpkg(tmp_path, gpkg_path) + gpkg_mb = gpkg_path.stat().st_size / 1024 / 1024 + print(f"Wrote GeoPackage {gpkg_mb:.1f} MB → {gpkg_path}") + + print("Converting to Shapefile via ogr2ogr …") + generate_shp(tmp_path, shp_dir) + shp_mb = ( + sum(f.stat().st_size for f in shp_dir.rglob("*") if f.is_file()) + / 1024 + / 1024 + ) + print(f"Wrote Shapefile {shp_mb:.1f} MB → {shp_dir}") + + print("Converting to FlatGeobuf via ogr2ogr …") + generate_flatgeobuf(tmp_path, fgb_path) + fgb_mb = fgb_path.stat().st_size / 1024 / 1024 + print(f"Wrote FlatGeobuf {fgb_mb:.1f} MB → {fgb_path}") + + # print("Converting to Parquet via ogr2ogr …") + # generate_parquet(tmp_path, parquet_path) + # parquet_mb = parquet_path.stat().st_size / 1024 / 1024 + # print(f"Wrote Parquet {parquet_mb:.1f} MB → {parquet_path}\n") + + print("=== GeoJSON ===") + fiona_secs = time_engine(tmp_path, "fiona", n) + print(f"fiona : {fiona_secs:.3f}s ({n / fiona_secs:,.0f} feat/s)") + + pyogrio_secs = time_engine(tmp_path, "pyogrio", n) + print(f"pyogrio: {pyogrio_secs:.3f}s ({n / pyogrio_secs:,.0f} feat/s)") + ratio = fiona_secs / pyogrio_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (GeoJSON)") + + print("\n=== GeoPackage ===") + fiona_gpkg_secs = time_engine(gpkg_path, "fiona", n, layer=GPKG_LAYER) + print(f"fiona : {fiona_gpkg_secs:.3f}s ({n / fiona_gpkg_secs:,.0f} feat/s)") + + pyogrio_gpkg_secs = time_engine(gpkg_path, "pyogrio", n, layer=GPKG_LAYER) + print( + f"pyogrio: {pyogrio_gpkg_secs:.3f}s ({n / pyogrio_gpkg_secs:,.0f} feat/s)" + ) + ratio = fiona_gpkg_secs / pyogrio_gpkg_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (GeoPackage)") + + print("\n=== Shapefile ===") + shp_path = shp_dir / (SHP_LAYER + ".shp") + fiona_shp_secs = time_engine(shp_path, "fiona", n, layer=SHP_LAYER) + print(f"fiona : {fiona_shp_secs:.3f}s ({n / fiona_shp_secs:,.0f} feat/s)") + + pyogrio_shp_secs = time_engine(shp_path, "pyogrio", n, layer=SHP_LAYER) + print(f"pyogrio: {pyogrio_shp_secs:.3f}s ({n / pyogrio_shp_secs:,.0f} feat/s)") + ratio = fiona_shp_secs / pyogrio_shp_secs + faster = "pyogrio" if ratio > 1 else "fiona" + print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (Shapefile)") + + print("\n=== FlatGeobuf ===") + print("fiona : skip") # fiona wheels not necessarily built with flatgeobuf + # fiona_fgb_secs = time_engine(fgb_path, "fiona", n, layer=None) + # print(f"fiona : {fiona_fgb_secs:.3f}s ({n / fiona_fgb_secs:,.0f} feat/s)") + + pyogrio_fgb_secs = time_engine(fgb_path, "pyogrio", n, layer=None) + print(f"pyogrio: {pyogrio_fgb_secs:.3f}s ({n / pyogrio_fgb_secs:,.0f} feat/s)") + + # The pyogrio binary wheels don't support parquet yet + # print(pyogrio.list_drivers()) + # print("\n=== Parquet ===") + # fiona_parquet_secs = time_engine(parquet_path, "fiona", n, layer=0) + # print( + # f"fiona : {fiona_parquet_secs:.3f}s ({n / fiona_parquet_secs:,.0f} feat/s)" + # ) + # + # pyogrio_parquet_secs = time_engine(parquet_path, "pyogrio", n, layer=0) + # print( + # f"pyogrio: {pyogrio_parquet_secs:.3f}s ({n / pyogrio_parquet_secs:,.0f} feat/s)" + # ) + # ratio = fiona_parquet_secs / pyogrio_parquet_secs + # faster = "pyogrio" if ratio > 1 else "fiona" + # print(f"\n{faster} is {max(ratio, 1 / ratio):.2f}x faster (Parquet)") + + finally: + tmp_path.unlink(missing_ok=True) + gpkg_path.unlink(missing_ok=True) + shutil.rmtree(shp_dir, ignore_errors=True) + fgb_path.unlink(missing_ok=True) + + +if __name__ == "__main__": + main() diff --git a/src/rasterstats/_version.py b/src/rasterstats/_version.py index 5f4bb0b..6a726d8 100644 --- a/src/rasterstats/_version.py +++ b/src/rasterstats/_version.py @@ -1 +1 @@ -__version__ = "0.20.0" +__version__ = "0.21.0" diff --git a/src/rasterstats/io.py b/src/rasterstats/io.py index 7ae8d87..93e9015 100644 --- a/src/rasterstats/io.py +++ b/src/rasterstats/io.py @@ -5,11 +5,12 @@ from json import JSONDecodeError from os import PathLike -import fiona import numpy as np +import pyogrio +import pyogrio.raw import rasterio +import shapely from affine import Affine -from fiona.errors import DriverError from rasterio.enums import MaskFlags from rasterio.transform import guard_transform from shapely import wkb, wkt @@ -29,22 +30,114 @@ "MultiPolygon", ] -try: - # Fiona 1.9+ - import fiona.model +# Fiona backend + + +def _fiona_generator(obj, layer=0): + """Yield GeoJSON-like Feature dicts using fiona (optional engine). + + Raises ImportError with a helpful message if fiona is not installed. + """ + try: + import fiona + except ImportError: + raise ImportError( + "fiona is required for engine='fiona'. " + "Install it with: pip install rasterstats[fiona]" + ) + try: + import fiona.model - def fiona_generator(obj, layer=0): with fiona.open(obj, "r", layer=layer) as src: for feat in src: yield fiona.model.to_dict(feat) - -except ModuleNotFoundError: - # Fiona <1.9 - def fiona_generator(obj, layer=0): + except ImportError: + # fiona < 1.9 — no fiona.model with fiona.open(obj, "r", layer=layer) as src: yield from src +# pyogrio backend + +DEFAULT_CHUNK_SIZE = 65536 + + +def _pyogrio_generator(obj, layer=0, chunk_size=DEFAULT_CHUNK_SIZE): + """Yield GeoJSON-like Feature dicts using pyogrio, reading in chunks.""" + info = pyogrio.read_info(obj, layer=layer) + field_names = list(info["fields"]) + + # Some sources advertise `info["features"]" == -1` (WFS, certain VRT sources). + # ie they don't support *fast* feature counting, thus return -1 + # But they *do* support iteration, which is all we need. + + skip = 0 + while True: + _meta, fids, geometries, field_data = pyogrio.raw.read( + obj, + layer=layer, + skip_features=skip, + max_features=chunk_size, + return_fids=True, + # Note do not use_arrow=True, reads all records into mem + # https://pyogrio.readthedocs.io/en/latest/about.html#how-it-works + ) + batch_size = len(geometries) + geoms = shapely.from_wkb(geometries) + cols = [col.tolist() for col in field_data] + for i in range(batch_size): + props = {name: cols[j][i] for j, name in enumerate(field_names)} + geom = geoms[i] + fid = str(fids[i]) # matches engine=fiona behavior + yield { + "type": "Feature", + "id": fid, + "geometry": geom.__geo_interface__ if geom is not None else None, + "properties": props, + } + skip += batch_size + if batch_size < chunk_size: + break + + +# Public dispatcher + +DEFAULT_ENGINE = "pyogrio" + + +def feature_generator(obj, layer=0, engine=None, chunk_size=DEFAULT_CHUNK_SIZE): + """Yield GeoJSON-like Feature dicts from a file-based vector source. + + Parameters + ---------- + obj : str or PathLike + Path to a vector data source supported by pyogrio (default) or fiona. + layer : int or str, optional + Layer index or name (default: 0). + engine : {"pyogrio", "fiona"} or None, optional + Backend to use for reading. ``None`` selects the default engine + (``"pyogrio"``). Pass ``"fiona"`` to opt in to the fiona backend + (requires ``pip install rasterstats[fiona]``). + chunk_size : int, optional + Number of features to read per batch when using the pyogrio engine. + Reduce this value to lower peak memory usage for datasets with + large, vertex-dense geometries. Default: ``DEFAULT_CHUNK_SIZE``. + Has no effect when ``engine='fiona'``. + + Yields + ------ + dict + GeoJSON-like Feature dicts. + """ + resolved = engine if engine is not None else DEFAULT_ENGINE + if resolved == "pyogrio": + yield from _pyogrio_generator(obj, layer=layer, chunk_size=chunk_size) + elif resolved == "fiona": + yield from _fiona_generator(obj, layer=layer) + else: + raise ValueError(f"Unknown engine {resolved!r}. Choose 'pyogrio' or 'fiona'.") + + def wrap_geom(geom): """Wraps a geometry dict in an GeoJSON Feature""" return {"type": "Feature", "properties": {}, "geometry": geom} @@ -89,24 +182,31 @@ def parse_feature(obj): raise ValueError(f"Can't parse {obj} as a geojson Feature object") -def read_features(obj, layer=0): +def _is_vector_file(path, layer): + """Return True if ``path`` is a readable vector file with at least one feature. + + Uses pyogrio.read_info for the probe — no fiona required. + Returns False for invalid paths, non-vector files, and genuinely empty sources. + A feature count of -1 (unknown, e.g. some GeoJSON drivers) is treated as + non-empty (True). + """ + try: + info = pyogrio.read_info(path, layer=layer) + # important: don't catch DataLayerErrors, let those bubble up directly + except pyogrio.errors.DataSourceError: + return False + return info["features"] != 0 + + +def read_features(obj, layer=0, engine=None, chunk_size=DEFAULT_CHUNK_SIZE): features_iter = None if isinstance(obj, (str, PathLike)): obj = str(obj) - try: - # test it as fiona data source - with fiona.open(obj, "r", layer=layer) as src: - assert len(src) > 0 - - features_iter = fiona_generator(obj, layer) - except ( - AssertionError, - DriverError, - OSError, - TypeError, - UnicodeDecodeError, - ValueError, - ): + if _is_vector_file(obj, layer): + features_iter = feature_generator( + obj, layer, engine=engine, chunk_size=chunk_size + ) + else: try: mapping = json.loads(obj) if "type" in mapping and mapping["type"] == "FeatureCollection": diff --git a/src/rasterstats/main.py b/src/rasterstats/main.py index 43d9dc1..0db7918 100644 --- a/src/rasterstats/main.py +++ b/src/rasterstats/main.py @@ -4,9 +4,10 @@ import numpy as np from affine import Affine +from shapely import Geometry from shapely.geometry import shape -from rasterstats.io import Raster, read_features +from rasterstats.io import DEFAULT_CHUNK_SIZE, Raster, read_features from rasterstats.utils import ( boxify_points, check_stats, @@ -71,6 +72,8 @@ def gen_zonal_stats( prefix=None, geojson_out=False, boundless=True, + engine=None, + chunk_size=DEFAULT_CHUNK_SIZE, **kwargs, ): """Zonal statistics of raster values aggregated to vector geometries. @@ -145,6 +148,19 @@ def gen_zonal_stats( Allow features that extend beyond the raster dataset's extent, default: True Cells outside dataset extents are treated as nodata. + engine : {"pyogrio", "fiona"} or None, optional + Backend to use when reading file-based vector sources. + ``None`` selects the default engine (``"pyogrio"``). + Pass ``"fiona"`` to opt in to the fiona backend + (requires ``pip install rasterstats[fiona]``). + + chunk_size : int, optional + Number of features to read per batch when using the pyogrio engine. + Reduce this value to lower peak memory usage for datasets with + large, vertex-dense geometries. Default: ``DEFAULT_CHUNK_SIZE``. + Has no effect when ``engine='fiona'`` or when ``vectors`` is not a + file path. + Returns ------- generator of dicts (if geojson_out is False) @@ -179,9 +195,11 @@ def gen_zonal_stats( band = band_num with Raster(raster, affine, nodata, band) as rast: - features_iter = read_features(vectors, layer) + features_iter = read_features(vectors, layer, engine=engine, chunk_size=chunk_size) for _, feat in enumerate(features_iter): - geom = shape(feat["geometry"]) + geom = feat["geometry"] + if not isinstance(geom, Geometry): + geom = shape(geom) if "Point" in geom.geom_type: geom = boxify_points(geom, rast) diff --git a/src/rasterstats/point.py b/src/rasterstats/point.py index f262c6c..aa2e20d 100644 --- a/src/rasterstats/point.py +++ b/src/rasterstats/point.py @@ -2,7 +2,7 @@ from shapely.geometry import shape from shapely.ops import transform -from rasterstats.io import Raster, read_features +from rasterstats.io import DEFAULT_CHUNK_SIZE, Raster, read_features def point_window_unitxy(x, y, affine): @@ -109,6 +109,8 @@ def gen_point_query( property_name="value", geojson_out=False, boundless=True, + engine=None, + chunk_size=DEFAULT_CHUNK_SIZE, ): """ Given a set of vector features and a raster, @@ -161,6 +163,19 @@ def gen_point_query( Allow features that extend beyond the raster dataset's extent, default: True Cells outside dataset extents are treated as nodata. + engine : {"pyogrio", "fiona"} or None, optional + Backend to use when reading file-based vector sources. + ``None`` selects the default engine (``"pyogrio"``). + Pass ``"fiona"`` to opt in to the fiona backend + (requires ``pip install rasterstats[fiona]``). + + chunk_size : int, optional + Number of features to read per batch when using the pyogrio engine. + Reduce this value to lower peak memory usage for datasets with + large, vertex-dense geometries. Default: ``DEFAULT_CHUNK_SIZE``. + Has no effect when ``engine='fiona'`` or when ``vectors`` is not a + file path. + Returns ------- generator of arrays (if ``geojson_out`` is False) @@ -169,7 +184,7 @@ def gen_point_query( if interpolate not in ["nearest", "bilinear"]: raise ValueError("interpolate must be nearest or bilinear") - features_iter = read_features(vectors, layer) + features_iter = read_features(vectors, layer, engine=engine, chunk_size=chunk_size) with Raster(raster, nodata=nodata, affine=affine, band=band) as rast: for feat in features_iter: diff --git a/tests/test_io.py b/tests/test_io.py index 3199658..8242452 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -1,17 +1,19 @@ import json from pathlib import Path -import fiona import numpy as np +import pyogrio import pytest import rasterio from shapely.geometry import shape from rasterstats.io import ( # todo parse_feature + DEFAULT_CHUNK_SIZE, Raster, + _is_vector_file, boundless_array, bounds_window, - fiona_generator, + feature_generator, read_featurecollection, read_features, rowcol, @@ -28,7 +30,7 @@ eps = 1e-6 -target_features = [f for f in fiona_generator(polygons)] +target_features = list(feature_generator(polygons)) target_geoms = [shape(f["geometry"]) for f in target_features] @@ -52,16 +54,24 @@ def _test_read_features_single(indata): def test_fiona_path(): - assert list(read_features(polygons)) == target_features + """read_features on a path uses pyogrio default; geometries match.""" + features = list(read_features(polygons)) + geoms = [shape(f["geometry"]) for f in features] + _compare_geomlists(geoms, target_geoms) def test_layer_index(): + fiona = pytest.importorskip("fiona") layer = fiona.listlayers(data_dir).index("polygons") - assert list(read_features(data_dir, layer=layer)) == target_features + result = list(read_features(data_dir, layer=layer)) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) def test_layer_name(): - assert list(read_features(data_dir, layer="polygons")) == target_features + result = list(read_features(data_dir, layer="polygons")) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) def test_path_unicode(): @@ -70,75 +80,78 @@ def test_path_unicode(): except NameError: # python3, it's already unicode upolygons = polygons - assert list(read_features(upolygons)) == target_features + result = list(read_features(upolygons)) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) def test_featurecollection(): - assert ( - read_featurecollection(polygons)["features"] - == list(read_features(polygons)) - == target_features - ) + fc_features = read_featurecollection(polygons)["features"] + rf_features = list(read_features(polygons)) + geoms_fc = [shape(f["geometry"]) for f in fc_features] + geoms_rf = [shape(f["geometry"]) for f in rf_features] + _compare_geomlists(geoms_fc, target_geoms) + _compare_geomlists(geoms_rf, target_geoms) def test_shapely(): - indata = [shape(f["geometry"]) for f in fiona_generator(polygons)] + indata = [shape(f["geometry"]) for f in feature_generator(polygons)] _test_read_features(indata) _test_read_features_single(indata[0]) def test_wkt(): - indata = [shape(f["geometry"]).wkt for f in fiona_generator(polygons)] + indata = [shape(f["geometry"]).wkt for f in feature_generator(polygons)] _test_read_features(indata) _test_read_features_single(indata[0]) def test_wkb(): - indata = [shape(f["geometry"]).wkb for f in fiona_generator(polygons)] + indata = [shape(f["geometry"]).wkb for f in feature_generator(polygons)] _test_read_features(indata) _test_read_features_single(indata[0]) def test_mapping_features(): # list of Features - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) _test_read_features(indata) def test_mapping_feature(): # list of Features - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) _test_read_features(indata[0]) def test_mapping_geoms(): - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) _test_read_features(indata[0]["geometry"]) def test_mapping_collection(): indata = {"type": "FeatureCollection"} - indata["features"] = [f for f in fiona_generator(polygons)] + indata["features"] = list(feature_generator(polygons)) _test_read_features(indata) def test_jsonstr(): # Feature str - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) indata = json.dumps(indata[0]) _test_read_features(indata) def test_jsonstr_geom(): # geojson geom str - indata = [f for f in fiona_generator(polygons)] + indata = list(feature_generator(polygons)) indata = json.dumps(indata[0]["geometry"]) _test_read_features(indata) def test_jsonstr_collection(): indata = {"type": "FeatureCollection"} - indata["features"] = [f for f in fiona_generator(polygons)] + indata["features"] = list(feature_generator(polygons)) indata = json.dumps(indata) _test_read_features(indata) @@ -163,19 +176,19 @@ def __init__(self, f): def test_geo_interface(): - indata = [MockGeoInterface(f) for f in fiona_generator(polygons)] + indata = [MockGeoInterface(f) for f in feature_generator(polygons)] _test_read_features(indata) def test_geo_interface_geom(): - indata = [MockGeoInterface(f["geometry"]) for f in fiona_generator(polygons)] + indata = [MockGeoInterface(f["geometry"]) for f in feature_generator(polygons)] _test_read_features(indata) def test_geo_interface_collection(): # geointerface for featurecollection? indata = {"type": "FeatureCollection"} - indata["features"] = [f for f in fiona_generator(polygons)] + indata["features"] = list(feature_generator(polygons)) indata = MockGeoInterface(indata) _test_read_features(indata) @@ -370,7 +383,150 @@ def next(self): assert list(read_features(geothing)) == features +# feature_generator / engine tests + + +def test_feature_generator_pyogrio_default(): + """feature_generator with no engine= uses pyogrio (new default).""" + result = list(feature_generator(polygons)) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) + + +def test_feature_generator_explicit_fiona(): + """feature_generator with engine='fiona' yields correct geometries.""" + pytest.importorskip("fiona") + result = list(feature_generator(polygons, engine="fiona")) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) + + +def test_feature_generator_pyogrio(): + """feature_generator with engine='pyogrio' yields same geometries.""" + pytest.importorskip("pyogrio") + result = list(feature_generator(polygons, engine="pyogrio")) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) + + +def test_feature_generator_pyogrio_properties(): + """pyogrio engine preserves feature properties.""" + pytest.importorskip("pyogrio") + result = list(feature_generator(polygons, engine="pyogrio")) + assert all("properties" in f for f in result) + assert all("id" in f["properties"] for f in result) + + +def test_feature_generator_invalid_engine(): + """An unrecognised engine name raises ValueError.""" + with pytest.raises(ValueError, match="Unknown engine"): + list(feature_generator(polygons, engine="gdal")) + + +def test_read_features_pyogrio_engine(): + """read_features forwards engine= to the underlying generator.""" + pytest.importorskip("pyogrio") + result = list(read_features(polygons, engine="pyogrio")) + geoms = [shape(f["geometry"]) for f in result] + _compare_geomlists(geoms, target_geoms) + + +def test_pyogrio_null_geometry(tmp_path): + """_pyogrio_generator yields None geometry for features with null geometry + rather than raising AttributeError.""" + geojson = { + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [0, 0]}, + "properties": {"n": 1}, + }, + {"type": "Feature", "geometry": None, "properties": {"n": 2}}, + { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [1, 1]}, + "properties": {"n": 3}, + }, + ], + } + p = tmp_path / "null_geom.geojson" + p.write_text(json.dumps(geojson)) + + results = list(feature_generator(str(p), engine="pyogrio")) + assert len(results) == 3 + assert results[0]["geometry"] is not None + assert results[1]["geometry"] is None + assert results[2]["geometry"] is not None + assert results[1]["properties"]["n"] == 2 + + +def test_pyogrio_unknown_feature_count(tmp_path, monkeypatch): + """_pyogrio_generator reads all features when read_info returns features=-1. + + Some GDAL drivers (e.g. WFS) cannot report feature count ahead of time and + return -1. The old ``while skip < total`` loop silently yielded nothing in + this case. The new unconditional loop must still yield all features. + """ + import pyogrio + + geojson = { + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [float(i), 0.0]}, + "properties": {"n": i}, + } + for i in range(5) + ], + } + p = tmp_path / "unknown_count.geojson" + p.write_text(json.dumps(geojson)) + + real_read_info = pyogrio.read_info + + def fake_read_info(path, **kwargs): + info = real_read_info(path, **kwargs) + info["features"] = -1 + return info + + monkeypatch.setattr(pyogrio, "read_info", fake_read_info) + # Also patch the name used inside the io module + import rasterstats.io as io_mod + + monkeypatch.setattr(io_mod, "pyogrio", pyogrio) + + results = list(feature_generator(str(p), engine="pyogrio")) + assert len(results) == 5 + assert [f["properties"]["n"] for f in results] == list(range(5)) + + +# Test detection of vector files +# Private function but deserves unit tests + + +def test_is_vector_file(): + assert _is_vector_file(polygons, "polygons") + + +def test_wrong_vector_layer(): + "Data source is fine. DataLayerErrors pass through to fail fast." + with pytest.raises(pyogrio.errors.DataLayerError): + _is_vector_file(polygons, "something-else") + + +def test_isnt_vector_file(): + assert not _is_vector_file(raster, "raster") + + +def test_is_absent_file(): + assert not _is_vector_file("/tmp/sdfjaohgoadfknkjdsfj", "dnf") + + # Optional tests + + def test_geodataframe(): gpd = pytest.importorskip("geopandas") @@ -380,6 +536,35 @@ def test_geodataframe(): assert list(read_features(df)) +def test_feature_generator_chunk_size(tmp_path): + """Reading with default engine, fiona engine, and alternative chunk sizes + all result in equivalent feature dicts.""" + geojson = { + "type": "FeatureCollection", + "features": [ + { + "type": "Feature", + "geometry": {"type": "Point", "coordinates": [float(i), 0.0]}, + "properties": {"n": i}, + } + for i in range(6) + ], + } + p = tmp_path / "chunked.geojson" + p.write_text(json.dumps(geojson)) + + default_results = list(feature_generator(polygons)) + small_chunk_results = list(feature_generator(polygons, chunk_size=2)) + fiona_results = list(feature_generator(polygons, engine="fiona", chunk_size=2)) + + assert default_results == small_chunk_results + # cannot compare them directly since they differ in tuples vs list representation for some reason + # let json roundtrip normalize it + assert json.loads(json.dumps(default_results)) == json.loads( + json.dumps(fiona_results) + ) + + # TODO # io.parse_features on a feature-only geo_interface # TODO # io.parse_features on a feature-only geojson-like object # TODO # io.read_features on a feature-only diff --git a/tests/test_zonal.py b/tests/test_zonal.py index c906a53..d39ed59 100644 --- a/tests/test_zonal.py +++ b/tests/test_zonal.py @@ -558,6 +558,14 @@ def test_nan_counts(): assert "nan" not in res +def test_zonal_stats_chunk_size_stats(): + """chunk_size= flows all the way through to _pyogrio_generator.""" + polygons = data_dir / "polygons.shp" + default_stats = zonal_stats(polygons, raster) + chunked_stats = zonal_stats(polygons, raster, chunk_size=1) + assert default_stats == chunked_stats + + # Optional tests def test_geodataframe_zonal(): gpd = pytest.importorskip("geopandas")