From 4a1058ada87b636f2a2e32d3d906f5f02c4a7246 Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Thu, 30 Apr 2026 12:37:18 +0100 Subject: [PATCH 01/15] added cell stats operator and changed nan behaviour in plot_histogram_series Co-authored-by: Copilot --- src/CSET/operators/feature.py | 144 ++++++++++++++++++++++++++++++++++ src/CSET/operators/plot.py | 4 +- 2 files changed, 146 insertions(+), 2 deletions(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index a3cfe0c72..6f35b9a02 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -184,3 +184,147 @@ def track( tracking_cubelist.append(tracking_cube) return tracking_cubelist + + +def cell_stats( + cube: iris.cube.Cube, + threshold: float, + under_threshold: bool = False, + min_size: int = 4, + save_data: bool = False, +): + """Identify features in each timestep and output statistics. + + Parameters + ---------- + threshold: float + The threshold value for feature detection. + under_threshold: bool, optional + If set to True, features are identified where the data is below the threshold. + If set to False, features are identified where the data is above the threshold. + Default is False. + min_size: int, optional + The minimum number of contiguous grid points required for a feature to be tracked. + Default is 4. + save_data: bool, optional + If set to True, all tracking data is saved to disk for further analysis (including csv + and txt files containing feature properties that are not returned in output cubes). + Default is False. + + Returns + ------- + cell_stats_cubes: iris.cube.CubeList + An iris CubeList containing "feature_size", "feature_mean", and "feature_max" cubes. + + Notes + ----- + This operator uses the Simple-Track package with tracking disabled to identify features + in each timestep and compile cell statistics. Outputs cubes containing feature size (number + of grid points), mean value within features, and maximum value within features. + + Links + ---------- + .. https://github.com/ParaChute-UK/simple-track + + Examples + -------- + >>> cell_stats_cubes = feature.cell_stats(threshold=2) + >>> feature_size_cube = cell_stats_cubes.extract_cube("feature_size") + >>> plt.hist(feature_size_cube[-1]) + >>> plt.show() + + """ + # Setup config + tracker_config = { + "FEATURE": { + "threshold": threshold, + "under_threshold": under_threshold, + "min_size": min_size, + }, + "OUTPUT": { + "save_data": save_data, + "experiment_name": "feature_tracking", + "path": f"{os.getcwd()}/cell-stats_data", + "skip_tracking": True, + }, + } + logging.debug(f"Tracker config: {tracker_config}") + + # Get cube data into a dict to pass to Tracker + times = cube.coord("time").points + time_units = cube.coord("time").units + times_dt = [time_units.num2pydate(t) for t in times] + cube_dict = { + time: cube_slice.data + for time, cube_slice in zip(times_dt, cube.slices_over("time"), strict=True) + } + + # Run tracking, returning Timeline object + timeline = Tracker(tracker_config).run(cube_dict) + logging.debug("Tracking completed") + + # Get feature data from each frame of data, append to list of lists + # before conversion to numpy array (which requires knowledge of max + # number of features before constructing array shape) + size_data, mean_data, max_data = [], [], [] + number_of_features = [] + for time in times_dt: + frame = timeline.get_frame(time) + features = frame.features + size_data.append([feature.get_size() for feature in features.values()]) + mean_data.append([feature.mean for feature in features.values()]) + max_data.append([feature.max for feature in features.values()]) + number_of_features.append(len(features)) + + # Pad data with NaNs to create arrays of consistent shape (max number of features across + # all timesteps) + arr_size = max(number_of_features) + + # Size data is integer, but we need to pad with NaNs, so fill with invalid value first + size_data = np.array( + [ + np.pad(sizes, (0, arr_size - len(sizes)), constant_values=-100) + for sizes in size_data + ], + dtype=float, + ) + size_data[size_data == -100] = np.nan + + # Mean and max data are already float, so can be padded with NaNs directly. + mean_data = np.array( + [ + np.pad(means, (0, arr_size - len(means)), constant_values=np.nan) + for means in mean_data + ] + ) + max_data = np.array( + [ + np.pad(maxs, (0, arr_size - len(maxs)), constant_values=np.nan) + for maxs in max_data + ] + ) + + # Create cubes + time_coord = cube.coord("time").copy() + feature_coord = iris.coords.DimCoord( + np.arange(arr_size), + long_name="feature_number", + var_name="feature_number", + units="1", + ) + coords = [time_coord, feature_coord] + coords_and_dims = [(coord, i) for i, coord in enumerate(coords)] + + cell_stats_cubelist = iris.cube.CubeList() + cube_names = ["feature_size", "feature_mean", "feature_max"] + data_arrays = [size_data, mean_data, max_data] + for cube_name, data_array in zip(cube_names, data_arrays, strict=True): + cell_stats_cube = iris.cube.Cube( + data_array, + long_name=cube_name, + units="1", + dim_coords_and_dims=coords_and_dims, + ) + cell_stats_cubelist.append(cell_stats_cube) + + return cell_stats_cubelist diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index 5ac8966a6..a88626181 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -3074,8 +3074,8 @@ def plot_histogram_series( break if levels is None: - vmin = min(cb.data.min() for cb in cubes) - vmax = max(cb.data.max() for cb in cubes) + vmin = min(np.nanmin(cb.data) for cb in cubes) + vmax = max(np.nanmax(cb.data) for cb in cubes) # Make postage stamp plots if stamp_coordinate exists and has more than a # single point. If single_plot is True: From 40f87c21012bcba52d4bd6d7c499cf3c22b4f878 Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Thu, 30 Apr 2026 12:39:26 +0100 Subject: [PATCH 02/15] added example cell_stats recipe --- .../example_feature_cell_stats.yaml | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100755 src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml diff --git a/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml new file mode 100755 index 000000000..728c16cd1 --- /dev/null +++ b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml @@ -0,0 +1,26 @@ +category: Quick Look +title: Example running cell_stats and plotting histograms +description: | + Uses the feature.cell_stats operator to calculate cell size, mean cell values and max cell values for + identified features. + +steps: + - operator: read.read_cubes + file_paths: $INPUT_PATHS + + - operator: filters.filter_cubes + constraint: + operator: constraints.generate_var_constraint + varname: precipitation_flux + + - operator: feature.cell_stats + threshold: 3 + + # Filter tracking cubelist to one of "feature_size", "feature_mean" or "feature_max" + - operator: filters.filter_cubes + constraint: + operator: constraints.generate_var_constraint + varname: feature_max + + - operator: plot.plot_histogram_series + From 74cb58e71d8d0fc4e8ba6981fd5dde0f47b3a1ac Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Fri, 1 May 2026 14:15:05 +0100 Subject: [PATCH 03/15] log-log plotting for features, and frequency on y-axis rather than density --- src/CSET/operators/plot.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index a88626181..562c2d5b7 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -1368,7 +1368,10 @@ def _plot_and_save_histogram_series( # Set default that histograms will produce probability density function # at each bin (integral over range sums to 1). - density = True + if "feature" in cubes[0].long_name: + density = False + else: + density = True for cube in iter_maybe(cubes): # Easier to check title (where var name originates) @@ -1400,6 +1403,10 @@ def _plot_and_save_histogram_series( np.max(bins), ) + if "feature" in cube.long_name: + ax.set_yscale("log") + ax.set_xscale("log") + # Reshape cube data into a single array to allow for a single histogram. # Otherwise we plot xdim histograms stacked. cube_data_1d = (cube.data).flatten() @@ -1432,6 +1439,8 @@ def _plot_and_save_histogram_series( ax.set_ylabel( f"Contribution to mean ({iter_maybe(cubes)[0].units})", fontsize=14 ) + if "feature" in cubes[0].long_name: + ax.set_ylabel("Frequency", fontsize=14) ax.set_xlim(vmin, vmax) ax.tick_params(axis="both", labelsize=12) From e1070751fc8cce6920a0e604f90bbd89d329f423 Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Fri, 1 May 2026 14:25:33 +0100 Subject: [PATCH 04/15] used same log bins for "surface_microphysical" as "feature" cubes - may need revisiting for mean and max --- src/CSET/operators/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index 562c2d5b7..9e0cca1ab 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -1377,7 +1377,7 @@ def _plot_and_save_histogram_series( # Easier to check title (where var name originates) # than seeing if long names exist etc. # Exception case, where distribution better fits log scales/bins. - if "surface_microphysical" in title: + if "surface_microphysical" in title or "feature" in cube.long_name: if "amount" in title: # Compute histogram following Klingaman et al. (2017): ASoP bin2 = np.exp(np.log(0.02) + 0.1 * np.linspace(0, 99, 100)) From f75e6d9c357aeceda20ecd6b519f55e142b5b56a Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Fri, 1 May 2026 15:51:51 +0100 Subject: [PATCH 05/15] added feature_effective_radius as alternative to feature_size Co-authored-by: Copilot --- src/CSET/operators/feature.py | 50 ++++++++++++++++--- .../example_feature_cell_stats.yaml | 6 ++- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index 6f35b9a02..6dd32c510 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -17,6 +17,8 @@ import os import iris +import iris.cube +import iris.util import numpy as np from simpletrack.track import Tracker @@ -191,6 +193,7 @@ def cell_stats( threshold: float, under_threshold: bool = False, min_size: int = 4, + feature_size_unit: str = "feature_effective_radius", save_data: bool = False, ): """Identify features in each timestep and output statistics. @@ -206,6 +209,11 @@ def cell_stats( min_size: int, optional The minimum number of contiguous grid points required for a feature to be tracked. Default is 4. + feature_size_unit: str, optional + The unit to define feature size output. Options are "feature_effective_radius" + (the radius of a circle with the same area as the feature, in km) or " + feature_grid_points" (the number of grid points contained in the feature). + Default is "feature_effective_radius". save_data: bool, optional If set to True, all tracking data is saved to disk for further analysis (including csv and txt files containing feature properties that are not returned in output cubes). @@ -314,15 +322,43 @@ def cell_stats( ) coords = [time_coord, feature_coord] coords_and_dims = [(coord, i) for i, coord in enumerate(coords)] - cell_stats_cubelist = iris.cube.CubeList() - cube_names = ["feature_size", "feature_mean", "feature_max"] - data_arrays = [size_data, mean_data, max_data] - for cube_name, data_array in zip(cube_names, data_arrays, strict=True): + + # Set cube properties + cube_properties = { + "feature_size": { + "data": size_data, + "long_name": feature_size_unit, + "units": "1", + }, + "feature_mean": {"data": mean_data, "long_name": "feature_mean", "units": 1}, + "feature_max": {"data": max_data, "long_name": "feature_max", "units": 1}, + } + if feature_size_unit == "feature_effective_radius": + # Conert feature size to effective radius, assuming uniform grid + # Guess coord representing horizontal grid (choose first available) + hzntl_coord = [ + coord + for coord in cube.coords() + if iris.util.guess_coord_axis(coord) in ["X", "Y"] + ][0] + logging.debug(f"Attempting to convert to effective radius using {hzntl_coord}") + # Convert to km if possible + hzntl_coord.convert_units("km") + # Naive grid spacing estimate, correct for regular grids, likely wildly + # inaccruate for LFRic/irregular grids + grid_spacing = np.abs(np.diff(hzntl_coord.points).mean()) + # Convert feature size in grid points to effective radius in km + size_data = np.sqrt(size_data * grid_spacing**2 / np.pi) + # Reset cube properties + cube_properties["feature_size"]["data"] = size_data + cube_properties["feature_size"]["units"] = "km" + + for cb_props in cube_properties.values(): cell_stats_cube = iris.cube.Cube( - data_array, - long_name=cube_name, - units="1", + data=cb_props["data"], + long_name=cb_props["long_name"], + units=cb_props["units"], dim_coords_and_dims=coords_and_dims, ) cell_stats_cubelist.append(cell_stats_cube) diff --git a/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml index 728c16cd1..300417442 100755 --- a/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml +++ b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml @@ -14,13 +14,15 @@ steps: varname: precipitation_flux - operator: feature.cell_stats + feature_size_unit: "feature_effective_radius" threshold: 3 - # Filter tracking cubelist to one of "feature_size", "feature_mean" or "feature_max" + # Filter tracking cubelist to one of "feature_mean", "feature_max", or + # "feature_size/feature_effective_radius" (depending on feature_size_unit set above), - operator: filters.filter_cubes constraint: operator: constraints.generate_var_constraint - varname: feature_max + varname: feature_effective_radius - operator: plot.plot_histogram_series From e928ba892cf757b3875b9e5d42c9531a50e1aaac Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Fri, 1 May 2026 15:56:26 +0100 Subject: [PATCH 06/15] additional comments --- src/CSET/operators/feature.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index 6dd32c510..c58151741 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -288,7 +288,8 @@ def cell_stats( # all timesteps) arr_size = max(number_of_features) - # Size data is integer, but we need to pad with NaNs, so fill with invalid value first + # Size data is integer, but we need to pad with NaNs (which is a float), so fill + # with invalid value first size_data = np.array( [ np.pad(sizes, (0, arr_size - len(sizes)), constant_values=-100) @@ -344,6 +345,7 @@ def cell_stats( ][0] logging.debug(f"Attempting to convert to effective radius using {hzntl_coord}") # Convert to km if possible + # TODO: fall back to feature_size if this conversion fails hzntl_coord.convert_units("km") # Naive grid spacing estimate, correct for regular grids, likely wildly # inaccruate for LFRic/irregular grids @@ -354,6 +356,7 @@ def cell_stats( cube_properties["feature_size"]["data"] = size_data cube_properties["feature_size"]["units"] = "km" + # Populate cubelist for cb_props in cube_properties.values(): cell_stats_cube = iris.cube.Cube( data=cb_props["data"], From 730c7aa1dc1be06b0e2009e74dd097431e898769 Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Fri, 1 May 2026 18:00:19 +0100 Subject: [PATCH 07/15] added simple-track to dependencies --- pyproject.toml | 1 + requirements/environment.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index e23672040..b5716c1b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "scipy", "scikit-image", "dask", + "simple-track", ] [project.urls] diff --git a/requirements/environment.yml b/requirements/environment.yml index af5ba93ac..2fbcaa3cb 100644 --- a/requirements/environment.yml +++ b/requirements/environment.yml @@ -16,6 +16,7 @@ dependencies: - scipy - scikit-image # For image processing techniques. - dask + - simple-track # Build dependencies - setuptools>=64 From a3bce7723a9a190c9b88f98a0dfb153612c56d5c Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Fri, 8 May 2026 13:25:03 +0100 Subject: [PATCH 08/15] added cube metadata copying to feature cubes, and error catching for feature_effective_radius conversion --- src/CSET/operators/feature.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index c58151741..a9ebbcec3 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -346,7 +346,12 @@ def cell_stats( logging.debug(f"Attempting to convert to effective radius using {hzntl_coord}") # Convert to km if possible # TODO: fall back to feature_size if this conversion fails - hzntl_coord.convert_units("km") + # TODO: this will still add the name "feature_effective_radius" to the feature_size + # cube, even though the data is actually pixel size. Figure out elegant solution. + try: + hzntl_coord.convert_units("km") + except: + pass # Naive grid spacing estimate, correct for regular grids, likely wildly # inaccruate for LFRic/irregular grids grid_spacing = np.abs(np.diff(hzntl_coord.points).mean()) @@ -356,6 +361,25 @@ def cell_stats( cube_properties["feature_size"]["data"] = size_data cube_properties["feature_size"]["units"] = "km" + # Get list of coords to copy from input cube to output cubes + copyable_coord_names = [ + "realization", + "hour", + "forecast_period", + "forecast_reference_time", + "model_name", + ] + input_cube_coord_names = [] + for coord in cube.coords(): + input_cube_coord_names.append(coord.standard_name) + input_cube_coord_names.append(coord.long_name) + + coords_to_copy = [ + coord_name + for coord_name in copyable_coord_names + if coord_name in input_cube_coord_names + ] + # Populate cubelist for cb_props in cube_properties.values(): cell_stats_cube = iris.cube.Cube( @@ -364,6 +388,11 @@ def cell_stats( units=cb_props["units"], dim_coords_and_dims=coords_and_dims, ) + # Add other metadata from input cube + for coord_name in coords_to_copy: + cell_stats_cube.add_aux_coord(cube.coord(coord_name).copy()) + + # Add to cubelist cell_stats_cubelist.append(cell_stats_cube) return cell_stats_cubelist From c52c4302851076a8a75d79fe62c44d957d0d7357 Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Fri, 8 May 2026 13:31:04 +0100 Subject: [PATCH 09/15] added cset_comparison_base to list of coord metadata to copy to feature cube (may not be necessary?) --- src/CSET/operators/feature.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index a9ebbcec3..a33a385f4 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -368,6 +368,7 @@ def cell_stats( "forecast_period", "forecast_reference_time", "model_name", + "cset_comparison_base", ] input_cube_coord_names = [] for coord in cube.coords(): From 54c3db580e803ee70f3d2a7aa86b26bd97e63e1e Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Wed, 13 May 2026 11:06:41 +0100 Subject: [PATCH 10/15] copying coord to cell_stats_cube now copies dimension information (assuming same cube structure) --- src/CSET/operators/feature.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index a33a385f4..adefa9bf3 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -391,7 +391,13 @@ def cell_stats( ) # Add other metadata from input cube for coord_name in coords_to_copy: - cell_stats_cube.add_aux_coord(cube.coord(coord_name).copy()) + coord = cube.coord(coord_name).copy() + # Check if this coord represents a dimension of data + dims = cube.coord_dims(coord) + if len(dims) > 1: + cell_stats_cube.add_aux_coord(coord, dims) + else: + cell_stats_cube.add_aux_coord(coord) # Add to cubelist cell_stats_cubelist.append(cell_stats_cube) From dda9d48c57f747bbe0e8d445a7186f7ad72b63e0 Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Tue, 19 May 2026 15:04:53 +0100 Subject: [PATCH 11/15] removed feature_effective_radius support for now, added support for multiple models in cell_stats operator --- src/CSET/operators/feature.py | 331 +++++++++--------- .../example_feature_cell_stats.yaml | 19 +- 2 files changed, 182 insertions(+), 168 deletions(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index adefa9bf3..f9ce014d1 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -22,6 +22,8 @@ import numpy as np from simpletrack.track import Tracker +from CSET._common import iter_maybe + def track( cube: iris.cube.Cube, @@ -67,7 +69,7 @@ def track( Returns ------- tracking_cubes: iris.cube.CubeList - A list of iris cubes containing tracking data, including feauture ID, lifetime, + A list of iris cubes containing tracking data, including feature ID, lifetime, and locations of initiating features. Notes @@ -92,7 +94,7 @@ def track( "feature_init": A 2D binary field indicating the location of newly initiated features at each timestep. These features are identified as having a lifetime of 1 AND have initiated sufficiently - far from other, existing features that they are not considered to have spawed from them. + far from other, existing features that they are not considered to have spawned from them. Links ---------- @@ -189,7 +191,7 @@ def track( def cell_stats( - cube: iris.cube.Cube, + cubes: iris.cube.Cube | iris.cube.CubeList, threshold: float, under_threshold: bool = False, min_size: int = 4, @@ -242,164 +244,179 @@ def cell_stats( >>> plt.show() """ - # Setup config - tracker_config = { - "FEATURE": { - "threshold": threshold, - "under_threshold": under_threshold, - "min_size": min_size, - }, - "OUTPUT": { - "save_data": save_data, - "experiment_name": "feature_tracking", - "path": f"{os.getcwd()}/cell-stats_data", - "skip_tracking": True, - }, - } - logging.debug(f"Tracker config: {tracker_config}") - - # Get cube data into a dict to pass to Tracker - times = cube.coord("time").points - time_units = cube.coord("time").units - times_dt = [time_units.num2pydate(t) for t in times] - cube_dict = { - time: cube_slice.data - for time, cube_slice in zip(times_dt, cube.slices_over("time"), strict=True) - } + # Check inputs + cubes = iter_maybe(cubes) + cell_stats_cubelist = iris.cube.CubeList() - # Run tracking, returning Timeline object - timeline = Tracker(tracker_config).run(cube_dict) - logging.debug("Tracking completed") + # Run tracking on all input data + for cube in cubes: + model_name = cube.attributes.get("model_name", None) + # Setup config + tracker_config = { + "FEATURE": { + "threshold": threshold, + "under_threshold": under_threshold, + "min_size": min_size, + }, + "OUTPUT": { + "save_data": save_data, + "experiment_name": "feature_tracking", + "path": f"{os.getcwd()}/{model_name}/cell-stats_data", + "skip_tracking": True, + }, + } + logging.debug(f"Tracker config: {tracker_config}") + + # Get cube data into a dict to pass to Tracker + times = cube.coord("time").points + time_units = cube.coord("time").units + times_dt = [time_units.num2pydate(t) for t in times] + cube_dict = { + time: cube_slice.data + for time, cube_slice in zip(times_dt, cube.slices_over("time"), strict=True) + } + + # Run tracking, returning Timeline object + timeline = Tracker(tracker_config).run(cube_dict) + + logging.debug(f"Tracking completed for {model_name}") + + # Get feature data from each frame of data, append to list of lists + # before conversion to numpy array (which requires knowledge of max + # number of features before constructing array shape) + size_data, mean_data, max_data = [], [], [] + number_of_features = [] + for time in times_dt: + frame = timeline.get_frame(time) + features = frame.features + size_data.append([feature.get_size() for feature in features.values()]) + mean_data.append([feature.mean for feature in features.values()]) + max_data.append([feature.max for feature in features.values()]) + number_of_features.append(len(features)) + + # Pad data with NaNs to create arrays of consistent shape (max number of features across + # all timesteps) + arr_size = max(number_of_features) + + # Size data is integer, but we need to pad with NaNs (which is a float), so fill + # with invalid value first + size_data = np.array( + [ + np.pad(sizes, (0, arr_size - len(sizes)), constant_values=-100) + for sizes in size_data + ], + dtype=float, + ) + size_data[size_data == -100] = np.nan + + # Mean and max data are already float, so can be padded with NaNs directly. + mean_data = np.array( + [ + np.pad(means, (0, arr_size - len(means)), constant_values=np.nan) + for means in mean_data + ] + ) + max_data = np.array( + [ + np.pad(maxs, (0, arr_size - len(maxs)), constant_values=np.nan) + for maxs in max_data + ] + ) - # Get feature data from each frame of data, append to list of lists - # before conversion to numpy array (which requires knowledge of max - # number of features before constructing array shape) - size_data, mean_data, max_data = [], [], [] - number_of_features = [] - for time in times_dt: - frame = timeline.get_frame(time) - features = frame.features - size_data.append([feature.get_size() for feature in features.values()]) - mean_data.append([feature.mean for feature in features.values()]) - max_data.append([feature.max for feature in features.values()]) - number_of_features.append(len(features)) - - # Pad data with NaNs to create arrays of consistent shape (max number of features across - # all timesteps) - arr_size = max(number_of_features) - - # Size data is integer, but we need to pad with NaNs (which is a float), so fill - # with invalid value first - size_data = np.array( - [ - np.pad(sizes, (0, arr_size - len(sizes)), constant_values=-100) - for sizes in size_data - ], - dtype=float, - ) - size_data[size_data == -100] = np.nan - - # Mean and max data are already float, so can be padded with NaNs directly. - mean_data = np.array( - [ - np.pad(means, (0, arr_size - len(means)), constant_values=np.nan) - for means in mean_data + # Create cubes + time_coord = cube.coord("time").copy() + feature_coord = iris.coords.DimCoord( + np.arange(arr_size), + long_name="feature_number", + var_name="feature_number", + units="1", + ) + coords = [time_coord, feature_coord] + coords_and_dims = [(coord, i) for i, coord in enumerate(coords)] + + # Set cube properties + cube_properties = { + "feature_size": { + "data": size_data, + "long_name": feature_size_unit, + "units": "1", + }, + "feature_mean": { + "data": mean_data, + "long_name": "feature_mean", + "units": 1, + }, + "feature_max": {"data": max_data, "long_name": "feature_max", "units": 1}, + } + # TODO: need to consider better implementation which doesn't overwrite feature_size cube + # TODO: consider iris.analysis.cartography.area_weights() or hzntl_coord.is_regular() + # to check for regular spacing + # TODO: Check for hzntl_coords is empty list and handle elegantly + # if feature_size_unit == "feature_effective_radius": + # # Convert feature size to effective radius, assuming uniform grid + # # Guess coord representing horizontal grid (choose first available) + # hzntl_coord = [ + # coord + # for coord in cube.coords() + # if iris.util.guess_coord_axis(coord) in ["X", "Y"] + # ][0] + # logging.debug(f"Attempting to convert to effective radius using {hzntl_coord}") + # # Convert to km if possible + # # TODO: fall back to feature_size if this conversion fails + # # TODO: this will still add the name "feature_effective_radius" to the feature_size + # # cube, even though the data is actually pixel size. Figure out elegant solution. + # try: + # hzntl_coord.convert_units("km") + # except: + # pass + # # Naive grid spacing estimate, correct for regular grids, likely wildly + # # inaccruate for LFRic/irregular grids + # grid_spacing = np.abs(np.diff(hzntl_coord.points).mean()) + # # Convert feature size in grid points to effective radius in km + # size_data = np.sqrt(size_data * grid_spacing**2 / np.pi) + # # Reset cube properties + # cube_properties["feature_size"]["data"] = size_data + # cube_properties["feature_size"]["units"] = "km" + + # Get list of coords to copy from input cube to output cubes + copyable_coord_names = [ + "realization", + "hour", + "forecast_period", + "forecast_reference_time", + "model_name", + "cset_comparison_base", ] - ) - max_data = np.array( - [ - np.pad(maxs, (0, arr_size - len(maxs)), constant_values=np.nan) - for maxs in max_data + input_cube_coord_names = [] + for coord in cube.coords(): + input_cube_coord_names.append(coord.standard_name) + input_cube_coord_names.append(coord.long_name) + + coords_to_copy = [ + coord_name + for coord_name in copyable_coord_names + if coord_name in input_cube_coord_names ] - ) - - # Create cubes - time_coord = cube.coord("time").copy() - feature_coord = iris.coords.DimCoord( - np.arange(arr_size), - long_name="feature_number", - var_name="feature_number", - units="1", - ) - coords = [time_coord, feature_coord] - coords_and_dims = [(coord, i) for i, coord in enumerate(coords)] - cell_stats_cubelist = iris.cube.CubeList() - - # Set cube properties - cube_properties = { - "feature_size": { - "data": size_data, - "long_name": feature_size_unit, - "units": "1", - }, - "feature_mean": {"data": mean_data, "long_name": "feature_mean", "units": 1}, - "feature_max": {"data": max_data, "long_name": "feature_max", "units": 1}, - } - if feature_size_unit == "feature_effective_radius": - # Conert feature size to effective radius, assuming uniform grid - # Guess coord representing horizontal grid (choose first available) - hzntl_coord = [ - coord - for coord in cube.coords() - if iris.util.guess_coord_axis(coord) in ["X", "Y"] - ][0] - logging.debug(f"Attempting to convert to effective radius using {hzntl_coord}") - # Convert to km if possible - # TODO: fall back to feature_size if this conversion fails - # TODO: this will still add the name "feature_effective_radius" to the feature_size - # cube, even though the data is actually pixel size. Figure out elegant solution. - try: - hzntl_coord.convert_units("km") - except: - pass - # Naive grid spacing estimate, correct for regular grids, likely wildly - # inaccruate for LFRic/irregular grids - grid_spacing = np.abs(np.diff(hzntl_coord.points).mean()) - # Convert feature size in grid points to effective radius in km - size_data = np.sqrt(size_data * grid_spacing**2 / np.pi) - # Reset cube properties - cube_properties["feature_size"]["data"] = size_data - cube_properties["feature_size"]["units"] = "km" - - # Get list of coords to copy from input cube to output cubes - copyable_coord_names = [ - "realization", - "hour", - "forecast_period", - "forecast_reference_time", - "model_name", - "cset_comparison_base", - ] - input_cube_coord_names = [] - for coord in cube.coords(): - input_cube_coord_names.append(coord.standard_name) - input_cube_coord_names.append(coord.long_name) - - coords_to_copy = [ - coord_name - for coord_name in copyable_coord_names - if coord_name in input_cube_coord_names - ] - - # Populate cubelist - for cb_props in cube_properties.values(): - cell_stats_cube = iris.cube.Cube( - data=cb_props["data"], - long_name=cb_props["long_name"], - units=cb_props["units"], - dim_coords_and_dims=coords_and_dims, - ) - # Add other metadata from input cube - for coord_name in coords_to_copy: - coord = cube.coord(coord_name).copy() - # Check if this coord represents a dimension of data - dims = cube.coord_dims(coord) - if len(dims) > 1: - cell_stats_cube.add_aux_coord(coord, dims) - else: - cell_stats_cube.add_aux_coord(coord) - # Add to cubelist - cell_stats_cubelist.append(cell_stats_cube) + # Populate cubelist + for cb_props in cube_properties.values(): + cell_stats_cube = iris.cube.Cube( + data=cb_props["data"], + long_name=cb_props["long_name"], + units=cb_props["units"], + dim_coords_and_dims=coords_and_dims, + ) + # Add other metadata from input cube + for coord_name in coords_to_copy: + coord = cube.coord(coord_name).copy() + # Check if this coord represents a dimension of data + dims = cube.coord_dims(coord) + if len(dims) > 1: + cell_stats_cube.add_aux_coord(coord, dims) + else: + cell_stats_cube.add_aux_coord(coord) + + # Add to cubelist + cell_stats_cubelist.append(cell_stats_cube) return cell_stats_cubelist diff --git a/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml index 300417442..2f21438eb 100755 --- a/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml +++ b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml @@ -1,28 +1,25 @@ category: Quick Look title: Example running cell_stats and plotting histograms description: | - Uses the feature.cell_stats operator to calculate cell size, mean cell values and max cell values for - identified features. + Uses the feature.cell_stats operator to calculate cell size, mean cell values and max cell values for + identified features. steps: - operator: read.read_cubes file_paths: $INPUT_PATHS - - operator: filters.filter_cubes + - operator: filters.filter_multiple_cubes constraint: operator: constraints.generate_var_constraint - varname: precipitation_flux + varname: $VARNAME - operator: feature.cell_stats - feature_size_unit: "feature_effective_radius" threshold: 3 - # Filter tracking cubelist to one of "feature_mean", "feature_max", or - # "feature_size/feature_effective_radius" (depending on feature_size_unit set above), - - operator: filters.filter_cubes + # Filter tracking cubelist to one of "feature_mean", "feature_max", or "feature_size" + - operator: filters.filter_multiple_cubes constraint: operator: constraints.generate_var_constraint - varname: feature_effective_radius - - - operator: plot.plot_histogram_series + varname: feature_size + - operator: plot.plot_histogram_series From 65764a747af5ada054c250c700aa7be2512f17de Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Tue, 19 May 2026 15:52:42 +0100 Subject: [PATCH 12/15] changed example cell_stats recipe to include MODEL_NAME input arg, fixed removal of feature_effective_radius, input cube attrs now copied to cell_stats_cube --- src/CSET/operators/feature.py | 16 +++++++++------- .../example_feature_cell_stats.yaml | 1 + 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index f9ce014d1..006cdc7d3 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -195,7 +195,6 @@ def cell_stats( threshold: float, under_threshold: bool = False, min_size: int = 4, - feature_size_unit: str = "feature_effective_radius", save_data: bool = False, ): """Identify features in each timestep and output statistics. @@ -211,11 +210,11 @@ def cell_stats( min_size: int, optional The minimum number of contiguous grid points required for a feature to be tracked. Default is 4. - feature_size_unit: str, optional - The unit to define feature size output. Options are "feature_effective_radius" - (the radius of a circle with the same area as the feature, in km) or " - feature_grid_points" (the number of grid points contained in the feature). - Default is "feature_effective_radius". + # feature_size_unit: str, optional + # The unit to define feature size output. Options are "feature_effective_radius" + # (the radius of a circle with the same area as the feature, in km) or " + # feature_grid_points" (the number of grid points contained in the feature). + # Default is "feature_effective_radius". save_data: bool, optional If set to True, all tracking data is saved to disk for further analysis (including csv and txt files containing feature properties that are not returned in output cubes). @@ -338,7 +337,7 @@ def cell_stats( cube_properties = { "feature_size": { "data": size_data, - "long_name": feature_size_unit, + "long_name": "feature_size", "units": "1", }, "feature_mean": { @@ -416,6 +415,9 @@ def cell_stats( else: cell_stats_cube.add_aux_coord(coord) + # Copy over attributes + cell_stats_cube.attributes = cube.attributes + # Add to cubelist cell_stats_cubelist.append(cell_stats_cube) diff --git a/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml index 2f21438eb..9acf8e381 100755 --- a/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml +++ b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml @@ -7,6 +7,7 @@ description: | steps: - operator: read.read_cubes file_paths: $INPUT_PATHS + model_names: $MODEL_NAME - operator: filters.filter_multiple_cubes constraint: From 7ed0592c66622800f4767c9a3f3887764057a0ca Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Wed, 27 May 2026 13:36:14 +0100 Subject: [PATCH 13/15] fixed bug for assigning coord dims --- src/CSET/operators/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index 006cdc7d3..6d9e80dea 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -410,7 +410,7 @@ def cell_stats( coord = cube.coord(coord_name).copy() # Check if this coord represents a dimension of data dims = cube.coord_dims(coord) - if len(dims) > 1: + if len(dims) > 0: cell_stats_cube.add_aux_coord(coord, dims) else: cell_stats_cube.add_aux_coord(coord) From 8554d53ecc02418edc6e2034896cad5f5300bce0 Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Tue, 16 Jun 2026 14:06:22 +0100 Subject: [PATCH 14/15] added feature effective radius calculation --- src/CSET/operators/feature.py | 119 ++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 47 deletions(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index 6d9e80dea..0eba0d72f 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -210,11 +210,6 @@ def cell_stats( min_size: int, optional The minimum number of contiguous grid points required for a feature to be tracked. Default is 4. - # feature_size_unit: str, optional - # The unit to define feature size output. Options are "feature_effective_radius" - # (the radius of a circle with the same area as the feature, in km) or " - # feature_grid_points" (the number of grid points contained in the feature). - # Default is "feature_effective_radius". save_data: bool, optional If set to True, all tracking data is saved to disk for further analysis (including csv and txt files containing feature properties that are not returned in output cubes). @@ -223,13 +218,15 @@ def cell_stats( Returns ------- cell_stats_cubes: iris.cube.CubeList - An iris CubeList containing "feature_size", "feature_mean", and "feature_max" cubes. + An iris CubeList containing "feature_size", "feature_effective_radius", "feature_mean", + and "feature_max" cubes. Notes ----- This operator uses the Simple-Track package with tracking disabled to identify features in each timestep and compile cell statistics. Outputs cubes containing feature size (number - of grid points), mean value within features, and maximum value within features. + of grid points), effective radius (in km), mean value within features, and maximum + value within features. Links ---------- @@ -308,6 +305,11 @@ def cell_stats( ) size_data[size_data == -100] = np.nan + # Get effective radius from feature size, using horizontal coordinate of input cube to estimate grid spacing + effective_radius_data = _get_effective_radius_from_feature_size( + size_data=size_data, cube_with_hzntl_coord=cube + ) + # Mean and max data are already float, so can be padded with NaNs directly. mean_data = np.array( [ @@ -322,17 +324,6 @@ def cell_stats( ] ) - # Create cubes - time_coord = cube.coord("time").copy() - feature_coord = iris.coords.DimCoord( - np.arange(arr_size), - long_name="feature_number", - var_name="feature_number", - units="1", - ) - coords = [time_coord, feature_coord] - coords_and_dims = [(coord, i) for i, coord in enumerate(coords)] - # Set cube properties cube_properties = { "feature_size": { @@ -346,36 +337,23 @@ def cell_stats( "units": 1, }, "feature_max": {"data": max_data, "long_name": "feature_max", "units": 1}, + "feature_effective_radius": { + "data": effective_radius_data, + "long_name": "feature_effective_radius", + "units": "km", + }, } - # TODO: need to consider better implementation which doesn't overwrite feature_size cube - # TODO: consider iris.analysis.cartography.area_weights() or hzntl_coord.is_regular() - # to check for regular spacing - # TODO: Check for hzntl_coords is empty list and handle elegantly - # if feature_size_unit == "feature_effective_radius": - # # Convert feature size to effective radius, assuming uniform grid - # # Guess coord representing horizontal grid (choose first available) - # hzntl_coord = [ - # coord - # for coord in cube.coords() - # if iris.util.guess_coord_axis(coord) in ["X", "Y"] - # ][0] - # logging.debug(f"Attempting to convert to effective radius using {hzntl_coord}") - # # Convert to km if possible - # # TODO: fall back to feature_size if this conversion fails - # # TODO: this will still add the name "feature_effective_radius" to the feature_size - # # cube, even though the data is actually pixel size. Figure out elegant solution. - # try: - # hzntl_coord.convert_units("km") - # except: - # pass - # # Naive grid spacing estimate, correct for regular grids, likely wildly - # # inaccruate for LFRic/irregular grids - # grid_spacing = np.abs(np.diff(hzntl_coord.points).mean()) - # # Convert feature size in grid points to effective radius in km - # size_data = np.sqrt(size_data * grid_spacing**2 / np.pi) - # # Reset cube properties - # cube_properties["feature_size"]["data"] = size_data - # cube_properties["feature_size"]["units"] = "km" + + # Create cubes + time_coord = cube.coord("time").copy() + feature_coord = iris.coords.DimCoord( + np.arange(arr_size), + long_name="feature_number", + var_name="feature_number", + units="1", + ) + coords = [time_coord, feature_coord] + coords_and_dims = [(coord, i) for i, coord in enumerate(coords)] # Get list of coords to copy from input cube to output cubes copyable_coord_names = [ @@ -422,3 +400,50 @@ def cell_stats( cell_stats_cubelist.append(cell_stats_cube) return cell_stats_cubelist + + +def _get_effective_radius_from_feature_size( + size_data: np.ndarray, cube_with_hzntl_coord: iris.cube.Cube +) -> np.ndarray: + """Convert feature size in grid points to effective radius in km. + + Parameters + ---------- + size_data: np.ndarray + An array containing "feature_size" data, in units of grid points. + cube_with_hzntl_coord: iris.cube.Cube + An iris cube containing a horizontal coordinate, which is used to + estimate the grid spacing for the effective radius calculation. + + Returns + ------- + effective_radii_data: np.ndarray + An array containing "feature_effective_radius" data, in units of km. + + Notes + ----- + This function assumes that the input cube has a horizontal coordinate system that is regular and + that the grid spacing can be estimated from the horizontal coordinates. The effective radius is + calculated as the radius of a circle with the same area as the feature size in grid points. + + """ + # Guess coord representing horizontal grid (choose first available) + hzntl_coord = [ + coord + for coord in cube_with_hzntl_coord.coords() + if iris.util.guess_coord_axis(coord) in ["X", "Y"] + ][0] + logging.debug(f"Attempting to convert to effective radius using {hzntl_coord}") + + # Check coordinate is regular, but only warn if not, this is a naive estimate + # and will be inaccurate for irregular grids + if not iris.util.is_regular(hzntl_coord): + logging.warning( + f"Horizontal coordinate {hzntl_coord} is not regular. " + "Effective radius calculation may be inaccurate." + ) + + grid_spacing = np.abs(np.mean(np.diff(hzntl_coord.points))) + effective_radii_data = np.sqrt(size_data * grid_spacing**2 / np.pi) + + return effective_radii_data From 963d808f87599d3ef74c0fad3132e223c1b18085 Mon Sep 17 00:00:00 2001 From: Adam Gainford Date: Tue, 16 Jun 2026 15:01:33 +0100 Subject: [PATCH 15/15] modularised cell_stats operator, added test --- src/CSET/operators/feature.py | 252 ++++++++++++++++++++------------ tests/operators/test_feature.py | 41 +++++- 2 files changed, 195 insertions(+), 98 deletions(-) diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index 0eba0d72f..31a36e9b1 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -20,6 +20,7 @@ import iris.cube import iris.util import numpy as np +from simpletrack.frame import Timeline from simpletrack.track import Tracker from CSET._common import iter_maybe @@ -274,62 +275,24 @@ def cell_stats( # Run tracking, returning Timeline object timeline = Tracker(tracker_config).run(cube_dict) - logging.debug(f"Tracking completed for {model_name}") - # Get feature data from each frame of data, append to list of lists - # before conversion to numpy array (which requires knowledge of max - # number of features before constructing array shape) - size_data, mean_data, max_data = [], [], [] - number_of_features = [] - for time in times_dt: - frame = timeline.get_frame(time) - features = frame.features - size_data.append([feature.get_size() for feature in features.values()]) - mean_data.append([feature.mean for feature in features.values()]) - max_data.append([feature.max for feature in features.values()]) - number_of_features.append(len(features)) - - # Pad data with NaNs to create arrays of consistent shape (max number of features across - # all timesteps) - arr_size = max(number_of_features) - - # Size data is integer, but we need to pad with NaNs (which is a float), so fill - # with invalid value first - size_data = np.array( - [ - np.pad(sizes, (0, arr_size - len(sizes)), constant_values=-100) - for sizes in size_data - ], - dtype=float, + # Get feature data from each frame of data + size_data, mean_data, max_data = _get_cell_stats_arrays_from_timeline( + timeline=timeline, expected_frame_times=times_dt ) - size_data[size_data == -100] = np.nan # Get effective radius from feature size, using horizontal coordinate of input cube to estimate grid spacing effective_radius_data = _get_effective_radius_from_feature_size( size_data=size_data, cube_with_hzntl_coord=cube ) - # Mean and max data are already float, so can be padded with NaNs directly. - mean_data = np.array( - [ - np.pad(means, (0, arr_size - len(means)), constant_values=np.nan) - for means in mean_data - ] - ) - max_data = np.array( - [ - np.pad(maxs, (0, arr_size - len(maxs)), constant_values=np.nan) - for maxs in max_data - ] - ) - - # Set cube properties + # Set output cube properties cube_properties = { "feature_size": { "data": size_data, "long_name": "feature_size", - "units": "1", + "units": 1, }, "feature_mean": { "data": mean_data, @@ -344,62 +307,75 @@ def cell_stats( }, } - # Create cubes - time_coord = cube.coord("time").copy() - feature_coord = iris.coords.DimCoord( - np.arange(arr_size), - long_name="feature_number", - var_name="feature_number", - units="1", + # Create cubes, add to existing cubelist + cell_stats_cubelist.extend( + _add_cell_stats_data_to_cubes( + data_and_metadata_dict=cube_properties, template_cube=cube + ) ) - coords = [time_coord, feature_coord] - coords_and_dims = [(coord, i) for i, coord in enumerate(coords)] - - # Get list of coords to copy from input cube to output cubes - copyable_coord_names = [ - "realization", - "hour", - "forecast_period", - "forecast_reference_time", - "model_name", - "cset_comparison_base", + + return cell_stats_cubelist + + +def _get_cell_stats_arrays_from_timeline( + timeline: Timeline, expected_frame_times: list +) -> list[np.ndarray]: + """Extract cell statistics data from a Simple-Track Timeline object. + + Parameters + ---------- + timeline: Timeline + A Simple-Track Timeline object containing tracked features. + + Returns + ------- + size_data: np.ndarray + A numpy array containing the size of each feature in grid points. + mean_data: np.ndarray + A numpy array containing the mean value of each feature. + max_data: np.ndarray + A numpy array containing the maximum value of each feature. + """ + size_data, mean_data, max_data = [], [], [] + number_of_features = [] + for time in expected_frame_times: + frame = timeline.get_frame(time) + features = frame.features + size_data.append([feature.get_size() for feature in features.values()]) + mean_data.append([feature.mean for feature in features.values()]) + max_data.append([feature.max for feature in features.values()]) + number_of_features.append(len(features)) + + # Pad data with NaNs to create arrays of consistent shape (max number of features across + # all timesteps) + arr_size = max(number_of_features) + + # Size data is integer, but we need to pad with NaNs (which is a float), so fill + # with invalid value first + size_data = np.array( + [ + np.pad(sizes, (0, arr_size - len(sizes)), constant_values=-100) + for sizes in size_data + ], + dtype=float, + ) + size_data[size_data == -100] = np.nan + + # Mean and max data are already float, so can be padded with NaNs directly. + mean_data = np.array( + [ + np.pad(means, (0, arr_size - len(means)), constant_values=np.nan) + for means in mean_data ] - input_cube_coord_names = [] - for coord in cube.coords(): - input_cube_coord_names.append(coord.standard_name) - input_cube_coord_names.append(coord.long_name) - - coords_to_copy = [ - coord_name - for coord_name in copyable_coord_names - if coord_name in input_cube_coord_names + ) + max_data = np.array( + [ + np.pad(maxs, (0, arr_size - len(maxs)), constant_values=np.nan) + for maxs in max_data ] + ) - # Populate cubelist - for cb_props in cube_properties.values(): - cell_stats_cube = iris.cube.Cube( - data=cb_props["data"], - long_name=cb_props["long_name"], - units=cb_props["units"], - dim_coords_and_dims=coords_and_dims, - ) - # Add other metadata from input cube - for coord_name in coords_to_copy: - coord = cube.coord(coord_name).copy() - # Check if this coord represents a dimension of data - dims = cube.coord_dims(coord) - if len(dims) > 0: - cell_stats_cube.add_aux_coord(coord, dims) - else: - cell_stats_cube.add_aux_coord(coord) - - # Copy over attributes - cell_stats_cube.attributes = cube.attributes - - # Add to cubelist - cell_stats_cubelist.append(cell_stats_cube) - - return cell_stats_cubelist + return size_data, mean_data, max_data def _get_effective_radius_from_feature_size( @@ -447,3 +423,89 @@ def _get_effective_radius_from_feature_size( effective_radii_data = np.sqrt(size_data * grid_spacing**2 / np.pi) return effective_radii_data + + +def _add_cell_stats_data_to_cubes( + data_and_metadata_dict: dict, template_cube: iris.cube.Cube +) -> iris.cube.CubeList: + """Add data to cubes, using template cube for metadata. + + Parameters + ---------- + data_and_metadata_dict: dict + A dictionary containing data and metadata for each cube to be created. + The keys are the long names of the cubes, and the values are dictionaries + containing the data and units for each cube. + + template_cube: iris.cube.Cube + An iris cube to use as a template for the new cubes. The new cubes will + have the same attributes as the template cube. + + Returns + ------- + cubelist: iris.cube.CubeList + A list of iris cubes containing the added data. + + """ + cubelist = iris.cube.CubeList() + + # Construct coordinates for new cubes + time_coord = template_cube.coord("time").copy() + # To construct feature coordinate, look at the size of dimension 1 for each data + arr_size = max( + [data_and_metadata_dict[cb]["data"].shape[1] for cb in data_and_metadata_dict] + ) + feature_coord = iris.coords.DimCoord( + np.arange(arr_size), + long_name="feature_number", + var_name="feature_number", + units="1", + ) + coords = [time_coord, feature_coord] + coords_and_dims = [(coord, i) for i, coord in enumerate(coords)] + + # Get list of coords to copy from input cube to output cubes + copyable_coord_names = [ + "realization", + "hour", + "forecast_period", + "forecast_reference_time", + "model_name", + "cset_comparison_base", + ] + input_cube_coord_names = [] + for coord in template_cube.coords(): + input_cube_coord_names.append(coord.standard_name) + input_cube_coord_names.append(coord.long_name) + + coords_to_copy = [ + coord_name + for coord_name in copyable_coord_names + if coord_name in input_cube_coord_names + ] + + # Populate cubelist + for cb_props in data_and_metadata_dict.values(): + cell_stats_cube = iris.cube.Cube( + data=cb_props["data"], + long_name=cb_props["long_name"], + units=cb_props["units"], + dim_coords_and_dims=coords_and_dims, + ) + # Add other metadata from input cube + for coord_name in coords_to_copy: + coord = template_cube.coord(coord_name).copy() + # Check if this coord represents a dimension of data + dims = template_cube.coord_dims(coord) + if len(dims) > 0: + cell_stats_cube.add_aux_coord(coord, dims) + else: + cell_stats_cube.add_aux_coord(coord) + + # Copy over attributes + cell_stats_cube.attributes = template_cube.attributes + + # Add to cubelist + cubelist.append(cell_stats_cube) + + return cubelist diff --git a/tests/operators/test_feature.py b/tests/operators/test_feature.py index 4d2565a7c..0ea13ebdf 100644 --- a/tests/operators/test_feature.py +++ b/tests/operators/test_feature.py @@ -30,9 +30,9 @@ def feature_cube() -> iris.cube.Cube: """Set up three timesteps of data and place into cube.""" data_arr = np.zeros((3, 10, 10)) - data_arr[0, 2:6, 2:6] = 1 - data_arr[1, 3:7, 3:7] = 1 - data_arr[2, 4:8, 4:8] = 1 + data_arr[0, 2:6, 2:6] = 5 + data_arr[1, 3:7, 3:7] = 10 + data_arr[2, 4:8, 4:8] = 20 time_units = cf_units.Unit("days since 2000-01-01 00:00:00", calendar="gregorian") time_start = dt.datetime(2010, 1, 1, 0, 0, 0) @@ -127,3 +127,38 @@ def test_save_data(feature_cube, tmp_path) -> None: # Check expected csv file is created in output directory expected_file = f"{output_directory}/frame_20100101_0000.csv" assert os.path.isfile(expected_file) + + +def test_cell_stats_operator(feature_cube): + """ + Test the cell_stats operator returns expected size, mean, and max values. + + The expected values are based on the feature_cube data and the threshold of 0.5. + """ + threshold = 0.5 + min_size = 1 + cubelist = feature.cell_stats( + cubes=feature_cube, threshold=threshold, min_size=min_size + ) + + # Extract data from cubelist, squeeze since there is only one feature per timestep + # in this test case + size_data = np.squeeze(cubelist.extract_cube("feature_size").data) + mean_data = np.squeeze(cubelist.extract_cube("feature_mean").data) + max_data = np.squeeze(cubelist.extract_cube("feature_max").data) + effective_radius_data = np.squeeze( + cubelist.extract_cube("feature_effective_radius").data + ) + + # Expected values based on the feature_cube data + expected_size_data = np.array([16, 16, 16]) # Each feature is a 4x4 square + expected_mean_data = np.array([5.0, 10.0, 20.0]) # Mean values of each feature + expected_max_data = np.array([5.0, 10.0, 20.0]) # Max values of each feature + + grid_spacing = 10 # Assuming grid spacing is 10 meters from test setup + expected_radius_data = np.sqrt(expected_size_data * grid_spacing**2 / np.pi) + + np.testing.assert_array_equal(size_data, expected_size_data) + np.testing.assert_array_equal(mean_data, expected_mean_data) + np.testing.assert_array_equal(max_data, expected_max_data) + np.testing.assert_array_equal(effective_radius_data, expected_radius_data)