diff --git a/src/CSET/operators/feature.py b/src/CSET/operators/feature.py index a3cfe0c72..31a36e9b1 100755 --- a/src/CSET/operators/feature.py +++ b/src/CSET/operators/feature.py @@ -17,9 +17,14 @@ import os import iris +import iris.cube +import iris.util import numpy as np +from simpletrack.frame import Timeline from simpletrack.track import Tracker +from CSET._common import iter_maybe + def track( cube: iris.cube.Cube, @@ -65,7 +70,7 @@ def track( Returns ------- tracking_cubes: iris.cube.CubeList - A list of iris cubes containing tracking data, including feauture ID, lifetime, + A list of iris cubes containing tracking data, including feature ID, lifetime, and locations of initiating features. Notes @@ -90,7 +95,7 @@ def track( "feature_init": A 2D binary field indicating the location of newly initiated features at each timestep. These features are identified as having a lifetime of 1 AND have initiated sufficiently - far from other, existing features that they are not considered to have spawed from them. + far from other, existing features that they are not considered to have spawned from them. Links ---------- @@ -184,3 +189,323 @@ def track( tracking_cubelist.append(tracking_cube) return tracking_cubelist + + +def cell_stats( + cubes: iris.cube.Cube | iris.cube.CubeList, + threshold: float, + under_threshold: bool = False, + min_size: int = 4, + save_data: bool = False, +): + """Identify features in each timestep and output statistics. + + Parameters + ---------- + threshold: float + The threshold value for feature detection. + under_threshold: bool, optional + If set to True, features are identified where the data is below the threshold. + If set to False, features are identified where the data is above the threshold. + Default is False. + min_size: int, optional + The minimum number of contiguous grid points required for a feature to be tracked. + Default is 4. + save_data: bool, optional + If set to True, all tracking data is saved to disk for further analysis (including csv + and txt files containing feature properties that are not returned in output cubes). + Default is False. + + Returns + ------- + cell_stats_cubes: iris.cube.CubeList + An iris CubeList containing "feature_size", "feature_effective_radius", "feature_mean", + and "feature_max" cubes. + + Notes + ----- + This operator uses the Simple-Track package with tracking disabled to identify features + in each timestep and compile cell statistics. Outputs cubes containing feature size (number + of grid points), effective radius (in km), mean value within features, and maximum + value within features. + + Links + ---------- + .. https://github.com/ParaChute-UK/simple-track + + Examples + -------- + >>> cell_stats_cubes = feature.cell_stats(threshold=2) + >>> feature_size_cube = cell_stats_cubes.extract_cube("feature_size") + >>> plt.hist(feature_size_cube[-1]) + >>> plt.show() + + """ + # Check inputs + cubes = iter_maybe(cubes) + cell_stats_cubelist = iris.cube.CubeList() + + # Run tracking on all input data + for cube in cubes: + model_name = cube.attributes.get("model_name", None) + # Setup config + tracker_config = { + "FEATURE": { + "threshold": threshold, + "under_threshold": under_threshold, + "min_size": min_size, + }, + "OUTPUT": { + "save_data": save_data, + "experiment_name": "feature_tracking", + "path": f"{os.getcwd()}/{model_name}/cell-stats_data", + "skip_tracking": True, + }, + } + logging.debug(f"Tracker config: {tracker_config}") + + # Get cube data into a dict to pass to Tracker + times = cube.coord("time").points + time_units = cube.coord("time").units + times_dt = [time_units.num2pydate(t) for t in times] + cube_dict = { + time: cube_slice.data + for time, cube_slice in zip(times_dt, cube.slices_over("time"), strict=True) + } + + # Run tracking, returning Timeline object + timeline = Tracker(tracker_config).run(cube_dict) + logging.debug(f"Tracking completed for {model_name}") + + # Get feature data from each frame of data + size_data, mean_data, max_data = _get_cell_stats_arrays_from_timeline( + timeline=timeline, expected_frame_times=times_dt + ) + + # Get effective radius from feature size, using horizontal coordinate of input cube to estimate grid spacing + effective_radius_data = _get_effective_radius_from_feature_size( + size_data=size_data, cube_with_hzntl_coord=cube + ) + + # Set output cube properties + cube_properties = { + "feature_size": { + "data": size_data, + "long_name": "feature_size", + "units": 1, + }, + "feature_mean": { + "data": mean_data, + "long_name": "feature_mean", + "units": 1, + }, + "feature_max": {"data": max_data, "long_name": "feature_max", "units": 1}, + "feature_effective_radius": { + "data": effective_radius_data, + "long_name": "feature_effective_radius", + "units": "km", + }, + } + + # Create cubes, add to existing cubelist + cell_stats_cubelist.extend( + _add_cell_stats_data_to_cubes( + data_and_metadata_dict=cube_properties, template_cube=cube + ) + ) + + return cell_stats_cubelist + + +def _get_cell_stats_arrays_from_timeline( + timeline: Timeline, expected_frame_times: list +) -> list[np.ndarray]: + """Extract cell statistics data from a Simple-Track Timeline object. + + Parameters + ---------- + timeline: Timeline + A Simple-Track Timeline object containing tracked features. + + Returns + ------- + size_data: np.ndarray + A numpy array containing the size of each feature in grid points. + mean_data: np.ndarray + A numpy array containing the mean value of each feature. + max_data: np.ndarray + A numpy array containing the maximum value of each feature. + """ + size_data, mean_data, max_data = [], [], [] + number_of_features = [] + for time in expected_frame_times: + frame = timeline.get_frame(time) + features = frame.features + size_data.append([feature.get_size() for feature in features.values()]) + mean_data.append([feature.mean for feature in features.values()]) + max_data.append([feature.max for feature in features.values()]) + number_of_features.append(len(features)) + + # Pad data with NaNs to create arrays of consistent shape (max number of features across + # all timesteps) + arr_size = max(number_of_features) + + # Size data is integer, but we need to pad with NaNs (which is a float), so fill + # with invalid value first + size_data = np.array( + [ + np.pad(sizes, (0, arr_size - len(sizes)), constant_values=-100) + for sizes in size_data + ], + dtype=float, + ) + size_data[size_data == -100] = np.nan + + # Mean and max data are already float, so can be padded with NaNs directly. + mean_data = np.array( + [ + np.pad(means, (0, arr_size - len(means)), constant_values=np.nan) + for means in mean_data + ] + ) + max_data = np.array( + [ + np.pad(maxs, (0, arr_size - len(maxs)), constant_values=np.nan) + for maxs in max_data + ] + ) + + return size_data, mean_data, max_data + + +def _get_effective_radius_from_feature_size( + size_data: np.ndarray, cube_with_hzntl_coord: iris.cube.Cube +) -> np.ndarray: + """Convert feature size in grid points to effective radius in km. + + Parameters + ---------- + size_data: np.ndarray + An array containing "feature_size" data, in units of grid points. + cube_with_hzntl_coord: iris.cube.Cube + An iris cube containing a horizontal coordinate, which is used to + estimate the grid spacing for the effective radius calculation. + + Returns + ------- + effective_radii_data: np.ndarray + An array containing "feature_effective_radius" data, in units of km. + + Notes + ----- + This function assumes that the input cube has a horizontal coordinate system that is regular and + that the grid spacing can be estimated from the horizontal coordinates. The effective radius is + calculated as the radius of a circle with the same area as the feature size in grid points. + + """ + # Guess coord representing horizontal grid (choose first available) + hzntl_coord = [ + coord + for coord in cube_with_hzntl_coord.coords() + if iris.util.guess_coord_axis(coord) in ["X", "Y"] + ][0] + logging.debug(f"Attempting to convert to effective radius using {hzntl_coord}") + + # Check coordinate is regular, but only warn if not, this is a naive estimate + # and will be inaccurate for irregular grids + if not iris.util.is_regular(hzntl_coord): + logging.warning( + f"Horizontal coordinate {hzntl_coord} is not regular. " + "Effective radius calculation may be inaccurate." + ) + + grid_spacing = np.abs(np.mean(np.diff(hzntl_coord.points))) + effective_radii_data = np.sqrt(size_data * grid_spacing**2 / np.pi) + + return effective_radii_data + + +def _add_cell_stats_data_to_cubes( + data_and_metadata_dict: dict, template_cube: iris.cube.Cube +) -> iris.cube.CubeList: + """Add data to cubes, using template cube for metadata. + + Parameters + ---------- + data_and_metadata_dict: dict + A dictionary containing data and metadata for each cube to be created. + The keys are the long names of the cubes, and the values are dictionaries + containing the data and units for each cube. + + template_cube: iris.cube.Cube + An iris cube to use as a template for the new cubes. The new cubes will + have the same attributes as the template cube. + + Returns + ------- + cubelist: iris.cube.CubeList + A list of iris cubes containing the added data. + + """ + cubelist = iris.cube.CubeList() + + # Construct coordinates for new cubes + time_coord = template_cube.coord("time").copy() + # To construct feature coordinate, look at the size of dimension 1 for each data + arr_size = max( + [data_and_metadata_dict[cb]["data"].shape[1] for cb in data_and_metadata_dict] + ) + feature_coord = iris.coords.DimCoord( + np.arange(arr_size), + long_name="feature_number", + var_name="feature_number", + units="1", + ) + coords = [time_coord, feature_coord] + coords_and_dims = [(coord, i) for i, coord in enumerate(coords)] + + # Get list of coords to copy from input cube to output cubes + copyable_coord_names = [ + "realization", + "hour", + "forecast_period", + "forecast_reference_time", + "model_name", + "cset_comparison_base", + ] + input_cube_coord_names = [] + for coord in template_cube.coords(): + input_cube_coord_names.append(coord.standard_name) + input_cube_coord_names.append(coord.long_name) + + coords_to_copy = [ + coord_name + for coord_name in copyable_coord_names + if coord_name in input_cube_coord_names + ] + + # Populate cubelist + for cb_props in data_and_metadata_dict.values(): + cell_stats_cube = iris.cube.Cube( + data=cb_props["data"], + long_name=cb_props["long_name"], + units=cb_props["units"], + dim_coords_and_dims=coords_and_dims, + ) + # Add other metadata from input cube + for coord_name in coords_to_copy: + coord = template_cube.coord(coord_name).copy() + # Check if this coord represents a dimension of data + dims = template_cube.coord_dims(coord) + if len(dims) > 0: + cell_stats_cube.add_aux_coord(coord, dims) + else: + cell_stats_cube.add_aux_coord(coord) + + # Copy over attributes + cell_stats_cube.attributes = template_cube.attributes + + # Add to cubelist + cubelist.append(cell_stats_cube) + + return cubelist diff --git a/src/CSET/operators/plot.py b/src/CSET/operators/plot.py index 08665447b..19452ad8c 100644 --- a/src/CSET/operators/plot.py +++ b/src/CSET/operators/plot.py @@ -1272,13 +1272,16 @@ def _plot_and_save_histogram_series( # Set default that histograms will produce probability density function # at each bin (integral over range sums to 1). - density = True + if "feature" in cubes[0].long_name: + density = False + else: + density = True for cube in iter_maybe(cubes): # Easier to check title (where var name originates) # than seeing if long names exist etc. # Exception case, where distribution better fits log scales/bins. - if "surface_microphysical" in title: + if "surface_microphysical" in title or "feature" in cube.long_name: if "amount" in title: # Compute histogram following Klingaman et al. (2017): ASoP bin2 = np.exp(np.log(0.02) + 0.1 * np.linspace(0, 99, 100)) @@ -1304,6 +1307,10 @@ def _plot_and_save_histogram_series( np.max(bins), ) + if "feature" in cube.long_name: + ax.set_yscale("log") + ax.set_xscale("log") + # Reshape cube data into a single array to allow for a single histogram. # Otherwise we plot xdim histograms stacked. cube_data_1d = (cube.data).flatten() @@ -1336,6 +1343,8 @@ def _plot_and_save_histogram_series( ax.set_ylabel( f"Contribution to mean ({iter_maybe(cubes)[0].units})", fontsize=14 ) + if "feature" in cubes[0].long_name: + ax.set_ylabel("Frequency", fontsize=14) ax.set_xlim(vmin, vmax) ax.tick_params(axis="both", labelsize=12) diff --git a/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml new file mode 100755 index 000000000..9acf8e381 --- /dev/null +++ b/src/CSET/recipes/example_recipes/example_feature_cell_stats.yaml @@ -0,0 +1,26 @@ +category: Quick Look +title: Example running cell_stats and plotting histograms +description: | + Uses the feature.cell_stats operator to calculate cell size, mean cell values and max cell values for + identified features. + +steps: + - operator: read.read_cubes + file_paths: $INPUT_PATHS + model_names: $MODEL_NAME + + - operator: filters.filter_multiple_cubes + constraint: + operator: constraints.generate_var_constraint + varname: $VARNAME + + - operator: feature.cell_stats + threshold: 3 + + # Filter tracking cubelist to one of "feature_mean", "feature_max", or "feature_size" + - operator: filters.filter_multiple_cubes + constraint: + operator: constraints.generate_var_constraint + varname: feature_size + + - operator: plot.plot_histogram_series diff --git a/tests/operators/test_feature.py b/tests/operators/test_feature.py index 4d2565a7c..0ea13ebdf 100644 --- a/tests/operators/test_feature.py +++ b/tests/operators/test_feature.py @@ -30,9 +30,9 @@ def feature_cube() -> iris.cube.Cube: """Set up three timesteps of data and place into cube.""" data_arr = np.zeros((3, 10, 10)) - data_arr[0, 2:6, 2:6] = 1 - data_arr[1, 3:7, 3:7] = 1 - data_arr[2, 4:8, 4:8] = 1 + data_arr[0, 2:6, 2:6] = 5 + data_arr[1, 3:7, 3:7] = 10 + data_arr[2, 4:8, 4:8] = 20 time_units = cf_units.Unit("days since 2000-01-01 00:00:00", calendar="gregorian") time_start = dt.datetime(2010, 1, 1, 0, 0, 0) @@ -127,3 +127,38 @@ def test_save_data(feature_cube, tmp_path) -> None: # Check expected csv file is created in output directory expected_file = f"{output_directory}/frame_20100101_0000.csv" assert os.path.isfile(expected_file) + + +def test_cell_stats_operator(feature_cube): + """ + Test the cell_stats operator returns expected size, mean, and max values. + + The expected values are based on the feature_cube data and the threshold of 0.5. + """ + threshold = 0.5 + min_size = 1 + cubelist = feature.cell_stats( + cubes=feature_cube, threshold=threshold, min_size=min_size + ) + + # Extract data from cubelist, squeeze since there is only one feature per timestep + # in this test case + size_data = np.squeeze(cubelist.extract_cube("feature_size").data) + mean_data = np.squeeze(cubelist.extract_cube("feature_mean").data) + max_data = np.squeeze(cubelist.extract_cube("feature_max").data) + effective_radius_data = np.squeeze( + cubelist.extract_cube("feature_effective_radius").data + ) + + # Expected values based on the feature_cube data + expected_size_data = np.array([16, 16, 16]) # Each feature is a 4x4 square + expected_mean_data = np.array([5.0, 10.0, 20.0]) # Mean values of each feature + expected_max_data = np.array([5.0, 10.0, 20.0]) # Max values of each feature + + grid_spacing = 10 # Assuming grid spacing is 10 meters from test setup + expected_radius_data = np.sqrt(expected_size_data * grid_spacing**2 / np.pi) + + np.testing.assert_array_equal(size_data, expected_size_data) + np.testing.assert_array_equal(mean_data, expected_mean_data) + np.testing.assert_array_equal(max_data, expected_max_data) + np.testing.assert_array_equal(effective_radius_data, expected_radius_data)