Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
329 changes: 327 additions & 2 deletions src/CSET/operators/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@
import os

import iris
import iris.cube
import iris.util
import numpy as np
from simpletrack.frame import Timeline
from simpletrack.track import Tracker

from CSET._common import iter_maybe


def track(
cube: iris.cube.Cube,
Expand Down Expand Up @@ -65,7 +70,7 @@ def track(
Returns
-------
tracking_cubes: iris.cube.CubeList
A list of iris cubes containing tracking data, including feauture ID, lifetime,
A list of iris cubes containing tracking data, including feature ID, lifetime,
and locations of initiating features.

Notes
Expand All @@ -90,7 +95,7 @@ def track(
"feature_init":
A 2D binary field indicating the location of newly initiated features at each timestep.
These features are identified as having a lifetime of 1 AND have initiated sufficiently
far from other, existing features that they are not considered to have spawed from them.
far from other, existing features that they are not considered to have spawned from them.

Links
----------
Expand Down Expand Up @@ -184,3 +189,323 @@ def track(
tracking_cubelist.append(tracking_cube)

return tracking_cubelist


def cell_stats(
cubes: iris.cube.Cube | iris.cube.CubeList,
threshold: float,
under_threshold: bool = False,
min_size: int = 4,
save_data: bool = False,
):
"""Identify features in each timestep and output statistics.

Parameters
----------
threshold: float
The threshold value for feature detection.
under_threshold: bool, optional
If set to True, features are identified where the data is below the threshold.
If set to False, features are identified where the data is above the threshold.
Default is False.
min_size: int, optional
The minimum number of contiguous grid points required for a feature to be tracked.
Default is 4.
save_data: bool, optional
If set to True, all tracking data is saved to disk for further analysis (including csv
and txt files containing feature properties that are not returned in output cubes).
Default is False.

Returns
-------
cell_stats_cubes: iris.cube.CubeList
An iris CubeList containing "feature_size", "feature_effective_radius", "feature_mean",
and "feature_max" cubes.

Notes
-----
This operator uses the Simple-Track package with tracking disabled to identify features
in each timestep and compile cell statistics. Outputs cubes containing feature size (number
of grid points), effective radius (in km), mean value within features, and maximum
value within features.

Links
----------
.. https://github.com/ParaChute-UK/simple-track

Examples
--------
>>> cell_stats_cubes = feature.cell_stats(threshold=2)
>>> feature_size_cube = cell_stats_cubes.extract_cube("feature_size")
>>> plt.hist(feature_size_cube[-1])
>>> plt.show()

"""
# Check inputs
cubes = iter_maybe(cubes)
cell_stats_cubelist = iris.cube.CubeList()

# Run tracking on all input data
for cube in cubes:
model_name = cube.attributes.get("model_name", None)
# Setup config
tracker_config = {
"FEATURE": {
"threshold": threshold,
"under_threshold": under_threshold,
"min_size": min_size,
},
"OUTPUT": {
"save_data": save_data,
"experiment_name": "feature_tracking",
"path": f"{os.getcwd()}/{model_name}/cell-stats_data",
"skip_tracking": True,
},
}
logging.debug(f"Tracker config: {tracker_config}")

# Get cube data into a dict to pass to Tracker
times = cube.coord("time").points
time_units = cube.coord("time").units
times_dt = [time_units.num2pydate(t) for t in times]
cube_dict = {
time: cube_slice.data
for time, cube_slice in zip(times_dt, cube.slices_over("time"), strict=True)
}

# Run tracking, returning Timeline object
timeline = Tracker(tracker_config).run(cube_dict)
logging.debug(f"Tracking completed for {model_name}")

# Get feature data from each frame of data
size_data, mean_data, max_data = _get_cell_stats_arrays_from_timeline(
timeline=timeline, expected_frame_times=times_dt
)

# Get effective radius from feature size, using horizontal coordinate of input cube to estimate grid spacing
effective_radius_data = _get_effective_radius_from_feature_size(
size_data=size_data, cube_with_hzntl_coord=cube
)

# Set output cube properties
cube_properties = {
"feature_size": {
"data": size_data,
"long_name": "feature_size",
"units": 1,
},
"feature_mean": {
"data": mean_data,
"long_name": "feature_mean",
"units": 1,
},
"feature_max": {"data": max_data, "long_name": "feature_max", "units": 1},
"feature_effective_radius": {
"data": effective_radius_data,
"long_name": "feature_effective_radius",
"units": "km",
},
}

# Create cubes, add to existing cubelist
cell_stats_cubelist.extend(
_add_cell_stats_data_to_cubes(
data_and_metadata_dict=cube_properties, template_cube=cube
)
)

return cell_stats_cubelist


def _get_cell_stats_arrays_from_timeline(
timeline: Timeline, expected_frame_times: list
) -> list[np.ndarray]:
"""Extract cell statistics data from a Simple-Track Timeline object.

Parameters
----------
timeline: Timeline
A Simple-Track Timeline object containing tracked features.

Returns
-------
size_data: np.ndarray
A numpy array containing the size of each feature in grid points.
mean_data: np.ndarray
A numpy array containing the mean value of each feature.
max_data: np.ndarray
A numpy array containing the maximum value of each feature.
"""
size_data, mean_data, max_data = [], [], []
number_of_features = []
for time in expected_frame_times:
frame = timeline.get_frame(time)
features = frame.features
size_data.append([feature.get_size() for feature in features.values()])
mean_data.append([feature.mean for feature in features.values()])
max_data.append([feature.max for feature in features.values()])
number_of_features.append(len(features))

# Pad data with NaNs to create arrays of consistent shape (max number of features across
# all timesteps)
arr_size = max(number_of_features)

# Size data is integer, but we need to pad with NaNs (which is a float), so fill
# with invalid value first
size_data = np.array(
[
np.pad(sizes, (0, arr_size - len(sizes)), constant_values=-100)
for sizes in size_data
],
dtype=float,
)
size_data[size_data == -100] = np.nan

# Mean and max data are already float, so can be padded with NaNs directly.
mean_data = np.array(
[
np.pad(means, (0, arr_size - len(means)), constant_values=np.nan)
for means in mean_data
]
)
max_data = np.array(
[
np.pad(maxs, (0, arr_size - len(maxs)), constant_values=np.nan)
for maxs in max_data
]
)

return size_data, mean_data, max_data


def _get_effective_radius_from_feature_size(
size_data: np.ndarray, cube_with_hzntl_coord: iris.cube.Cube
) -> np.ndarray:
"""Convert feature size in grid points to effective radius in km.

Parameters
----------
size_data: np.ndarray
An array containing "feature_size" data, in units of grid points.
cube_with_hzntl_coord: iris.cube.Cube
An iris cube containing a horizontal coordinate, which is used to
estimate the grid spacing for the effective radius calculation.

Returns
-------
effective_radii_data: np.ndarray
An array containing "feature_effective_radius" data, in units of km.

Notes
-----
This function assumes that the input cube has a horizontal coordinate system that is regular and
that the grid spacing can be estimated from the horizontal coordinates. The effective radius is
calculated as the radius of a circle with the same area as the feature size in grid points.

"""
# Guess coord representing horizontal grid (choose first available)
hzntl_coord = [
coord
for coord in cube_with_hzntl_coord.coords()
if iris.util.guess_coord_axis(coord) in ["X", "Y"]
][0]
logging.debug(f"Attempting to convert to effective radius using {hzntl_coord}")

# Check coordinate is regular, but only warn if not, this is a naive estimate
# and will be inaccurate for irregular grids
if not iris.util.is_regular(hzntl_coord):
logging.warning(
f"Horizontal coordinate {hzntl_coord} is not regular. "
"Effective radius calculation may be inaccurate."
)

grid_spacing = np.abs(np.mean(np.diff(hzntl_coord.points)))
effective_radii_data = np.sqrt(size_data * grid_spacing**2 / np.pi)

return effective_radii_data


def _add_cell_stats_data_to_cubes(
data_and_metadata_dict: dict, template_cube: iris.cube.Cube
) -> iris.cube.CubeList:
"""Add data to cubes, using template cube for metadata.

Parameters
----------
data_and_metadata_dict: dict
A dictionary containing data and metadata for each cube to be created.
The keys are the long names of the cubes, and the values are dictionaries
containing the data and units for each cube.

template_cube: iris.cube.Cube
An iris cube to use as a template for the new cubes. The new cubes will
have the same attributes as the template cube.

Returns
-------
cubelist: iris.cube.CubeList
A list of iris cubes containing the added data.

"""
cubelist = iris.cube.CubeList()

# Construct coordinates for new cubes
time_coord = template_cube.coord("time").copy()
# To construct feature coordinate, look at the size of dimension 1 for each data
arr_size = max(
[data_and_metadata_dict[cb]["data"].shape[1] for cb in data_and_metadata_dict]
)
feature_coord = iris.coords.DimCoord(
np.arange(arr_size),
long_name="feature_number",
var_name="feature_number",
units="1",
)
coords = [time_coord, feature_coord]
coords_and_dims = [(coord, i) for i, coord in enumerate(coords)]

# Get list of coords to copy from input cube to output cubes
copyable_coord_names = [
"realization",
"hour",
"forecast_period",
"forecast_reference_time",
"model_name",
"cset_comparison_base",
]
input_cube_coord_names = []
for coord in template_cube.coords():
input_cube_coord_names.append(coord.standard_name)
input_cube_coord_names.append(coord.long_name)

coords_to_copy = [
coord_name
for coord_name in copyable_coord_names
if coord_name in input_cube_coord_names
]

# Populate cubelist
for cb_props in data_and_metadata_dict.values():
cell_stats_cube = iris.cube.Cube(
data=cb_props["data"],
long_name=cb_props["long_name"],
units=cb_props["units"],
dim_coords_and_dims=coords_and_dims,
)
# Add other metadata from input cube
for coord_name in coords_to_copy:
coord = template_cube.coord(coord_name).copy()
# Check if this coord represents a dimension of data
dims = template_cube.coord_dims(coord)
if len(dims) > 0:
cell_stats_cube.add_aux_coord(coord, dims)
else:
cell_stats_cube.add_aux_coord(coord)

# Copy over attributes
cell_stats_cube.attributes = template_cube.attributes

# Add to cubelist
cubelist.append(cell_stats_cube)

return cubelist
13 changes: 11 additions & 2 deletions src/CSET/operators/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1272,13 +1272,16 @@ def _plot_and_save_histogram_series(

# Set default that histograms will produce probability density function
# at each bin (integral over range sums to 1).
density = True
if "feature" in cubes[0].long_name:
density = False
else:
density = True

for cube in iter_maybe(cubes):
# Easier to check title (where var name originates)
# than seeing if long names exist etc.
# Exception case, where distribution better fits log scales/bins.
if "surface_microphysical" in title:
if "surface_microphysical" in title or "feature" in cube.long_name:
if "amount" in title:
# Compute histogram following Klingaman et al. (2017): ASoP
bin2 = np.exp(np.log(0.02) + 0.1 * np.linspace(0, 99, 100))
Expand All @@ -1304,6 +1307,10 @@ def _plot_and_save_histogram_series(
np.max(bins),
)

if "feature" in cube.long_name:
ax.set_yscale("log")
ax.set_xscale("log")

# Reshape cube data into a single array to allow for a single histogram.
# Otherwise we plot xdim histograms stacked.
cube_data_1d = (cube.data).flatten()
Expand Down Expand Up @@ -1336,6 +1343,8 @@ def _plot_and_save_histogram_series(
ax.set_ylabel(
f"Contribution to mean ({iter_maybe(cubes)[0].units})", fontsize=14
)
if "feature" in cubes[0].long_name:
ax.set_ylabel("Frequency", fontsize=14)
ax.set_xlim(vmin, vmax)
ax.tick_params(axis="both", labelsize=12)

Expand Down
Loading
Loading