diff --git a/src/data_profiling/config.py b/src/data_profiling/config.py
index 09dbecdde..5f55ea983 100644
--- a/src/data_profiling/config.py
+++ b/src/data_profiling/config.py
@@ -153,6 +153,9 @@ class Histogram(BaseModel):
     max_bins: int = 250
     x_axis_labels: bool = True
     density: bool = False
+    # Percentile cutoff for the truncated histogram, e.g. 0.05 keeps the 5-95% range.
+    # Only values strictly between 0 and 0.5 take effect; 0.0 (default) disables it.
+    percentile_cutoff: float = 0.0
 
 
 class CatFrequencyPlot(BaseModel):
diff --git a/src/data_profiling/model/summary_algorithms.py b/src/data_profiling/model/summary_algorithms.py
index dccd4178c..83acaea1c 100644
--- a/src/data_profiling/model/summary_algorithms.py
+++ b/src/data_profiling/model/summary_algorithms.py
@@ -106,6 +106,46 @@ def histogram_compute(
     )
 
     stats[name] = hist
+
+    # Compute truncated histogram if percentile_cutoff is set.
+    # Only cutoffs strictly between 0 and 0.5 describe a proper
+    # [cutoff, 1 - cutoff] percentile window (lower bound below upper bound),
+    # and np.percentile rejects percentiles outside [0, 100]; skip otherwise.
+    cutoff = hist_config.percentile_cutoff
+    if 0.0 < cutoff < 0.5:
+        lower = np.percentile(finite, cutoff * 100)
+        upper = np.percentile(finite, (1 - cutoff) * 100)
+        mask = (finite_values >= lower) & (finite_values <= upper)
+        truncated_values = finite_values[mask]
+        truncated_weights = weights[mask] if weights is not None else None
+
+        if len(truncated_values) > 0:
+            t_vmin = float(np.min(truncated_values))
+            t_vmax = float(np.max(truncated_values))
+            t_range = t_vmax - t_vmin
+
+            if t_range == 0:
+                # Single distinct value: build one bin centred on it.
+                eps = 0.5 if t_vmin == 0 else abs(t_vmin) * 0.1
+                t_bins = np.array([t_vmin - eps, t_vmin + eps])
+            else:
+                requested_bins = hist_config.bins if hist_config.bins > 0 else "auto"
+                if isinstance(requested_bins, int):
+                    safe_bins = min(requested_bins, n_unique, hist_config.max_bins)
+                    safe_bins = max(1, safe_bins)
+                    t_bins = np.linspace(t_vmin, t_vmax, safe_bins + 1)
+                else:
+                    t_bins = np.histogram_bin_edges(truncated_values, bins="auto")
+                    if len(t_bins) - 1 > hist_config.max_bins:
+                        t_bins = np.linspace(t_vmin, t_vmax, hist_config.max_bins + 1)
+
+            stats[f"{name}_truncated"] = np.histogram(
+                truncated_values,
+                bins=t_bins,
+                weights=truncated_weights,
+                density=hist_config.density,
+            )
+
     return stats
 
 
diff --git a/src/data_profiling/report/structure/variables/render_real.py b/src/data_profiling/report/structure/variables/render_real.py
index 59764c419..749c385a5 100644
--- a/src/data_profiling/report/structure/variables/render_real.py
+++ b/src/data_profiling/report/structure/variables/render_real.py
@@ -119,6 +119,7 @@ def render_real(config: Settings, summary: dict) -> dict:
     )
 
     summary_histogram = summary.get("histogram", [])
+    summary_histogram_truncated = summary.get("histogram_truncated")
 
     mini_hist_data = None
 
@@ -277,6 +278,28 @@ def render_real(config: Settings, summary: dict) -> dict:
         name="Histogram",
         anchor_id=f"{varid}histogram",
     )
+
+    # Truncated histogram tab: only created when the summary carries truncated
+    # data, so the tab list is unchanged while the feature is disabled.
+    truncated_tabs: list = []
+    if summary_histogram_truncated is not None:
+        cutoff = config.plot.histogram.percentile_cutoff
+        truncated_hist_data = histogram(config, *summary_histogram_truncated)
+        truncated_tabs.append(
+            image_or_empty(
+                truncated_hist_data,
+                alt="Truncated Histogram",
+                image_format=image_format,
+                caption=(
+                    f"Histogram (truncated) "
+                    f"({cutoff:.0%} - {1 - cutoff:.0%} percentile)"
+                    if truncated_hist_data
+                    else None
+                ),
+                name="Truncated Histogram",
+                anchor_id=f"{varid}histogram_truncated",
+            )
+        )
 
     fq = FrequencyTable(
         template_variables["freq_table_rows"],
@@ -306,7 +329,7 @@ def render_real(config: Settings, summary: dict) -> dict:
     )
 
     template_variables["bottom"] = Container(
-        [statistics, hist, fq, evs],
+        [statistics, hist, *truncated_tabs, fq, evs],
         sequence_type="tabs",
         anchor_id=f"{varid}bottom",
     )