diff --git a/src/data_profiling/config.py b/src/data_profiling/config.py
index 09dbecdde..5f55ea983 100644
--- a/src/data_profiling/config.py
+++ b/src/data_profiling/config.py
@@ -153,6 +153,8 @@ class Histogram(BaseModel):
max_bins: int = 250
x_axis_labels: bool = True
density: bool = False
+ # Fraction trimmed from each tail for the truncated histogram (0.0 disables it; e.g. 0.05 keeps the 5th-95th percentile range)
+ percentile_cutoff: float = 0.0
class CatFrequencyPlot(BaseModel):
diff --git a/src/data_profiling/model/summary_algorithms.py b/src/data_profiling/model/summary_algorithms.py
index dccd4178c..83acaea1c 100644
--- a/src/data_profiling/model/summary_algorithms.py
+++ b/src/data_profiling/model/summary_algorithms.py
@@ -106,6 +106,42 @@ def histogram_compute(
)
stats[name] = hist
+
+    # Optional second histogram restricted to the [cutoff, 1 - cutoff] percentile band
+    cutoff = hist_config.percentile_cutoff
+    if cutoff > 0.0:
+        lower = np.percentile(finite_values, cutoff * 100)
+        upper = np.percentile(finite_values, (1 - cutoff) * 100)
+        mask = (finite_values >= lower) & (finite_values <= upper)
+        truncated_values = finite_values[mask]
+        truncated_weights = weights[mask] if weights is not None else None
+        # A cutoff of 0.5 or more can leave the band empty; no truncated entry is stored then
+        if len(truncated_values) > 0:
+            t_vmin = float(np.min(truncated_values))
+            t_vmax = float(np.max(truncated_values))
+            t_range = t_vmax - t_vmin
+            # Zero range means every kept value is identical: use one bin centred on it
+            if t_range == 0:
+                eps = 0.5 if t_vmin == 0 else abs(t_vmin) * 0.1
+                t_bins = np.array([t_vmin - eps, t_vmin + eps])
+            else:
+                requested_bins = hist_config.bins if hist_config.bins > 0 else "auto"
+                if isinstance(requested_bins, int):
+                    safe_bins = min(requested_bins, n_unique, hist_config.max_bins)
+                    safe_bins = max(1, safe_bins)
+                    t_bins = np.linspace(t_vmin, t_vmax, safe_bins + 1)
+                else:
+                    t_bins = np.histogram_bin_edges(truncated_values, bins="auto")
+                    if len(t_bins) - 1 > hist_config.max_bins:
+                        t_bins = np.linspace(t_vmin, t_vmax, hist_config.max_bins + 1)
+            # Stored beside the full histogram under the "<name>_truncated" key
+            stats[f"{name}_truncated"] = np.histogram(
+                truncated_values,
+                bins=t_bins,
+                weights=truncated_weights,
+                density=hist_config.density,
+            )
+
return stats
diff --git a/src/data_profiling/report/structure/variables/render_real.py b/src/data_profiling/report/structure/variables/render_real.py
index 59764c419..749c385a5 100644
--- a/src/data_profiling/report/structure/variables/render_real.py
+++ b/src/data_profiling/report/structure/variables/render_real.py
@@ -119,6 +119,7 @@ def render_real(config: Settings, summary: dict) -> dict:
)
summary_histogram = summary.get("histogram", [])
+ summary_histogram_truncated = summary.get("histogram_truncated", None)
mini_hist_data = None
@@ -268,7 +269,6 @@ def render_real(config: Settings, summary: dict) -> dict:
f"Histogram with fixed size bins "
f"(bins={len(summary_histogram[1]) - 1})"
)
-
hist = image_or_empty(
hist_data,
alt="Histogram",
@@ -277,6 +277,20 @@ def render_real(config: Settings, summary: dict) -> dict:
name="Histogram",
anchor_id=f"{varid}histogram",
)
+ # Truncated histogram: plotted only when the summary contains a "histogram_truncated" entry
+ truncated_hist_data = None
+ if summary_histogram_truncated is not None:
+ cutoff = config.plot.histogram.percentile_cutoff
+ truncated_hist_data = histogram(config, *summary_histogram_truncated)
+ # NOTE: the caption below reads cutoff only when truncated_hist_data is truthy, i.e. after the branch above assigned it
+ truncated_hist = image_or_empty(
+ truncated_hist_data,
+ alt="Truncated Histogram",
+ image_format=image_format,
+ caption=f"Histogram (truncated) ({cutoff:.0%} - {1-cutoff:.0%} percentile)" if truncated_hist_data else None,
+ name="Truncated Histogram",
+ anchor_id=f"{varid}histogram_truncated",
+ )
fq = FrequencyTable(
template_variables["freq_table_rows"],
@@ -306,7 +320,7 @@ def render_real(config: Settings, summary: dict) -> dict:
)
template_variables["bottom"] = Container(
- [statistics, hist, fq, evs],
+ [statistics, hist, truncated_hist, fq, evs],
sequence_type="tabs",
anchor_id=f"{varid}bottom",
)