Skip to content

Commit a26d747

Browse files
alexbarros authored and aquemy committed
feat: add density histogram (#1458)
* feat: add histogram density option
* test: add unit test
* fix: discard weights if exceed max_bins
1 parent a5a227f commit a26d747

3 files changed

Lines changed: 34 additions & 7 deletions

File tree

src/ydata_profiling/config.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -148,6 +148,7 @@ class Histogram(BaseModel):
148148
# Maximum number of bins (when bins=0)
149149
max_bins: int = 250
150150
x_axis_labels: bool = True
151+
density: bool = False
151152

152153

153154
class CatFrequencyPlot(BaseModel):

src/ydata_profiling/model/summary_algorithms.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -34,15 +34,16 @@ def histogram_compute(
3434
weights: Optional[np.ndarray] = None,
3535
) -> dict:
3636
stats = {}
37-
bins = config.plot.histogram.bins
38-
bins_arg = "auto" if bins == 0 else min(bins, n_unique)
37+
hist_config = config.plot.histogram
38+
bins_arg = "auto" if hist_config.bins == 0 else min(hist_config.bins, n_unique)
3939
bins = np.histogram_bin_edges(finite_values, bins=bins_arg)
40-
stats[name] = np.histogram(finite_values, bins=bins, weights=weights)
40+
if len(bins) > hist_config.max_bins:
41+
bins = np.histogram_bin_edges(finite_values, bins=hist_config.max_bins)
42+
weights = weights if weights and len(weights) == hist_config.max_bins else None
4143

42-
max_bins = config.plot.histogram.max_bins
43-
if bins_arg == "auto" and len(stats[name][1]) > max_bins:
44-
bins = np.histogram_bin_edges(finite_values, bins=max_bins)
45-
stats[name] = np.histogram(finite_values, bins=bins, weights=None)
44+
stats[name] = np.histogram(
45+
finite_values, bins=bins, weights=weights, density=config.plot.histogram.density
46+
)
4647

4748
return stats
4849

tests/unit/test_summary_algos.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,12 @@
22
import pandas as pd
33
import pytest
44

5+
from ydata_profiling.config import Settings
56
from ydata_profiling.model.summary_algorithms import (
67
describe_counts,
78
describe_generic,
89
describe_supported,
10+
histogram_compute,
911
)
1012

1113

@@ -53,3 +55,26 @@ def test_summary_supported_empty_df(config, empty_data):
5355
assert summary["p_distinct"] == 0
5456
assert summary["n_unique"] == 0
5557
assert not summary["is_unique"]
58+
59+
60+
@pytest.fixture
61+
def numpy_array():
62+
return np.random.choice(list(range(10)), size=1000)
63+
64+
65+
def test_compute_histogram(numpy_array):
66+
config = Settings()
67+
n_unique = len(np.unique(numpy_array))
68+
hist = histogram_compute(config, numpy_array, n_unique)
69+
assert "histogram" in hist
70+
assert len(hist["histogram"][0]) == n_unique
71+
assert len(hist["histogram"][1]) == n_unique + 1
72+
assert sum(hist["histogram"][0]) == len(numpy_array)
73+
74+
config.plot.histogram.density = True
75+
hist = histogram_compute(config, numpy_array, n_unique)
76+
assert "histogram" in hist
77+
assert len(hist["histogram"][0]) == n_unique
78+
assert len(hist["histogram"][1]) == n_unique + 1
79+
hist_values = hist["histogram"][0] * np.diff(hist["histogram"][1])
80+
assert sum(hist_values) == pytest.approx(1, 0.1)

0 commit comments

Comments
 (0)