Skip to content

Commit 7bbba9e

Browse files
committed
fix: unify description classes
1 parent 3fa69a9 commit 7bbba9e

4 files changed

Lines changed: 74 additions & 108 deletions

File tree

Lines changed: 41 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -1,71 +1,53 @@
1-
from dataclasses import dataclass
2-
31
import pandas as pd
42

53
from ydata_profiling.config import Settings
64
from ydata_profiling.model.var_description.counts import VarCounts
75

86

9-
@dataclass
10-
class VarCountsPandas(VarCounts):
11-
value_counts_without_nan: pd.Series
12-
"""Counts of values in the series without NaN."""
13-
value_counts_index_sorted: pd.Series
14-
"""Sorted counts of values in the series without NaN."""
7+
def get_counts_pandas(config: Settings, series: pd.Series) -> VarCounts:
8+
"""Get a VarCounts object for a pandas series."""
9+
length = len(series)
1510

16-
def __init__(self, config: Settings, series: pd.Series):
17-
"""Counts the values in a series (with and without NaN, distinct).
11+
try:
12+
value_counts_with_nan = series.value_counts(dropna=False)
13+
_ = set(value_counts_with_nan.index)
14+
hashable = True
15+
except: # noqa: E722
16+
hashable = False
1817

19-
Args:
20-
config: report Settings object
21-
series: Series for which we want to calculate the values.
22-
summary: series' summary
18+
value_counts_without_nan = None
19+
value_counts_index_sorted = None
20+
if hashable:
21+
value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0]
2322

24-
Returns:
25-
A dictionary with the count values (with and without NaN, distinct).
26-
"""
27-
length = len(series)
23+
null_index = value_counts_with_nan.index.isnull()
24+
if null_index.any():
25+
n_missing = value_counts_with_nan[null_index].sum()
26+
value_counts_without_nan = value_counts_with_nan[~null_index]
27+
else:
28+
n_missing = 0
29+
value_counts_without_nan = value_counts_with_nan
2830

2931
try:
30-
value_counts_with_nan = series.value_counts(dropna=False)
31-
_ = set(value_counts_with_nan.index)
32-
hashable = True
33-
except: # noqa: E722
34-
hashable = False
35-
36-
value_counts_without_nan = None
37-
value_counts_index_sorted = None
38-
if hashable:
39-
value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0]
40-
41-
null_index = value_counts_with_nan.index.isnull()
42-
if null_index.any():
43-
n_missing = value_counts_with_nan[null_index].sum()
44-
value_counts_without_nan = value_counts_with_nan[~null_index]
45-
else:
46-
n_missing = 0
47-
value_counts_without_nan = value_counts_with_nan
48-
49-
try:
50-
value_counts_index_sorted = value_counts_without_nan.sort_index(
51-
ascending=True
52-
)
53-
ordering = True
54-
except TypeError:
55-
ordering = False
56-
else:
57-
n_missing = series.isna().sum()
32+
value_counts_index_sorted = value_counts_without_nan.sort_index(
33+
ascending=True
34+
)
35+
ordering = True
36+
except TypeError:
5837
ordering = False
59-
60-
super().__init__(
61-
hashable=hashable,
62-
value_counts_without_nan=value_counts_without_nan,
63-
value_counts_index_sorted=value_counts_index_sorted,
64-
ordering=ordering,
65-
n_missing=n_missing,
66-
n=length,
67-
p_missing=series.isna().sum() / length if length > 0 else 0,
68-
count=length - series.isna().sum(),
69-
memory_size=series.memory_usage(deep=config.memory_deep),
70-
value_counts=None,
71-
)
38+
else:
39+
n_missing = series.isna().sum()
40+
ordering = False
41+
42+
return VarCounts(
43+
hashable=hashable,
44+
value_counts_without_nan=value_counts_without_nan,
45+
value_counts_index_sorted=value_counts_index_sorted,
46+
ordering=ordering,
47+
n_missing=n_missing,
48+
n=length,
49+
p_missing=series.isna().sum() / length if length > 0 else 0,
50+
count=length - series.isna().sum(),
51+
memory_size=series.memory_usage(deep=config.memory_deep),
52+
value_counts=None,
53+
)
Lines changed: 13 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -1,55 +1,27 @@
11
from __future__ import annotations
22

3-
from dataclasses import dataclass
4-
53
import pandas as pd
64

75
from ydata_profiling.config import Settings
8-
from ydata_profiling.model.pandas.var_description.counts_pandas import VarCountsPandas
6+
from ydata_profiling.model.pandas.var_description.counts_pandas import get_counts_pandas
97
from ydata_profiling.model.var_description.default import (
108
VarDescription,
119
VarDescriptionHashable,
1210
)
1311

1412

15-
@dataclass
16-
class VarDescriptionPandas(VarDescription):
17-
"""Default description for pandas columns."""
18-
19-
@classmethod
20-
def from_var_counts(
21-
cls, var_counts: VarCountsPandas, init_dict: dict
22-
) -> VarDescriptionPandas:
23-
"""Get a default description from a VarCountsPandas object."""
24-
return VarDescriptionPandas(
25-
n=var_counts.n,
26-
count=var_counts.count,
27-
n_missing=var_counts.n_missing,
28-
p_missing=var_counts.p_missing,
29-
hashable=var_counts.hashable,
30-
memory_size=var_counts.memory_size,
31-
ordering=var_counts.ordering,
32-
value_counts_index_sorted=var_counts.value_counts_index_sorted,
33-
value_counts_without_nan=var_counts.value_counts_without_nan,
34-
var_specific=init_dict,
35-
)
36-
37-
38-
@dataclass
39-
class VarDescriptionPandasHashable(VarDescriptionHashable):
40-
"""Default description for pandas columns that are hashable (common types)."""
13+
def get_default_pandas_description(
14+
config: Settings, series: pd.Series, init_dict: dict
15+
) -> VarDescription | VarDescriptionHashable:
16+
var_counts = get_counts_pandas(config, series)
4117

42-
@classmethod
43-
def from_var_counts(
44-
cls, var_counts: VarCountsPandas, init_dict: dict
45-
) -> VarDescriptionPandasHashable:
46-
"""Get a default description for a hashable column from a VarCountsPandas object."""
47-
_count = var_counts.count
18+
if var_counts.hashable:
19+
count = var_counts.count
4820
value_counts = var_counts.value_counts_without_nan
4921
distinct_count = len(value_counts)
5022
unique_count = value_counts.where(value_counts == 1).count()
5123

52-
return VarDescriptionPandasHashable(
24+
return VarDescriptionHashable(
5325
n=var_counts.n,
5426
count=var_counts.count,
5527
n_missing=var_counts.n_missing,
@@ -60,19 +32,11 @@ def from_var_counts(
6032
value_counts_index_sorted=var_counts.value_counts_index_sorted,
6133
value_counts_without_nan=var_counts.value_counts_without_nan,
6234
n_distinct=distinct_count,
63-
p_distinct=distinct_count / _count if _count > 0 else 0,
64-
is_unique=unique_count == _count and _count > 0,
35+
p_distinct=distinct_count / count if count > 0 else 0,
36+
is_unique=unique_count == count and count > 0,
6537
n_unique=unique_count,
66-
p_unique=unique_count / _count if _count > 0 else 0,
38+
p_unique=unique_count / count if count > 0 else 0,
39+
value_counts=None,
6740
var_specific=init_dict,
68-
value_counts=var_counts.value_counts,
6941
)
70-
71-
72-
def get_default_pandas_description(
73-
config: Settings, series: pd.Series, init_dict: dict
74-
) -> VarDescriptionPandas | VarDescriptionPandasHashable:
75-
_var_counts = VarCountsPandas(config, series)
76-
if _var_counts.hashable:
77-
return VarDescriptionPandasHashable.from_var_counts(_var_counts, init_dict)
78-
return VarDescriptionPandas.from_var_counts(_var_counts, init_dict)
42+
return VarDescription.from_var_counts(var_counts, init_dict)

src/ydata_profiling/model/var_description/counts.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,3 +22,6 @@ class VarCounts:
2222
"""Sorted counts of values in the series without NaN. Sorted by counts."""
2323
ordering: Union[bool, list]
2424
memory_size: Union[int, list]
25+
26+
value_counts: Any
27+
"""Counts of values in original series type. Values as index, counts as values."""

src/ydata_profiling/model/var_description/default.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,23 @@ def __iter__(self) -> Iterator:
3838
"""To support old dict like interface."""
3939
return self.var_specific.__iter__()
4040

41+
@classmethod
def from_var_counts(cls, var_counts: VarCounts, init_dict: dict) -> VarDescription:
    """Build a plain ``VarDescription`` out of an already computed ``VarCounts``.

    Args:
        var_counts: object whose count-related attributes are copied
            one-to-one onto the new description.
        init_dict: stored unchanged as the description's ``var_specific``.

    Returns:
        A ``VarDescription`` instance. The base class is instantiated
        explicitly, so ``cls`` does not influence the returned type.
    """
    # Attributes forwarded verbatim from ``var_counts``; read in this order.
    forwarded_fields = (
        "n",
        "count",
        "n_missing",
        "p_missing",
        "hashable",
        "memory_size",
        "ordering",
        "value_counts_index_sorted",
        "value_counts_without_nan",
        "value_counts",
    )
    forwarded = {name: getattr(var_counts, name) for name in forwarded_fields}
    return VarDescription(var_specific=init_dict, **forwarded)
57+
4158

4259
@dataclass
4360
class VarDescriptionHashable(VarDescription):

0 commit comments

Comments
 (0)