|
| 1 | +from __future__ import annotations |
| 2 | + |
1 | 3 | from dataclasses import dataclass |
2 | 4 |
|
3 | 5 | import pandas as pd |
4 | 6 |
|
5 | 7 | from ydata_profiling.config import Settings |
6 | 8 | from ydata_profiling.model.pandas.var_description.counts_pandas import VarCountsPandas |
7 | | -from ydata_profiling.model.var_description.default import VarDescription |
| 9 | +from ydata_profiling.model.var_description.default import ( |
| 10 | + VarDescription, |
| 11 | + VarDescriptionHashable, |
| 12 | +) |
8 | 13 |
|
9 | 14 |
|
10 | 15 | @dataclass |
11 | 16 | class VarDescriptionPandas(VarDescription): |
12 | | - """Default description for pandas columns""" |
| 17 | + """Default description for pandas columns.""" |
| 18 | + |
| 19 | + @classmethod |
| 20 | + def from_var_counts( |
| 21 | + cls, var_counts: VarCountsPandas, init_dict: dict |
| 22 | + ) -> VarDescriptionPandas: |
| 23 | + """Get a default description from a VarCountsPandas object.""" |
| 24 | + return VarDescriptionPandas( |
| 25 | + n=var_counts.n, |
| 26 | + count=var_counts.count, |
| 27 | + n_missing=var_counts.n_missing, |
| 28 | + p_missing=var_counts.p_missing, |
| 29 | + hashable=var_counts.hashable, |
| 30 | + memory_size=var_counts.memory_size, |
| 31 | + ordering=var_counts.ordering, |
| 32 | + value_counts_index_sorted=var_counts.value_counts_index_sorted, |
| 33 | + value_counts_without_nan=var_counts.value_counts_without_nan, |
| 34 | + var_specific=init_dict, |
| 35 | + ) |
13 | 36 |
|
14 | | - def __init__(self, config: Settings, series: pd.Series, init_dict: dict): |
15 | | - _var_counts = VarCountsPandas(config, series) |
16 | 37 |
|
17 | | - _count = _var_counts.count |
18 | | - value_counts = _var_counts.value_counts_without_nan |
| 38 | +@dataclass |
| 39 | +class VarDescriptionPandasHashable(VarDescriptionHashable): |
| 40 | + """Default description for pandas columns that are hashable (common types).""" |
| 41 | + |
| 42 | + @classmethod |
| 43 | + def from_var_counts( |
| 44 | + cls, var_counts: VarCountsPandas, init_dict: dict |
| 45 | + ) -> VarDescriptionPandasHashable: |
| 46 | + """Get a default description for a hashable column from a VarCountsPandas object.""" |
| 47 | + _count = var_counts.count |
| 48 | + value_counts = var_counts.value_counts_without_nan |
19 | 49 | distinct_count = len(value_counts) |
20 | 50 | unique_count = value_counts.where(value_counts == 1).count() |
21 | 51 |
|
22 | | - super().__init__( |
23 | | - n=_var_counts.n, |
24 | | - count=_var_counts.count, |
25 | | - n_missing=_var_counts.n_missing, |
26 | | - p_missing=_var_counts.p_missing, |
27 | | - hashable=_var_counts.hashable, |
28 | | - memory_size=_var_counts.memory_size, |
29 | | - ordering=_var_counts.ordering, |
30 | | - value_counts_index_sorted=_var_counts.value_counts_index_sorted, |
31 | | - value_counts_without_nan=_var_counts.value_counts_without_nan, |
| 52 | + return VarDescriptionPandasHashable( |
| 53 | + n=var_counts.n, |
| 54 | + count=var_counts.count, |
| 55 | + n_missing=var_counts.n_missing, |
| 56 | + p_missing=var_counts.p_missing, |
| 57 | + hashable=var_counts.hashable, |
| 58 | + memory_size=var_counts.memory_size, |
| 59 | + ordering=var_counts.ordering, |
| 60 | + value_counts_index_sorted=var_counts.value_counts_index_sorted, |
| 61 | + value_counts_without_nan=var_counts.value_counts_without_nan, |
32 | 62 | n_distinct=distinct_count, |
33 | 63 | p_distinct=distinct_count / _count if _count > 0 else 0, |
34 | 64 | is_unique=unique_count == _count and _count > 0, |
35 | 65 | n_unique=unique_count, |
36 | 66 | p_unique=unique_count / _count if _count > 0 else 0, |
37 | 67 | var_specific=init_dict, |
38 | 68 | ) |
| 69 | + |
| 70 | + |
| 71 | +def get_default_pandas_description( |
| 72 | + config: Settings, series: pd.Series, init_dict: dict |
| 73 | +) -> VarDescriptionPandas | VarDescriptionPandasHashable: |
| 74 | + _var_counts = VarCountsPandas(config, series) |
| 75 | + if _var_counts.hashable: |
| 76 | + return VarDescriptionPandasHashable.from_var_counts(_var_counts, init_dict) |
| 77 | + return VarDescriptionPandas.from_var_counts(_var_counts, init_dict) |
0 commit comments