Skip to content

Commit f3aebfd

Browse files
committed
feat: add vardescription class
1 parent b0c422b commit f3aebfd

20 files changed

Lines changed: 246 additions & 80 deletions

src/ydata_profiling/model/pandas/correlations_pandas.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Correlations between variables."""
2+
23
import itertools
34
import warnings
45
from typing import Callable, Optional
@@ -20,6 +21,7 @@
2021
DiscretizationType,
2122
Discretizer,
2223
)
24+
from ydata_profiling.model.var_description.default import VarDescription
2325

2426

2527
@Spearman.compute.register(Settings, pd.DataFrame, dict)
@@ -84,9 +86,9 @@ def _pairwise_cramers(col_1: pd.Series, col_2: pd.Series) -> float:
8486
return _cramers_corrected_stat(pd.crosstab(col_1, col_2), correction=True)
8587

8688

87-
@Cramers.compute.register(Settings, pd.DataFrame, dict)
89+
@Cramers.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
8890
def pandas_cramers_compute(
89-
config: Settings, df: pd.DataFrame, summary: dict
91+
config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
9092
) -> Optional[pd.DataFrame]:
9193
threshold = config.categorical_maximum_correlation_distinct
9294

@@ -125,9 +127,9 @@ def pandas_cramers_compute(
125127
return correlation_matrix
126128

127129

128-
@PhiK.compute.register(Settings, pd.DataFrame, dict)
130+
@PhiK.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
129131
def pandas_phik_compute(
130-
config: Settings, df: pd.DataFrame, summary: dict
132+
config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
131133
) -> Optional[pd.DataFrame]:
132134
df_cols_dict = {i: list(df.columns).index(i) for i in df.columns}
133135

@@ -161,9 +163,9 @@ def pandas_phik_compute(
161163
return correlation
162164

163165

164-
@Auto.compute.register(Settings, pd.DataFrame, dict)
166+
@Auto.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
165167
def pandas_auto_compute(
166-
config: Settings, df: pd.DataFrame, summary: dict
168+
config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
167169
) -> Optional[pd.DataFrame]:
168170
threshold = config.categorical_maximum_correlation_distinct
169171
numerical_columns = [
@@ -192,7 +194,6 @@ def pandas_auto_compute(
192194
columns=columns_tested,
193195
)
194196
for col_1_name, col_2_name in itertools.combinations(columns_tested, 2):
195-
196197
method = (
197198
_pairwise_spearman
198199
if col_1_name and col_2_name not in categorical_columns

src/ydata_profiling/model/pandas/describe_boolean_pandas.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,14 @@
44

55
from ydata_profiling.config import Settings
66
from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score
7-
from ydata_profiling.model.summary_algorithms import (
8-
describe_boolean_1d,
9-
series_hashable,
10-
)
7+
from ydata_profiling.model.summary_algorithms import describe_boolean_1d
8+
from ydata_profiling.model.var_description.default import VarDescription
119

1210

1311
@describe_boolean_1d.register
14-
@series_hashable
1512
def pandas_describe_boolean_1d(
16-
config: Settings, series: pd.Series, summary: dict
17-
) -> Tuple[Settings, pd.Series, dict]:
13+
config: Settings, series: pd.Series, summary: VarDescription
14+
) -> Tuple[Settings, pd.Series, VarDescription]:
1815
"""Describe a boolean series.
1916
2017
Args:
@@ -25,8 +22,7 @@ def pandas_describe_boolean_1d(
2522
Returns:
2623
A dict containing calculated series description values.
2724
"""
28-
29-
value_counts = summary["value_counts_without_nan"]
25+
value_counts = summary.value_counts_without_nan
3026
summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]})
3127

3228
summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))

src/ydata_profiling/model/pandas/describe_categorical_pandas.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
series_handle_nulls,
1717
series_hashable,
1818
)
19+
from ydata_profiling.model.var_description.default import VarDescription
1920

2021

2122
def get_character_counts_vc(vc: pd.Series) -> pd.Series:
@@ -210,8 +211,8 @@ def length_summary_vc(vc: pd.Series) -> dict:
210211
@series_hashable
211212
@series_handle_nulls
212213
def pandas_describe_categorical_1d(
213-
config: Settings, series: pd.Series, summary: dict
214-
) -> Tuple[Settings, pd.Series, dict]:
214+
config: Settings, series: pd.Series, summary: VarDescription
215+
) -> Tuple[Settings, pd.Series, VarDescription]:
215216
"""Describe a categorical series.
216217
217218
Args:
@@ -222,12 +223,11 @@ def pandas_describe_categorical_1d(
222223
Returns:
223224
A dict containing calculated series description values.
224225
"""
225-
226226
# Make sure we deal with strings (Issue #100)
227227
series = series.astype(str)
228228

229229
# Only run if at least 1 non-missing value
230-
value_counts = summary["value_counts_without_nan"]
230+
value_counts = summary.value_counts_without_nan
231231
value_counts.index = value_counts.index.astype(str)
232232

233233
summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))

src/ydata_profiling/model/pandas/describe_counts_pandas.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44

55
from ydata_profiling.config import Settings
66
from ydata_profiling.model.summary_algorithms import describe_counts
7+
from ydata_profiling.model.var_description.default import VarDescription
78

89

910
@describe_counts.register
1011
def pandas_describe_counts(
11-
config: Settings, series: pd.Series, summary: dict
12-
) -> Tuple[Settings, pd.Series, dict]:
12+
config: Settings, series: pd.Series, summary: VarDescription
13+
) -> Tuple[Settings, pd.Series, VarDescription]:
1314
"""Counts the values in a series (with and without NaN, distinct).
1415
1516
Args:
@@ -27,7 +28,7 @@ def pandas_describe_counts(
2728
except: # noqa: E722
2829
hashable = False
2930

30-
summary["hashable"] = hashable
31+
summary.hashable = hashable
3132

3233
if hashable:
3334
value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0]
@@ -58,6 +59,6 @@ def pandas_describe_counts(
5859
ordering = False
5960

6061
summary["ordering"] = ordering
61-
summary["n_missing"] = n_missing
62+
summary.n_missing = n_missing
6263

6364
return config, series, summary

src/ydata_profiling/model/pandas/describe_date_pandas.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,15 @@
1111
series_handle_nulls,
1212
series_hashable,
1313
)
14+
from ydata_profiling.model.var_description.default import VarDescription
1415

1516

1617
@describe_date_1d.register
1718
@series_hashable
1819
@series_handle_nulls
1920
def pandas_describe_date_1d(
20-
config: Settings, series: pd.Series, summary: dict
21-
) -> Tuple[Settings, pd.Series, dict]:
21+
config: Settings, series: pd.Series, summary: VarDescription
22+
) -> Tuple[Settings, pd.Series, VarDescription]:
2223
"""Describe a date series.
2324
2425
Args:

src/ydata_profiling/model/pandas/describe_file_pandas.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from ydata_profiling.config import Settings
88
from ydata_profiling.model.summary_algorithms import describe_file_1d, histogram_compute
9+
from ydata_profiling.model.var_description.default import VarDescription
910

1011

1112
def file_summary(series: pd.Series) -> dict:
@@ -36,8 +37,8 @@ def convert_datetime(x: float) -> str:
3637

3738
@describe_file_1d.register
3839
def pandas_describe_file_1d(
39-
config: Settings, series: pd.Series, summary: dict
40-
) -> Tuple[Settings, pd.Series, dict]:
40+
config: Settings, series: pd.Series, summary: VarDescription
41+
) -> Tuple[Settings, pd.Series, VarDescription]:
4142
if series.hasnans:
4243
raise ValueError("May not contain NaNs")
4344
if not hasattr(series, "str"):

src/ydata_profiling/model/pandas/describe_generic_pandas.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44

55
from ydata_profiling.config import Settings
66
from ydata_profiling.model.summary_algorithms import describe_generic
7+
from ydata_profiling.model.var_description.default import VarDescription
78

89

910
@describe_generic.register
1011
def pandas_describe_generic(
11-
config: Settings, series: pd.Series, summary: dict
12-
) -> Tuple[Settings, pd.Series, dict]:
12+
config: Settings, series: pd.Series, summary: VarDescription
13+
) -> Tuple[Settings, pd.Series, VarDescription]:
1314
"""Describe generic series.
1415
1516
Args:
@@ -27,8 +28,8 @@ def pandas_describe_generic(
2728
summary.update(
2829
{
2930
"n": length,
30-
"p_missing": summary["n_missing"] / length if length > 0 else 0,
31-
"count": length - summary["n_missing"],
31+
"p_missing": summary.n_missing / length if length > 0 else 0,
32+
"count": length - summary.n_missing,
3233
"memory_size": series.memory_usage(deep=config.memory_deep),
3334
}
3435
)

src/ydata_profiling/model/pandas/describe_image_pandas.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
describe_image_1d,
1313
named_aggregate_summary,
1414
)
15+
from ydata_profiling.model.var_description.default import VarDescription
1516
from ydata_profiling.utils.imghdr_patch import * # noqa: F401,F403
1617

1718

@@ -243,8 +244,8 @@ def image_summary(series: pd.Series, exif: bool = False, hash: bool = False) ->
243244

244245
@describe_image_1d.register
245246
def pandas_describe_image_1d(
246-
config: Settings, series: pd.Series, summary: dict
247-
) -> Tuple[Settings, pd.Series, dict]:
247+
config: Settings, series: pd.Series, summary: VarDescription
248+
) -> Tuple[Settings, pd.Series, VarDescription]:
248249
if series.hasnans:
249250
raise ValueError("May not contain NaNs")
250251
if not hasattr(series, "str"):

src/ydata_profiling/model/pandas/describe_numeric_pandas.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44
import pandas as pd
55

6+
from ydata_profiling.model.var_description.default import VarDescription
67
from ydata_profiling.utils.compat import pandas_version_info
78

89
if pandas_version_info() >= (1, 5):
@@ -44,9 +45,9 @@ def numeric_stats_pandas(series: pd.Series) -> Dict[str, Any]:
4445

4546

4647
def numeric_stats_numpy(
47-
present_values: np.ndarray, series: pd.Series, series_description: Dict[str, Any]
48+
present_values: np.ndarray, series: pd.Series, series_description: VarDescription
4849
) -> Dict[str, Any]:
49-
vc = series_description["value_counts_without_nan"]
50+
vc = series_description.value_counts_without_nan
5051
index_values = vc.index.values
5152

5253
# FIXME: can be performance optimized by using weights in std, var, kurt and skew...
@@ -80,8 +81,8 @@ def numeric_stats_numpy(
8081
@series_hashable
8182
@series_handle_nulls
8283
def pandas_describe_numeric_1d(
83-
config: Settings, series: pd.Series, summary: dict
84-
) -> Tuple[Settings, pd.Series, dict]:
84+
config: Settings, series: pd.Series, summary: VarDescription
85+
) -> Tuple[Settings, pd.Series, VarDescription]:
8586
"""Describe a numeric series.
8687
8788
Args:
@@ -96,11 +97,11 @@ def pandas_describe_numeric_1d(
9697
chi_squared_threshold = config.vars.num.chi_squared_threshold
9798
quantiles = config.vars.num.quantiles
9899

99-
value_counts = summary["value_counts_without_nan"]
100+
value_counts = summary.value_counts_without_nan
100101

101102
negative_index = value_counts.index < 0
102103
summary["n_negative"] = value_counts.loc[negative_index].sum()
103-
summary["p_negative"] = summary["n_negative"] / summary["n"]
104+
summary["p_negative"] = summary["n_negative"] / summary.n
104105

105106
infinity_values = [np.inf, -np.inf]
106107
infinity_index = value_counts.index.isin(infinity_values)
@@ -139,8 +140,8 @@ def pandas_describe_numeric_1d(
139140
)
140141
stats["iqr"] = stats["75%"] - stats["25%"]
141142
stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
142-
stats["p_zeros"] = stats["n_zeros"] / summary["n"]
143-
stats["p_infinite"] = summary["n_infinite"] / summary["n"]
143+
stats["p_zeros"] = stats["n_zeros"] / summary.n
144+
stats["p_infinite"] = summary["n_infinite"] / summary.n
144145

145146
stats["monotonic_increase"] = series.is_monotonic_increasing
146147
stats["monotonic_decrease"] = series.is_monotonic_decreasing

src/ydata_profiling/model/pandas/describe_path_pandas.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from ydata_profiling.config import Settings
77
from ydata_profiling.model.summary_algorithms import describe_path_1d
8+
from ydata_profiling.model.var_description.default import VarDescription
89

910

1011
def path_summary(series: pd.Series) -> dict:
@@ -19,8 +20,9 @@ def path_summary(series: pd.Series) -> dict:
1920

2021
# TODO: optimize using value counts
2122
summary = {
22-
"common_prefix": os.path.commonprefix(series.values.tolist())
23-
or "No common prefix",
23+
"common_prefix": (
24+
os.path.commonprefix(series.values.tolist()) or "No common prefix"
25+
),
2426
"stem_counts": series.map(lambda x: os.path.splitext(x)[0]).value_counts(),
2527
"suffix_counts": series.map(lambda x: os.path.splitext(x)[1]).value_counts(),
2628
"name_counts": series.map(lambda x: os.path.basename(x)).value_counts(),
@@ -39,8 +41,8 @@ def path_summary(series: pd.Series) -> dict:
3941

4042
@describe_path_1d.register
4143
def pandas_describe_path_1d(
42-
config: Settings, series: pd.Series, summary: dict
43-
) -> Tuple[Settings, pd.Series, dict]:
44+
config: Settings, series: pd.Series, summary: VarDescription
45+
) -> Tuple[Settings, pd.Series, VarDescription]:
4446
"""Describe a path series.
4547
4648
Args:

0 commit comments

Comments
 (0)