Skip to content

Commit 36cb9ee

Browse files
committed
feat: add vardescription class
1 parent 497b1ac commit 36cb9ee

20 files changed

Lines changed: 246 additions & 80 deletions

src/ydata_profiling/model/pandas/correlations_pandas.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Correlations between variables."""
2+
23
import itertools
34
import warnings
45
from typing import Callable, Optional
@@ -20,6 +21,7 @@
2021
DiscretizationType,
2122
Discretizer,
2223
)
24+
from ydata_profiling.model.var_description.default import VarDescription
2325

2426

2527
@Spearman.compute.register(Settings, pd.DataFrame, dict)
@@ -87,9 +89,9 @@ def _pairwise_cramers(col_1: pd.Series, col_2: pd.Series) -> float:
8789
return _cramers_corrected_stat(pd.crosstab(col_1, col_2), correction=True)
8890

8991

90-
@Cramers.compute.register(Settings, pd.DataFrame, dict)
92+
@Cramers.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
9193
def pandas_cramers_compute(
92-
config: Settings, df: pd.DataFrame, summary: dict
94+
config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
9395
) -> Optional[pd.DataFrame]:
9496
threshold = config.categorical_maximum_correlation_distinct
9597

@@ -128,9 +130,9 @@ def pandas_cramers_compute(
128130
return correlation_matrix
129131

130132

131-
@PhiK.compute.register(Settings, pd.DataFrame, dict)
133+
@PhiK.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
132134
def pandas_phik_compute(
133-
config: Settings, df: pd.DataFrame, summary: dict
135+
config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
134136
) -> Optional[pd.DataFrame]:
135137
df_cols_dict = {i: list(df.columns).index(i) for i in df.columns}
136138

@@ -164,9 +166,9 @@ def pandas_phik_compute(
164166
return correlation
165167

166168

167-
@Auto.compute.register(Settings, pd.DataFrame, dict)
169+
@Auto.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
168170
def pandas_auto_compute(
169-
config: Settings, df: pd.DataFrame, summary: dict
171+
config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
170172
) -> Optional[pd.DataFrame]:
171173
threshold = config.categorical_maximum_correlation_distinct
172174
numerical_columns = [
@@ -195,7 +197,6 @@ def pandas_auto_compute(
195197
columns=columns_tested,
196198
)
197199
for col_1_name, col_2_name in itertools.combinations(columns_tested, 2):
198-
199200
method = (
200201
_pairwise_spearman
201202
if any(elem in categorical_columns for elem in [col_1_name, col_2_name])

src/ydata_profiling/model/pandas/describe_boolean_pandas.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,14 @@
55

66
from ydata_profiling.config import Settings
77
from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score
8-
from ydata_profiling.model.summary_algorithms import (
9-
describe_boolean_1d,
10-
series_hashable,
11-
)
8+
from ydata_profiling.model.summary_algorithms import describe_boolean_1d
9+
from ydata_profiling.model.var_description.default import VarDescription
1210

1311

1412
@describe_boolean_1d.register
15-
@series_hashable
1613
def pandas_describe_boolean_1d(
17-
config: Settings, series: pd.Series, summary: dict
18-
) -> Tuple[Settings, pd.Series, dict]:
14+
config: Settings, series: pd.Series, summary: VarDescription
15+
) -> Tuple[Settings, pd.Series, VarDescription]:
1916
"""Describe a boolean series.
2017
2118
Args:
@@ -26,8 +23,7 @@ def pandas_describe_boolean_1d(
2623
Returns:
2724
A dict containing calculated series description values.
2825
"""
29-
30-
value_counts: pd.Series = summary["value_counts_without_nan"]
26+
value_counts: pd.Series = summary.value_counts_without_nan
3127
if not value_counts.empty:
3228
summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]})
3329
summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))

src/ydata_profiling/model/pandas/describe_categorical_pandas.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
series_handle_nulls,
1717
series_hashable,
1818
)
19+
from ydata_profiling.model.var_description.default import VarDescription
1920

2021

2122
def get_character_counts_vc(vc: pd.Series) -> pd.Series:
@@ -214,8 +215,8 @@ def length_summary_vc(vc: pd.Series) -> dict:
214215
@series_hashable
215216
@series_handle_nulls
216217
def pandas_describe_categorical_1d(
217-
config: Settings, series: pd.Series, summary: dict
218-
) -> Tuple[Settings, pd.Series, dict]:
218+
config: Settings, series: pd.Series, summary: VarDescription
219+
) -> Tuple[Settings, pd.Series, VarDescription]:
219220
"""Describe a categorical series.
220221
221222
Args:
@@ -226,12 +227,11 @@ def pandas_describe_categorical_1d(
226227
Returns:
227228
A dict containing calculated series description values.
228229
"""
229-
230230
# Make sure we deal with strings (Issue #100)
231231
series = series.astype(str)
232232

233233
# Only run if at least 1 non-missing value
234-
value_counts = summary["value_counts_without_nan"]
234+
value_counts = summary.value_counts_without_nan
235235
value_counts.index = value_counts.index.astype(str)
236236

237237
summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))

src/ydata_profiling/model/pandas/describe_counts_pandas.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44

55
from ydata_profiling.config import Settings
66
from ydata_profiling.model.summary_algorithms import describe_counts
7+
from ydata_profiling.model.var_description.default import VarDescription
78

89

910
@describe_counts.register
1011
def pandas_describe_counts(
11-
config: Settings, series: pd.Series, summary: dict
12-
) -> Tuple[Settings, pd.Series, dict]:
12+
config: Settings, series: pd.Series, summary: VarDescription
13+
) -> Tuple[Settings, pd.Series, VarDescription]:
1314
"""Counts the values in a series (with and without NaN, distinct).
1415
1516
Args:
@@ -27,7 +28,7 @@ def pandas_describe_counts(
2728
except: # noqa: E722
2829
hashable = False
2930

30-
summary["hashable"] = hashable
31+
summary.hashable = hashable
3132

3233
if hashable:
3334
value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0]
@@ -58,6 +59,6 @@ def pandas_describe_counts(
5859
ordering = False
5960

6061
summary["ordering"] = ordering
61-
summary["n_missing"] = n_missing
62+
summary.n_missing = n_missing
6263

6364
return config, series, summary

src/ydata_profiling/model/pandas/describe_date_pandas.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,15 @@
1111
series_handle_nulls,
1212
series_hashable,
1313
)
14+
from ydata_profiling.model.var_description.default import VarDescription
1415

1516

1617
@describe_date_1d.register
1718
@series_hashable
1819
@series_handle_nulls
1920
def pandas_describe_date_1d(
20-
config: Settings, series: pd.Series, summary: dict
21-
) -> Tuple[Settings, pd.Series, dict]:
21+
config: Settings, series: pd.Series, summary: VarDescription
22+
) -> Tuple[Settings, pd.Series, VarDescription]:
2223
"""Describe a date series.
2324
2425
Args:

src/ydata_profiling/model/pandas/describe_file_pandas.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from ydata_profiling.config import Settings
88
from ydata_profiling.model.summary_algorithms import describe_file_1d, histogram_compute
9+
from ydata_profiling.model.var_description.default import VarDescription
910

1011

1112
def file_summary(series: pd.Series) -> dict:
@@ -36,8 +37,8 @@ def convert_datetime(x: float) -> str:
3637

3738
@describe_file_1d.register
3839
def pandas_describe_file_1d(
39-
config: Settings, series: pd.Series, summary: dict
40-
) -> Tuple[Settings, pd.Series, dict]:
40+
config: Settings, series: pd.Series, summary: VarDescription
41+
) -> Tuple[Settings, pd.Series, VarDescription]:
4142
if series.hasnans:
4243
raise ValueError("May not contain NaNs")
4344
if not hasattr(series, "str"):

src/ydata_profiling/model/pandas/describe_generic_pandas.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44

55
from ydata_profiling.config import Settings
66
from ydata_profiling.model.summary_algorithms import describe_generic
7+
from ydata_profiling.model.var_description.default import VarDescription
78

89

910
@describe_generic.register
1011
def pandas_describe_generic(
11-
config: Settings, series: pd.Series, summary: dict
12-
) -> Tuple[Settings, pd.Series, dict]:
12+
config: Settings, series: pd.Series, summary: VarDescription
13+
) -> Tuple[Settings, pd.Series, VarDescription]:
1314
"""Describe generic series.
1415
1516
Args:
@@ -27,8 +28,8 @@ def pandas_describe_generic(
2728
summary.update(
2829
{
2930
"n": length,
30-
"p_missing": summary["n_missing"] / length if length > 0 else 0,
31-
"count": length - summary["n_missing"],
31+
"p_missing": summary.n_missing / length if length > 0 else 0,
32+
"count": length - summary.n_missing,
3233
"memory_size": series.memory_usage(deep=config.memory_deep),
3334
}
3435
)

src/ydata_profiling/model/pandas/describe_image_pandas.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
describe_image_1d,
1313
named_aggregate_summary,
1414
)
15+
from ydata_profiling.model.var_description.default import VarDescription
1516
from ydata_profiling.utils.imghdr_patch import * # noqa: F401,F403
1617

1718

@@ -243,8 +244,8 @@ def image_summary(series: pd.Series, exif: bool = False, hash: bool = False) ->
243244

244245
@describe_image_1d.register
245246
def pandas_describe_image_1d(
246-
config: Settings, series: pd.Series, summary: dict
247-
) -> Tuple[Settings, pd.Series, dict]:
247+
config: Settings, series: pd.Series, summary: VarDescription
248+
) -> Tuple[Settings, pd.Series, VarDescription]:
248249
if series.hasnans:
249250
raise ValueError("May not contain NaNs")
250251
if not hasattr(series, "str"):

src/ydata_profiling/model/pandas/describe_numeric_pandas.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44
import pandas as pd
55

6+
from ydata_profiling.model.var_description.default import VarDescription
67
from ydata_profiling.utils.compat import pandas_version_info
78

89
if pandas_version_info() >= (1, 5):
@@ -44,9 +45,9 @@ def numeric_stats_pandas(series: pd.Series) -> Dict[str, Any]:
4445

4546

4647
def numeric_stats_numpy(
47-
present_values: np.ndarray, series: pd.Series, series_description: Dict[str, Any]
48+
present_values: np.ndarray, series: pd.Series, series_description: VarDescription
4849
) -> Dict[str, Any]:
49-
vc = series_description["value_counts_without_nan"]
50+
vc = series_description.value_counts_without_nan
5051
index_values = vc.index.values
5152

5253
# FIXME: can be performance optimized by using weights in std, var, kurt and skew...
@@ -80,8 +81,8 @@ def numeric_stats_numpy(
8081
@series_hashable
8182
@series_handle_nulls
8283
def pandas_describe_numeric_1d(
83-
config: Settings, series: pd.Series, summary: dict
84-
) -> Tuple[Settings, pd.Series, dict]:
84+
config: Settings, series: pd.Series, summary: VarDescription
85+
) -> Tuple[Settings, pd.Series, VarDescription]:
8586
"""Describe a numeric series.
8687
8788
Args:
@@ -96,11 +97,11 @@ def pandas_describe_numeric_1d(
9697
chi_squared_threshold = config.vars.num.chi_squared_threshold
9798
quantiles = config.vars.num.quantiles
9899

99-
value_counts = summary["value_counts_without_nan"]
100+
value_counts = summary.value_counts_without_nan
100101

101102
negative_index = value_counts.index < 0
102103
summary["n_negative"] = value_counts.loc[negative_index].sum()
103-
summary["p_negative"] = summary["n_negative"] / summary["n"]
104+
summary["p_negative"] = summary["n_negative"] / summary.n
104105

105106
infinity_values = [np.inf, -np.inf]
106107
infinity_index = value_counts.index.isin(infinity_values)
@@ -139,8 +140,8 @@ def pandas_describe_numeric_1d(
139140
)
140141
stats["iqr"] = stats["75%"] - stats["25%"]
141142
stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
142-
stats["p_zeros"] = stats["n_zeros"] / summary["n"]
143-
stats["p_infinite"] = summary["n_infinite"] / summary["n"]
143+
stats["p_zeros"] = stats["n_zeros"] / summary.n
144+
stats["p_infinite"] = summary["n_infinite"] / summary.n
144145

145146
stats["monotonic_increase"] = series.is_monotonic_increasing
146147
stats["monotonic_decrease"] = series.is_monotonic_decreasing

src/ydata_profiling/model/pandas/describe_path_pandas.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from ydata_profiling.config import Settings
77
from ydata_profiling.model.summary_algorithms import describe_path_1d
8+
from ydata_profiling.model.var_description.default import VarDescription
89

910

1011
def path_summary(series: pd.Series) -> dict:
@@ -19,8 +20,9 @@ def path_summary(series: pd.Series) -> dict:
1920

2021
# TODO: optimize using value counts
2122
summary = {
22-
"common_prefix": os.path.commonprefix(series.values.tolist())
23-
or "No common prefix",
23+
"common_prefix": (
24+
os.path.commonprefix(series.values.tolist()) or "No common prefix"
25+
),
2426
"stem_counts": series.map(lambda x: os.path.splitext(x)[0]).value_counts(),
2527
"suffix_counts": series.map(lambda x: os.path.splitext(x)[1]).value_counts(),
2628
"name_counts": series.map(lambda x: os.path.basename(x)).value_counts(),
@@ -39,8 +41,8 @@ def path_summary(series: pd.Series) -> dict:
3941

4042
@describe_path_1d.register
4143
def pandas_describe_path_1d(
42-
config: Settings, series: pd.Series, summary: dict
43-
) -> Tuple[Settings, pd.Series, dict]:
44+
config: Settings, series: pd.Series, summary: VarDescription
45+
) -> Tuple[Settings, pd.Series, VarDescription]:
4446
"""Describe a path series.
4547
4648
Args:

0 commit comments

Comments
 (0)