Skip to content

Commit ec78726

Browse files
committed
update series description. replace dict with class
1 parent 36e2fa7 commit ec78726

28 files changed

Lines changed: 395 additions & 192 deletions

src/ydata_profiling/model/alerts.py

Lines changed: 50 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
"""Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros, constant
22
values, high correlations)."""
33
from enum import Enum, auto, unique
4-
from typing import Any, Dict, List, Optional, Set
4+
from typing import Dict, List, Optional, Set
55

66
import numpy as np
77
import pandas as pd
88

99
from ydata_profiling.config import Settings
1010
from ydata_profiling.model.correlations import perform_check_correlation
11+
from ydata_profiling.model.var_description.default import VarDescription
1112

1213

1314
def fmt_percent(value: float, edge_cases: bool = True) -> str:
@@ -143,13 +144,13 @@ def __repr__(self):
143144
class ConstantLengthAlert(Alert):
144145
def __init__(
145146
self,
146-
values: Optional[Dict] = None,
147+
values: VarDescription,
147148
column_name: Optional[str] = None,
148149
is_empty: bool = False,
149150
):
150151
super().__init__(
151152
alert_type=AlertType.CONSTANT_LENGTH,
152-
values=values,
153+
values=values.var_specific,
153154
column_name=column_name,
154155
fields={"composition_min_length", "composition_max_length"},
155156
is_empty=is_empty,
@@ -162,15 +163,14 @@ def _get_description(self) -> str:
162163
class ConstantAlert(Alert):
163164
def __init__(
164165
self,
165-
values: Optional[Dict] = None,
166+
values: VarDescription,
166167
column_name: Optional[str] = None,
167168
is_empty: bool = False,
168169
):
169170
super().__init__(
170171
alert_type=AlertType.CONSTANT,
171-
values=values,
172+
values={"n_distinct": values.n_distinct},
172173
column_name=column_name,
173-
fields={"n_distinct"},
174174
is_empty=is_empty,
175175
)
176176

@@ -181,7 +181,7 @@ def _get_description(self) -> str:
181181
class DuplicatesAlert(Alert):
182182
def __init__(
183183
self,
184-
values: Optional[Dict] = None,
184+
values: dict,
185185
column_name: Optional[str] = None,
186186
is_empty: bool = False,
187187
):
@@ -203,15 +203,14 @@ def _get_description(self) -> str:
203203
class EmptyAlert(Alert):
204204
def __init__(
205205
self,
206-
values: Optional[Dict] = None,
206+
values: VarDescription,
207207
column_name: Optional[str] = None,
208208
is_empty: bool = False,
209209
):
210210
super().__init__(
211211
alert_type=AlertType.EMPTY,
212-
values=values,
212+
values={"n": values.n},
213213
column_name=column_name,
214-
fields={"n"},
215214
is_empty=is_empty,
216215
)
217216

@@ -222,15 +221,14 @@ def _get_description(self) -> str:
222221
class HighCardinalityAlert(Alert):
223222
def __init__(
224223
self,
225-
values: Optional[Dict] = None,
224+
values: VarDescription,
226225
column_name: Optional[str] = None,
227226
is_empty: bool = False,
228227
):
229228
super().__init__(
230229
alert_type=AlertType.HIGH_CARDINALITY,
231-
values=values,
230+
values={"n_distinct": values.n_distinct},
232231
column_name=column_name,
233-
fields={"n_distinct"},
234232
is_empty=is_empty,
235233
)
236234

@@ -244,7 +242,7 @@ def _get_description(self) -> str:
244242
class HighCorrelationAlert(Alert):
245243
def __init__(
246244
self,
247-
values: Optional[Dict] = None,
245+
values: Dict,
248246
column_name: Optional[str] = None,
249247
is_empty: bool = False,
250248
):
@@ -270,13 +268,13 @@ def _get_description(self) -> str:
270268
class ImbalanceAlert(Alert):
271269
def __init__(
272270
self,
273-
values: Optional[Dict] = None,
271+
values: VarDescription,
274272
column_name: Optional[str] = None,
275273
is_empty: bool = False,
276274
):
277275
super().__init__(
278276
alert_type=AlertType.IMBALANCE,
279-
values=values,
277+
values=values.var_specific,
280278
column_name=column_name,
281279
fields={"imbalance"},
282280
is_empty=is_empty,
@@ -293,13 +291,13 @@ def _get_description(self) -> str:
293291
class InfiniteAlert(Alert):
294292
def __init__(
295293
self,
296-
values: Optional[Dict] = None,
294+
values: VarDescription,
297295
column_name: Optional[str] = None,
298296
is_empty: bool = False,
299297
):
300298
super().__init__(
301299
alert_type=AlertType.INFINITE,
302-
values=values,
300+
values=values.var_specific,
303301
column_name=column_name,
304302
fields={"p_infinite", "n_infinite"},
305303
is_empty=is_empty,
@@ -315,15 +313,14 @@ def _get_description(self) -> str:
315313
class MissingAlert(Alert):
316314
def __init__(
317315
self,
318-
values: Optional[Dict] = None,
316+
values: VarDescription,
319317
column_name: Optional[str] = None,
320318
is_empty: bool = False,
321319
):
322320
super().__init__(
323321
alert_type=AlertType.MISSING,
324-
values=values,
322+
values={"p_missing": values.p_missing, "n_missing": values.n_missing},
325323
column_name=column_name,
326-
fields={"p_missing", "n_missing"},
327324
is_empty=is_empty,
328325
)
329326

@@ -373,13 +370,13 @@ def _get_description(self) -> str:
373370
class SkewedAlert(Alert):
374371
def __init__(
375372
self,
376-
values: Optional[Dict] = None,
373+
values: VarDescription,
377374
column_name: Optional[str] = None,
378375
is_empty: bool = False,
379376
):
380377
super().__init__(
381378
alert_type=AlertType.SKEWED,
382-
values=values,
379+
values=values.var_specific,
383380
column_name=column_name,
384381
fields={"skewness"},
385382
is_empty=is_empty,
@@ -432,15 +429,19 @@ def _get_description(self) -> str:
432429
class UniqueAlert(Alert):
433430
def __init__(
434431
self,
435-
values: Optional[Dict] = None,
432+
values: VarDescription,
436433
column_name: Optional[str] = None,
437434
is_empty: bool = False,
438435
):
439436
super().__init__(
440437
alert_type=AlertType.UNIQUE,
441-
values=values,
438+
values={
439+
"n_distinct": values.n_distinct,
440+
"p_distinct": values.p_distinct,
441+
"n_unique": values.n_unique,
442+
"p_unique": values.p_unique,
443+
},
442444
column_name=column_name,
443-
fields={"n_distinct", "p_distinct", "n_unique", "p_unique"},
444445
is_empty=is_empty,
445446
)
446447

@@ -469,13 +470,13 @@ def _get_description(self) -> str:
469470
class ZerosAlert(Alert):
470471
def __init__(
471472
self,
472-
values: Optional[Dict] = None,
473+
values: VarDescription,
473474
column_name: Optional[str] = None,
474475
is_empty: bool = False,
475476
):
476477
super().__init__(
477478
alert_type=AlertType.ZEROS,
478-
values=values,
479+
values=values.var_specific,
479480
column_name=column_name,
480481
fields={"n_zeros", "p_zeros"},
481482
is_empty=is_empty,
@@ -531,7 +532,7 @@ def check_table_alerts(table: dict) -> List[Alert]:
531532
return alerts
532533

533534

534-
def numeric_alerts(config: Settings, summary: dict) -> List[Alert]:
535+
def numeric_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
535536
alerts: List[Alert] = []
536537

537538
# Skewness
@@ -555,7 +556,7 @@ def numeric_alerts(config: Settings, summary: dict) -> List[Alert]:
555556
return alerts
556557

557558

558-
def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]:
559+
def timeseries_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
559560
alerts: List[Alert] = numeric_alerts(config, summary)
560561

561562
if not summary["stationary"]:
@@ -567,7 +568,7 @@ def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]:
567568
return alerts
568569

569570

570-
def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
571+
def categorical_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
571572
alerts: List[Alert] = []
572573

573574
# High cardinality
@@ -585,7 +586,7 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
585586

586587
# Constant length
587588
if "composition" in summary and summary["min_length"] == summary["max_length"]:
588-
alerts.append(ConstantLengthAlert())
589+
alerts.append(ConstantLengthAlert(summary))
589590

590591
# Imbalance
591592
if (
@@ -596,46 +597,48 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
596597
return alerts
597598

598599

599-
def boolean_alerts(config: Settings, summary: dict) -> List[Alert]:
600+
def boolean_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
600601
alerts: List[Alert] = []
601602

602603
if (
603604
"imbalance" in summary
604605
and summary["imbalance"] > config.vars.bool.imbalance_threshold
605606
):
606-
alerts.append(ImbalanceAlert())
607+
alerts.append(ImbalanceAlert(summary))
607608
return alerts
608609

609610

610-
def generic_alerts(summary: dict) -> List[Alert]:
611+
def generic_alerts(summary: VarDescription) -> List[Alert]:
611612
alerts: List[Alert] = []
612613

613614
# Missing
614-
if alert_value(summary["p_missing"]):
615-
alerts.append(MissingAlert())
615+
if alert_value(summary.p_missing):
616+
alerts.append(MissingAlert(summary))
616617

617618
return alerts
618619

619620

620-
def supported_alerts(summary: dict) -> List[Alert]:
621+
def supported_alerts(summary: VarDescription) -> List[Alert]:
621622
alerts: List[Alert] = []
622623

623-
if summary.get("n_distinct", np.nan) == summary["n"]:
624-
alerts.append(UniqueAlert())
625-
if summary.get("n_distinct", np.nan) == 1:
624+
if summary.n_distinct == summary.n:
625+
alerts.append(UniqueAlert(summary))
626+
if summary.n_distinct == 1:
626627
alerts.append(ConstantAlert(summary))
627628
return alerts
628629

629630

630-
def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]:
631+
def unsupported_alerts(summary: VarDescription) -> List[Alert]:
631632
alerts: List[Alert] = [
632633
UnsupportedAlert(),
633634
RejectedAlert(),
634635
]
635636
return alerts
636637

637638

638-
def check_variable_alerts(config: Settings, col: str, description: dict) -> List[Alert]:
639+
def check_variable_alerts(
640+
config: Settings, col: str, description: VarDescription
641+
) -> List[Alert]:
639642
"""Checks individual variables for alerts.
640643
641644
Args:
@@ -665,7 +668,6 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List
665668

666669
for idx in range(len(alerts)):
667670
alerts[idx].column_name = col
668-
alerts[idx].values = description
669671
return alerts
670672

671673

@@ -693,7 +695,10 @@ def check_correlation_alerts(config: Settings, correlations: dict) -> List[Alert
693695

694696

695697
def get_alerts(
696-
config: Settings, table_stats: dict, series_description: dict, correlations: dict
698+
config: Settings,
699+
table_stats: dict,
700+
series_description: dict[str, VarDescription],
701+
correlations: dict,
697702
) -> List[Alert]:
698703
alerts: List[Alert] = check_table_alerts(table_stats)
699704
for col, description in series_description.items():

src/ydata_profiling/model/describe.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from ydata_profiling.model.summary import get_series_descriptions
2424
from ydata_profiling.model.table import get_table_stats
2525
from ydata_profiling.model.timeseries_index import get_time_index_description
26+
from ydata_profiling.model.var_description.default import VarDescription
2627
from ydata_profiling.utils.progress_bar import progress
2728
from ydata_profiling.version import __version__
2829

@@ -71,7 +72,7 @@ def describe(
7172

7273
# Variable-specific
7374
pbar.total += len(df.columns)
74-
series_description = get_series_descriptions(
75+
series_description: dict[str, VarDescription] = get_series_descriptions(
7576
config, df, summarizer, typeset, pbar
7677
)
7778

src/ydata_profiling/model/description.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from datetime import datetime, timedelta
33
from typing import Any, Dict, List, Optional, Union
44

5+
from ydata_profiling.model.var_description.default import VarDescription
6+
57

68
@dataclass
79
class BaseAnalysis:
@@ -96,7 +98,7 @@ class BaseDescription:
9698
analysis: BaseAnalysis
9799
time_index_analysis: Optional[TimeIndexAnalysis]
98100
table: Any
99-
variables: Dict[str, Any]
101+
variables: Dict[str, VarDescription]
100102
scatter: Any
101103
correlations: Dict[str, Any]
102104
missing: Dict[str, Any]

0 commit comments

Comments
 (0)