Skip to content

Commit f6fa91c

Browse files
committed
feat: make pandas profiling work just fine
1 parent 36cb9ee commit f6fa91c

7 files changed

Lines changed: 84 additions & 74 deletions

File tree

src/ydata_profiling/model/alerts.py

Lines changed: 50 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
11
"""Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros , constant
22
values, high correlations)."""
3+
34
from enum import Enum, auto, unique
4-
from typing import Any, Dict, List, Optional, Set
5+
from typing import Dict, List, Optional, Set
56

67
import numpy as np
78
import pandas as pd
89

910
from ydata_profiling.config import Settings
1011
from ydata_profiling.model.correlations import perform_check_correlation
12+
from ydata_profiling.model.var_description.default import VarDescription
1113

1214

1315
def fmt_percent(value: float, edge_cases: bool = True) -> str:
@@ -143,13 +145,13 @@ def __repr__(self):
143145
class ConstantLengthAlert(Alert):
144146
def __init__(
145147
self,
146-
values: Optional[Dict] = None,
148+
values: VarDescription,
147149
column_name: Optional[str] = None,
148150
is_empty: bool = False,
149151
):
150152
super().__init__(
151153
alert_type=AlertType.CONSTANT_LENGTH,
152-
values=values,
154+
values=values.var_specific,
153155
column_name=column_name,
154156
fields={"composition_min_length", "composition_max_length"},
155157
is_empty=is_empty,
@@ -162,15 +164,14 @@ def _get_description(self) -> str:
162164
class ConstantAlert(Alert):
163165
def __init__(
164166
self,
165-
values: Optional[Dict] = None,
167+
values: VarDescription,
166168
column_name: Optional[str] = None,
167169
is_empty: bool = False,
168170
):
169171
super().__init__(
170172
alert_type=AlertType.CONSTANT,
171-
values=values,
173+
values={"n_distinct": values["n_distinct"]},
172174
column_name=column_name,
173-
fields={"n_distinct"},
174175
is_empty=is_empty,
175176
)
176177

@@ -181,7 +182,7 @@ def _get_description(self) -> str:
181182
class DuplicatesAlert(Alert):
182183
def __init__(
183184
self,
184-
values: Optional[Dict] = None,
185+
values: dict,
185186
column_name: Optional[str] = None,
186187
is_empty: bool = False,
187188
):
@@ -203,15 +204,14 @@ def _get_description(self) -> str:
203204
class EmptyAlert(Alert):
204205
def __init__(
205206
self,
206-
values: Optional[Dict] = None,
207+
values: VarDescription,
207208
column_name: Optional[str] = None,
208209
is_empty: bool = False,
209210
):
210211
super().__init__(
211212
alert_type=AlertType.EMPTY,
212-
values=values,
213+
values={"n": values.n},
213214
column_name=column_name,
214-
fields={"n"},
215215
is_empty=is_empty,
216216
)
217217

@@ -222,15 +222,14 @@ def _get_description(self) -> str:
222222
class HighCardinalityAlert(Alert):
223223
def __init__(
224224
self,
225-
values: Optional[Dict] = None,
225+
values: VarDescription,
226226
column_name: Optional[str] = None,
227227
is_empty: bool = False,
228228
):
229229
super().__init__(
230230
alert_type=AlertType.HIGH_CARDINALITY,
231-
values=values,
231+
values={"n_distinct": values["n_distinct"]},
232232
column_name=column_name,
233-
fields={"n_distinct"},
234233
is_empty=is_empty,
235234
)
236235

@@ -244,7 +243,7 @@ def _get_description(self) -> str:
244243
class HighCorrelationAlert(Alert):
245244
def __init__(
246245
self,
247-
values: Optional[Dict] = None,
246+
values: Dict,
248247
column_name: Optional[str] = None,
249248
is_empty: bool = False,
250249
):
@@ -270,13 +269,13 @@ def _get_description(self) -> str:
270269
class ImbalanceAlert(Alert):
271270
def __init__(
272271
self,
273-
values: Optional[Dict] = None,
272+
values: VarDescription,
274273
column_name: Optional[str] = None,
275274
is_empty: bool = False,
276275
):
277276
super().__init__(
278277
alert_type=AlertType.IMBALANCE,
279-
values=values,
278+
values=values.var_specific,
280279
column_name=column_name,
281280
fields={"imbalance"},
282281
is_empty=is_empty,
@@ -293,13 +292,13 @@ def _get_description(self) -> str:
293292
class InfiniteAlert(Alert):
294293
def __init__(
295294
self,
296-
values: Optional[Dict] = None,
295+
values: VarDescription,
297296
column_name: Optional[str] = None,
298297
is_empty: bool = False,
299298
):
300299
super().__init__(
301300
alert_type=AlertType.INFINITE,
302-
values=values,
301+
values=values.var_specific,
303302
column_name=column_name,
304303
fields={"p_infinite", "n_infinite"},
305304
is_empty=is_empty,
@@ -315,15 +314,14 @@ def _get_description(self) -> str:
315314
class MissingAlert(Alert):
316315
def __init__(
317316
self,
318-
values: Optional[Dict] = None,
317+
values: VarDescription,
319318
column_name: Optional[str] = None,
320319
is_empty: bool = False,
321320
):
322321
super().__init__(
323322
alert_type=AlertType.MISSING,
324-
values=values,
323+
values={"p_missing": values.p_missing, "n_missing": values.n_missing},
325324
column_name=column_name,
326-
fields={"p_missing", "n_missing"},
327325
is_empty=is_empty,
328326
)
329327

@@ -373,13 +371,13 @@ def _get_description(self) -> str:
373371
class SkewedAlert(Alert):
374372
def __init__(
375373
self,
376-
values: Optional[Dict] = None,
374+
values: VarDescription,
377375
column_name: Optional[str] = None,
378376
is_empty: bool = False,
379377
):
380378
super().__init__(
381379
alert_type=AlertType.SKEWED,
382-
values=values,
380+
values=values.var_specific,
383381
column_name=column_name,
384382
fields={"skewness"},
385383
is_empty=is_empty,
@@ -432,15 +430,19 @@ def _get_description(self) -> str:
432430
class UniqueAlert(Alert):
433431
def __init__(
434432
self,
435-
values: Optional[Dict] = None,
433+
values: VarDescription,
436434
column_name: Optional[str] = None,
437435
is_empty: bool = False,
438436
):
439437
super().__init__(
440438
alert_type=AlertType.UNIQUE,
441-
values=values,
439+
values={
440+
"n_distinct": values["n_distinct"],
441+
"p_distinct": values["p_distinct"],
442+
"n_unique": values["n_unique"],
443+
"p_unique": values["p_unique"],
444+
},
442445
column_name=column_name,
443-
fields={"n_distinct", "p_distinct", "n_unique", "p_unique"},
444446
is_empty=is_empty,
445447
)
446448

@@ -469,13 +471,13 @@ def _get_description(self) -> str:
469471
class ZerosAlert(Alert):
470472
def __init__(
471473
self,
472-
values: Optional[Dict] = None,
474+
values: VarDescription,
473475
column_name: Optional[str] = None,
474476
is_empty: bool = False,
475477
):
476478
super().__init__(
477479
alert_type=AlertType.ZEROS,
478-
values=values,
480+
values=values.var_specific,
479481
column_name=column_name,
480482
fields={"n_zeros", "p_zeros"},
481483
is_empty=is_empty,
@@ -531,7 +533,7 @@ def check_table_alerts(table: dict) -> List[Alert]:
531533
return alerts
532534

533535

534-
def numeric_alerts(config: Settings, summary: dict) -> List[Alert]:
536+
def numeric_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
535537
alerts: List[Alert] = []
536538

537539
# Skewness
@@ -555,7 +557,7 @@ def numeric_alerts(config: Settings, summary: dict) -> List[Alert]:
555557
return alerts
556558

557559

558-
def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]:
560+
def timeseries_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
559561
alerts: List[Alert] = numeric_alerts(config, summary)
560562

561563
if not summary["stationary"]:
@@ -567,7 +569,7 @@ def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]:
567569
return alerts
568570

569571

570-
def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
572+
def categorical_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
571573
alerts: List[Alert] = []
572574

573575
# High cardinality
@@ -585,7 +587,7 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
585587

586588
# Constant length
587589
if "composition" in summary and summary["min_length"] == summary["max_length"]:
588-
alerts.append(ConstantLengthAlert())
590+
alerts.append(ConstantLengthAlert(summary))
589591

590592
# Imbalance
591593
if (
@@ -596,46 +598,48 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
596598
return alerts
597599

598600

599-
def boolean_alerts(config: Settings, summary: dict) -> List[Alert]:
601+
def boolean_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
600602
alerts: List[Alert] = []
601603

602604
if (
603605
"imbalance" in summary
604606
and summary["imbalance"] > config.vars.bool.imbalance_threshold
605607
):
606-
alerts.append(ImbalanceAlert())
608+
alerts.append(ImbalanceAlert(summary))
607609
return alerts
608610

609611

610-
def generic_alerts(summary: dict) -> List[Alert]:
612+
def generic_alerts(summary: VarDescription) -> List[Alert]:
611613
alerts: List[Alert] = []
612614

613615
# Missing
614-
if alert_value(summary["p_missing"]):
615-
alerts.append(MissingAlert())
616+
if alert_value(summary.p_missing):
617+
alerts.append(MissingAlert(summary))
616618

617619
return alerts
618620

619621

620-
def supported_alerts(summary: dict) -> List[Alert]:
622+
def supported_alerts(summary: VarDescription) -> List[Alert]:
621623
alerts: List[Alert] = []
622624

623-
if summary.get("n_distinct", np.nan) == summary["n"]:
624-
alerts.append(UniqueAlert())
625+
if summary.get("n_distinct", np.nan) == summary.n:
626+
alerts.append(UniqueAlert(summary))
625627
if summary.get("n_distinct", np.nan) == 1:
626628
alerts.append(ConstantAlert(summary))
627629
return alerts
628630

629631

630-
def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]:
632+
def unsupported_alerts(summary: VarDescription) -> List[Alert]:
631633
alerts: List[Alert] = [
632634
UnsupportedAlert(),
633635
RejectedAlert(),
634636
]
635637
return alerts
636638

637639

638-
def check_variable_alerts(config: Settings, col: str, description: dict) -> List[Alert]:
640+
def check_variable_alerts(
641+
config: Settings, col: str, description: VarDescription
642+
) -> List[Alert]:
639643
"""Checks individual variables for alerts.
640644
641645
Args:
@@ -665,7 +669,6 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List
665669

666670
for idx in range(len(alerts)):
667671
alerts[idx].column_name = col
668-
alerts[idx].values = description
669672
return alerts
670673

671674

@@ -693,7 +696,10 @@ def check_correlation_alerts(config: Settings, correlations: dict) -> List[Alert
693696

694697

695698
def get_alerts(
696-
config: Settings, table_stats: dict, series_description: dict, correlations: dict
699+
config: Settings,
700+
table_stats: dict,
701+
series_description: dict[str, VarDescription],
702+
correlations: dict,
697703
) -> List[Alert]:
698704
alerts: List[Alert] = check_table_alerts(table_stats)
699705
for col, description in series_description.items():

src/ydata_profiling/model/describe.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Organize the calculation of statistics for each series in this DataFrame."""
2+
23
from datetime import datetime
34
from typing import Any, Dict, Optional
45

@@ -23,6 +24,7 @@
2324
from ydata_profiling.model.summary import get_series_descriptions
2425
from ydata_profiling.model.table import get_table_stats
2526
from ydata_profiling.model.timeseries_index import get_time_index_description
27+
from ydata_profiling.model.var_description.default import VarDescription
2628
from ydata_profiling.utils.progress_bar import progress
2729
from ydata_profiling.version import __version__
2830

@@ -71,7 +73,7 @@ def describe(
7173

7274
# Variable-specific
7375
pbar.total += len(df.columns)
74-
series_description = get_series_descriptions(
76+
series_description: dict[str, VarDescription] = get_series_descriptions(
7577
config, df, summarizer, typeset, pbar
7678
)
7779

src/ydata_profiling/model/description.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
from pandas import Timedelta
66

7+
from ydata_profiling.model.var_description.default import VarDescription
8+
79

810
@dataclass
911
class BaseAnalysis:
@@ -98,7 +100,7 @@ class BaseDescription:
98100
analysis: BaseAnalysis
99101
time_index_analysis: Optional[TimeIndexAnalysis]
100102
table: Any
101-
variables: Dict[str, Any]
103+
variables: Dict[str, VarDescription]
102104
scatter: Any
103105
correlations: Dict[str, Any]
104106
missing: Dict[str, Any]

0 commit comments

Comments
 (0)