"""Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros, constant
values, high correlations)."""
3+
34from enum import Enum , auto , unique
4- from typing import Any , Dict , List , Optional , Set
5+ from typing import Dict , List , Optional , Set
56
67import numpy as np
78import pandas as pd
89
910from ydata_profiling .config import Settings
1011from ydata_profiling .model .correlations import perform_check_correlation
12+ from ydata_profiling .model .var_description .default import VarDescription
1113
1214
1315def fmt_percent (value : float , edge_cases : bool = True ) -> str :
@@ -143,13 +145,13 @@ def __repr__(self):
143145class ConstantLengthAlert (Alert ):
144146 def __init__ (
145147 self ,
146- values : Optional [ Dict ] = None ,
148+ values : VarDescription ,
147149 column_name : Optional [str ] = None ,
148150 is_empty : bool = False ,
149151 ):
150152 super ().__init__ (
151153 alert_type = AlertType .CONSTANT_LENGTH ,
152- values = values ,
154+ values = values . var_specific ,
153155 column_name = column_name ,
154156 fields = {"composition_min_length" , "composition_max_length" },
155157 is_empty = is_empty ,
@@ -162,15 +164,14 @@ def _get_description(self) -> str:
162164class ConstantAlert (Alert ):
163165 def __init__ (
164166 self ,
165- values : Optional [ Dict ] = None ,
167+ values : VarDescription ,
166168 column_name : Optional [str ] = None ,
167169 is_empty : bool = False ,
168170 ):
169171 super ().__init__ (
170172 alert_type = AlertType .CONSTANT ,
171- values = values ,
173+ values = { "n_distinct" : values [ "n_distinct" ]} ,
172174 column_name = column_name ,
173- fields = {"n_distinct" },
174175 is_empty = is_empty ,
175176 )
176177
@@ -181,7 +182,7 @@ def _get_description(self) -> str:
181182class DuplicatesAlert (Alert ):
182183 def __init__ (
183184 self ,
184- values : Optional [ Dict ] = None ,
185+ values : dict ,
185186 column_name : Optional [str ] = None ,
186187 is_empty : bool = False ,
187188 ):
@@ -203,15 +204,14 @@ def _get_description(self) -> str:
203204class EmptyAlert (Alert ):
204205 def __init__ (
205206 self ,
206- values : Optional [ Dict ] = None ,
207+ values : VarDescription ,
207208 column_name : Optional [str ] = None ,
208209 is_empty : bool = False ,
209210 ):
210211 super ().__init__ (
211212 alert_type = AlertType .EMPTY ,
212- values = values ,
213+ values = { "n" : values . n } ,
213214 column_name = column_name ,
214- fields = {"n" },
215215 is_empty = is_empty ,
216216 )
217217
@@ -222,15 +222,14 @@ def _get_description(self) -> str:
222222class HighCardinalityAlert (Alert ):
223223 def __init__ (
224224 self ,
225- values : Optional [ Dict ] = None ,
225+ values : VarDescription ,
226226 column_name : Optional [str ] = None ,
227227 is_empty : bool = False ,
228228 ):
229229 super ().__init__ (
230230 alert_type = AlertType .HIGH_CARDINALITY ,
231- values = values ,
231+ values = { "n_distinct" : values [ "n_distinct" ]} ,
232232 column_name = column_name ,
233- fields = {"n_distinct" },
234233 is_empty = is_empty ,
235234 )
236235
@@ -244,7 +243,7 @@ def _get_description(self) -> str:
244243class HighCorrelationAlert (Alert ):
245244 def __init__ (
246245 self ,
247- values : Optional [ Dict ] = None ,
246+ values : Dict ,
248247 column_name : Optional [str ] = None ,
249248 is_empty : bool = False ,
250249 ):
@@ -270,13 +269,13 @@ def _get_description(self) -> str:
270269class ImbalanceAlert (Alert ):
271270 def __init__ (
272271 self ,
273- values : Optional [ Dict ] = None ,
272+ values : VarDescription ,
274273 column_name : Optional [str ] = None ,
275274 is_empty : bool = False ,
276275 ):
277276 super ().__init__ (
278277 alert_type = AlertType .IMBALANCE ,
279- values = values ,
278+ values = values . var_specific ,
280279 column_name = column_name ,
281280 fields = {"imbalance" },
282281 is_empty = is_empty ,
@@ -293,13 +292,13 @@ def _get_description(self) -> str:
293292class InfiniteAlert (Alert ):
294293 def __init__ (
295294 self ,
296- values : Optional [ Dict ] = None ,
295+ values : VarDescription ,
297296 column_name : Optional [str ] = None ,
298297 is_empty : bool = False ,
299298 ):
300299 super ().__init__ (
301300 alert_type = AlertType .INFINITE ,
302- values = values ,
301+ values = values . var_specific ,
303302 column_name = column_name ,
304303 fields = {"p_infinite" , "n_infinite" },
305304 is_empty = is_empty ,
@@ -315,15 +314,14 @@ def _get_description(self) -> str:
315314class MissingAlert (Alert ):
316315 def __init__ (
317316 self ,
318- values : Optional [ Dict ] = None ,
317+ values : VarDescription ,
319318 column_name : Optional [str ] = None ,
320319 is_empty : bool = False ,
321320 ):
322321 super ().__init__ (
323322 alert_type = AlertType .MISSING ,
324- values = values ,
323+ values = { "p_missing" : values . p_missing , "n_missing" : values . n_missing } ,
325324 column_name = column_name ,
326- fields = {"p_missing" , "n_missing" },
327325 is_empty = is_empty ,
328326 )
329327
@@ -373,13 +371,13 @@ def _get_description(self) -> str:
373371class SkewedAlert (Alert ):
374372 def __init__ (
375373 self ,
376- values : Optional [ Dict ] = None ,
374+ values : VarDescription ,
377375 column_name : Optional [str ] = None ,
378376 is_empty : bool = False ,
379377 ):
380378 super ().__init__ (
381379 alert_type = AlertType .SKEWED ,
382- values = values ,
380+ values = values . var_specific ,
383381 column_name = column_name ,
384382 fields = {"skewness" },
385383 is_empty = is_empty ,
@@ -432,15 +430,19 @@ def _get_description(self) -> str:
432430class UniqueAlert (Alert ):
433431 def __init__ (
434432 self ,
435- values : Optional [ Dict ] = None ,
433+ values : VarDescription ,
436434 column_name : Optional [str ] = None ,
437435 is_empty : bool = False ,
438436 ):
439437 super ().__init__ (
440438 alert_type = AlertType .UNIQUE ,
441- values = values ,
439+ values = {
440+ "n_distinct" : values ["n_distinct" ],
441+ "p_distinct" : values ["p_distinct" ],
442+ "n_unique" : values ["n_unique" ],
443+ "p_unique" : values ["p_unique" ],
444+ },
442445 column_name = column_name ,
443- fields = {"n_distinct" , "p_distinct" , "n_unique" , "p_unique" },
444446 is_empty = is_empty ,
445447 )
446448
@@ -469,13 +471,13 @@ def _get_description(self) -> str:
469471class ZerosAlert (Alert ):
470472 def __init__ (
471473 self ,
472- values : Optional [ Dict ] = None ,
474+ values : VarDescription ,
473475 column_name : Optional [str ] = None ,
474476 is_empty : bool = False ,
475477 ):
476478 super ().__init__ (
477479 alert_type = AlertType .ZEROS ,
478- values = values ,
480+ values = values . var_specific ,
479481 column_name = column_name ,
480482 fields = {"n_zeros" , "p_zeros" },
481483 is_empty = is_empty ,
@@ -531,7 +533,7 @@ def check_table_alerts(table: dict) -> List[Alert]:
531533 return alerts
532534
533535
534- def numeric_alerts (config : Settings , summary : dict ) -> List [Alert ]:
536+ def numeric_alerts (config : Settings , summary : VarDescription ) -> List [Alert ]:
535537 alerts : List [Alert ] = []
536538
537539 # Skewness
@@ -555,7 +557,7 @@ def numeric_alerts(config: Settings, summary: dict) -> List[Alert]:
555557 return alerts
556558
557559
558- def timeseries_alerts (config : Settings , summary : dict ) -> List [Alert ]:
560+ def timeseries_alerts (config : Settings , summary : VarDescription ) -> List [Alert ]:
559561 alerts : List [Alert ] = numeric_alerts (config , summary )
560562
561563 if not summary ["stationary" ]:
@@ -567,7 +569,7 @@ def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]:
567569 return alerts
568570
569571
570- def categorical_alerts (config : Settings , summary : dict ) -> List [Alert ]:
572+ def categorical_alerts (config : Settings , summary : VarDescription ) -> List [Alert ]:
571573 alerts : List [Alert ] = []
572574
573575 # High cardinality
@@ -585,7 +587,7 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
585587
586588 # Constant length
587589 if "composition" in summary and summary ["min_length" ] == summary ["max_length" ]:
588- alerts .append (ConstantLengthAlert ())
590+ alerts .append (ConstantLengthAlert (summary ))
589591
590592 # Imbalance
591593 if (
@@ -596,46 +598,48 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
596598 return alerts
597599
598600
def boolean_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
    """Collect the alerts that apply to a boolean variable.

    Args:
        config: report settings; ``config.vars.bool.imbalance_threshold`` is
            read when the description has an ``"imbalance"`` entry.
        summary: description of the variable being checked.

    Returns:
        A list containing one ``ImbalanceAlert`` when ``summary`` has an
        ``"imbalance"`` entry above the configured threshold, otherwise an
        empty list.
    """
    alerts: List[Alert] = []

    if (
        "imbalance" in summary
        and summary["imbalance"] > config.vars.bool.imbalance_threshold
    ):
        alerts.append(ImbalanceAlert(summary))
    return alerts
608610
609611
def generic_alerts(summary: VarDescription) -> List[Alert]:
    """Collect the alerts that are checked regardless of variable type.

    Args:
        summary: description of the variable being checked; its ``p_missing``
            attribute is passed to ``alert_value``.

    Returns:
        A list containing one ``MissingAlert`` when ``alert_value`` is truthy
        for ``summary.p_missing``, otherwise an empty list.
    """
    alerts: List[Alert] = []

    # Missing
    if alert_value(summary.p_missing):
        alerts.append(MissingAlert(summary))

    return alerts
618620
619621
def supported_alerts(summary: VarDescription) -> List[Alert]:
    """Collect uniqueness/constancy alerts for a supported variable.

    Args:
        summary: description of the variable being checked.

    Returns:
        A list that contains a ``UniqueAlert`` when the distinct count equals
        ``summary.n`` and a ``ConstantAlert`` when the distinct count is 1
        (both when ``summary.n`` is 1); empty otherwise.
    """
    alerts: List[Alert] = []

    # Look the statistic up once. NaN is the fallback because it compares
    # unequal to everything, so a missing "n_distinct" raises no alert.
    n_distinct = summary.get("n_distinct", np.nan)

    if n_distinct == summary.n:
        alerts.append(UniqueAlert(summary))
    if n_distinct == 1:
        alerts.append(ConstantAlert(summary))
    return alerts
628630
629631
def unsupported_alerts(summary: VarDescription) -> List[Alert]:
    """Return the fixed pair of alerts raised for an unsupported variable.

    Args:
        summary: description of the variable; not read by this function.

    Returns:
        A new list with one ``UnsupportedAlert`` followed by one
        ``RejectedAlert``.
    """
    alerts: List[Alert] = [
        UnsupportedAlert(),
        RejectedAlert(),
    ]
    return alerts
636638
637639
638- def check_variable_alerts (config : Settings , col : str , description : dict ) -> List [Alert ]:
640+ def check_variable_alerts (
641+ config : Settings , col : str , description : VarDescription
642+ ) -> List [Alert ]:
639643 """Checks individual variables for alerts.
640644
641645 Args:
@@ -665,7 +669,6 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List
665669
666670 for idx in range (len (alerts )):
667671 alerts [idx ].column_name = col
668- alerts [idx ].values = description
669672 return alerts
670673
671674
@@ -693,7 +696,10 @@ def check_correlation_alerts(config: Settings, correlations: dict) -> List[Alert
693696
694697
695698def get_alerts (
696- config : Settings , table_stats : dict , series_description : dict , correlations : dict
699+ config : Settings ,
700+ table_stats : dict ,
701+ series_description : dict [str , VarDescription ],
702+ correlations : dict ,
697703) -> List [Alert ]:
698704 alerts : List [Alert ] = check_table_alerts (table_stats )
699705 for col , description in series_description .items ():
0 commit comments