diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index 1b16d27a0..698801389 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -9,6 +9,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.correlations import perform_check_correlation +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.styles import get_alert_styles @@ -157,13 +158,13 @@ def __repr__(self): class ConstantLengthAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.CONSTANT_LENGTH, - values=values, + values=values.var_specific, column_name=column_name, fields={"composition_min_length", "composition_max_length"}, is_empty=is_empty, @@ -176,15 +177,17 @@ def _get_description(self) -> str: class ConstantAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.CONSTANT, - values=values, + values={ + "n_distinct": values["n_distinct"], + "value_counts_without_nan": values.value_counts_without_nan, + }, column_name=column_name, - fields={"n_distinct"}, is_empty=is_empty, ) @@ -195,7 +198,7 @@ def _get_description(self) -> str: class DuplicatesAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: dict, column_name: Optional[str] = None, is_empty: bool = False, ): @@ -239,15 +242,14 @@ def _get_description(self) -> str: class EmptyAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.EMPTY, - values=values, + values={"n": values.n}, column_name=column_name, - fields={"n"}, is_empty=is_empty, ) @@ -258,15 +260,14 @@ def 
_get_description(self) -> str: class HighCardinalityAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.HIGH_CARDINALITY, - values=values, + values={"n_distinct": values["n_distinct"]}, column_name=column_name, - fields={"n_distinct"}, is_empty=is_empty, ) @@ -302,7 +303,7 @@ def _get_description(self) -> str: class HighCorrelationAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: Dict, column_name: Optional[str] = None, is_empty: bool = False, ): @@ -328,13 +329,13 @@ def _get_description(self) -> str: class ImbalanceAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.IMBALANCE, - values=values, + values=values.var_specific, column_name=column_name, fields={"imbalance"}, is_empty=is_empty, @@ -351,13 +352,13 @@ def _get_description(self) -> str: class InfiniteAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.INFINITE, - values=values, + values=values.var_specific, column_name=column_name, fields={"p_infinite", "n_infinite"}, is_empty=is_empty, @@ -373,15 +374,14 @@ def _get_description(self) -> str: class MissingAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.MISSING, - values=values, + values={"p_missing": values.p_missing, "n_missing": values.n_missing}, column_name=column_name, - fields={"p_missing", "n_missing"}, is_empty=is_empty, ) @@ -431,13 +431,13 @@ def _get_description(self) -> str: class SkewedAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + 
values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.SKEWED, - values=values, + values=values.var_specific, column_name=column_name, fields={"skewness"}, is_empty=is_empty, @@ -490,15 +490,19 @@ def _get_description(self) -> str: class UniqueAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.UNIQUE, - values=values, + values={ + "n_distinct": values["n_distinct"], + "p_distinct": values["p_distinct"], + "n_unique": values["n_unique"], + "p_unique": values["p_unique"], + }, column_name=column_name, - fields={"n_distinct", "p_distinct", "n_unique", "p_unique"}, is_empty=is_empty, ) @@ -527,13 +531,13 @@ def _get_description(self) -> str: class ZerosAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.ZEROS, - values=values, + values=values.var_specific, column_name=column_name, fields={"n_zeros", "p_zeros"}, is_empty=is_empty, @@ -589,7 +593,7 @@ def check_table_alerts(table: dict) -> List[Alert]: return alerts -def numeric_alerts(config: Settings, summary: dict) -> List[Alert]: +def numeric_alerts(config: Settings, summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] # Skewness @@ -613,7 +617,7 @@ def numeric_alerts(config: Settings, summary: dict) -> List[Alert]: return alerts -def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]: +def timeseries_alerts(config: Settings, summary: VarDescription) -> List[Alert]: alerts: List[Alert] = numeric_alerts(config, summary) if not summary["stationary"]: @@ -625,7 +629,7 @@ def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]: return alerts -def categorical_alerts(config: Settings, summary: dict) -> List[Alert]: +def 
categorical_alerts(config: Settings, summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] # High cardinality @@ -643,7 +647,7 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]: # Constant length if "composition" in summary and summary["min_length"] == summary["max_length"]: - alerts.append(ConstantLengthAlert()) + alerts.append(ConstantLengthAlert(summary)) # Imbalance if ( @@ -654,32 +658,32 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]: return alerts -def boolean_alerts(config: Settings, summary: dict) -> List[Alert]: +def boolean_alerts(config: Settings, summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] if ( "imbalance" in summary and summary["imbalance"] > config.vars.bool.imbalance_threshold ): - alerts.append(ImbalanceAlert()) + alerts.append(ImbalanceAlert(summary)) return alerts -def generic_alerts(summary: dict) -> List[Alert]: +def generic_alerts(summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] # Missing - if alert_value(summary["p_missing"]): - alerts.append(MissingAlert()) + if alert_value(summary.p_missing): + alerts.append(MissingAlert(summary)) return alerts -def supported_alerts(summary: dict) -> List[Alert]: +def supported_alerts(summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] - if summary.get("n_distinct", np.nan) == summary["n"]: - alerts.append(UniqueAlert()) + if summary.get("n_distinct", np.nan) == summary.n: + alerts.append(UniqueAlert(summary)) if summary.get("n_distinct", np.nan) == 1: alerts.append(ConstantAlert(summary)) return alerts @@ -693,7 +697,9 @@ def unsupported_alerts() -> List[Alert]: return alerts -def check_variable_alerts(config: Settings, col: str, description: dict) -> List[Alert]: +def check_variable_alerts( + config: Settings, col: str, description: VarDescription +) -> List[Alert]: """Checks individual variables for alerts. 
Args: @@ -723,7 +729,6 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List for idx in range(len(alerts)): alerts[idx].column_name = col - alerts[idx].values = description return alerts @@ -751,7 +756,10 @@ def check_correlation_alerts(config: Settings, correlations: dict) -> List[Alert def get_alerts( - config: Settings, table_stats: dict, series_description: dict, correlations: dict + config: Settings, + table_stats: dict, + series_description: Dict[str, VarDescription], + correlations: dict, ) -> List[Alert]: alerts: List[Alert] = check_table_alerts(table_stats) for col, description in series_description.items(): diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py index 74bdf924a..f085adee0 100644 --- a/src/ydata_profiling/model/describe.py +++ b/src/ydata_profiling/model/describe.py @@ -1,4 +1,5 @@ """Organize the calculation of statistics for each series in this DataFrame.""" + from datetime import datetime from typing import Any, Dict, Optional, Union @@ -23,6 +24,7 @@ from ydata_profiling.model.summary import get_series_descriptions from ydata_profiling.model.table import get_table_stats from ydata_profiling.model.timeseries_index import get_time_index_description +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.progress_bar import progress from ydata_profiling.version import __version__ @@ -86,7 +88,7 @@ def describe( # Variable-specific pbar.total += len(df.columns) - series_description = get_series_descriptions( + series_description: Dict[str, VarDescription] = get_series_descriptions( config, df, summarizer, typeset, pbar ) diff --git a/src/ydata_profiling/model/description.py b/src/ydata_profiling/model/description.py index fd1d22ae6..71a1d50ea 100644 --- a/src/ydata_profiling/model/description.py +++ b/src/ydata_profiling/model/description.py @@ -4,6 +4,8 @@ from pandas import Timedelta +from 
ydata_profiling.model.var_description.default import VarDescription + @dataclass class BaseAnalysis: @@ -98,7 +100,7 @@ class BaseDescription: analysis: BaseAnalysis time_index_analysis: Optional[TimeIndexAnalysis] table: Any - variables: Dict[str, Any] + variables: Dict[str, VarDescription] scatter: Any correlations: Dict[str, Any] missing: Dict[str, Any] diff --git a/src/ydata_profiling/model/expectation_algorithms.py b/src/ydata_profiling/model/expectation_algorithms.py index cbbeb635e..50c1feea7 100644 --- a/src/ydata_profiling/model/expectation_algorithms.py +++ b/src/ydata_profiling/model/expectation_algorithms.py @@ -1,12 +1,14 @@ from typing import Any, Tuple +from ydata_profiling.model.var_description.default import VarDescription + def generic_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: batch.expect_column_to_exist(name) - if summary["n_missing"] == 0: + if summary.n_missing == 0: batch.expect_column_values_to_not_be_null(name) if summary["p_unique"] == 1.0: @@ -16,8 +18,8 @@ def generic_expectations( def numeric_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: from great_expectations.profile.base import ProfilerTypeMapping numeric_type_names = ( @@ -56,8 +58,8 @@ def numeric_expectations( def categorical_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: # Use for both categorical and special case (boolean) absolute_threshold = 10 relative_threshold = 0.2 @@ -66,20 +68,20 @@ def categorical_expectations( or summary["p_distinct"] < relative_threshold ): batch.expect_column_values_to_be_in_set( - name, set(summary["value_counts_without_nan"].keys()) + 
name, set(summary.value_counts_without_nan.keys()) ) return name, summary, batch def path_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: return name, summary, batch def datetime_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: if any(k in summary for k in ["min", "max"]): batch.expect_column_values_to_be_between( name, @@ -92,20 +94,20 @@ def datetime_expectations( def image_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: return name, summary, batch def url_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: return name, summary, batch def file_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: # By definition within our type logic, a file exists (as it's a path that also exists) batch.expect_file_to_exist(name) diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py index 381df8e86..c7d21070f 100644 --- a/src/ydata_profiling/model/pandas/__init__.py +++ b/src/ydata_profiling/model/pandas/__init__.py @@ -3,7 +3,6 @@ # List of modules in the 'pandas' model that should be imported explicitly PANDAS_MODULES = [ "correlations_pandas", - "describe_generic_pandas", "describe_boolean_pandas", "describe_categorical_pandas", "describe_url_pandas", @@ -14,7 +13,6 @@ "describe_path_pandas", "describe_image_pandas", "describe_date_pandas", - "describe_counts_pandas", 
"duplicates_pandas", "sample_pandas", "table_pandas", @@ -35,7 +33,6 @@ # Explicitly list exposed names for clarity __all__ = [ - "pandas_describe_generic", "pandas_describe_boolean_1d", "pandas_describe_categorical_1d", "pandas_describe_url_1d", @@ -46,7 +43,6 @@ "pandas_describe_path_1d", "pandas_describe_image_1d", "pandas_describe_date_1d", - "pandas_describe_counts", "pandas_get_duplicates", "pandas_get_sample", "pandas_get_table_stats", diff --git a/src/ydata_profiling/model/pandas/correlations_pandas.py b/src/ydata_profiling/model/pandas/correlations_pandas.py index 94eef2f95..68a766aaf 100644 --- a/src/ydata_profiling/model/pandas/correlations_pandas.py +++ b/src/ydata_profiling/model/pandas/correlations_pandas.py @@ -1,4 +1,5 @@ """Correlations between variables.""" + import itertools import warnings from typing import Callable, Optional @@ -12,6 +13,7 @@ DiscretizationType, Discretizer, ) +from ydata_profiling.model.var_description.default import VarDescription def spearman_compute( @@ -77,7 +79,7 @@ def _pairwise_cramers(col_1: pd.Series, col_2: pd.Series) -> float: def cramers_compute( - config: Settings, df: pd.DataFrame, summary: dict + config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription] ) -> Optional[pd.DataFrame]: threshold = config.categorical_maximum_correlation_distinct @@ -117,7 +119,7 @@ def cramers_compute( def phik_compute( - config: Settings, df: pd.DataFrame, summary: dict + config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription] ) -> Optional[pd.DataFrame]: df_cols_dict = {i: list(df.columns).index(i) for i in df.columns} @@ -152,7 +154,7 @@ def phik_compute( def auto_compute( - config: Settings, df: pd.DataFrame, summary: dict + config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription] ) -> Optional[pd.DataFrame]: threshold = config.categorical_maximum_correlation_distinct numerical_columns = [ @@ -181,7 +183,6 @@ def auto_compute( columns=columns_tested, ) for col_1_name, col_2_name in 
itertools.combinations(columns_tested, 2): - method = ( _pairwise_spearman if any(elem in categorical_columns for elem in [col_1_name, col_2_name]) diff --git a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py index 9b2014db7..07d446337 100644 --- a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py @@ -5,17 +5,14 @@ from ydata_profiling.config import Settings from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score -from ydata_profiling.model.summary_algorithms import ( - describe_boolean_1d, - series_hashable, -) +from ydata_profiling.model.summary_algorithms import describe_boolean_1d +from ydata_profiling.model.var_description.default import VarDescription @describe_boolean_1d.register -@series_hashable def pandas_describe_boolean_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a boolean series. Args: @@ -26,8 +23,7 @@ def pandas_describe_boolean_1d( Returns: A dict containing calculated series description values. 
""" - - value_counts: pd.Series = summary["value_counts_without_nan"] + value_counts: pd.Series = summary.value_counts_without_nan if not value_counts.empty: summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]}) summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts)) diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py index e711acdd9..325112f45 100644 --- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py @@ -16,6 +16,7 @@ series_handle_nulls, series_hashable, ) +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.information import DisplayInfo @@ -218,8 +219,8 @@ def length_summary_vc(vc: pd.Series) -> dict: @series_hashable @series_handle_nulls def pandas_describe_categorical_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a categorical series. 
Args: @@ -237,7 +238,7 @@ def pandas_describe_categorical_1d( series = series.astype(str) # Only run if at least 1 non-missing value - value_counts = summary["value_counts_without_nan"] + value_counts = summary.value_counts_without_nan value_counts.index = value_counts.index.astype(str) summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts)) diff --git a/src/ydata_profiling/model/pandas/describe_counts_pandas.py b/src/ydata_profiling/model/pandas/describe_counts_pandas.py deleted file mode 100644 index 07cdad9d5..000000000 --- a/src/ydata_profiling/model/pandas/describe_counts_pandas.py +++ /dev/null @@ -1,63 +0,0 @@ -from typing import Tuple - -import pandas as pd - -from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_counts - - -@describe_counts.register -def pandas_describe_counts( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: - """Counts the values in a series (with and without NaN, distinct). - - Args: - config: report Settings object - series: Series for which we want to calculate the values. - summary: series' summary - - Returns: - A dictionary with the count values (with and without NaN, distinct). 
- """ - try: - value_counts_with_nan = series.value_counts(dropna=False) - _ = set(value_counts_with_nan.index) - hashable = True - except: # noqa: E722 - hashable = False - - summary["hashable"] = hashable - - if hashable: - value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0] - - null_index = value_counts_with_nan.index.isnull() - if null_index.any(): - n_missing = value_counts_with_nan[null_index].sum() - value_counts_without_nan = value_counts_with_nan[~null_index] - else: - n_missing = 0 - value_counts_without_nan = value_counts_with_nan - - summary.update( - { - "value_counts_without_nan": value_counts_without_nan, - } - ) - - try: - summary["value_counts_index_sorted"] = summary[ - "value_counts_without_nan" - ].sort_index(ascending=True) - ordering = True - except TypeError: - ordering = False - else: - n_missing = series.isna().sum() - ordering = False - - summary["ordering"] = ordering - summary["n_missing"] = n_missing - - return config, series, summary diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py index 72b25a697..169e7367a 100644 --- a/src/ydata_profiling/model/pandas/describe_date_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py @@ -12,6 +12,7 @@ series_hashable, ) from ydata_profiling.model.typeset_relations import is_pandas_1 +from ydata_profiling.model.var_description.default import VarDescription def to_datetime(series: pd.Series) -> pd.Series: @@ -24,8 +25,8 @@ def to_datetime(series: pd.Series) -> pd.Series: @series_hashable @series_handle_nulls def pandas_describe_date_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a date series. Args: @@ -36,13 +37,14 @@ def pandas_describe_date_1d( Returns: A dict containing calculated series description values. 
""" + og_series = series.dropna() series = to_datetime(og_series) invalid_values = og_series[series.isna()] series = series.dropna() - if summary["value_counts_without_nan"].empty: + if summary.value_counts_without_nan.empty: values = series.values summary.update( { @@ -71,7 +73,7 @@ def pandas_describe_date_1d( { "invalid_dates": invalid_values.nunique(), "n_invalid_dates": len(invalid_values), - "p_invalid_dates": len(invalid_values) / summary["n"], + "p_invalid_dates": len(invalid_values) / summary.n, } ) return config, values, summary diff --git a/src/ydata_profiling/model/pandas/describe_file_pandas.py b/src/ydata_profiling/model/pandas/describe_file_pandas.py index 84ee3c4ab..18b4e511c 100644 --- a/src/ydata_profiling/model/pandas/describe_file_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_file_pandas.py @@ -6,6 +6,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_file_1d, histogram_compute +from ydata_profiling.model.var_description.default import VarDescription def file_summary(series: pd.Series) -> dict: @@ -36,8 +37,8 @@ def convert_datetime(x: float) -> str: @describe_file_1d.register def pandas_describe_file_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: if series.hasnans: raise ValueError("May not contain NaNs") if not hasattr(series, "str"): diff --git a/src/ydata_profiling/model/pandas/describe_generic_pandas.py b/src/ydata_profiling/model/pandas/describe_generic_pandas.py deleted file mode 100644 index 21b804e66..000000000 --- a/src/ydata_profiling/model/pandas/describe_generic_pandas.py +++ /dev/null @@ -1,36 +0,0 @@ -from typing import Tuple - -import pandas as pd - -from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_generic - - -@describe_generic.register -def 
pandas_describe_generic( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: - """Describe generic series. - - Args: - config: report Settings object - series: The Series to describe. - summary: The dict containing the series description so far. - - Returns: - A dict containing calculated series description values. - """ - - # number of observations in the Series - length = len(series) - - summary.update( - { - "n": length, - "p_missing": summary["n_missing"] / length if length > 0 else 0, - "count": length - summary["n_missing"], - "memory_size": series.memory_usage(deep=config.memory_deep), - } - ) - - return config, series, summary diff --git a/src/ydata_profiling/model/pandas/describe_image_pandas.py b/src/ydata_profiling/model/pandas/describe_image_pandas.py index 08675ed0c..d5f7c8975 100644 --- a/src/ydata_profiling/model/pandas/describe_image_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_image_pandas.py @@ -12,6 +12,7 @@ describe_image_1d, named_aggregate_summary, ) +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.imghdr_patch import * # noqa: F401,F403 @@ -243,8 +244,8 @@ def image_summary(series: pd.Series, exif: bool = False, hash: bool = False) -> @describe_image_1d.register def pandas_describe_image_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: if series.hasnans: raise ValueError("May not contain NaNs") if not hasattr(series, "str"): diff --git a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py index 21eee6c11..f6797e601 100644 --- a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from 
ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.compat import pandas_version_info if pandas_version_info() >= (1, 5): @@ -44,9 +45,9 @@ def numeric_stats_pandas(series: pd.Series) -> Dict[str, Any]: def numeric_stats_numpy( - present_values: np.ndarray, series: pd.Series, series_description: Dict[str, Any] + present_values: np.ndarray, series: pd.Series, series_description: VarDescription ) -> Dict[str, Any]: - vc = series_description["value_counts_without_nan"] + vc = series_description.value_counts_without_nan index_values = vc.index.values # FIXME: can be performance optimized by using weights in std, var, kurt and skew... @@ -80,8 +81,8 @@ def numeric_stats_numpy( @series_hashable @series_handle_nulls def pandas_describe_numeric_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a numeric series. 
Args: @@ -96,11 +97,11 @@ def pandas_describe_numeric_1d( chi_squared_threshold = config.vars.num.chi_squared_threshold quantiles = config.vars.num.quantiles - value_counts = summary["value_counts_without_nan"] + value_counts = summary.value_counts_without_nan negative_index = value_counts.index < 0 summary["n_negative"] = value_counts.loc[negative_index].sum() - summary["p_negative"] = summary["n_negative"] / summary["n"] + summary["p_negative"] = summary["n_negative"] / summary.n infinity_values = [np.inf, -np.inf] infinity_index = value_counts.index.isin(infinity_values) @@ -138,9 +139,10 @@ def pandas_describe_numeric_1d( } ) stats["iqr"] = stats["75%"] - stats["25%"] - stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.nan - stats["p_zeros"] = stats["n_zeros"] / summary["n"] - stats["p_infinite"] = summary["n_infinite"] / summary["n"] + + stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN + stats["p_zeros"] = stats["n_zeros"] / summary.n + stats["p_infinite"] = summary["n_infinite"] / summary.n stats["monotonic_increase"] = series.is_monotonic_increasing stats["monotonic_decrease"] = series.is_monotonic_decreasing diff --git a/src/ydata_profiling/model/pandas/describe_path_pandas.py b/src/ydata_profiling/model/pandas/describe_path_pandas.py index e3e536f99..31ac65f88 100644 --- a/src/ydata_profiling/model/pandas/describe_path_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_path_pandas.py @@ -5,6 +5,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_path_1d +from ydata_profiling.model.var_description.default import VarDescription def path_summary(series: pd.Series) -> dict: @@ -19,8 +20,9 @@ def path_summary(series: pd.Series) -> dict: # TODO: optimize using value counts summary = { - "common_prefix": os.path.commonprefix(series.values.tolist()) - or "No common prefix", + "common_prefix": ( + os.path.commonprefix(series.values.tolist()) or "No common 
prefix" + ), "stem_counts": series.map(lambda x: os.path.splitext(x)[0]).value_counts(), "suffix_counts": series.map(lambda x: os.path.splitext(x)[1]).value_counts(), "name_counts": series.map(lambda x: os.path.basename(x)).value_counts(), @@ -39,8 +41,8 @@ def path_summary(series: pd.Series) -> dict: @describe_path_1d.register def pandas_describe_path_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a path series. Args: diff --git a/src/ydata_profiling/model/pandas/describe_supported_pandas.py b/src/ydata_profiling/model/pandas/describe_supported_pandas.py index 16bd9ab38..69e19f873 100644 --- a/src/ydata_profiling/model/pandas/describe_supported_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_supported_pandas.py @@ -3,14 +3,17 @@ import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_supported, series_hashable +from ydata_profiling.model.pandas.var_description.default_pandas import ( + get_default_pandas_description, +) +from ydata_profiling.model.summary_algorithms import describe_supported +from ydata_profiling.model.var_description.default import VarDescription @describe_supported.register -@series_hashable def pandas_describe_supported( - config: Settings, series: pd.Series, series_description: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, description: dict +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a supported series. Args: @@ -22,20 +25,6 @@ def pandas_describe_supported( A dict containing calculated series description values. 
""" - # number of non-NaN observations in the Series - count = series_description["count"] + series_description = get_default_pandas_description(config, series, description) - value_counts = series_description["value_counts_without_nan"] - distinct_count = len(value_counts) - unique_count = value_counts.where(value_counts == 1).count() - - stats = { - "n_distinct": distinct_count, - "p_distinct": distinct_count / count if count > 0 else 0, - "is_unique": unique_count == count and count > 0, - "n_unique": unique_count, - "p_unique": unique_count / count if count > 0 else 0, - } - stats.update(series_description) - - return config, series, stats + return config, series, series_description diff --git a/src/ydata_profiling/model/pandas/describe_text_pandas.py b/src/ydata_profiling/model/pandas/describe_text_pandas.py index d1dc734c4..a8e70a5bc 100644 --- a/src/ydata_profiling/model/pandas/describe_text_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_text_pandas.py @@ -13,6 +13,7 @@ series_handle_nulls, series_hashable, ) +from ydata_profiling.model.var_description.default import VarDescription @series_hashable @@ -20,8 +21,8 @@ def pandas_describe_text_1d( config: Settings, series: pd.Series, - summary: dict, -) -> Tuple[Settings, pd.Series, dict]: + summary: VarDescription, +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe string series. 
Args: @@ -36,7 +37,7 @@ def pandas_describe_text_1d( series = series.astype(str) # Only run if at least 1 non-missing value - value_counts = summary["value_counts_without_nan"] + value_counts = summary.value_counts_without_nan value_counts.index = value_counts.index.astype(str) summary.update({"first_rows": series.head(5)}) diff --git a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py index 7db4d56f3..3ca98fb14 100644 --- a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py @@ -13,6 +13,7 @@ series_handle_nulls, series_hashable, ) +from ydata_profiling.model.var_description.default import VarDescription def stationarity_test(config: Settings, series: pd.Series) -> Tuple[bool, float]: @@ -198,8 +199,8 @@ def compute_gap_stats(series: pd.Series) -> pd.Series: @series_hashable @series_handle_nulls def pandas_describe_timeseries_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a timeseries. 
Args: diff --git a/src/ydata_profiling/model/pandas/describe_url_pandas.py b/src/ydata_profiling/model/pandas/describe_url_pandas.py index bfe5239bf..4a64a8c30 100644 --- a/src/ydata_profiling/model/pandas/describe_url_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_url_pandas.py @@ -5,6 +5,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_url_1d +from ydata_profiling.model.var_description.default import VarDescription def url_summary(series: pd.Series) -> dict: @@ -29,8 +30,8 @@ def url_summary(series: pd.Series) -> dict: @describe_url_1d.register def pandas_describe_url_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a url series. Args: diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index 2103844f8..1dc169baa 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -10,6 +10,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.typeset import ProfilingTypeSet +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.compat import optional_option_context from ydata_profiling.utils.dataframe import sort_column_names @@ -25,7 +26,7 @@ def pandas_describe_1d( series: pd.Series, summarizer: BaseSummarizer, typeset: VisionsTypeset, -) -> dict: +) -> VarDescription: """Describe a series (infer the variable type, then calculate type-specific values). 
Args: @@ -74,8 +75,8 @@ def pandas_get_series_descriptions( summarizer: BaseSummarizer, typeset: VisionsTypeset, pbar: tqdm, -) -> dict: - def describe_column(name: str, series: pd.Series) -> Tuple[str, dict]: +) -> dict[str, VarDescription]: + def describe_column(name: str, series: pd.Series) -> Tuple[str, VarDescription]: """Process a single series to get the column description.""" pbar.set_postfix_str(f"Describe variable: {name}") description = pandas_describe_1d(config, series, summarizer, typeset) diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py index a919ee33b..9198fb0e0 100644 --- a/src/ydata_profiling/model/pandas/table_pandas.py +++ b/src/ydata_profiling/model/pandas/table_pandas.py @@ -1,14 +1,16 @@ from collections import Counter +from typing import Dict import pandas as pd from ydata_profiling.config import Settings from ydata_profiling.model.table import get_table_stats +from ydata_profiling.model.var_description.default import VarDescription @get_table_stats.register def pandas_get_table_stats( - config: Settings, df: pd.DataFrame, variable_stats: dict + config: Settings, df: pd.DataFrame, variable_stats: Dict[str, VarDescription] ) -> dict: """General statistics for the DataFrame. 
@@ -36,10 +38,10 @@ def pandas_get_table_stats( } for series_summary in variable_stats.values(): - if "n_missing" in series_summary and series_summary["n_missing"] > 0: + if series_summary.n_missing > 0: table_stats["n_vars_with_missing"] += 1 - table_stats["n_cells_missing"] += series_summary["n_missing"] - if series_summary["n_missing"] == n: + table_stats["n_cells_missing"] += series_summary.n_missing + if series_summary.n_missing == n: table_stats["n_vars_all_missing"] += 1 table_stats["p_cells_missing"] = ( diff --git a/src/ydata_profiling/model/pandas/var_description/counts_pandas.py b/src/ydata_profiling/model/pandas/var_description/counts_pandas.py new file mode 100644 index 000000000..6ffc3e6d5 --- /dev/null +++ b/src/ydata_profiling/model/pandas/var_description/counts_pandas.py @@ -0,0 +1,53 @@ +import pandas as pd + +from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.counts import VarCounts + + +def get_counts_pandas(config: Settings, series: pd.Series) -> VarCounts: + """Get a VarCounts object for a pandas series.""" + length = len(series) + + try: + value_counts_with_nan = series.value_counts(dropna=False) + _ = set(value_counts_with_nan.index) + hashable = True + except: # noqa: E722 + hashable = False + + value_counts_without_nan = None + value_counts_index_sorted = None + if hashable: + value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0] + + null_index = value_counts_with_nan.index.isnull() + if null_index.any(): + n_missing = value_counts_with_nan[null_index].sum() + value_counts_without_nan = value_counts_with_nan[~null_index] + else: + n_missing = 0 + value_counts_without_nan = value_counts_with_nan + + try: + value_counts_index_sorted = value_counts_without_nan.sort_index( + ascending=True + ) + ordering = True + except TypeError: + ordering = False + else: + n_missing = series.isna().sum() + ordering = False + + return VarCounts( + hashable=hashable, + 
value_counts_without_nan=value_counts_without_nan, + value_counts_index_sorted=value_counts_index_sorted, + ordering=ordering, + n_missing=n_missing, + n=length, + p_missing=series.isna().sum() / length if length > 0 else 0, + count=length - series.isna().sum(), + memory_size=series.memory_usage(deep=config.memory_deep), + value_counts=None, + ) diff --git a/src/ydata_profiling/model/pandas/var_description/default_pandas.py b/src/ydata_profiling/model/pandas/var_description/default_pandas.py new file mode 100644 index 000000000..6a1c21a42 --- /dev/null +++ b/src/ydata_profiling/model/pandas/var_description/default_pandas.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import pandas as pd + +from ydata_profiling.config import Settings +from ydata_profiling.model.pandas.var_description.counts_pandas import get_counts_pandas +from ydata_profiling.model.var_description.default import VarDescription + + +def get_default_pandas_description( + config: Settings, series: pd.Series, init_dict: dict +) -> VarDescription: + var_counts = get_counts_pandas(config, series) + + if var_counts.hashable: + count = var_counts.count + value_counts = var_counts.value_counts_without_nan + distinct_count = len(value_counts) + unique_count = value_counts.where(value_counts == 1).count() + + init_dict.update( + { + "n_distinct": distinct_count, + "p_distinct": distinct_count / count if count > 0 else 0, + "is_unique": unique_count == count and count > 0, + "n_unique": unique_count, + "p_unique": unique_count / count if count > 0 else 0, + } + ) + + return VarDescription.from_var_counts(var_counts, init_dict) diff --git a/src/ydata_profiling/model/spark/__init__.py b/src/ydata_profiling/model/spark/__init__.py index b71241218..3459eedbd 100644 --- a/src/ydata_profiling/model/spark/__init__.py +++ b/src/ydata_profiling/model/spark/__init__.py @@ -6,9 +6,7 @@ "dataframe_spark", "describe_boolean_spark", "describe_categorical_spark", - "describe_counts_spark", "describe_date_spark", 
- "describe_generic_spark", "describe_numeric_spark", "describe_supported_spark", "duplicates_spark", diff --git a/src/ydata_profiling/model/spark/correlations_spark.py b/src/ydata_profiling/model/spark/correlations_spark.py index f9f1d1ecb..0c6fb69a4 100644 --- a/src/ydata_profiling/model/spark/correlations_spark.py +++ b/src/ydata_profiling/model/spark/correlations_spark.py @@ -1,4 +1,5 @@ """Correlations between variables.""" + from typing import Optional import pandas as pd diff --git a/src/ydata_profiling/model/spark/describe_boolean_spark.py b/src/ydata_profiling/model/spark/describe_boolean_spark.py index 148dbce6c..c38295475 100644 --- a/src/ydata_profiling/model/spark/describe_boolean_spark.py +++ b/src/ydata_profiling/model/spark/describe_boolean_spark.py @@ -3,11 +3,12 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription def describe_boolean_1d_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a boolean series. Args: @@ -18,7 +19,7 @@ def describe_boolean_1d_spark( A dict containing calculated series description values. 
""" - value_counts = summary["value_counts"] + value_counts = summary.value_counts # get the most common boolean value and its frequency top = value_counts.first() diff --git a/src/ydata_profiling/model/spark/describe_categorical_spark.py b/src/ydata_profiling/model/spark/describe_categorical_spark.py index 5afdb475c..562472b3d 100644 --- a/src/ydata_profiling/model/spark/describe_categorical_spark.py +++ b/src/ydata_profiling/model/spark/describe_categorical_spark.py @@ -4,12 +4,13 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_categorical_1d +from ydata_profiling.model.var_description.default import VarDescription @describe_categorical_1d.register def describe_categorical_1d_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a categorical series. Args: diff --git a/src/ydata_profiling/model/spark/describe_date_spark.py b/src/ydata_profiling/model/spark/describe_date_spark.py index c44d36650..20264f577 100644 --- a/src/ydata_profiling/model/spark/describe_date_spark.py +++ b/src/ydata_profiling/model/spark/describe_date_spark.py @@ -5,6 +5,7 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription def date_stats_spark(df: DataFrame, summary: dict) -> dict: @@ -19,8 +20,8 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict: def describe_date_1d_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a date series. 
Args: diff --git a/src/ydata_profiling/model/spark/describe_generic_spark.py b/src/ydata_profiling/model/spark/describe_generic_spark.py deleted file mode 100644 index 1171881cd..000000000 --- a/src/ydata_profiling/model/spark/describe_generic_spark.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Tuple - -from pyspark.sql import DataFrame - -from ydata_profiling.config import Settings - - -def describe_generic_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: - """Describe generic series. - Args: - series: The Series to describe. - summary: The dict containing the series description so far. - Returns: - A dict containing calculated series description values. - """ - - # number of observations in the Series - length = df.count() - - summary["n"] = length - summary["p_missing"] = summary["n_missing"] / length - summary["count"] = length - summary["n_missing"] - - # FIXME: This is not correct, but used to fulfil render expectations - # @chanedwin - summary["memory_size"] = 0 - - return config, df, summary diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py index 0fbd68198..55db2cc00 100644 --- a/src/ydata_profiling/model/spark/describe_numeric_spark.py +++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py @@ -6,9 +6,10 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import histogram_compute +from ydata_profiling.model.var_description.default import VarDescription -def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: +def numeric_stats_spark(df: DataFrame, summary: VarDescription) -> dict: column = df.columns[0] expr = [ @@ -25,8 +26,8 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: def describe_numeric_1d_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription 
+) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a boolean series. Args: @@ -47,7 +48,7 @@ def describe_numeric_1d_spark( summary["kurtosis"] = stats["kurtosis"] summary["sum"] = stats["sum"] - value_counts = summary["value_counts"] + value_counts = summary.value_counts n_infinite = ( value_counts.where(F.col(df.columns[0]).isin([np.inf, -np.inf])) @@ -102,12 +103,12 @@ def describe_numeric_1d_spark( ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0] # FIXME: move to fmt - summary["p_negative"] = summary["n_negative"] / summary["n"] + summary["p_negative"] = summary["n_negative"] / summary.n summary["range"] = summary["max"] - summary["min"] summary["iqr"] = summary["75%"] - summary["25%"] summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.NaN - summary["p_zeros"] = summary["n_zeros"] / summary["n"] - summary["p_infinite"] = summary["n_infinite"] / summary["n"] + summary["p_zeros"] = summary["n_zeros"] / summary.n + summary["p_infinite"] = summary["n_infinite"] / summary.n # TODO - enable this feature # because spark doesn't have an indexing system, there isn't really the idea of monotonic increase/decrease @@ -121,14 +122,14 @@ def describe_numeric_1d_spark( # the alternative is to do this in spark natively, but it is not trivial infinity_values = [np.inf, -np.inf] - infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values) + infinity_index = summary.value_counts_without_nan.index.isin(infinity_values) summary.update( histogram_compute( config, - summary["value_counts_without_nan"][~infinity_index].index.values, + summary.value_counts_without_nan[~infinity_index].index.values, summary["n_distinct"], - weights=summary["value_counts_without_nan"][~infinity_index].values, + weights=summary.value_counts_without_nan[~infinity_index].values, ) ) diff --git a/src/ydata_profiling/model/spark/describe_supported_spark.py b/src/ydata_profiling/model/spark/describe_supported_spark.py index 
1758f668d..d5d395156 100644 --- a/src/ydata_profiling/model/spark/describe_supported_spark.py +++ b/src/ydata_profiling/model/spark/describe_supported_spark.py @@ -3,13 +3,17 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.spark.var_description.default_spark import ( + get_default_spark_description, +) from ydata_profiling.model.summary_algorithms import describe_supported +from ydata_profiling.model.var_description.default import VarDescription @describe_supported.register def describe_supported_spark( config: Settings, series: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a supported series. Args: series: The Series to describe. @@ -18,16 +22,6 @@ def describe_supported_spark( A dict containing calculated series description values. """ - # number of non-NaN observations in the Series - count = summary["count"] - n_distinct = summary["value_counts"].count() + series_description = get_default_spark_description(config, series, summary) - summary["n_distinct"] = n_distinct - summary["p_distinct"] = n_distinct / count if count > 0 else 0 - - n_unique = summary["value_counts"].where("count == 1").count() - summary["is_unique"] = n_unique == count - summary["n_unique"] = n_unique - summary["p_unique"] = n_unique / count if count > 0 else 0 - - return config, series, summary + return config, series, series_description diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py index 6d7804cf5..6d9a6af0b 100644 --- a/src/ydata_profiling/model/spark/describe_text_spark.py +++ b/src/ydata_profiling/model/spark/describe_text_spark.py @@ -3,11 +3,12 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription def describe_text_1d_spark( - config: Settings, df: DataFrame, summary: 
dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a categorical series. Args: diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py index e8145d76c..f12dcb440 100644 --- a/src/ydata_profiling/model/spark/timeseries_index_spark.py +++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py @@ -1,4 +1,5 @@ """Compute statistical description of datasets.""" + from pyspark.sql import DataFrame from ydata_profiling.config import Settings diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/var_description/counts_spark.py similarity index 71% rename from src/ydata_profiling/model/spark/describe_counts_spark.py rename to src/ydata_profiling/model/spark/var_description/counts_spark.py index d7a091e7f..264330a1a 100644 --- a/src/ydata_profiling/model/spark/describe_counts_spark.py +++ b/src/ydata_profiling/model/spark/var_description/counts_spark.py @@ -1,30 +1,22 @@ """ Pyspark counts """ -from typing import Tuple - import pandas as pd from pyspark.sql import DataFrame from pyspark.sql import functions as F from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_counts - +from ydata_profiling.model.var_description.default import VarCounts -@describe_counts.register -def describe_counts_spark( - config: Settings, series: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: - """Counts the values in a series (with and without NaN, distinct). +def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts: + """Get a VarCounts object for a spark series. Args: config: Profiling settings. series: Spark DataFrame column for which we want to calculate the values. summary: Dictionary to store the summary results. 
- - Returns: - Updated settings, input series, and summary dictionary. """ + length = series.count() # Count occurrences of each value value_counts = series.groupBy(series.columns[0]).count() @@ -49,10 +41,6 @@ def describe_counts_spark( .squeeze(axis="columns") ) - summary["n_missing"] = n_missing - summary["value_counts"] = value_counts.persist() - summary["value_counts_index_sorted"] = top_200_sorted - column = series.columns[0] if series.dtypes[0][1] in ("int", "float", "bigint", "double"): @@ -73,15 +61,25 @@ def describe_counts_spark( .limit(200) # Limit for performance ) - # Convert to Pandas Series, forcing proper structure + # Convert to Pandas Series, forcing proper structure if value_counts_no_nan.count() > 0: pdf = value_counts_no_nan.toPandas().set_index(column)["count"] - summary["value_counts_without_nan"] = pd.Series( - pdf - ) # Ensures it's always a Series + value_counts_without_nan = pd.Series(pdf) # Ensures it's always a Series else: - summary["value_counts_without_nan"] = pd.Series( - dtype=int - ) # Ensures an empty Series - - return config, series, summary + value_counts_without_nan = pd.Series(dtype=int) # Ensures an empty Series + + # @chanedwin + memory_size = 0 + + return VarCounts( + hashable=False, + value_counts_without_nan=value_counts_without_nan, + value_counts_index_sorted=top_200_sorted, + ordering=False, + n_missing=n_missing, + n=length, + p_missing=n_missing / length, + count=length - n_missing, + memory_size=memory_size, + value_counts=value_counts.persist(), + ) diff --git a/src/ydata_profiling/model/spark/var_description/default_spark.py b/src/ydata_profiling/model/spark/var_description/default_spark.py new file mode 100644 index 000000000..687be178e --- /dev/null +++ b/src/ydata_profiling/model/spark/var_description/default_spark.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from pyspark.sql import DataFrame + +from ydata_profiling.config import Settings +from 
ydata_profiling.model.spark.var_description.counts_spark import get_counts_spark +from ydata_profiling.model.var_description.default import VarDescription + + +def get_default_spark_description( + config: Settings, series: DataFrame, init_dict: dict +) -> VarDescription: + var_counts = get_counts_spark(config, series) + + count = var_counts.count + n_distinct = var_counts.value_counts.count() + + p_distinct = n_distinct / count if count > 0 else 0 + + n_unique = var_counts.value_counts.where("count == 1").count() + is_unique = n_unique == count + p_unique = n_unique / count if count > 0 else 0 + + init_dict.update( + { + "n_distinct": n_distinct, + "p_distinct": p_distinct, + "is_unique": is_unique, + "n_unique": n_unique, + "p_unique": p_unique, + } + ) + + return VarDescription( + n=var_counts.n, + count=var_counts.count, + n_missing=var_counts.n_missing, + p_missing=var_counts.p_missing, + hashable=var_counts.hashable, + memory_size=var_counts.memory_size, + ordering=var_counts.ordering, + value_counts_index_sorted=var_counts.value_counts_index_sorted, + value_counts_without_nan=var_counts.value_counts_without_nan, + value_counts=var_counts.value_counts, + var_specific=init_dict, + ) diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..bb296cf06 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -13,10 +13,8 @@ from ydata_profiling.model.pandas import ( pandas_describe_boolean_1d, pandas_describe_categorical_1d, - pandas_describe_counts, pandas_describe_date_1d, pandas_describe_file_1d, - pandas_describe_generic, pandas_describe_image_1d, pandas_describe_numeric_1d, pandas_describe_path_1d, @@ -34,6 +32,7 @@ describe_timeseries_1d, describe_url_1d, ) +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.backend import is_pyspark_installed @@ -45,7 +44,7 @@ class BaseSummarizer(Handler): def summarize( self, 
config: Settings, series: pd.Series, dtype: Type[VisionsBaseType] - ) -> dict: + ) -> VarDescription: """Generates the summary for a given series""" return self.handle(str(dtype), config, series, {"type": str(dtype)}) @@ -70,9 +69,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: from ydata_profiling.model.spark import ( describe_boolean_1d_spark, describe_categorical_1d_spark, - describe_counts_spark, describe_date_1d_spark, - describe_generic_spark, describe_numeric_1d_spark, describe_supported_spark, describe_text_1d_spark, @@ -80,8 +77,6 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: summary_map = { "Unsupported": [ - describe_counts_spark, - describe_generic_spark, describe_supported_spark, ], "Numeric": [describe_numeric_1d_spark], @@ -98,8 +93,6 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: else: summary_map = { "Unsupported": [ - pandas_describe_counts, - pandas_describe_generic, pandas_describe_supported, ], "Numeric": [pandas_describe_numeric_1d], @@ -116,7 +109,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: return summary_map -def format_summary(summary: Union[BaseDescription, dict]) -> dict: +def format_summary(summary: Union[BaseDescription, VarDescription, dict]) -> dict: """Prepare summary for export to json file. 
Args: diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py index 4fa9831a7..1b02f0789 100644 --- a/src/ydata_profiling/model/summary.py +++ b/src/ydata_profiling/model/summary.py @@ -12,6 +12,7 @@ pandas_get_series_descriptions, ) from ydata_profiling.model.summarizer import BaseSummarizer +from ydata_profiling.model.var_description.default import VarDescription spec = importlib.util.find_spec("pyspark") if spec is None: @@ -33,7 +34,7 @@ def describe_1d( series: Any, summarizer: BaseSummarizer, typeset: VisionsTypeset, -) -> dict: +) -> VarDescription: """ Add here the description and improve the documentation Args: @@ -57,7 +58,7 @@ def get_series_descriptions( summarizer: BaseSummarizer, typeset: VisionsTypeset, pbar: tqdm, -) -> dict: +) -> dict[str, VarDescription]: if isinstance(df, pd.DataFrame): return pandas_get_series_descriptions(config, df, summarizer, typeset, pbar) elif isinstance(df, sparkDataFrame): # type: ignore diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py index e70c467d2..468daffc4 100644 --- a/src/ydata_profiling/model/summary_algorithms.py +++ b/src/ydata_profiling/model/summary_algorithms.py @@ -7,6 +7,7 @@ from scipy.stats import chisquare from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription T = TypeVar("T") @@ -61,13 +62,18 @@ def chi_square( def series_hashable( - fn: Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]] -) -> Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]: + fn: Callable[ + [Settings, pd.Series, VarDescription], + Tuple[Settings, pd.Series, VarDescription], + ] +) -> Callable[ + [Settings, pd.Series, VarDescription], Tuple[Settings, pd.Series, VarDescription] +]: @functools.wraps(fn) def inner( - config: Settings, series: pd.Series, summary: dict - ) -> Tuple[Settings, pd.Series, dict]: - if not summary["hashable"]: 
+ config: Settings, series: pd.Series, summary: VarDescription + ) -> Tuple[Settings, pd.Series, VarDescription]: + if not summary.hashable: return config, series, summary return fn(config, series, summary) @@ -75,14 +81,19 @@ def inner( def series_handle_nulls( - fn: Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]] -) -> Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]: + fn: Callable[ + [Settings, pd.Series, VarDescription], + Tuple[Settings, pd.Series, VarDescription], + ] +) -> Callable[ + [Settings, pd.Series, VarDescription], Tuple[Settings, pd.Series, VarDescription] +]: """Decorator for nullable series""" @functools.wraps(fn) def inner( - config: Settings, series: pd.Series, summary: dict - ) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription + ) -> Tuple[Settings, pd.Series, VarDescription]: if series.hasnans: series = series.dropna() @@ -102,92 +113,78 @@ def named_aggregate_summary(series: pd.Series, key: str) -> dict: return summary -@multimethod -def describe_counts( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: - raise NotImplementedError() - - @multimethod def describe_supported( config: Settings, series: Any, series_description: dict -) -> Tuple[Settings, Any, dict]: - raise NotImplementedError() - - -@multimethod -def describe_generic( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_numeric_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_text_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict, Any]: + config: Settings, series: Any, summary: VarDescription +) -> 
Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_date_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_categorical_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_url_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_file_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_path_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_image_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_boolean_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_timeseries_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> 
Tuple[Settings, Any, VarDescription]: raise NotImplementedError() diff --git a/src/ydata_profiling/model/var_description/counts.py b/src/ydata_profiling/model/var_description/counts.py new file mode 100644 index 000000000..70f96af20 --- /dev/null +++ b/src/ydata_profiling/model/var_description/counts.py @@ -0,0 +1,27 @@ +from dataclasses import dataclass +from typing import Any, Union + + +@dataclass +class VarCounts: + """Data about counts in variable column.""" + + n: Union[int, list] + """Count of rows in the series.""" + count: Union[int, list] + """Count of not missing rows in the series.""" + n_missing: Union[int, list] + """Count of missing rows in the series.""" + p_missing: Union[float, list] + """Proportion of missing rows in the series.""" + + hashable: Union[bool, list] + value_counts_without_nan: Any + """Counts of values in the series without NaN. Values as index, counts as values.""" + value_counts_index_sorted: Any + """Sorted counts of values in the series without NaN. Sorted by counts.""" + ordering: Union[bool, list] + memory_size: Union[int, list] + + value_counts: Any + """Counts of values in original series type. Values as index, counts as values.""" diff --git a/src/ydata_profiling/model/var_description/default.py b/src/ydata_profiling/model/var_description/default.py new file mode 100644 index 000000000..05fb38ed0 --- /dev/null +++ b/src/ydata_profiling/model/var_description/default.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from collections import abc +from dataclasses import dataclass +from typing import Any, Iterator + +from ydata_profiling.model.var_description.counts import VarCounts + + +@dataclass +class VarDescription(VarCounts): + """Default description for one data column. 
+ Extends VarCounts class with information about distinct and unique values.""" + + var_specific: dict + + def __getitem__(self, item: str): + """Make the object subscriptable.""" + return self.var_specific[item] + + def __setitem__(self, key: str, value: Any): + """Make the object subscriptable.""" + self.var_specific[key] = value + + def update(self, _dict: dict) -> None: + """To support old dict like interface.""" + self.var_specific.update(_dict) + + def items(self) -> abc.ItemsView: + """To support old dict like interface.""" + return self.var_specific.items() + + def get(self, key: str, default: Any = None) -> Any: + """To support old dict like interface.""" + return self.var_specific.get(key, default) + + def pop(self, key: str, default: Any = None) -> Any: + """To support old dict like interface.""" + return self.var_specific.pop(key, default) + + def __iter__(self) -> Iterator: + """To support old dict like interface.""" + return self.var_specific.__iter__() + + @classmethod + def from_var_counts(cls, var_counts: VarCounts, init_dict: dict) -> VarDescription: + """Get a default description from a VarCounts object.""" + return VarDescription( + n=var_counts.n, + count=var_counts.count, + n_missing=var_counts.n_missing, + p_missing=var_counts.p_missing, + hashable=var_counts.hashable, + memory_size=var_counts.memory_size, + ordering=var_counts.ordering, + var_specific=init_dict, + value_counts_index_sorted=var_counts.value_counts_index_sorted, + value_counts_without_nan=var_counts.value_counts_without_nan, + value_counts=var_counts.value_counts, + ) diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 482b410b2..2a77f2535 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -138,7 +138,7 @@ def render_variables_section( "alert_fields": alert_fields, } - template_variables.update(summary) + summary.update(template_variables) # Per type 
template variables if isinstance(summary["type"], list): @@ -159,7 +159,7 @@ def render_variables_section( else: variable_type = summary["type"] render_map_type = render_map.get(variable_type, render_map["Unsupported"]) - template_variables.update(render_map_type(config, template_variables)) + template_variables.update(render_map_type(config, summary)) # Ignore these if reject_variables: diff --git a/src/ydata_profiling/report/structure/variables/render_boolean.py b/src/ydata_profiling/report/structure/variables/render_boolean.py index e6bdbe4d0..b2213f682 100644 --- a/src/ydata_profiling/report/structure/variables/render_boolean.py +++ b/src/ydata_profiling/report/structure/variables/render_boolean.py @@ -1,6 +1,7 @@ from typing import List from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent from ydata_profiling.report.presentation.core import ( Container, @@ -16,7 +17,7 @@ from ydata_profiling.visualisation.plot import cat_frequency_plot -def render_boolean(config: Settings, summary: dict) -> dict: +def render_boolean(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] n_obs_bool = config.vars.bool.n_obs image_format = config.plot.image_format @@ -48,17 +49,17 @@ def render_boolean(config: Settings, summary: dict) -> dict: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": "n_missing" in summary["alert_fields"], }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": "p_missing" in summary["alert_fields"], }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], @@ -67,8 +68,8 @@ def render_boolean(config: Settings, summary: dict) -> dict: fqm = FrequencyTableSmall( freq_table( - 
freqtable=summary["value_counts_without_nan"], - n=summary["n"], + freqtable=summary.value_counts_without_nan, + n=summary.n, max_number_to_print=n_obs_bool, ), redact=False, @@ -89,7 +90,7 @@ def render_boolean(config: Settings, summary: dict) -> dict: max_unique = config.plot.cat_freq.max_unique if show and (max_unique > 0): - if isinstance(summary["value_counts_without_nan"], list): + if isinstance(summary.value_counts_without_nan, list): items.append( Container( [ @@ -103,7 +104,7 @@ def render_boolean(config: Settings, summary: dict) -> dict: name=config.html.style._labels[idx], anchor_id=f"{varid}cat_frequency_plot_{idx}", ) - for idx, s in enumerate(summary["value_counts_without_nan"]) + for idx, s in enumerate(summary.value_counts_without_nan) ], anchor_id=f"{varid}cat_frequency_plot", name="Common Values (Plot)", @@ -114,10 +115,7 @@ def render_boolean(config: Settings, summary: dict) -> dict: else: items.append( Image( - cat_frequency_plot( - config, - summary["value_counts_without_nan"], - ), + cat_frequency_plot(config, summary.value_counts_without_nan), image_format=image_format, alt="Common Values (Plot)", name="Common Values (Plot)", diff --git a/src/ydata_profiling/report/structure/variables/render_categorical.py b/src/ydata_profiling/report/structure/variables/render_categorical.py index 86f5a262a..db1b5ec52 100644 --- a/src/ydata_profiling/report/structure/variables/render_categorical.py +++ b/src/ydata_profiling/report/structure/variables/render_categorical.py @@ -3,6 +3,7 @@ import pandas as pd from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import ( fmt, fmt_bytesize, @@ -27,7 +28,7 @@ def render_categorical_frequency( - config: Settings, summary: dict, varid: str + config: Settings, summary: VarDescription, varid: str ) -> Renderable: frequency_table = Table( [ @@ -54,7 +55,7 @@ def render_categorical_frequency( def 
render_categorical_length( - config: Settings, summary: dict, varid: str + config: Settings, summary: VarDescription, varid: str ) -> Tuple[Renderable, Renderable]: length_table = Table( [ @@ -117,7 +118,7 @@ def _get_n(value: Union[list, pd.DataFrame]) -> Union[int, List[int]]: def render_categorical_unicode( - config: Settings, summary: dict, varid: str + config: Settings, summary: VarDescription, varid: str ) -> Tuple[Renderable, Renderable]: n_freq_table_max = config.n_freq_table_max @@ -329,7 +330,7 @@ def render_categorical_unicode( ) -def render_categorical(config: Settings, summary: dict) -> dict: +def render_categorical(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] n_obs_cat = config.vars.cat.n_obs image_format = config.plot.image_format @@ -366,17 +367,17 @@ def render_categorical(config: Settings, summary: dict) -> dict: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": "n_missing" in summary["alert_fields"], }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": "p_missing" in summary["alert_fields"], }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], @@ -385,8 +386,8 @@ def render_categorical(config: Settings, summary: dict) -> dict: fqm = FrequencyTableSmall( freq_table( - freqtable=summary["value_counts_without_nan"], - n=summary["count"], + freqtable=summary.value_counts_without_nan, + n=summary.count, max_number_to_print=n_obs_cat, ), redact=config.vars.cat.redact, @@ -459,26 +460,28 @@ def render_categorical(config: Settings, summary: dict) -> dict: max_unique = config.plot.cat_freq.max_unique if show and (max_unique > 0): - if isinstance(summary["value_counts_without_nan"], list): + if isinstance(summary.value_counts_without_nan, list): string_items.append( Container( [ - Image( - 
cat_frequency_plot( - config, - s, - ), - image_format=image_format, - alt=config.html.style._labels[idx], - name=config.html.style._labels[idx], - anchor_id=f"{varid}cat_frequency_plot_{idx}", + ( + Image( + cat_frequency_plot( + config, + s, + ), + image_format=image_format, + alt=config.html.style._labels[idx], + name=config.html.style._labels[idx], + anchor_id=f"{varid}cat_frequency_plot_{idx}", + ) + if summary["n_distinct"][idx] <= max_unique + else HTML( + f"
config.plot.cat_freq.max_unique)"
+ )
)
- if summary["n_distinct"][idx] <= max_unique
- else HTML(
- f"config.plot.cat_freq.max_unique)"
- )
- for idx, s in enumerate(summary["value_counts_without_nan"])
+ for idx, s in enumerate(summary.value_counts_without_nan)
],
anchor_id=f"{varid}cat_frequency_plot",
name="Common Values (Plot)",
@@ -493,7 +496,7 @@ def render_categorical(config: Settings, summary: dict) -> dict:
Image(
cat_frequency_plot(
config,
- summary["value_counts_without_nan"],
+ summary.value_counts_without_nan,
),
image_format=image_format,
alt="Common Values (Plot)",
@@ -515,9 +518,9 @@ def render_categorical(config: Settings, summary: dict) -> dict:
string_items,
name="Categories",
anchor_id=f"{varid}string",
- sequence_type="named_list"
- if len(config.html.style._labels) > 1
- else "batch_grid",
+ sequence_type=(
+ "named_list" if len(config.html.style._labels) > 1 else "batch_grid"
+ ),
batch_size=len(config.html.style._labels),
),
]
diff --git a/src/ydata_profiling/report/structure/variables/render_common.py b/src/ydata_profiling/report/structure/variables/render_common.py
index aef8de357..b597eda08 100644
--- a/src/ydata_profiling/report/structure/variables/render_common.py
+++ b/src/ydata_profiling/report/structure/variables/render_common.py
@@ -1,30 +1,31 @@
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.presentation.frequency_table_utils import (
extreme_obs_table,
freq_table,
)
-def render_common(config: Settings, summary: dict) -> dict:
+def render_common(config: Settings, summary: VarDescription) -> dict:
n_extreme_obs = config.n_extreme_obs
n_freq_table_max = config.n_freq_table_max
template_variables = {
# TODO: with nan
"freq_table_rows": freq_table(
- freqtable=summary["value_counts_without_nan"],
- n=summary["n"],
+ freqtable=summary.value_counts_without_nan,
+ n=summary.n,
max_number_to_print=n_freq_table_max,
),
"firstn_expanded": extreme_obs_table(
- freqtable=summary["value_counts_index_sorted"],
+ freqtable=summary.value_counts_index_sorted,
number_to_print=n_extreme_obs,
- n=summary["n"],
+ n=summary.n,
),
"lastn_expanded": extreme_obs_table(
- freqtable=summary["value_counts_index_sorted"][::-1],
+ freqtable=summary.value_counts_index_sorted[::-1],
number_to_print=n_extreme_obs,
- n=summary["n"],
+ n=summary.n,
),
}
diff --git a/src/ydata_profiling/report/structure/variables/render_complex.py b/src/ydata_profiling/report/structure/variables/render_complex.py
index 5995285e5..5c4ea7d09 100644
--- a/src/ydata_profiling/report/structure/variables/render_complex.py
+++ b/src/ydata_profiling/report/structure/variables/render_complex.py
@@ -1,4 +1,5 @@
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import (
fmt,
fmt_bytesize,
@@ -15,7 +16,7 @@
from ydata_profiling.visualisation.plot import scatter_complex
-def render_complex(config: Settings, summary: dict) -> dict:
+def render_complex(config: Settings, summary: VarDescription) -> dict:
varid = summary["varid"]
template_variables = {}
image_format = config.plot.image_format
@@ -37,14 +38,14 @@ def render_complex(config: Settings, summary: dict) -> dict:
"name": "Distinct (%)",
"value": fmt_percent(summary["p_distinct"]),
},
- {"name": "Missing", "value": fmt(summary["n_missing"])},
+ {"name": "Missing", "value": fmt(summary.n_missing)},
{
"name": "Missing (%)",
- "value": fmt_percent(summary["p_missing"]),
+ "value": fmt_percent(summary.p_missing),
},
{
"name": "Memory size",
- "value": fmt_bytesize(summary["memory_size"]),
+ "value": fmt_bytesize(summary.memory_size),
},
],
style=config.html.style,
diff --git a/src/ydata_profiling/report/structure/variables/render_count.py b/src/ydata_profiling/report/structure/variables/render_count.py
index e11e9913e..e9b238659 100644
--- a/src/ydata_profiling/report/structure/variables/render_count.py
+++ b/src/ydata_profiling/report/structure/variables/render_count.py
@@ -1,4 +1,5 @@
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import (
fmt,
fmt_bytesize,
@@ -16,7 +17,7 @@
from ydata_profiling.visualisation.plot import histogram, mini_histogram
-def render_count(config: Settings, summary: dict) -> dict:
+def render_count(config: Settings, summary: VarDescription) -> dict:
template_variables = render_common(config, summary)
image_format = config.plot.image_format
@@ -44,12 +45,12 @@ def render_count(config: Settings, summary: dict) -> dict:
},
{
"name": "Missing",
- "value": fmt(summary["n_missing"]),
+ "value": fmt(summary.n_missing),
"alert": False,
},
{
"name": "Missing (%)",
- "value": fmt_percent(summary["p_missing"]),
+ "value": fmt_percent(summary.p_missing),
"alert": False,
},
],
@@ -87,7 +88,7 @@ def render_count(config: Settings, summary: dict) -> dict:
},
{
"name": "Memory size",
- "value": fmt_bytesize(summary["memory_size"]),
+ "value": fmt_bytesize(summary.memory_size),
"alert": False,
},
],
diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py
index 1f142daae..da33ad211 100644
--- a/src/ydata_profiling/report/structure/variables/render_date.py
+++ b/src/ydata_profiling/report/structure/variables/render_date.py
@@ -1,6 +1,7 @@
from typing import Any, Dict
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
from ydata_profiling.report.presentation.core import (
Container,
@@ -11,7 +12,7 @@
from ydata_profiling.visualisation.plot import histogram, mini_histogram
-def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
+def render_date(config: Settings, summary: VarDescription) -> Dict[str, Any]:
varid = summary["varid"]
template_variables = {}
@@ -41,17 +42,17 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
},
{
"name": "Missing",
- "value": fmt(summary["n_missing"]),
+ "value": fmt(summary.n_missing),
"alert": False,
},
{
"name": "Missing (%)",
- "value": fmt_percent(summary["p_missing"]),
+ "value": fmt_percent(summary.p_missing),
"alert": False,
},
{
"name": "Memory size",
- "value": fmt_bytesize(summary["memory_size"]),
+ "value": fmt_bytesize(summary.memory_size),
"alert": False,
},
],
diff --git a/src/ydata_profiling/report/structure/variables/render_file.py b/src/ydata_profiling/report/structure/variables/render_file.py
index 81379a41f..e54dd6e6e 100644
--- a/src/ydata_profiling/report/structure/variables/render_file.py
+++ b/src/ydata_profiling/report/structure/variables/render_file.py
@@ -1,6 +1,7 @@
from typing import List
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.presentation.core import Container, FrequencyTable, Image
from ydata_profiling.report.presentation.core.renderable import Renderable
from ydata_profiling.report.presentation.frequency_table_utils import freq_table
@@ -8,7 +9,7 @@
from ydata_profiling.visualisation.plot import histogram
-def render_file(config: Settings, summary: dict) -> dict:
+def render_file(config: Settings, summary: VarDescription) -> dict:
varid = summary["varid"]
template_variables = render_path(config, summary)
@@ -44,7 +45,7 @@ def render_file(config: Settings, summary: dict) -> dict:
FrequencyTable(
freq_table(
freqtable=summary[file_date_id].value_counts(),
- n=summary["n"],
+ n=summary.n,
max_number_to_print=n_freq_table_max,
),
name=description,
diff --git a/src/ydata_profiling/report/structure/variables/render_generic.py b/src/ydata_profiling/report/structure/variables/render_generic.py
index 0a8ce1e55..0b468b127 100644
--- a/src/ydata_profiling/report/structure/variables/render_generic.py
+++ b/src/ydata_profiling/report/structure/variables/render_generic.py
@@ -1,4 +1,5 @@
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
from ydata_profiling.report.presentation.core import (
HTML,
@@ -8,7 +9,7 @@
)
-def render_generic(config: Settings, summary: dict) -> dict:
+def render_generic(config: Settings, summary: VarDescription) -> dict:
info = VariableInfo(
anchor_id=summary["varid"],
alerts=summary["alerts"],
@@ -22,17 +23,17 @@ def render_generic(config: Settings, summary: dict) -> dict:
[
{
"name": "Missing",
- "value": fmt(summary["n_missing"]),
+ "value": fmt(summary.n_missing),
"alert": "n_missing" in summary["alert_fields"],
},
{
"name": "Missing (%)",
- "value": fmt_percent(summary["p_missing"]),
+ "value": fmt_percent(summary.p_missing),
"alert": "p_missing" in summary["alert_fields"],
},
{
"name": "Memory size",
- "value": fmt_bytesize(summary["memory_size"]),
+ "value": fmt_bytesize(summary.memory_size),
"alert": False,
},
],
diff --git a/src/ydata_profiling/report/structure/variables/render_image.py b/src/ydata_profiling/report/structure/variables/render_image.py
index ea1336208..a4491ef89 100644
--- a/src/ydata_profiling/report/structure/variables/render_image.py
+++ b/src/ydata_profiling/report/structure/variables/render_image.py
@@ -1,6 +1,7 @@
import pandas as pd
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import fmt_numeric
from ydata_profiling.report.presentation.core import (
Container,
@@ -13,7 +14,7 @@
from ydata_profiling.visualisation.plot import scatter_series
-def render_image(config: Settings, summary: dict) -> dict:
+def render_image(config: Settings, summary: VarDescription) -> dict:
varid = summary["varid"]
n_freq_table_max = config.n_freq_table_max
redact = config.vars.cat.redact
@@ -135,7 +136,7 @@ def render_image(config: Settings, summary: dict) -> dict:
FrequencyTable(
freq_table(
freqtable=summary["image_dimensions"].value_counts(),
- n=summary["n"],
+ n=summary.n,
max_number_to_print=n_freq_table_max,
),
name="Common values",
@@ -156,7 +157,7 @@ def render_image(config: Settings, summary: dict) -> dict:
FrequencyTable(
freq_table(
freqtable=pd.Series(summary["exif_keys_counts"]),
- n=summary["n"],
+ n=summary.n,
max_number_to_print=n_freq_table_max,
),
name="Exif keys",
@@ -172,7 +173,7 @@ def render_image(config: Settings, summary: dict) -> dict:
FrequencyTable(
freq_table(
freqtable=counts,
- n=summary["n"],
+ n=summary.n,
max_number_to_print=n_freq_table_max,
),
name=key,
diff --git a/src/ydata_profiling/report/structure/variables/render_path.py b/src/ydata_profiling/report/structure/variables/render_path.py
index d7cde6f06..eaade0114 100644
--- a/src/ydata_profiling/report/structure/variables/render_path.py
+++ b/src/ydata_profiling/report/structure/variables/render_path.py
@@ -1,4 +1,5 @@
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import fmt, fmt_numeric
from ydata_profiling.report.presentation.core import Container, FrequencyTable, Table
from ydata_profiling.report.presentation.frequency_table_utils import freq_table
@@ -7,7 +8,7 @@
)
-def render_path(config: Settings, summary: dict) -> dict:
+def render_path(config: Settings, summary: VarDescription) -> dict:
varid = summary["varid"]
n_freq_table_max = config.n_freq_table_max
redact = config.vars.cat.redact
@@ -18,7 +19,7 @@ def render_path(config: Settings, summary: dict) -> dict:
for path_part in keys:
template_variables[f"freqtable_{path_part}"] = freq_table(
freqtable=summary[f"{path_part}_counts"],
- n=summary["n"],
+ n=summary.n,
max_number_to_print=n_freq_table_max,
)
diff --git a/src/ydata_profiling/report/structure/variables/render_real.py b/src/ydata_profiling/report/structure/variables/render_real.py
index 227200c27..2c9005d44 100644
--- a/src/ydata_profiling/report/structure/variables/render_real.py
+++ b/src/ydata_profiling/report/structure/variables/render_real.py
@@ -1,4 +1,5 @@
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import (
fmt,
fmt_bytesize,
@@ -17,7 +18,7 @@
from ydata_profiling.visualisation.plot import histogram, mini_histogram
-def render_real(config: Settings, summary: dict) -> dict:
+def render_real(config: Settings, summary: VarDescription) -> dict:
varid = summary["varid"]
template_variables = render_common(config, summary)
image_format = config.plot.image_format
@@ -48,12 +49,12 @@ def render_real(config: Settings, summary: dict) -> dict:
},
{
"name": "Missing",
- "value": fmt(summary["n_missing"]),
+ "value": fmt(summary.n_missing),
"alert": "n_missing" in summary["alert_fields"],
},
{
"name": "Missing (%)",
- "value": fmt_percent(summary["p_missing"]),
+ "value": fmt_percent(summary.p_missing),
"alert": "p_missing" in summary["alert_fields"],
},
{
@@ -111,7 +112,7 @@ def render_real(config: Settings, summary: dict) -> dict:
},
{
"name": "Memory size",
- "value": fmt_bytesize(summary["memory_size"]),
+ "value": fmt_bytesize(summary.memory_size),
"alert": False,
},
],
diff --git a/src/ydata_profiling/report/structure/variables/render_text.py b/src/ydata_profiling/report/structure/variables/render_text.py
index 5eadf3799..c4c690e5e 100644
--- a/src/ydata_profiling/report/structure/variables/render_text.py
+++ b/src/ydata_profiling/report/structure/variables/render_text.py
@@ -1,6 +1,7 @@
from typing import Any, Dict, List
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
from ydata_profiling.report.presentation.core import (
Container,
@@ -21,7 +22,7 @@
from ydata_profiling.visualisation.plot import plot_word_cloud
-def render_text(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
+def render_text(config: Settings, summary: VarDescription) -> Dict[str, Any]:
if config.vars.text.redact:
render = render_categorical(config, summary)
return render
@@ -58,17 +59,17 @@ def render_text(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
},
{
"name": "Missing",
- "value": fmt(summary["n_missing"]),
+ "value": fmt(summary.n_missing),
"alert": "n_missing" in summary["alert_fields"],
},
{
"name": "Missing (%)",
- "value": fmt_percent(summary["p_missing"]),
+ "value": fmt_percent(summary.p_missing),
"alert": "p_missing" in summary["alert_fields"],
},
{
"name": "Memory size",
- "value": fmt_bytesize(summary["memory_size"]),
+ "value": fmt_bytesize(summary.memory_size),
"alert": False,
},
],
diff --git a/src/ydata_profiling/report/structure/variables/render_timeseries.py b/src/ydata_profiling/report/structure/variables/render_timeseries.py
index 6f3bc27cd..78e62402d 100644
--- a/src/ydata_profiling/report/structure/variables/render_timeseries.py
+++ b/src/ydata_profiling/report/structure/variables/render_timeseries.py
@@ -1,4 +1,5 @@
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import (
fmt,
fmt_bytesize,
@@ -81,7 +82,7 @@ def _render_gap_tab(config: Settings, summary: dict) -> Container:
)
-def render_timeseries(config: Settings, summary: dict) -> dict:
+def render_timeseries(config: Settings, summary: VarDescription) -> dict:
varid = summary["varid"]
template_variables = render_common(config, summary)
image_format = config.plot.image_format
@@ -111,12 +112,12 @@ def render_timeseries(config: Settings, summary: dict) -> dict:
},
{
"name": "Missing",
- "value": fmt(summary["n_missing"]),
+ "value": fmt(summary.n_missing),
"alert": "n_missing" in summary["alert_fields"],
},
{
"name": "Missing (%)",
- "value": fmt_percent(summary["p_missing"]),
+ "value": fmt_percent(summary.p_missing),
"alert": "p_missing" in summary["alert_fields"],
},
{
@@ -164,7 +165,7 @@ def render_timeseries(config: Settings, summary: dict) -> dict:
},
{
"name": "Memory size",
- "value": fmt_bytesize(summary["memory_size"]),
+ "value": fmt_bytesize(summary.memory_size),
"alert": False,
},
],
diff --git a/src/ydata_profiling/report/structure/variables/render_url.py b/src/ydata_profiling/report/structure/variables/render_url.py
index f35d6dcb6..59c007ea5 100644
--- a/src/ydata_profiling/report/structure/variables/render_url.py
+++ b/src/ydata_profiling/report/structure/variables/render_url.py
@@ -1,4 +1,5 @@
from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
from ydata_profiling.report.presentation.core import (
Container,
@@ -11,7 +12,7 @@
from ydata_profiling.report.structure.variables.render_common import render_common
-def render_url(config: Settings, summary: dict) -> dict:
+def render_url(config: Settings, summary: VarDescription) -> dict:
varid = summary["varid"]
n_freq_table_max = config.n_freq_table_max
@@ -24,7 +25,7 @@ def render_url(config: Settings, summary: dict) -> dict:
for url_part in keys:
template_variables[f"freqtable_{url_part}"] = freq_table(
freqtable=summary[f"{url_part}_counts"],
- n=summary["n"],
+ n=summary.n,
max_number_to_print=n_freq_table_max,
)
@@ -101,17 +102,17 @@ def render_url(config: Settings, summary: dict) -> dict:
},
{
"name": "Missing",
- "value": fmt(summary["n_missing"]),
+ "value": fmt(summary.n_missing),
"alert": "n_missing" in summary["alert_fields"],
},
{
"name": "Missing (%)",
- "value": fmt_percent(summary["p_missing"]),
+ "value": fmt_percent(summary.p_missing),
"alert": "p_missing" in summary["alert_fields"],
},
{
"name": "Memory size",
- "value": fmt_bytesize(summary["memory_size"]),
+ "value": fmt_bytesize(summary.memory_size),
"alert": False,
},
],
@@ -120,8 +121,8 @@ def render_url(config: Settings, summary: dict) -> dict:
fqm = FrequencyTableSmall(
freq_table(
- freqtable=summary["value_counts_without_nan"],
- n=summary["n"],
+ freqtable=summary.value_counts_without_nan,
+ n=summary.n,
max_number_to_print=n_obs_cat,
),
redact=redact,
diff --git a/tests/unit/test_comparison.py b/tests/unit/test_comparison.py
index 748c5af12..6d5a547b1 100644
--- a/tests/unit/test_comparison.py
+++ b/tests/unit/test_comparison.py
@@ -66,7 +66,8 @@ def test_generate_comparison():
p1 = ProfileReport(df1, title="p1")
p2 = ProfileReport(df2, title="p1")
- html = p1.compare(p2).to_html()
+ _compare = p1.compare(p2)
+ html = _compare.to_html()
assert len(html) > 0
diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py
index cbae1bda6..362ea7c0b 100644
--- a/tests/unit/test_describe.py
+++ b/tests/unit/test_describe.py
@@ -9,6 +9,7 @@
from ydata_profiling.model.describe import describe
from ydata_profiling.model.summary import describe_1d
from ydata_profiling.model.typeset import ProfilingTypeSet
+from ydata_profiling.model.var_description.default import VarDescription
check_is_NaN = "ydata_profiling.check_is_NaN"
@@ -49,7 +50,7 @@ def test_describe_unique(data, expected, summarizer, typeset):
config = Settings()
config.vars.num.low_categorical_threshold = 0
- desc_1d = describe_1d(config, data, summarizer, typeset)
+ desc_1d: VarDescription = describe_1d(config, data, summarizer, typeset)
if expected["is_unique"] is not None:
assert (
desc_1d["p_unique"] == expected["p_unique"]
@@ -562,6 +563,13 @@ def test_describe_df(column, describe_data, expected_results, summarizer):
for k, v in expected_results[column].items():
if v == check_is_NaN:
test_condition = k not in results.variables[column]
+ # values from common description
+ elif k in asdict(results.variables[column]):
+ if isinstance(v, float):
+ assert pytest.approx(v) == getattr(results.variables[column], k)
+ else:
+ assert v == getattr(results.variables[column], k)
+ continue
elif isinstance(v, float):
test_condition = pytest.approx(v) == results.variables[column][k]
else:
@@ -595,6 +603,6 @@ def test_decribe_series_type_schema(config, summarizer):
result = describe(config, df, summarizer, typeset)
assert result.variables["date"]["type"] == "DateTime"
- assert result.variables["date"]["n_missing"] == 0
+ assert result.variables["date"].n_missing == 0
assert result.variables["date"]["n_invalid_dates"] == 2
assert result.variables["date"]["p_invalid_dates"] == 0.5
diff --git a/tests/unit/test_ge_integration_expectations.py b/tests/unit/test_ge_integration_expectations.py
index 4ef0b1465..ed721e4a7 100644
--- a/tests/unit/test_ge_integration_expectations.py
+++ b/tests/unit/test_ge_integration_expectations.py
@@ -1,4 +1,4 @@
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch
import pytest
@@ -20,14 +20,22 @@ def batch():
def test_generic_expectations(batch):
- generic_expectations("column", {"n_missing": 0, "p_unique": 1.0}, batch)
+ var_specific = {"p_unique": 1.0}  # reached through item access
+ description = MagicMock()
+ description.n_missing = 0  # reached through attribute access
+ description.__getitem__.side_effect = var_specific.__getitem__
+ generic_expectations("column", description, batch)
batch.expect_column_to_exist.assert_called_once()
batch.expect_column_values_to_not_be_null.assert_called_once()
batch.expect_column_values_to_be_unique.assert_called_once()
def test_generic_expectations_min(batch):
- generic_expectations("column", {"n_missing": 1, "p_unique": 0.5}, batch)
+ var_specific = {"p_unique": 0.5}  # reached through item access
+ description = MagicMock()
+ description.n_missing = 1  # reached through attribute access
+ description.__getitem__.side_effect = var_specific.__getitem__
+ generic_expectations("column", description, batch)
batch.expect_column_to_exist.assert_called_once()
batch.expect_column_values_to_not_be_null.assert_not_called()
batch.expect_column_values_to_be_unique.assert_not_called()
@@ -93,22 +101,21 @@ def test_numeric_expectations_min(batch):
def test_categorical_expectations(batch):
- categorical_expectations(
- "column",
- {
- "n_distinct": 1,
- "p_distinct": 0.1,
- "value_counts_without_nan": {"val1": 1, "val2": 2},
- },
- batch,
- )
+ default_desc = MagicMock()
+ d = {"n_distinct": 1, "p_distinct": 0.1}  # same keys as the dict this replaces
+ default_desc.__getitem__.side_effect = d.__getitem__
+ default_desc.value_counts_without_nan = {"val1": 1, "val2": 2}
+ categorical_expectations("column", default_desc, batch)
batch.expect_column_values_to_be_in_set.assert_called_once_with(
"column", {"val1", "val2"}
)
def test_categorical_expectations_min(batch):
- categorical_expectations("column", {"n_distinct": 15, "p_distinct": 1.0}, batch)
+ var_specific = {"n_distinct": 15, "p_distinct": 1.0}
+ description = MagicMock()
+ description.__getitem__.side_effect = var_specific.__getitem__
+ categorical_expectations("column", description, batch)
batch.expect_column_values_to_be_in_set.assert_not_called()
diff --git a/tests/unit/test_summarizer.py b/tests/unit/test_summarizer.py
index f8d374216..631dab650 100644
--- a/tests/unit/test_summarizer.py
+++ b/tests/unit/test_summarizer.py
@@ -2,13 +2,14 @@
import pandas as pd
+from ydata_profiling.config import Settings
from ydata_profiling.model.summarizer import ProfilingSummarizer, format_summary
from ydata_profiling.model.typeset import ProfilingTypeSet
base_path = os.path.abspath(os.path.dirname(__file__))
-def test_summarizer(config):
+def test_summarizer_base_types(config: Settings):
pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config))
_ = format_summary(pps.summarize(config, pd.Series([1, 2, 3, 4, 5]), "Unsupported"))
@@ -23,9 +24,23 @@ def test_summarizer(config):
_ = format_summary(
pps.summarize(config, pd.Series(["abc", "abc", "abba"]), "Categorical")
)
+
+ _ = format_summary(
+ pps.summarize(config, pd.Series([True, False, True, False, False]), "Boolean")
+ )
+
+
+def test_summarizer_url(config: Settings):
+ config.vars.url.active = True  # activate URL before building the typeset
+ pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config))
_ = format_summary(
pps.summarize(config, pd.Series(["https://www.example.com"]), "URL")
)
+
+
+def test_summarizer_path(config: Settings):
+ config.vars.path.active = True
+ pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config))
_ = format_summary(
pps.summarize(
config,
@@ -40,6 +55,12 @@ def test_summarizer(config):
"Path",
)
)
+
+
+def test_summarizer_file(config: Settings):
+ config.vars.path.active = True
+ config.vars.file.active = True
+ pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config))
_ = format_summary(
pps.summarize(
config,
@@ -53,15 +74,19 @@ def test_summarizer(config):
"File",
)
)
+
+
+def test_summarizer_image(config: Settings):
+ config.vars.path.active = True  # Path, File and Image are all activated
+ config.vars.file.active = True
+ config.vars.image.active = True
+ pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config))
_ = format_summary(
pps.summarize(
config,
pd.Series(
- [os.path.abspath(base_path + r"../../../docsrc/assets/logo_header.png")]
+ [os.path.abspath(base_path + r"../../../docs/_static/img/cli.png")]
),
"Image",
)
)
- _ = format_summary(
- pps.summarize(config, pd.Series([True, False, True, False, False]), "Boolean")
- )
diff --git a/tests/unit/test_summary_algos.py b/tests/unit/test_summary_algos.py
index 523ce5fcd..d7aa90045 100644
--- a/tests/unit/test_summary_algos.py
+++ b/tests/unit/test_summary_algos.py
@@ -4,35 +4,37 @@
from ydata_profiling.config import Settings
from ydata_profiling.model.summary_algorithms import (
- describe_counts,
- describe_generic,
describe_supported,
histogram_compute,
)
+from ydata_profiling.model.var_description.default import VarDescription
def test_count_summary_sorted(config):
s = pd.Series([1] + [2] * 1000)
- _, sn, r = describe_counts(config, s, {})
- assert r["value_counts_without_nan"].index[0] == 2
- assert r["value_counts_without_nan"].index[1] == 1
+ summary: VarDescription
+ _, _, summary = describe_supported(config, s, {})
+ assert summary.value_counts_without_nan.index[0] == 2  # most frequent first
+ assert summary.value_counts_without_nan.index[1] == 1
def test_count_summary_nat(config):
+ summary: VarDescription
s = pd.to_datetime(pd.Series([1, 2] + [np.nan, pd.NaT]))
- _, sn, r = describe_counts(config, s, {})
- assert len(r["value_counts_without_nan"].index) == 2
+ _, _, summary = describe_supported(config, s, {})
+ assert len(summary.value_counts_without_nan.index) == 2  # NaN and NaT excluded
def test_count_summary_category(config):
+ summary: VarDescription
s = pd.Series(
pd.Categorical(
["Poor", "Neutral"] + [np.nan] * 100,
categories=["Poor", "Neutral", "Excellent"],
)
)
- _, sn, r = describe_counts(config, s, {})
- assert len(r["value_counts_without_nan"].index) == 2
+ _, _, summary = describe_supported(config, s, {})
+ assert len(summary.value_counts_without_nan.index) == 2  # only observed categories
@pytest.fixture(scope="class")
@@ -41,16 +43,12 @@ def empty_data() -> pd.DataFrame:
def test_summary_supported_empty_df(config, empty_data):
- _, series, summary = describe_counts(config, empty_data["A"], {})
- assert summary["n_missing"] == 0
- assert "p_missing" not in summary
-
- _, series, summary = describe_generic(config, series, summary)
- assert summary["n_missing"] == 0
- assert summary["p_missing"] == 0
- assert summary["count"] == 0
-
- _, _, summary = describe_supported(config, series, summary)
+ summary: VarDescription
+ _, _, summary = describe_supported(config, empty_data["A"], {})
+ assert summary.n_missing == 0
+ assert summary.n_missing == 0
+ assert summary.p_missing == 0
+ assert summary.count == 0
assert summary["n_distinct"] == 0
assert summary["p_distinct"] == 0
assert summary["n_unique"] == 0