From 36cb9ee8f1eb04de9c6ae6840233175ffd8626fe Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Sat, 16 Dec 2023 16:11:14 +0100 Subject: [PATCH 01/10] feat: add vardescription class --- .../model/pandas/correlations_pandas.py | 15 ++--- .../model/pandas/describe_boolean_pandas.py | 14 ++--- .../pandas/describe_categorical_pandas.py | 8 +-- .../model/pandas/describe_counts_pandas.py | 9 +-- .../model/pandas/describe_date_pandas.py | 5 +- .../model/pandas/describe_file_pandas.py | 5 +- .../model/pandas/describe_generic_pandas.py | 9 +-- .../model/pandas/describe_image_pandas.py | 5 +- .../model/pandas/describe_numeric_pandas.py | 17 +++--- .../model/pandas/describe_path_pandas.py | 10 ++-- .../model/pandas/describe_supported_pandas.py | 29 +++------- .../model/pandas/describe_text_pandas.py | 7 ++- .../pandas/describe_timeseries_pandas.py | 5 +- .../model/pandas/describe_url_pandas.py | 5 +- .../model/pandas/summary_pandas.py | 7 ++- .../model/pandas/table_pandas.py | 9 +-- .../pandas/var_description/counts_pandas.py | 53 ++++++++++++++++++ .../pandas/var_description/default_pandas.py | 31 ++++++++++ .../model/var_description/counts.py | 27 +++++++++ .../model/var_description/default.py | 56 +++++++++++++++++++ 20 files changed, 246 insertions(+), 80 deletions(-) create mode 100644 src/ydata_profiling/model/pandas/var_description/counts_pandas.py create mode 100644 src/ydata_profiling/model/pandas/var_description/default_pandas.py create mode 100644 src/ydata_profiling/model/var_description/counts.py create mode 100644 src/ydata_profiling/model/var_description/default.py diff --git a/src/ydata_profiling/model/pandas/correlations_pandas.py b/src/ydata_profiling/model/pandas/correlations_pandas.py index ab82e6353..698969270 100644 --- a/src/ydata_profiling/model/pandas/correlations_pandas.py +++ b/src/ydata_profiling/model/pandas/correlations_pandas.py @@ -1,4 +1,5 @@ """Correlations between variables.""" + import itertools import warnings from typing import Callable, Optional @@ -20,6 +21,7 @@ DiscretizationType, Discretizer, ) +from ydata_profiling.model.var_description.default import VarDescription @Spearman.compute.register(Settings, pd.DataFrame, dict) @@ -87,9 +89,9 @@ def _pairwise_cramers(col_1: pd.Series, col_2: pd.Series) -> float: return _cramers_corrected_stat(pd.crosstab(col_1, col_2), correction=True) -@Cramers.compute.register(Settings, pd.DataFrame, dict) +@Cramers.compute.register(Settings, pd.DataFrame, dict[str, VarDescription]) def pandas_cramers_compute( - config: Settings, df: pd.DataFrame, summary: dict + config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription] ) -> Optional[pd.DataFrame]: threshold = config.categorical_maximum_correlation_distinct @@ -128,9 +130,9 @@ def pandas_cramers_compute( return correlation_matrix -@PhiK.compute.register(Settings, pd.DataFrame, dict) +@PhiK.compute.register(Settings, pd.DataFrame, dict[str, VarDescription]) def pandas_phik_compute( - config: Settings, df: pd.DataFrame, summary: dict + config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription] ) -> Optional[pd.DataFrame]: df_cols_dict = {i: list(df.columns).index(i) for i in df.columns} @@ -164,9 +166,9 @@ def pandas_phik_compute( return correlation -@Auto.compute.register(Settings, pd.DataFrame, dict) +@Auto.compute.register(Settings, pd.DataFrame, dict[str, VarDescription]) def pandas_auto_compute( - config: Settings, df: pd.DataFrame, summary: dict + config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription] ) -> Optional[pd.DataFrame]: threshold = config.categorical_maximum_correlation_distinct numerical_columns = [ @@ -195,7 +197,6 @@ def pandas_auto_compute( columns=columns_tested, ) for col_1_name, col_2_name in itertools.combinations(columns_tested, 2): - method = ( _pairwise_spearman if any(elem in categorical_columns for elem in [col_1_name, col_2_name]) diff --git a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py index 9b2014db7..07d446337 100644 --- a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py @@ -5,17 +5,14 @@ from ydata_profiling.config import Settings from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score -from ydata_profiling.model.summary_algorithms import ( - describe_boolean_1d, - series_hashable, -) +from ydata_profiling.model.summary_algorithms import describe_boolean_1d +from ydata_profiling.model.var_description.default import VarDescription @describe_boolean_1d.register -@series_hashable def pandas_describe_boolean_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a boolean series. Args: @@ -26,8 +23,7 @@ def pandas_describe_boolean_1d( Returns: A dict containing calculated series description values. """ - - value_counts: pd.Series = summary["value_counts_without_nan"] + value_counts: pd.Series = summary.value_counts_without_nan if not value_counts.empty: summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]}) summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts)) diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py index 31ae57417..b2b381aa1 100644 --- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py @@ -16,6 +16,7 @@ series_handle_nulls, series_hashable, ) +from ydata_profiling.model.var_description.default import VarDescription def get_character_counts_vc(vc: pd.Series) -> pd.Series: @@ -214,8 +215,8 @@ def length_summary_vc(vc: pd.Series) -> dict: @series_hashable @series_handle_nulls def pandas_describe_categorical_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a categorical series. Args: @@ -226,12 +227,11 @@ def pandas_describe_categorical_1d( Returns: A dict containing calculated series description values. """ - # Make sure we deal with strings (Issue #100) series = series.astype(str) # Only run if at least 1 non-missing value - value_counts = summary["value_counts_without_nan"] + value_counts = summary.value_counts_without_nan value_counts.index = value_counts.index.astype(str) summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts)) diff --git a/src/ydata_profiling/model/pandas/describe_counts_pandas.py b/src/ydata_profiling/model/pandas/describe_counts_pandas.py index 07cdad9d5..416474d25 100644 --- a/src/ydata_profiling/model/pandas/describe_counts_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_counts_pandas.py @@ -4,12 +4,13 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_counts +from ydata_profiling.model.var_description.default import VarDescription @describe_counts.register def pandas_describe_counts( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Counts the values in a series (with and without NaN, distinct). Args: @@ -27,7 +28,7 @@ def pandas_describe_counts( except: # noqa: E722 hashable = False - summary["hashable"] = hashable + summary.hashable = hashable if hashable: value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0] @@ -58,6 +59,6 @@ def pandas_describe_counts( ordering = False summary["ordering"] = ordering - summary["n_missing"] = n_missing + summary.n_missing = n_missing return config, series, summary diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py index 1ff64a50f..39ca21b8c 100644 --- a/src/ydata_profiling/model/pandas/describe_date_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py @@ -11,14 +11,15 @@ series_handle_nulls, series_hashable, ) +from ydata_profiling.model.var_description.default import VarDescription @describe_date_1d.register @series_hashable @series_handle_nulls def pandas_describe_date_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a date series. Args: diff --git a/src/ydata_profiling/model/pandas/describe_file_pandas.py b/src/ydata_profiling/model/pandas/describe_file_pandas.py index 84ee3c4ab..18b4e511c 100644 --- a/src/ydata_profiling/model/pandas/describe_file_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_file_pandas.py @@ -6,6 +6,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_file_1d, histogram_compute +from ydata_profiling.model.var_description.default import VarDescription def file_summary(series: pd.Series) -> dict: @@ -36,8 +37,8 @@ def convert_datetime(x: float) -> str: @describe_file_1d.register def pandas_describe_file_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: if series.hasnans: raise ValueError("May not contain NaNs") if not hasattr(series, "str"): diff --git a/src/ydata_profiling/model/pandas/describe_generic_pandas.py b/src/ydata_profiling/model/pandas/describe_generic_pandas.py index 21b804e66..fcc5b04b6 100644 --- a/src/ydata_profiling/model/pandas/describe_generic_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_generic_pandas.py @@ -4,12 +4,13 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_generic +from ydata_profiling.model.var_description.default import VarDescription @describe_generic.register def pandas_describe_generic( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe generic series. Args: @@ -27,8 +28,8 @@ def pandas_describe_generic( summary.update( { "n": length, - "p_missing": summary["n_missing"] / length if length > 0 else 0, - "count": length - summary["n_missing"], + "p_missing": summary.n_missing / length if length > 0 else 0, + "count": length - summary.n_missing, "memory_size": series.memory_usage(deep=config.memory_deep), } ) diff --git a/src/ydata_profiling/model/pandas/describe_image_pandas.py b/src/ydata_profiling/model/pandas/describe_image_pandas.py index 08675ed0c..d5f7c8975 100644 --- a/src/ydata_profiling/model/pandas/describe_image_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_image_pandas.py @@ -12,6 +12,7 @@ describe_image_1d, named_aggregate_summary, ) +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.imghdr_patch import * # noqa: F401,F403 @@ -243,8 +244,8 @@ def image_summary(series: pd.Series, exif: bool = False, hash: bool = False) -> @describe_image_1d.register def pandas_describe_image_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: if series.hasnans: raise ValueError("May not contain NaNs") if not hasattr(series, "str"): diff --git a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py index fa3ffd6cf..c51e0ddd8 100644 --- a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.compat import pandas_version_info if pandas_version_info() >= (1, 5): @@ -44,9 +45,9 @@ def numeric_stats_pandas(series: pd.Series) -> Dict[str, Any]: def numeric_stats_numpy( - present_values: np.ndarray, series: pd.Series, series_description: Dict[str, Any] + present_values: np.ndarray, series: pd.Series, series_description: VarDescription ) -> Dict[str, Any]: - vc = series_description["value_counts_without_nan"] + vc = series_description.value_counts_without_nan index_values = vc.index.values # FIXME: can be performance optimized by using weights in std, var, kurt and skew... @@ -80,8 +81,8 @@ def numeric_stats_numpy( @series_hashable @series_handle_nulls def pandas_describe_numeric_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a numeric series. Args: @@ -96,11 +97,11 @@ def pandas_describe_numeric_1d( chi_squared_threshold = config.vars.num.chi_squared_threshold quantiles = config.vars.num.quantiles - value_counts = summary["value_counts_without_nan"] + value_counts = summary.value_counts_without_nan negative_index = value_counts.index < 0 summary["n_negative"] = value_counts.loc[negative_index].sum() - summary["p_negative"] = summary["n_negative"] / summary["n"] + summary["p_negative"] = summary["n_negative"] / summary.n infinity_values = [np.inf, -np.inf] infinity_index = value_counts.index.isin(infinity_values) @@ -139,8 +140,8 @@ def pandas_describe_numeric_1d( ) stats["iqr"] = stats["75%"] - stats["25%"] stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN - stats["p_zeros"] = stats["n_zeros"] / summary["n"] - stats["p_infinite"] = summary["n_infinite"] / summary["n"] + stats["p_zeros"] = stats["n_zeros"] / summary.n + stats["p_infinite"] = summary["n_infinite"] / summary.n stats["monotonic_increase"] = series.is_monotonic_increasing stats["monotonic_decrease"] = series.is_monotonic_decreasing diff --git a/src/ydata_profiling/model/pandas/describe_path_pandas.py b/src/ydata_profiling/model/pandas/describe_path_pandas.py index e3e536f99..31ac65f88 100644 --- a/src/ydata_profiling/model/pandas/describe_path_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_path_pandas.py @@ -5,6 +5,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_path_1d +from ydata_profiling.model.var_description.default import VarDescription def path_summary(series: pd.Series) -> dict: @@ -19,8 +20,9 @@ def path_summary(series: pd.Series) -> dict: # TODO: optimize using value counts summary = { - "common_prefix": os.path.commonprefix(series.values.tolist()) - or "No common prefix", + "common_prefix": ( + os.path.commonprefix(series.values.tolist()) or "No common prefix" + ), "stem_counts": series.map(lambda x: os.path.splitext(x)[0]).value_counts(), "suffix_counts": series.map(lambda x: os.path.splitext(x)[1]).value_counts(), "name_counts": series.map(lambda x: os.path.basename(x)).value_counts(), @@ -39,8 +41,8 @@ def path_summary(series: pd.Series) -> dict: @describe_path_1d.register def pandas_describe_path_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a path series. Args: diff --git a/src/ydata_profiling/model/pandas/describe_supported_pandas.py b/src/ydata_profiling/model/pandas/describe_supported_pandas.py index 16bd9ab38..69e19f873 100644 --- a/src/ydata_profiling/model/pandas/describe_supported_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_supported_pandas.py @@ -3,14 +3,17 @@ import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_supported, series_hashable +from ydata_profiling.model.pandas.var_description.default_pandas import ( + get_default_pandas_description, +) +from ydata_profiling.model.summary_algorithms import describe_supported +from ydata_profiling.model.var_description.default import VarDescription @describe_supported.register -@series_hashable def pandas_describe_supported( - config: Settings, series: pd.Series, series_description: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, description: dict +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a supported series. Args: @@ -22,20 +25,6 @@ def pandas_describe_supported( A dict containing calculated series description values. """ - # number of non-NaN observations in the Series - count = series_description["count"] + series_description = get_default_pandas_description(config, series, description) - value_counts = series_description["value_counts_without_nan"] - distinct_count = len(value_counts) - unique_count = value_counts.where(value_counts == 1).count() - - stats = { - "n_distinct": distinct_count, - "p_distinct": distinct_count / count if count > 0 else 0, - "is_unique": unique_count == count and count > 0, - "n_unique": unique_count, - "p_unique": unique_count / count if count > 0 else 0, - } - stats.update(series_description) - - return config, series, stats + return config, series, series_description diff --git a/src/ydata_profiling/model/pandas/describe_text_pandas.py b/src/ydata_profiling/model/pandas/describe_text_pandas.py index 2701b9760..1cf71f200 100644 --- a/src/ydata_profiling/model/pandas/describe_text_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_text_pandas.py @@ -14,6 +14,7 @@ series_handle_nulls, series_hashable, ) +from ydata_profiling.model.var_description.default import VarDescription @describe_text_1d.register @@ -22,8 +23,8 @@ def pandas_describe_text_1d( config: Settings, series: pd.Series, - summary: dict, -) -> Tuple[Settings, pd.Series, dict]: + summary: VarDescription, +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe string series. Args: @@ -38,7 +39,7 @@ def pandas_describe_text_1d( series = series.astype(str) # Only run if at least 1 non-missing value - value_counts = summary["value_counts_without_nan"] + value_counts = summary.value_counts_without_nan value_counts.index = value_counts.index.astype(str) summary.update({"first_rows": series.head(5)}) diff --git a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py index 5ffe99a9f..de8903460 100644 --- a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py @@ -13,6 +13,7 @@ series_handle_nulls, series_hashable, ) +from ydata_profiling.model.var_description.default import VarDescription def stationarity_test(config: Settings, series: pd.Series) -> Tuple[bool, float]: @@ -195,8 +196,8 @@ def compute_gap_stats(series: pd.Series) -> pd.Series: @series_hashable @series_handle_nulls def pandas_describe_timeseries_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a timeseries. Args: diff --git a/src/ydata_profiling/model/pandas/describe_url_pandas.py b/src/ydata_profiling/model/pandas/describe_url_pandas.py index bfe5239bf..4a64a8c30 100644 --- a/src/ydata_profiling/model/pandas/describe_url_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_url_pandas.py @@ -5,6 +5,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_url_1d +from ydata_profiling.model.var_description.default import VarDescription def url_summary(series: pd.Series) -> dict: @@ -29,8 +30,8 @@ def url_summary(series: pd.Series) -> dict: @describe_url_1d.register def pandas_describe_url_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, pd.Series, VarDescription]: """Describe a url series. Args: diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index bbb401fd0..d66906caa 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -13,6 +13,7 @@ from ydata_profiling.model.summarizer import BaseSummarizer from ydata_profiling.model.summary import describe_1d, get_series_descriptions from ydata_profiling.model.typeset import ProfilingTypeSet +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.dataframe import sort_column_names @@ -22,7 +23,7 @@ def pandas_describe_1d( series: pd.Series, summarizer: BaseSummarizer, typeset: VisionsTypeset, -) -> dict: +) -> VarDescription: """Describe a series (infer the variable type, then calculate type-specific values). Args: @@ -64,8 +65,8 @@ def pandas_get_series_descriptions( summarizer: BaseSummarizer, typeset: VisionsTypeset, pbar: tqdm, -) -> dict: - def multiprocess_1d(args: tuple) -> Tuple[str, dict]: +) -> dict[str, VarDescription]: + def multiprocess_1d(args: tuple) -> Tuple[str, VarDescription]: """Wrapper to process series in parallel. Args: diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py index a919ee33b..bef531e2f 100644 --- a/src/ydata_profiling/model/pandas/table_pandas.py +++ b/src/ydata_profiling/model/pandas/table_pandas.py @@ -4,11 +4,12 @@ from ydata_profiling.config import Settings from ydata_profiling.model.table import get_table_stats +from ydata_profiling.model.var_description.default import VarDescription @get_table_stats.register def pandas_get_table_stats( - config: Settings, df: pd.DataFrame, variable_stats: dict + config: Settings, df: pd.DataFrame, variable_stats: dict[str, VarDescription] ) -> dict: """General statistics for the DataFrame. @@ -36,10 +37,10 @@ def pandas_get_table_stats( } for series_summary in variable_stats.values(): - if "n_missing" in series_summary and series_summary["n_missing"] > 0: + if series_summary.n_missing > 0: table_stats["n_vars_with_missing"] += 1 - table_stats["n_cells_missing"] += series_summary["n_missing"] - if series_summary["n_missing"] == n: + table_stats["n_cells_missing"] += series_summary.n_missing + if series_summary.n_missing == n: table_stats["n_vars_all_missing"] += 1 table_stats["p_cells_missing"] = ( diff --git a/src/ydata_profiling/model/pandas/var_description/counts_pandas.py b/src/ydata_profiling/model/pandas/var_description/counts_pandas.py new file mode 100644 index 000000000..6ffc3e6d5 --- /dev/null +++ b/src/ydata_profiling/model/pandas/var_description/counts_pandas.py @@ -0,0 +1,53 @@ +import pandas as pd + +from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.counts import VarCounts + + +def get_counts_pandas(config: Settings, series: pd.Series) -> VarCounts: + """Get a VarCounts object for a pandas series.""" + length = len(series) + + try: + value_counts_with_nan = series.value_counts(dropna=False) + _ = set(value_counts_with_nan.index) + hashable = True + except: # noqa: E722 + hashable = False + + value_counts_without_nan = None + value_counts_index_sorted = None + if hashable: + value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0] + + null_index = value_counts_with_nan.index.isnull() + if null_index.any(): + n_missing = value_counts_with_nan[null_index].sum() + value_counts_without_nan = value_counts_with_nan[~null_index] + else: + n_missing = 0 + value_counts_without_nan = value_counts_with_nan + + try: + value_counts_index_sorted = value_counts_without_nan.sort_index( + ascending=True + ) + ordering = True + except TypeError: + ordering = False + else: + n_missing = series.isna().sum() + ordering = False + + return VarCounts( + hashable=hashable, + value_counts_without_nan=value_counts_without_nan, + value_counts_index_sorted=value_counts_index_sorted, + ordering=ordering, + n_missing=n_missing, + n=length, + p_missing=series.isna().sum() / length if length > 0 else 0, + count=length - series.isna().sum(), + memory_size=series.memory_usage(deep=config.memory_deep), + value_counts=None, + ) diff --git a/src/ydata_profiling/model/pandas/var_description/default_pandas.py b/src/ydata_profiling/model/pandas/var_description/default_pandas.py new file mode 100644 index 000000000..6a1c21a42 --- /dev/null +++ b/src/ydata_profiling/model/pandas/var_description/default_pandas.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import pandas as pd + +from ydata_profiling.config import Settings +from ydata_profiling.model.pandas.var_description.counts_pandas import get_counts_pandas +from ydata_profiling.model.var_description.default import VarDescription + + +def get_default_pandas_description( + config: Settings, series: pd.Series, init_dict: dict +) -> VarDescription: + var_counts = get_counts_pandas(config, series) + + if var_counts.hashable: + count = var_counts.count + value_counts = var_counts.value_counts_without_nan + distinct_count = len(value_counts) + unique_count = value_counts.where(value_counts == 1).count() + + init_dict.update( + { + "n_distinct": distinct_count, + "p_distinct": distinct_count / count if count > 0 else 0, + "is_unique": unique_count == count and count > 0, + "n_unique": unique_count, + "p_unique": unique_count / count if count > 0 else 0, + } + ) + + return VarDescription.from_var_counts(var_counts, init_dict) diff --git a/src/ydata_profiling/model/var_description/counts.py b/src/ydata_profiling/model/var_description/counts.py new file mode 100644 index 000000000..70f96af20 --- /dev/null +++ b/src/ydata_profiling/model/var_description/counts.py @@ -0,0 +1,27 @@ +from dataclasses import dataclass +from typing import Any, Union + + +@dataclass +class VarCounts: + """Data about counts in variable column.""" + + n: Union[int, list] + """Count of rows in the series.""" + count: Union[int, list] + """Count of not missing rows in the series.""" + n_missing: Union[int, list] + """Count of missing rows in the series.""" + p_missing: Union[float, list] + """Proportion of missing rows in the series.""" + + hashable: Union[bool, list] + value_counts_without_nan: Any + """Counts of values in the series without NaN. Values as index, counts as values.""" + value_counts_index_sorted: Any + """Sorted counts of values in the series without NaN. Sorted by counts.""" + ordering: Union[bool, list] + memory_size: Union[int, list] + + value_counts: Any + """Counts of values in original series type. Values as index, counts as values.""" diff --git a/src/ydata_profiling/model/var_description/default.py b/src/ydata_profiling/model/var_description/default.py new file mode 100644 index 000000000..49aa63e82 --- /dev/null +++ b/src/ydata_profiling/model/var_description/default.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from collections import abc +from dataclasses import dataclass +from typing import Any, Iterator + +from ydata_profiling.model.var_description.counts import VarCounts + + +@dataclass +class VarDescription(VarCounts): + """Default description for one data column. + Extends VarCounts class with information about distinct and unique values.""" + + var_specific: dict + + def __getitem__(self, item: str): + """Make the object subscriptable.""" + return self.var_specific[item] + + def __setitem__(self, key: str, value: Any): + """Make the object subscriptable.""" + self.var_specific[key] = value + + def update(self, _dict: dict) -> None: + """To support old dict like interface.""" + self.var_specific.update(_dict) + + def items(self) -> abc.ItemsView: + """To support old dict like interface.""" + return self.var_specific.items() + + def get(self, key: str, default: Any = None) -> Any: + """To support old dict like interface.""" + return self.var_specific.get(key, default) + + def __iter__(self) -> Iterator: + """To support old dict like interface.""" + return self.var_specific.__iter__() + + @classmethod + def from_var_counts(cls, var_counts: VarCounts, init_dict: dict) -> VarDescription: + """Get a default description from a VarCounts object.""" + return VarDescription( + n=var_counts.n, + count=var_counts.count, + n_missing=var_counts.n_missing, + p_missing=var_counts.p_missing, + hashable=var_counts.hashable, + memory_size=var_counts.memory_size, + ordering=var_counts.ordering, + var_specific=init_dict, + value_counts_index_sorted=var_counts.value_counts_index_sorted, + value_counts_without_nan=var_counts.value_counts_without_nan, + value_counts=var_counts.value_counts, + ) From f6fa91c2bafaf2a0ef7ad2cd916ee80f98d35491 Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Sat, 16 Dec 2023 16:31:42 +0100 Subject: [PATCH 02/10] feat: make pandas profiling work just fine --- src/ydata_profiling/model/alerts.py | 94 ++++++++++--------- src/ydata_profiling/model/describe.py | 4 +- src/ydata_profiling/model/description.py | 4 +- .../model/expectation_algorithms.py | 38 ++++---- src/ydata_profiling/model/summarizer.py | 9 +- src/ydata_profiling/model/summary.py | 5 +- .../report/structure/report.py | 4 +- 7 files changed, 84 insertions(+), 74 deletions(-) diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index d3232ea9b..9ce40522a 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -1,13 +1,15 @@ """Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros , constant values, high correlations).""" + from enum import Enum, auto, unique -from typing import Any, Dict, List, Optional, Set +from typing import Dict, List, Optional, Set import numpy as np import pandas as pd from ydata_profiling.config import Settings from ydata_profiling.model.correlations import perform_check_correlation +from ydata_profiling.model.var_description.default import VarDescription def fmt_percent(value: float, edge_cases: bool = True) -> str: @@ -143,13 +145,13 @@ def __repr__(self): class ConstantLengthAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.CONSTANT_LENGTH, - values=values, + values=values.var_specific, column_name=column_name, fields={"composition_min_length", "composition_max_length"}, is_empty=is_empty, @@ -162,15 +164,14 @@ def _get_description(self) -> str: class ConstantAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.CONSTANT, - values=values, + values={"n_distinct": values["n_distinct"]}, column_name=column_name, - fields={"n_distinct"}, is_empty=is_empty, ) @@ -181,7 +182,7 @@ def _get_description(self) -> str: class DuplicatesAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: dict, column_name: Optional[str] = None, is_empty: bool = False, ): @@ -203,15 +204,14 @@ def _get_description(self) -> str: class EmptyAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.EMPTY, - values=values, + values={"n": values.n}, column_name=column_name, - fields={"n"}, is_empty=is_empty, ) @@ -222,15 +222,14 @@ def _get_description(self) -> str: class HighCardinalityAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.HIGH_CARDINALITY, - values=values, + values={"n_distinct": values["n_distinct"]}, column_name=column_name, - fields={"n_distinct"}, is_empty=is_empty, ) @@ -244,7 +243,7 @@ def _get_description(self) -> str: class HighCorrelationAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: Dict, column_name: Optional[str] = None, is_empty: bool = False, ): @@ -270,13 +269,13 @@ def _get_description(self) -> str: class ImbalanceAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.IMBALANCE, - values=values, + values=values.var_specific, column_name=column_name, fields={"imbalance"}, is_empty=is_empty, @@ -293,13 +292,13 @@ def _get_description(self) -> str: class InfiniteAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.INFINITE, - values=values, + values=values.var_specific, column_name=column_name, fields={"p_infinite", "n_infinite"}, is_empty=is_empty, @@ -315,15 +314,14 @@ def _get_description(self) -> str: class MissingAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.MISSING, - values=values, + values={"p_missing": values.p_missing, "n_missing": values.n_missing}, column_name=column_name, - fields={"p_missing", "n_missing"}, is_empty=is_empty, ) @@ -373,13 +371,13 @@ def _get_description(self) -> str: class SkewedAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.SKEWED, - values=values, + values=values.var_specific, column_name=column_name, fields={"skewness"}, is_empty=is_empty, @@ -432,15 +430,19 @@ def _get_description(self) -> str: class UniqueAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.UNIQUE, - values=values, + values={ + "n_distinct": values["n_distinct"], + "p_distinct": values["p_distinct"], + "n_unique": values["n_unique"], + "p_unique": values["p_unique"], + }, column_name=column_name, - fields={"n_distinct", "p_distinct", "n_unique", "p_unique"}, is_empty=is_empty, ) @@ -469,13 +471,13 @@ def _get_description(self) -> str: class ZerosAlert(Alert): def __init__( self, - values: Optional[Dict] = None, + values: VarDescription, column_name: Optional[str] = None, is_empty: bool = False, ): super().__init__( alert_type=AlertType.ZEROS, - values=values, + values=values.var_specific, column_name=column_name, fields={"n_zeros", "p_zeros"}, is_empty=is_empty, @@ -531,7 +533,7 @@ def check_table_alerts(table: dict) -> List[Alert]: return alerts -def numeric_alerts(config: Settings, summary: dict) -> List[Alert]: +def numeric_alerts(config: Settings, summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] # Skewness @@ -555,7 +557,7 @@ def numeric_alerts(config: Settings, summary: dict) -> List[Alert]: return alerts -def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]: +def timeseries_alerts(config: Settings, summary: VarDescription) -> List[Alert]: alerts: List[Alert] = numeric_alerts(config, summary) if not summary["stationary"]: @@ -567,7 +569,7 @@ def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]: return alerts -def categorical_alerts(config: Settings, summary: dict) -> List[Alert]: +def categorical_alerts(config: Settings, summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] # High cardinality @@ -585,7 +587,7 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]: # Constant length if "composition" in summary and summary["min_length"] == summary["max_length"]: - alerts.append(ConstantLengthAlert()) + alerts.append(ConstantLengthAlert(summary)) # Imbalance if ( @@ -596,38 +598,38 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]: return alerts -def boolean_alerts(config: Settings, summary: dict) -> List[Alert]: +def boolean_alerts(config: Settings, summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] if ( "imbalance" in summary and summary["imbalance"] > config.vars.bool.imbalance_threshold ): - alerts.append(ImbalanceAlert()) + alerts.append(ImbalanceAlert(summary)) return alerts -def generic_alerts(summary: dict) -> List[Alert]: +def generic_alerts(summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] # Missing - if alert_value(summary["p_missing"]): - alerts.append(MissingAlert()) + if alert_value(summary.p_missing): + alerts.append(MissingAlert(summary)) return alerts -def supported_alerts(summary: dict) -> List[Alert]: +def supported_alerts(summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [] - if summary.get("n_distinct", np.nan) == summary["n"]: - alerts.append(UniqueAlert()) + if summary.get("n_distinct", np.nan) == summary.n: + alerts.append(UniqueAlert(summary)) if summary.get("n_distinct", np.nan) == 1: alerts.append(ConstantAlert(summary)) return alerts -def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]: +def unsupported_alerts(summary: VarDescription) -> List[Alert]: alerts: List[Alert] = [ UnsupportedAlert(), RejectedAlert(), @@ -635,7 +637,9 @@ def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]: return alerts -def check_variable_alerts(config: Settings, col: str, description: dict) -> List[Alert]: +def check_variable_alerts( + config: Settings, col: str, description: VarDescription +) -> List[Alert]: """Checks individual variables for alerts. Args: @@ -665,7 +669,6 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List for idx in range(len(alerts)): alerts[idx].column_name = col - alerts[idx].values = description return alerts @@ -693,7 +696,10 @@ def check_correlation_alerts(config: Settings, correlations: dict) -> List[Alert def get_alerts( - config: Settings, table_stats: dict, series_description: dict, correlations: dict + config: Settings, + table_stats: dict, + series_description: dict[str, VarDescription], + correlations: dict, ) -> List[Alert]: alerts: List[Alert] = check_table_alerts(table_stats) for col, description in series_description.items(): diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py index 6a7afffe1..a8bfcac1a 100644 --- a/src/ydata_profiling/model/describe.py +++ b/src/ydata_profiling/model/describe.py @@ -1,4 +1,5 @@ """Organize the calculation of statistics for each series in this DataFrame.""" + from datetime import datetime from typing import Any, Dict, Optional @@ -23,6 +24,7 @@ from ydata_profiling.model.summary import get_series_descriptions from ydata_profiling.model.table import get_table_stats from ydata_profiling.model.timeseries_index import get_time_index_description +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.progress_bar import progress from ydata_profiling.version import __version__ @@ -71,7 +73,7 @@ def describe( # Variable-specific pbar.total += len(df.columns) - series_description = get_series_descriptions( + series_description: dict[str, VarDescription] = get_series_descriptions( config, df, summarizer, typeset, pbar ) diff --git a/src/ydata_profiling/model/description.py b/src/ydata_profiling/model/description.py index 6c386704e..e000b6dcb 100644 --- a/src/ydata_profiling/model/description.py +++ b/src/ydata_profiling/model/description.py @@ -4,6 +4,8 @@ from pandas import Timedelta +from ydata_profiling.model.var_description.default import VarDescription + @dataclass class BaseAnalysis: @@ -98,7 +100,7 @@ class BaseDescription: analysis: BaseAnalysis time_index_analysis: Optional[TimeIndexAnalysis] table: Any - variables: Dict[str, Any] + variables: Dict[str, VarDescription] scatter: Any correlations: Dict[str, Any] missing: Dict[str, Any] diff --git a/src/ydata_profiling/model/expectation_algorithms.py b/src/ydata_profiling/model/expectation_algorithms.py index cbbeb635e..50c1feea7 100644 --- a/src/ydata_profiling/model/expectation_algorithms.py +++ b/src/ydata_profiling/model/expectation_algorithms.py @@ -1,12 +1,14 @@ from typing import Any, Tuple +from ydata_profiling.model.var_description.default import VarDescription + def generic_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: batch.expect_column_to_exist(name) - if summary["n_missing"] == 0: + if summary.n_missing == 0: batch.expect_column_values_to_not_be_null(name) if summary["p_unique"] == 1.0: @@ -16,8 +18,8 @@ def generic_expectations( def numeric_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: from great_expectations.profile.base import ProfilerTypeMapping numeric_type_names = ( @@ -56,8 +58,8 @@ def numeric_expectations( def categorical_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: # Use for both categorical and special case (boolean) absolute_threshold = 10 relative_threshold = 0.2 @@ -66,20 +68,20 @@ def categorical_expectations( or summary["p_distinct"] < relative_threshold ): batch.expect_column_values_to_be_in_set( - name, set(summary["value_counts_without_nan"].keys()) + name, set(summary.value_counts_without_nan.keys()) ) return name, summary, batch def path_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: return name, summary, batch def datetime_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: if any(k in summary for k in ["min", "max"]): batch.expect_column_values_to_be_between( name, @@ -92,20 +94,20 @@ def datetime_expectations( def image_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: return name, summary, batch def url_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: return name, summary, batch def file_expectations( - name: str, summary: dict, batch: Any, *args -) -> Tuple[str, dict, Any]: + name: str, summary: VarDescription, batch: Any, *args +) -> Tuple[str, VarDescription, Any]: # By definition within our type logic, a file exists (as it's a path that also exists) batch.expect_file_to_exist(name) diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index 92e66733c..e5f46c9b4 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -11,10 +11,8 @@ from ydata_profiling.model.summary_algorithms import ( describe_boolean_1d, describe_categorical_1d, - describe_counts, describe_date_1d, describe_file_1d, - describe_generic, describe_image_1d, describe_numeric_1d, describe_path_1d, @@ -23,6 +21,7 @@ describe_timeseries_1d, describe_url_1d, ) +from ydata_profiling.model.var_description.default import VarDescription class BaseSummarizer(Handler): @@ -33,7 +32,7 @@ class BaseSummarizer(Handler): def summarize( self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType] - ) -> dict: + ) -> VarDescription: """ Returns: @@ -49,8 +48,6 @@ class PandasProfilingSummarizer(BaseSummarizer): def __init__(self, typeset: VisionsTypeset, *args, **kwargs): summary_map: Dict[str, List[Callable]] = { "Unsupported": [ - describe_counts, - describe_generic, describe_supported, ], "Numeric": [ @@ -87,7 +84,7 @@ def __init__(self, typeset: VisionsTypeset, *args, **kwargs): super().__init__(summary_map, typeset, *args, **kwargs) -def format_summary(summary: Union[BaseDescription, dict]) -> dict: +def format_summary(summary: Union[BaseDescription, VarDescription, dict]) -> dict: """Prepare summary for export to json file. Args: diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py index 477aae1ca..8e4179598 100644 --- a/src/ydata_profiling/model/summary.py +++ b/src/ydata_profiling/model/summary.py @@ -8,6 +8,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summarizer import BaseSummarizer +from ydata_profiling.model.var_description.default import VarDescription @multimethod @@ -16,7 +17,7 @@ def describe_1d( series: Any, summarizer: BaseSummarizer, typeset: VisionsTypeset, -) -> dict: +) -> VarDescription: raise NotImplementedError() @@ -27,5 +28,5 @@ def get_series_descriptions( summarizer: BaseSummarizer, typeset: VisionsTypeset, pbar: tqdm, -) -> dict: +) -> dict[str, VarDescription]: raise NotImplementedError() diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 24b11e56a..71ea837b3 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -138,7 +138,7 @@ def render_variables_section( "alert_fields": alert_fields, } - template_variables.update(summary) + summary.update(template_variables) # Per type template variables if isinstance(summary["type"], list): @@ -159,7 +159,7 @@ def render_variables_section( else: variable_type = summary["type"] render_map_type = render_map.get(variable_type, render_map["Unsupported"]) - template_variables.update(render_map_type(config, template_variables)) + template_variables.update(render_map_type(config, summary)) # Ignore these if reject_variables: From b3e7120e27867fe9417a566af85b160268b78b17 Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Sat, 16 Dec 2023 16:32:20 +0100 Subject: [PATCH 03/10] feat: update render to support VarDescription --- .../structure/variables/render_boolean.py | 22 +++---- .../structure/variables/render_categorical.py | 61 ++++++++++--------- .../structure/variables/render_common.py | 15 ++--- .../structure/variables/render_complex.py | 9 +-- .../structure/variables/render_count.py | 9 +-- .../report/structure/variables/render_date.py | 9 +-- .../report/structure/variables/render_file.py | 5 +- .../structure/variables/render_generic.py | 9 +-- .../structure/variables/render_image.py | 9 +-- .../report/structure/variables/render_path.py | 5 +- .../report/structure/variables/render_real.py | 9 +-- .../report/structure/variables/render_text.py | 9 +-- .../structure/variables/render_timeseries.py | 9 +-- .../report/structure/variables/render_url.py | 15 ++--- 14 files changed, 104 insertions(+), 91 deletions(-) diff --git a/src/ydata_profiling/report/structure/variables/render_boolean.py b/src/ydata_profiling/report/structure/variables/render_boolean.py index e6bdbe4d0..b2213f682 100644 --- a/src/ydata_profiling/report/structure/variables/render_boolean.py +++ b/src/ydata_profiling/report/structure/variables/render_boolean.py @@ -1,6 +1,7 @@ from typing import List from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent from ydata_profiling.report.presentation.core import ( Container, @@ -16,7 +17,7 @@ from ydata_profiling.visualisation.plot import cat_frequency_plot -def render_boolean(config: Settings, summary: dict) -> dict: +def render_boolean(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] n_obs_bool = config.vars.bool.n_obs image_format = config.plot.image_format @@ -48,17 +49,17 @@ def render_boolean(config: Settings, summary: dict) -> dict: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": "n_missing" in summary["alert_fields"], }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": "p_missing" in summary["alert_fields"], }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], @@ -67,8 +68,8 @@ def render_boolean(config: Settings, summary: dict) -> dict: fqm = FrequencyTableSmall( freq_table( - freqtable=summary["value_counts_without_nan"], - n=summary["n"], + freqtable=summary.value_counts_without_nan, + n=summary.n, max_number_to_print=n_obs_bool, ), redact=False, @@ -89,7 +90,7 @@ def render_boolean(config: Settings, summary: dict) -> dict: max_unique = config.plot.cat_freq.max_unique if show and (max_unique > 0): - if isinstance(summary["value_counts_without_nan"], list): + if isinstance(summary.value_counts_without_nan, list): items.append( Container( [ @@ -103,7 +104,7 @@ def render_boolean(config: Settings, summary: dict) -> dict: name=config.html.style._labels[idx], anchor_id=f"{varid}cat_frequency_plot_{idx}", ) - for idx, s in enumerate(summary["value_counts_without_nan"]) + for idx, s in enumerate(summary.value_counts_without_nan) ], anchor_id=f"{varid}cat_frequency_plot", name="Common Values (Plot)", @@ -114,10 +115,7 @@ def render_boolean(config: Settings, summary: dict) -> dict: else: items.append( Image( - cat_frequency_plot( - config, - summary["value_counts_without_nan"], - ), + cat_frequency_plot(config, summary.value_counts_without_nan), image_format=image_format, alt="Common Values (Plot)", name="Common Values (Plot)", diff --git a/src/ydata_profiling/report/structure/variables/render_categorical.py b/src/ydata_profiling/report/structure/variables/render_categorical.py index 86f5a262a..db1b5ec52 100644 --- a/src/ydata_profiling/report/structure/variables/render_categorical.py +++ b/src/ydata_profiling/report/structure/variables/render_categorical.py @@ -3,6 +3,7 @@ import pandas as pd from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import ( fmt, fmt_bytesize, @@ -27,7 +28,7 @@ def render_categorical_frequency( - config: Settings, summary: dict, varid: str + config: Settings, summary: VarDescription, varid: str ) -> Renderable: frequency_table = Table( [ @@ -54,7 +55,7 @@ def render_categorical_frequency( def render_categorical_length( - config: Settings, summary: dict, varid: str + config: Settings, summary: VarDescription, varid: str ) -> Tuple[Renderable, Renderable]: length_table = Table( [ @@ -117,7 +118,7 @@ def _get_n(value: Union[list, pd.DataFrame]) -> Union[int, List[int]]: def render_categorical_unicode( - config: Settings, summary: dict, varid: str + config: Settings, summary: VarDescription, varid: str ) -> Tuple[Renderable, Renderable]: n_freq_table_max = config.n_freq_table_max @@ -329,7 +330,7 @@ def render_categorical_unicode( ) -def render_categorical(config: Settings, summary: dict) -> dict: +def render_categorical(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] n_obs_cat = config.vars.cat.n_obs image_format = config.plot.image_format @@ -366,17 +367,17 @@ def render_categorical(config: Settings, summary: dict) -> dict: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": "n_missing" in summary["alert_fields"], }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": "p_missing" in summary["alert_fields"], }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], @@ -385,8 +386,8 @@ def render_categorical(config: Settings, summary: dict) -> dict: fqm = FrequencyTableSmall( freq_table( - freqtable=summary["value_counts_without_nan"], - n=summary["count"], + freqtable=summary.value_counts_without_nan, + n=summary.count, max_number_to_print=n_obs_cat, ), redact=config.vars.cat.redact, @@ -459,26 +460,28 @@ def render_categorical(config: Settings, summary: dict) -> dict: max_unique = config.plot.cat_freq.max_unique if show and (max_unique > 0): - if isinstance(summary["value_counts_without_nan"], list): + if isinstance(summary.value_counts_without_nan, list): string_items.append( Container( [ - Image( - cat_frequency_plot( - config, - s, - ), - image_format=image_format, - alt=config.html.style._labels[idx], - name=config.html.style._labels[idx], - anchor_id=f"{varid}cat_frequency_plot_{idx}", + ( + Image( + cat_frequency_plot( + config, + s, + ), + image_format=image_format, + alt=config.html.style._labels[idx], + name=config.html.style._labels[idx], + anchor_id=f"{varid}cat_frequency_plot_{idx}", + ) + if summary["n_distinct"][idx] <= max_unique + else HTML( + f"

{config.html.style._labels[idx]}


" + f"Number of variable categories passes threshold (config.plot.cat_freq.max_unique)" + ) ) - if summary["n_distinct"][idx] <= max_unique - else HTML( - f"

{config.html.style._labels[idx]}


" - f"Number of variable categories passes threshold (config.plot.cat_freq.max_unique)" - ) - for idx, s in enumerate(summary["value_counts_without_nan"]) + for idx, s in enumerate(summary.value_counts_without_nan) ], anchor_id=f"{varid}cat_frequency_plot", name="Common Values (Plot)", @@ -493,7 +496,7 @@ def render_categorical(config: Settings, summary: dict) -> dict: Image( cat_frequency_plot( config, - summary["value_counts_without_nan"], + summary.value_counts_without_nan, ), image_format=image_format, alt="Common Values (Plot)", @@ -515,9 +518,9 @@ def render_categorical(config: Settings, summary: dict) -> dict: string_items, name="Categories", anchor_id=f"{varid}string", - sequence_type="named_list" - if len(config.html.style._labels) > 1 - else "batch_grid", + sequence_type=( + "named_list" if len(config.html.style._labels) > 1 else "batch_grid" + ), batch_size=len(config.html.style._labels), ), ] diff --git a/src/ydata_profiling/report/structure/variables/render_common.py b/src/ydata_profiling/report/structure/variables/render_common.py index aef8de357..b597eda08 100644 --- a/src/ydata_profiling/report/structure/variables/render_common.py +++ b/src/ydata_profiling/report/structure/variables/render_common.py @@ -1,30 +1,31 @@ from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.presentation.frequency_table_utils import ( extreme_obs_table, freq_table, ) -def render_common(config: Settings, summary: dict) -> dict: +def render_common(config: Settings, summary: VarDescription) -> dict: n_extreme_obs = config.n_extreme_obs n_freq_table_max = config.n_freq_table_max template_variables = { # TODO: with nan "freq_table_rows": freq_table( - freqtable=summary["value_counts_without_nan"], - n=summary["n"], + freqtable=summary.value_counts_without_nan, + n=summary.n, max_number_to_print=n_freq_table_max, ), "firstn_expanded": extreme_obs_table( - freqtable=summary["value_counts_index_sorted"], + freqtable=summary.value_counts_index_sorted, number_to_print=n_extreme_obs, - n=summary["n"], + n=summary.n, ), "lastn_expanded": extreme_obs_table( - freqtable=summary["value_counts_index_sorted"][::-1], + freqtable=summary.value_counts_index_sorted[::-1], number_to_print=n_extreme_obs, - n=summary["n"], + n=summary.n, ), } diff --git a/src/ydata_profiling/report/structure/variables/render_complex.py b/src/ydata_profiling/report/structure/variables/render_complex.py index 5995285e5..5c4ea7d09 100644 --- a/src/ydata_profiling/report/structure/variables/render_complex.py +++ b/src/ydata_profiling/report/structure/variables/render_complex.py @@ -1,4 +1,5 @@ from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import ( fmt, fmt_bytesize, @@ -15,7 +16,7 @@ from ydata_profiling.visualisation.plot import scatter_complex -def render_complex(config: Settings, summary: dict) -> dict: +def render_complex(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] template_variables = {} image_format = config.plot.image_format @@ -37,14 +38,14 @@ def render_complex(config: Settings, summary: dict) -> dict: "name": "Distinct (%)", "value": fmt_percent(summary["p_distinct"]), }, - {"name": "Missing", "value": fmt(summary["n_missing"])}, + {"name": "Missing", "value": fmt(summary.n_missing)}, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), }, ], style=config.html.style, diff --git a/src/ydata_profiling/report/structure/variables/render_count.py b/src/ydata_profiling/report/structure/variables/render_count.py index e11e9913e..e9b238659 100644 --- a/src/ydata_profiling/report/structure/variables/render_count.py +++ b/src/ydata_profiling/report/structure/variables/render_count.py @@ -1,4 +1,5 @@ from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import ( fmt, fmt_bytesize, @@ -16,7 +17,7 @@ from ydata_profiling.visualisation.plot import histogram, mini_histogram -def render_count(config: Settings, summary: dict) -> dict: +def render_count(config: Settings, summary: VarDescription) -> dict: template_variables = render_common(config, summary) image_format = config.plot.image_format @@ -44,12 +45,12 @@ def render_count(config: Settings, summary: dict) -> dict: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": False, }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": False, }, ], @@ -87,7 +88,7 @@ def render_count(config: Settings, summary: dict) -> dict: }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py index c75a80a5e..94b489cf5 100644 --- a/src/ydata_profiling/report/structure/variables/render_date.py +++ b/src/ydata_profiling/report/structure/variables/render_date.py @@ -1,6 +1,7 @@ from typing import Any, Dict from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent from ydata_profiling.report.presentation.core import ( Container, @@ -11,7 +12,7 @@ from ydata_profiling.visualisation.plot import histogram, mini_histogram -def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: +def render_date(config: Settings, summary: VarDescription) -> Dict[str, Any]: varid = summary["varid"] template_variables = {} @@ -41,17 +42,17 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": False, }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": False, }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], diff --git a/src/ydata_profiling/report/structure/variables/render_file.py b/src/ydata_profiling/report/structure/variables/render_file.py index 81379a41f..e54dd6e6e 100644 --- a/src/ydata_profiling/report/structure/variables/render_file.py +++ b/src/ydata_profiling/report/structure/variables/render_file.py @@ -1,6 +1,7 @@ from typing import List from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.presentation.core import Container, FrequencyTable, Image from ydata_profiling.report.presentation.core.renderable import Renderable from ydata_profiling.report.presentation.frequency_table_utils import freq_table @@ -8,7 +9,7 @@ from ydata_profiling.visualisation.plot import histogram -def render_file(config: Settings, summary: dict) -> dict: +def render_file(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] template_variables = render_path(config, summary) @@ -44,7 +45,7 @@ def render_file(config: Settings, summary: dict) -> dict: FrequencyTable( freq_table( freqtable=summary[file_date_id].value_counts(), - n=summary["n"], + n=summary.n, max_number_to_print=n_freq_table_max, ), name=description, diff --git a/src/ydata_profiling/report/structure/variables/render_generic.py b/src/ydata_profiling/report/structure/variables/render_generic.py index 0b2e00efb..c70810cb7 100644 --- a/src/ydata_profiling/report/structure/variables/render_generic.py +++ b/src/ydata_profiling/report/structure/variables/render_generic.py @@ -1,4 +1,5 @@ from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent from ydata_profiling.report.presentation.core import ( HTML, @@ -8,7 +9,7 @@ ) -def render_generic(config: Settings, summary: dict) -> dict: +def render_generic(config: Settings, summary: VarDescription) -> dict: info = VariableInfo( anchor_id=summary["varid"], alerts=summary["alerts"], @@ -22,17 +23,17 @@ def render_generic(config: Settings, summary: dict) -> dict: [ { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": "n_missing" in summary["alert_fields"], }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": "p_missing" in summary["alert_fields"], }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], diff --git a/src/ydata_profiling/report/structure/variables/render_image.py b/src/ydata_profiling/report/structure/variables/render_image.py index ea1336208..a4491ef89 100644 --- a/src/ydata_profiling/report/structure/variables/render_image.py +++ b/src/ydata_profiling/report/structure/variables/render_image.py @@ -1,6 +1,7 @@ import pandas as pd from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import fmt_numeric from ydata_profiling.report.presentation.core import ( Container, @@ -13,7 +14,7 @@ from ydata_profiling.visualisation.plot import scatter_series -def render_image(config: Settings, summary: dict) -> dict: +def render_image(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] n_freq_table_max = config.n_freq_table_max redact = config.vars.cat.redact @@ -135,7 +136,7 @@ def render_image(config: Settings, summary: dict) -> dict: FrequencyTable( freq_table( freqtable=summary["image_dimensions"].value_counts(), - n=summary["n"], + n=summary.n, max_number_to_print=n_freq_table_max, ), name="Common values", @@ -156,7 +157,7 @@ def render_image(config: Settings, summary: dict) -> dict: FrequencyTable( freq_table( freqtable=pd.Series(summary["exif_keys_counts"]), - n=summary["n"], + n=summary.n, max_number_to_print=n_freq_table_max, ), name="Exif keys", @@ -172,7 +173,7 @@ def render_image(config: Settings, summary: dict) -> dict: FrequencyTable( freq_table( freqtable=counts, - n=summary["n"], + n=summary.n, max_number_to_print=n_freq_table_max, ), name=key, diff --git a/src/ydata_profiling/report/structure/variables/render_path.py b/src/ydata_profiling/report/structure/variables/render_path.py index d7cde6f06..eaade0114 100644 --- a/src/ydata_profiling/report/structure/variables/render_path.py +++ b/src/ydata_profiling/report/structure/variables/render_path.py @@ -1,4 +1,5 @@ from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import fmt, fmt_numeric from ydata_profiling.report.presentation.core import Container, FrequencyTable, Table from ydata_profiling.report.presentation.frequency_table_utils import freq_table @@ -7,7 +8,7 @@ ) -def render_path(config: Settings, summary: dict) -> dict: +def render_path(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] n_freq_table_max = config.n_freq_table_max redact = config.vars.cat.redact @@ -18,7 +19,7 @@ def render_path(config: Settings, summary: dict) -> dict: for path_part in keys: template_variables[f"freqtable_{path_part}"] = freq_table( freqtable=summary[f"{path_part}_counts"], - n=summary["n"], + n=summary.n, max_number_to_print=n_freq_table_max, ) diff --git a/src/ydata_profiling/report/structure/variables/render_real.py b/src/ydata_profiling/report/structure/variables/render_real.py index 227200c27..2c9005d44 100644 --- a/src/ydata_profiling/report/structure/variables/render_real.py +++ b/src/ydata_profiling/report/structure/variables/render_real.py @@ -1,4 +1,5 @@ from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import ( fmt, fmt_bytesize, @@ -17,7 +18,7 @@ from ydata_profiling.visualisation.plot import histogram, mini_histogram -def render_real(config: Settings, summary: dict) -> dict: +def render_real(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] template_variables = render_common(config, summary) image_format = config.plot.image_format @@ -48,12 +49,12 @@ def render_real(config: Settings, summary: dict) -> dict: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": "n_missing" in summary["alert_fields"], }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": "p_missing" in summary["alert_fields"], }, { @@ -111,7 +112,7 @@ def render_real(config: Settings, summary: dict) -> dict: }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], diff --git a/src/ydata_profiling/report/structure/variables/render_text.py b/src/ydata_profiling/report/structure/variables/render_text.py index 5eadf3799..c4c690e5e 100644 --- a/src/ydata_profiling/report/structure/variables/render_text.py +++ b/src/ydata_profiling/report/structure/variables/render_text.py @@ -1,6 +1,7 @@ from typing import Any, Dict, List from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent from ydata_profiling.report.presentation.core import ( Container, @@ -21,7 +22,7 @@ from ydata_profiling.visualisation.plot import plot_word_cloud -def render_text(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: +def render_text(config: Settings, summary: VarDescription) -> Dict[str, Any]: if config.vars.text.redact: render = render_categorical(config, summary) return render @@ -58,17 +59,17 @@ def render_text(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": "n_missing" in summary["alert_fields"], }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": "p_missing" in summary["alert_fields"], }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], diff --git a/src/ydata_profiling/report/structure/variables/render_timeseries.py b/src/ydata_profiling/report/structure/variables/render_timeseries.py index 6f3bc27cd..78e62402d 100644 --- a/src/ydata_profiling/report/structure/variables/render_timeseries.py +++ b/src/ydata_profiling/report/structure/variables/render_timeseries.py @@ -1,4 +1,5 @@ from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import ( fmt, fmt_bytesize, @@ -81,7 +82,7 @@ def _render_gap_tab(config: Settings, summary: dict) -> Container: ) -def render_timeseries(config: Settings, summary: dict) -> dict: +def render_timeseries(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] template_variables = render_common(config, summary) image_format = config.plot.image_format @@ -111,12 +112,12 @@ def render_timeseries(config: Settings, summary: dict) -> dict: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": "n_missing" in summary["alert_fields"], }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": "p_missing" in summary["alert_fields"], }, { @@ -164,7 +165,7 @@ def render_timeseries(config: Settings, summary: dict) -> dict: }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], diff --git a/src/ydata_profiling/report/structure/variables/render_url.py b/src/ydata_profiling/report/structure/variables/render_url.py index f35d6dcb6..59c007ea5 100644 --- a/src/ydata_profiling/report/structure/variables/render_url.py +++ b/src/ydata_profiling/report/structure/variables/render_url.py @@ -1,4 +1,5 @@ from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent from ydata_profiling.report.presentation.core import ( Container, @@ -11,7 +12,7 @@ from ydata_profiling.report.structure.variables.render_common import render_common -def render_url(config: Settings, summary: dict) -> dict: +def render_url(config: Settings, summary: VarDescription) -> dict: varid = summary["varid"] n_freq_table_max = config.n_freq_table_max @@ -24,7 +25,7 @@ def render_url(config: Settings, summary: dict) -> dict: for url_part in keys: template_variables[f"freqtable_{url_part}"] = freq_table( freqtable=summary[f"{url_part}_counts"], - n=summary["n"], + n=summary.n, max_number_to_print=n_freq_table_max, ) @@ -101,17 +102,17 @@ def render_url(config: Settings, summary: dict) -> dict: }, { "name": "Missing", - "value": fmt(summary["n_missing"]), + "value": fmt(summary.n_missing), "alert": "n_missing" in summary["alert_fields"], }, { "name": "Missing (%)", - "value": fmt_percent(summary["p_missing"]), + "value": fmt_percent(summary.p_missing), "alert": "p_missing" in summary["alert_fields"], }, { "name": "Memory size", - "value": fmt_bytesize(summary["memory_size"]), + "value": fmt_bytesize(summary.memory_size), "alert": False, }, ], @@ -120,8 +121,8 @@ def render_url(config: Settings, summary: dict) -> dict: fqm = FrequencyTableSmall( freq_table( - freqtable=summary["value_counts_without_nan"], - n=summary["n"], + freqtable=summary.value_counts_without_nan, + n=summary.n, max_number_to_print=n_obs_cat, ), redact=redact, From b0dc70938663372fe82f8759bdc1c21f5d0bd06f Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Sat, 16 Dec 2023 19:32:28 +0100 Subject: [PATCH 04/10] feat: update tests --- tests/unit/test_comparison.py | 3 +- tests/unit/test_describe.py | 10 +++++- .../unit/test_ge_integration_expectations.py | 33 ++++++++++------- tests/unit/test_summarizer.py | 33 ++++++++++++++--- tests/unit/test_summary_algos.py | 36 +++++++++---------- 5 files changed, 77 insertions(+), 38 deletions(-) diff --git a/tests/unit/test_comparison.py b/tests/unit/test_comparison.py index 748c5af12..6d5a547b1 100644 --- a/tests/unit/test_comparison.py +++ b/tests/unit/test_comparison.py @@ -66,7 +66,8 @@ def test_generate_comparison(): p1 = ProfileReport(df1, title="p1") p2 = ProfileReport(df2, title="p1") - html = p1.compare(p2).to_html() + _compare = p1.compare(p2) + html = _compare.to_html() assert len(html) > 0 diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py index 0eb10b7b7..073b189c1 100644 --- a/tests/unit/test_describe.py +++ b/tests/unit/test_describe.py @@ -9,6 +9,7 @@ from ydata_profiling.model.describe import describe from ydata_profiling.model.summary import describe_1d from ydata_profiling.model.typeset import ProfilingTypeSet +from ydata_profiling.model.var_description.default import VarDescription check_is_NaN = "ydata_profiling.check_is_NaN" @@ -49,7 +50,7 @@ def test_describe_unique(data, expected, summarizer, typeset): config = Settings() config.vars.num.low_categorical_threshold = 0 - desc_1d = describe_1d(config, data, summarizer, typeset) + desc_1d: VarDescription = describe_1d(config, data, summarizer, typeset) if expected["is_unique"] is not None: assert ( desc_1d["p_unique"] == expected["p_unique"] @@ -562,6 +563,13 @@ def test_describe_df(column, describe_data, expected_results, summarizer): for k, v in expected_results[column].items(): if v == check_is_NaN: test_condition = k not in results.variables[column] + # values from common description + elif k in asdict(results.variables[column]): + if isinstance(v, float): + assert pytest.approx(v) == getattr(results.variables[column], k) + else: + assert v == getattr(results.variables[column], k) + continue elif isinstance(v, float): test_condition = pytest.approx(v) == results.variables[column][k] else: diff --git a/tests/unit/test_ge_integration_expectations.py b/tests/unit/test_ge_integration_expectations.py index 4ef0b1465..ed721e4a7 100644 --- a/tests/unit/test_ge_integration_expectations.py +++ b/tests/unit/test_ge_integration_expectations.py @@ -1,4 +1,4 @@ -from unittest.mock import Mock, patch +from unittest.mock import MagicMock, Mock, patch import pytest @@ -20,14 +20,22 @@ def batch(): def test_generic_expectations(batch): - generic_expectations("column", {"n_missing": 0, "p_unique": 1.0}, batch) + default_desc = MagicMock() + default_desc.n_missing = 0 + d = {"p_unique": 1.0} + default_desc.__getitem__.side_effect = d.__getitem__ + generic_expectations("column", default_desc, batch) batch.expect_column_to_exist.assert_called_once() batch.expect_column_values_to_not_be_null.assert_called_once() batch.expect_column_values_to_be_unique.assert_called_once() def test_generic_expectations_min(batch): - generic_expectations("column", {"n_missing": 1, "p_unique": 0.5}, batch) + default_desc = MagicMock() + default_desc.n_missing = 1 + d = {"p_unique": 0.5} + default_desc.__getitem__.side_effect = d.__getitem__ + generic_expectations("column", default_desc, batch) batch.expect_column_to_exist.assert_called_once() batch.expect_column_values_to_not_be_null.assert_not_called() batch.expect_column_values_to_be_unique.assert_not_called() @@ -93,22 +101,21 @@ def test_numeric_expectations_min(batch): def test_categorical_expectations(batch): - categorical_expectations( - "column", - { - "n_distinct": 1, - "p_distinct": 0.1, - "value_counts_without_nan": {"val1": 1, "val2": 2}, - }, - batch, - ) + default_desc = MagicMock() + d = {"n_distinct": 1, "p_unique": 0.1} + default_desc.__getitem__.side_effect = d.__getitem__ + default_desc.value_counts_without_nan = {"val1": 1, "val2": 2} + categorical_expectations("column", default_desc, batch) batch.expect_column_values_to_be_in_set.assert_called_once_with( "column", {"val1", "val2"} ) def test_categorical_expectations_min(batch): - categorical_expectations("column", {"n_distinct": 15, "p_distinct": 1.0}, batch) + default_desc = MagicMock() + d = {"n_distinct": 15, "p_distinct": 1.0} + default_desc.__getitem__.side_effect = d.__getitem__ + categorical_expectations("column", default_desc, batch) batch.expect_column_values_to_be_in_set.assert_not_called() diff --git a/tests/unit/test_summarizer.py b/tests/unit/test_summarizer.py index 60fea5590..4db83d69e 100644 --- a/tests/unit/test_summarizer.py +++ b/tests/unit/test_summarizer.py @@ -2,13 +2,14 @@ import pandas as pd +from ydata_profiling.config import Settings from ydata_profiling.model.summarizer import PandasProfilingSummarizer, format_summary from ydata_profiling.model.typeset import ProfilingTypeSet base_path = os.path.abspath(os.path.dirname(__file__)) -def test_summarizer(config): +def test_summarizer_base_types(config: Settings): pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config)) _ = format_summary(pps.summarize(config, pd.Series([1, 2, 3, 4, 5]), "Unsupported")) @@ -23,9 +24,23 @@ def test_summarizer(config): _ = format_summary( pps.summarize(config, pd.Series(["abc", "abc", "abba"]), "Categorical") ) + + _ = format_summary( + pps.summarize(config, pd.Series([True, False, True, False, False]), "Boolean") + ) + + +def test_summarizer_url(config: Settings): + config.vars.url.active = True + pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config)) _ = format_summary( pps.summarize(config, pd.Series(["https://www.example.com"]), "URL") ) + + +def test_summarizer_path(config: Settings): + config.vars.path.active = True + pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config)) _ = format_summary( pps.summarize( config, @@ -40,6 +55,12 @@ def test_summarizer(config): "Path", ) ) + + +def test_summarizer_file(config: Settings): + config.vars.path.active = True + config.vars.file.active = True + pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config)) _ = format_summary( pps.summarize( config, @@ -53,6 +74,13 @@ def test_summarizer(config): "File", ) ) + + +def test_summarizer_image(config: Settings): + config.vars.path.active = True + config.vars.file.active = True + config.vars.image.active = True + pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config)) _ = format_summary( pps.summarize( config, @@ -62,6 +90,3 @@ def test_summarizer(config): "Image", ) ) - _ = format_summary( - pps.summarize(config, pd.Series([True, False, True, False, False]), "Boolean") - ) diff --git a/tests/unit/test_summary_algos.py b/tests/unit/test_summary_algos.py index 523ce5fcd..d7aa90045 100644 --- a/tests/unit/test_summary_algos.py +++ b/tests/unit/test_summary_algos.py @@ -4,35 +4,37 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import ( - describe_counts, - describe_generic, describe_supported, histogram_compute, ) +from ydata_profiling.model.var_description.default import VarDescription def test_count_summary_sorted(config): s = pd.Series([1] + [2] * 1000) - _, sn, r = describe_counts(config, s, {}) - assert r["value_counts_without_nan"].index[0] == 2 - assert r["value_counts_without_nan"].index[1] == 1 + r: VarDescription + _, sn, r = describe_supported(config, s, {}) + assert r.value_counts_without_nan.index[0] == 2 + assert r.value_counts_without_nan.index[1] == 1 def test_count_summary_nat(config): + r: VarDescription s = pd.to_datetime(pd.Series([1, 2] + [np.nan, pd.NaT])) - _, sn, r = describe_counts(config, s, {}) - assert len(r["value_counts_without_nan"].index) == 2 + _, sn, r = describe_supported(config, s, {}) + assert len(r.value_counts_without_nan.index) == 2 def test_count_summary_category(config): + r: VarDescription s = pd.Series( pd.Categorical( ["Poor", "Neutral"] + [np.nan] * 100, categories=["Poor", "Neutral", "Excellent"], ) ) - _, sn, r = describe_counts(config, s, {}) - assert len(r["value_counts_without_nan"].index) == 2 + _, sn, r = describe_supported(config, s, {}) + assert len(r.value_counts_without_nan.index) == 2 @pytest.fixture(scope="class") @@ -41,16 +43,12 @@ def empty_data() -> pd.DataFrame: def test_summary_supported_empty_df(config, empty_data): - _, series, summary = describe_counts(config, empty_data["A"], {}) - assert summary["n_missing"] == 0 - assert "p_missing" not in summary - - _, series, summary = describe_generic(config, series, summary) - assert summary["n_missing"] == 0 - assert summary["p_missing"] == 0 - assert summary["count"] == 0 - - _, _, summary = describe_supported(config, series, summary) + summary: VarDescription + _, _, summary = describe_supported(config, empty_data["A"], {}) + assert summary.n_missing == 0 + assert summary.n_missing == 0 + assert summary.p_missing == 0 + assert summary.count == 0 assert summary["n_distinct"] == 0 assert summary["p_distinct"] == 0 assert summary["n_unique"] == 0 From d91400fc7147c72e5b0410726debf85096c9ce90 Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Sat, 16 Dec 2023 20:04:42 +0100 Subject: [PATCH 05/10] feat: add summary algs support for VariableDescription --- .../model/summary_algorithms.py | 85 +++++++++---------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py index b97a72ca7..4a82b3313 100644 --- a/src/ydata_profiling/model/summary_algorithms.py +++ b/src/ydata_profiling/model/summary_algorithms.py @@ -7,6 +7,7 @@ from scipy.stats import chisquare from ydata_profiling.config import Settings +from ydata_profiling.model.var_description.default import VarDescription T = TypeVar("T") @@ -62,13 +63,18 @@ def chi_square( def series_hashable( - fn: Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]] -) -> Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]: + fn: Callable[ + [Settings, pd.Series, VarDescription], + Tuple[Settings, pd.Series, VarDescription], + ] +) -> Callable[ + [Settings, pd.Series, VarDescription], Tuple[Settings, pd.Series, VarDescription] +]: @functools.wraps(fn) def inner( - config: Settings, series: pd.Series, summary: dict - ) -> Tuple[Settings, pd.Series, dict]: - if not summary["hashable"]: + config: Settings, series: pd.Series, summary: VarDescription + ) -> Tuple[Settings, pd.Series, VarDescription]: + if not summary.hashable: return config, series, summary return fn(config, series, summary) @@ -76,14 +82,19 @@ def inner( def series_handle_nulls( - fn: Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]] -) -> Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]: + fn: Callable[ + [Settings, pd.Series, VarDescription], + Tuple[Settings, pd.Series, VarDescription], + ] +) -> Callable[ + [Settings, pd.Series, VarDescription], Tuple[Settings, pd.Series, VarDescription] +]: """Decorator for nullable series""" @functools.wraps(fn) def inner( - config: Settings, series: pd.Series, summary: dict - ) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription + ) -> Tuple[Settings, pd.Series, VarDescription]: if series.hasnans: series = series.dropna() @@ -103,92 +114,78 @@ def named_aggregate_summary(series: pd.Series, key: str) -> dict: return summary -@multimethod -def describe_counts( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: - raise NotImplementedError() - - @multimethod def describe_supported( config: Settings, series: Any, series_description: dict -) -> Tuple[Settings, Any, dict]: - raise NotImplementedError() - - -@multimethod -def describe_generic( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_numeric_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_text_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict, Any]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_date_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_categorical_1d( - config: Settings, series: pd.Series, summary: dict -) -> Tuple[Settings, pd.Series, dict]: + config: Settings, series: pd.Series, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_url_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_file_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_path_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_image_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_boolean_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() @multimethod def describe_timeseries_1d( - config: Settings, series: Any, summary: dict -) -> Tuple[Settings, Any, dict]: + config: Settings, series: Any, summary: VarDescription +) -> Tuple[Settings, Any, VarDescription]: raise NotImplementedError() From b8069d8b89267af5f60f9532abc2baf8fff7650c Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Sat, 16 Dec 2023 20:05:58 +0100 Subject: [PATCH 06/10] feat: add spark support --- src/ydata_profiling/model/alerts.py | 5 +- src/ydata_profiling/model/pandas/__init__.py | 4 -- .../model/pandas/describe_counts_pandas.py | 64 ------------------- .../model/pandas/describe_date_pandas.py | 2 +- .../model/pandas/describe_generic_pandas.py | 37 ----------- src/ydata_profiling/model/spark/__init__.py | 4 -- .../model/spark/correlations_spark.py | 1 + .../model/spark/describe_boolean_spark.py | 7 +- .../model/spark/describe_categorical_spark.py | 5 +- .../model/spark/describe_date_spark.py | 5 +- .../model/spark/describe_generic_spark.py | 32 ---------- .../model/spark/describe_numeric_spark.py | 21 +++--- .../model/spark/describe_supported_spark.py | 20 ++---- .../model/spark/describe_text_spark.py | 5 +- .../model/spark/summary_spark.py | 1 + .../model/spark/timeseries_index_spark.py | 1 + .../counts_spark.py} | 42 ++++++------ .../spark/var_description/default_spark.py | 46 +++++++++++++ .../model/var_description/default.py | 4 ++ 19 files changed, 110 insertions(+), 196 deletions(-) delete mode 100644 src/ydata_profiling/model/pandas/describe_counts_pandas.py delete mode 100644 src/ydata_profiling/model/pandas/describe_generic_pandas.py delete mode 100644 src/ydata_profiling/model/spark/describe_generic_spark.py rename src/ydata_profiling/model/spark/{describe_counts_spark.py => var_description/counts_spark.py} (58%) create mode 100644 src/ydata_profiling/model/spark/var_description/default_spark.py diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index 9ce40522a..ec352ddbf 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -170,7 +170,10 @@ def __init__( ): super().__init__( alert_type=AlertType.CONSTANT, - values={"n_distinct": values["n_distinct"]}, + values={ + "n_distinct": values["n_distinct"], + "value_counts_without_nan": values.value_counts_without_nan, + }, column_name=column_name, is_empty=is_empty, ) diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py index 59ccf853c..e929d4731 100644 --- a/src/ydata_profiling/model/pandas/__init__.py +++ b/src/ydata_profiling/model/pandas/__init__.py @@ -3,10 +3,8 @@ dataframe_pandas, describe_boolean_pandas, describe_categorical_pandas, - describe_counts_pandas, describe_date_pandas, describe_file_pandas, - describe_generic_pandas, describe_image_pandas, describe_numeric_pandas, describe_path_pandas, @@ -27,10 +25,8 @@ "dataframe_pandas", "describe_boolean_pandas", "describe_categorical_pandas", - "describe_counts_pandas", "describe_date_pandas", "describe_file_pandas", - "describe_generic_pandas", "describe_image_pandas", "describe_numeric_pandas", "describe_path_pandas", diff --git a/src/ydata_profiling/model/pandas/describe_counts_pandas.py b/src/ydata_profiling/model/pandas/describe_counts_pandas.py deleted file mode 100644 index 416474d25..000000000 --- a/src/ydata_profiling/model/pandas/describe_counts_pandas.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import Tuple - -import pandas as pd - -from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_counts -from ydata_profiling.model.var_description.default import VarDescription - - -@describe_counts.register -def pandas_describe_counts( - config: Settings, series: pd.Series, summary: VarDescription -) -> Tuple[Settings, pd.Series, VarDescription]: - """Counts the values in a series (with and without NaN, distinct). - - Args: - config: report Settings object - series: Series for which we want to calculate the values. - summary: series' summary - - Returns: - A dictionary with the count values (with and without NaN, distinct). - """ - try: - value_counts_with_nan = series.value_counts(dropna=False) - _ = set(value_counts_with_nan.index) - hashable = True - except: # noqa: E722 - hashable = False - - summary.hashable = hashable - - if hashable: - value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0] - - null_index = value_counts_with_nan.index.isnull() - if null_index.any(): - n_missing = value_counts_with_nan[null_index].sum() - value_counts_without_nan = value_counts_with_nan[~null_index] - else: - n_missing = 0 - value_counts_without_nan = value_counts_with_nan - - summary.update( - { - "value_counts_without_nan": value_counts_without_nan, - } - ) - - try: - summary["value_counts_index_sorted"] = summary[ - "value_counts_without_nan" - ].sort_index(ascending=True) - ordering = True - except TypeError: - ordering = False - else: - n_missing = series.isna().sum() - ordering = False - - summary["ordering"] = ordering - summary.n_missing = n_missing - - return config, series, summary diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py index 39ca21b8c..d07425b5f 100644 --- a/src/ydata_profiling/model/pandas/describe_date_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py @@ -30,7 +30,7 @@ def pandas_describe_date_1d( Returns: A dict containing calculated series description values. """ - if summary["value_counts_without_nan"].empty: + if summary.value_counts_without_nan.empty: values = series.values summary.update( { diff --git a/src/ydata_profiling/model/pandas/describe_generic_pandas.py b/src/ydata_profiling/model/pandas/describe_generic_pandas.py deleted file mode 100644 index fcc5b04b6..000000000 --- a/src/ydata_profiling/model/pandas/describe_generic_pandas.py +++ /dev/null @@ -1,37 +0,0 @@ -from typing import Tuple - -import pandas as pd - -from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_generic -from ydata_profiling.model.var_description.default import VarDescription - - -@describe_generic.register -def pandas_describe_generic( - config: Settings, series: pd.Series, summary: VarDescription -) -> Tuple[Settings, pd.Series, VarDescription]: - """Describe generic series. - - Args: - config: report Settings object - series: The Series to describe. - summary: The dict containing the series description so far. - - Returns: - A dict containing calculated series description values. - """ - - # number of observations in the Series - length = len(series) - - summary.update( - { - "n": length, - "p_missing": summary.n_missing / length if length > 0 else 0, - "count": length - summary.n_missing, - "memory_size": series.memory_usage(deep=config.memory_deep), - } - ) - - return config, series, summary diff --git a/src/ydata_profiling/model/spark/__init__.py b/src/ydata_profiling/model/spark/__init__.py index 854222a9a..7dc7d5043 100644 --- a/src/ydata_profiling/model/spark/__init__.py +++ b/src/ydata_profiling/model/spark/__init__.py @@ -3,9 +3,7 @@ dataframe_spark, describe_boolean_spark, describe_categorical_spark, - describe_counts_spark, describe_date_spark, - describe_generic_spark, describe_numeric_spark, describe_supported_spark, duplicates_spark, @@ -21,9 +19,7 @@ "dataframe_spark", "describe_boolean_spark", "describe_categorical_spark", - "describe_counts_spark", "describe_date_spark", - "describe_generic_spark", "describe_numeric_spark", "describe_supported_spark", "duplicates_spark", diff --git a/src/ydata_profiling/model/spark/correlations_spark.py b/src/ydata_profiling/model/spark/correlations_spark.py index 6f0f2ae25..51c309378 100644 --- a/src/ydata_profiling/model/spark/correlations_spark.py +++ b/src/ydata_profiling/model/spark/correlations_spark.py @@ -1,4 +1,5 @@ """Correlations between variables.""" + from typing import Optional import pandas as pd diff --git a/src/ydata_profiling/model/spark/describe_boolean_spark.py b/src/ydata_profiling/model/spark/describe_boolean_spark.py index ab5cf20fb..815af74b8 100644 --- a/src/ydata_profiling/model/spark/describe_boolean_spark.py +++ b/src/ydata_profiling/model/spark/describe_boolean_spark.py @@ -4,12 +4,13 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_boolean_1d +from ydata_profiling.model.var_description.default import VarDescription @describe_boolean_1d.register def describe_boolean_1d_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a boolean series. Args: @@ -20,7 +21,7 @@ def describe_boolean_1d_spark( A dict containing calculated series description values. """ - value_counts = summary["value_counts"] + value_counts = summary.value_counts # get the most common boolean value and its frequency top = value_counts.first() diff --git a/src/ydata_profiling/model/spark/describe_categorical_spark.py b/src/ydata_profiling/model/spark/describe_categorical_spark.py index 5afdb475c..562472b3d 100644 --- a/src/ydata_profiling/model/spark/describe_categorical_spark.py +++ b/src/ydata_profiling/model/spark/describe_categorical_spark.py @@ -4,12 +4,13 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_categorical_1d +from ydata_profiling.model.var_description.default import VarDescription @describe_categorical_1d.register def describe_categorical_1d_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a categorical series. Args: diff --git a/src/ydata_profiling/model/spark/describe_date_spark.py b/src/ydata_profiling/model/spark/describe_date_spark.py index a5e11a0f1..4bcee2bbf 100644 --- a/src/ydata_profiling/model/spark/describe_date_spark.py +++ b/src/ydata_profiling/model/spark/describe_date_spark.py @@ -6,6 +6,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_date_1d +from ydata_profiling.model.var_description.default import VarDescription def date_stats_spark(df: DataFrame, summary: dict) -> dict: @@ -21,8 +22,8 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict: @describe_date_1d.register def describe_date_1d_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a date series. Args: diff --git a/src/ydata_profiling/model/spark/describe_generic_spark.py b/src/ydata_profiling/model/spark/describe_generic_spark.py deleted file mode 100644 index ee2356c0a..000000000 --- a/src/ydata_profiling/model/spark/describe_generic_spark.py +++ /dev/null @@ -1,32 +0,0 @@ -from typing import Tuple - -from pyspark.sql import DataFrame - -from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_generic - - -@describe_generic.register -def describe_generic_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: - """Describe generic series. - Args: - series: The Series to describe. - summary: The dict containing the series description so far. - Returns: - A dict containing calculated series description values. - """ - - # number of observations in the Series - length = df.count() - - summary["n"] = length - summary["p_missing"] = summary["n_missing"] / length - summary["count"] = length - summary["n_missing"] - - # FIXME: This is not correct, but used to fulfil render expectations - # @chanedwin - summary["memory_size"] = 0 - - return config, df, summary diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py index 490e33aba..a9fca55cc 100644 --- a/src/ydata_profiling/model/spark/describe_numeric_spark.py +++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py @@ -9,9 +9,10 @@ describe_numeric_1d, histogram_compute, ) +from ydata_profiling.model.var_description.default import VarDescription -def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: +def numeric_stats_spark(df: DataFrame, summary: VarDescription) -> dict: column = df.columns[0] expr = [ @@ -29,8 +30,8 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: @describe_numeric_1d.register def describe_numeric_1d_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a boolean series. Args: @@ -51,7 +52,7 @@ def describe_numeric_1d_spark( summary["kurtosis"] = stats["kurtosis"] summary["sum"] = stats["sum"] - value_counts = summary["value_counts"] + value_counts = summary.value_counts n_infinite = ( value_counts.where(F.col(df.columns[0]).isin([np.inf, -np.inf])) @@ -106,12 +107,12 @@ def describe_numeric_1d_spark( ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0] # FIXME: move to fmt - summary["p_negative"] = summary["n_negative"] / summary["n"] + summary["p_negative"] = summary["n_negative"] / summary.n summary["range"] = summary["max"] - summary["min"] summary["iqr"] = summary["75%"] - summary["25%"] summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.NaN - summary["p_zeros"] = summary["n_zeros"] / summary["n"] - summary["p_infinite"] = summary["n_infinite"] / summary["n"] + summary["p_zeros"] = summary["n_zeros"] / summary.n + summary["p_infinite"] = summary["n_infinite"] / summary.n # TODO - enable this feature # because spark doesn't have an indexing system, there isn't really the idea of monotonic increase/decrease @@ -124,14 +125,14 @@ def describe_numeric_1d_spark( # display in pandas display # the alternative is to do this in spark natively, but it is not trivial infinity_values = [np.inf, -np.inf] - infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values) + infinity_index = summary.value_counts_without_nan.index.isin(infinity_values) summary.update( histogram_compute( config, - summary["value_counts_without_nan"][~infinity_index].index.values, + summary.value_counts_without_nan[~infinity_index].index.values, summary["n_distinct"], - weights=summary["value_counts_without_nan"][~infinity_index].values, + weights=summary.value_counts_without_nan[~infinity_index].values, ) ) diff --git a/src/ydata_profiling/model/spark/describe_supported_spark.py b/src/ydata_profiling/model/spark/describe_supported_spark.py index 1758f668d..d5d395156 100644 --- a/src/ydata_profiling/model/spark/describe_supported_spark.py +++ b/src/ydata_profiling/model/spark/describe_supported_spark.py @@ -3,13 +3,17 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.spark.var_description.default_spark import ( + get_default_spark_description, +) from ydata_profiling.model.summary_algorithms import describe_supported +from ydata_profiling.model.var_description.default import VarDescription @describe_supported.register def describe_supported_spark( config: Settings, series: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a supported series. Args: series: The Series to describe. @@ -18,16 +22,6 @@ def describe_supported_spark( A dict containing calculated series description values. """ - # number of non-NaN observations in the Series - count = summary["count"] - n_distinct = summary["value_counts"].count() + series_description = get_default_spark_description(config, series, summary) - summary["n_distinct"] = n_distinct - summary["p_distinct"] = n_distinct / count if count > 0 else 0 - - n_unique = summary["value_counts"].where("count == 1").count() - summary["is_unique"] = n_unique == count - summary["n_unique"] = n_unique - summary["p_unique"] = n_unique / count if count > 0 else 0 - - return config, series, summary + return config, series, series_description diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py index b5e27f615..6a95b2884 100644 --- a/src/ydata_profiling/model/spark/describe_text_spark.py +++ b/src/ydata_profiling/model/spark/describe_text_spark.py @@ -4,12 +4,13 @@ from ydata_profiling.config import Settings from ydata_profiling.model.summary_algorithms import describe_text_1d +from ydata_profiling.model.var_description.default import VarDescription @describe_text_1d.register def describe_text_1d_spark( - config: Settings, df: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: + config: Settings, df: DataFrame, summary: VarDescription +) -> Tuple[Settings, DataFrame, VarDescription]: """Describe a categorical series. Args: diff --git a/src/ydata_profiling/model/spark/summary_spark.py b/src/ydata_profiling/model/spark/summary_spark.py index 13a85f4c3..e2857b0a8 100644 --- a/src/ydata_profiling/model/spark/summary_spark.py +++ b/src/ydata_profiling/model/spark/summary_spark.py @@ -1,4 +1,5 @@ """Compute statistical description of datasets.""" + import multiprocessing from typing import Tuple diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py index cdf3d88dd..236825e6a 100644 --- a/src/ydata_profiling/model/spark/timeseries_index_spark.py +++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py @@ -1,4 +1,5 @@ """Compute statistical description of datasets.""" + from pyspark.sql import DataFrame from ydata_profiling.config import Settings diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/var_description/counts_spark.py similarity index 58% rename from src/ydata_profiling/model/spark/describe_counts_spark.py rename to src/ydata_profiling/model/spark/var_description/counts_spark.py index 0f813f2ce..15a2c2bd3 100644 --- a/src/ydata_profiling/model/spark/describe_counts_spark.py +++ b/src/ydata_profiling/model/spark/var_description/counts_spark.py @@ -1,23 +1,12 @@ -from typing import Tuple - from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_counts - - -@describe_counts.register -def describe_counts_spark( - config: Settings, series: DataFrame, summary: dict -) -> Tuple[Settings, DataFrame, dict]: - """Counts the values in a series (with and without NaN, distinct). +from ydata_profiling.model.var_description.counts import VarCounts - Args: - series: Series for which we want to calculate the values. - Returns: - A dictionary with the count values (with and without NaN, distinct). - """ +def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts: + """Get a VarCounts object for a spark series.""" + length = series.count() value_counts = series.groupBy(series.columns).count() value_counts = value_counts.sort("count", ascending=False).persist() @@ -37,14 +26,10 @@ def describe_counts_spark( .squeeze(axis="columns") ) - summary["n_missing"] = n_missing - summary["value_counts"] = value_counts.persist() - summary["value_counts_index_sorted"] = value_counts_index_sorted - # this is necessary as freqtables requires value_counts_without_nan # to be a pandas series. However, if we try to get everything into # pandas we will definitly crash the server - summary["value_counts_without_nan"] = ( + value_counts_without_nan = ( value_counts.dropna() .limit(200) .toPandas() @@ -52,4 +37,19 @@ def describe_counts_spark( .squeeze(axis="columns") ) - return config, series, summary + # FIXME: This is not correct, but used to fulfil render expectations + # @chanedwin + memory_size = 0 + + return VarCounts( + hashable=False, + value_counts_without_nan=value_counts_without_nan, + value_counts_index_sorted=value_counts_index_sorted, + ordering=False, + n_missing=n_missing, + n=length, + p_missing=n_missing / length, + count=length - n_missing, + memory_size=memory_size, + value_counts=value_counts.persist(), + ) diff --git a/src/ydata_profiling/model/spark/var_description/default_spark.py b/src/ydata_profiling/model/spark/var_description/default_spark.py new file mode 100644 index 000000000..ff264e63e --- /dev/null +++ b/src/ydata_profiling/model/spark/var_description/default_spark.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from pyspark.sql import DataFrame + +from ydata_profiling.config import Settings +from ydata_profiling.model.spark.var_description.counts_spark import get_counts_spark +from ydata_profiling.model.var_description.default import VarDescription + + +def get_default_spark_description( + config: Settings, series: DataFrame, init_dict: dict +) -> VarDescription: + var_counts = get_counts_spark(config, series) + + count = var_counts.count + n_distinct = var_counts.value_counts.count() + + p_distinct = n_distinct / count if count > 0 else 0 + + n_unique = var_counts.value_counts.where("count == 1").count() + is_unique = n_unique == count + p_unique = n_unique / count if count > 0 else 0 + + init_dict.update( + { + "n_distinct": n_distinct, + "p_distinct": p_distinct, + "is_unique": is_unique, + "n_unique": n_unique, + "p_unique": p_unique, + } + ) + + return VarDescription( + n=var_counts.n, + count=var_counts.count, + n_missing=var_counts.n_missing, + p_missing=var_counts.p_missing, + hashable=var_counts.hashable, + memory_size=var_counts.memory_size, + ordering=var_counts.ordering, + value_counts_index_sorted=var_counts.value_counts_index_sorted, + value_counts_without_nan=var_counts.value_counts_without_nan, + value_counts=var_counts.value_counts, + var_specific=init_dict, + ) diff --git a/src/ydata_profiling/model/var_description/default.py b/src/ydata_profiling/model/var_description/default.py index 49aa63e82..05fb38ed0 100644 --- a/src/ydata_profiling/model/var_description/default.py +++ b/src/ydata_profiling/model/var_description/default.py @@ -34,6 +34,10 @@ def get(self, key: str, default: Any = None) -> Any: """To support old dict like interface.""" return self.var_specific.get(key, default) + def pop(self, key: str, default: Any = None) -> Any: + """To support old dict like interface.""" + return self.var_specific.pop(key, default) + def __iter__(self) -> Iterator: """To support old dict like interface.""" return self.var_specific.__iter__() From b9f10cf65f47197bfb4240dcb18ef2ed2a11b987 Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Sat, 16 Dec 2023 21:03:31 +0100 Subject: [PATCH 07/10] feat: add support for python 3.8 --- src/ydata_profiling/model/alerts.py | 2 +- src/ydata_profiling/model/describe.py | 2 +- .../model/pandas/correlations_pandas.py | 14 +++++++------- src/ydata_profiling/model/pandas/summary_pandas.py | 4 ++-- src/ydata_profiling/model/pandas/table_pandas.py | 3 ++- src/ydata_profiling/model/summary.py | 4 ++-- 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index ec352ddbf..6fbfad069 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -701,7 +701,7 @@ def check_correlation_alerts(config: Settings, correlations: dict) -> List[Alert def get_alerts( config: Settings, table_stats: dict, - series_description: dict[str, VarDescription], + series_description: Dict[str, VarDescription], correlations: dict, ) -> List[Alert]: alerts: List[Alert] = check_table_alerts(table_stats) diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py index a8bfcac1a..9a2ad1619 100644 --- a/src/ydata_profiling/model/describe.py +++ b/src/ydata_profiling/model/describe.py @@ -73,7 +73,7 @@ def describe( # Variable-specific pbar.total += len(df.columns) - series_description: dict[str, VarDescription] = get_series_descriptions( + series_description: Dict[str, VarDescription] = get_series_descriptions( config, df, summarizer, typeset, pbar ) diff --git a/src/ydata_profiling/model/pandas/correlations_pandas.py b/src/ydata_profiling/model/pandas/correlations_pandas.py index 698969270..510d4ab34 100644 --- a/src/ydata_profiling/model/pandas/correlations_pandas.py +++ b/src/ydata_profiling/model/pandas/correlations_pandas.py @@ -2,7 +2,7 @@ import itertools import warnings -from typing import Callable, Optional +from typing import Callable, Dict, Optional import numpy as np import pandas as pd @@ -89,9 +89,9 @@ def _pairwise_cramers(col_1: pd.Series, col_2: pd.Series) -> float: return _cramers_corrected_stat(pd.crosstab(col_1, col_2), correction=True) -@Cramers.compute.register(Settings, pd.DataFrame, dict[str, VarDescription]) +@Cramers.compute.register(Settings, pd.DataFrame, Dict[str, VarDescription]) def pandas_cramers_compute( - config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription] + config: Settings, df: pd.DataFrame, summary: Dict[str, VarDescription] ) -> Optional[pd.DataFrame]: threshold = config.categorical_maximum_correlation_distinct @@ -130,9 +130,9 @@ def pandas_cramers_compute( return correlation_matrix -@PhiK.compute.register(Settings, pd.DataFrame, dict[str, VarDescription]) +@PhiK.compute.register(Settings, pd.DataFrame, Dict[str, VarDescription]) def pandas_phik_compute( - config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription] + config: Settings, df: pd.DataFrame, summary: Dict[str, VarDescription] ) -> Optional[pd.DataFrame]: df_cols_dict = {i: list(df.columns).index(i) for i in df.columns} @@ -166,9 +166,9 @@ def pandas_phik_compute( return correlation -@Auto.compute.register(Settings, pd.DataFrame, dict[str, VarDescription]) +@Auto.compute.register(Settings, pd.DataFrame, Dict[str, VarDescription]) def pandas_auto_compute( - config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription] + config: Settings, df: pd.DataFrame, summary: Dict[str, VarDescription] ) -> Optional[pd.DataFrame]: threshold = config.categorical_maximum_correlation_distinct numerical_columns = [ diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index d66906caa..190a9250c 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -2,7 +2,7 @@ import multiprocessing import multiprocessing.pool -from typing import Tuple +from typing import Dict, Tuple import numpy as np import pandas as pd @@ -65,7 +65,7 @@ def pandas_get_series_descriptions( summarizer: BaseSummarizer, typeset: VisionsTypeset, pbar: tqdm, -) -> dict[str, VarDescription]: +) -> Dict[str, VarDescription]: def multiprocess_1d(args: tuple) -> Tuple[str, VarDescription]: """Wrapper to process series in parallel. diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py index bef531e2f..9198fb0e0 100644 --- a/src/ydata_profiling/model/pandas/table_pandas.py +++ b/src/ydata_profiling/model/pandas/table_pandas.py @@ -1,4 +1,5 @@ from collections import Counter +from typing import Dict import pandas as pd @@ -9,7 +10,7 @@ @get_table_stats.register def pandas_get_table_stats( - config: Settings, df: pd.DataFrame, variable_stats: dict[str, VarDescription] + config: Settings, df: pd.DataFrame, variable_stats: Dict[str, VarDescription] ) -> dict: """General statistics for the DataFrame. diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py index 8e4179598..e1c0588e2 100644 --- a/src/ydata_profiling/model/summary.py +++ b/src/ydata_profiling/model/summary.py @@ -1,6 +1,6 @@ """Compute statistical description of datasets.""" -from typing import Any +from typing import Any, Dict from multimethod import multimethod from tqdm import tqdm @@ -28,5 +28,5 @@ def get_series_descriptions( summarizer: BaseSummarizer, typeset: VisionsTypeset, pbar: tqdm, -) -> dict[str, VarDescription]: +) -> Dict[str, VarDescription]: raise NotImplementedError() From 4b53baeaf580ad7814a8865ebadf60f1b99b12d3 Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Fri, 27 Sep 2024 13:44:36 +0200 Subject: [PATCH 08/10] fix: test_summarizer image --- tests/unit/test_summarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_summarizer.py b/tests/unit/test_summarizer.py index 4db83d69e..c12b0cb99 100644 --- a/tests/unit/test_summarizer.py +++ b/tests/unit/test_summarizer.py @@ -85,7 +85,7 @@ def test_summarizer_image(config: Settings): pps.summarize( config, pd.Series( - [os.path.abspath(base_path + r"../../../docsrc/assets/logo_header.png")] + [os.path.abspath(base_path + r"../../../docs/_static/img/cli.png")] ), "Image", ) From 6b1a592c8648fc4a3f633e02e1c05f721dbe69ef Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Tue, 15 Apr 2025 15:25:04 +0200 Subject: [PATCH 09/10] fix errors from master merge --- src/ydata_profiling/model/alerts.py | 3 +- src/ydata_profiling/model/pandas/__init__.py | 4 -- .../model/pandas/correlations_pandas.py | 2 +- .../pandas/describe_categorical_pandas.py | 2 +- .../model/pandas/describe_date_pandas.py | 17 ++++++- .../model/pandas/summary_pandas.py | 4 +- .../spark/var_description/counts_spark.py | 49 +++++++++++++------ .../spark/var_description/default_spark.py | 2 +- src/ydata_profiling/model/summarizer.py | 8 --- src/ydata_profiling/model/summary.py | 1 - tests/unit/test_describe.py | 2 +- tests/unit/test_summarizer.py | 8 +-- 12 files changed, 60 insertions(+), 42 deletions(-) diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index d7b3dce23..698801389 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -9,7 +9,6 @@ from ydata_profiling.config import Settings from ydata_profiling.model.correlations import perform_check_correlation - from ydata_profiling.model.var_description.default import VarDescription from ydata_profiling.utils.styles import get_alert_styles @@ -690,7 +689,7 @@ def supported_alerts(summary: VarDescription) -> List[Alert]: return alerts -def unsupported_alerts(summary: VarDescription) -> List[Alert]: +def unsupported_alerts() -> List[Alert]: alerts: List[Alert] = [ UnsupportedAlert(), RejectedAlert(), diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py index 381df8e86..c7d21070f 100644 --- a/src/ydata_profiling/model/pandas/__init__.py +++ b/src/ydata_profiling/model/pandas/__init__.py @@ -3,7 +3,6 @@ # List of modules in the 'pandas' model that should be imported explicitly PANDAS_MODULES = [ "correlations_pandas", - "describe_generic_pandas", "describe_boolean_pandas", "describe_categorical_pandas", "describe_url_pandas", @@ -14,7 +13,6 @@ "describe_path_pandas", "describe_image_pandas", "describe_date_pandas", - "describe_counts_pandas", "duplicates_pandas", "sample_pandas", "table_pandas", @@ -35,7 +33,6 @@ # Explicitly list exposed names for clarity __all__ = [ - "pandas_describe_generic", "pandas_describe_boolean_1d", "pandas_describe_categorical_1d", "pandas_describe_url_1d", @@ -46,7 +43,6 @@ "pandas_describe_path_1d", "pandas_describe_image_1d", "pandas_describe_date_1d", - "pandas_describe_counts", "pandas_get_duplicates", "pandas_get_sample", "pandas_get_table_stats", diff --git a/src/ydata_profiling/model/pandas/correlations_pandas.py b/src/ydata_profiling/model/pandas/correlations_pandas.py index 6ee17b983..68a766aaf 100644 --- a/src/ydata_profiling/model/pandas/correlations_pandas.py +++ b/src/ydata_profiling/model/pandas/correlations_pandas.py @@ -2,7 +2,7 @@ import itertools import warnings -from typing import Callable, Dict, Optional +from typing import Callable, Optional import numpy as np import pandas as pd diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py index 576730189..325112f45 100644 --- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py @@ -16,8 +16,8 @@ series_handle_nulls, series_hashable, ) -from ydata_profiling.utils.information import DisplayInfo from ydata_profiling.model.var_description.default import VarDescription +from ydata_profiling.utils.information import DisplayInfo def get_character_counts_vc(vc: pd.Series) -> pd.Series: diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py index e1ec721cf..169e7367a 100644 --- a/src/ydata_profiling/model/pandas/describe_date_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py @@ -11,9 +11,16 @@ series_handle_nulls, series_hashable, ) - +from ydata_profiling.model.typeset_relations import is_pandas_1 from ydata_profiling.model.var_description.default import VarDescription + +def to_datetime(series: pd.Series) -> pd.Series: + if is_pandas_1(): + return pd.to_datetime(series, errors="coerce") + return pd.to_datetime(series, format="mixed", errors="coerce") + + @describe_date_1d.register @series_hashable @series_handle_nulls @@ -31,6 +38,12 @@ def pandas_describe_date_1d( A dict containing calculated series description values. """ + og_series = series.dropna() + series = to_datetime(og_series) + invalid_values = og_series[series.isna()] + + series = series.dropna() + if summary.value_counts_without_nan.empty: values = series.values summary.update( @@ -60,7 +73,7 @@ def pandas_describe_date_1d( { "invalid_dates": invalid_values.nunique(), "n_invalid_dates": len(invalid_values), - "p_invalid_dates": len(invalid_values) / summary["n"], + "p_invalid_dates": len(invalid_values) / summary.n, } ) return config, values, summary diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index bc5a4faf2..6fbd39fa9 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -1,7 +1,7 @@ """Compute statistical description of datasets.""" import multiprocessing from concurrent.futures import ThreadPoolExecutor -from typing import Dict, Any, Tuple +from typing import Any, Dict, Tuple import numpy as np import pandas as pd @@ -10,8 +10,8 @@ from ydata_profiling.config import Settings from ydata_profiling.model.typeset import ProfilingTypeSet -from ydata_profiling.utils.compat import optional_option_context from ydata_profiling.model.var_description.default import VarDescription +from ydata_profiling.utils.compat import optional_option_context from ydata_profiling.utils.dataframe import sort_column_names BaseSummarizer: Any = "BaseSummarizer" # type: ignore diff --git a/src/ydata_profiling/model/spark/var_description/counts_spark.py b/src/ydata_profiling/model/spark/var_description/counts_spark.py index b1ca199d7..264330a1a 100644 --- a/src/ydata_profiling/model/spark/var_description/counts_spark.py +++ b/src/ydata_profiling/model/spark/var_description/counts_spark.py @@ -1,14 +1,12 @@ """ Pyspark counts """ -from typing import Tuple - import pandas as pd from pyspark.sql import DataFrame from pyspark.sql import functions as F from ydata_profiling.config import Settings -from ydata_profiling.model.var_description.counts import VarCounts +from ydata_profiling.model.var_description.default import VarCounts def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts: @@ -21,16 +19,18 @@ def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts: length = series.count() # Count occurrences of each value - value_counts = series.groupBy(series.columns).count() + value_counts = series.groupBy(series.columns[0]).count() # Sort by count descending, persist the result - value_counts = value_counts.sort("count", ascending=False).persist() + value_counts = value_counts.orderBy(F.desc("count")).persist() # Sort by column value ascending (for frequency tables) - value_counts_index_sorted = value_counts.sort(series.columns[0], ascending=True) + value_counts_index_sorted = value_counts.orderBy(F.asc(series.columns[0])) # Count missing values - n_missing = value_counts.where(value_counts[series.columns[0]].isNull()).first() + n_missing = ( + value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first() + ) n_missing = n_missing["count"] if n_missing else 0 # Convert top 200 values to Pandas for frequency table display @@ -41,13 +41,32 @@ def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts: .squeeze(axis="columns") ) - value_counts_without_nan = ( - value_counts.dropna() - .limit(200) - .toPandas() - .set_index(series.columns[0], drop=True) - .squeeze(axis="columns") - ) + column = series.columns[0] + + if series.dtypes[0][1] in ("int", "float", "bigint", "double"): + value_counts_no_nan = ( + value_counts.filter(F.col(column).isNotNull()) # Exclude NaNs + .filter(~F.isnan(F.col(column))) # Remove implicit NaNs (if numeric column) + .groupBy(column) # Group by unique values + .count() # Count occurrences + .orderBy(F.desc("count")) # Sort in descending order + .limit(200) # Limit for performance + ) + else: + value_counts_no_nan = ( + value_counts.filter(F.col(column).isNotNull()) # Exclude NULLs + .groupBy(column) # Group by unique timestamp values + .count() # Count occurrences + .orderBy(F.desc("count")) # Sort by most frequent timestamps + .limit(200) # Limit for performance + ) + + # Convert to Pandas Series, forcing proper structure + if value_counts_no_nan.count() > 0: + pdf = value_counts_no_nan.toPandas().set_index(column)["count"] + value_counts_without_nan = pd.Series(pdf) # Ensures it's always a Series + else: + value_counts_without_nan = pd.Series(dtype=int) # Ensures an empty Series # @chanedwin memory_size = 0 @@ -55,7 +74,7 @@ def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts: return VarCounts( hashable=False, value_counts_without_nan=value_counts_without_nan, - value_counts_index_sorted=value_counts_index_sorted, + value_counts_index_sorted=top_200_sorted, ordering=False, n_missing=n_missing, n=length, diff --git a/src/ydata_profiling/model/spark/var_description/default_spark.py b/src/ydata_profiling/model/spark/var_description/default_spark.py index ff264e63e..687be178e 100644 --- a/src/ydata_profiling/model/spark/var_description/default_spark.py +++ b/src/ydata_profiling/model/spark/var_description/default_spark.py @@ -19,7 +19,7 @@ def get_default_spark_description( n_unique = var_counts.value_counts.where("count == 1").count() is_unique = n_unique == count - p_unique = n_unique / count if count > 0 else 0 + p_unique = n_unique / count if count > 0 else 0 init_dict.update( { diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index 600d43029..bb296cf06 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -13,10 +13,8 @@ from ydata_profiling.model.pandas import ( pandas_describe_boolean_1d, pandas_describe_categorical_1d, - pandas_describe_counts, pandas_describe_date_1d, pandas_describe_file_1d, - pandas_describe_generic, pandas_describe_image_1d, pandas_describe_numeric_1d, pandas_describe_path_1d, @@ -71,9 +69,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: from ydata_profiling.model.spark import ( describe_boolean_1d_spark, describe_categorical_1d_spark, - describe_counts_spark, describe_date_1d_spark, - describe_generic_spark, describe_numeric_1d_spark, describe_supported_spark, describe_text_1d_spark, @@ -81,8 +77,6 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: summary_map = { "Unsupported": [ - describe_counts_spark, - describe_generic_spark, describe_supported_spark, ], "Numeric": [describe_numeric_1d_spark], @@ -99,8 +93,6 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: else: summary_map = { "Unsupported": [ - pandas_describe_counts, - pandas_describe_generic, pandas_describe_supported, ], "Numeric": [pandas_describe_numeric_1d], diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py index 2547d64b1..b7f41c660 100644 --- a/src/ydata_profiling/model/summary.py +++ b/src/ydata_profiling/model/summary.py @@ -14,7 +14,6 @@ from ydata_profiling.model.summarizer import BaseSummarizer from ydata_profiling.model.var_description.default import VarDescription - spec = importlib.util.find_spec("pyspark") if spec is None: from typing import TypeVar # noqa: E402 diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py index 1bf61aa0c..362ea7c0b 100644 --- a/tests/unit/test_describe.py +++ b/tests/unit/test_describe.py @@ -603,6 +603,6 @@ def test_decribe_series_type_schema(config, summarizer): result = describe(config, df, summarizer, typeset) assert result.variables["date"]["type"] == "DateTime" - assert result.variables["date"]["n_missing"] == 0 + assert result.variables["date"].n_missing == 0 assert result.variables["date"]["n_invalid_dates"] == 2 assert result.variables["date"]["p_invalid_dates"] == 0.5 diff --git a/tests/unit/test_summarizer.py b/tests/unit/test_summarizer.py index 880129b4b..631dab650 100644 --- a/tests/unit/test_summarizer.py +++ b/tests/unit/test_summarizer.py @@ -32,7 +32,7 @@ def test_summarizer_base_types(config: Settings): def test_summarizer_url(config: Settings): config.vars.url.active = True - pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config)) + pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config)) _ = format_summary( pps.summarize(config, pd.Series(["https://www.example.com"]), "URL") ) @@ -40,7 +40,7 @@ def test_summarizer_url(config: Settings): def test_summarizer_path(config: Settings): config.vars.path.active = True - pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config)) + pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config)) _ = format_summary( pps.summarize( config, @@ -60,7 +60,7 @@ def test_summarizer_path(config: Settings): def test_summarizer_file(config: Settings): config.vars.path.active = True config.vars.file.active = True - pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config)) + pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config)) _ = format_summary( pps.summarize( config, @@ -80,7 +80,7 @@ def test_summarizer_image(config: Settings): config.vars.path.active = True config.vars.file.active = True config.vars.image.active = True - pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config)) + pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config)) _ = format_summary( pps.summarize( config, From d27f8eaf9c55c350d11ed86fa3efccfea6183e76 Mon Sep 17 00:00:00 2001 From: Jan Cap Date: Tue, 15 Apr 2025 15:42:50 +0200 Subject: [PATCH 10/10] replace typing Dict with dict --- src/ydata_profiling/model/pandas/summary_pandas.py | 4 ++-- src/ydata_profiling/model/summary.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py index 6fbd39fa9..1dc169baa 100644 --- a/src/ydata_profiling/model/pandas/summary_pandas.py +++ b/src/ydata_profiling/model/pandas/summary_pandas.py @@ -1,7 +1,7 @@ """Compute statistical description of datasets.""" import multiprocessing from concurrent.futures import ThreadPoolExecutor -from typing import Any, Dict, Tuple +from typing import Any, Tuple import numpy as np import pandas as pd @@ -75,7 +75,7 @@ def pandas_get_series_descriptions( summarizer: BaseSummarizer, typeset: VisionsTypeset, pbar: tqdm, -) -> Dict[str, VarDescription]: +) -> dict[str, VarDescription]: def describe_column(name: str, series: pd.Series) -> Tuple[str, VarDescription]: """Process a single series to get the column description.""" pbar.set_postfix_str(f"Describe variable: {name}") diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py index b7f41c660..1b02f0789 100644 --- a/src/ydata_profiling/model/summary.py +++ b/src/ydata_profiling/model/summary.py @@ -1,6 +1,6 @@ """Compute statistical description of datasets.""" import importlib -from typing import Any, Dict +from typing import Any import pandas as pd from tqdm import tqdm @@ -58,7 +58,7 @@ def get_series_descriptions( summarizer: BaseSummarizer, typeset: VisionsTypeset, pbar: tqdm, -) -> Dict[str, VarDescription]: +) -> dict[str, VarDescription]: if isinstance(df, pd.DataFrame): return pandas_get_series_descriptions(config, df, summarizer, typeset, pbar) elif isinstance(df, sparkDataFrame): # type: ignore