From 36cb9ee8f1eb04de9c6ae6840233175ffd8626fe Mon Sep 17 00:00:00 2001
From: Jan Cap <jan.cap@profinit.eu>
Date: Sat, 16 Dec 2023 16:11:14 +0100
Subject: [PATCH 01/10] feat: add vardescription class

---
 .../model/pandas/correlations_pandas.py       | 15 ++---
 .../model/pandas/describe_boolean_pandas.py   | 14 ++---
 .../pandas/describe_categorical_pandas.py     |  8 +--
 .../model/pandas/describe_counts_pandas.py    |  9 +--
 .../model/pandas/describe_date_pandas.py      |  5 +-
 .../model/pandas/describe_file_pandas.py      |  5 +-
 .../model/pandas/describe_generic_pandas.py   |  9 +--
 .../model/pandas/describe_image_pandas.py     |  5 +-
 .../model/pandas/describe_numeric_pandas.py   | 17 +++---
 .../model/pandas/describe_path_pandas.py      | 10 ++--
 .../model/pandas/describe_supported_pandas.py | 29 +++-------
 .../model/pandas/describe_text_pandas.py      |  7 ++-
 .../pandas/describe_timeseries_pandas.py      |  5 +-
 .../model/pandas/describe_url_pandas.py       |  5 +-
 .../model/pandas/summary_pandas.py            |  7 ++-
 .../model/pandas/table_pandas.py              |  9 +--
 .../pandas/var_description/counts_pandas.py   | 53 ++++++++++++++++++
 .../pandas/var_description/default_pandas.py  | 31 ++++++++++
 .../model/var_description/counts.py           | 27 +++++++++
 .../model/var_description/default.py          | 56 +++++++++++++++++++
 20 files changed, 246 insertions(+), 80 deletions(-)
 create mode 100644 src/ydata_profiling/model/pandas/var_description/counts_pandas.py
 create mode 100644 src/ydata_profiling/model/pandas/var_description/default_pandas.py
 create mode 100644 src/ydata_profiling/model/var_description/counts.py
 create mode 100644 src/ydata_profiling/model/var_description/default.py

diff --git a/src/ydata_profiling/model/pandas/correlations_pandas.py b/src/ydata_profiling/model/pandas/correlations_pandas.py
index ab82e6353..698969270 100644
--- a/src/ydata_profiling/model/pandas/correlations_pandas.py
+++ b/src/ydata_profiling/model/pandas/correlations_pandas.py
@@ -1,4 +1,5 @@
 """Correlations between variables."""
+
 import itertools
 import warnings
 from typing import Callable, Optional
@@ -20,6 +21,7 @@
     DiscretizationType,
     Discretizer,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @Spearman.compute.register(Settings, pd.DataFrame, dict)
@@ -87,9 +89,9 @@ def _pairwise_cramers(col_1: pd.Series, col_2: pd.Series) -> float:
     return _cramers_corrected_stat(pd.crosstab(col_1, col_2), correction=True)
 
 
-@Cramers.compute.register(Settings, pd.DataFrame, dict)
+@Cramers.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
 def pandas_cramers_compute(
-    config: Settings, df: pd.DataFrame, summary: dict
+    config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
 ) -> Optional[pd.DataFrame]:
     threshold = config.categorical_maximum_correlation_distinct
 
@@ -128,9 +130,9 @@ def pandas_cramers_compute(
     return correlation_matrix
 
 
-@PhiK.compute.register(Settings, pd.DataFrame, dict)
+@PhiK.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
 def pandas_phik_compute(
-    config: Settings, df: pd.DataFrame, summary: dict
+    config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
 ) -> Optional[pd.DataFrame]:
     df_cols_dict = {i: list(df.columns).index(i) for i in df.columns}
 
@@ -164,9 +166,9 @@ def pandas_phik_compute(
     return correlation
 
 
-@Auto.compute.register(Settings, pd.DataFrame, dict)
+@Auto.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
 def pandas_auto_compute(
-    config: Settings, df: pd.DataFrame, summary: dict
+    config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
 ) -> Optional[pd.DataFrame]:
     threshold = config.categorical_maximum_correlation_distinct
     numerical_columns = [
@@ -195,7 +197,6 @@ def pandas_auto_compute(
         columns=columns_tested,
     )
     for col_1_name, col_2_name in itertools.combinations(columns_tested, 2):
-
         method = (
             _pairwise_spearman
             if any(elem in categorical_columns for elem in [col_1_name, col_2_name])
diff --git a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py
index 9b2014db7..07d446337 100644
--- a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py
@@ -5,17 +5,14 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score
-from ydata_profiling.model.summary_algorithms import (
-    describe_boolean_1d,
-    series_hashable,
-)
+from ydata_profiling.model.summary_algorithms import describe_boolean_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_boolean_1d.register
-@series_hashable
 def pandas_describe_boolean_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe a boolean series.
 
     Args:
@@ -26,8 +23,7 @@ def pandas_describe_boolean_1d(
     Returns:
         A dict containing calculated series description values.
     """
-
-    value_counts: pd.Series = summary["value_counts_without_nan"]
+    value_counts: pd.Series = summary.value_counts_without_nan
     if not value_counts.empty:
         summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]})
         summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))
diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
index 31ae57417..b2b381aa1 100644
--- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
@@ -16,6 +16,7 @@
     series_handle_nulls,
     series_hashable,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def get_character_counts_vc(vc: pd.Series) -> pd.Series:
@@ -214,8 +215,8 @@ def length_summary_vc(vc: pd.Series) -> dict:
 @series_hashable
 @series_handle_nulls
 def pandas_describe_categorical_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe a categorical series.
 
     Args:
@@ -226,12 +227,11 @@ def pandas_describe_categorical_1d(
     Returns:
         A dict containing calculated series description values.
     """
-
     # Make sure we deal with strings (Issue #100)
     series = series.astype(str)
 
     # Only run if at least 1 non-missing value
-    value_counts = summary["value_counts_without_nan"]
+    value_counts = summary.value_counts_without_nan
     value_counts.index = value_counts.index.astype(str)
 
     summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts))
diff --git a/src/ydata_profiling/model/pandas/describe_counts_pandas.py b/src/ydata_profiling/model/pandas/describe_counts_pandas.py
index 07cdad9d5..416474d25 100644
--- a/src/ydata_profiling/model/pandas/describe_counts_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_counts_pandas.py
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_counts
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_counts.register
 def pandas_describe_counts(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Counts the values in a series (with and without NaN, distinct).
 
     Args:
@@ -27,7 +28,7 @@ def pandas_describe_counts(
     except:  # noqa: E722
         hashable = False
 
-    summary["hashable"] = hashable
+    summary.hashable = hashable
 
     if hashable:
         value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0]
@@ -58,6 +59,6 @@ def pandas_describe_counts(
         ordering = False
 
     summary["ordering"] = ordering
-    summary["n_missing"] = n_missing
+    summary.n_missing = n_missing
 
     return config, series, summary
diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py
index 1ff64a50f..39ca21b8c 100644
--- a/src/ydata_profiling/model/pandas/describe_date_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py
@@ -11,14 +11,15 @@
     series_handle_nulls,
     series_hashable,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_date_1d.register
 @series_hashable
 @series_handle_nulls
 def pandas_describe_date_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe a date series.
 
     Args:
diff --git a/src/ydata_profiling/model/pandas/describe_file_pandas.py b/src/ydata_profiling/model/pandas/describe_file_pandas.py
index 84ee3c4ab..18b4e511c 100644
--- a/src/ydata_profiling/model/pandas/describe_file_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_file_pandas.py
@@ -6,6 +6,7 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_file_1d, histogram_compute
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def file_summary(series: pd.Series) -> dict:
@@ -36,8 +37,8 @@ def convert_datetime(x: float) -> str:
 
 @describe_file_1d.register
 def pandas_describe_file_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     if series.hasnans:
         raise ValueError("May not contain NaNs")
     if not hasattr(series, "str"):
diff --git a/src/ydata_profiling/model/pandas/describe_generic_pandas.py b/src/ydata_profiling/model/pandas/describe_generic_pandas.py
index 21b804e66..fcc5b04b6 100644
--- a/src/ydata_profiling/model/pandas/describe_generic_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_generic_pandas.py
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_generic
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_generic.register
 def pandas_describe_generic(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe generic series.
 
     Args:
@@ -27,8 +28,8 @@ def pandas_describe_generic(
     summary.update(
         {
             "n": length,
-            "p_missing": summary["n_missing"] / length if length > 0 else 0,
-            "count": length - summary["n_missing"],
+            "p_missing": summary.n_missing / length if length > 0 else 0,
+            "count": length - summary.n_missing,
             "memory_size": series.memory_usage(deep=config.memory_deep),
         }
     )
diff --git a/src/ydata_profiling/model/pandas/describe_image_pandas.py b/src/ydata_profiling/model/pandas/describe_image_pandas.py
index 08675ed0c..d5f7c8975 100644
--- a/src/ydata_profiling/model/pandas/describe_image_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_image_pandas.py
@@ -12,6 +12,7 @@
     describe_image_1d,
     named_aggregate_summary,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.utils.imghdr_patch import *  # noqa: F401,F403
 
 
@@ -243,8 +244,8 @@ def image_summary(series: pd.Series, exif: bool = False, hash: bool = False) ->
 
 @describe_image_1d.register
 def pandas_describe_image_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     if series.hasnans:
         raise ValueError("May not contain NaNs")
     if not hasattr(series, "str"):
diff --git a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py
index fa3ffd6cf..c51e0ddd8 100644
--- a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.utils.compat import pandas_version_info
 
 if pandas_version_info() >= (1, 5):
@@ -44,9 +45,9 @@ def numeric_stats_pandas(series: pd.Series) -> Dict[str, Any]:
 
 
 def numeric_stats_numpy(
-    present_values: np.ndarray, series: pd.Series, series_description: Dict[str, Any]
+    present_values: np.ndarray, series: pd.Series, series_description: VarDescription
 ) -> Dict[str, Any]:
-    vc = series_description["value_counts_without_nan"]
+    vc = series_description.value_counts_without_nan
     index_values = vc.index.values
 
     # FIXME: can be performance optimized by using weights in std, var, kurt and skew...
@@ -80,8 +81,8 @@ def numeric_stats_numpy(
 @series_hashable
 @series_handle_nulls
 def pandas_describe_numeric_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe a numeric series.
 
     Args:
@@ -96,11 +97,11 @@ def pandas_describe_numeric_1d(
     chi_squared_threshold = config.vars.num.chi_squared_threshold
     quantiles = config.vars.num.quantiles
 
-    value_counts = summary["value_counts_without_nan"]
+    value_counts = summary.value_counts_without_nan
 
     negative_index = value_counts.index < 0
     summary["n_negative"] = value_counts.loc[negative_index].sum()
-    summary["p_negative"] = summary["n_negative"] / summary["n"]
+    summary["p_negative"] = summary["n_negative"] / summary.n
 
     infinity_values = [np.inf, -np.inf]
     infinity_index = value_counts.index.isin(infinity_values)
@@ -139,8 +140,8 @@ def pandas_describe_numeric_1d(
     )
     stats["iqr"] = stats["75%"] - stats["25%"]
     stats["cv"] = stats["std"] / stats["mean"] if stats["mean"] else np.NaN
-    stats["p_zeros"] = stats["n_zeros"] / summary["n"]
-    stats["p_infinite"] = summary["n_infinite"] / summary["n"]
+    stats["p_zeros"] = stats["n_zeros"] / summary.n
+    stats["p_infinite"] = summary["n_infinite"] / summary.n
 
     stats["monotonic_increase"] = series.is_monotonic_increasing
     stats["monotonic_decrease"] = series.is_monotonic_decreasing
diff --git a/src/ydata_profiling/model/pandas/describe_path_pandas.py b/src/ydata_profiling/model/pandas/describe_path_pandas.py
index e3e536f99..31ac65f88 100644
--- a/src/ydata_profiling/model/pandas/describe_path_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_path_pandas.py
@@ -5,6 +5,7 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_path_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def path_summary(series: pd.Series) -> dict:
@@ -19,8 +20,9 @@ def path_summary(series: pd.Series) -> dict:
 
     # TODO: optimize using value counts
     summary = {
-        "common_prefix": os.path.commonprefix(series.values.tolist())
-        or "No common prefix",
+        "common_prefix": (
+            os.path.commonprefix(series.values.tolist()) or "No common prefix"
+        ),
         "stem_counts": series.map(lambda x: os.path.splitext(x)[0]).value_counts(),
         "suffix_counts": series.map(lambda x: os.path.splitext(x)[1]).value_counts(),
         "name_counts": series.map(lambda x: os.path.basename(x)).value_counts(),
@@ -39,8 +41,8 @@ def path_summary(series: pd.Series) -> dict:
 
 @describe_path_1d.register
 def pandas_describe_path_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe a path series.
 
     Args:
diff --git a/src/ydata_profiling/model/pandas/describe_supported_pandas.py b/src/ydata_profiling/model/pandas/describe_supported_pandas.py
index 16bd9ab38..69e19f873 100644
--- a/src/ydata_profiling/model/pandas/describe_supported_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_supported_pandas.py
@@ -3,14 +3,17 @@
 import pandas as pd
 
 from ydata_profiling.config import Settings
-from ydata_profiling.model.summary_algorithms import describe_supported, series_hashable
+from ydata_profiling.model.pandas.var_description.default_pandas import (
+    get_default_pandas_description,
+)
+from ydata_profiling.model.summary_algorithms import describe_supported
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_supported.register
-@series_hashable
 def pandas_describe_supported(
-    config: Settings, series: pd.Series, series_description: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, description: dict
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe a supported series.
 
     Args:
@@ -22,20 +25,6 @@ def pandas_describe_supported(
         A dict containing calculated series description values.
     """
 
-    # number of non-NaN observations in the Series
-    count = series_description["count"]
+    series_description = get_default_pandas_description(config, series, description)
 
-    value_counts = series_description["value_counts_without_nan"]
-    distinct_count = len(value_counts)
-    unique_count = value_counts.where(value_counts == 1).count()
-
-    stats = {
-        "n_distinct": distinct_count,
-        "p_distinct": distinct_count / count if count > 0 else 0,
-        "is_unique": unique_count == count and count > 0,
-        "n_unique": unique_count,
-        "p_unique": unique_count / count if count > 0 else 0,
-    }
-    stats.update(series_description)
-
-    return config, series, stats
+    return config, series, series_description
diff --git a/src/ydata_profiling/model/pandas/describe_text_pandas.py b/src/ydata_profiling/model/pandas/describe_text_pandas.py
index 2701b9760..1cf71f200 100644
--- a/src/ydata_profiling/model/pandas/describe_text_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_text_pandas.py
@@ -14,6 +14,7 @@
     series_handle_nulls,
     series_hashable,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_text_1d.register
@@ -22,8 +23,8 @@
 def pandas_describe_text_1d(
     config: Settings,
     series: pd.Series,
-    summary: dict,
-) -> Tuple[Settings, pd.Series, dict]:
+    summary: VarDescription,
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe string series.
 
     Args:
@@ -38,7 +39,7 @@ def pandas_describe_text_1d(
     series = series.astype(str)
 
     # Only run if at least 1 non-missing value
-    value_counts = summary["value_counts_without_nan"]
+    value_counts = summary.value_counts_without_nan
     value_counts.index = value_counts.index.astype(str)
 
     summary.update({"first_rows": series.head(5)})
diff --git a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py
index 5ffe99a9f..de8903460 100644
--- a/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_timeseries_pandas.py
@@ -13,6 +13,7 @@
     series_handle_nulls,
     series_hashable,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def stationarity_test(config: Settings, series: pd.Series) -> Tuple[bool, float]:
@@ -195,8 +196,8 @@ def compute_gap_stats(series: pd.Series) -> pd.Series:
 @series_hashable
 @series_handle_nulls
 def pandas_describe_timeseries_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe a timeseries.
 
     Args:
diff --git a/src/ydata_profiling/model/pandas/describe_url_pandas.py b/src/ydata_profiling/model/pandas/describe_url_pandas.py
index bfe5239bf..4a64a8c30 100644
--- a/src/ydata_profiling/model/pandas/describe_url_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_url_pandas.py
@@ -5,6 +5,7 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_url_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def url_summary(series: pd.Series) -> dict:
@@ -29,8 +30,8 @@ def url_summary(series: pd.Series) -> dict:
 
 @describe_url_1d.register
 def pandas_describe_url_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, pd.Series, VarDescription]:
     """Describe a url series.
 
     Args:
diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index bbb401fd0..d66906caa 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -13,6 +13,7 @@
 from ydata_profiling.model.summarizer import BaseSummarizer
 from ydata_profiling.model.summary import describe_1d, get_series_descriptions
 from ydata_profiling.model.typeset import ProfilingTypeSet
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.utils.dataframe import sort_column_names
 
 
@@ -22,7 +23,7 @@ def pandas_describe_1d(
     series: pd.Series,
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
-) -> dict:
+) -> VarDescription:
     """Describe a series (infer the variable type, then calculate type-specific values).
 
     Args:
@@ -64,8 +65,8 @@ def pandas_get_series_descriptions(
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
     pbar: tqdm,
-) -> dict:
-    def multiprocess_1d(args: tuple) -> Tuple[str, dict]:
+) -> dict[str, VarDescription]:
+    def multiprocess_1d(args: tuple) -> Tuple[str, VarDescription]:
         """Wrapper to process series in parallel.
 
         Args:
diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py
index a919ee33b..bef531e2f 100644
--- a/src/ydata_profiling/model/pandas/table_pandas.py
+++ b/src/ydata_profiling/model/pandas/table_pandas.py
@@ -4,11 +4,12 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.table import get_table_stats
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @get_table_stats.register
 def pandas_get_table_stats(
-    config: Settings, df: pd.DataFrame, variable_stats: dict
+    config: Settings, df: pd.DataFrame, variable_stats: dict[str, VarDescription]
 ) -> dict:
     """General statistics for the DataFrame.
 
@@ -36,10 +37,10 @@ def pandas_get_table_stats(
     }
 
     for series_summary in variable_stats.values():
-        if "n_missing" in series_summary and series_summary["n_missing"] > 0:
+        if series_summary.n_missing > 0:
             table_stats["n_vars_with_missing"] += 1
-            table_stats["n_cells_missing"] += series_summary["n_missing"]
-            if series_summary["n_missing"] == n:
+            table_stats["n_cells_missing"] += series_summary.n_missing
+            if series_summary.n_missing == n:
                 table_stats["n_vars_all_missing"] += 1
 
     table_stats["p_cells_missing"] = (
diff --git a/src/ydata_profiling/model/pandas/var_description/counts_pandas.py b/src/ydata_profiling/model/pandas/var_description/counts_pandas.py
new file mode 100644
index 000000000..6ffc3e6d5
--- /dev/null
+++ b/src/ydata_profiling/model/pandas/var_description/counts_pandas.py
@@ -0,0 +1,53 @@
+import pandas as pd
+
+from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.counts import VarCounts
+
+
+def get_counts_pandas(config: Settings, series: pd.Series) -> VarCounts:
+    """Get a VarCounts object for a pandas series.
+
+    Value counts are only built for hashable values; n_missing is computed once.
+    """
+    length = len(series)
+
+    try:
+        value_counts_with_nan = series.value_counts(dropna=False)
+        _ = set(value_counts_with_nan.index)
+        hashable = True
+    except Exception:  # best effort: any failure means "not hashable"
+        hashable = False
+
+    value_counts_without_nan = None
+    value_counts_index_sorted = None
+    if hashable:
+        value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0]
+        null_index = value_counts_with_nan.index.isnull()
+        if null_index.any():
+            n_missing = value_counts_with_nan[null_index].sum()
+            value_counts_without_nan = value_counts_with_nan[~null_index]
+        else:
+            n_missing = 0
+            value_counts_without_nan = value_counts_with_nan
+
+        try:
+            value_counts_index_sorted = value_counts_without_nan.sort_index()
+            ordering = True
+        except TypeError:  # the index could not be sorted
+            ordering = False
+    else:
+        n_missing = series.isna().sum()
+        ordering = False
+
+    return VarCounts(
+        hashable=hashable,
+        value_counts_without_nan=value_counts_without_nan,
+        value_counts_index_sorted=value_counts_index_sorted,
+        ordering=ordering,
+        n_missing=n_missing,
+        n=length,
+        p_missing=n_missing / length if length > 0 else 0,
+        count=length - n_missing,
+        memory_size=series.memory_usage(deep=config.memory_deep),
+        value_counts=None,
+    )
diff --git a/src/ydata_profiling/model/pandas/var_description/default_pandas.py b/src/ydata_profiling/model/pandas/var_description/default_pandas.py
new file mode 100644
index 000000000..6a1c21a42
--- /dev/null
+++ b/src/ydata_profiling/model/pandas/var_description/default_pandas.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+import pandas as pd
+
+from ydata_profiling.config import Settings
+from ydata_profiling.model.pandas.var_description.counts_pandas import get_counts_pandas
+from ydata_profiling.model.var_description.default import VarDescription
+
+
+def get_default_pandas_description(
+    config: Settings, series: pd.Series, init_dict: dict
+) -> VarDescription:
+    """Describe a series; init_dict is updated in place and kept as var_specific."""
+    counts = get_counts_pandas(config, series)
+    if not counts.hashable:
+        return VarDescription.from_var_counts(counts, init_dict)
+
+    n_present = counts.count
+    vc = counts.value_counts_without_nan
+    n_distinct = len(vc)
+    n_unique = vc.where(vc == 1).count()
+    init_dict.update(
+        {
+            "n_distinct": n_distinct,
+            "p_distinct": n_distinct / n_present if n_present > 0 else 0,
+            "is_unique": n_unique == n_present and n_present > 0,
+            "n_unique": n_unique,
+            "p_unique": n_unique / n_present if n_present > 0 else 0,
+        }
+    )
+    return VarDescription.from_var_counts(counts, init_dict)
diff --git a/src/ydata_profiling/model/var_description/counts.py b/src/ydata_profiling/model/var_description/counts.py
new file mode 100644
index 000000000..70f96af20
--- /dev/null
+++ b/src/ydata_profiling/model/var_description/counts.py
@@ -0,0 +1,27 @@
+from dataclasses import dataclass
+from typing import Any, Union
+
+
+@dataclass
+class VarCounts:
+    """Data about counts in variable column."""
+
+    n: Union[int, list]
+    """Count of rows in the series."""
+    count: Union[int, list]
+    """Count of not missing rows in the series."""
+    n_missing: Union[int, list]
+    """Count of missing rows in the series."""
+    p_missing: Union[float, list]
+    """Proportion of missing rows in the series."""
+    hashable: Union[bool, list]
+    """Whether the values of the series are hashable."""
+    value_counts_without_nan: Any
+    """Counts of values in the series without NaN. Values as index, counts as values."""
+    value_counts_index_sorted: Any
+    """Counts of values in the series without NaN, sorted by value (the index)."""
+    ordering: Union[bool, list]
+    """Whether the values could be sorted, i.e. value_counts_index_sorted is set."""
+    memory_size: Union[int, list]
+    value_counts: Any
+    """Counts of values in original series type. Values as index, counts as values."""
diff --git a/src/ydata_profiling/model/var_description/default.py b/src/ydata_profiling/model/var_description/default.py
new file mode 100644
index 000000000..49aa63e82
--- /dev/null
+++ b/src/ydata_profiling/model/var_description/default.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from collections import abc
+from dataclasses import dataclass
+from typing import Any, Iterator
+
+from ydata_profiling.model.var_description.counts import VarCounts
+
+
+@dataclass
+class VarDescription(VarCounts):
+    """Default description for one data column.
+    Adds var_specific, the dict (e.g. distinct/unique stats) behind item access."""
+
+    var_specific: dict
+
+    def __getitem__(self, item: str) -> Any:
+        """Read a value from var_specific (VarCounts fields are attributes)."""
+        return self.var_specific[item]
+
+    def __setitem__(self, key: str, value: Any) -> None:
+        """Store a value in var_specific."""
+        self.var_specific[key] = value
+
+    def update(self, _dict: dict) -> None:
+        """To support old dict like interface."""
+        self.var_specific.update(_dict)
+
+    def items(self) -> abc.ItemsView:
+        """To support old dict like interface."""
+        return self.var_specific.items()
+
+    def get(self, key: str, default: Any = None) -> Any:
+        """To support old dict like interface."""
+        return self.var_specific.get(key, default)
+
+    def __iter__(self) -> Iterator:
+        """To support old dict like interface."""
+        return self.var_specific.__iter__()
+
+    @classmethod
+    def from_var_counts(cls, var_counts: VarCounts, init_dict: dict) -> VarDescription:
+        """Build a description from VarCounts; init_dict becomes var_specific."""
+        return cls(
+            n=var_counts.n,
+            count=var_counts.count,
+            n_missing=var_counts.n_missing,
+            p_missing=var_counts.p_missing,
+            hashable=var_counts.hashable,
+            memory_size=var_counts.memory_size,
+            ordering=var_counts.ordering,
+            var_specific=init_dict,
+            value_counts_index_sorted=var_counts.value_counts_index_sorted,
+            value_counts_without_nan=var_counts.value_counts_without_nan,
+            value_counts=var_counts.value_counts,
+        )

From f6fa91c2bafaf2a0ef7ad2cd916ee80f98d35491 Mon Sep 17 00:00:00 2001
From: Jan Cap <jan.cap@profinit.eu>
Date: Sat, 16 Dec 2023 16:31:42 +0100
Subject: [PATCH 02/10] feat: make pandas profiling work just fine

---
 src/ydata_profiling/model/alerts.py           | 94 ++++++++++---------
 src/ydata_profiling/model/describe.py         |  4 +-
 src/ydata_profiling/model/description.py      |  4 +-
 .../model/expectation_algorithms.py           | 38 ++++----
 src/ydata_profiling/model/summarizer.py       |  9 +-
 src/ydata_profiling/model/summary.py          |  5 +-
 .../report/structure/report.py                |  4 +-
 7 files changed, 84 insertions(+), 74 deletions(-)

diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py
index d3232ea9b..9ce40522a 100644
--- a/src/ydata_profiling/model/alerts.py
+++ b/src/ydata_profiling/model/alerts.py
@@ -1,13 +1,15 @@
 """Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros , constant
 values, high correlations)."""
+
 from enum import Enum, auto, unique
-from typing import Any, Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set
 
 import numpy as np
 import pandas as pd
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.correlations import perform_check_correlation
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def fmt_percent(value: float, edge_cases: bool = True) -> str:
@@ -143,13 +145,13 @@ def __repr__(self):
 class ConstantLengthAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.CONSTANT_LENGTH,
-            values=values,
+            values=values.var_specific,
             column_name=column_name,
             fields={"composition_min_length", "composition_max_length"},
             is_empty=is_empty,
@@ -162,15 +164,14 @@ def _get_description(self) -> str:
 class ConstantAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.CONSTANT,
-            values=values,
+            values={"n_distinct": values["n_distinct"]},
             column_name=column_name,
-            fields={"n_distinct"},
             is_empty=is_empty,
         )
 
@@ -181,7 +182,7 @@ def _get_description(self) -> str:
 class DuplicatesAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: dict,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
@@ -203,15 +204,14 @@ def _get_description(self) -> str:
 class EmptyAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.EMPTY,
-            values=values,
+            values={"n": values.n},
             column_name=column_name,
-            fields={"n"},
             is_empty=is_empty,
         )
 
@@ -222,15 +222,14 @@ def _get_description(self) -> str:
 class HighCardinalityAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.HIGH_CARDINALITY,
-            values=values,
+            values={"n_distinct": values["n_distinct"]},
             column_name=column_name,
-            fields={"n_distinct"},
             is_empty=is_empty,
         )
 
@@ -244,7 +243,7 @@ def _get_description(self) -> str:
 class HighCorrelationAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: Dict,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
@@ -270,13 +269,13 @@ def _get_description(self) -> str:
 class ImbalanceAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.IMBALANCE,
-            values=values,
+            values=values.var_specific,
             column_name=column_name,
             fields={"imbalance"},
             is_empty=is_empty,
@@ -293,13 +292,13 @@ def _get_description(self) -> str:
 class InfiniteAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.INFINITE,
-            values=values,
+            values=values.var_specific,
             column_name=column_name,
             fields={"p_infinite", "n_infinite"},
             is_empty=is_empty,
@@ -315,15 +314,14 @@ def _get_description(self) -> str:
 class MissingAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.MISSING,
-            values=values,
+            values={"p_missing": values.p_missing, "n_missing": values.n_missing},
             column_name=column_name,
-            fields={"p_missing", "n_missing"},
             is_empty=is_empty,
         )
 
@@ -373,13 +371,13 @@ def _get_description(self) -> str:
 class SkewedAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.SKEWED,
-            values=values,
+            values=values.var_specific,
             column_name=column_name,
             fields={"skewness"},
             is_empty=is_empty,
@@ -432,15 +430,19 @@ def _get_description(self) -> str:
 class UniqueAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.UNIQUE,
-            values=values,
+            values={
+                "n_distinct": values["n_distinct"],
+                "p_distinct": values["p_distinct"],
+                "n_unique": values["n_unique"],
+                "p_unique": values["p_unique"],
+            },
             column_name=column_name,
-            fields={"n_distinct", "p_distinct", "n_unique", "p_unique"},
             is_empty=is_empty,
         )
 
@@ -469,13 +471,13 @@ def _get_description(self) -> str:
 class ZerosAlert(Alert):
     def __init__(
         self,
-        values: Optional[Dict] = None,
+        values: VarDescription,
         column_name: Optional[str] = None,
         is_empty: bool = False,
     ):
         super().__init__(
             alert_type=AlertType.ZEROS,
-            values=values,
+            values=values.var_specific,
             column_name=column_name,
             fields={"n_zeros", "p_zeros"},
             is_empty=is_empty,
@@ -531,7 +533,7 @@ def check_table_alerts(table: dict) -> List[Alert]:
     return alerts
 
 
-def numeric_alerts(config: Settings, summary: dict) -> List[Alert]:
+def numeric_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
     alerts: List[Alert] = []
 
     # Skewness
@@ -555,7 +557,7 @@ def numeric_alerts(config: Settings, summary: dict) -> List[Alert]:
     return alerts
 
 
-def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]:
+def timeseries_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
     alerts: List[Alert] = numeric_alerts(config, summary)
 
     if not summary["stationary"]:
@@ -567,7 +569,7 @@ def timeseries_alerts(config: Settings, summary: dict) -> List[Alert]:
     return alerts
 
 
-def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
+def categorical_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
     alerts: List[Alert] = []
 
     # High cardinality
@@ -585,7 +587,7 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
 
     # Constant length
     if "composition" in summary and summary["min_length"] == summary["max_length"]:
-        alerts.append(ConstantLengthAlert())
+        alerts.append(ConstantLengthAlert(summary))
 
     # Imbalance
     if (
@@ -596,38 +598,38 @@ def categorical_alerts(config: Settings, summary: dict) -> List[Alert]:
     return alerts
 
 
-def boolean_alerts(config: Settings, summary: dict) -> List[Alert]:
+def boolean_alerts(config: Settings, summary: VarDescription) -> List[Alert]:
     alerts: List[Alert] = []
 
     if (
         "imbalance" in summary
         and summary["imbalance"] > config.vars.bool.imbalance_threshold
     ):
-        alerts.append(ImbalanceAlert())
+        alerts.append(ImbalanceAlert(summary))
     return alerts
 
 
-def generic_alerts(summary: dict) -> List[Alert]:
+def generic_alerts(summary: VarDescription) -> List[Alert]:
     alerts: List[Alert] = []
 
     # Missing
-    if alert_value(summary["p_missing"]):
-        alerts.append(MissingAlert())
+    if alert_value(summary.p_missing):
+        alerts.append(MissingAlert(summary))
 
     return alerts
 
 
-def supported_alerts(summary: dict) -> List[Alert]:
+def supported_alerts(summary: VarDescription) -> List[Alert]:
     alerts: List[Alert] = []
 
-    if summary.get("n_distinct", np.nan) == summary["n"]:
-        alerts.append(UniqueAlert())
+    if summary.get("n_distinct", np.nan) == summary.n:
+        alerts.append(UniqueAlert(summary))
     if summary.get("n_distinct", np.nan) == 1:
         alerts.append(ConstantAlert(summary))
     return alerts
 
 
-def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]:
+def unsupported_alerts(summary: VarDescription) -> List[Alert]:
     alerts: List[Alert] = [
         UnsupportedAlert(),
         RejectedAlert(),
@@ -635,7 +637,9 @@ def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]:
     return alerts
 
 
-def check_variable_alerts(config: Settings, col: str, description: dict) -> List[Alert]:
+def check_variable_alerts(
+    config: Settings, col: str, description: VarDescription
+) -> List[Alert]:
     """Checks individual variables for alerts.
 
     Args:
@@ -665,7 +669,6 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List
 
     for idx in range(len(alerts)):
         alerts[idx].column_name = col
-        alerts[idx].values = description
     return alerts
 
 
@@ -693,7 +696,10 @@ def check_correlation_alerts(config: Settings, correlations: dict) -> List[Alert
 
 
 def get_alerts(
-    config: Settings, table_stats: dict, series_description: dict, correlations: dict
+    config: Settings,
+    table_stats: dict,
+    series_description: dict[str, VarDescription],
+    correlations: dict,
 ) -> List[Alert]:
     alerts: List[Alert] = check_table_alerts(table_stats)
     for col, description in series_description.items():
diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py
index 6a7afffe1..a8bfcac1a 100644
--- a/src/ydata_profiling/model/describe.py
+++ b/src/ydata_profiling/model/describe.py
@@ -1,4 +1,5 @@
 """Organize the calculation of statistics for each series in this DataFrame."""
+
 from datetime import datetime
 from typing import Any, Dict, Optional
 
@@ -23,6 +24,7 @@
 from ydata_profiling.model.summary import get_series_descriptions
 from ydata_profiling.model.table import get_table_stats
 from ydata_profiling.model.timeseries_index import get_time_index_description
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.utils.progress_bar import progress
 from ydata_profiling.version import __version__
 
@@ -71,7 +73,7 @@ def describe(
 
         # Variable-specific
         pbar.total += len(df.columns)
-        series_description = get_series_descriptions(
+        series_description: dict[str, VarDescription] = get_series_descriptions(
             config, df, summarizer, typeset, pbar
         )
 
diff --git a/src/ydata_profiling/model/description.py b/src/ydata_profiling/model/description.py
index 6c386704e..e000b6dcb 100644
--- a/src/ydata_profiling/model/description.py
+++ b/src/ydata_profiling/model/description.py
@@ -4,6 +4,8 @@
 
 from pandas import Timedelta
 
+from ydata_profiling.model.var_description.default import VarDescription
+
 
 @dataclass
 class BaseAnalysis:
@@ -98,7 +100,7 @@ class BaseDescription:
     analysis: BaseAnalysis
     time_index_analysis: Optional[TimeIndexAnalysis]
     table: Any
-    variables: Dict[str, Any]
+    variables: Dict[str, VarDescription]
     scatter: Any
     correlations: Dict[str, Any]
     missing: Dict[str, Any]
diff --git a/src/ydata_profiling/model/expectation_algorithms.py b/src/ydata_profiling/model/expectation_algorithms.py
index cbbeb635e..50c1feea7 100644
--- a/src/ydata_profiling/model/expectation_algorithms.py
+++ b/src/ydata_profiling/model/expectation_algorithms.py
@@ -1,12 +1,14 @@
 from typing import Any, Tuple
 
+from ydata_profiling.model.var_description.default import VarDescription
+
 
 def generic_expectations(
-    name: str, summary: dict, batch: Any, *args
-) -> Tuple[str, dict, Any]:
+    name: str, summary: VarDescription, batch: Any, *args
+) -> Tuple[str, VarDescription, Any]:
     batch.expect_column_to_exist(name)
 
-    if summary["n_missing"] == 0:
+    if summary.n_missing == 0:
         batch.expect_column_values_to_not_be_null(name)
 
     if summary["p_unique"] == 1.0:
@@ -16,8 +18,8 @@ def generic_expectations(
 
 
 def numeric_expectations(
-    name: str, summary: dict, batch: Any, *args
-) -> Tuple[str, dict, Any]:
+    name: str, summary: VarDescription, batch: Any, *args
+) -> Tuple[str, VarDescription, Any]:
     from great_expectations.profile.base import ProfilerTypeMapping
 
     numeric_type_names = (
@@ -56,8 +58,8 @@ def numeric_expectations(
 
 
 def categorical_expectations(
-    name: str, summary: dict, batch: Any, *args
-) -> Tuple[str, dict, Any]:
+    name: str, summary: VarDescription, batch: Any, *args
+) -> Tuple[str, VarDescription, Any]:
     # Use for both categorical and special case (boolean)
     absolute_threshold = 10
     relative_threshold = 0.2
@@ -66,20 +68,20 @@ def categorical_expectations(
         or summary["p_distinct"] < relative_threshold
     ):
         batch.expect_column_values_to_be_in_set(
-            name, set(summary["value_counts_without_nan"].keys())
+            name, set(summary.value_counts_without_nan.keys())
         )
     return name, summary, batch
 
 
 def path_expectations(
-    name: str, summary: dict, batch: Any, *args
-) -> Tuple[str, dict, Any]:
+    name: str, summary: VarDescription, batch: Any, *args
+) -> Tuple[str, VarDescription, Any]:
     return name, summary, batch
 
 
 def datetime_expectations(
-    name: str, summary: dict, batch: Any, *args
-) -> Tuple[str, dict, Any]:
+    name: str, summary: VarDescription, batch: Any, *args
+) -> Tuple[str, VarDescription, Any]:
     if any(k in summary for k in ["min", "max"]):
         batch.expect_column_values_to_be_between(
             name,
@@ -92,20 +94,20 @@ def datetime_expectations(
 
 
 def image_expectations(
-    name: str, summary: dict, batch: Any, *args
-) -> Tuple[str, dict, Any]:
+    name: str, summary: VarDescription, batch: Any, *args
+) -> Tuple[str, VarDescription, Any]:
     return name, summary, batch
 
 
 def url_expectations(
-    name: str, summary: dict, batch: Any, *args
-) -> Tuple[str, dict, Any]:
+    name: str, summary: VarDescription, batch: Any, *args
+) -> Tuple[str, VarDescription, Any]:
     return name, summary, batch
 
 
 def file_expectations(
-    name: str, summary: dict, batch: Any, *args
-) -> Tuple[str, dict, Any]:
+    name: str, summary: VarDescription, batch: Any, *args
+) -> Tuple[str, VarDescription, Any]:
     # By definition within our type logic, a file exists (as it's a path that also exists)
     batch.expect_file_to_exist(name)
 
diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py
index 92e66733c..e5f46c9b4 100644
--- a/src/ydata_profiling/model/summarizer.py
+++ b/src/ydata_profiling/model/summarizer.py
@@ -11,10 +11,8 @@
 from ydata_profiling.model.summary_algorithms import (
     describe_boolean_1d,
     describe_categorical_1d,
-    describe_counts,
     describe_date_1d,
     describe_file_1d,
-    describe_generic,
     describe_image_1d,
     describe_numeric_1d,
     describe_path_1d,
@@ -23,6 +21,7 @@
     describe_timeseries_1d,
     describe_url_1d,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 class BaseSummarizer(Handler):
@@ -33,7 +32,7 @@ class BaseSummarizer(Handler):
 
     def summarize(
         self, config: Settings, series: pd.Series, dtype: Type[VisionsBaseType]
-    ) -> dict:
+    ) -> VarDescription:
         """
 
         Returns:
@@ -49,8 +48,6 @@ class PandasProfilingSummarizer(BaseSummarizer):
     def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
         summary_map: Dict[str, List[Callable]] = {
             "Unsupported": [
-                describe_counts,
-                describe_generic,
                 describe_supported,
             ],
             "Numeric": [
@@ -87,7 +84,7 @@ def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
         super().__init__(summary_map, typeset, *args, **kwargs)
 
 
-def format_summary(summary: Union[BaseDescription, dict]) -> dict:
+def format_summary(summary: Union[BaseDescription, VarDescription, dict]) -> dict:
     """Prepare summary for export to json file.
 
     Args:
diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py
index 477aae1ca..8e4179598 100644
--- a/src/ydata_profiling/model/summary.py
+++ b/src/ydata_profiling/model/summary.py
@@ -8,6 +8,7 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summarizer import BaseSummarizer
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @multimethod
@@ -16,7 +17,7 @@ def describe_1d(
     series: Any,
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
-) -> dict:
+) -> VarDescription:
     raise NotImplementedError()
 
 
@@ -27,5 +28,5 @@ def get_series_descriptions(
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
     pbar: tqdm,
-) -> dict:
+) -> dict[str, VarDescription]:
     raise NotImplementedError()
diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py
index 24b11e56a..71ea837b3 100644
--- a/src/ydata_profiling/report/structure/report.py
+++ b/src/ydata_profiling/report/structure/report.py
@@ -138,7 +138,7 @@ def render_variables_section(
             "alert_fields": alert_fields,
         }
 
-        template_variables.update(summary)
+        summary.update(template_variables)
 
         # Per type template variables
         if isinstance(summary["type"], list):
@@ -159,7 +159,7 @@ def render_variables_section(
         else:
             variable_type = summary["type"]
         render_map_type = render_map.get(variable_type, render_map["Unsupported"])
-        template_variables.update(render_map_type(config, template_variables))
+        template_variables.update(render_map_type(config, summary))
 
         # Ignore these
         if reject_variables:

From b3e7120e27867fe9417a566af85b160268b78b17 Mon Sep 17 00:00:00 2001
From: Jan Cap <jan.cap@profinit.eu>
Date: Sat, 16 Dec 2023 16:32:20 +0100
Subject: [PATCH 03/10] feat: update render to support VarDescription

---
 .../structure/variables/render_boolean.py     | 22 +++----
 .../structure/variables/render_categorical.py | 61 ++++++++++---------
 .../structure/variables/render_common.py      | 15 ++---
 .../structure/variables/render_complex.py     |  9 +--
 .../structure/variables/render_count.py       |  9 +--
 .../report/structure/variables/render_date.py |  9 +--
 .../report/structure/variables/render_file.py |  5 +-
 .../structure/variables/render_generic.py     |  9 +--
 .../structure/variables/render_image.py       |  9 +--
 .../report/structure/variables/render_path.py |  5 +-
 .../report/structure/variables/render_real.py |  9 +--
 .../report/structure/variables/render_text.py |  9 +--
 .../structure/variables/render_timeseries.py  |  9 +--
 .../report/structure/variables/render_url.py  | 15 ++---
 14 files changed, 104 insertions(+), 91 deletions(-)

diff --git a/src/ydata_profiling/report/structure/variables/render_boolean.py b/src/ydata_profiling/report/structure/variables/render_boolean.py
index e6bdbe4d0..b2213f682 100644
--- a/src/ydata_profiling/report/structure/variables/render_boolean.py
+++ b/src/ydata_profiling/report/structure/variables/render_boolean.py
@@ -1,6 +1,7 @@
 from typing import List
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
 from ydata_profiling.report.presentation.core import (
     Container,
@@ -16,7 +17,7 @@
 from ydata_profiling.visualisation.plot import cat_frequency_plot
 
 
-def render_boolean(config: Settings, summary: dict) -> dict:
+def render_boolean(config: Settings, summary: VarDescription) -> dict:
     varid = summary["varid"]
     n_obs_bool = config.vars.bool.n_obs
     image_format = config.plot.image_format
@@ -48,17 +49,17 @@ def render_boolean(config: Settings, summary: dict) -> dict:
             },
             {
                 "name": "Missing",
-                "value": fmt(summary["n_missing"]),
+                "value": fmt(summary.n_missing),
                 "alert": "n_missing" in summary["alert_fields"],
             },
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
                 "alert": "p_missing" in summary["alert_fields"],
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
                 "alert": False,
             },
         ],
@@ -67,8 +68,8 @@ def render_boolean(config: Settings, summary: dict) -> dict:
 
     fqm = FrequencyTableSmall(
         freq_table(
-            freqtable=summary["value_counts_without_nan"],
-            n=summary["n"],
+            freqtable=summary.value_counts_without_nan,
+            n=summary.n,
             max_number_to_print=n_obs_bool,
         ),
         redact=False,
@@ -89,7 +90,7 @@ def render_boolean(config: Settings, summary: dict) -> dict:
     max_unique = config.plot.cat_freq.max_unique
 
     if show and (max_unique > 0):
-        if isinstance(summary["value_counts_without_nan"], list):
+        if isinstance(summary.value_counts_without_nan, list):
             items.append(
                 Container(
                     [
@@ -103,7 +104,7 @@ def render_boolean(config: Settings, summary: dict) -> dict:
                             name=config.html.style._labels[idx],
                             anchor_id=f"{varid}cat_frequency_plot_{idx}",
                         )
-                        for idx, s in enumerate(summary["value_counts_without_nan"])
+                        for idx, s in enumerate(summary.value_counts_without_nan)
                     ],
                     anchor_id=f"{varid}cat_frequency_plot",
                     name="Common Values (Plot)",
@@ -114,10 +115,7 @@ def render_boolean(config: Settings, summary: dict) -> dict:
         else:
             items.append(
                 Image(
-                    cat_frequency_plot(
-                        config,
-                        summary["value_counts_without_nan"],
-                    ),
+                    cat_frequency_plot(config, summary.value_counts_without_nan),
                     image_format=image_format,
                     alt="Common Values (Plot)",
                     name="Common Values (Plot)",
diff --git a/src/ydata_profiling/report/structure/variables/render_categorical.py b/src/ydata_profiling/report/structure/variables/render_categorical.py
index 86f5a262a..db1b5ec52 100644
--- a/src/ydata_profiling/report/structure/variables/render_categorical.py
+++ b/src/ydata_profiling/report/structure/variables/render_categorical.py
@@ -3,6 +3,7 @@
 import pandas as pd
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import (
     fmt,
     fmt_bytesize,
@@ -27,7 +28,7 @@
 
 
 def render_categorical_frequency(
-    config: Settings, summary: dict, varid: str
+    config: Settings, summary: VarDescription, varid: str
 ) -> Renderable:
     frequency_table = Table(
         [
@@ -54,7 +55,7 @@ def render_categorical_frequency(
 
 
 def render_categorical_length(
-    config: Settings, summary: dict, varid: str
+    config: Settings, summary: VarDescription, varid: str
 ) -> Tuple[Renderable, Renderable]:
     length_table = Table(
         [
@@ -117,7 +118,7 @@ def _get_n(value: Union[list, pd.DataFrame]) -> Union[int, List[int]]:
 
 
 def render_categorical_unicode(
-    config: Settings, summary: dict, varid: str
+    config: Settings, summary: VarDescription, varid: str
 ) -> Tuple[Renderable, Renderable]:
     n_freq_table_max = config.n_freq_table_max
 
@@ -329,7 +330,7 @@ def render_categorical_unicode(
     )
 
 
-def render_categorical(config: Settings, summary: dict) -> dict:
+def render_categorical(config: Settings, summary: VarDescription) -> dict:
     varid = summary["varid"]
     n_obs_cat = config.vars.cat.n_obs
     image_format = config.plot.image_format
@@ -366,17 +367,17 @@ def render_categorical(config: Settings, summary: dict) -> dict:
             },
             {
                 "name": "Missing",
-                "value": fmt(summary["n_missing"]),
+                "value": fmt(summary.n_missing),
                 "alert": "n_missing" in summary["alert_fields"],
             },
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
                 "alert": "p_missing" in summary["alert_fields"],
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
                 "alert": False,
             },
         ],
@@ -385,8 +386,8 @@ def render_categorical(config: Settings, summary: dict) -> dict:
 
     fqm = FrequencyTableSmall(
         freq_table(
-            freqtable=summary["value_counts_without_nan"],
-            n=summary["count"],
+            freqtable=summary.value_counts_without_nan,
+            n=summary.count,
             max_number_to_print=n_obs_cat,
         ),
         redact=config.vars.cat.redact,
@@ -459,26 +460,28 @@ def render_categorical(config: Settings, summary: dict) -> dict:
     max_unique = config.plot.cat_freq.max_unique
 
     if show and (max_unique > 0):
-        if isinstance(summary["value_counts_without_nan"], list):
+        if isinstance(summary.value_counts_without_nan, list):
             string_items.append(
                 Container(
                     [
-                        Image(
-                            cat_frequency_plot(
-                                config,
-                                s,
-                            ),
-                            image_format=image_format,
-                            alt=config.html.style._labels[idx],
-                            name=config.html.style._labels[idx],
-                            anchor_id=f"{varid}cat_frequency_plot_{idx}",
+                        (
+                            Image(
+                                cat_frequency_plot(
+                                    config,
+                                    s,
+                                ),
+                                image_format=image_format,
+                                alt=config.html.style._labels[idx],
+                                name=config.html.style._labels[idx],
+                                anchor_id=f"{varid}cat_frequency_plot_{idx}",
+                            )
+                            if summary["n_distinct"][idx] <= max_unique
+                            else HTML(
+                                f"<h4 class='indent'>{config.html.style._labels[idx]}</h4><br />"
+                                f"<em>Number of variable categories passes threshold (<code>config.plot.cat_freq.max_unique</code>)</em>"
+                            )
                         )
-                        if summary["n_distinct"][idx] <= max_unique
-                        else HTML(
-                            f"<h4 class='indent'>{config.html.style._labels[idx]}</h4><br />"
-                            f"<em>Number of variable categories passes threshold (<code>config.plot.cat_freq.max_unique</code>)</em>"
-                        )
-                        for idx, s in enumerate(summary["value_counts_without_nan"])
+                        for idx, s in enumerate(summary.value_counts_without_nan)
                     ],
                     anchor_id=f"{varid}cat_frequency_plot",
                     name="Common Values (Plot)",
@@ -493,7 +496,7 @@ def render_categorical(config: Settings, summary: dict) -> dict:
                 Image(
                     cat_frequency_plot(
                         config,
-                        summary["value_counts_without_nan"],
+                        summary.value_counts_without_nan,
                     ),
                     image_format=image_format,
                     alt="Common Values (Plot)",
@@ -515,9 +518,9 @@ def render_categorical(config: Settings, summary: dict) -> dict:
             string_items,
             name="Categories",
             anchor_id=f"{varid}string",
-            sequence_type="named_list"
-            if len(config.html.style._labels) > 1
-            else "batch_grid",
+            sequence_type=(
+                "named_list" if len(config.html.style._labels) > 1 else "batch_grid"
+            ),
             batch_size=len(config.html.style._labels),
         ),
     ]
diff --git a/src/ydata_profiling/report/structure/variables/render_common.py b/src/ydata_profiling/report/structure/variables/render_common.py
index aef8de357..b597eda08 100644
--- a/src/ydata_profiling/report/structure/variables/render_common.py
+++ b/src/ydata_profiling/report/structure/variables/render_common.py
@@ -1,30 +1,31 @@
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.presentation.frequency_table_utils import (
     extreme_obs_table,
     freq_table,
 )
 
 
-def render_common(config: Settings, summary: dict) -> dict:
+def render_common(config: Settings, summary: VarDescription) -> dict:
     n_extreme_obs = config.n_extreme_obs
     n_freq_table_max = config.n_freq_table_max
 
     template_variables = {
         # TODO: with nan
         "freq_table_rows": freq_table(
-            freqtable=summary["value_counts_without_nan"],
-            n=summary["n"],
+            freqtable=summary.value_counts_without_nan,
+            n=summary.n,
             max_number_to_print=n_freq_table_max,
         ),
         "firstn_expanded": extreme_obs_table(
-            freqtable=summary["value_counts_index_sorted"],
+            freqtable=summary.value_counts_index_sorted,
             number_to_print=n_extreme_obs,
-            n=summary["n"],
+            n=summary.n,
         ),
         "lastn_expanded": extreme_obs_table(
-            freqtable=summary["value_counts_index_sorted"][::-1],
+            freqtable=summary.value_counts_index_sorted[::-1],
             number_to_print=n_extreme_obs,
-            n=summary["n"],
+            n=summary.n,
         ),
     }
 
diff --git a/src/ydata_profiling/report/structure/variables/render_complex.py b/src/ydata_profiling/report/structure/variables/render_complex.py
index 5995285e5..5c4ea7d09 100644
--- a/src/ydata_profiling/report/structure/variables/render_complex.py
+++ b/src/ydata_profiling/report/structure/variables/render_complex.py
@@ -1,4 +1,5 @@
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import (
     fmt,
     fmt_bytesize,
@@ -15,7 +16,7 @@
 from ydata_profiling.visualisation.plot import scatter_complex
 
 
-def render_complex(config: Settings, summary: dict) -> dict:
+def render_complex(config: Settings, summary: VarDescription) -> dict:
     varid = summary["varid"]
     template_variables = {}
     image_format = config.plot.image_format
@@ -37,14 +38,14 @@ def render_complex(config: Settings, summary: dict) -> dict:
                 "name": "Distinct (%)",
                 "value": fmt_percent(summary["p_distinct"]),
             },
-            {"name": "Missing", "value": fmt(summary["n_missing"])},
+            {"name": "Missing", "value": fmt(summary.n_missing)},
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
             },
         ],
         style=config.html.style,
diff --git a/src/ydata_profiling/report/structure/variables/render_count.py b/src/ydata_profiling/report/structure/variables/render_count.py
index e11e9913e..e9b238659 100644
--- a/src/ydata_profiling/report/structure/variables/render_count.py
+++ b/src/ydata_profiling/report/structure/variables/render_count.py
@@ -1,4 +1,5 @@
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import (
     fmt,
     fmt_bytesize,
@@ -16,7 +17,7 @@
 from ydata_profiling.visualisation.plot import histogram, mini_histogram
 
 
-def render_count(config: Settings, summary: dict) -> dict:
+def render_count(config: Settings, summary: VarDescription) -> dict:
     template_variables = render_common(config, summary)
     image_format = config.plot.image_format
 
@@ -44,12 +45,12 @@ def render_count(config: Settings, summary: dict) -> dict:
             },
             {
                 "name": "Missing",
-                "value": fmt(summary["n_missing"]),
+                "value": fmt(summary.n_missing),
                 "alert": False,
             },
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
                 "alert": False,
             },
         ],
@@ -87,7 +88,7 @@ def render_count(config: Settings, summary: dict) -> dict:
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
                 "alert": False,
             },
         ],
diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py
index c75a80a5e..94b489cf5 100644
--- a/src/ydata_profiling/report/structure/variables/render_date.py
+++ b/src/ydata_profiling/report/structure/variables/render_date.py
@@ -1,6 +1,7 @@
 from typing import Any, Dict
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
 from ydata_profiling.report.presentation.core import (
     Container,
@@ -11,7 +12,7 @@
 from ydata_profiling.visualisation.plot import histogram, mini_histogram
 
 
-def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
+def render_date(config: Settings, summary: VarDescription) -> Dict[str, Any]:
     varid = summary["varid"]
     template_variables = {}
 
@@ -41,17 +42,17 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
             },
             {
                 "name": "Missing",
-                "value": fmt(summary["n_missing"]),
+                "value": fmt(summary.n_missing),
                 "alert": False,
             },
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
                 "alert": False,
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
                 "alert": False,
             },
         ],
diff --git a/src/ydata_profiling/report/structure/variables/render_file.py b/src/ydata_profiling/report/structure/variables/render_file.py
index 81379a41f..e54dd6e6e 100644
--- a/src/ydata_profiling/report/structure/variables/render_file.py
+++ b/src/ydata_profiling/report/structure/variables/render_file.py
@@ -1,6 +1,7 @@
 from typing import List
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.presentation.core import Container, FrequencyTable, Image
 from ydata_profiling.report.presentation.core.renderable import Renderable
 from ydata_profiling.report.presentation.frequency_table_utils import freq_table
@@ -8,7 +9,7 @@
 from ydata_profiling.visualisation.plot import histogram
 
 
-def render_file(config: Settings, summary: dict) -> dict:
+def render_file(config: Settings, summary: VarDescription) -> dict:
     varid = summary["varid"]
 
     template_variables = render_path(config, summary)
@@ -44,7 +45,7 @@ def render_file(config: Settings, summary: dict) -> dict:
                 FrequencyTable(
                     freq_table(
                         freqtable=summary[file_date_id].value_counts(),
-                        n=summary["n"],
+                        n=summary.n,
                         max_number_to_print=n_freq_table_max,
                     ),
                     name=description,
diff --git a/src/ydata_profiling/report/structure/variables/render_generic.py b/src/ydata_profiling/report/structure/variables/render_generic.py
index 0b2e00efb..c70810cb7 100644
--- a/src/ydata_profiling/report/structure/variables/render_generic.py
+++ b/src/ydata_profiling/report/structure/variables/render_generic.py
@@ -1,4 +1,5 @@
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
 from ydata_profiling.report.presentation.core import (
     HTML,
@@ -8,7 +9,7 @@
 )
 
 
-def render_generic(config: Settings, summary: dict) -> dict:
+def render_generic(config: Settings, summary: VarDescription) -> dict:
     info = VariableInfo(
         anchor_id=summary["varid"],
         alerts=summary["alerts"],
@@ -22,17 +23,17 @@ def render_generic(config: Settings, summary: dict) -> dict:
         [
             {
                 "name": "Missing",
-                "value": fmt(summary["n_missing"]),
+                "value": fmt(summary.n_missing),
                 "alert": "n_missing" in summary["alert_fields"],
             },
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
                 "alert": "p_missing" in summary["alert_fields"],
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
                 "alert": False,
             },
         ],
diff --git a/src/ydata_profiling/report/structure/variables/render_image.py b/src/ydata_profiling/report/structure/variables/render_image.py
index ea1336208..a4491ef89 100644
--- a/src/ydata_profiling/report/structure/variables/render_image.py
+++ b/src/ydata_profiling/report/structure/variables/render_image.py
@@ -1,6 +1,7 @@
 import pandas as pd
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import fmt_numeric
 from ydata_profiling.report.presentation.core import (
     Container,
@@ -13,7 +14,7 @@
 from ydata_profiling.visualisation.plot import scatter_series
 
 
-def render_image(config: Settings, summary: dict) -> dict:
+def render_image(config: Settings, summary: VarDescription) -> dict:
     varid = summary["varid"]
     n_freq_table_max = config.n_freq_table_max
     redact = config.vars.cat.redact
@@ -135,7 +136,7 @@ def render_image(config: Settings, summary: dict) -> dict:
         FrequencyTable(
             freq_table(
                 freqtable=summary["image_dimensions"].value_counts(),
-                n=summary["n"],
+                n=summary.n,
                 max_number_to_print=n_freq_table_max,
             ),
             name="Common values",
@@ -156,7 +157,7 @@ def render_image(config: Settings, summary: dict) -> dict:
             FrequencyTable(
                 freq_table(
                     freqtable=pd.Series(summary["exif_keys_counts"]),
-                    n=summary["n"],
+                    n=summary.n,
                     max_number_to_print=n_freq_table_max,
                 ),
                 name="Exif keys",
@@ -172,7 +173,7 @@ def render_image(config: Settings, summary: dict) -> dict:
                 FrequencyTable(
                     freq_table(
                         freqtable=counts,
-                        n=summary["n"],
+                        n=summary.n,
                         max_number_to_print=n_freq_table_max,
                     ),
                     name=key,
diff --git a/src/ydata_profiling/report/structure/variables/render_path.py b/src/ydata_profiling/report/structure/variables/render_path.py
index d7cde6f06..eaade0114 100644
--- a/src/ydata_profiling/report/structure/variables/render_path.py
+++ b/src/ydata_profiling/report/structure/variables/render_path.py
@@ -1,4 +1,5 @@
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import fmt, fmt_numeric
 from ydata_profiling.report.presentation.core import Container, FrequencyTable, Table
 from ydata_profiling.report.presentation.frequency_table_utils import freq_table
@@ -7,7 +8,7 @@
 )
 
 
-def render_path(config: Settings, summary: dict) -> dict:
+def render_path(config: Settings, summary: VarDescription) -> dict:
     varid = summary["varid"]
     n_freq_table_max = config.n_freq_table_max
     redact = config.vars.cat.redact
@@ -18,7 +19,7 @@ def render_path(config: Settings, summary: dict) -> dict:
     for path_part in keys:
         template_variables[f"freqtable_{path_part}"] = freq_table(
             freqtable=summary[f"{path_part}_counts"],
-            n=summary["n"],
+            n=summary.n,
             max_number_to_print=n_freq_table_max,
         )
 
diff --git a/src/ydata_profiling/report/structure/variables/render_real.py b/src/ydata_profiling/report/structure/variables/render_real.py
index 227200c27..2c9005d44 100644
--- a/src/ydata_profiling/report/structure/variables/render_real.py
+++ b/src/ydata_profiling/report/structure/variables/render_real.py
@@ -1,4 +1,5 @@
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import (
     fmt,
     fmt_bytesize,
@@ -17,7 +18,7 @@
 from ydata_profiling.visualisation.plot import histogram, mini_histogram
 
 
-def render_real(config: Settings, summary: dict) -> dict:
+def render_real(config: Settings, summary: VarDescription) -> dict:
     varid = summary["varid"]
     template_variables = render_common(config, summary)
     image_format = config.plot.image_format
@@ -48,12 +49,12 @@ def render_real(config: Settings, summary: dict) -> dict:
             },
             {
                 "name": "Missing",
-                "value": fmt(summary["n_missing"]),
+                "value": fmt(summary.n_missing),
                 "alert": "n_missing" in summary["alert_fields"],
             },
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
                 "alert": "p_missing" in summary["alert_fields"],
             },
             {
@@ -111,7 +112,7 @@ def render_real(config: Settings, summary: dict) -> dict:
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
                 "alert": False,
             },
         ],
diff --git a/src/ydata_profiling/report/structure/variables/render_text.py b/src/ydata_profiling/report/structure/variables/render_text.py
index 5eadf3799..c4c690e5e 100644
--- a/src/ydata_profiling/report/structure/variables/render_text.py
+++ b/src/ydata_profiling/report/structure/variables/render_text.py
@@ -1,6 +1,7 @@
 from typing import Any, Dict, List
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
 from ydata_profiling.report.presentation.core import (
     Container,
@@ -21,7 +22,7 @@
 from ydata_profiling.visualisation.plot import plot_word_cloud
 
 
-def render_text(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
+def render_text(config: Settings, summary: VarDescription) -> Dict[str, Any]:
     if config.vars.text.redact:
         render = render_categorical(config, summary)
         return render
@@ -58,17 +59,17 @@ def render_text(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
             },
             {
                 "name": "Missing",
-                "value": fmt(summary["n_missing"]),
+                "value": fmt(summary.n_missing),
                 "alert": "n_missing" in summary["alert_fields"],
             },
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
                 "alert": "p_missing" in summary["alert_fields"],
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
                 "alert": False,
             },
         ],
diff --git a/src/ydata_profiling/report/structure/variables/render_timeseries.py b/src/ydata_profiling/report/structure/variables/render_timeseries.py
index 6f3bc27cd..78e62402d 100644
--- a/src/ydata_profiling/report/structure/variables/render_timeseries.py
+++ b/src/ydata_profiling/report/structure/variables/render_timeseries.py
@@ -1,4 +1,5 @@
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import (
     fmt,
     fmt_bytesize,
@@ -81,7 +82,7 @@ def _render_gap_tab(config: Settings, summary: dict) -> Container:
     )
 
 
-def render_timeseries(config: Settings, summary: dict) -> dict:
+def render_timeseries(config: Settings, summary: VarDescription) -> dict:
     varid = summary["varid"]
     template_variables = render_common(config, summary)
     image_format = config.plot.image_format
@@ -111,12 +112,12 @@ def render_timeseries(config: Settings, summary: dict) -> dict:
             },
             {
                 "name": "Missing",
-                "value": fmt(summary["n_missing"]),
+                "value": fmt(summary.n_missing),
                 "alert": "n_missing" in summary["alert_fields"],
             },
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
                 "alert": "p_missing" in summary["alert_fields"],
             },
             {
@@ -164,7 +165,7 @@ def render_timeseries(config: Settings, summary: dict) -> dict:
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
                 "alert": False,
             },
         ],
diff --git a/src/ydata_profiling/report/structure/variables/render_url.py b/src/ydata_profiling/report/structure/variables/render_url.py
index f35d6dcb6..59c007ea5 100644
--- a/src/ydata_profiling/report/structure/variables/render_url.py
+++ b/src/ydata_profiling/report/structure/variables/render_url.py
@@ -1,4 +1,5 @@
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
 from ydata_profiling.report.presentation.core import (
     Container,
@@ -11,7 +12,7 @@
 from ydata_profiling.report.structure.variables.render_common import render_common
 
 
-def render_url(config: Settings, summary: dict) -> dict:
+def render_url(config: Settings, summary: VarDescription) -> dict:
     varid = summary["varid"]
     n_freq_table_max = config.n_freq_table_max
 
@@ -24,7 +25,7 @@ def render_url(config: Settings, summary: dict) -> dict:
     for url_part in keys:
         template_variables[f"freqtable_{url_part}"] = freq_table(
             freqtable=summary[f"{url_part}_counts"],
-            n=summary["n"],
+            n=summary.n,
             max_number_to_print=n_freq_table_max,
         )
 
@@ -101,17 +102,17 @@ def render_url(config: Settings, summary: dict) -> dict:
             },
             {
                 "name": "Missing",
-                "value": fmt(summary["n_missing"]),
+                "value": fmt(summary.n_missing),
                 "alert": "n_missing" in summary["alert_fields"],
             },
             {
                 "name": "Missing (%)",
-                "value": fmt_percent(summary["p_missing"]),
+                "value": fmt_percent(summary.p_missing),
                 "alert": "p_missing" in summary["alert_fields"],
             },
             {
                 "name": "Memory size",
-                "value": fmt_bytesize(summary["memory_size"]),
+                "value": fmt_bytesize(summary.memory_size),
                 "alert": False,
             },
         ],
@@ -120,8 +121,8 @@ def render_url(config: Settings, summary: dict) -> dict:
 
     fqm = FrequencyTableSmall(
         freq_table(
-            freqtable=summary["value_counts_without_nan"],
-            n=summary["n"],
+            freqtable=summary.value_counts_without_nan,
+            n=summary.n,
             max_number_to_print=n_obs_cat,
         ),
         redact=redact,

From b0dc70938663372fe82f8759bdc1c21f5d0bd06f Mon Sep 17 00:00:00 2001
From: Jan Cap <jan.cap@profinit.eu>
Date: Sat, 16 Dec 2023 19:32:28 +0100
Subject: [PATCH 04/10] feat: update tests

---
 tests/unit/test_comparison.py                 |  3 +-
 tests/unit/test_describe.py                   | 10 +++++-
 .../unit/test_ge_integration_expectations.py  | 33 ++++++++++-------
 tests/unit/test_summarizer.py                 | 33 ++++++++++++++---
 tests/unit/test_summary_algos.py              | 36 +++++++++----------
 5 files changed, 77 insertions(+), 38 deletions(-)

diff --git a/tests/unit/test_comparison.py b/tests/unit/test_comparison.py
index 748c5af12..6d5a547b1 100644
--- a/tests/unit/test_comparison.py
+++ b/tests/unit/test_comparison.py
@@ -66,7 +66,8 @@ def test_generate_comparison():
 
     p1 = ProfileReport(df1, title="p1")
     p2 = ProfileReport(df2, title="p1")
-    html = p1.compare(p2).to_html()
+    _compare = p1.compare(p2)
+    html = _compare.to_html()
     assert len(html) > 0
 
 
diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py
index 0eb10b7b7..073b189c1 100644
--- a/tests/unit/test_describe.py
+++ b/tests/unit/test_describe.py
@@ -9,6 +9,7 @@
 from ydata_profiling.model.describe import describe
 from ydata_profiling.model.summary import describe_1d
 from ydata_profiling.model.typeset import ProfilingTypeSet
+from ydata_profiling.model.var_description.default import VarDescription
 
 check_is_NaN = "ydata_profiling.check_is_NaN"
 
@@ -49,7 +50,7 @@ def test_describe_unique(data, expected, summarizer, typeset):
     config = Settings()
     config.vars.num.low_categorical_threshold = 0
 
-    desc_1d = describe_1d(config, data, summarizer, typeset)
+    desc_1d: VarDescription = describe_1d(config, data, summarizer, typeset)
     if expected["is_unique"] is not None:
         assert (
             desc_1d["p_unique"] == expected["p_unique"]
@@ -562,6 +563,13 @@ def test_describe_df(column, describe_data, expected_results, summarizer):
     for k, v in expected_results[column].items():
         if v == check_is_NaN:
             test_condition = k not in results.variables[column]
+        # values from common description
+        elif k in asdict(results.variables[column]):
+            if isinstance(v, float):
+                assert pytest.approx(v) == getattr(results.variables[column], k)
+            else:
+                assert v == getattr(results.variables[column], k)
+            continue
         elif isinstance(v, float):
             test_condition = pytest.approx(v) == results.variables[column][k]
         else:
diff --git a/tests/unit/test_ge_integration_expectations.py b/tests/unit/test_ge_integration_expectations.py
index 4ef0b1465..ed721e4a7 100644
--- a/tests/unit/test_ge_integration_expectations.py
+++ b/tests/unit/test_ge_integration_expectations.py
@@ -1,4 +1,4 @@
-from unittest.mock import Mock, patch
+from unittest.mock import MagicMock, Mock, patch
 
 import pytest
 
@@ -20,14 +20,22 @@ def batch():
 
 
 def test_generic_expectations(batch):
-    generic_expectations("column", {"n_missing": 0, "p_unique": 1.0}, batch)
+    default_desc = MagicMock()
+    default_desc.n_missing = 0
+    d = {"p_unique": 1.0}
+    default_desc.__getitem__.side_effect = d.__getitem__
+    generic_expectations("column", default_desc, batch)
     batch.expect_column_to_exist.assert_called_once()
     batch.expect_column_values_to_not_be_null.assert_called_once()
     batch.expect_column_values_to_be_unique.assert_called_once()
 
 
 def test_generic_expectations_min(batch):
-    generic_expectations("column", {"n_missing": 1, "p_unique": 0.5}, batch)
+    default_desc = MagicMock()
+    default_desc.n_missing = 1
+    d = {"p_unique": 0.5}
+    default_desc.__getitem__.side_effect = d.__getitem__
+    generic_expectations("column", default_desc, batch)
     batch.expect_column_to_exist.assert_called_once()
     batch.expect_column_values_to_not_be_null.assert_not_called()
     batch.expect_column_values_to_be_unique.assert_not_called()
@@ -93,22 +101,21 @@ def test_numeric_expectations_min(batch):
 
 
 def test_categorical_expectations(batch):
-    categorical_expectations(
-        "column",
-        {
-            "n_distinct": 1,
-            "p_distinct": 0.1,
-            "value_counts_without_nan": {"val1": 1, "val2": 2},
-        },
-        batch,
-    )
+    default_desc = MagicMock()
+    d = {"n_distinct": 1, "p_distinct": 0.1}
+    default_desc.__getitem__.side_effect = d.__getitem__
+    default_desc.value_counts_without_nan = {"val1": 1, "val2": 2}
+    categorical_expectations("column", default_desc, batch)
     batch.expect_column_values_to_be_in_set.assert_called_once_with(
         "column", {"val1", "val2"}
     )
 
 
 def test_categorical_expectations_min(batch):
-    categorical_expectations("column", {"n_distinct": 15, "p_distinct": 1.0}, batch)
+    default_desc = MagicMock()
+    d = {"n_distinct": 15, "p_distinct": 1.0}
+    default_desc.__getitem__.side_effect = d.__getitem__
+    categorical_expectations("column", default_desc, batch)
     batch.expect_column_values_to_be_in_set.assert_not_called()
 
 
diff --git a/tests/unit/test_summarizer.py b/tests/unit/test_summarizer.py
index 60fea5590..4db83d69e 100644
--- a/tests/unit/test_summarizer.py
+++ b/tests/unit/test_summarizer.py
@@ -2,13 +2,14 @@
 
 import pandas as pd
 
+from ydata_profiling.config import Settings
 from ydata_profiling.model.summarizer import PandasProfilingSummarizer, format_summary
 from ydata_profiling.model.typeset import ProfilingTypeSet
 
 base_path = os.path.abspath(os.path.dirname(__file__))
 
 
-def test_summarizer(config):
+def test_summarizer_base_types(config: Settings):
     pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))
 
     _ = format_summary(pps.summarize(config, pd.Series([1, 2, 3, 4, 5]), "Unsupported"))
@@ -23,9 +24,23 @@ def test_summarizer(config):
     _ = format_summary(
         pps.summarize(config, pd.Series(["abc", "abc", "abba"]), "Categorical")
     )
+
+    _ = format_summary(
+        pps.summarize(config, pd.Series([True, False, True, False, False]), "Boolean")
+    )
+
+
+def test_summarizer_url(config: Settings):
+    config.vars.url.active = True
+    pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))
     _ = format_summary(
         pps.summarize(config, pd.Series(["https://www.example.com"]), "URL")
     )
+
+
+def test_summarizer_path(config: Settings):
+    config.vars.path.active = True
+    pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))
     _ = format_summary(
         pps.summarize(
             config,
@@ -40,6 +55,12 @@ def test_summarizer(config):
             "Path",
         )
     )
+
+
+def test_summarizer_file(config: Settings):
+    config.vars.path.active = True
+    config.vars.file.active = True
+    pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))
     _ = format_summary(
         pps.summarize(
             config,
@@ -53,6 +74,13 @@ def test_summarizer(config):
             "File",
         )
     )
+
+
+def test_summarizer_image(config: Settings):
+    config.vars.path.active = True
+    config.vars.file.active = True
+    config.vars.image.active = True
+    pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))
     _ = format_summary(
         pps.summarize(
             config,
@@ -62,6 +90,3 @@ def test_summarizer(config):
             "Image",
         )
     )
-    _ = format_summary(
-        pps.summarize(config, pd.Series([True, False, True, False, False]), "Boolean")
-    )
diff --git a/tests/unit/test_summary_algos.py b/tests/unit/test_summary_algos.py
index 523ce5fcd..d7aa90045 100644
--- a/tests/unit/test_summary_algos.py
+++ b/tests/unit/test_summary_algos.py
@@ -4,35 +4,37 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import (
-    describe_counts,
-    describe_generic,
     describe_supported,
     histogram_compute,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def test_count_summary_sorted(config):
     s = pd.Series([1] + [2] * 1000)
-    _, sn, r = describe_counts(config, s, {})
-    assert r["value_counts_without_nan"].index[0] == 2
-    assert r["value_counts_without_nan"].index[1] == 1
+    r: VarDescription
+    _, sn, r = describe_supported(config, s, {})
+    assert r.value_counts_without_nan.index[0] == 2
+    assert r.value_counts_without_nan.index[1] == 1
 
 
 def test_count_summary_nat(config):
+    r: VarDescription
     s = pd.to_datetime(pd.Series([1, 2] + [np.nan, pd.NaT]))
-    _, sn, r = describe_counts(config, s, {})
-    assert len(r["value_counts_without_nan"].index) == 2
+    _, sn, r = describe_supported(config, s, {})
+    assert len(r.value_counts_without_nan.index) == 2
 
 
 def test_count_summary_category(config):
+    r: VarDescription
     s = pd.Series(
         pd.Categorical(
             ["Poor", "Neutral"] + [np.nan] * 100,
             categories=["Poor", "Neutral", "Excellent"],
         )
     )
-    _, sn, r = describe_counts(config, s, {})
-    assert len(r["value_counts_without_nan"].index) == 2
+    _, sn, r = describe_supported(config, s, {})
+    assert len(r.value_counts_without_nan.index) == 2
 
 
 @pytest.fixture(scope="class")
@@ -41,16 +43,12 @@ def empty_data() -> pd.DataFrame:
 
 
 def test_summary_supported_empty_df(config, empty_data):
-    _, series, summary = describe_counts(config, empty_data["A"], {})
-    assert summary["n_missing"] == 0
-    assert "p_missing" not in summary
-
-    _, series, summary = describe_generic(config, series, summary)
-    assert summary["n_missing"] == 0
-    assert summary["p_missing"] == 0
-    assert summary["count"] == 0
-
-    _, _, summary = describe_supported(config, series, summary)
+    summary: VarDescription
+    _, _, summary = describe_supported(config, empty_data["A"], {})
+    assert summary.n_missing == 0
+    assert summary.n_missing == 0
+    assert summary.p_missing == 0
+    assert summary.count == 0
     assert summary["n_distinct"] == 0
     assert summary["p_distinct"] == 0
     assert summary["n_unique"] == 0

From d91400fc7147c72e5b0410726debf85096c9ce90 Mon Sep 17 00:00:00 2001
From: Jan Cap <jan.cap@profinit.eu>
Date: Sat, 16 Dec 2023 20:04:42 +0100
Subject: [PATCH 05/10] feat: add summary algs support for VariableDescription

---
 .../model/summary_algorithms.py               | 85 +++++++++----------
 1 file changed, 41 insertions(+), 44 deletions(-)

diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py
index b97a72ca7..4a82b3313 100644
--- a/src/ydata_profiling/model/summary_algorithms.py
+++ b/src/ydata_profiling/model/summary_algorithms.py
@@ -7,6 +7,7 @@
 from scipy.stats import chisquare
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.var_description.default import VarDescription
 
 T = TypeVar("T")
 
@@ -62,13 +63,18 @@ def chi_square(
 
 
 def series_hashable(
-    fn: Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]
-) -> Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]:
+    fn: Callable[
+        [Settings, pd.Series, VarDescription],
+        Tuple[Settings, pd.Series, VarDescription],
+    ]
+) -> Callable[
+    [Settings, pd.Series, VarDescription], Tuple[Settings, pd.Series, VarDescription]
+]:
     @functools.wraps(fn)
     def inner(
-        config: Settings, series: pd.Series, summary: dict
-    ) -> Tuple[Settings, pd.Series, dict]:
-        if not summary["hashable"]:
+        config: Settings, series: pd.Series, summary: VarDescription
+    ) -> Tuple[Settings, pd.Series, VarDescription]:
+        if not summary.hashable:
             return config, series, summary
         return fn(config, series, summary)
 
@@ -76,14 +82,19 @@ def inner(
 
 
 def series_handle_nulls(
-    fn: Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]
-) -> Callable[[Settings, pd.Series, dict], Tuple[Settings, pd.Series, dict]]:
+    fn: Callable[
+        [Settings, pd.Series, VarDescription],
+        Tuple[Settings, pd.Series, VarDescription],
+    ]
+) -> Callable[
+    [Settings, pd.Series, VarDescription], Tuple[Settings, pd.Series, VarDescription]
+]:
     """Decorator for nullable series"""
 
     @functools.wraps(fn)
     def inner(
-        config: Settings, series: pd.Series, summary: dict
-    ) -> Tuple[Settings, pd.Series, dict]:
+        config: Settings, series: pd.Series, summary: VarDescription
+    ) -> Tuple[Settings, pd.Series, VarDescription]:
         if series.hasnans:
             series = series.dropna()
 
@@ -103,92 +114,78 @@ def named_aggregate_summary(series: pd.Series, key: str) -> dict:
     return summary
 
 
-@multimethod
-def describe_counts(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
-    raise NotImplementedError()
-
-
 @multimethod
 def describe_supported(
     config: Settings, series: Any, series_description: dict
-) -> Tuple[Settings, Any, dict]:
-    raise NotImplementedError()
-
-
-@multimethod
-def describe_generic(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_numeric_1d(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
+    config: Settings, series: Any, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_text_1d(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict, Any]:
+    config: Settings, series: Any, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_date_1d(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
+    config: Settings, series: Any, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_categorical_1d(
-    config: Settings, series: pd.Series, summary: dict
-) -> Tuple[Settings, pd.Series, dict]:
+    config: Settings, series: pd.Series, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_url_1d(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
+    config: Settings, series: Any, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_file_1d(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
+    config: Settings, series: Any, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_path_1d(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
+    config: Settings, series: Any, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_image_1d(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
+    config: Settings, series: Any, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_boolean_1d(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
+    config: Settings, series: Any, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()
 
 
 @multimethod
 def describe_timeseries_1d(
-    config: Settings, series: Any, summary: dict
-) -> Tuple[Settings, Any, dict]:
+    config: Settings, series: Any, summary: VarDescription
+) -> Tuple[Settings, Any, VarDescription]:
     raise NotImplementedError()

From b8069d8b89267af5f60f9532abc2baf8fff7650c Mon Sep 17 00:00:00 2001
From: Jan Cap <jan.cap@profinit.eu>
Date: Sat, 16 Dec 2023 20:05:58 +0100
Subject: [PATCH 06/10] feat: add spark support

---
 src/ydata_profiling/model/alerts.py           |  5 +-
 src/ydata_profiling/model/pandas/__init__.py  |  4 --
 .../model/pandas/describe_counts_pandas.py    | 64 -------------------
 .../model/pandas/describe_date_pandas.py      |  2 +-
 .../model/pandas/describe_generic_pandas.py   | 37 -----------
 src/ydata_profiling/model/spark/__init__.py   |  4 --
 .../model/spark/correlations_spark.py         |  1 +
 .../model/spark/describe_boolean_spark.py     |  7 +-
 .../model/spark/describe_categorical_spark.py |  5 +-
 .../model/spark/describe_date_spark.py        |  5 +-
 .../model/spark/describe_generic_spark.py     | 32 ----------
 .../model/spark/describe_numeric_spark.py     | 21 +++---
 .../model/spark/describe_supported_spark.py   | 20 ++----
 .../model/spark/describe_text_spark.py        |  5 +-
 .../model/spark/summary_spark.py              |  1 +
 .../model/spark/timeseries_index_spark.py     |  1 +
 .../counts_spark.py}                          | 42 ++++++------
 .../spark/var_description/default_spark.py    | 46 +++++++++++++
 .../model/var_description/default.py          |  4 ++
 19 files changed, 110 insertions(+), 196 deletions(-)
 delete mode 100644 src/ydata_profiling/model/pandas/describe_counts_pandas.py
 delete mode 100644 src/ydata_profiling/model/pandas/describe_generic_pandas.py
 delete mode 100644 src/ydata_profiling/model/spark/describe_generic_spark.py
 rename src/ydata_profiling/model/spark/{describe_counts_spark.py => var_description/counts_spark.py} (58%)
 create mode 100644 src/ydata_profiling/model/spark/var_description/default_spark.py

diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py
index 9ce40522a..ec352ddbf 100644
--- a/src/ydata_profiling/model/alerts.py
+++ b/src/ydata_profiling/model/alerts.py
@@ -170,7 +170,10 @@ def __init__(
     ):
         super().__init__(
             alert_type=AlertType.CONSTANT,
-            values={"n_distinct": values["n_distinct"]},
+            values={
+                "n_distinct": values["n_distinct"],
+                "value_counts_without_nan": values.value_counts_without_nan,
+            },
             column_name=column_name,
             is_empty=is_empty,
         )
diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py
index 59ccf853c..e929d4731 100644
--- a/src/ydata_profiling/model/pandas/__init__.py
+++ b/src/ydata_profiling/model/pandas/__init__.py
@@ -3,10 +3,8 @@
     dataframe_pandas,
     describe_boolean_pandas,
     describe_categorical_pandas,
-    describe_counts_pandas,
     describe_date_pandas,
     describe_file_pandas,
-    describe_generic_pandas,
     describe_image_pandas,
     describe_numeric_pandas,
     describe_path_pandas,
@@ -27,10 +25,8 @@
     "dataframe_pandas",
     "describe_boolean_pandas",
     "describe_categorical_pandas",
-    "describe_counts_pandas",
     "describe_date_pandas",
     "describe_file_pandas",
-    "describe_generic_pandas",
     "describe_image_pandas",
     "describe_numeric_pandas",
     "describe_path_pandas",
diff --git a/src/ydata_profiling/model/pandas/describe_counts_pandas.py b/src/ydata_profiling/model/pandas/describe_counts_pandas.py
deleted file mode 100644
index 416474d25..000000000
--- a/src/ydata_profiling/model/pandas/describe_counts_pandas.py
+++ /dev/null
@@ -1,64 +0,0 @@
-from typing import Tuple
-
-import pandas as pd
-
-from ydata_profiling.config import Settings
-from ydata_profiling.model.summary_algorithms import describe_counts
-from ydata_profiling.model.var_description.default import VarDescription
-
-
-@describe_counts.register
-def pandas_describe_counts(
-    config: Settings, series: pd.Series, summary: VarDescription
-) -> Tuple[Settings, pd.Series, VarDescription]:
-    """Counts the values in a series (with and without NaN, distinct).
-
-    Args:
-        config: report Settings object
-        series: Series for which we want to calculate the values.
-        summary: series' summary
-
-    Returns:
-        A dictionary with the count values (with and without NaN, distinct).
-    """
-    try:
-        value_counts_with_nan = series.value_counts(dropna=False)
-        _ = set(value_counts_with_nan.index)
-        hashable = True
-    except:  # noqa: E722
-        hashable = False
-
-    summary.hashable = hashable
-
-    if hashable:
-        value_counts_with_nan = value_counts_with_nan[value_counts_with_nan > 0]
-
-        null_index = value_counts_with_nan.index.isnull()
-        if null_index.any():
-            n_missing = value_counts_with_nan[null_index].sum()
-            value_counts_without_nan = value_counts_with_nan[~null_index]
-        else:
-            n_missing = 0
-            value_counts_without_nan = value_counts_with_nan
-
-        summary.update(
-            {
-                "value_counts_without_nan": value_counts_without_nan,
-            }
-        )
-
-        try:
-            summary["value_counts_index_sorted"] = summary[
-                "value_counts_without_nan"
-            ].sort_index(ascending=True)
-            ordering = True
-        except TypeError:
-            ordering = False
-    else:
-        n_missing = series.isna().sum()
-        ordering = False
-
-    summary["ordering"] = ordering
-    summary.n_missing = n_missing
-
-    return config, series, summary
diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py
index 39ca21b8c..d07425b5f 100644
--- a/src/ydata_profiling/model/pandas/describe_date_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py
@@ -30,7 +30,7 @@ def pandas_describe_date_1d(
     Returns:
         A dict containing calculated series description values.
     """
-    if summary["value_counts_without_nan"].empty:
+    if summary.value_counts_without_nan.empty:
         values = series.values
         summary.update(
             {
diff --git a/src/ydata_profiling/model/pandas/describe_generic_pandas.py b/src/ydata_profiling/model/pandas/describe_generic_pandas.py
deleted file mode 100644
index fcc5b04b6..000000000
--- a/src/ydata_profiling/model/pandas/describe_generic_pandas.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Tuple
-
-import pandas as pd
-
-from ydata_profiling.config import Settings
-from ydata_profiling.model.summary_algorithms import describe_generic
-from ydata_profiling.model.var_description.default import VarDescription
-
-
-@describe_generic.register
-def pandas_describe_generic(
-    config: Settings, series: pd.Series, summary: VarDescription
-) -> Tuple[Settings, pd.Series, VarDescription]:
-    """Describe generic series.
-
-    Args:
-        config: report Settings object
-        series: The Series to describe.
-        summary: The dict containing the series description so far.
-
-    Returns:
-        A dict containing calculated series description values.
-    """
-
-    # number of observations in the Series
-    length = len(series)
-
-    summary.update(
-        {
-            "n": length,
-            "p_missing": summary.n_missing / length if length > 0 else 0,
-            "count": length - summary.n_missing,
-            "memory_size": series.memory_usage(deep=config.memory_deep),
-        }
-    )
-
-    return config, series, summary
diff --git a/src/ydata_profiling/model/spark/__init__.py b/src/ydata_profiling/model/spark/__init__.py
index 854222a9a..7dc7d5043 100644
--- a/src/ydata_profiling/model/spark/__init__.py
+++ b/src/ydata_profiling/model/spark/__init__.py
@@ -3,9 +3,7 @@
     dataframe_spark,
     describe_boolean_spark,
     describe_categorical_spark,
-    describe_counts_spark,
     describe_date_spark,
-    describe_generic_spark,
     describe_numeric_spark,
     describe_supported_spark,
     duplicates_spark,
@@ -21,9 +19,7 @@
     "dataframe_spark",
     "describe_boolean_spark",
     "describe_categorical_spark",
-    "describe_counts_spark",
     "describe_date_spark",
-    "describe_generic_spark",
     "describe_numeric_spark",
     "describe_supported_spark",
     "duplicates_spark",
diff --git a/src/ydata_profiling/model/spark/correlations_spark.py b/src/ydata_profiling/model/spark/correlations_spark.py
index 6f0f2ae25..51c309378 100644
--- a/src/ydata_profiling/model/spark/correlations_spark.py
+++ b/src/ydata_profiling/model/spark/correlations_spark.py
@@ -1,4 +1,5 @@
 """Correlations between variables."""
+
 from typing import Optional
 
 import pandas as pd
diff --git a/src/ydata_profiling/model/spark/describe_boolean_spark.py b/src/ydata_profiling/model/spark/describe_boolean_spark.py
index ab5cf20fb..815af74b8 100644
--- a/src/ydata_profiling/model/spark/describe_boolean_spark.py
+++ b/src/ydata_profiling/model/spark/describe_boolean_spark.py
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_boolean_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_boolean_1d.register
 def describe_boolean_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a boolean series.
 
     Args:
@@ -20,7 +21,7 @@ def describe_boolean_1d_spark(
         A dict containing calculated series description values.
     """
 
-    value_counts = summary["value_counts"]
+    value_counts = summary.value_counts
 
     # get the most common boolean value and its frequency
     top = value_counts.first()
diff --git a/src/ydata_profiling/model/spark/describe_categorical_spark.py b/src/ydata_profiling/model/spark/describe_categorical_spark.py
index 5afdb475c..562472b3d 100644
--- a/src/ydata_profiling/model/spark/describe_categorical_spark.py
+++ b/src/ydata_profiling/model/spark/describe_categorical_spark.py
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_categorical_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_categorical_1d.register
 def describe_categorical_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a categorical series.
 
     Args:
diff --git a/src/ydata_profiling/model/spark/describe_date_spark.py b/src/ydata_profiling/model/spark/describe_date_spark.py
index a5e11a0f1..4bcee2bbf 100644
--- a/src/ydata_profiling/model/spark/describe_date_spark.py
+++ b/src/ydata_profiling/model/spark/describe_date_spark.py
@@ -6,6 +6,7 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_date_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def date_stats_spark(df: DataFrame, summary: dict) -> dict:
@@ -21,8 +22,8 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict:
 
 @describe_date_1d.register
 def describe_date_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a date series.
 
     Args:
diff --git a/src/ydata_profiling/model/spark/describe_generic_spark.py b/src/ydata_profiling/model/spark/describe_generic_spark.py
deleted file mode 100644
index ee2356c0a..000000000
--- a/src/ydata_profiling/model/spark/describe_generic_spark.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from typing import Tuple
-
-from pyspark.sql import DataFrame
-
-from ydata_profiling.config import Settings
-from ydata_profiling.model.summary_algorithms import describe_generic
-
-
-@describe_generic.register
-def describe_generic_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
-    """Describe generic series.
-    Args:
-        series: The Series to describe.
-        summary: The dict containing the series description so far.
-    Returns:
-        A dict containing calculated series description values.
-    """
-
-    # number of observations in the Series
-    length = df.count()
-
-    summary["n"] = length
-    summary["p_missing"] = summary["n_missing"] / length
-    summary["count"] = length - summary["n_missing"]
-
-    # FIXME: This is not correct, but used to fulfil render expectations
-    # @chanedwin
-    summary["memory_size"] = 0
-
-    return config, df, summary
diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py
index 490e33aba..a9fca55cc 100644
--- a/src/ydata_profiling/model/spark/describe_numeric_spark.py
+++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py
@@ -9,9 +9,10 @@
     describe_numeric_1d,
     histogram_compute,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
-def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
+def numeric_stats_spark(df: DataFrame, summary: VarDescription) -> dict:
     column = df.columns[0]
 
     expr = [
@@ -29,8 +30,8 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
 
 @describe_numeric_1d.register
 def describe_numeric_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a boolean series.
 
     Args:
@@ -51,7 +52,7 @@ def describe_numeric_1d_spark(
     summary["kurtosis"] = stats["kurtosis"]
     summary["sum"] = stats["sum"]
 
-    value_counts = summary["value_counts"]
+    value_counts = summary.value_counts
 
     n_infinite = (
         value_counts.where(F.col(df.columns[0]).isin([np.inf, -np.inf]))
@@ -106,12 +107,12 @@ def describe_numeric_1d_spark(
     ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0]
 
     # FIXME: move to fmt
-    summary["p_negative"] = summary["n_negative"] / summary["n"]
+    summary["p_negative"] = summary["n_negative"] / summary.n
     summary["range"] = summary["max"] - summary["min"]
     summary["iqr"] = summary["75%"] - summary["25%"]
     summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.NaN
-    summary["p_zeros"] = summary["n_zeros"] / summary["n"]
-    summary["p_infinite"] = summary["n_infinite"] / summary["n"]
+    summary["p_zeros"] = summary["n_zeros"] / summary.n
+    summary["p_infinite"] = summary["n_infinite"] / summary.n
 
     # TODO - enable this feature
     # because spark doesn't have an indexing system, there isn't really the idea of monotonic increase/decrease
@@ -124,14 +125,14 @@ def describe_numeric_1d_spark(
     # display in pandas display
     # the alternative is to do this in spark natively, but it is not trivial
     infinity_values = [np.inf, -np.inf]
-    infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values)
+    infinity_index = summary.value_counts_without_nan.index.isin(infinity_values)
 
     summary.update(
         histogram_compute(
             config,
-            summary["value_counts_without_nan"][~infinity_index].index.values,
+            summary.value_counts_without_nan[~infinity_index].index.values,
             summary["n_distinct"],
-            weights=summary["value_counts_without_nan"][~infinity_index].values,
+            weights=summary.value_counts_without_nan[~infinity_index].values,
         )
     )
 
diff --git a/src/ydata_profiling/model/spark/describe_supported_spark.py b/src/ydata_profiling/model/spark/describe_supported_spark.py
index 1758f668d..d5d395156 100644
--- a/src/ydata_profiling/model/spark/describe_supported_spark.py
+++ b/src/ydata_profiling/model/spark/describe_supported_spark.py
@@ -3,13 +3,17 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.spark.var_description.default_spark import (
+    get_default_spark_description,
+)
 from ydata_profiling.model.summary_algorithms import describe_supported
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_supported.register
 def describe_supported_spark(
     config: Settings, series: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a supported series.
     Args:
         series: The Series to describe.
@@ -18,16 +22,6 @@ def describe_supported_spark(
         A dict containing calculated series description values.
     """
 
-    # number of non-NaN observations in the Series
-    count = summary["count"]
-    n_distinct = summary["value_counts"].count()
+    series_description = get_default_spark_description(config, series, summary)
 
-    summary["n_distinct"] = n_distinct
-    summary["p_distinct"] = n_distinct / count if count > 0 else 0
-
-    n_unique = summary["value_counts"].where("count == 1").count()
-    summary["is_unique"] = n_unique == count
-    summary["n_unique"] = n_unique
-    summary["p_unique"] = n_unique / count if count > 0 else 0
-
-    return config, series, summary
+    return config, series, series_description
diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py
index b5e27f615..6a95b2884 100644
--- a/src/ydata_profiling/model/spark/describe_text_spark.py
+++ b/src/ydata_profiling/model/spark/describe_text_spark.py
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_text_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_text_1d.register
 def describe_text_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a categorical series.
 
     Args:
diff --git a/src/ydata_profiling/model/spark/summary_spark.py b/src/ydata_profiling/model/spark/summary_spark.py
index 13a85f4c3..e2857b0a8 100644
--- a/src/ydata_profiling/model/spark/summary_spark.py
+++ b/src/ydata_profiling/model/spark/summary_spark.py
@@ -1,4 +1,5 @@
 """Compute statistical description of datasets."""
+
 import multiprocessing
 from typing import Tuple
 
diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py
index cdf3d88dd..236825e6a 100644
--- a/src/ydata_profiling/model/spark/timeseries_index_spark.py
+++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py
@@ -1,4 +1,5 @@
 """Compute statistical description of datasets."""
+
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
diff --git a/src/ydata_profiling/model/spark/describe_counts_spark.py b/src/ydata_profiling/model/spark/var_description/counts_spark.py
similarity index 58%
rename from src/ydata_profiling/model/spark/describe_counts_spark.py
rename to src/ydata_profiling/model/spark/var_description/counts_spark.py
index 0f813f2ce..15a2c2bd3 100644
--- a/src/ydata_profiling/model/spark/describe_counts_spark.py
+++ b/src/ydata_profiling/model/spark/var_description/counts_spark.py
@@ -1,23 +1,12 @@
-from typing import Tuple
-
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
-from ydata_profiling.model.summary_algorithms import describe_counts
-
-
-@describe_counts.register
-def describe_counts_spark(
-    config: Settings, series: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
-    """Counts the values in a series (with and without NaN, distinct).
+from ydata_profiling.model.var_description.counts import VarCounts
 
-    Args:
-        series: Series for which we want to calculate the values.
 
-    Returns:
-        A dictionary with the count values (with and without NaN, distinct).
-    """
+def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts:
+    """Get a VarCounts object for a spark series."""
+    length = series.count()
 
     value_counts = series.groupBy(series.columns).count()
     value_counts = value_counts.sort("count", ascending=False).persist()
@@ -37,14 +26,10 @@ def describe_counts_spark(
         .squeeze(axis="columns")
     )
 
-    summary["n_missing"] = n_missing
-    summary["value_counts"] = value_counts.persist()
-    summary["value_counts_index_sorted"] = value_counts_index_sorted
-
     # this is necessary as freqtables requires value_counts_without_nan
     # to be a pandas series. However, if we try to get everything into
     # pandas we will definitly crash the server
-    summary["value_counts_without_nan"] = (
+    value_counts_without_nan = (
         value_counts.dropna()
         .limit(200)
         .toPandas()
@@ -52,4 +37,19 @@ def describe_counts_spark(
         .squeeze(axis="columns")
     )
 
-    return config, series, summary
+    # FIXME: This is not correct, but used to fulfil render expectations
+    # @chanedwin
+    memory_size = 0
+    return VarCounts(
+        hashable=False,
+        value_counts_without_nan=value_counts_without_nan,
+        value_counts_index_sorted=value_counts_index_sorted,
+        ordering=False,
+        n_missing=n_missing,
+        n=length,
+        # An empty DataFrame has length 0; report 0 instead of dividing by zero.
+        p_missing=n_missing / length if length > 0 else 0,
+        count=length - n_missing,
+        memory_size=memory_size,
+        value_counts=value_counts.persist(),
+    )
diff --git a/src/ydata_profiling/model/spark/var_description/default_spark.py b/src/ydata_profiling/model/spark/var_description/default_spark.py
new file mode 100644
index 000000000..ff264e63e
--- /dev/null
+++ b/src/ydata_profiling/model/spark/var_description/default_spark.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from pyspark.sql import DataFrame
+
+from ydata_profiling.config import Settings
+from ydata_profiling.model.spark.var_description.counts_spark import get_counts_spark
+from ydata_profiling.model.var_description.default import VarDescription
+
+
+def get_default_spark_description(
+    config: Settings, series: DataFrame, init_dict: dict
+) -> VarDescription:
+    """Build the default VarDescription of a spark series.
+
+    Distinct/unique statistics are added to ``init_dict`` in place.
+    """
+    counts = get_counts_spark(config, series)
+    non_missing = counts.count
+    has_values = non_missing > 0
+
+    n_distinct = counts.value_counts.count()
+    n_unique = counts.value_counts.where("count == 1").count()
+
+    init_dict.update(
+        {
+            "n_distinct": n_distinct,
+            "p_distinct": n_distinct / non_missing if has_values else 0,
+            "is_unique": n_unique == non_missing,
+            "n_unique": n_unique,
+            "p_unique": n_unique / non_missing if has_values else 0,
+        }
+    )
+
+    return VarDescription(
+        n=counts.n,
+        count=non_missing,
+        n_missing=counts.n_missing,
+        p_missing=counts.p_missing,
+        hashable=counts.hashable,
+        memory_size=counts.memory_size,
+        ordering=counts.ordering,
+        value_counts_index_sorted=counts.value_counts_index_sorted,
+        value_counts_without_nan=counts.value_counts_without_nan,
+        value_counts=counts.value_counts,
+        var_specific=init_dict,
+    )
diff --git a/src/ydata_profiling/model/var_description/default.py b/src/ydata_profiling/model/var_description/default.py
index 49aa63e82..05fb38ed0 100644
--- a/src/ydata_profiling/model/var_description/default.py
+++ b/src/ydata_profiling/model/var_description/default.py
@@ -34,6 +34,10 @@ def get(self, key: str, default: Any = None) -> Any:
         """To support old dict like interface."""
         return self.var_specific.get(key, default)
 
+    def pop(self, key: str, default: Any = None) -> Any:
+        """Pop ``key`` from ``var_specific``; return ``default`` if it is absent."""
+        return self.var_specific.pop(key, default)
+
     def __iter__(self) -> Iterator:
         """To support old dict like interface."""
         return self.var_specific.__iter__()

From b9f10cf65f47197bfb4240dcb18ef2ed2a11b987 Mon Sep 17 00:00:00 2001
From: Jan Cap <jan.cap@profinit.eu>
Date: Sat, 16 Dec 2023 21:03:31 +0100
Subject: [PATCH 07/10] feat: add support for python 3.8

---
 src/ydata_profiling/model/alerts.py                |  2 +-
 src/ydata_profiling/model/describe.py              |  2 +-
 .../model/pandas/correlations_pandas.py            | 14 +++++++-------
 src/ydata_profiling/model/pandas/summary_pandas.py |  4 ++--
 src/ydata_profiling/model/pandas/table_pandas.py   |  3 ++-
 src/ydata_profiling/model/summary.py               |  4 ++--
 6 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py
index ec352ddbf..6fbfad069 100644
--- a/src/ydata_profiling/model/alerts.py
+++ b/src/ydata_profiling/model/alerts.py
@@ -701,7 +701,7 @@ def check_correlation_alerts(config: Settings, correlations: dict) -> List[Alert
 def get_alerts(
     config: Settings,
     table_stats: dict,
-    series_description: dict[str, VarDescription],
+    series_description: Dict[str, VarDescription],
     correlations: dict,
 ) -> List[Alert]:
     alerts: List[Alert] = check_table_alerts(table_stats)
diff --git a/src/ydata_profiling/model/describe.py b/src/ydata_profiling/model/describe.py
index a8bfcac1a..9a2ad1619 100644
--- a/src/ydata_profiling/model/describe.py
+++ b/src/ydata_profiling/model/describe.py
@@ -73,7 +73,7 @@ def describe(
 
         # Variable-specific
         pbar.total += len(df.columns)
-        series_description: dict[str, VarDescription] = get_series_descriptions(
+        series_description: Dict[str, VarDescription] = get_series_descriptions(
             config, df, summarizer, typeset, pbar
         )
 
diff --git a/src/ydata_profiling/model/pandas/correlations_pandas.py b/src/ydata_profiling/model/pandas/correlations_pandas.py
index 698969270..510d4ab34 100644
--- a/src/ydata_profiling/model/pandas/correlations_pandas.py
+++ b/src/ydata_profiling/model/pandas/correlations_pandas.py
@@ -2,7 +2,7 @@
 
 import itertools
 import warnings
-from typing import Callable, Optional
+from typing import Callable, Dict, Optional
 
 import numpy as np
 import pandas as pd
@@ -89,9 +89,9 @@ def _pairwise_cramers(col_1: pd.Series, col_2: pd.Series) -> float:
     return _cramers_corrected_stat(pd.crosstab(col_1, col_2), correction=True)
 
 
-@Cramers.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
+@Cramers.compute.register(Settings, pd.DataFrame, Dict[str, VarDescription])
 def pandas_cramers_compute(
-    config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
+    config: Settings, df: pd.DataFrame, summary: Dict[str, VarDescription]
 ) -> Optional[pd.DataFrame]:
     threshold = config.categorical_maximum_correlation_distinct
 
@@ -130,9 +130,9 @@ def pandas_cramers_compute(
     return correlation_matrix
 
 
-@PhiK.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
+@PhiK.compute.register(Settings, pd.DataFrame, Dict[str, VarDescription])
 def pandas_phik_compute(
-    config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
+    config: Settings, df: pd.DataFrame, summary: Dict[str, VarDescription]
 ) -> Optional[pd.DataFrame]:
     df_cols_dict = {i: list(df.columns).index(i) for i in df.columns}
 
@@ -166,9 +166,9 @@ def pandas_phik_compute(
     return correlation
 
 
-@Auto.compute.register(Settings, pd.DataFrame, dict[str, VarDescription])
+@Auto.compute.register(Settings, pd.DataFrame, Dict[str, VarDescription])
 def pandas_auto_compute(
-    config: Settings, df: pd.DataFrame, summary: dict[str, VarDescription]
+    config: Settings, df: pd.DataFrame, summary: Dict[str, VarDescription]
 ) -> Optional[pd.DataFrame]:
     threshold = config.categorical_maximum_correlation_distinct
     numerical_columns = [
diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index d66906caa..190a9250c 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -2,7 +2,7 @@
 
 import multiprocessing
 import multiprocessing.pool
-from typing import Tuple
+from typing import Dict, Tuple
 
 import numpy as np
 import pandas as pd
@@ -65,7 +65,7 @@ def pandas_get_series_descriptions(
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
     pbar: tqdm,
-) -> dict[str, VarDescription]:
+) -> Dict[str, VarDescription]:
     def multiprocess_1d(args: tuple) -> Tuple[str, VarDescription]:
         """Wrapper to process series in parallel.
 
diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py
index bef531e2f..9198fb0e0 100644
--- a/src/ydata_profiling/model/pandas/table_pandas.py
+++ b/src/ydata_profiling/model/pandas/table_pandas.py
@@ -1,4 +1,5 @@
 from collections import Counter
+from typing import Dict
 
 import pandas as pd
 
@@ -9,7 +10,7 @@
 
 @get_table_stats.register
 def pandas_get_table_stats(
-    config: Settings, df: pd.DataFrame, variable_stats: dict[str, VarDescription]
+    config: Settings, df: pd.DataFrame, variable_stats: Dict[str, VarDescription]
 ) -> dict:
     """General statistics for the DataFrame.
 
diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py
index 8e4179598..e1c0588e2 100644
--- a/src/ydata_profiling/model/summary.py
+++ b/src/ydata_profiling/model/summary.py
@@ -1,6 +1,6 @@
 """Compute statistical description of datasets."""
 
-from typing import Any
+from typing import Any, Dict
 
 from multimethod import multimethod
 from tqdm import tqdm
@@ -28,5 +28,5 @@ def get_series_descriptions(
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
     pbar: tqdm,
-) -> dict[str, VarDescription]:
+) -> Dict[str, VarDescription]:
     raise NotImplementedError()

From 4b53baeaf580ad7814a8865ebadf60f1b99b12d3 Mon Sep 17 00:00:00 2001
From: Jan Cap <vorel007@gmail.com>
Date: Fri, 27 Sep 2024 13:44:36 +0200
Subject: [PATCH 08/10] fix: test_summarizer image

---
 tests/unit/test_summarizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_summarizer.py b/tests/unit/test_summarizer.py
index 4db83d69e..c12b0cb99 100644
--- a/tests/unit/test_summarizer.py
+++ b/tests/unit/test_summarizer.py
@@ -85,7 +85,7 @@ def test_summarizer_image(config: Settings):
         pps.summarize(
             config,
             pd.Series(
-                [os.path.abspath(base_path + r"../../../docsrc/assets/logo_header.png")]
+                [os.path.abspath(base_path + r"../../../docs/_static/img/cli.png")]
             ),
             "Image",
         )

From 6b1a592c8648fc4a3f633e02e1c05f721dbe69ef Mon Sep 17 00:00:00 2001
From: Jan Cap <vorel007@gmail.com>
Date: Tue, 15 Apr 2025 15:25:04 +0200
Subject: [PATCH 09/10] fix errors from master merge

---
 src/ydata_profiling/model/alerts.py           |  3 +-
 src/ydata_profiling/model/pandas/__init__.py  |  4 --
 .../model/pandas/correlations_pandas.py       |  2 +-
 .../pandas/describe_categorical_pandas.py     |  2 +-
 .../model/pandas/describe_date_pandas.py      | 17 ++++++-
 .../model/pandas/summary_pandas.py            |  4 +-
 .../spark/var_description/counts_spark.py     | 49 +++++++++++++------
 .../spark/var_description/default_spark.py    |  2 +-
 src/ydata_profiling/model/summarizer.py       |  8 ---
 src/ydata_profiling/model/summary.py          |  1 -
 tests/unit/test_describe.py                   |  2 +-
 tests/unit/test_summarizer.py                 |  8 +--
 12 files changed, 60 insertions(+), 42 deletions(-)

diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py
index d7b3dce23..698801389 100644
--- a/src/ydata_profiling/model/alerts.py
+++ b/src/ydata_profiling/model/alerts.py
@@ -9,7 +9,6 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.correlations import perform_check_correlation
-
 from ydata_profiling.model.var_description.default import VarDescription
 from ydata_profiling.utils.styles import get_alert_styles
 
@@ -690,7 +689,7 @@ def supported_alerts(summary: VarDescription) -> List[Alert]:
     return alerts
 
 
-def unsupported_alerts(summary: VarDescription) -> List[Alert]:
+def unsupported_alerts() -> List[Alert]:
     alerts: List[Alert] = [
         UnsupportedAlert(),
         RejectedAlert(),
diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py
index 381df8e86..c7d21070f 100644
--- a/src/ydata_profiling/model/pandas/__init__.py
+++ b/src/ydata_profiling/model/pandas/__init__.py
@@ -3,7 +3,6 @@
 # List of modules in the 'pandas' model that should be imported explicitly
 PANDAS_MODULES = [
     "correlations_pandas",
-    "describe_generic_pandas",
     "describe_boolean_pandas",
     "describe_categorical_pandas",
     "describe_url_pandas",
@@ -14,7 +13,6 @@
     "describe_path_pandas",
     "describe_image_pandas",
     "describe_date_pandas",
-    "describe_counts_pandas",
     "duplicates_pandas",
     "sample_pandas",
     "table_pandas",
@@ -35,7 +33,6 @@
 
 # Explicitly list exposed names for clarity
 __all__ = [
-    "pandas_describe_generic",
     "pandas_describe_boolean_1d",
     "pandas_describe_categorical_1d",
     "pandas_describe_url_1d",
@@ -46,7 +43,6 @@
     "pandas_describe_path_1d",
     "pandas_describe_image_1d",
     "pandas_describe_date_1d",
-    "pandas_describe_counts",
     "pandas_get_duplicates",
     "pandas_get_sample",
     "pandas_get_table_stats",
diff --git a/src/ydata_profiling/model/pandas/correlations_pandas.py b/src/ydata_profiling/model/pandas/correlations_pandas.py
index 6ee17b983..68a766aaf 100644
--- a/src/ydata_profiling/model/pandas/correlations_pandas.py
+++ b/src/ydata_profiling/model/pandas/correlations_pandas.py
@@ -2,7 +2,7 @@
 
 import itertools
 import warnings
-from typing import Callable, Dict, Optional
+from typing import Callable, Optional
 
 import numpy as np
 import pandas as pd
diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
index 576730189..325112f45 100644
--- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py
@@ -16,8 +16,8 @@
     series_handle_nulls,
     series_hashable,
 )
-from ydata_profiling.utils.information import DisplayInfo
 from ydata_profiling.model.var_description.default import VarDescription
+from ydata_profiling.utils.information import DisplayInfo
 
 
 def get_character_counts_vc(vc: pd.Series) -> pd.Series:
diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py
index e1ec721cf..169e7367a 100644
--- a/src/ydata_profiling/model/pandas/describe_date_pandas.py
+++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py
@@ -11,9 +11,16 @@
     series_handle_nulls,
     series_hashable,
 )
-
+from ydata_profiling.model.typeset_relations import is_pandas_1
 from ydata_profiling.model.var_description.default import VarDescription
 
+
+def to_datetime(series: pd.Series) -> pd.Series:
+    if is_pandas_1():
+        return pd.to_datetime(series, errors="coerce")
+    return pd.to_datetime(series, format="mixed", errors="coerce")
+
+
 @describe_date_1d.register
 @series_hashable
 @series_handle_nulls
@@ -31,6 +38,12 @@ def pandas_describe_date_1d(
         A dict containing calculated series description values.
     """
 
+    og_series = series.dropna()
+    series = to_datetime(og_series)
+    invalid_values = og_series[series.isna()]
+
+    series = series.dropna()
+
     if summary.value_counts_without_nan.empty:
         values = series.values
         summary.update(
@@ -60,7 +73,7 @@ def pandas_describe_date_1d(
         {
             "invalid_dates": invalid_values.nunique(),
             "n_invalid_dates": len(invalid_values),
-            "p_invalid_dates": len(invalid_values) / summary["n"],
+            "p_invalid_dates": len(invalid_values) / summary.n,
         }
     )
     return config, values, summary
diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index bc5a4faf2..6fbd39fa9 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -1,7 +1,7 @@
 """Compute statistical description of datasets."""
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
-from typing import Dict, Any, Tuple
+from typing import Any, Dict, Tuple
 
 import numpy as np
 import pandas as pd
@@ -10,8 +10,8 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.typeset import ProfilingTypeSet
-from ydata_profiling.utils.compat import optional_option_context
 from ydata_profiling.model.var_description.default import VarDescription
+from ydata_profiling.utils.compat import optional_option_context
 from ydata_profiling.utils.dataframe import sort_column_names
 
 BaseSummarizer: Any = "BaseSummarizer"  # type: ignore
diff --git a/src/ydata_profiling/model/spark/var_description/counts_spark.py b/src/ydata_profiling/model/spark/var_description/counts_spark.py
index b1ca199d7..264330a1a 100644
--- a/src/ydata_profiling/model/spark/var_description/counts_spark.py
+++ b/src/ydata_profiling/model/spark/var_description/counts_spark.py
@@ -1,14 +1,12 @@
 """
     Pyspark counts
 """
-from typing import Tuple
-
 import pandas as pd
 from pyspark.sql import DataFrame
 from pyspark.sql import functions as F
 
 from ydata_profiling.config import Settings
-from ydata_profiling.model.var_description.counts import VarCounts
+from ydata_profiling.model.var_description.default import VarCounts
 
 
 def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts:
@@ -21,16 +19,18 @@ def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts:
     length = series.count()
 
     # Count occurrences of each value
-    value_counts = series.groupBy(series.columns).count()
+    value_counts = series.groupBy(series.columns[0]).count()
 
     # Sort by count descending, persist the result
-    value_counts = value_counts.sort("count", ascending=False).persist()
+    value_counts = value_counts.orderBy(F.desc("count")).persist()
 
     # Sort by column value ascending (for frequency tables)
-    value_counts_index_sorted = value_counts.sort(series.columns[0], ascending=True)
+    value_counts_index_sorted = value_counts.orderBy(F.asc(series.columns[0]))
 
     # Count missing values
-    n_missing = value_counts.where(value_counts[series.columns[0]].isNull()).first()
+    n_missing = (
+        value_counts.filter(F.col(series.columns[0]).isNull()).select("count").first()
+    )
     n_missing = n_missing["count"] if n_missing else 0
 
     # Convert top 200 values to Pandas for frequency table display
@@ -41,13 +41,32 @@ def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts:
         .squeeze(axis="columns")
     )
 
-    value_counts_without_nan = (
-        value_counts.dropna()
-        .limit(200)
-        .toPandas()
-        .set_index(series.columns[0], drop=True)
-        .squeeze(axis="columns")
-    )
+    column = series.columns[0]
+
+    if series.dtypes[0][1] in ("int", "float", "bigint", "double"):
+        value_counts_no_nan = (
+            value_counts.filter(F.col(column).isNotNull())  # Exclude NULLs
+            .filter(~F.isnan(F.col(column)))  # Remove implicit NaNs (if numeric column)
+            .groupBy(column)  # Group by unique values
+            .count()  # Count occurrences
+            .orderBy(F.desc("count"))  # Sort in descending order
+            .limit(200)  # Limit for performance
+        )
+    else:
+        value_counts_no_nan = (
+            value_counts.filter(F.col(column).isNotNull())  # Exclude NULLs
+            .groupBy(column)  # Group by unique values
+            .count()  # Count occurrences
+            .orderBy(F.desc("count"))  # Sort in descending order
+            .limit(200)  # Limit for performance
+        )
+
+    # Convert to Pandas Series, forcing proper structure
+    if value_counts_no_nan.count() > 0:
+        pdf = value_counts_no_nan.toPandas().set_index(column)["count"]
+        value_counts_without_nan = pd.Series(pdf)  # Ensures it's always a Series
+    else:
+        value_counts_without_nan = pd.Series(dtype=int)  # Ensures an empty Series
 
     # @chanedwin
     memory_size = 0
@@ -55,7 +74,7 @@ def get_counts_spark(config: Settings, series: DataFrame) -> VarCounts:
     return VarCounts(
         hashable=False,
         value_counts_without_nan=value_counts_without_nan,
-        value_counts_index_sorted=value_counts_index_sorted,
+        value_counts_index_sorted=top_200_sorted,
         ordering=False,
         n_missing=n_missing,
         n=length,
diff --git a/src/ydata_profiling/model/spark/var_description/default_spark.py b/src/ydata_profiling/model/spark/var_description/default_spark.py
index ff264e63e..687be178e 100644
--- a/src/ydata_profiling/model/spark/var_description/default_spark.py
+++ b/src/ydata_profiling/model/spark/var_description/default_spark.py
@@ -19,7 +19,7 @@ def get_default_spark_description(
 
     n_unique = var_counts.value_counts.where("count == 1").count()
     is_unique = n_unique == count
-    p_unique = n_unique / count  if count > 0 else 0
+    p_unique = n_unique / count if count > 0 else 0
 
     init_dict.update(
         {
diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py
index 600d43029..bb296cf06 100644
--- a/src/ydata_profiling/model/summarizer.py
+++ b/src/ydata_profiling/model/summarizer.py
@@ -13,10 +13,8 @@
 from ydata_profiling.model.pandas import (
     pandas_describe_boolean_1d,
     pandas_describe_categorical_1d,
-    pandas_describe_counts,
     pandas_describe_date_1d,
     pandas_describe_file_1d,
-    pandas_describe_generic,
     pandas_describe_image_1d,
     pandas_describe_numeric_1d,
     pandas_describe_path_1d,
@@ -71,9 +69,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]:
             from ydata_profiling.model.spark import (
                 describe_boolean_1d_spark,
                 describe_categorical_1d_spark,
-                describe_counts_spark,
                 describe_date_1d_spark,
-                describe_generic_spark,
                 describe_numeric_1d_spark,
                 describe_supported_spark,
                 describe_text_1d_spark,
@@ -81,8 +77,6 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]:
 
             summary_map = {
                 "Unsupported": [
-                    describe_counts_spark,
-                    describe_generic_spark,
                     describe_supported_spark,
                 ],
                 "Numeric": [describe_numeric_1d_spark],
@@ -99,8 +93,6 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]:
         else:
             summary_map = {
                 "Unsupported": [
-                    pandas_describe_counts,
-                    pandas_describe_generic,
                     pandas_describe_supported,
                 ],
                 "Numeric": [pandas_describe_numeric_1d],
diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py
index 2547d64b1..b7f41c660 100644
--- a/src/ydata_profiling/model/summary.py
+++ b/src/ydata_profiling/model/summary.py
@@ -14,7 +14,6 @@
 from ydata_profiling.model.summarizer import BaseSummarizer
 from ydata_profiling.model.var_description.default import VarDescription
 
-
 spec = importlib.util.find_spec("pyspark")
 if spec is None:
     from typing import TypeVar  # noqa: E402
diff --git a/tests/unit/test_describe.py b/tests/unit/test_describe.py
index 1bf61aa0c..362ea7c0b 100644
--- a/tests/unit/test_describe.py
+++ b/tests/unit/test_describe.py
@@ -603,6 +603,6 @@ def test_decribe_series_type_schema(config, summarizer):
     result = describe(config, df, summarizer, typeset)
 
     assert result.variables["date"]["type"] == "DateTime"
-    assert result.variables["date"]["n_missing"] == 0
+    assert result.variables["date"].n_missing == 0
     assert result.variables["date"]["n_invalid_dates"] == 2
     assert result.variables["date"]["p_invalid_dates"] == 0.5
diff --git a/tests/unit/test_summarizer.py b/tests/unit/test_summarizer.py
index 880129b4b..631dab650 100644
--- a/tests/unit/test_summarizer.py
+++ b/tests/unit/test_summarizer.py
@@ -32,7 +32,7 @@ def test_summarizer_base_types(config: Settings):
 
 def test_summarizer_url(config: Settings):
     config.vars.url.active = True
-    pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))
+    pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config))
     _ = format_summary(
         pps.summarize(config, pd.Series(["https://www.example.com"]), "URL")
     )
@@ -40,7 +40,7 @@ def test_summarizer_url(config: Settings):
 
 def test_summarizer_path(config: Settings):
     config.vars.path.active = True
-    pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))
+    pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config))
     _ = format_summary(
         pps.summarize(
             config,
@@ -60,7 +60,7 @@ def test_summarizer_path(config: Settings):
 def test_summarizer_file(config: Settings):
     config.vars.path.active = True
     config.vars.file.active = True
-    pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))
+    pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config))
     _ = format_summary(
         pps.summarize(
             config,
@@ -80,7 +80,7 @@ def test_summarizer_image(config: Settings):
     config.vars.path.active = True
     config.vars.file.active = True
     config.vars.image.active = True
-    pps = PandasProfilingSummarizer(typeset=ProfilingTypeSet(config))
+    pps = ProfilingSummarizer(typeset=ProfilingTypeSet(config))
     _ = format_summary(
         pps.summarize(
             config,

From d27f8eaf9c55c350d11ed86fa3efccfea6183e76 Mon Sep 17 00:00:00 2001
From: Jan Cap <vorel007@gmail.com>
Date: Tue, 15 Apr 2025 15:42:50 +0200
Subject: [PATCH 10/10] replace typing Dict with dict

---
 src/ydata_profiling/model/pandas/summary_pandas.py | 4 ++--
 src/ydata_profiling/model/summary.py               | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index 6fbd39fa9..1dc169baa 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -1,7 +1,7 @@
 """Compute statistical description of datasets."""
 import multiprocessing
 from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Dict, Tuple
+from typing import Any, Tuple
 
 import numpy as np
 import pandas as pd
@@ -75,7 +75,7 @@ def pandas_get_series_descriptions(
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
     pbar: tqdm,
-) -> Dict[str, VarDescription]:
+) -> dict[str, VarDescription]:
     def describe_column(name: str, series: pd.Series) -> Tuple[str, VarDescription]:
         """Process a single series to get the column description."""
         pbar.set_postfix_str(f"Describe variable: {name}")
diff --git a/src/ydata_profiling/model/summary.py b/src/ydata_profiling/model/summary.py
index b7f41c660..1b02f0789 100644
--- a/src/ydata_profiling/model/summary.py
+++ b/src/ydata_profiling/model/summary.py
@@ -1,6 +1,6 @@
 """Compute statistical description of datasets."""
 import importlib
-from typing import Any, Dict
+from typing import Any
 
 import pandas as pd
 from tqdm import tqdm
@@ -58,7 +58,7 @@ def get_series_descriptions(
     summarizer: BaseSummarizer,
     typeset: VisionsTypeset,
     pbar: tqdm,
-) -> Dict[str, VarDescription]:
+) -> dict[str, VarDescription]:
     if isinstance(df, pd.DataFrame):
         return pandas_get_series_descriptions(config, df, summarizer, typeset, pbar)
     elif isinstance(df, sparkDataFrame):  # type: ignore