From c39e0b3407a4cf6dccc7c28314733b3191388ab7 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 15:59:52 +0800 Subject: [PATCH 01/11] feat: initial release --- Dockerfile | 21 +++ src/ydata_profiling/config.py | 142 ++++++++---------- src/ydata_profiling/model/handler.py | 20 +-- src/ydata_profiling/model/summarizer.py | 31 ++-- src/ydata_profiling/profile_report.py | 6 +- .../report/structure/__init__.py | 22 +++ .../report/structure/report.py | 2 +- 7 files changed, 130 insertions(+), 114 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..7bb15bf5d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.10-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY . . + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" wheel && \ + pip install --no-cache-dir . && \ + pip install --no-cache-dir "setuptools>=72.0.0,<80.0.0" && \ + pip install --no-cache-dir jupyter + +EXPOSE 8888 + +CMD ["jupyter", "notebook", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"] + + diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 09dbecdde..2bb934ed1 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -6,24 +6,7 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr - -def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: - """ - Recursive merge dictionaries. - - :param dict1: Base dictionary to merge. - :param dict2: Dictionary to merge on top of base dictionary. - :return: Merged dictionary - """ - for key, val in dict1.items(): - if isinstance(val, dict): - dict2_node = dict2.setdefault(key, {}) - _merge_dictionaries(val, dict2_node) - else: - if key not in dict2: - dict2[key] = val - - return dict2 +from ydata_profiling.utils.common import update class Dataset(BaseModel): @@ -355,60 +338,7 @@ class Config: html: Html = Html() notebook: Notebook = Notebook() - def update(self, updates: dict) -> "Settings": - update = _merge_dictionaries(self.dict(), updates) - return self.parse_obj(self.copy(update=update)) - - @staticmethod - def from_file(config_file: Union[Path, str]) -> "Settings": - """Create a Settings object from a yaml file. - - Args: - config_file: yaml file path - Returns: - Settings - """ - with open(config_file) as f: - data = yaml.safe_load(f) - - return Settings.parse_obj(data) - - -class SparkSettings(Settings): - """ - Setting class with the standard report configuration for Spark DataFrames - All the supported analysis are set to true - """ - - vars: Univariate = Univariate() - - vars.num.low_categorical_threshold = 0 - - infer_dtypes: bool = False - - correlations: Dict[str, Correlation] = { - "spearman": Correlation(key="spearman", calculate=True), - "pearson": Correlation(key="pearson", calculate=True), - } - - correlation_table: bool = True - - interactions: Interactions = Interactions() - interactions.continuous = False - - missing_diagrams: Dict[str, bool] = { - "bar": False, - "matrix": False, - "dendrogram": False, - "heatmap": False, - } - samples: Samples = Samples() - samples.tail = 0 - samples.random = 0 - - -class Config: - arg_groups: Dict[str, Any] = { + _arg_groups: Dict[str, Any] = { "sensitive": { "samples": None, "duplicates": None, @@ -475,8 +405,8 @@ class Config: @staticmethod def get_arg_groups(key: str) -> dict: - kwargs = Config.arg_groups[key] - shorthand_args, _ = Config.shorthands(kwargs, split=False) + kwargs = Settings._arg_groups[key] + shorthand_args, _ = Settings.shorthands(kwargs, split=False) return shorthand_args @staticmethod @@ -485,8 +415,8 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): - if value is None and key in Config._shorthands: - shorthand_args[key] = Config._shorthands[key] + if value is None and key in Settings._shorthands: + shorthand_args[key] = Settings._shorthands[key] if split: del kwargs[key] @@ -494,3 +424,63 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} + + def update(self, updates: dict) -> "Settings": + merged = update(self.dict().copy(), updates) + return self.parse_obj(self.copy(update=merged)) + + @staticmethod + def from_file(config_file: Union[Path, str]) -> "Settings": + """Create a Settings object from a yaml file. + + Args: + config_file: yaml file path + Returns: + Settings + """ + with open(config_file) as f: + data = yaml.safe_load(f) + + return Settings.parse_obj(data) + + +class SparkSettings(Settings): + """ + Setting class with the standard report configuration for Spark DataFrames + All the supported analysis are set to true + """ + + vars: Univariate = Univariate() + + vars.num.low_categorical_threshold = 0 + + infer_dtypes: bool = False + + correlations: Dict[str, Correlation] = { + "spearman": Correlation(key="spearman", calculate=True), + "pearson": Correlation(key="pearson", calculate=True), + } + + correlation_table: bool = True + + interactions: Interactions = Interactions() + interactions.continuous = False + + missing_diagrams: Dict[str, bool] = { + "bar": False, + "matrix": False, + "dendrogram": False, + "heatmap": False, + } + samples: Samples = Samples() + samples.tail = 0 + samples.random = 0 + + +class Config(Settings): + """ + Deprecated: Use Settings instead. + Backward compatibility alias for Settings class. + """ + + pass diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 992c1840c..e983ce2a1 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,22 +60,6 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary -def get_render_map() -> Dict[str, Callable]: - import ydata_profiling.report.structure.variables as render_algorithms +from ydata_profiling.report.structure import get_render_map - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map +__all__ = ["compose", "Handler", "get_render_map"] diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..a57ed1c97 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,9 +50,8 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """A summarizer supporting both Pandas and Spark DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() @@ -65,7 +64,15 @@ def summary_map(self) -> Dict[str, List[Callable]]: return self._summary_map def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map for Pandas summarization.""" + """Creates the summary map based on the backend.""" + common_map = { + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], + } + if self.use_spark: from ydata_profiling.model.spark import ( describe_boolean_1d_spark, @@ -78,7 +85,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: describe_text_1d_spark, ) - summary_map = { + base_map = { "Unsupported": [ describe_counts_spark, describe_generic_spark, @@ -89,14 +96,9 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [describe_text_1d_spark], "Categorical": [describe_categorical_1d_spark], "Boolean": [describe_boolean_1d_spark], - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], } else: - summary_map = { + base_map = { "Unsupported": [ pandas_describe_counts, pandas_describe_generic, @@ -107,13 +109,10 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [pandas_describe_text_1d], "Categorical": [pandas_describe_categorical_1d], "Boolean": [pandas_describe_boolean_1d], - "URL": [pandas_describe_url_1d], - "Path": [pandas_describe_path_1d], - "File": [pandas_describe_file_1d], - "Image": [pandas_describe_image_1d], - "TimeSeries": [pandas_describe_timeseries_1d], } - return summary_map + + base_map.update(common_map) + return base_map def format_summary(summary: Union[BaseDescription, dict]) -> dict: diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index a7d6d9134..916b4681e 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -25,7 +25,7 @@ from typeguard import typechecked from visions import VisionsTypeset -from ydata_profiling.config import Config, Settings, SparkSettings +from ydata_profiling.config import Settings, SparkSettings from ydata_profiling.expectations_report import ExpectationsReport from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType @@ -132,11 +132,11 @@ def __init__( cfg = Settings() for condition, key in groups: if condition: - cfg = cfg.update(Config.get_arg_groups(key)) + cfg = cfg.update(Settings.get_arg_groups(key)) report_config = report_config.update(cfg.dict(exclude_defaults=True)) if len(kwargs) > 0: - shorthands, kwargs = Config.shorthands(kwargs) + shorthands, kwargs = Settings.shorthands(kwargs) report_config = report_config.update( Settings().update(shorthands).dict(exclude_defaults=True) ) diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index 8324d248d..a2efd029a 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -1 +1,23 @@ """Data structure for the report""" +from typing import Callable, Dict + + +def get_render_map() -> Dict[str, Callable]: + import ydata_profiling.report.structure.variables as render_algorithms + + render_map = { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + return render_map diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 482b410b2..b64a41aae 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.model.handler import get_render_map +from ydata_profiling.report.structure import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, From 27a314be64b586f58de6a2956d456e2a3d03da1f Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 17:14:23 +0800 Subject: [PATCH 02/11] feat: initial release --- src/ydata_profiling/config.py | 142 +++++++++++++----------- src/ydata_profiling/model/handler.py | 2 - src/ydata_profiling/model/summarizer.py | 31 +++--- src/ydata_profiling/profile_report.py | 6 +- 4 files changed, 95 insertions(+), 86 deletions(-) diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 2bb934ed1..09dbecdde 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -6,7 +6,24 @@ import yaml from pydantic.v1 import BaseModel, BaseSettings, Field, PrivateAttr -from ydata_profiling.utils.common import update + +def _merge_dictionaries(dict1: dict, dict2: dict) -> dict: + """ + Recursive merge dictionaries. + + :param dict1: Base dictionary to merge. + :param dict2: Dictionary to merge on top of base dictionary. + :return: Merged dictionary + """ + for key, val in dict1.items(): + if isinstance(val, dict): + dict2_node = dict2.setdefault(key, {}) + _merge_dictionaries(val, dict2_node) + else: + if key not in dict2: + dict2[key] = val + + return dict2 class Dataset(BaseModel): @@ -338,7 +355,60 @@ class Config: html: Html = Html() notebook: Notebook = Notebook() - _arg_groups: Dict[str, Any] = { + def update(self, updates: dict) -> "Settings": + update = _merge_dictionaries(self.dict(), updates) + return self.parse_obj(self.copy(update=update)) + + @staticmethod + def from_file(config_file: Union[Path, str]) -> "Settings": + """Create a Settings object from a yaml file. + + Args: + config_file: yaml file path + Returns: + Settings + """ + with open(config_file) as f: + data = yaml.safe_load(f) + + return Settings.parse_obj(data) + + +class SparkSettings(Settings): + """ + Setting class with the standard report configuration for Spark DataFrames + All the supported analysis are set to true + """ + + vars: Univariate = Univariate() + + vars.num.low_categorical_threshold = 0 + + infer_dtypes: bool = False + + correlations: Dict[str, Correlation] = { + "spearman": Correlation(key="spearman", calculate=True), + "pearson": Correlation(key="pearson", calculate=True), + } + + correlation_table: bool = True + + interactions: Interactions = Interactions() + interactions.continuous = False + + missing_diagrams: Dict[str, bool] = { + "bar": False, + "matrix": False, + "dendrogram": False, + "heatmap": False, + } + samples: Samples = Samples() + samples.tail = 0 + samples.random = 0 + + +class Config: + arg_groups: Dict[str, Any] = { "sensitive": { "samples": None, "duplicates": None, @@ -405,8 +475,8 @@ class Config: @staticmethod def get_arg_groups(key: str) -> dict: - kwargs = Settings._arg_groups[key] - shorthand_args, _ = Settings.shorthands(kwargs, split=False) + kwargs = Config.arg_groups[key] + shorthand_args, _ = Config.shorthands(kwargs, split=False) return shorthand_args @staticmethod @@ -415,8 +485,8 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: if not split: shorthand_args = kwargs for key, value in list(kwargs.items()): - if value is None and key in Settings._shorthands: - shorthand_args[key] = Settings._shorthands[key] + if value is None and key in Config._shorthands: + shorthand_args[key] = Config._shorthands[key] if split: del kwargs[key] @@ -424,63 +494,3 @@ def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]: return shorthand_args, kwargs else: return shorthand_args, {} - - def update(self, updates: dict) -> "Settings": - merged = update(self.dict().copy(), updates) - return self.parse_obj(self.copy(update=merged)) - - @staticmethod - def from_file(config_file: Union[Path, str]) -> "Settings": - """Create a Settings object from a yaml file. - - Args: - config_file: yaml file path - Returns: - Settings - """ - with open(config_file) as f: - data = yaml.safe_load(f) - - return Settings.parse_obj(data) - - -class SparkSettings(Settings): - """ - Setting class with the standard report configuration for Spark DataFrames - All the supported analysis are set to true - """ - - vars: Univariate = Univariate() - - vars.num.low_categorical_threshold = 0 - - infer_dtypes: bool = False - - correlations: Dict[str, Correlation] = { - "spearman": Correlation(key="spearman", calculate=True), - "pearson": Correlation(key="pearson", calculate=True), - } - - correlation_table: bool = True - - interactions: Interactions = Interactions() - interactions.continuous = False - - missing_diagrams: Dict[str, bool] = { - "bar": False, - "matrix": False, - "dendrogram": False, - "heatmap": False, - } - samples: Samples = Samples() - samples.tail = 0 - samples.random = 0 - - -class Config(Settings): - """ - Deprecated: Use Settings instead. - Backward compatibility alias for Settings class. - """ - - pass diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index e983ce2a1..bcca12a1c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,6 +60,4 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary -from ydata_profiling.report.structure import get_render_map -__all__ = ["compose", "Handler", "get_render_map"] diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index a57ed1c97..d733a7d36 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,8 +50,9 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) +# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer supporting both Pandas and Spark DataFrames.""" + """A summarizer for Pandas DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() @@ -64,15 +65,7 @@ def summary_map(self) -> Dict[str, List[Callable]]: return self._summary_map def _create_summary_map(self) -> Dict[str, List[Callable]]: - """Creates the summary map based on the backend.""" - common_map = { - "URL": [describe_url_1d], - "Path": [describe_path_1d], - "File": [describe_file_1d], - "Image": [describe_image_1d], - "TimeSeries": [describe_timeseries_1d], - } - + """Creates the summary map for Pandas summarization.""" if self.use_spark: from ydata_profiling.model.spark import ( describe_boolean_1d_spark, @@ -85,7 +78,7 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: describe_text_1d_spark, ) - base_map = { + summary_map = { "Unsupported": [ describe_counts_spark, describe_generic_spark, @@ -96,9 +89,14 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [describe_text_1d_spark], "Categorical": [describe_categorical_1d_spark], "Boolean": [describe_boolean_1d_spark], + "URL": [describe_url_1d], + "Path": [describe_path_1d], + "File": [describe_file_1d], + "Image": [describe_image_1d], + "TimeSeries": [describe_timeseries_1d], } else: - base_map = { + summary_map = { "Unsupported": [ pandas_describe_counts, pandas_describe_generic, @@ -109,10 +107,13 @@ def _create_summary_map(self) -> Dict[str, List[Callable]]: "Text": [pandas_describe_text_1d], "Categorical": [pandas_describe_categorical_1d], "Boolean": [pandas_describe_boolean_1d], + "URL": [pandas_describe_url_1d], + "Path": [pandas_describe_path_1d], + "File": [pandas_describe_file_1d], + "Image": [pandas_describe_image_1d], + "TimeSeries": [pandas_describe_timeseries_1d], } - - base_map.update(common_map) - return base_map + return summary_map def format_summary(summary: Union[BaseDescription, dict]) -> dict: diff --git a/src/ydata_profiling/profile_report.py b/src/ydata_profiling/profile_report.py index 916b4681e..a7d6d9134 100644 --- a/src/ydata_profiling/profile_report.py +++ b/src/ydata_profiling/profile_report.py @@ -25,7 +25,7 @@ from typeguard import typechecked from visions import VisionsTypeset -from ydata_profiling.config import Settings, SparkSettings +from ydata_profiling.config import Config, Settings, SparkSettings from ydata_profiling.expectations_report import ExpectationsReport from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType @@ -132,11 +132,11 @@ def __init__( cfg = Settings() for condition, key in groups: if condition: - cfg = cfg.update(Settings.get_arg_groups(key)) + cfg = cfg.update(Config.get_arg_groups(key)) report_config = report_config.update(cfg.dict(exclude_defaults=True)) if len(kwargs) > 0: - shorthands, kwargs = Settings.shorthands(kwargs) + shorthands, kwargs = Config.shorthands(kwargs) report_config = report_config.update( Settings().update(shorthands).dict(exclude_defaults=True) ) From 8d8f6b71b5f46178749d0b100ba9bc8cefbfb261 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 17:39:22 +0800 Subject: [PATCH 03/11] feat: initial release --- src/ydata_profiling/model/handler.py | 123 +++++++++--------- .../report/structure/__init__.py | 8 ++ src/ydata_profiling/utils/backend.py | 2 +- 3 files changed, 69 insertions(+), 64 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index bcca12a1c..aa36a811c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,63 +1,60 @@ -""" - Auxiliary handler methods for data summary extraction -""" -from typing import Any, Callable, Dict, List, Sequence - -import networkx as nx -from visions import VisionsTypeset - - -def compose(functions: Sequence[Callable]) -> Callable: - """ - Compose a sequence of functions. - - :param functions: sequence of functions - :return: combined function applying all functions in order. - """ - - def composed_function(*args) -> List[Any]: - result = args # Start with the input arguments - for func in functions: - result = func(*result) if isinstance(result, tuple) else func(result) - return result # type: ignore - - return composed_function # type: ignore - - -class Handler: - """A generic handler - - Allows any custom mapping between data types and functions - """ - - def __init__( - self, - mapping: Dict[str, List[Callable]], - typeset: VisionsTypeset, - *args, - **kwargs - ): - self.mapping = mapping - self.typeset = typeset - self._complete_dag() - - def _complete_dag(self) -> None: - for from_type, to_type in nx.topological_sort( - nx.line_graph(self.typeset.base_graph) - ): - self.mapping[str(to_type)] = ( - self.mapping[str(from_type)] + self.mapping[str(to_type)] - ) - - def handle(self, dtype: str, *args, **kwargs) -> dict: - """ - Returns: - object: a tuple containing the config, the dataset series and the summary extracted - """ - funcs = self.mapping.get(dtype, []) - op = compose(funcs) - summary = op(*args)[-1] - return summary - - - +""" + Auxiliary handler methods for data summary extraction +""" +from typing import Any, Callable, Dict, List, Sequence + +import networkx as nx +from visions import VisionsTypeset + + +def compose(functions: Sequence[Callable]) -> Callable: + """ + Compose a sequence of functions. + + :param functions: sequence of functions + :return: combined function applying all functions in order. + """ + + def composed_function(*args) -> List[Any]: + result = args # Start with the input arguments + for func in functions: + result = func(*result) if isinstance(result, tuple) else func(result) + return result # type: ignore + + return composed_function # type: ignore + + +class Handler: + """A generic handler + + Allows any custom mapping between data types and functions + """ + + def __init__( + self, + mapping: Dict[str, List[Callable]], + typeset: VisionsTypeset, + *args, + **kwargs + ): + self.mapping = mapping + self.typeset = typeset + self._complete_dag() + + def _complete_dag(self) -> None: + for from_type, to_type in nx.topological_sort( + nx.line_graph(self.typeset.base_graph) + ): + self.mapping[str(to_type)] = ( + self.mapping[str(from_type)] + self.mapping[str(to_type)] + ) + + def handle(self, dtype: str, *args, **kwargs) -> dict: + """ + Returns: + object: a tuple containing the config, the dataset series and the summary extracted + """ + funcs = self.mapping.get(dtype, []) + op = compose(funcs) + summary = op(*args)[-1] + return summary diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index a2efd029a..7ba9c10c9 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -3,6 +3,14 @@ def get_render_map() -> Dict[str, Callable]: + """Get the mapping of variable types to their render functions. + + This function was moved from model.handler to report.structure to eliminate + the reverse dependency from model layer to report layer. + + Returns: + Dictionary mapping type names to render functions. + """ import ydata_profiling.report.structure.variables as render_algorithms render_map = { diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index e99d91c11..dd12f9fd3 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,5 +1,5 @@ """ - File with a function to check the backend being used +Backend detection utilities for pandas and spark. """ import importlib From 307cba98bfab9196a8de5355022f3919539e4520 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 18:09:37 +0800 Subject: [PATCH 04/11] feat: initial release --- src/ydata_profiling/model/handler.py | 123 +++++++++--------- src/ydata_profiling/model/summarizer.py | 5 +- .../report/structure/__init__.py | 30 ----- .../report/structure/report.py | 2 +- .../report/structure/variables/__init__.py | 23 ++++ src/ydata_profiling/utils/backend.py | 2 +- 6 files changed, 90 insertions(+), 95 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index aa36a811c..bcca12a1c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,60 +1,63 @@ -""" - Auxiliary handler methods for data summary extraction -""" -from typing import Any, Callable, Dict, List, Sequence - -import networkx as nx -from visions import VisionsTypeset - - -def compose(functions: Sequence[Callable]) -> Callable: - """ - Compose a sequence of functions. - - :param functions: sequence of functions - :return: combined function applying all functions in order. - """ - - def composed_function(*args) -> List[Any]: - result = args # Start with the input arguments - for func in functions: - result = func(*result) if isinstance(result, tuple) else func(result) - return result # type: ignore - - return composed_function # type: ignore - - -class Handler: - """A generic handler - - Allows any custom mapping between data types and functions - """ - - def __init__( - self, - mapping: Dict[str, List[Callable]], - typeset: VisionsTypeset, - *args, - **kwargs - ): - self.mapping = mapping - self.typeset = typeset - self._complete_dag() - - def _complete_dag(self) -> None: - for from_type, to_type in nx.topological_sort( - nx.line_graph(self.typeset.base_graph) - ): - self.mapping[str(to_type)] = ( - self.mapping[str(from_type)] + self.mapping[str(to_type)] - ) - - def handle(self, dtype: str, *args, **kwargs) -> dict: - """ - Returns: - object: a tuple containing the config, the dataset series and the summary extracted - """ - funcs = self.mapping.get(dtype, []) - op = compose(funcs) - summary = op(*args)[-1] - return summary +""" + Auxiliary handler methods for data summary extraction +""" +from typing import Any, Callable, Dict, List, Sequence + +import networkx as nx +from visions import VisionsTypeset + + +def compose(functions: Sequence[Callable]) -> Callable: + """ + Compose a sequence of functions. + + :param functions: sequence of functions + :return: combined function applying all functions in order. + """ + + def composed_function(*args) -> List[Any]: + result = args # Start with the input arguments + for func in functions: + result = func(*result) if isinstance(result, tuple) else func(result) + return result # type: ignore + + return composed_function # type: ignore + + +class Handler: + """A generic handler + + Allows any custom mapping between data types and functions + """ + + def __init__( + self, + mapping: Dict[str, List[Callable]], + typeset: VisionsTypeset, + *args, + **kwargs + ): + self.mapping = mapping + self.typeset = typeset + self._complete_dag() + + def _complete_dag(self) -> None: + for from_type, to_type in nx.topological_sort( + nx.line_graph(self.typeset.base_graph) + ): + self.mapping[str(to_type)] = ( + self.mapping[str(from_type)] + self.mapping[str(to_type)] + ) + + def handle(self, dtype: str, *args, **kwargs) -> dict: + """ + Returns: + object: a tuple containing the config, the dataset series and the summary extracted + """ + funcs = self.mapping.get(dtype, []) + op = compose(funcs) + summary = op(*args)[-1] + return summary + + + diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..54d839915 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -27,7 +27,7 @@ from ydata_profiling.model.pandas.describe_supported_pandas import ( pandas_describe_supported, ) -from ydata_profiling.model.summary_algorithms import ( # Check what is this method used for +from ydata_profiling.model.summary_algorithms import ( describe_file_1d, describe_image_1d, describe_path_1d, @@ -50,9 +50,8 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """A summarizer supporting both Pandas and Spark DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/report/structure/__init__.py b/src/ydata_profiling/report/structure/__init__.py index 7ba9c10c9..8324d248d 100644 --- a/src/ydata_profiling/report/structure/__init__.py +++ b/src/ydata_profiling/report/structure/__init__.py @@ -1,31 +1 @@ """Data structure for the report""" -from typing import Callable, Dict - - -def get_render_map() -> Dict[str, Callable]: - """Get the mapping of variable types to their render functions. - - This function was moved from model.handler to report.structure to eliminate - the reverse dependency from model layer to report layer. - - Returns: - Dictionary mapping type names to render functions. - """ - import ydata_profiling.report.structure.variables as render_algorithms - - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index b64a41aae..0f027f23f 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.report.structure import get_render_map +from ydata_profiling.report.structure.variables import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, diff --git a/src/ydata_profiling/report/structure/variables/__init__.py b/src/ydata_profiling/report/structure/variables/__init__.py index 64f1d6d54..a8aa301b5 100644 --- a/src/ydata_profiling/report/structure/variables/__init__.py +++ b/src/ydata_profiling/report/structure/variables/__init__.py @@ -1,3 +1,5 @@ +from typing import Callable, Dict + from ydata_profiling.report.structure.variables.render_boolean import render_boolean from ydata_profiling.report.structure.variables.render_categorical import ( render_categorical, @@ -17,6 +19,26 @@ ) from ydata_profiling.report.structure.variables.render_url import render_url + +def get_render_map() -> Dict[str, Callable]: + render_map = { + "Boolean": render_boolean, + "Numeric": render_real, + "Complex": render_complex, + "Text": render_text, + "DateTime": render_date, + "Categorical": render_categorical, + "URL": render_url, + "Path": render_path, + "File": render_file, + "Image": render_image, + "Unsupported": render_generic, + "TimeSeries": render_timeseries, + } + + return render_map + + __all__ = [ "render_boolean", "render_categorical", @@ -32,4 +54,5 @@ "render_text", "render_timeseries", "render_url", + "get_render_map", ] diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index dd12f9fd3..e99d91c11 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,5 +1,5 @@ """ -Backend detection utilities for pandas and spark. + File with a function to check the backend being used """ import importlib From 1e2fa10eaf7a951acea663fa270784244ac18404 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 19:27:37 +0800 Subject: [PATCH 05/11] feat: initial release --- src/ydata_profiling/model/handler.py | 18 +++++++++++++++ src/ydata_profiling/model/summarizer.py | 5 ++-- .../report/structure/report.py | 2 +- .../report/structure/variables/__init__.py | 23 ------------------- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index bcca12a1c..992c1840c 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -60,4 +60,22 @@ def handle(self, dtype: str, *args, **kwargs) -> dict: return summary +def get_render_map() -> Dict[str, Callable]: + import ydata_profiling.report.structure.variables as render_algorithms + render_map = { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + return render_map diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index 54d839915..d733a7d36 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -27,7 +27,7 @@ from ydata_profiling.model.pandas.describe_supported_pandas import ( pandas_describe_supported, ) -from ydata_profiling.model.summary_algorithms import ( +from ydata_profiling.model.summary_algorithms import ( # Check what is this method used for describe_file_1d, describe_image_1d, describe_path_1d, @@ -50,8 +50,9 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) +# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer supporting both Pandas and Spark DataFrames.""" + """A summarizer for Pandas DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/report/structure/report.py b/src/ydata_profiling/report/structure/report.py index 0f027f23f..482b410b2 100644 --- a/src/ydata_profiling/report/structure/report.py +++ b/src/ydata_profiling/report/structure/report.py @@ -7,7 +7,7 @@ from ydata_profiling.config import Settings from ydata_profiling.model import BaseDescription from ydata_profiling.model.alerts import AlertType -from ydata_profiling.report.structure.variables import get_render_map +from ydata_profiling.model.handler import get_render_map from ydata_profiling.report.presentation.core import ( HTML, Collapse, diff --git a/src/ydata_profiling/report/structure/variables/__init__.py b/src/ydata_profiling/report/structure/variables/__init__.py index a8aa301b5..64f1d6d54 100644 --- a/src/ydata_profiling/report/structure/variables/__init__.py +++ b/src/ydata_profiling/report/structure/variables/__init__.py @@ -1,5 +1,3 @@ -from typing import Callable, Dict - from ydata_profiling.report.structure.variables.render_boolean import render_boolean from ydata_profiling.report.structure.variables.render_categorical import ( render_categorical, @@ -19,26 +17,6 @@ ) from ydata_profiling.report.structure.variables.render_url import render_url - -def get_render_map() -> Dict[str, Callable]: - render_map = { - "Boolean": render_boolean, - "Numeric": render_real, - "Complex": render_complex, - "Text": render_text, - "DateTime": render_date, - "Categorical": render_categorical, - "URL": render_url, - "Path": render_path, - "File": render_file, - "Image": render_image, - "Unsupported": render_generic, - "TimeSeries": render_timeseries, - } - - return render_map - - __all__ = [ "render_boolean", "render_categorical", @@ -54,5 +32,4 @@ def get_render_map() -> Dict[str, Callable]: "render_text", "render_timeseries", "render_url", - "get_render_map", ] From 3f158155243784086e17dd0d59e60dc086dc0844 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 20:03:28 +0800 Subject: [PATCH 06/11] feat: initial release --- src/ydata_profiling/model/alerts.py | 18 +++++++++--------- .../model/summary_algorithms.py | 15 --------------- .../structure/variables/render_common.py | 1 - 3 files changed, 9 insertions(+), 25 deletions(-) diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index 1b16d27a0..611b5de85 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -12,8 +12,8 @@ from ydata_profiling.utils.styles import get_alert_styles -def fmt_percent(value: float, edge_cases: bool = True) -> str: - """Format a ratio as a percentage. +def _fmt_percent(value: float, edge_cases: bool = True) -> str: + """Format a ratio as a percentage (internal copy to avoid circular imports). Args: edge_cases: Check for edge cases? @@ -209,7 +209,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows" + return f"Dataset has {self.values['n_duplicates']} ({_fmt_percent(self.values['p_duplicates'])}) duplicate rows" else: return "Dataset has no duplicated rows" @@ -231,7 +231,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"Dataset has {self.values['n_near_dups']} ({fmt_percent(self.values['p_near_dups'])}) near duplicate rows" + return f"Dataset has {self.values['n_near_dups']} ({_fmt_percent(self.values['p_near_dups'])}) near duplicate rows" else: return "Dataset has no near duplicated rows" @@ -272,7 +272,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_distinct']:} ({fmt_percent(self.values['p_distinct'])}) distinct values" + return f"[{self.column_name}] has {self.values['n_distinct']:} ({_fmt_percent(self.values['p_distinct'])}) distinct values" else: return f"[{self.column_name}] has a high cardinality" @@ -294,7 +294,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {fmt_percent(self.values['p_fuzzy_vals'])} per category" + return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {_fmt_percent(self.values['p_fuzzy_vals'])} per category" else: return f"[{self.column_name}] no dirty categories values." @@ -365,7 +365,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_infinite']} ({fmt_percent(self.values['p_infinite'])}) infinite values" + return f"[{self.column_name}] has {self.values['n_infinite']} ({_fmt_percent(self.values['p_infinite'])}) infinite values" else: return f"[{self.column_name}] has infinite values" @@ -387,7 +387,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] {self.values['n_missing']} ({fmt_percent(self.values['p_missing'])}) missing values" + return f"[{self.column_name}] {self.values['n_missing']} ({_fmt_percent(self.values['p_missing'])}) missing values" else: return f"[{self.column_name}] has missing values" @@ -541,7 +541,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_zeros']} ({fmt_percent(self.values['p_zeros'])}) zeros" + return f"[{self.column_name}] has {self.values['n_zeros']} ({_fmt_percent(self.values['p_zeros'])}) zeros" else: return f"[{self.column_name}] has predominantly zeros" diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py index 9c3e5ef38..49569605b 100644 --- a/src/ydata_profiling/model/summary_algorithms.py +++ b/src/ydata_profiling/model/summary_algorithms.py @@ -11,21 +11,6 @@ T = TypeVar("T") -def func_nullable_series_contains(fn: Callable) -> Callable: - @functools.wraps(fn) - def inner( - config: Settings, series: pd.Series, state: dict, *args, **kwargs - ) -> bool: - if series.hasnans: - series = series.dropna() - if series.empty: - return False - - return fn(config, series, state, *args, **kwargs) - - return inner - - def safe_histogram( values: np.ndarray, bins: Union[int, str, np.ndarray] = "auto", diff --git a/src/ydata_profiling/report/structure/variables/render_common.py b/src/ydata_profiling/report/structure/variables/render_common.py index aef8de357..e90935640 100644 --- a/src/ydata_profiling/report/structure/variables/render_common.py +++ b/src/ydata_profiling/report/structure/variables/render_common.py @@ -10,7 +10,6 @@ def render_common(config: Settings, summary: dict) -> dict: n_freq_table_max = config.n_freq_table_max template_variables = { - # TODO: with nan "freq_table_rows": freq_table( freqtable=summary["value_counts_without_nan"], n=summary["n"], From d83e1a17d23c4d1bdee001b326fdaac1a4707548 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 20:42:38 +0800 Subject: [PATCH 07/11] feat: initial release --- src/ydata_profiling/model/alerts.py | 18 ++++----- src/ydata_profiling/model/correlations.py | 28 +++---------- src/ydata_profiling/model/missing.py | 25 +++--------- .../model/pandas/table_pandas.py | 28 ++----------- .../model/spark/table_spark.py | 39 ++----------------- .../model/spark/timeseries_index_spark.py | 2 +- src/ydata_profiling/model/summarizer.py | 3 +- .../model/summary_algorithms.py | 15 +++++++ src/ydata_profiling/model/table.py | 37 ++++++++++++++++++ .../structure/variables/render_common.py | 1 + src/ydata_profiling/utils/backend.py | 34 +++++++++++++++- 11 files changed, 115 insertions(+), 115 deletions(-) diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py index 611b5de85..1b16d27a0 100644 --- a/src/ydata_profiling/model/alerts.py +++ b/src/ydata_profiling/model/alerts.py @@ -12,8 +12,8 @@ from ydata_profiling.utils.styles import get_alert_styles -def _fmt_percent(value: float, edge_cases: bool = True) -> str: - """Format a ratio as a percentage (internal copy to avoid circular imports). +def fmt_percent(value: float, edge_cases: bool = True) -> str: + """Format a ratio as a percentage. Args: edge_cases: Check for edge cases? @@ -209,7 +209,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"Dataset has {self.values['n_duplicates']} ({_fmt_percent(self.values['p_duplicates'])}) duplicate rows" + return f"Dataset has {self.values['n_duplicates']} ({fmt_percent(self.values['p_duplicates'])}) duplicate rows" else: return "Dataset has no duplicated rows" @@ -231,7 +231,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"Dataset has {self.values['n_near_dups']} ({_fmt_percent(self.values['p_near_dups'])}) near duplicate rows" + return f"Dataset has {self.values['n_near_dups']} ({fmt_percent(self.values['p_near_dups'])}) near duplicate rows" else: return "Dataset has no near duplicated rows" @@ -272,7 +272,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_distinct']:} ({_fmt_percent(self.values['p_distinct'])}) distinct values" + return f"[{self.column_name}] has {self.values['n_distinct']:} ({fmt_percent(self.values['p_distinct'])}) distinct values" else: return f"[{self.column_name}] has a high cardinality" @@ -294,7 +294,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {_fmt_percent(self.values['p_fuzzy_vals'])} per category" + return f"[{self.column_name}] has {self.values['n_fuzzy_vals']} fuzzy values: {fmt_percent(self.values['p_fuzzy_vals'])} per category" else: return f"[{self.column_name}] no dirty categories values." @@ -365,7 +365,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_infinite']} ({_fmt_percent(self.values['p_infinite'])}) infinite values" + return f"[{self.column_name}] has {self.values['n_infinite']} ({fmt_percent(self.values['p_infinite'])}) infinite values" else: return f"[{self.column_name}] has infinite values" @@ -387,7 +387,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] {self.values['n_missing']} ({_fmt_percent(self.values['p_missing'])}) missing values" + return f"[{self.column_name}] {self.values['n_missing']} ({fmt_percent(self.values['p_missing'])}) missing values" else: return f"[{self.column_name}] has missing values" @@ -541,7 +541,7 @@ def __init__( def _get_description(self) -> str: if self.values is not None: - return f"[{self.column_name}] has {self.values['n_zeros']} ({_fmt_percent(self.values['p_zeros'])}) zeros" + return f"[{self.column_name}] has {self.values['n_zeros']} ({fmt_percent(self.values['p_zeros'])}) zeros" else: return f"[{self.column_name}] has predominantly zeros" diff --git a/src/ydata_profiling/model/correlations.py b/src/ydata_profiling/model/correlations.py index 2bbaa1112..25e2e13c4 100644 --- a/src/ydata_profiling/model/correlations.py +++ b/src/ydata_profiling/model/correlations.py @@ -3,12 +3,13 @@ """Correlations between variables.""" import warnings -from typing import Dict, List, Optional, Sized, no_type_check +from typing import Dict, List, Optional, Sized import numpy as np import pandas as pd from ydata_profiling.config import Settings +from ydata_profiling.utils.backend import BaseBackend try: from pandas.core.base import DataError @@ -16,30 +17,11 @@ from pandas.errors import DataError -class CorrelationBackend: +class CorrelationBackend(BaseBackend): """Helper class to select and cache the appropriate correlation backend (Pandas or Spark).""" - @no_type_check - def __init__(self, df: Sized): - """Determine backend once and store it for all correlation computations.""" - if isinstance(df, pd.DataFrame): - from ydata_profiling.model.pandas import ( - correlations_pandas as correlation_backend, # type: ignore - ) - else: - from ydata_profiling.model.spark import ( - correlations_spark as correlation_backend, # type: ignore - ) - - self.backend = correlation_backend - - def get_method(self, method_name: str): # noqa: ANN201 - """Retrieve the appropriate correlation method class from the backend.""" - if hasattr(self.backend, method_name): - return getattr(self.backend, method_name) - raise AttributeError( - f"Correlation method '{method_name}' is not available in the backend." - ) + _pandas_module = "ydata_profiling.model.pandas.correlations_pandas" + _spark_module = "ydata_profiling.model.spark.correlations_spark" class Correlation: diff --git a/src/ydata_profiling/model/missing.py b/src/ydata_profiling/model/missing.py index 46ec2dee3..aa14cc425 100644 --- a/src/ydata_profiling/model/missing.py +++ b/src/ydata_profiling/model/missing.py @@ -1,32 +1,17 @@ -import importlib import warnings -from typing import Any, Callable, Dict, Optional, Sized +from typing import Any, Dict, Optional, Sized import pandas as pd from ydata_profiling.config import Settings +from ydata_profiling.utils.backend import BaseBackend -class MissingDataBackend: +class MissingDataBackend(BaseBackend): """Helper class to select and cache the appropriate missing-data backend (Pandas or Spark).""" - def __init__(self, df: Sized): - """Determine backend once and store it for all missing-data computations.""" - if isinstance(df, pd.DataFrame): - self.backend_module = "ydata_profiling.model.pandas.missing_pandas" - else: - self.backend_module = "ydata_profiling.model.spark.missing_spark" - - self.module = importlib.import_module(self.backend_module) - - def get_method(self, method_name: str) -> Callable: - """Retrieve the appropriate missing-data function from the backend module.""" - try: - return getattr(self.module, method_name) - except AttributeError as ex: - raise AttributeError( - f"Missing-data function '{method_name}' is not available in {self.backend_module}." - ) from ex + _pandas_module = "ydata_profiling.model.pandas.missing_pandas" + _spark_module = "ydata_profiling.model.spark.missing_spark" class MissingData: diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py index a919ee33b..28c79f849 100644 --- a/src/ydata_profiling/model/pandas/table_pandas.py +++ b/src/ydata_profiling/model/pandas/table_pandas.py @@ -1,9 +1,7 @@ -from collections import Counter - import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.model.table import get_table_stats +from ydata_profiling.model.table import compute_common_table_stats, get_table_stats @get_table_stats.register @@ -21,36 +19,18 @@ def pandas_get_table_stats( A dictionary that contains the table statistics. """ n = len(df) if not df.empty else 0 + n_var = len(df.columns) memory_size = df.memory_usage(deep=config.memory_deep).sum() record_size = float(memory_size) / n if n > 0 else 0 table_stats = { "n": n, - "n_var": len(df.columns), + "n_var": n_var, "memory_size": memory_size, "record_size": record_size, - "n_cells_missing": 0, - "n_vars_with_missing": 0, - "n_vars_all_missing": 0, } - for series_summary in variable_stats.values(): - if "n_missing" in series_summary and series_summary["n_missing"] > 0: - table_stats["n_vars_with_missing"] += 1 - table_stats["n_cells_missing"] += series_summary["n_missing"] - if series_summary["n_missing"] == n: - table_stats["n_vars_all_missing"] += 1 - - table_stats["p_cells_missing"] = ( - table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"]) - if table_stats["n"] > 0 and table_stats["n_var"] > 0 - else 0 - ) - - # Variable type counts - table_stats.update( - {"types": dict(Counter([v["type"] for v in variable_stats.values()]))} - ) + table_stats.update(compute_common_table_stats(n, n_var, variable_stats)) return table_stats diff --git a/src/ydata_profiling/model/spark/table_spark.py b/src/ydata_profiling/model/spark/table_spark.py index 33e862e61..2a2985059 100644 --- a/src/ydata_profiling/model/spark/table_spark.py +++ b/src/ydata_profiling/model/spark/table_spark.py @@ -1,9 +1,7 @@ -from collections import Counter - from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.table import get_table_stats +from ydata_profiling.model.table import compute_common_table_stats, get_table_stats @get_table_stats.register @@ -21,38 +19,9 @@ def get_table_stats_spark( A dictionary that contains the table statistics. """ n = df.count() + n_var = len(df.columns) - result = {"n": n, "n_var": len(df.columns)} - - table_stats = { - "n_cells_missing": 0, - "n_vars_with_missing": 0, - "n_vars_all_missing": 0, - } - - for series_summary in variable_stats.values(): - if "n_missing" in series_summary and series_summary["n_missing"] > 0: - table_stats["n_vars_with_missing"] += 1 - table_stats["n_cells_missing"] += series_summary["n_missing"] - if series_summary["n_missing"] == n: - table_stats["n_vars_all_missing"] += 1 - - # without this check we'll get a div by zero error - if result["n"] * result["n_var"] > 0: - table_stats["p_cells_missing"] = ( - table_stats["n_cells_missing"] / (result["n"] * result["n_var"]) - if result["n"] > 0 - else 0 - ) - else: - table_stats["p_cells_missing"] = 0 - - result["p_cells_missing"] = table_stats["p_cells_missing"] - result["n_cells_missing"] = table_stats["n_cells_missing"] - result["n_vars_all_missing"] = table_stats["n_vars_all_missing"] - result["n_vars_with_missing"] = table_stats["n_vars_with_missing"] - - # Variable type counts - result["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) + result = {"n": n, "n_var": n_var} + result.update(compute_common_table_stats(n, n_var, variable_stats)) return result diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py index e8145d76c..a31f25ccf 100644 --- a/src/ydata_profiling/model/spark/timeseries_index_spark.py +++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py @@ -4,7 +4,7 @@ from ydata_profiling.config import Settings -def spark_get_time_index_description_spark( +def get_time_index_description_spark( config: Settings, df: DataFrame, table_stats: dict, diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index d733a7d36..41b8d6f88 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,9 +50,8 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) -# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer for Pandas DataFrames.""" + """A summarizer supporting both Pandas and Spark DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py index 49569605b..9c3e5ef38 100644 --- a/src/ydata_profiling/model/summary_algorithms.py +++ b/src/ydata_profiling/model/summary_algorithms.py @@ -11,6 +11,21 @@ T = TypeVar("T") +def func_nullable_series_contains(fn: Callable) -> Callable: + @functools.wraps(fn) + def inner( + config: Settings, series: pd.Series, state: dict, *args, **kwargs + ) -> bool: + if series.hasnans: + series = series.dropna() + if series.empty: + return False + + return fn(config, series, state, *args, **kwargs) + + return inner + + def safe_histogram( values: np.ndarray, bins: Union[int, str, np.ndarray] = "auto", diff --git a/src/ydata_profiling/model/table.py b/src/ydata_profiling/model/table.py index e5eb6fdc2..6f5c7305d 100644 --- a/src/ydata_profiling/model/table.py +++ b/src/ydata_profiling/model/table.py @@ -1,3 +1,4 @@ +from collections import Counter from typing import Any from multimethod import multimethod @@ -5,6 +6,42 @@ from ydata_profiling.config import Settings +def compute_common_table_stats( + n: int, n_var: int, variable_stats: dict +) -> dict: + """Compute common table statistics shared by Pandas and Spark backends. + + Args: + n: Number of rows in the DataFrame + n_var: Number of columns (variables) + variable_stats: Previously calculated statistic on the DataFrame series + + Returns: + A dictionary with common table statistics: missing values counts, percentages, and type counts + """ + table_stats = { + "n_cells_missing": 0, + "n_vars_with_missing": 0, + "n_vars_all_missing": 0, + } + + for series_summary in variable_stats.values(): + if "n_missing" in series_summary and series_summary["n_missing"] > 0: + table_stats["n_vars_with_missing"] += 1 + table_stats["n_cells_missing"] += series_summary["n_missing"] + if series_summary["n_missing"] == n: + table_stats["n_vars_all_missing"] += 1 + + total_cells = n * n_var + table_stats["p_cells_missing"] = ( + table_stats["n_cells_missing"] / total_cells if total_cells > 0 else 0 + ) + + table_stats["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) + + return table_stats + + @multimethod def get_table_stats(config: Settings, df: Any, variable_stats: dict) -> dict: raise NotImplementedError() diff --git a/src/ydata_profiling/report/structure/variables/render_common.py b/src/ydata_profiling/report/structure/variables/render_common.py index e90935640..aef8de357 100644 --- a/src/ydata_profiling/report/structure/variables/render_common.py +++ b/src/ydata_profiling/report/structure/variables/render_common.py @@ -10,6 +10,7 @@ def render_common(config: Settings, summary: dict) -> dict: n_freq_table_max = config.n_freq_table_max template_variables = { + # TODO: with nan "freq_table_rows": freq_table( freqtable=summary["value_counts_without_nan"], n=summary["n"], diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index e99d91c11..1cee2aea8 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,9 +1,41 @@ """ - File with a function to check the backend being used + File with backend utilities and helper functions to check the backend being used """ import importlib +from typing import Callable, Optional, Sized, Union + +import pandas as pd def is_pyspark_installed() -> bool: """Check if PySpark is installed without importing it.""" return importlib.util.find_spec("pyspark") is not None + + +class BaseBackend: + """Base helper class to select and cache the appropriate backend (Pandas or Spark).""" + + _pandas_module: Optional[str] = None + _spark_module: Optional[str] = None + + def __init__(self, df: Union[pd.DataFrame, Sized]): + """Determine backend once and store it for all computations.""" + if isinstance(df, pd.DataFrame): + module_path = self._pandas_module + else: + module_path = self._spark_module + + if module_path is None: + raise ValueError("Backend module path not configured") + + self.module = importlib.import_module(module_path) + self.module_path = module_path + + def get_method(self, method_name: str) -> Callable: + """Retrieve the appropriate function from the backend module.""" + try: + return getattr(self.module, method_name) + except AttributeError as ex: + raise AttributeError( + f"Function '{method_name}' is not available in {self.module_path}." + ) from ex From a1892a275016c5f6bc9eba06e1fdd88bbbbf5379 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 21:10:03 +0800 Subject: [PATCH 08/11] feat: initial release --- src/ydata_profiling/model/correlations.py | 28 +++++++++++--- src/ydata_profiling/model/missing.py | 25 ++++++++++--- .../model/pandas/table_pandas.py | 27 ++++++++++++-- .../model/spark/describe_boolean_spark.py | 2 + .../model/spark/describe_date_spark.py | 2 + .../model/spark/describe_generic_spark.py | 2 + .../model/spark/describe_numeric_spark.py | 12 +++--- .../model/spark/describe_text_spark.py | 2 + .../model/spark/table_spark.py | 37 +++++++++++++++++-- .../model/spark/timeseries_index_spark.py | 2 + src/ydata_profiling/model/summarizer.py | 3 +- src/ydata_profiling/model/table.py | 37 ------------------- src/ydata_profiling/utils/backend.py | 34 +---------------- 13 files changed, 117 insertions(+), 96 deletions(-) diff --git a/src/ydata_profiling/model/correlations.py b/src/ydata_profiling/model/correlations.py index 25e2e13c4..2bbaa1112 100644 --- a/src/ydata_profiling/model/correlations.py +++ b/src/ydata_profiling/model/correlations.py @@ -3,13 +3,12 @@ """Correlations between variables.""" import warnings -from typing import Dict, List, Optional, Sized +from typing import Dict, List, Optional, Sized, no_type_check import numpy as np import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.utils.backend import BaseBackend try: from pandas.core.base import DataError @@ -17,11 +16,30 @@ from pandas.errors import DataError -class CorrelationBackend(BaseBackend): +class CorrelationBackend: """Helper class to select and cache the appropriate correlation backend (Pandas or Spark).""" - _pandas_module = "ydata_profiling.model.pandas.correlations_pandas" - _spark_module = "ydata_profiling.model.spark.correlations_spark" + @no_type_check + def __init__(self, df: Sized): + """Determine backend once and store it for all correlation computations.""" + if isinstance(df, pd.DataFrame): + from ydata_profiling.model.pandas import ( + correlations_pandas as correlation_backend, # type: ignore + ) + else: + from ydata_profiling.model.spark import ( + correlations_spark as correlation_backend, # type: ignore + ) + + self.backend = correlation_backend + + def get_method(self, method_name: str): # noqa: ANN201 + """Retrieve the appropriate correlation method class from the backend.""" + if hasattr(self.backend, method_name): + return getattr(self.backend, method_name) + raise AttributeError( + f"Correlation method '{method_name}' is not available in the backend." + ) class Correlation: diff --git a/src/ydata_profiling/model/missing.py b/src/ydata_profiling/model/missing.py index aa14cc425..46ec2dee3 100644 --- a/src/ydata_profiling/model/missing.py +++ b/src/ydata_profiling/model/missing.py @@ -1,17 +1,32 @@ +import importlib import warnings -from typing import Any, Dict, Optional, Sized +from typing import Any, Callable, Dict, Optional, Sized import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.utils.backend import BaseBackend -class MissingDataBackend(BaseBackend): +class MissingDataBackend: """Helper class to select and cache the appropriate missing-data backend (Pandas or Spark).""" - _pandas_module = "ydata_profiling.model.pandas.missing_pandas" - _spark_module = "ydata_profiling.model.spark.missing_spark" + def __init__(self, df: Sized): + """Determine backend once and store it for all missing-data computations.""" + if isinstance(df, pd.DataFrame): + self.backend_module = "ydata_profiling.model.pandas.missing_pandas" + else: + self.backend_module = "ydata_profiling.model.spark.missing_spark" + + self.module = importlib.import_module(self.backend_module) + + def get_method(self, method_name: str) -> Callable: + """Retrieve the appropriate missing-data function from the backend module.""" + try: + return getattr(self.module, method_name) + except AttributeError as ex: + raise AttributeError( + f"Missing-data function '{method_name}' is not available in {self.backend_module}." + ) from ex class MissingData: diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py index 28c79f849..546b369ef 100644 --- a/src/ydata_profiling/model/pandas/table_pandas.py +++ b/src/ydata_profiling/model/pandas/table_pandas.py @@ -1,7 +1,9 @@ +from collections import Counter + import pandas as pd from ydata_profiling.config import Settings -from ydata_profiling.model.table import compute_common_table_stats, get_table_stats +from ydata_profiling.model.table import get_table_stats @get_table_stats.register @@ -19,18 +21,35 @@ def pandas_get_table_stats( A dictionary that contains the table statistics. """ n = len(df) if not df.empty else 0 - n_var = len(df.columns) memory_size = df.memory_usage(deep=config.memory_deep).sum() record_size = float(memory_size) / n if n > 0 else 0 table_stats = { "n": n, - "n_var": n_var, + "n_var": len(df.columns), "memory_size": memory_size, "record_size": record_size, + "n_cells_missing": 0, + "n_vars_with_missing": 0, + "n_vars_all_missing": 0, } - table_stats.update(compute_common_table_stats(n, n_var, variable_stats)) + for series_summary in variable_stats.values(): + if "n_missing" in series_summary and series_summary["n_missing"] > 0: + table_stats["n_vars_with_missing"] += 1 + table_stats["n_cells_missing"] += series_summary["n_missing"] + if series_summary["n_missing"] == n: + table_stats["n_vars_all_missing"] += 1 + + table_stats["p_cells_missing"] = ( + table_stats["n_cells_missing"] / (table_stats["n"] * table_stats["n_var"]) + if table_stats["n"] > 0 and table_stats["n_var"] > 0 + else 0 + ) + + table_stats.update( + {"types": dict(Counter([v["type"] for v in variable_stats.values()]))} + ) return table_stats diff --git a/src/ydata_profiling/model/spark/describe_boolean_spark.py b/src/ydata_profiling/model/spark/describe_boolean_spark.py index 148dbce6c..ab5cf20fb 100644 --- a/src/ydata_profiling/model/spark/describe_boolean_spark.py +++ b/src/ydata_profiling/model/spark/describe_boolean_spark.py @@ -3,8 +3,10 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.summary_algorithms import describe_boolean_1d +@describe_boolean_1d.register def describe_boolean_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/describe_date_spark.py b/src/ydata_profiling/model/spark/describe_date_spark.py index c44d36650..a5e11a0f1 100644 --- a/src/ydata_profiling/model/spark/describe_date_spark.py +++ b/src/ydata_profiling/model/spark/describe_date_spark.py @@ -5,6 +5,7 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.summary_algorithms import describe_date_1d def date_stats_spark(df: DataFrame, summary: dict) -> dict: @@ -18,6 +19,7 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict: return df.agg(*expr).first().asDict() +@describe_date_1d.register def describe_date_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/describe_generic_spark.py b/src/ydata_profiling/model/spark/describe_generic_spark.py index 1171881cd..ee2356c0a 100644 --- a/src/ydata_profiling/model/spark/describe_generic_spark.py +++ b/src/ydata_profiling/model/spark/describe_generic_spark.py @@ -3,8 +3,10 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.summary_algorithms import describe_generic +@describe_generic.register def describe_generic_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py index 8c299577e..395b1461b 100644 --- a/src/ydata_profiling/model/spark/describe_numeric_spark.py +++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py @@ -5,13 +5,15 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import histogram_compute +from ydata_profiling.model.summary_algorithms import ( + describe_numeric_1d, + histogram_compute, +) def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: column = df.columns[0] - # Removing null types from numeric summary stats to match Pandas defaults which skip na's (skipna=False) finite_filter = ( F.col(column).isNotNull() & ~F.isnan(F.col(column)) @@ -32,6 +34,7 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: return non_null_df.agg(*expr).first().asDict() +@describe_numeric_1d.register def describe_numeric_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: @@ -90,7 +93,6 @@ def describe_numeric_1d_spark( quantile_threshold = 0.05 if summary.get("n") == summary.get("n_missing"): - # This means the entire column is null/nan, so summary values need to be hard-coded: summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles}) summary["mad"] = np.nan @@ -135,10 +137,6 @@ def describe_numeric_1d_spark( # ... https://stackoverflow.com/questions/60221841/how-to-detect-monotonic-decrease-in-pyspark summary["monotonic"] = 0 - # this function only displays the top N (see config) values for a histogram. - # This might be confusing if there are a lot of values of equal magnitude, but we cannot bring all the values to - # display in pandas display - # the alternative is to do this in spark natively, but it is not trivial infinity_values = [np.inf, -np.inf] infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values) diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py index 6d7804cf5..b5e27f615 100644 --- a/src/ydata_profiling/model/spark/describe_text_spark.py +++ b/src/ydata_profiling/model/spark/describe_text_spark.py @@ -3,8 +3,10 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.summary_algorithms import describe_text_1d +@describe_text_1d.register def describe_text_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/table_spark.py b/src/ydata_profiling/model/spark/table_spark.py index 2a2985059..17ac03323 100644 --- a/src/ydata_profiling/model/spark/table_spark.py +++ b/src/ydata_profiling/model/spark/table_spark.py @@ -1,7 +1,9 @@ +from collections import Counter + from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.table import compute_common_table_stats, get_table_stats +from ydata_profiling.model.table import get_table_stats @get_table_stats.register @@ -19,9 +21,36 @@ def get_table_stats_spark( A dictionary that contains the table statistics. """ n = df.count() - n_var = len(df.columns) - result = {"n": n, "n_var": n_var} - result.update(compute_common_table_stats(n, n_var, variable_stats)) + result = {"n": n, "n_var": len(df.columns)} + + table_stats = { + "n_cells_missing": 0, + "n_vars_with_missing": 0, + "n_vars_all_missing": 0, + } + + for series_summary in variable_stats.values(): + if "n_missing" in series_summary and series_summary["n_missing"] > 0: + table_stats["n_vars_with_missing"] += 1 + table_stats["n_cells_missing"] += series_summary["n_missing"] + if series_summary["n_missing"] == n: + table_stats["n_vars_all_missing"] += 1 + + if result["n"] * result["n_var"] > 0: + table_stats["p_cells_missing"] = ( + table_stats["n_cells_missing"] / (result["n"] * result["n_var"]) + if result["n"] > 0 + else 0 + ) + else: + table_stats["p_cells_missing"] = 0 + + result["p_cells_missing"] = table_stats["p_cells_missing"] + result["n_cells_missing"] = table_stats["n_cells_missing"] + result["n_vars_all_missing"] = table_stats["n_vars_all_missing"] + result["n_vars_with_missing"] = table_stats["n_vars_with_missing"] + + result["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) return result diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py index a31f25ccf..c16204ac3 100644 --- a/src/ydata_profiling/model/spark/timeseries_index_spark.py +++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py @@ -2,8 +2,10 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings +from ydata_profiling.model.timeseries_index import get_time_index_description +@get_time_index_description.register def get_time_index_description_spark( config: Settings, df: DataFrame, diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index 41b8d6f88..d733a7d36 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -50,8 +50,9 @@ def summarize( return self.handle(str(dtype), config, series, {"type": str(dtype)}) +# Revisit this with the correct support for Spark as well. class ProfilingSummarizer(BaseSummarizer): - """A summarizer supporting both Pandas and Spark DataFrames.""" + """A summarizer for Pandas DataFrames.""" def __init__(self, typeset: VisionsTypeset, use_spark: bool = False): self.use_spark = use_spark and is_pyspark_installed() diff --git a/src/ydata_profiling/model/table.py b/src/ydata_profiling/model/table.py index 6f5c7305d..e5eb6fdc2 100644 --- a/src/ydata_profiling/model/table.py +++ b/src/ydata_profiling/model/table.py @@ -1,4 +1,3 @@ -from collections import Counter from typing import Any from multimethod import multimethod @@ -6,42 +5,6 @@ from ydata_profiling.config import Settings -def compute_common_table_stats( - n: int, n_var: int, variable_stats: dict -) -> dict: - """Compute common table statistics shared by Pandas and Spark backends. - - Args: - n: Number of rows in the DataFrame - n_var: Number of columns (variables) - variable_stats: Previously calculated statistic on the DataFrame series - - Returns: - A dictionary with common table statistics: missing values counts, percentages, and type counts - """ - table_stats = { - "n_cells_missing": 0, - "n_vars_with_missing": 0, - "n_vars_all_missing": 0, - } - - for series_summary in variable_stats.values(): - if "n_missing" in series_summary and series_summary["n_missing"] > 0: - table_stats["n_vars_with_missing"] += 1 - table_stats["n_cells_missing"] += series_summary["n_missing"] - if series_summary["n_missing"] == n: - table_stats["n_vars_all_missing"] += 1 - - total_cells = n * n_var - table_stats["p_cells_missing"] = ( - table_stats["n_cells_missing"] / total_cells if total_cells > 0 else 0 - ) - - table_stats["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) - - return table_stats - - @multimethod def get_table_stats(config: Settings, df: Any, variable_stats: dict) -> dict: raise NotImplementedError() diff --git a/src/ydata_profiling/utils/backend.py b/src/ydata_profiling/utils/backend.py index 1cee2aea8..e99d91c11 100644 --- a/src/ydata_profiling/utils/backend.py +++ b/src/ydata_profiling/utils/backend.py @@ -1,41 +1,9 @@ """ - File with backend utilities and helper functions to check the backend being used + File with a function to check the backend being used """ import importlib -from typing import Callable, Optional, Sized, Union - -import pandas as pd def is_pyspark_installed() -> bool: """Check if PySpark is installed without importing it.""" return importlib.util.find_spec("pyspark") is not None - - -class BaseBackend: - """Base helper class to select and cache the appropriate backend (Pandas or Spark).""" - - _pandas_module: Optional[str] = None - _spark_module: Optional[str] = None - - def __init__(self, df: Union[pd.DataFrame, Sized]): - """Determine backend once and store it for all computations.""" - if isinstance(df, pd.DataFrame): - module_path = self._pandas_module - else: - module_path = self._spark_module - - if module_path is None: - raise ValueError("Backend module path not configured") - - self.module = importlib.import_module(module_path) - self.module_path = module_path - - def get_method(self, method_name: str) -> Callable: - """Retrieve the appropriate function from the backend module.""" - try: - return getattr(self.module, method_name) - except AttributeError as ex: - raise AttributeError( - f"Function '{method_name}' is not available in {self.module_path}." - ) from ex From 754677bed6f889c50091ea51e50d833fe47693c4 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 22:44:21 +0800 Subject: [PATCH 09/11] feat: initial release --- src/ydata_profiling/model/handler.py | 178 ++++++++++-------- .../pandas/describe_categorical_pandas.py | 1 - .../model/pandas/table_pandas.py | 1 + .../model/spark/describe_boolean_spark.py | 2 - .../model/spark/describe_date_spark.py | 2 - .../model/spark/describe_generic_spark.py | 2 - .../model/spark/describe_numeric_spark.py | 12 +- .../model/spark/describe_text_spark.py | 2 - .../model/spark/missing_spark.py | 1 - .../model/spark/table_spark.py | 2 + .../model/spark/timeseries_index_spark.py | 4 +- .../report/presentation/core/collapse.py | 2 +- .../report/presentation/core/container.py | 2 +- .../report/presentation/core/dropdown.py | 2 +- .../report/presentation/core/renderable.py | 6 +- .../report/presentation/core/root.py | 4 +- .../report/presentation/core/variable.py | 6 +- .../presentation/flavours/flavour_html.py | 7 +- .../presentation/flavours/flavour_widget.py | 7 +- .../report/presentation/flavours/flavours.py | 20 +- 20 files changed, 142 insertions(+), 121 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 992c1840c..13722e1cb 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,81 +1,97 @@ -""" - Auxiliary handler methods for data summary extraction -""" -from typing import Any, Callable, Dict, List, Sequence - -import networkx as nx -from visions import VisionsTypeset - - -def compose(functions: Sequence[Callable]) -> Callable: - """ - Compose a sequence of functions. - - :param functions: sequence of functions - :return: combined function applying all functions in order. - """ - - def composed_function(*args) -> List[Any]: - result = args # Start with the input arguments - for func in functions: - result = func(*result) if isinstance(result, tuple) else func(result) - return result # type: ignore - - return composed_function # type: ignore - - -class Handler: - """A generic handler - - Allows any custom mapping between data types and functions - """ - - def __init__( - self, - mapping: Dict[str, List[Callable]], - typeset: VisionsTypeset, - *args, - **kwargs - ): - self.mapping = mapping - self.typeset = typeset - self._complete_dag() - - def _complete_dag(self) -> None: - for from_type, to_type in nx.topological_sort( - nx.line_graph(self.typeset.base_graph) - ): - self.mapping[str(to_type)] = ( - self.mapping[str(from_type)] + self.mapping[str(to_type)] - ) - - def handle(self, dtype: str, *args, **kwargs) -> dict: - """ - Returns: - object: a tuple containing the config, the dataset series and the summary extracted - """ - funcs = self.mapping.get(dtype, []) - op = compose(funcs) - summary = op(*args)[-1] - return summary - - -def get_render_map() -> Dict[str, Callable]: - import ydata_profiling.report.structure.variables as render_algorithms - - render_map = { - "Boolean": render_algorithms.render_boolean, - "Numeric": render_algorithms.render_real, - "Complex": render_algorithms.render_complex, - "Text": render_algorithms.render_text, - "DateTime": render_algorithms.render_date, - "Categorical": render_algorithms.render_categorical, - "URL": render_algorithms.render_url, - "Path": render_algorithms.render_path, - "File": render_algorithms.render_file, - "Image": render_algorithms.render_image, - "Unsupported": render_algorithms.render_generic, - "TimeSeries": render_algorithms.render_timeseries, - } - - return render_map +""" + Auxiliary handler methods for data summary extraction +""" +from typing import Any, Callable, Dict, List, Sequence, Tuple, TypeVar, cast + +import networkx as nx +from visions import VisionsTypeset + +T = TypeVar("T") +SummaryFunction = Callable[..., Tuple[Any, ...]] + + +def compose(functions: Sequence[SummaryFunction]) -> SummaryFunction: + """ + Compose a sequence of functions. + + :param functions: sequence of functions + :return: combined function applying all functions in order. + """ + + def composed_function(*args: Any) -> Tuple[Any, ...]: + result: Tuple[Any, ...] = args + for func in functions: + step_result = func(*result) + if not isinstance(step_result, tuple): + result = (step_result,) + else: + result = step_result + return result + + return composed_function + + +class Handler: + """A generic handler + + Allows any custom mapping between data types and functions + """ + + def __init__( + self, + mapping: Dict[str, List[SummaryFunction]], + typeset: VisionsTypeset, + *args: Any, + **kwargs: Any, + ) -> None: + self.mapping: Dict[str, List[SummaryFunction]] = mapping + self.typeset = typeset + self._complete_dag() + + def _complete_dag(self) -> None: + for from_type, to_type in nx.topological_sort( + nx.line_graph(self.typeset.base_graph) + ): + from_type_str = str(from_type) + to_type_str = str(to_type) + + if from_type_str not in self.mapping: + continue + + if to_type_str in self.mapping: + self.mapping[to_type_str] = ( + self.mapping[from_type_str] + self.mapping[to_type_str] + ) + else: + self.mapping[to_type_str] = self.mapping[from_type_str].copy() + + def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Dict[str, Any]: + """ + Returns: + object: a tuple containing the config, the dataset series and the summary extracted + """ + funcs = self.mapping.get(dtype, []) + op = compose(funcs) + result = op(*args) + return cast(Dict[str, Any], result[-1]) + + +def get_render_map() -> Dict[str, Callable]: + import ydata_profiling.report.structure.variables as render_algorithms + + render_map = { + "Boolean": render_algorithms.render_boolean, + "Numeric": render_algorithms.render_real, + "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, + "DateTime": render_algorithms.render_date, + "Categorical": render_algorithms.render_categorical, + "URL": render_algorithms.render_url, + "Path": render_algorithms.render_path, + "File": render_algorithms.render_file, + "Image": render_algorithms.render_image, + "Unsupported": render_algorithms.render_generic, + "TimeSeries": render_algorithms.render_timeseries, + } + + return render_map diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py index a53f16d91..568aa7a9c 100644 --- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py @@ -27,7 +27,6 @@ def get_character_counts_vc(vc: pd.Series) -> pd.Series: if len(counts) > 0: counts = counts.groupby(level=0, sort=False).sum() counts = counts.sort_values(ascending=False) - # FIXME: correct in split, below should be zero: print(counts.loc['']) counts = counts[counts.index.str.len() > 0] return counts diff --git a/src/ydata_profiling/model/pandas/table_pandas.py b/src/ydata_profiling/model/pandas/table_pandas.py index 546b369ef..a919ee33b 100644 --- a/src/ydata_profiling/model/pandas/table_pandas.py +++ b/src/ydata_profiling/model/pandas/table_pandas.py @@ -48,6 +48,7 @@ def pandas_get_table_stats( else 0 ) + # Variable type counts table_stats.update( {"types": dict(Counter([v["type"] for v in variable_stats.values()]))} ) diff --git a/src/ydata_profiling/model/spark/describe_boolean_spark.py b/src/ydata_profiling/model/spark/describe_boolean_spark.py index ab5cf20fb..148dbce6c 100644 --- a/src/ydata_profiling/model/spark/describe_boolean_spark.py +++ b/src/ydata_profiling/model/spark/describe_boolean_spark.py @@ -3,10 +3,8 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_boolean_1d -@describe_boolean_1d.register def describe_boolean_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/describe_date_spark.py b/src/ydata_profiling/model/spark/describe_date_spark.py index a5e11a0f1..c44d36650 100644 --- a/src/ydata_profiling/model/spark/describe_date_spark.py +++ b/src/ydata_profiling/model/spark/describe_date_spark.py @@ -5,7 +5,6 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_date_1d def date_stats_spark(df: DataFrame, summary: dict) -> dict: @@ -19,7 +18,6 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict: return df.agg(*expr).first().asDict() -@describe_date_1d.register def describe_date_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/describe_generic_spark.py b/src/ydata_profiling/model/spark/describe_generic_spark.py index ee2356c0a..1171881cd 100644 --- a/src/ydata_profiling/model/spark/describe_generic_spark.py +++ b/src/ydata_profiling/model/spark/describe_generic_spark.py @@ -3,10 +3,8 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_generic -@describe_generic.register def describe_generic_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/describe_numeric_spark.py b/src/ydata_profiling/model/spark/describe_numeric_spark.py index 395b1461b..8c299577e 100644 --- a/src/ydata_profiling/model/spark/describe_numeric_spark.py +++ b/src/ydata_profiling/model/spark/describe_numeric_spark.py @@ -5,15 +5,13 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import ( - describe_numeric_1d, - histogram_compute, -) +from ydata_profiling.model.summary_algorithms import histogram_compute def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: column = df.columns[0] + # Removing null types from numeric summary stats to match Pandas defaults which skip na's (skipna=False) finite_filter = ( F.col(column).isNotNull() & ~F.isnan(F.col(column)) @@ -34,7 +32,6 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict: return non_null_df.agg(*expr).first().asDict() -@describe_numeric_1d.register def describe_numeric_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: @@ -93,6 +90,7 @@ def describe_numeric_1d_spark( quantile_threshold = 0.05 if summary.get("n") == summary.get("n_missing"): + # This means the entire column is null/nan, so summary values need to be hard-coded: summary.update({f"{percentile:.0%}": np.nan for percentile in quantiles}) summary["mad"] = np.nan @@ -137,6 +135,10 @@ def describe_numeric_1d_spark( # ... https://stackoverflow.com/questions/60221841/how-to-detect-monotonic-decrease-in-pyspark summary["monotonic"] = 0 + # this function only displays the top N (see config) values for a histogram. + # This might be confusing if there are a lot of values of equal magnitude, but we cannot bring all the values to + # display in pandas display + # the alternative is to do this in spark natively, but it is not trivial infinity_values = [np.inf, -np.inf] infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values) diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py index b5e27f615..6d7804cf5 100644 --- a/src/ydata_profiling/model/spark/describe_text_spark.py +++ b/src/ydata_profiling/model/spark/describe_text_spark.py @@ -3,10 +3,8 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.summary_algorithms import describe_text_1d -@describe_text_1d.register def describe_text_1d_spark( config: Settings, df: DataFrame, summary: dict ) -> Tuple[Settings, DataFrame, dict]: diff --git a/src/ydata_profiling/model/spark/missing_spark.py b/src/ydata_profiling/model/spark/missing_spark.py index deacf1b89..02529dceb 100644 --- a/src/ydata_profiling/model/spark/missing_spark.py +++ b/src/ydata_profiling/model/spark/missing_spark.py @@ -56,7 +56,6 @@ def __len__(self) -> Optional[int]: def missing_bar(config: Settings, df: DataFrame) -> str: import pyspark.sql.functions as F - # FIXME: move to univariate data_nan_counts = ( df.agg( *[F.count(F.when(F.isnull(c) | F.isnan(c), c)).alias(c) for c in df.columns] diff --git a/src/ydata_profiling/model/spark/table_spark.py b/src/ydata_profiling/model/spark/table_spark.py index 17ac03323..33e862e61 100644 --- a/src/ydata_profiling/model/spark/table_spark.py +++ b/src/ydata_profiling/model/spark/table_spark.py @@ -37,6 +37,7 @@ def get_table_stats_spark( if series_summary["n_missing"] == n: table_stats["n_vars_all_missing"] += 1 + # without this check we'll get a div by zero error if result["n"] * result["n_var"] > 0: table_stats["p_cells_missing"] = ( table_stats["n_cells_missing"] / (result["n"] * result["n_var"]) @@ -51,6 +52,7 @@ def get_table_stats_spark( result["n_vars_all_missing"] = table_stats["n_vars_all_missing"] result["n_vars_with_missing"] = table_stats["n_vars_with_missing"] + # Variable type counts result["types"] = dict(Counter([v["type"] for v in variable_stats.values()])) return result diff --git a/src/ydata_profiling/model/spark/timeseries_index_spark.py b/src/ydata_profiling/model/spark/timeseries_index_spark.py index c16204ac3..e8145d76c 100644 --- a/src/ydata_profiling/model/spark/timeseries_index_spark.py +++ b/src/ydata_profiling/model/spark/timeseries_index_spark.py @@ -2,11 +2,9 @@ from pyspark.sql import DataFrame from ydata_profiling.config import Settings -from ydata_profiling.model.timeseries_index import get_time_index_description -@get_time_index_description.register -def get_time_index_description_spark( +def spark_get_time_index_description_spark( config: Settings, df: DataFrame, table_stats: dict, diff --git a/src/ydata_profiling/report/presentation/core/collapse.py b/src/ydata_profiling/report/presentation/core/collapse.py index a7dba34f1..9bc393602 100644 --- a/src/ydata_profiling/report/presentation/core/collapse.py +++ b/src/ydata_profiling/report/presentation/core/collapse.py @@ -6,7 +6,7 @@ class Collapse(ItemRenderer): - def __init__(self, button: ToggleButton, item: Renderable, **kwargs): + def __init__(self, button: ToggleButton, item: Renderable, **kwargs: Any): super().__init__("collapse", {"button": button, "item": item}, **kwargs) def __repr__(self) -> str: diff --git a/src/ydata_profiling/report/presentation/core/container.py b/src/ydata_profiling/report/presentation/core/container.py index c82f06266..d4ed121ca 100644 --- a/src/ydata_profiling/report/presentation/core/container.py +++ b/src/ydata_profiling/report/presentation/core/container.py @@ -13,7 +13,7 @@ def __init__( anchor_id: Optional[str] = None, classes: Optional[str] = None, oss: Optional[bool] = None, - **kwargs, + **kwargs: Any, ): args = {"items": items, "nested": nested} args.update(**kwargs) diff --git a/src/ydata_profiling/report/presentation/core/dropdown.py b/src/ydata_profiling/report/presentation/core/dropdown.py index c1c2f274e..4c9dfb3a9 100644 --- a/src/ydata_profiling/report/presentation/core/dropdown.py +++ b/src/ydata_profiling/report/presentation/core/dropdown.py @@ -15,7 +15,7 @@ def __init__( anchor_id: str, classes: list, is_row: bool, - **kwargs + **kwargs: Any, ): super().__init__( "dropdown", diff --git a/src/ydata_profiling/report/presentation/core/renderable.py b/src/ydata_profiling/report/presentation/core/renderable.py index 3f7f09f6c..028151532 100644 --- a/src/ydata_profiling/report/presentation/core/renderable.py +++ b/src/ydata_profiling/report/presentation/core/renderable.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, Optional +from typing import Any, Callable, Dict, Optional class Renderable(ABC): @@ -34,9 +34,9 @@ def classes(self) -> str: def render(self) -> Any: pass - def __str__(self): + def __str__(self) -> str: return self.__class__.__name__ @classmethod - def convert_to_class(cls, obj: "Renderable", flavour_func) -> None: # noqa: ANN001 + def convert_to_class(cls, obj: "Renderable", flavour_func: Callable) -> None: obj.__class__ = cls diff --git a/src/ydata_profiling/report/presentation/core/root.py b/src/ydata_profiling/report/presentation/core/root.py index 0c3f1e3c9..6e96e7f14 100644 --- a/src/ydata_profiling/report/presentation/core/root.py +++ b/src/ydata_profiling/report/presentation/core/root.py @@ -11,7 +11,7 @@ class Root(ItemRenderer): """ def __init__( - self, name: str, body: Renderable, footer: Renderable, style: Style, **kwargs + self, name: str, body: Renderable, footer: Renderable, style: Style, **kwargs: Any ): super().__init__( "report", @@ -23,7 +23,7 @@ def __init__( def __repr__(self) -> str: return "Root" - def render(self, **kwargs) -> Any: + def render(self, **kwargs: Any) -> Any: raise NotImplementedError() @classmethod diff --git a/src/ydata_profiling/report/presentation/core/variable.py b/src/ydata_profiling/report/presentation/core/variable.py index cdf063202..34bd110a8 100644 --- a/src/ydata_profiling/report/presentation/core/variable.py +++ b/src/ydata_profiling/report/presentation/core/variable.py @@ -10,13 +10,13 @@ def __init__( top: Renderable, bottom: Optional[Renderable] = None, ignore: bool = False, - **kwargs, + **kwargs: Any, ): super().__init__( "variable", {"top": top, "bottom": bottom, "ignore": ignore}, **kwargs ) - def __str__(self): + def __str__(self) -> str: top_text = str(self.content["top"]).replace("\n", "\n\t") bottom_text = str(self.content["bottom"]).replace("\n", "\n\t") @@ -25,7 +25,7 @@ def __str__(self): text += f"- bottom: {bottom_text}" return text - def __repr__(self): + def __repr__(self) -> str: return "Variable" def render(self) -> Any: diff --git a/src/ydata_profiling/report/presentation/flavours/flavour_html.py b/src/ydata_profiling/report/presentation/flavours/flavour_html.py index b342ff32f..7ad2b9c1d 100644 --- a/src/ydata_profiling/report/presentation/flavours/flavour_html.py +++ b/src/ydata_profiling/report/presentation/flavours/flavour_html.py @@ -41,7 +41,10 @@ HTMLVariableInfo, ) -html_mapping = { +from typing import cast +from ydata_profiling.report.presentation.flavours.flavours import _FlavourMapping + +html_mapping = cast(_FlavourMapping, { Container: HTMLContainer, Variable: HTMLVariable, VariableInfo: HTMLVariableInfo, @@ -59,6 +62,6 @@ Collapse: HTMLCollapse, CorrelationTable: HTMLCorrelationTable, Scores: HTMLScores, -} +}) register_flavour("html", html_mapping) diff --git a/src/ydata_profiling/report/presentation/flavours/flavour_widget.py b/src/ydata_profiling/report/presentation/flavours/flavour_widget.py index b95d724f1..29ff1ad2c 100644 --- a/src/ydata_profiling/report/presentation/flavours/flavour_widget.py +++ b/src/ydata_profiling/report/presentation/flavours/flavour_widget.py @@ -39,7 +39,10 @@ WidgetVariableInfo, ) -widget_mapping = { +from typing import cast +from ydata_profiling.report.presentation.flavours.flavours import _FlavourMapping + +widget_mapping = cast(_FlavourMapping, { Container: WidgetContainer, Variable: WidgetVariable, VariableInfo: WidgetVariableInfo, @@ -56,6 +59,6 @@ ToggleButton: WidgetToggleButton, Collapse: WidgetCollapse, CorrelationTable: WidgetCorrelationTable, -} +}) register_flavour("widget", widget_mapping) diff --git a/src/ydata_profiling/report/presentation/flavours/flavours.py b/src/ydata_profiling/report/presentation/flavours/flavours.py index 10a5fa522..e31aa1e3c 100644 --- a/src/ydata_profiling/report/presentation/flavours/flavours.py +++ b/src/ydata_profiling/report/presentation/flavours/flavours.py @@ -1,26 +1,32 @@ """ Flavours registry information """ +from typing import Callable, Dict, Type + from ydata_profiling.report.presentation.core import Root from ydata_profiling.report.presentation.core.renderable import Renderable -_FLAVOUR_REGISTRY: dict = {} +_FlavourMapping = Dict[Type[Renderable], Type[Renderable]] +_FLAVOUR_REGISTRY: Dict[str, _FlavourMapping] = {} -def register_flavour(name: str, mapping: dict) -> None: +def register_flavour(name: str, mapping: _FlavourMapping) -> None: _FLAVOUR_REGISTRY[name] = mapping -def get_flavour_mapping(name: str) -> dict: +def get_flavour_mapping(name: str) -> _FlavourMapping: if name not in _FLAVOUR_REGISTRY: raise ValueError(f"Flavour '{name}' is not registered.") return _FLAVOUR_REGISTRY[name] +_FlavourFunc = Callable[[Renderable], Renderable] + + def apply_renderable_mapping( - mapping: dict, + mapping: _FlavourMapping, structure: Renderable, - flavour_func, # noqa: ANN001 + flavour_func: _FlavourFunc, ) -> None: mapping[type(structure)].convert_to_class(structure, flavour_func) @@ -29,7 +35,7 @@ def HTMLReport(structure: Root) -> Root: from ydata_profiling.report.presentation.flavours import flavour_html # noqa: F401 mapping = get_flavour_mapping("html") - apply_renderable_mapping(mapping, structure, flavour_func=HTMLReport) + apply_renderable_mapping(mapping, structure, flavour_func=HTMLReport) # type: ignore return structure @@ -39,5 +45,5 @@ def WidgetReport(structure: Root) -> Root: ) mapping = get_flavour_mapping("widget") - apply_renderable_mapping(mapping, structure, flavour_func=WidgetReport) + apply_renderable_mapping(mapping, structure, flavour_func=WidgetReport) # type: ignore return structure From 307270e5d995c4a7c15e58a3c803b589086f0ff6 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 23:15:01 +0800 Subject: [PATCH 10/11] feat: initial release --- src/ydata_profiling/model/handler.py | 67 ++++++++++--------- .../pandas/describe_categorical_pandas.py | 1 + .../model/spark/missing_spark.py | 2 - .../report/presentation/core/collapse.py | 2 +- .../report/presentation/core/container.py | 2 +- .../report/presentation/core/dropdown.py | 2 +- .../report/presentation/core/renderable.py | 4 +- .../report/presentation/core/root.py | 4 +- .../report/presentation/core/variable.py | 6 +- .../presentation/flavours/flavour_html.py | 7 +- .../presentation/flavours/flavour_widget.py | 7 +- .../report/presentation/flavours/flavours.py | 20 +++--- 12 files changed, 57 insertions(+), 67 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 13722e1cb..4ea43192a 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,32 +1,33 @@ """ Auxiliary handler methods for data summary extraction """ -from typing import Any, Callable, Dict, List, Sequence, Tuple, TypeVar, cast +from typing import Any, Callable, Dict, List, Sequence, Tuple, Union import networkx as nx from visions import VisionsTypeset -T = TypeVar("T") -SummaryFunction = Callable[..., Tuple[Any, ...]] - -def compose(functions: Sequence[SummaryFunction]) -> SummaryFunction: +def compose(functions: Sequence[Callable]) -> Callable: """ Compose a sequence of functions. - :param functions: sequence of functions - :return: combined function applying all functions in order. + Each function in the sequence receives the result of the previous function. + Functions are expected to accept and return tuples for proper chaining. + + :param functions: sequence of functions that accept and return tuples + :return: combined function applying all functions in order """ def composed_function(*args: Any) -> Tuple[Any, ...]: - result: Tuple[Any, ...] = args + result: Union[Tuple[Any, ...], Any] = args for func in functions: - step_result = func(*result) - if not isinstance(step_result, tuple): - result = (step_result,) + if isinstance(result, tuple): + result = func(*result) else: - result = step_result - return result + result = func(result) + if isinstance(result, tuple): + return result + return (result,) return composed_function @@ -34,17 +35,18 @@ def composed_function(*args: Any) -> Tuple[Any, ...]: class Handler: """A generic handler - Allows any custom mapping between data types and functions + Allows any custom mapping between data types and functions. + Functions are composed based on the type hierarchy defined in the typeset. """ def __init__( self, - mapping: Dict[str, List[SummaryFunction]], + mapping: Dict[str, List[Callable]], typeset: VisionsTypeset, *args: Any, - **kwargs: Any, - ) -> None: - self.mapping: Dict[str, List[SummaryFunction]] = mapping + **kwargs: Any + ): + self.mapping = mapping self.typeset = typeset self._complete_dag() @@ -52,28 +54,27 @@ def _complete_dag(self) -> None: for from_type, to_type in nx.topological_sort( nx.line_graph(self.typeset.base_graph) ): - from_type_str = str(from_type) - to_type_str = str(to_type) - - if from_type_str not in self.mapping: - continue - - if to_type_str in self.mapping: - self.mapping[to_type_str] = ( - self.mapping[from_type_str] + self.mapping[to_type_str] - ) - else: - self.mapping[to_type_str] = self.mapping[from_type_str].copy() + from_key = str(from_type) + to_key = str(to_type) + self.mapping[to_key] = self.mapping.get(from_key, []) + self.mapping.get( + to_key, [] + ) def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Dict[str, Any]: """ - Returns: - object: a tuple containing the config, the dataset series and the summary extracted + Execute the handler chain for the given data type. + + :param dtype: the data type to handle + :param args: arguments to pass to the handler functions + :param kwargs: keyword arguments (currently unused but reserved for extensibility) + :return: a dictionary containing the summary extracted from the data """ funcs = self.mapping.get(dtype, []) op = compose(funcs) result = op(*args) - return cast(Dict[str, Any], result[-1]) + if result: + return result[-1] if isinstance(result[-1], dict) else {} + return {} def get_render_map() -> Dict[str, Callable]: diff --git a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py index 568aa7a9c..a53f16d91 100644 --- a/src/ydata_profiling/model/pandas/describe_categorical_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_categorical_pandas.py @@ -27,6 +27,7 @@ def get_character_counts_vc(vc: pd.Series) -> pd.Series: if len(counts) > 0: counts = counts.groupby(level=0, sort=False).sum() counts = counts.sort_values(ascending=False) + # FIXME: correct in split, below should be zero: print(counts.loc['']) counts = counts[counts.index.str.len() > 0] return counts diff --git a/src/ydata_profiling/model/spark/missing_spark.py b/src/ydata_profiling/model/spark/missing_spark.py index 02529dceb..5ad367e6e 100644 --- a/src/ydata_profiling/model/spark/missing_spark.py +++ b/src/ydata_profiling/model/spark/missing_spark.py @@ -82,11 +82,9 @@ def missing_matrix(config: Settings, df: DataFrame) -> str: def missing_heatmap(config: Settings, df: DataFrame) -> str: df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count()) - # Remove completely filled or completely empty variables. columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0] df = df.iloc[:, columns] - # Create and mask the correlation matrix. Construct the base heatmap. corr_mat = df.isnull().corr() mask = np.zeros_like(corr_mat) mask[np.triu_indices_from(mask)] = True diff --git a/src/ydata_profiling/report/presentation/core/collapse.py b/src/ydata_profiling/report/presentation/core/collapse.py index 9bc393602..a7dba34f1 100644 --- a/src/ydata_profiling/report/presentation/core/collapse.py +++ b/src/ydata_profiling/report/presentation/core/collapse.py @@ -6,7 +6,7 @@ class Collapse(ItemRenderer): - def __init__(self, button: ToggleButton, item: Renderable, **kwargs: Any): + def __init__(self, button: ToggleButton, item: Renderable, **kwargs): super().__init__("collapse", {"button": button, "item": item}, **kwargs) def __repr__(self) -> str: diff --git a/src/ydata_profiling/report/presentation/core/container.py b/src/ydata_profiling/report/presentation/core/container.py index d4ed121ca..c82f06266 100644 --- a/src/ydata_profiling/report/presentation/core/container.py +++ b/src/ydata_profiling/report/presentation/core/container.py @@ -13,7 +13,7 @@ def __init__( anchor_id: Optional[str] = None, classes: Optional[str] = None, oss: Optional[bool] = None, - **kwargs: Any, + **kwargs, ): args = {"items": items, "nested": nested} args.update(**kwargs) diff --git a/src/ydata_profiling/report/presentation/core/dropdown.py b/src/ydata_profiling/report/presentation/core/dropdown.py index 4c9dfb3a9..c1c2f274e 100644 --- a/src/ydata_profiling/report/presentation/core/dropdown.py +++ b/src/ydata_profiling/report/presentation/core/dropdown.py @@ -15,7 +15,7 @@ def __init__( anchor_id: str, classes: list, is_row: bool, - **kwargs: Any, + **kwargs ): super().__init__( "dropdown", diff --git a/src/ydata_profiling/report/presentation/core/renderable.py b/src/ydata_profiling/report/presentation/core/renderable.py index 028151532..84265c1c6 100644 --- a/src/ydata_profiling/report/presentation/core/renderable.py +++ b/src/ydata_profiling/report/presentation/core/renderable.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Optional +from typing import Any, Dict, Optional class Renderable(ABC): @@ -38,5 +38,5 @@ def __str__(self) -> str: return self.__class__.__name__ @classmethod - def convert_to_class(cls, obj: "Renderable", flavour_func: Callable) -> None: + def convert_to_class(cls, obj: "Renderable", flavour_func) -> None: obj.__class__ = cls diff --git a/src/ydata_profiling/report/presentation/core/root.py b/src/ydata_profiling/report/presentation/core/root.py index 6e96e7f14..0c3f1e3c9 100644 --- a/src/ydata_profiling/report/presentation/core/root.py +++ b/src/ydata_profiling/report/presentation/core/root.py @@ -11,7 +11,7 @@ class Root(ItemRenderer): """ def __init__( - self, name: str, body: Renderable, footer: Renderable, style: Style, **kwargs: Any + self, name: str, body: Renderable, footer: Renderable, style: Style, **kwargs ): super().__init__( "report", @@ -23,7 +23,7 @@ def __init__( def __repr__(self) -> str: return "Root" - def render(self, **kwargs: Any) -> Any: + def render(self, **kwargs) -> Any: raise NotImplementedError() @classmethod diff --git a/src/ydata_profiling/report/presentation/core/variable.py b/src/ydata_profiling/report/presentation/core/variable.py index 34bd110a8..cdf063202 100644 --- a/src/ydata_profiling/report/presentation/core/variable.py +++ b/src/ydata_profiling/report/presentation/core/variable.py @@ -10,13 +10,13 @@ def __init__( top: Renderable, bottom: Optional[Renderable] = None, ignore: bool = False, - **kwargs: Any, + **kwargs, ): super().__init__( "variable", {"top": top, "bottom": bottom, "ignore": ignore}, **kwargs ) - def __str__(self) -> str: + def __str__(self): top_text = str(self.content["top"]).replace("\n", "\n\t") bottom_text = str(self.content["bottom"]).replace("\n", "\n\t") @@ -25,7 +25,7 @@ def __str__(self) -> str: text += f"- bottom: {bottom_text}" return text - def __repr__(self) -> str: + def __repr__(self): return "Variable" def render(self) -> Any: diff --git a/src/ydata_profiling/report/presentation/flavours/flavour_html.py b/src/ydata_profiling/report/presentation/flavours/flavour_html.py index 7ad2b9c1d..b342ff32f 100644 --- a/src/ydata_profiling/report/presentation/flavours/flavour_html.py +++ b/src/ydata_profiling/report/presentation/flavours/flavour_html.py @@ -41,10 +41,7 @@ HTMLVariableInfo, ) -from typing import cast -from ydata_profiling.report.presentation.flavours.flavours import _FlavourMapping - -html_mapping = cast(_FlavourMapping, { +html_mapping = { Container: HTMLContainer, Variable: HTMLVariable, VariableInfo: HTMLVariableInfo, @@ -62,6 +59,6 @@ Collapse: HTMLCollapse, CorrelationTable: HTMLCorrelationTable, Scores: HTMLScores, -}) +} register_flavour("html", html_mapping) diff --git a/src/ydata_profiling/report/presentation/flavours/flavour_widget.py b/src/ydata_profiling/report/presentation/flavours/flavour_widget.py index 29ff1ad2c..b95d724f1 100644 --- a/src/ydata_profiling/report/presentation/flavours/flavour_widget.py +++ b/src/ydata_profiling/report/presentation/flavours/flavour_widget.py @@ -39,10 +39,7 @@ WidgetVariableInfo, ) -from typing import cast -from ydata_profiling.report.presentation.flavours.flavours import _FlavourMapping - -widget_mapping = cast(_FlavourMapping, { +widget_mapping = { Container: WidgetContainer, Variable: WidgetVariable, VariableInfo: WidgetVariableInfo, @@ -59,6 +56,6 @@ ToggleButton: WidgetToggleButton, Collapse: WidgetCollapse, CorrelationTable: WidgetCorrelationTable, -}) +} register_flavour("widget", widget_mapping) diff --git a/src/ydata_profiling/report/presentation/flavours/flavours.py b/src/ydata_profiling/report/presentation/flavours/flavours.py index e31aa1e3c..5b7551d99 100644 --- a/src/ydata_profiling/report/presentation/flavours/flavours.py +++ b/src/ydata_profiling/report/presentation/flavours/flavours.py @@ -1,32 +1,28 @@ """ Flavours registry information """ -from typing import Callable, Dict, Type +from typing import Callable from ydata_profiling.report.presentation.core import Root from ydata_profiling.report.presentation.core.renderable import Renderable -_FlavourMapping = Dict[Type[Renderable], Type[Renderable]] -_FLAVOUR_REGISTRY: Dict[str, _FlavourMapping] = {} +_FLAVOUR_REGISTRY: dict = {} -def register_flavour(name: str, mapping: _FlavourMapping) -> None: +def register_flavour(name: str, mapping: dict) -> None: _FLAVOUR_REGISTRY[name] = mapping -def get_flavour_mapping(name: str) -> _FlavourMapping: +def get_flavour_mapping(name: str) -> dict: if name not in _FLAVOUR_REGISTRY: raise ValueError(f"Flavour '{name}' is not registered.") return _FLAVOUR_REGISTRY[name] -_FlavourFunc = Callable[[Renderable], Renderable] - - def apply_renderable_mapping( - mapping: _FlavourMapping, + mapping: dict, structure: Renderable, - flavour_func: _FlavourFunc, + flavour_func: Callable[[Renderable], None], ) -> None: mapping[type(structure)].convert_to_class(structure, flavour_func) @@ -35,7 +31,7 @@ def HTMLReport(structure: Root) -> Root: from ydata_profiling.report.presentation.flavours import flavour_html # noqa: F401 mapping = get_flavour_mapping("html") - apply_renderable_mapping(mapping, structure, flavour_func=HTMLReport) # type: ignore + apply_renderable_mapping(mapping, structure, flavour_func=HTMLReport) return structure @@ -45,5 +41,5 @@ def WidgetReport(structure: Root) -> Root: ) mapping = get_flavour_mapping("widget") - apply_renderable_mapping(mapping, structure, flavour_func=WidgetReport) # type: ignore + apply_renderable_mapping(mapping, structure, flavour_func=WidgetReport) return structure From ae0223356748b7299c3c8c24048337647fa7a658 Mon Sep 17 00:00:00 2001 From: Pkcha Date: Sun, 12 Apr 2026 23:34:50 +0800 Subject: [PATCH 11/11] feat: initial release --- src/ydata_profiling/model/handler.py | 62 ++++++++---------- .../model/spark/missing_spark.py | 64 +++++++++++-------- .../report/presentation/core/renderable.py | 9 ++- .../report/presentation/flavours/flavours.py | 27 ++++++-- .../presentation/flavours/html/table.py | 2 +- .../presentation/flavours/html/templates.py | 1 + .../presentation/frequency_table_utils.py | 9 +-- 7 files changed, 100 insertions(+), 74 deletions(-) diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 4ea43192a..e9ba6a39a 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -1,33 +1,31 @@ """ Auxiliary handler methods for data summary extraction """ -from typing import Any, Callable, Dict, List, Sequence, Tuple, Union +from typing import Any, Callable, Dict, List, Sequence, Tuple import networkx as nx from visions import VisionsTypeset -def compose(functions: Sequence[Callable]) -> Callable: +def compose(functions: Sequence[Callable[..., Any]]) -> Callable[..., Tuple[Any, ...]]: """ Compose a sequence of functions. - Each function in the sequence receives the result of the previous function. - Functions are expected to accept and return tuples for proper chaining. - - :param functions: sequence of functions that accept and return tuples - :return: combined function applying all functions in order + Each function in the sequence should accept the arguments passed to the composed + function and return either a single value or a tuple of values. + + :param functions: sequence of functions + :return: combined function applying all functions in order. """ def composed_function(*args: Any) -> Tuple[Any, ...]: - result: Union[Tuple[Any, ...], Any] = args + result: Tuple[Any, ...] = args for func in functions: - if isinstance(result, tuple): - result = func(*result) - else: - result = func(result) - if isinstance(result, tuple): - return result - return (result,) + result = func(*result) + # Ensure result is always a tuple for consistent unpacking + if not isinstance(result, tuple): + result = (result,) + return result return composed_function @@ -35,13 +33,12 @@ def composed_function(*args: Any) -> Tuple[Any, ...]: class Handler: """A generic handler - Allows any custom mapping between data types and functions. - Functions are composed based on the type hierarchy defined in the typeset. + Allows any custom mapping between data types and functions """ def __init__( self, - mapping: Dict[str, List[Callable]], + mapping: Dict[str, List[Callable[..., Any]]], typeset: VisionsTypeset, *args: Any, **kwargs: Any @@ -54,33 +51,28 @@ def _complete_dag(self) -> None: for from_type, to_type in nx.topological_sort( nx.line_graph(self.typeset.base_graph) ): - from_key = str(from_type) - to_key = str(to_type) - self.mapping[to_key] = self.mapping.get(from_key, []) + self.mapping.get( - to_key, [] + self.mapping[str(to_type)] = ( + self.mapping[str(from_type)] + self.mapping[str(to_type)] ) - def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Dict[str, Any]: + def handle(self, dtype: str, *args: Any, **kwargs: Any) -> Any: """ - Execute the handler chain for the given data type. - - :param dtype: the data type to handle - :param args: arguments to pass to the handler functions - :param kwargs: keyword arguments (currently unused but reserved for extensibility) - :return: a dictionary containing the summary extracted from the data + Execute the handler chain for the given dtype. + + :param dtype: The data type to handle + :param args: Arguments to pass to the handler chain + :return: The last element of the result tuple from the handler chain """ funcs = self.mapping.get(dtype, []) op = compose(funcs) - result = op(*args) - if result: - return result[-1] if isinstance(result[-1], dict) else {} - return {} + summary = op(*args)[-1] + return summary -def get_render_map() -> Dict[str, Callable]: +def get_render_map() -> Dict[str, Callable[..., Any]]: import ydata_profiling.report.structure.variables as render_algorithms - render_map = { + render_map: Dict[str, Callable[..., Any]] = { "Boolean": render_algorithms.render_boolean, "Numeric": render_algorithms.render_real, "Complex": render_algorithms.render_complex, diff --git a/src/ydata_profiling/model/spark/missing_spark.py b/src/ydata_profiling/model/spark/missing_spark.py index 5ad367e6e..384670232 100644 --- a/src/ydata_profiling/model/spark/missing_spark.py +++ b/src/ydata_profiling/model/spark/missing_spark.py @@ -13,47 +13,47 @@ class MissingnoBarSparkPatch: """ - Technical Debt : - This is a monkey patching object that allows usage of the library missingno as is for spark dataframes. - This is because missingno library's bar function always applies a isnull().sum() on dataframes in the visualisation - function, instead of allowing just values counts as an entry point. Thus, in order to calculate the - missing values dataframe in spark, we compute it first, then wrap it in this MissingnoBarSparkPatch object which - will be unwrapped by missingno and return the pre-computed value counts. - The best fix to this currently terrible patch is to submit a PR to missingno to separate preprocessing function - (compute value counts from df) and visualisation functions such that we can call the visualisation directly. - Unfortunately, the missingno library people have not really responded to our issues on gitlab. - See https://github.com/ResidentMario/missingno/issues/119. - We could also fork the missingno library and implement some of the code in our database, but that feels - like bad practice as well. + Adapter class to enable missingno library compatibility with Spark DataFrames. + + The missingno library's visualization functions internally call isnull().sum() + on dataframes. For Spark DataFrames, we pre-compute the null counts and wrap + them in this adapter to provide the expected interface. + + Note: This is a workaround for missingno's lack of separation between + data preprocessing and visualization. See: + https://github.com/ResidentMario/missingno/issues/119 """ def __init__( - self, df: DataFrame, columns: List[str] = None, original_df_size: int = None + self, + df: DataFrame, + columns: Optional[List[str]] = None, + original_df_size: Optional[int] = None ): self.df = df self.columns = columns self.original_df_size = original_df_size - def isnull(self) -> Any: - """ - This patches the .isnull().sum() function called by missingno library - """ - return self # return self to patch .sum() function + def isnull(self) -> "MissingnoBarSparkPatch": + """Returns self to enable chained .isnull().sum() calls.""" + return self def sum(self) -> DataFrame: - """ - This patches the .sum() function called by missingno library - """ - return self.df # return unwrapped dataframe + """Returns the pre-computed null counts dataframe.""" + return self.df def __len__(self) -> Optional[int]: - """ - This patches the len(df) function called by missingno library - """ + """Returns the original dataframe size.""" return self.original_df_size def missing_bar(config: Settings, df: DataFrame) -> str: + """Generate a missing values bar chart for Spark DataFrame. + + :param config: Report settings + :param df: Spark DataFrame + :return: HTML string of the bar chart + """ import pyspark.sql.functions as F data_nan_counts = ( @@ -70,6 +70,12 @@ def missing_bar(config: Settings, df: DataFrame) -> str: def missing_matrix(config: Settings, df: DataFrame) -> str: + """Generate a missing values matrix visualization for Spark DataFrame. + + :param config: Report settings + :param df: Spark DataFrame + :return: HTML string of the matrix visualization + """ df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count()) return plot_missing_matrix( config, @@ -80,11 +86,19 @@ def missing_matrix(config: Settings, df: DataFrame) -> str: def missing_heatmap(config: Settings, df: DataFrame) -> str: + """Generate a missing values heatmap for Spark DataFrame. + + :param config: Report settings + :param df: Spark DataFrame + :return: HTML string of the heatmap + """ df = MissingnoBarSparkPatch(df, columns=df.columns, original_df_size=df.count()) + # Remove completely filled or completely empty variables. columns = [i for i, n in enumerate(np.var(df.isnull(), axis="rows")) if n > 0] df = df.iloc[:, columns] + # Create and mask the correlation matrix. Construct the base heatmap. corr_mat = df.isnull().corr() mask = np.zeros_like(corr_mat) mask[np.triu_indices_from(mask)] = True diff --git a/src/ydata_profiling/report/presentation/core/renderable.py b/src/ydata_profiling/report/presentation/core/renderable.py index 84265c1c6..1040c9656 100644 --- a/src/ydata_profiling/report/presentation/core/renderable.py +++ b/src/ydata_profiling/report/presentation/core/renderable.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, Optional +from typing import Any, Callable, Dict, Optional class Renderable(ABC): @@ -38,5 +38,10 @@ def __str__(self) -> str: return self.__class__.__name__ @classmethod - def convert_to_class(cls, obj: "Renderable", flavour_func) -> None: + def convert_to_class(cls, obj: "Renderable", flavour_func: Callable[["Renderable"], None]) -> None: + """Convert the object's class to this class and recursively apply flavour to nested items. + + :param obj: The renderable object to convert + :param flavour_func: Function to apply to nested renderable items + """ obj.__class__ = cls diff --git a/src/ydata_profiling/report/presentation/flavours/flavours.py b/src/ydata_profiling/report/presentation/flavours/flavours.py index 5b7551d99..547a7a758 100644 --- a/src/ydata_profiling/report/presentation/flavours/flavours.py +++ b/src/ydata_profiling/report/presentation/flavours/flavours.py @@ -1,29 +1,46 @@ """ Flavours registry information """ -from typing import Callable +from typing import Callable, Dict, Type from ydata_profiling.report.presentation.core import Root from ydata_profiling.report.presentation.core.renderable import Renderable -_FLAVOUR_REGISTRY: dict = {} +_FLAVOUR_REGISTRY: Dict[str, Dict[Type[Renderable], Type[Renderable]]] = {} -def register_flavour(name: str, mapping: dict) -> None: +def register_flavour(name: str, mapping: Dict[Type[Renderable], Type[Renderable]]) -> None: + """Register a flavour mapping. + + :param name: The flavour name + :param mapping: Dictionary mapping core renderable types to flavour-specific types + """ _FLAVOUR_REGISTRY[name] = mapping -def get_flavour_mapping(name: str) -> dict: +def get_flavour_mapping(name: str) -> Dict[Type[Renderable], Type[Renderable]]: + """Get a registered flavour mapping. + + :param name: The flavour name + :return: The flavour mapping dictionary + :raises ValueError: If the flavour is not registered + """ if name not in _FLAVOUR_REGISTRY: raise ValueError(f"Flavour '{name}' is not registered.") return _FLAVOUR_REGISTRY[name] def apply_renderable_mapping( - mapping: dict, + mapping: Dict[Type[Renderable], Type[Renderable]], structure: Renderable, flavour_func: Callable[[Renderable], None], ) -> None: + """Apply flavour mapping to a renderable structure. + + :param mapping: The flavour mapping dictionary + :param structure: The renderable structure to transform + :param flavour_func: The flavour application function for recursive calls + """ mapping[type(structure)].convert_to_class(structure, flavour_func) diff --git a/src/ydata_profiling/report/presentation/flavours/html/table.py b/src/ydata_profiling/report/presentation/flavours/html/table.py index c5d71412b..59aa0eccf 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/table.py +++ b/src/ydata_profiling/report/presentation/flavours/html/table.py @@ -1,4 +1,4 @@ -from ydata_profiling.report.presentation.core.table import Table +from ydata_profiling.report.presentation.core import Table from ydata_profiling.report.presentation.flavours.html import templates diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates.py b/src/ydata_profiling/report/presentation/flavours/html/templates.py index 85e24a46a..30fcecda7 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates.py +++ b/src/ydata_profiling/report/presentation/flavours/html/templates.py @@ -1,6 +1,7 @@ """Contains all templates used for generating the HTML profile report""" import shutil from pathlib import Path +from typing import Any import jinja2 diff --git a/src/ydata_profiling/report/presentation/frequency_table_utils.py b/src/ydata_profiling/report/presentation/frequency_table_utils.py index f194bc514..6517cf621 100644 --- a/src/ydata_profiling/report/presentation/frequency_table_utils.py +++ b/src/ydata_profiling/report/presentation/frequency_table_utils.py @@ -7,8 +7,6 @@ def _frequency_table( freqtable: pd.Series, n: int, max_number_to_print: int ) -> List[Dict[str, Any]]: - # TODO: replace '' by '(Empty)' ? - if max_number_to_print > n: max_number_to_print = n @@ -26,7 +24,6 @@ def _frequency_table( max_freq = max(freqtable.values[0], freq_other, freq_missing) - # TODO: Correctly sort missing and other # No values if max_freq == 0: return [] @@ -77,7 +74,7 @@ def freq_table( freqtable: Union[pd.Series, List[pd.Series]], n: Union[int, List[int]], max_number_to_print: int, -) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]: +) -> List[List[Dict[str, Any]]]: """Render the rows for a frequency table (value, count). Args: @@ -94,7 +91,7 @@ def freq_table( _frequency_table(v, n2, max_number_to_print) for v, n2 in zip(freqtable, n) ] else: - return [_frequency_table(freqtable, n, max_number_to_print)] # type: ignore + return [_frequency_table(freqtable, n, max_number_to_print)] def _extreme_obs_table( @@ -138,4 +135,4 @@ def extreme_obs_table( _extreme_obs_table(v, number_to_print, n1) for v, n1 in zip(freqtable, n) ] - return [_extreme_obs_table(freqtable, number_to_print, n)] # type: ignore + return [_extreme_obs_table(freqtable, number_to_print, n)]